---
title: "Tidy Data"
author: "JJB + Course"
date: "10/03/2018"
output:
  html_document:
    toc: true
    toc_float:
      # The option is spelled `collapsed`; an unknown key is silently ignored.
      collapsed: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Pipe Operator

## Example: Piping Operator

```{r example-funcs}
# install.packages("magrittr")
library("magrittr")

4 %>%     # Take the number four and, then
  sqrt()  # find the square root

# Same as
# sqrt(4)

c(7, 42, 1, 25) %>%  # Combine four elements and, then
  log() %>%          # take the natural log and, then
  round(2) %>%       # round to the second decimal and, then
  diff()             # take the difference between consecutive elements

# Same as
# diff(round(log(c(7,42,1,25)), 2))
```

## Example: Multi-step forms

```{r multi-step-funcs}
# Embedded / Nested Functions
set.seed(821)
mean(rnorm(10))

# Temporary Intermediate Variables
set.seed(821)
rand_nums = rnorm(10)
mean_nums = mean(rand_nums)

# Piped
# install.packages("magrittr")
library("magrittr")

set.seed(821)
rnorm(10) %>%  # Generate 10 random values from a normal and, then
  mean()       # take the mean.
```

## Example: Inside-out form of Pipe

**Embedded/Nested** Function Calls

```{r nested-funcs, eval = FALSE}
pickup(                           # Step 5
  goto(                           # Step 4
    order(                        # Step 3
      store(                      # Step 2
        drink("Java Chip Frap"),  # Step 1
        loc = "Green St.")
    )
  )
)
```

**Piped**

```{r piped-funcs, eval = FALSE}
"Java Chip Frap" %>%
  drink() %>%                   # Step 1
  store(loc = "Green St.") %>%  # Step 2
  order() %>%                   # Step 3
  goto() %>%                    # Step 4
  pickup()                      # Step 5
```

## Example: Pipe to Other Arguments

```{r pipe-other-args}
# Order of the arguments matters for subtraction, which makes it a handy
# function for showing where the piped value ends up.
subtract_vals = function(x, y) {
  x - y
}

x = 7; y = 4

subtract_vals(x, y)

x %>%
  subtract_vals(y)       # Default, e.g. subtract_vals(x, y)

x %>%
  subtract_vals( . , y)  # Default, e.g. subtract_vals(x, y)
#                ^ Period specifies where the value should go

y %>%
  subtract_vals(x, . )   # Pipe y to second argument, e.g. subtract_vals(x, y)
#                  ^ Period specifies where the value should go

x %>%
  subtract_vals(y, . )   # Pipe x to second argument, e.g. subtract_vals(y, x)
#                  ^ Period specifies where the value should go
```

## Example: Accessing Values in Data Mid-Pipe

```{r pipe-data-access}
my_df = data.frame(x = c(0, 1), y = c(2, 3))

my_df %>%
  .[["x"]]
# ^ Period specifies where the data frame should go

# Equivalent to
my_df$x
my_df[["x"]]
my_df[[1]]

# Nested period usage to retrieve the last column
my_df %>%
  .[[ncol(.)]]
# ^       ^ Period specifies where the data frame should go

# Equivalent to
my_df[[ncol(my_df)]]
```

### Exercise: Working with Pipes

Make the following pipeable

```{r}
# install.packages("dplyr")
library("dplyr")

# How can we transform this embedded code statement?
tail(filter(iris, Petal.Width > mean(Petal.Width)))

# We want to transform it using %>%
tail(
  filter(
    iris, Petal.Width > mean(Petal.Width)
  )
)

# Take the inside result and switch it with the outside portion
# a.k.a inverting the function order
iris %>%
  filter(Petal.Width > mean(Petal.Width)) %>%
  tail()
```

Write a pipe that provides the `sqrt` of `2+2`

```{r}
# Target value
sqrt(2 + 2)

# Pitfall: %>% binds more tightly than +, so only the second 2 is piped ...
2 + 2 %>% sqrt()
# ... which makes the line above the same as
2 + sqrt(2)

# Parentheses force the sum to be evaluated before it is piped
(2 + 2) %>% sqrt()
```

## Example: Enrollment Untidy to Tidy

```{r transform-enrollment}
# install.packages("tidyr")
library("tidyr")

# Untidy Data
enrolled_fa17 = data.frame(
  undergrads = c(18345, 15267, 12),
  profs = c(352, 640, 0),
  grads = c(7173, 6028, 9),
  gender = c("Men", "Women", "Unknown")
)

# Why do we need to include gender inside the data.frame?

# Tidy the data
enrolled_fa17_tidy = gather(enrolled_fa17,
                            key = "Year",        # What the key is
                            value = "Enrolled",  # Specify what the _third variable_ should be
                            undergrads:grads)    # Take variables from undergrads to grads
                                                 # similar to seq(from, to)

enrolled_fa17_tidy
```

## Example: Alternative Meanings - ChickWeights

```{r transform-chicks}
# install.packages("tidyr")
library("tidyr")

# "Long"-form or "Tidy Data"
head(ChickWeight)

# "Wide"-form or "Messy data"
ChickWeight_wide = ChickWeight %>%
  spread(Time, weight)

# Check data
head(ChickWeight_wide)

# Recover "long"-form or "Tidy Data"
ChickWeight_long = ChickWeight_wide %>%
  gather(key = Time,      # Key for the key/value pairing
         value = weight,  # Column for measurements
         `0`:`21`)        # Column selection

# Check data
head(ChickWeight_long)
```

## Example: Alternative Meanings - Science!

Load in "Wide Data"

```{r load-wide-data}
experiment = read.table(header=TRUE, text='
 subject sex control    a    b
      S1   F     4.2  4.1  2.2
      S2   M     5.9  7.2  6.8
      S3   M     9.1  9.8 10.2
      S5   F     2.1 23.5  5.2
')

# Show wide-experiment data
experiment
```

Convert experiment data to long format

```{r wide-to-long-exp}
# Fold the three measurement columns (control, a, b) into a
# condition/measurement pair of columns.
experiment_long = gather(experiment, condition, measurement, control:b)

# Example of Long Format
head(experiment_long)
```

### Exercise: Making a Data Set Messy

Make the `mtcars` data set messy by converting it to:

```
#           model type   value
# 1   AMC Javelin  mpg  15.200
# 2   AMC Javelin  cyl   8.000
# 3   AMC Javelin disp 304.000
# 4   AMC Javelin   hp 150.000
# 5   AMC Javelin drat   3.150
# 6   AMC Javelin   wt   3.435
```

```{r convert-mtcars-data-to-long, eval = FALSE}
library("tidyr")

rownames(mtcars)

# Move the rowname to a variable name inside the data set.
mtcars$model = rownames(mtcars)

head(mtcars)

# Fill in the blanks: name of the key column, name of the value column,
# and the range of columns to fold.
mtcars_long = mtcars %>%
  gather(key = "____", value = "____", ____:____)

# In IDE data viewer
# View(mtcars_long)
```

Now, fix the data by converting it back.
```{r convert-mtcars-data-to-wide, eval = FALSE}
# Fill in the blanks: the column holding the variable names (key) and
# the column holding the measurements (value).
mtcars_wide = mtcars_long %>%
  spread(key = "_____", value = "____")

head(mtcars_wide)
```

## Example: Splitting Values - Location Data

```{r breakdown_loc}
cities = data.frame(stringsAsFactors=FALSE,
  city = c("Houston", "Miami", "Atlanta", "Chicago", "Los Angeles",
           "Washington, D.C.", "New York"),
  loc = c("29.81997438, -95.33997929", "25.7876107, -80.22410608",
          "33.83001385, -84.39994938", "41.82999066, -87.75005497",
          "33.98997825, -118.1799805", "38.89954938, -77.00941858",
          "40.74997906, -73.98001693"),
  pop = c(4053287, 2983947, 2464454, 5915976, 8097410, 2445216.5,
          13524139),
  iso3 = c("USA", "USA", "USA", "USA", "USA", "USA", "USA"),
  province = c("Texas", "Florida", "Georgia", "Illinois", "California",
               "District of Columbia", "New York")
)

# Split on comma + space so `lng` does not keep a leading blank, and let
# tidyr convert the two new columns from character to numeric.
cities_split = cities %>%
  separate(loc, c("lat", "lng"), sep = ", ", convert = TRUE)

cities_split
```

## Example: Uniting Values - Location Data

```{r combine_locs, dependson = "breakdown_loc"}
# Glue the coordinates back together using the same "lat, lng" layout
# that the original `loc` column had.
cities_split %>%
  unite(loc, c("lat", "lng"), sep = ", ")
```

### Exercise: Tidying WHO data

```{r who-messy, eval = FALSE}
who = tidyr::who
View(who)
head(who)

library("ggplot2")
library("stringr")

colnames(who)

# Convert from a combined value to a split value.
colnames(who) = str_replace_all(colnames(who),
                                pattern = "newrel",
                                replacement = "new_rel")

# Check to see that column names are all in alignment e.g. new_sp_mXXXX
colnames(who)

# Matching codes
# newrel => new_rel

# Note: the separate() calls below refer to columns named `key` and
# `sexage`, so the blanks must be filled in to produce those names.
tidied_who = who %>%
  gather(key = "____",    # Key is where we should fold values under
         value = "____",  # Value is where the observation is
         ____:____,       # Specifying the range of column names to transform
         na.rm = TRUE) %>%                        # We remove any case that has a missing value
  separate(key, c("____", "____", "____")) %>%    # Split apart variables
  separate(sexage, c("____", "____"), sep = 1)    # Break apart more variables
```