class: center, middle, inverse, title-slide .title[ # Data wrangling: tidy data ] .author[ ### INFO 5940
Cornell University ] --- class: inverse, middle # Importing data in R --- ## `readr` vs. base R <img src="index_files/figure-html/compare-speed-small-plot-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `readr` vs. base R <img src="index_files/figure-html/compare-speed-large-plot-1.png" width="80%" style="display: block; margin: auto;" /> --- ## Alternative file formats * CSV * RDS * Feather * Excel * SPSS/Stata --- ## `challenge` ``` ## # A tibble: 2,000 × 2 ## x y ## <dbl> <date> ## 1 404 NA ## 2 4172 NA ## 3 3004 NA ## 4 787 NA ## 5 37 NA ## 6 2332 NA ## 7 2489 NA ## 8 1449 NA ## 9 3665 NA ## 10 3863 NA ## # … with 1,990 more rows ``` --- ## RDS ```r # compare file size file.info(here("static", "data", "challenge.rds"))$size %>% utils:::format.object_size("auto") ``` ``` ## [1] "11.6 Kb" ``` ```r file.info(here("static", "data", "challenge.csv"))$size %>% utils:::format.object_size("auto") ``` ``` ## [1] "37.1 Kb" ``` --- ## RDS <img src="index_files/figure-html/rds-3-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `feather` <img src="index_files/figure-html/feather-2-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `readxl` ```r library(readxl) xlsx_example <- readxl_example(path = "datasets.xlsx") read_excel(path = xlsx_example) ``` ``` ## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows ``` --- ## `readxl` ```r excel_sheets(path = xlsx_example) ``` ``` ## [1] "iris" "mtcars" "chickwts" "quakes" ``` ```r read_excel(path = xlsx_example, sheet = "chickwts") ``` ``` ## # A tibble: 71 × 2 ## weight feed ## <dbl> <chr> ## 1 179 horsebean ## 2 160 horsebean ## 3 136 horsebean ## 4 227 horsebean ## 5 217 horsebean ## 6 168 horsebean ## 7 108 horsebean ## 8 124 horsebean ## 9 143 horsebean ## 10 140 horsebean ## # … with 61 more rows ``` --- ## `haven` and SAS ```r library(haven) read_sas(data_file = system.file("examples", "iris.sas7bdat", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## Sepal_Length Sepal_Width Petal_Length Petal_Width Species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows ``` --- ## `haven` and SPSS ```r read_sav(file = system.file("examples", "iris.sav", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <dbl+lbl> ## 1 5.1 3.5 1.4 0.2 1 [setosa] ## 2 4.9 3 1.4 0.2 1 [setosa] ## 3 4.7 3.2 1.3 0.2 1 [setosa] ## 4 4.6 3.1 1.5 0.2 1 [setosa] ## 5 5 3.6 1.4 0.2 1 [setosa] ## 6 5.4 3.9 1.7 0.4 1 [setosa] ## 7 4.6 3.4 1.4 0.3 1 [setosa] ## 8 5 3.4 1.5 0.2 1 [setosa] ## 9 4.4 2.9 1.4 0.2 1 [setosa] ## 10 4.9 3.1 1.5 0.1 1 [setosa] ## # … with 140 more rows ``` --- ## `haven` and Stata ```r read_dta(file = system.file("examples", "iris.dta", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## sepallength sepalwidth petallength petalwidth species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.10 3.5 1.40 0.200 setosa ## 2 4.90 3 1.40 0.200 setosa ## 3 4.70 3.20 1.30 0.200 setosa ## 4 4.60 3.10 1.5 0.200 setosa ## 5 5 3.60 1.40 0.200 setosa ## 6 5.40 3.90 1.70 0.400 setosa ## 7 4.60 3.40 1.40 0.300 setosa ## 8 5 3.40 1.5 0.200 setosa ## 9 4.40 2.90 1.40 0.200 setosa ## 10 4.90 3.10 1.5 0.100 setosa ## # … with 140 more rows ``` --- class: inverse, middle # Tidy data --- ## Tidy data <img src="../../../../../../../../img/tidydata_1.jpg" width="80%" style="display: block; margin: auto;" /> .footnote[Source: [Allison Horst](https://github.com/allisonhorst/stats-illustrations)] --- ## Tidy data <img src="../../../../../../../../img/tidydata_2.jpg" width="80%" style="display: block; margin: auto;" /> .footnote[Source: [Allison Horst](https://github.com/allisonhorst/stats-illustrations)] --- ## Common tidying tasks * Pivoting * Longer * Wider * Separating * Uniting --- ## Pivot longer .pull-left[ ```r table4a ``` ``` ## # A tibble: 3 × 3 ## country `1999` `2000` ## * <chr> <int> <int> ## 1 Afghanistan 745 2666 ## 2 Brazil 37737 80488 ## 3 China 212258 213766 ``` ] -- .pull-right[ ```r pivot_longer( data = table4a, cols = c(`1999`, `2000`), names_to = "year", values_to = "cases" ) ``` ``` ## # A tibble: 6 × 3 ## country year cases ## <chr> <chr> <int> ## 1 Afghanistan 1999 745 ## 2 Afghanistan 2000 2666 ## 3 Brazil 1999 37737 ## 4 Brazil 2000 80488 ## 5 China 1999 212258 ## 6 China 2000 213766 ``` ] --- ## Pivot wider .pull-left[ ```r table2 ``` ``` ## # A tibble: 12 × 4 ## country year type count ## <chr> <int> <chr> <int> ## 1 Afghanistan 1999 cases 745 ## 2 Afghanistan 1999 population 19987071 ## 3 Afghanistan 2000 cases 2666 ## 4 Afghanistan 2000 population 20595360 ## 5 Brazil 1999 cases 37737 ## 6 Brazil 1999 population 172006362 ## 7 Brazil 2000 cases 80488 ## 8 Brazil 2000 population 174504898 ## 9 China 1999 cases 212258 ## 10 China 1999 population 1272915272 ## 11 China 2000 cases 213766 ## 12 China 2000 population 1280428583 ``` ] -- .pull-right[ ```r pivot_wider( data = table2, names_from = type, values_from = count ) ``` ``` ## # A tibble: 6 × 4 ## country year cases population ## <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583 ``` ] --- ## Separating .pull-left[ ```r table3 ``` ``` ## # A tibble: 6 × 3 ## country year rate ## * <chr> <int> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583 ``` ] -- .pull-right[ ```r separate( data = table3, col = rate, into = c( "cases", "population" ), convert = TRUE ) ``` ``` ## # A tibble: 6 × 4 ## country year cases population ## <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583 ``` ] --- ## Uniting .pull-left[ ```r table5 ``` ``` ## # A tibble: 6 × 4 ## country century year rate ## * <chr> <chr> <chr> <chr> ## 1 Afghanistan 19 99 745/19987071 ## 2 Afghanistan 20 00 2666/20595360 ## 3 Brazil 19 99 37737/172006362 ## 4 Brazil 20 00 80488/174504898 ## 5 China 19 99 212258/1272915272 ## 6 China 20 00 213766/1280428583 ``` ] -- .pull-right[ ```r unite( data = table5, col = "year", century, year ) ``` ``` ## # A tibble: 6 × 3 ## country year rate ## <chr> <chr> <chr> ## 1 Afghanistan 19_99 745/19987071 ## 2 Afghanistan 20_00 2666/20595360 ## 3 Brazil 19_99 37737/172006362 ## 4 Brazil 20_00 80488/174504898 ## 5 China 19_99 212258/1272915272 ## 6 China 20_00 213766/1280428583 ``` ] --- ## Uniting .pull-left[ ```r table5 ``` ``` ## # A tibble: 6 × 4 ## country century year rate ## * <chr> <chr> <chr> <chr> ## 1 Afghanistan 19 99 745/19987071 ## 2 Afghanistan 20 00 2666/20595360 ## 3 Brazil 19 99 37737/172006362 ## 4 Brazil 20 00 80488/174504898 ## 5 China 19 99 212258/1272915272 ## 6 China 20 00 213766/1280428583 ``` ] .pull-right[ ```r # remove underscore unite( data = table5, col = "year", century, year, sep = "" ) %>% mutate(year = parse_number(year)) ``` ``` ## # A tibble: 6 × 3 ## country year rate ## <chr> <dbl> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583 ``` ]