class: center, middle, inverse, title-slide .title[ # Data wrangling: tidy data ] .author[ ### INFO 5940
Cornell University ] --- ## Announcements .pull-left[ - What I did - Check in on homework 2 - Access assigned pull requests [here](https://github.coecis.cornell.edu/pulls/assigned) - Breather until homework 3 ] .pull-right[ <blockquote class="instagram-media" data-instgrm-permalink="https://www.instagram.com/p/CiLO2fHu0wW/?utm_source=ig_embed&utm_campaign=loading" data-instgrm-version="14" style=" background:#FFF; border:0; border-radius:3px; box-shadow:0 0 1px 0 rgba(0,0,0,0.5),0 1px 10px 0 rgba(0,0,0,0.15); margin: 1px; max-width:400px; min-width:326px; padding:0; width:99.375%; width:-webkit-calc(100% - 2px); width:calc(100% - 2px);"><div style="padding:16px;"> <a href="https://www.instagram.com/p/CiLO2fHu0wW/?utm_source=ig_embed&utm_campaign=loading" style=" background:#FFFFFF; line-height:0; padding:0 0; text-align:center; text-decoration:none; width:100%;" target="_blank"> <div style=" display: flex; flex-direction: row; align-items: center;"> <div style="background-color: #F4F4F4; border-radius: 50%; flex-grow: 0; height: 40px; margin-right: 14px; width: 40px;"></div> <div style="display: flex; flex-direction: column; flex-grow: 1; justify-content: center;"> <div style=" background-color: #F4F4F4; border-radius: 4px; flex-grow: 0; height: 14px; margin-bottom: 6px; width: 100px;"></div> <div style=" background-color: #F4F4F4; border-radius: 4px; flex-grow: 0; height: 14px; width: 60px;"></div></div></div><div style="padding: 19% 0;"></div> <div style="display:block; height:50px; margin:0 auto 12px; width:50px;"><svg width="50px" height="50px" viewBox="0 0 60 60" version="1.1" xmlns="https://www.w3.org/2000/svg" xmlns:xlink="https://www.w3.org/1999/xlink"><g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"><g transform="translate(-511.000000, -20.000000)" fill="#000000"><g><path d="M556.869,30.41 C554.814,30.41 553.148,32.076 553.148,34.131 C553.148,36.186 554.814,37.852 556.869,37.852 C558.924,37.852 560.59,36.186 560.59,34.131 C560.59,32.076 558.924,30.41 556.869,30.41 M541,60.657 C535.114,60.657 530.342,55.887 530.342,50 C530.342,44.114 535.114,39.342 541,39.342 C546.887,39.342 551.658,44.114 551.658,50 C551.658,55.887 546.887,60.657 541,60.657 M541,33.886 C532.1,33.886 524.886,41.1 524.886,50 C524.886,58.899 532.1,66.113 541,66.113 C549.9,66.113 557.115,58.899 557.115,50 C557.115,41.1 549.9,33.886 541,33.886 M565.378,62.101 C565.244,65.022 564.756,66.606 564.346,67.663 C563.803,69.06 563.154,70.057 562.106,71.106 C561.058,72.155 560.06,72.803 558.662,73.347 C557.607,73.757 556.021,74.244 553.102,74.378 C549.944,74.521 548.997,74.552 541,74.552 C533.003,74.552 532.056,74.521 528.898,74.378 C525.979,74.244 524.393,73.757 523.338,73.347 C521.94,72.803 520.942,72.155 519.894,71.106 C518.846,70.057 518.197,69.06 517.654,67.663 C517.244,66.606 516.755,65.022 516.623,62.101 C516.479,58.943 516.448,57.996 516.448,50 C516.448,42.003 516.479,41.056 516.623,37.899 C516.755,34.978 517.244,33.391 517.654,32.338 C518.197,30.938 518.846,29.942 519.894,28.894 C520.942,27.846 521.94,27.196 523.338,26.654 C524.393,26.244 525.979,25.756 528.898,25.623 C532.057,25.479 533.004,25.448 541,25.448 C548.997,25.448 549.943,25.479 553.102,25.623 C556.021,25.756 557.607,26.244 558.662,26.654 C560.06,27.196 561.058,27.846 562.106,28.894 C563.154,29.942 563.803,30.938 564.346,32.338 C564.756,33.391 565.244,34.978 565.378,37.899 C565.522,41.056 565.552,42.003 565.552,50 C565.552,57.996 565.522,58.943 565.378,62.101 M570.82,37.631 C570.674,34.438 570.167,32.258 569.425,30.349 C568.659,28.377 567.633,26.702 565.965,25.035 C564.297,23.368 562.623,22.342 560.652,21.575 C558.743,20.834 556.562,20.326 553.369,20.18 C550.169,20.033 549.148,20 541,20 C532.853,20 531.831,20.033 528.631,20.18 C525.438,20.326 523.257,20.834 521.349,21.575 C519.376,22.342 517.703,23.368 516.035,25.035 C514.368,26.702 513.342,28.377 512.574,30.349 C511.834,32.258 511.326,34.438 511.181,37.631 C511.035,40.831 511,41.851 511,50 C511,58.147 511.035,59.17 511.181,62.369 C511.326,65.562 511.834,67.743 512.574,69.651 C513.342,71.625 514.368,73.296 516.035,74.965 C517.703,76.634 519.376,77.658 521.349,78.425 C523.257,79.167 525.438,79.673 528.631,79.82 C531.831,79.965 532.853,80.001 541,80.001 C549.148,80.001 550.169,79.965 553.369,79.82 C556.562,79.673 558.743,79.167 560.652,78.425 C562.623,77.658 564.297,76.634 565.965,74.965 C567.633,73.296 568.659,71.625 569.425,69.651 C570.167,67.743 570.674,65.562 570.82,62.369 C570.966,59.17 571,58.147 571,50 C571,41.851 570.966,40.831 570.82,37.631"></path></g></g></g></svg></div><div style="padding-top: 8px;"> <div style=" color:#3897f0; font-family:Arial,sans-serif; font-size:14px; font-style:normal; font-weight:550; line-height:18px;">View this post on Instagram</div></div><div style="padding: 12.5% 0;"></div> <div style="display: flex; flex-direction: row; margin-bottom: 14px; align-items: center;"><div> <div style="background-color: #F4F4F4; border-radius: 50%; height: 12.5px; width: 12.5px; transform: translateX(0px) translateY(7px);"></div> <div style="background-color: #F4F4F4; height: 12.5px; transform: rotate(-45deg) translateX(3px) translateY(1px); width: 12.5px; flex-grow: 0; margin-right: 14px; margin-left: 2px;"></div> <div style="background-color: #F4F4F4; border-radius: 50%; height: 12.5px; width: 12.5px; transform: translateX(9px) translateY(-18px);"></div></div><div style="margin-left: 8px;"> <div style=" background-color: #F4F4F4; border-radius: 50%; flex-grow: 0; height: 20px; width: 20px;"></div> <div style=" width: 0; height: 0; border-top: 2px solid transparent; border-left: 6px solid #f4f4f4; border-bottom: 2px solid transparent; transform: translateX(16px) translateY(-4px) rotate(30deg)"></div></div><div style="margin-left: auto;"> <div style=" width: 0px; border-top: 8px solid #F4F4F4; border-right: 8px solid transparent; transform: translateY(16px);"></div> <div style=" background-color: #F4F4F4; flex-grow: 0; height: 12px; width: 16px; transform: translateY(-4px);"></div> <div style=" width: 0; height: 0; border-top: 8px solid #F4F4F4; border-left: 8px solid transparent; transform: translateY(-4px) translateX(8px);"></div></div></div> <div style="display: flex; flex-direction: column; flex-grow: 1; justify-content: center; margin-bottom: 24px;"> <div style=" background-color: #F4F4F4; border-radius: 4px; flex-grow: 0; height: 14px; margin-bottom: 6px; width: 224px;"></div> <div style=" background-color: #F4F4F4; border-radius: 4px; flex-grow: 0; height: 14px; width: 144px;"></div></div></a><p style=" color:#c9c8cd; font-family:Arial,sans-serif; font-size:14px; line-height:17px; margin-bottom:0; margin-top:8px; overflow:hidden; padding:8px 0 7px; text-align:center; text-overflow:ellipsis; white-space:nowrap;"><a href="https://www.instagram.com/p/CiLO2fHu0wW/?utm_source=ig_embed&utm_campaign=loading" style=" color:#c9c8cd; font-family:Arial,sans-serif; font-size:14px; font-style:normal; font-weight:normal; line-height:17px; text-decoration:none;" target="_blank">A post shared by Amanda Soltoff (@soltoffak)</a></p></div></blockquote> <script async src="//www.instagram.com/embed.js"></script> ] --- class: inverse, middle # Importing data in R --- ## `readr` vs. base R <img src="index_files/figure-html/compare-speed-small-plot-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `readr` vs. base R <img src="index_files/figure-html/compare-speed-large-plot-1.png" width="80%" style="display: block; margin: auto;" /> --- ## Alternative file formats * CSV * RDS * Feather * Excel * SPSS/Stata --- ## `challenge` ``` ## # A tibble: 2,000 × 2 ## x y ## <dbl> <date> ## 1 404 NA ## 2 4172 NA ## 3 3004 NA ## 4 787 NA ## 5 37 NA ## 6 2332 NA ## 7 2489 NA ## 8 1449 NA ## 9 3665 NA ## 10 3863 NA ## # … with 1,990 more rows ``` --- ## RDS ```r # compare file size file.info(here("static", "data", "challenge.rds"))$size %>% utils:::format.object_size("auto") ``` ``` ## [1] "11.6 Kb" ``` ```r file.info(here("static", "data", "challenge.csv"))$size %>% utils:::format.object_size("auto") ``` ``` ## [1] "37.1 Kb" ``` --- ## RDS <img src="index_files/figure-html/rds-3-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `feather` <img src="index_files/figure-html/feather-2-1.png" width="80%" style="display: block; margin: auto;" /> --- ## `readxl` ```r library(readxl) xlsx_example <- readxl_example(path = "datasets.xlsx") read_excel(path = xlsx_example) ``` ``` ## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows ``` --- ## `readxl` ```r excel_sheets(path = xlsx_example) ``` ``` ## [1] "iris" "mtcars" "chickwts" "quakes" ``` ```r read_excel(path = xlsx_example, sheet = "chickwts") ``` ``` ## # A tibble: 71 × 2 ## weight feed ## <dbl> <chr> ## 1 179 horsebean ## 2 160 horsebean ## 3 136 horsebean ## 4 227 horsebean ## 5 217 horsebean ## 6 168 horsebean ## 7 108 horsebean ## 8 124 horsebean ## 9 143 horsebean ## 10 140 horsebean ## # … with 61 more rows ``` --- ## `haven` and SAS ```r library(haven) read_sas(data_file = system.file("examples", "iris.sas7bdat", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## Sepal_Length Sepal_Width Petal_Length Petal_Width Species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows ``` --- ## `haven` and SPSS ```r read_sav(file = system.file("examples", "iris.sav", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <dbl+lbl> ## 1 5.1 3.5 1.4 0.2 1 [setosa] ## 2 4.9 3 1.4 0.2 1 [setosa] ## 3 4.7 3.2 1.3 0.2 1 [setosa] ## 4 4.6 3.1 1.5 0.2 1 [setosa] ## 5 5 3.6 1.4 0.2 1 [setosa] ## 6 5.4 3.9 1.7 0.4 1 [setosa] ## 7 4.6 3.4 1.4 0.3 1 [setosa] ## 8 5 3.4 1.5 0.2 1 [setosa] ## 9 4.4 2.9 1.4 0.2 1 [setosa] ## 10 4.9 3.1 1.5 0.1 1 [setosa] ## # … with 140 more rows ``` --- ## `haven` and Stata ```r read_dta(file = system.file("examples", "iris.dta", package = "haven" )) ``` ``` ## # A tibble: 150 × 5 ## sepallength sepalwidth petallength petalwidth species ## <dbl> <dbl> <dbl> <dbl> <chr> ## 1 5.10 3.5 1.40 0.200 setosa ## 2 4.90 3 1.40 0.200 setosa ## 3 4.70 3.20 1.30 0.200 setosa ## 4 4.60 3.10 1.5 0.200 setosa ## 5 5 3.60 1.40 0.200 setosa ## 6 5.40 3.90 1.70 0.400 setosa ## 7 4.60 3.40 1.40 0.300 setosa ## 8 5 3.40 1.5 0.200 setosa ## 9 4.40 2.90 1.40 0.200 setosa ## 10 4.90 3.10 1.5 0.100 setosa ## # … with 140 more rows ``` --- class: inverse, middle # Tidy data --- ## Tidy data <img src="../../../../../../../../img/tidydata_1.jpg" alt="Stylized text providing an overview of Tidy Data. The top reads 'Tidy datais a standard way of mapping the meaning of a dataset to its structure. - HadleyWickham.' On the left reads 'In tidy data: each variable forms a column; eachobservation forms a row; each cell is a single measurement.' There is an exampletable on the lower right with columns ‘id’, ‘name’ and ‘color’ with observations fordifferent cats, illustrating tidy data structure." width="70%" style="display: block; margin: auto;" /> .footnote[Illustrations from the [Openscapes](https://www.openscapes.org/) blog [*Tidy Data for reproducibility, efficiency, and collaboration*](https://www.openscapes.org/blog/2020/10/12/tidy-data/) by Julia Lowndes and Allison Horst)] --- ## Tidy data <img src="../../../../../../../../img/tidydata_2.jpg" alt="There are two sets of anthropomorphized data tables. The top group ofthree tables are all rectangular and smiling, with a shared speech bubble reading'our columns are variables and our rows are observations!'. Text to the left of thatgroup reads 'The standard structure of tidy data means that 'tidy datasets are allalike…' The lower group of four tables are all different shapes, look ragged andconcerned, and have different speech bubbles reading (from left to right) 'my columnare values and my rows are variables', 'I have variables in columns AND in rows', 'Ihave multiple variables in a single column', and 'I don’t even KNOW what my deal is.'Next to the frazzled data tables is text '...but every messy dataset is messy in itsown way. -Hadley Wickham.'" width="70%" style="display: block; margin: auto;" /> .footnote[Illustrations from the [Openscapes](https://www.openscapes.org/) blog [*Tidy Data for reproducibility, efficiency, and collaboration*](https://www.openscapes.org/blog/2020/10/12/tidy-data/) by Julia Lowndes and Allison Horst)] --- ## Tidy data <img src="../../../../../../../../img/tidydata_3.jpg" alt="On the left is a happy cute fuzzy monster holding a rectangular data framewith a tool that fits the data frame shape. On the workbench behind the monster areother data frames of similar rectangular shape, and neatly arranged tools that alsolook like they would fit those data frames. The workbench looks uncluttered and tidy.The text above the tidy workbench reads 'When working with tidy data, we can use thesame tools in similar ways for different datasets…' On the right is a cute monsterlooking very frustrated, using duct tape and other tools to haphazardly tie datatables together, each in a different way. The monster is in front of a messy,cluttered workbench. The text above the frustrated monster reads '...but working withuntidy data often means reinventing the wheel with one-time approaches that are hardto iterate or reuse.'" width="70%" style="display: block; margin: auto;" /> .footnote[Illustrations from the [Openscapes](https://www.openscapes.org/) blog [*Tidy Data for reproducibility, efficiency, and collaboration*](https://www.openscapes.org/blog/2020/10/12/tidy-data/) by Julia Lowndes and Allison Horst)] --- ## Tidy data <img src="../../../../../../../../img/tidydata_6.jpg" alt="Digital illustration of a cute fuzzy monster holding a brief case thatsays 'tidy data,' standing beside a friendly looking data table character, beingwelcomed with cheers by many other data tables and another cute monster jumping withjoy." width="70%" style="display: block; margin: auto;" /> .footnote[Illustrations from the [Openscapes](https://www.openscapes.org/) blog [*Tidy Data for reproducibility, efficiency, and collaboration*](https://www.openscapes.org/blog/2020/10/12/tidy-data/) by Julia Lowndes and Allison Horst)] --- ## Tidy data <img src="../../../../../../../../img/tidydata_7.jpg" alt="Digital illustration of two cute fuzzy monsters sitting on a park benchwith a smiling data table between them, all eating ice cream together. In text abovethe illustration are the hand drawn words 'make friends with tidy data.'" width="70%" style="display: block; margin: auto;" /> .footnote[Illustrations from the [Openscapes](https://www.openscapes.org/) blog [*Tidy Data for reproducibility, efficiency, and collaboration*](https://www.openscapes.org/blog/2020/10/12/tidy-data/) by Julia Lowndes and Allison Horst)] --- ## Common tidying tasks * Pivoting * Longer * Wider * Separating * Uniting --- ## Pivot longer .pull-left[ ```r table4a ``` ``` ## # A tibble: 3 × 3 ## country `1999` `2000` ## * <chr> <int> <int> ## 1 Afghanistan 745 2666 ## 2 Brazil 37737 80488 ## 3 China 212258 213766 ``` ] -- .pull-right[ ```r pivot_longer( data = table4a, cols = c(`1999`, `2000`), names_to = "year", values_to = "cases" ) ``` ``` ## # A tibble: 6 × 3 ## country year cases ## <chr> <chr> <int> ## 1 Afghanistan 1999 745 ## 2 Afghanistan 2000 2666 ## 3 Brazil 1999 37737 ## 4 Brazil 2000 80488 ## 5 China 1999 212258 ## 6 China 2000 213766 ``` ] --- ## Pivot wider .pull-left[ ```r table2 ``` ``` ## # A tibble: 12 × 4 ## country year type count ## <chr> <int> <chr> <int> ## 1 Afghanistan 1999 cases 745 ## 2 Afghanistan 1999 population 19987071 ## 3 Afghanistan 2000 cases 2666 ## 4 Afghanistan 2000 population 20595360 ## 5 Brazil 1999 cases 37737 ## 6 Brazil 1999 population 172006362 ## 7 Brazil 2000 cases 80488 ## 8 Brazil 2000 population 174504898 ## 9 China 1999 cases 212258 ## 10 China 1999 population 1272915272 ## 11 China 2000 cases 213766 ## 12 China 2000 population 1280428583 ``` ] -- .pull-right[ ```r pivot_wider( data = table2, names_from = type, values_from = count ) ``` ``` ## # A tibble: 6 × 4 ## country year cases population ## <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583 ``` ] --- ## Separating .pull-left[ ```r table3 ``` ``` ## # A tibble: 6 × 3 ## country year rate ## * <chr> <int> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583 ``` ] -- .pull-right[ ```r separate( data = table3, col = rate, into = c( "cases", "population" ), convert = TRUE ) ``` ``` ## # A tibble: 6 × 4 ## country year cases population ## <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583 ``` ] --- ## Uniting .pull-left[ ```r table5 ``` ``` ## # A tibble: 6 × 4 ## country century year rate ## * <chr> <chr> <chr> <chr> ## 1 Afghanistan 19 99 745/19987071 ## 2 Afghanistan 20 00 2666/20595360 ## 3 Brazil 19 99 37737/172006362 ## 4 Brazil 20 00 80488/174504898 ## 5 China 19 99 212258/1272915272 ## 6 China 20 00 213766/1280428583 ``` ] -- .pull-right[ ```r unite( data = table5, col = "year", century, year ) ``` ``` ## # A tibble: 6 × 3 ## country year rate ## <chr> <chr> <chr> ## 1 Afghanistan 19_99 745/19987071 ## 2 Afghanistan 20_00 2666/20595360 ## 3 Brazil 19_99 37737/172006362 ## 4 Brazil 20_00 80488/174504898 ## 5 China 19_99 212258/1272915272 ## 6 China 20_00 213766/1280428583 ``` ] --- ## Uniting .pull-left[ ```r table5 ``` ``` ## # A tibble: 6 × 4 ## country century year rate ## * <chr> <chr> <chr> <chr> ## 1 Afghanistan 19 99 745/19987071 ## 2 Afghanistan 20 00 2666/20595360 ## 3 Brazil 19 99 37737/172006362 ## 4 Brazil 20 00 80488/174504898 ## 5 China 19 99 212258/1272915272 ## 6 China 20 00 213766/1280428583 ``` ] .pull-right[ ```r unite( data = table5, col = "year", century, year, # remove underscore sep = "" ) ``` ``` ## # A tibble: 6 × 3 ## country year rate ## <chr> <chr> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583 ``` ] --- ## Uniting .pull-left[ ```r table5 ``` ``` ## # A tibble: 6 × 4 ## country century year rate ## * <chr> <chr> <chr> <chr> ## 1 Afghanistan 19 99 745/19987071 ## 2 Afghanistan 20 00 2666/20595360 ## 3 Brazil 19 99 37737/172006362 ## 4 Brazil 20 00 80488/174504898 ## 5 China 19 99 212258/1272915272 ## 6 China 20 00 213766/1280428583 ``` ] .pull-right[ ```r unite( data = table5, col = "year", century, year, # remove underscore sep = "" ) %>% # store as numeric mutate(year = parse_number(year)) ``` ``` ## # A tibble: 6 × 3 ## country year rate ## <chr> <dbl> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583 ``` ] --- # Let's get messy! <img src="https://media.giphy.com/media/fCUCbWXe9JONVsJSUd/giphy.gif" width="50%" style="display: block; margin: auto;" />