+ - 0:00:00
Notes for current slide
Notes for next slide

Data wrangling: tidy data

MACS 30500
University of Chicago

1 / 23

Importing data in R

2 / 23

readr vs. base R

3 / 23

readr vs. base R

4 / 23

Alternative file formats

  • CSV
  • RDS
  • Feather
  • Excel
  • SPSS/Stata
5 / 23

challenge

## # A tibble: 2,000 × 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # … with 1,990 more rows
6 / 23

RDS

# compare file size
file.info(here("static", "data", "challenge.rds"))$size %>%
utils:::format.object_size("auto")
## [1] "11.6 Kb"
file.info(here("static", "data", "challenge.csv"))$size %>%
utils:::format.object_size("auto")
## [1] "37.1 Kb"
7 / 23

RDS

8 / 23

feather

9 / 23

readxl

library(readxl)
xlsx_example <- readxl_example(path = "datasets.xlsx")
read_excel(path = xlsx_example)
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
10 / 23

readxl

excel_sheets(path = xlsx_example)
## [1] "iris" "mtcars" "chickwts" "quakes"
read_excel(path = xlsx_example, sheet = "chickwts")
## # A tibble: 71 × 2
## weight feed
## <dbl> <chr>
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
## 7 108 horsebean
## 8 124 horsebean
## 9 143 horsebean
## 10 140 horsebean
## # … with 61 more rows
11 / 23

haven and SAS

library(haven)
read_sas(data_file = system.file("examples", "iris.sas7bdat",
package = "haven"
))
## # A tibble: 150 × 5
## Sepal_Length Sepal_Width Petal_Length Petal_Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
12 / 23

haven and SPSS

read_sav(file = system.file("examples", "iris.sav",
package = "haven"
))
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <dbl+lbl>
## 1 5.1 3.5 1.4 0.2 1 [setosa]
## 2 4.9 3 1.4 0.2 1 [setosa]
## 3 4.7 3.2 1.3 0.2 1 [setosa]
## 4 4.6 3.1 1.5 0.2 1 [setosa]
## 5 5 3.6 1.4 0.2 1 [setosa]
## 6 5.4 3.9 1.7 0.4 1 [setosa]
## 7 4.6 3.4 1.4 0.3 1 [setosa]
## 8 5 3.4 1.5 0.2 1 [setosa]
## 9 4.4 2.9 1.4 0.2 1 [setosa]
## 10 4.9 3.1 1.5 0.1 1 [setosa]
## # … with 140 more rows
13 / 23

haven and Stata

read_dta(file = system.file("examples", "iris.dta",
package = "haven"
))
## # A tibble: 150 × 5
## sepallength sepalwidth petallength petalwidth species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.5 1.40 0.200 setosa
## 2 4.90 3 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.5 0.200 setosa
## 5 5 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5 3.40 1.5 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.5 0.100 setosa
## # … with 140 more rows
14 / 23

Tidy data

15 / 23

Tidy data

Source: Allison Horst

16 / 23

Tidy data

Source: Allison Horst

17 / 23

Common tidying tasks

  • Pivoting
    • Longer
    • Wider
  • Separating
  • Uniting
18 / 23

Pivot longer

table4a
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
19 / 23

Pivot longer

table4a
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
pivot_longer(
data = table4a,
cols = c(`1999`, `2000`),
names_to = "year",
values_to = "cases"
)
## # A tibble: 6 × 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
19 / 23

Pivot wider

table2
## # A tibble: 12 × 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
20 / 23

Pivot wider

table2
## # A tibble: 12 × 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
pivot_wider(
data = table2,
names_from = type,
values_from = count
)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
20 / 23

Separating

table3
## # A tibble: 6 × 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
21 / 23

Separating

table3
## # A tibble: 6 × 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
separate(
data = table3,
col = rate,
into = c(
"cases",
"population"
),
convert = TRUE
)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
21 / 23

Uniting

table5
## # A tibble: 6 × 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
22 / 23

Uniting

table5
## # A tibble: 6 × 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
unite(
data = table5,
col = "year",
century, year
)
## # A tibble: 6 × 3
## country year rate
## <chr> <chr> <chr>
## 1 Afghanistan 19_99 745/19987071
## 2 Afghanistan 20_00 2666/20595360
## 3 Brazil 19_99 37737/172006362
## 4 Brazil 20_00 80488/174504898
## 5 China 19_99 212258/1272915272
## 6 China 20_00 213766/1280428583
22 / 23

Uniting

table5
## # A tibble: 6 × 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
# remove underscore
unite(
data = table5,
col = "year",
century, year,
sep = ""
) %>%
mutate(year = parse_number(year))
## # A tibble: 6 × 3
## country year rate
## <chr> <dbl> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
23 / 23

Importing data in R

2 / 23
Paused

Help

Keyboard shortcuts

, , Pg Up, k Go to previous slide
, , Pg Dn, Space, j Go to next slide
Home Go to first slide
End Go to last slide
Number + Return Go to specific slide
b / m / f Toggle blackout / mirrored / fullscreen mode
c Clone slideshow
p Toggle presenter mode
t Restart the presentation timer
?, h Toggle this help
Esc Back to slideshow