Data Transformation With dplyr

Jan 30, 2018

A typical data science project:

nycflights13 data

Available from the nycflights13 package.
336,776 flights that departed from New York City in 2013:

library("tidyverse")

library("nycflights13")
flights

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

dplyr basics

Pick observations by their values: filter().
Reorder the rows: arrange().
Pick variables by their names: select().
Create new variables with functions of existing variables: mutate().
Collapse many values down to a single summary: summarise().

Manipulate rows (cases)

Filter rows with `filter()`

Flights on Jan 1st:

# same as filter(flights, month == 1 & day == 1)
filter(flights, month == 1, day == 1)

## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Flights in Nov or Dec:

filter(flights, month == 11 | month == 12)

## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1        5           2359      6.00      352
##  2  2013    11     1       35           2250    105         123
##  3  2013    11     1      455            500   -  5.00      641
##  4  2013    11     1      539            545   -  6.00      856
##  5  2013    11     1      542            545   -  3.00      831
##  6  2013    11     1      549            600   - 11.0       912
##  7  2013    11     1      550            600   - 10.0       705
##  8  2013    11     1      554            600   -  6.00      659
##  9  2013    11     1      554            600   -  6.00      826
## 10  2013    11     1      554            600   -  6.00      749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Remove rows with duplicate values

One row from each month:

distinct(flights, month, .keep_all = TRUE)

## # A tibble: 12 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013    10     1      447            500   - 13.0       614
##  3  2013    11     1        5           2359      6.00      352
##  4  2013    12     1       13           2359     14.0       446
##  5  2013     2     1      456            500   -  4.00      652
##  6  2013     3     1        4           2159    125         318
##  7  2013     4     1      454            500   -  6.00      636
##  8  2013     5     1        9           1655    434         308
##  9  2013     6     1        2           2359      3.00      341
## 10  2013     7     1        1           2029    212         236
## 11  2013     8     1       12           2130    162         257
## 12  2013     9     1        9           2359     10.0       343
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Sample rows

Randomly select n rows:

sample_n(flights, 10, replace = TRUE)

## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     8     1627           1630   -  3.00     1807
##  2  2013    11     5     1651           1659   -  8.00     1844
##  3  2013    12    12     1211           1220   -  9.00     1410
##  4  2013     9    14      854            900   -  6.00     1001
##  5  2013    11    29     1920           1930   - 10.0      2200
##  6  2013    12     9      905            900      5.00     1106
##  7  2013     7     9     1453           1500   -  7.00     1621
##  8  2013    10    24     1114           1120   -  6.00     1218
##  9  2013     4    13     1107           1110   -  3.00     1238
## 10  2013     8     8     1930           1734    116        2239
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Randomly select fraction of rows:

sample_frac(flights, 0.1, replace = TRUE)

## # A tibble: 33,678 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     3     7     1834           1833      1.00     2135
##  2  2013     7    18      557            604    - 7.00      713
##  3  2013    11     9      900            900      0        1224
##  4  2013     8    30     1614           1620    - 6.00     1917
##  5  2013     9    30     1240           1245    - 5.00     1525
##  6  2013     1    29     1452           1455    - 3.00     1624
##  7  2013     8    28      600            600      0         833
##  8  2013    11     4      819            830    -11.0      1006
##  9  2013    12    25      908            913    - 5.00     1154
## 10  2013     8     8      658            700    - 2.00      835
## # ... with 33,668 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Select rows by position

Select rows by position:

slice(flights, 1:5)

## # A tibble: 5 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515      2.00      830
## 2  2013     1     1      533            529      4.00      850
## 3  2013     1     1      542            540      2.00      923
## 4  2013     1     1      544            545     -1.00     1004
## 5  2013     1     1      554            600     -6.00      812
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Top n rows:

top_n(flights, 5)

## Selecting by time_hour

## # A tibble: 5 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013    12    31       13           2359     14.0       439
## 2  2013    12    31       18           2359     19.0       449
## 3  2013    12    31     2328           2330    - 2.00      412
## 4  2013    12    31     2355           2359    - 4.00      430
## 5  2013    12    31     2356           2359    - 3.00      436
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Arrange rows with `arrange()`

Sort in ascending order:

arrange(flights, year, month, day)

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Sort in descending order:

arrange(flights, desc(arr_delay))

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     7    22     2257            759       898      121
##  9  2013    12     5      756           1700       896     1058
## 10  2013     5     3     1133           2055       878     1250
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Manipulate columns (variables)

Select columns with `select()`

Select columns by variable name:

select(flights, year, month, day)

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

Select columns between two variables:

select(flights, year:day)

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

Select columns except those between two variables:

select(flights, -(year:day))

## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515      2.00      830            819     11.0 
##  2      533            529      4.00      850            830     20.0 
##  3      542            540      2.00      923            850     33.0 
##  4      544            545     -1.00     1004           1022    -18.0 
##  5      554            600     -6.00      812            837    -25.0 
##  6      554            558     -4.00      740            728     12.0 
##  7      555            600     -5.00      913            854     19.0 
##  8      557            600     -3.00      709            723    -14.0 
##  9      557            600     -3.00      838            846    - 8.00
## 10      558            600     -2.00      753            745      8.00
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Select columns by positions:

select(flights, seq(1, 10, by = 2))

## # A tibble: 336,776 x 5
##     year   day sched_dep_time arr_time arr_delay
##    <int> <int>          <int>    <int>     <dbl>
##  1  2013     1            515      830     11.0 
##  2  2013     1            529      850     20.0 
##  3  2013     1            540      923     33.0 
##  4  2013     1            545     1004    -18.0 
##  5  2013     1            600      812    -25.0 
##  6  2013     1            558      740     12.0 
##  7  2013     1            600      913     19.0 
##  8  2013     1            600      709    -14.0 
##  9  2013     1            600      838    - 8.00
## 10  2013     1            600      753      8.00
## # ... with 336,766 more rows

Move variables to the start of data frame:

select(flights, time_hour, air_time, everything())

## # A tibble: 336,776 x 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00    227    2013     1     1      517            515
##  2 2013-01-01 05:00:00    227    2013     1     1      533            529
##  3 2013-01-01 05:00:00    160    2013     1     1      542            540
##  4 2013-01-01 05:00:00    183    2013     1     1      544            545
##  5 2013-01-01 06:00:00    116    2013     1     1      554            600
##  6 2013-01-01 05:00:00    150    2013     1     1      554            558
##  7 2013-01-01 06:00:00    158    2013     1     1      555            600
##  8 2013-01-01 06:00:00     53.0  2013     1     1      557            600
##  9 2013-01-01 06:00:00    140    2013     1     1      557            600
## 10 2013-01-01 06:00:00    138    2013     1     1      558            600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>

Helper functions.
- starts_with("abc"): matches names that begin with “abc”.
- ends_with("xyz"): matches names that end with “xyz”.
- contains("ijk"): matches names that contain “ijk”.
- matches("(.)\\1"): selects variables that match a regular expression.
- num_range("x", 1:3): matches x1, x2 and x3.
- one_of()

Add new variables with `mutate()`

Add variables gain and speed:

flights_sml <- 
  select(flights, year:day, ends_with("delay"), distance, air_time)

mutate(flights_sml,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time   gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>  <dbl> <dbl>
##  1  2013     1     1      2.00     11.0      1400    227     9.00   370
##  2  2013     1     1      4.00     20.0      1416    227    16.0    374
##  3  2013     1     1      2.00     33.0      1089    160    31.0    408
##  4  2013     1     1     -1.00    -18.0      1576    183   -17.0    517
##  5  2013     1     1     -6.00    -25.0       762    116   -19.0    394
##  6  2013     1     1     -4.00     12.0       719    150    16.0    288
##  7  2013     1     1     -5.00     19.0      1065    158    24.0    404
##  8  2013     1     1     -3.00    -14.0       229     53.0 -11.0    259
##  9  2013     1     1     -3.00    - 8.00      944    140   - 5.00   405
## 10  2013     1     1     -2.00      8.00      733    138    10.0    319
## # ... with 336,766 more rows

Refer to columns that you’ve just created:

mutate(flights_sml,
  gain = arr_delay - dep_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

## # A tibble: 336,776 x 10
##     year month   day dep_delay arr_delay distance air_time   gain hours
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>  <dbl> <dbl>
##  1  2013     1     1      2.00     11.0      1400    227     9.00 3.78 
##  2  2013     1     1      4.00     20.0      1416    227    16.0  3.78 
##  3  2013     1     1      2.00     33.0      1089    160    31.0  2.67 
##  4  2013     1     1     -1.00    -18.0      1576    183   -17.0  3.05 
##  5  2013     1     1     -6.00    -25.0       762    116   -19.0  1.93 
##  6  2013     1     1     -4.00     12.0       719    150    16.0  2.50 
##  7  2013     1     1     -5.00     19.0      1065    158    24.0  2.63 
##  8  2013     1     1     -3.00    -14.0       229     53.0 -11.0  0.883
##  9  2013     1     1     -3.00    - 8.00      944    140   - 5.00 2.33 
## 10  2013     1     1     -2.00      8.00      733    138    10.0  2.30 
## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>

Only keep the new variables by transmute():

transmute(flights,
  gain = arr_delay - dep_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

## # A tibble: 336,776 x 3
##      gain hours gain_per_hour
##     <dbl> <dbl>         <dbl>
##  1   9.00 3.78           2.38
##  2  16.0  3.78           4.23
##  3  31.0  2.67          11.6 
##  4 -17.0  3.05         - 5.57
##  5 -19.0  1.93         - 9.83
##  6  16.0  2.50           6.40
##  7  24.0  2.63           9.11
##  8 -11.0  0.883        -12.5 
##  9 - 5.00 2.33         - 2.14
## 10  10.0  2.30           4.35
## # ... with 336,766 more rows

mutate_all(): apply funs to all columns.
```
mutate_all(data, funs(log(.), log2(.)))
```

mutate_at(): apply funs to specifc columns.

mutate_at(data, vars(-Species), funs(log(.)))

mutate_if(): apply funs of one type

mutate_if(data, is.numeric, funs(log(.)))

Summaries

Summaries with `summarise()`

Mean of a variable:

summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6

Convert a tibble into a grouped tibble:

(by_day <- group_by(flights, year, month, day))

## # A tibble: 336,776 x 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Grouped summaries:

summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

Pipe

Consider following analysis:

by_dest <- group_by(flights, dest)
delay <- summarise(by_dest, count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
(delay <- filter(delay, count > 20, dest != "HNL"))

## # A tibble: 96 x 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254  1826  4.38
##  2 ACK     265   199  4.85
##  3 ALB     439   143 14.4 
##  4 ATL   17215   757 11.3 
##  5 AUS    2439  1514  6.02
##  6 AVL     275   584  8.00
##  7 BDL     443   116  7.05
##  8 BGR     375   378  8.03
##  9 BHM     297   866 16.9 
## 10 BNA    6333   758 11.8 
## # ... with 86 more rows

Cleaner code using pipe %>%:

(delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL"))

## # A tibble: 96 x 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254  1826  4.38
##  2 ACK     265   199  4.85
##  3 ALB     439   143 14.4 
##  4 ATL   17215   757 11.3 
##  5 AUS    2439  1514  6.02
##  6 AVL     275   584  8.00
##  7 BDL     443   116  7.05
##  8 BGR     375   378  8.03
##  9 BHM     297   866 16.9 
## 10 BNA    6333   758 11.8 
## # ... with 86 more rows

Unfortunately ggplot2 does not use pipe.

ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) + geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess'

Other summary functions

Location: mean(x), median(x).

not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    avg_delay1 = mean(arr_delay),
    avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
  )

## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day avg_delay1 avg_delay2
##    <int> <int> <int>      <dbl>      <dbl>
##  1  2013     1     1     12.7         32.5
##  2  2013     1     2     12.7         32.0
##  3  2013     1     3      5.73        27.7
##  4  2013     1     4    - 1.93        28.3
##  5  2013     1     5    - 1.53        22.6
##  6  2013     1     6      4.24        24.4
##  7  2013     1     7    - 4.95        27.8
##  8  2013     1     8    - 3.23        20.8
##  9  2013     1     9    - 0.264       25.6
## 10  2013     1    10    - 5.90        27.3
## # ... with 355 more rows

Spread: sd(x), IQR(x), mad(x).

not_cancelled %>% 
  group_by(dest) %>% 
  summarise(distance_sd = sd(distance)) %>% 
  arrange(desc(distance_sd))

## # A tibble: 104 x 2
##    dest  distance_sd
##    <chr>       <dbl>
##  1 EGE         10.5 
##  2 SAN         10.4 
##  3 SFO         10.2 
##  4 HNL         10.0 
##  5 SEA          9.98
##  6 LAS          9.91
##  7 PDX          9.87
##  8 PHX          9.86
##  9 LAX          9.66
## 10 IND          9.46
## # ... with 94 more rows

Rank: min(x), quantile(x, 0.25), max(x).

# When do the first and last flights leave each day?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day  first  last
##    <int> <int> <int>  <dbl> <dbl>
##  1  2013     1     1 517     2356
##  2  2013     1     2  42.0   2354
##  3  2013     1     3  32.0   2349
##  4  2013     1     4  25.0   2358
##  5  2013     1     5  14.0   2357
##  6  2013     1     6  16.0   2355
##  7  2013     1     7  49.0   2359
##  8  2013     1     8 454     2351
##  9  2013     1     9   2.00  2252
## 10  2013     1    10   3.00  2320
## # ... with 355 more rows

Position: first(x), nth(x, 2), last(x).

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )

## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day first_dep last_dep
##    <int> <int> <int>     <int>    <int>
##  1  2013     1     1       517     2356
##  2  2013     1     2        42     2354
##  3  2013     1     3        32     2349
##  4  2013     1     4        25     2358
##  5  2013     1     5        14     2357
##  6  2013     1     6        16     2355
##  7  2013     1     7        49     2359
##  8  2013     1     8       454     2351
##  9  2013     1     9         2     2252
## 10  2013     1    10         3     2320
## # ... with 355 more rows

Count: n(x), sum(!is.na(x)), n_distinct(x).

# Which destinations have the most carriers?
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))

## # A tibble: 104 x 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # ... with 94 more rows

not_cancelled %>% 
  count(dest)

## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

not_cancelled %>% 
  count(tailnum, wt = distance)

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows

# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(n_early = sum(dep_time < 500))

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day n_early
##    <int> <int> <int>   <int>
##  1  2013     1     1       0
##  2  2013     1     2       3
##  3  2013     1     3       4
##  4  2013     1     4       3
##  5  2013     1     5       3
##  6  2013     1     6       2
##  7  2013     1     7       2
##  8  2013     1     8       1
##  9  2013     1     9       3
## 10  2013     1    10       3
## # ... with 355 more rows

# What proportion of flights are delayed by more than an hour?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(hour_perc = mean(arr_delay > 60))

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day hour_perc
##    <int> <int> <int>     <dbl>
##  1  2013     1     1    0.0722
##  2  2013     1     2    0.0851
##  3  2013     1     3    0.0567
##  4  2013     1     4    0.0396
##  5  2013     1     5    0.0349
##  6  2013     1     6    0.0470
##  7  2013     1     7    0.0333
##  8  2013     1     8    0.0213
##  9  2013     1     9    0.0202
## 10  2013     1    10    0.0183
## # ... with 355 more rows

Grouped mutates (and filters)

Find the worst members of each group:

flights_sml %>% 
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)

## # A tibble: 3,306 x 7
## # Groups:   year, month, day [365]
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1       853       851      184     41.0
##  2  2013     1     1       290       338     1134    213  
##  3  2013     1     1       260       263      266     46.0
##  4  2013     1     1       157       174      213     60.0
##  5  2013     1     1       216       222      708    121  
##  6  2013     1     1       255       250      589    115  
##  7  2013     1     1       285       246     1085    146  
##  8  2013     1     1       192       191      199     44.0
##  9  2013     1     1       379       456     1092    222  
## 10  2013     1     2       224       207      550     94.0
## # ... with 3,296 more rows

Find all groups bigger than a threshold:

(popular_dests <- flights %>% 
  group_by(dest) %>% 
  filter(n() > 365))

## # A tibble: 332,577 x 19
## # Groups:   dest [77]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 332,567 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Standardise to compute per group metrics:

popular_dests %>% 
  filter(arr_delay > 0) %>% 
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)

## # A tibble: 131,106 x 6
## # Groups:   dest [77]
##     year month   day dest  arr_delay prop_delay
##    <int> <int> <int> <chr>     <dbl>      <dbl>
##  1  2013     1     1 IAH       11.0   0.000111 
##  2  2013     1     1 IAH       20.0   0.000201 
##  3  2013     1     1 MIA       33.0   0.000235 
##  4  2013     1     1 ORD       12.0   0.0000424
##  5  2013     1     1 FLL       19.0   0.0000938
##  6  2013     1     1 ORD        8.00  0.0000283
##  7  2013     1     1 LAX        7.00  0.0000344
##  8  2013     1     1 DFW       31.0   0.000282 
##  9  2013     1     1 ATL       12.0   0.0000400
## 10  2013     1     1 DTW       16.0   0.000116 
## # ... with 131,096 more rows

Combine tables

nycflights13 has >1 tables:

We already know a lot about flights:

flights

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      544            545     -1.00     1004
##  5  2013     1     1      554            600     -6.00      812
##  6  2013     1     1      554            558     -4.00      740
##  7  2013     1     1      555            600     -5.00      913
##  8  2013     1     1      557            600     -3.00      709
##  9  2013     1     1      557            600     -3.00      838
## 10  2013     1     1      558            600     -2.00      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

airlines:

airlines

## # A tibble: 16 x 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.

airports:

airports

## # A tibble: 1,458 x 8
##    faa   name                   lat    lon   alt    tz dst   tzone        
##    <chr> <chr>                <dbl>  <dbl> <int> <dbl> <chr> <chr>        
##  1 04G   Lansdowne Airport     41.1 - 80.6  1044 -5.00 A     America/New_…
##  2 06A   Moton Field Municip…  32.5 - 85.7   264 -6.00 A     America/Chic…
##  3 06C   Schaumburg Regional   42.0 - 88.1   801 -6.00 A     America/Chic…
##  4 06N   Randall Airport       41.4 - 74.4   523 -5.00 A     America/New_…
##  5 09J   Jekyll Island Airpo…  31.1 - 81.4    11 -5.00 A     America/New_…
##  6 0A9   Elizabethton Munici…  36.4 - 82.2  1593 -5.00 A     America/New_…
##  7 0G6   Williams County Air…  41.5 - 84.5   730 -5.00 A     America/New_…
##  8 0G7   Finger Lakes Region…  42.9 - 76.8   492 -5.00 A     America/New_…
##  9 0P2   Shoestring Aviation…  39.8 - 76.6  1000 -5.00 U     America/New_…
## 10 0S9   Jefferson County In…  48.1 -123     108 -8.00 A     America/Los_…
## # ... with 1,448 more rows

planes:

planes

## # A tibble: 3,322 x 9
##    tailnum  year type      manufacturer  model  engines seats speed engine
##    <chr>   <int> <chr>     <chr>         <chr>    <int> <int> <int> <chr> 
##  1 N10156   2004 Fixed wi… EMBRAER       EMB-1…       2    55    NA Turbo…
##  2 N102UW   1998 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  3 N103US   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  4 N104UW   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  5 N10575   2002 Fixed wi… EMBRAER       EMB-1…       2    55    NA Turbo…
##  6 N105UW   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  7 N107US   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  8 N108UW   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
##  9 N109UW   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
## 10 N110UW   1999 Fixed wi… AIRBUS INDUS… A320-…       2   182    NA Turbo…
## # ... with 3,312 more rows

Weather:

weather

## # A tibble: 26,130 x 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013  1.00     1     0  37.0  21.9  54.0      230      10.4 
##  2 EWR     2013  1.00     1     1  37.0  21.9  54.0      230      13.8 
##  3 EWR     2013  1.00     1     2  37.9  21.9  52.1      230      12.7 
##  4 EWR     2013  1.00     1     3  37.9  23.0  54.5      230      13.8 
##  5 EWR     2013  1.00     1     4  37.9  24.1  57.0      240      15.0 
##  6 EWR     2013  1.00     1     6  39.0  26.1  59.4      270      10.4 
##  7 EWR     2013  1.00     1     7  39.0  27.0  61.6      250       8.06
##  8 EWR     2013  1.00     1     8  39.0  28.0  64.4      240      11.5 
##  9 EWR     2013  1.00     1     9  39.9  28.0  62.2      250      12.7 
## 10 EWR     2013  1.00     1    10  39.0  28.0  64.4      260      12.7 
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

Relational data

Keys

A primary key uniquely identifies an observation in its own table.
A foreign key uniquely identifies an observation in another table.

Combine variables (columns)

Demo tables

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)

Inner join

An inner join matches pairs of observations whenever their keys are equal:

inner_join(x, y, by = "key")

## # A tibble: 2 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1  1.00 x1    y1   
## 2  2.00 x2    y2

Same as

x %>% inner_join(y, by = "key")

Outer join

An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
- A left join keeps all observations in x.
- A right join keeps all observations in y.
- A full join keeps all observations in x or y.

Duplicate keys

One table has duplicate keys.

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2"
)
left_join(x, y, by = "key")

## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1  1.00 x1    y1   
## 2  2.00 x2    y2   
## 3  2.00 x3    y2   
## 4  1.00 x4    y1

Both tables have duplicate keys. You get all possible combinations, the Cartesian product:

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)
left_join(x, y, by = "key")

## # A tibble: 6 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1  1.00 x1    y1   
## 2  2.00 x2    y2   
## 3  2.00 x2    y3   
## 4  2.00 x3    y2   
## 5  2.00 x3    y3   
## 6  3.00 x4    y4

Let's create a narrower table from the flights data:

flights2 <- flights %>% 
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2

## # A tibble: 336,776 x 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1  5.00 EWR    IAH   N14228  UA     
##  2  2013     1     1  5.00 LGA    IAH   N24211  UA     
##  3  2013     1     1  5.00 JFK    MIA   N619AA  AA     
##  4  2013     1     1  5.00 JFK    BQN   N804JB  B6     
##  5  2013     1     1  6.00 LGA    ATL   N668DN  DL     
##  6  2013     1     1  5.00 EWR    ORD   N39463  UA     
##  7  2013     1     1  6.00 EWR    FLL   N516JB  B6     
##  8  2013     1     1  6.00 LGA    IAD   N829AS  EV     
##  9  2013     1     1  6.00 JFK    MCO   N593JB  B6     
## 10  2013     1     1  6.00 LGA    ORD   N3ALAA  AA     
## # ... with 336,766 more rows

Defining the key columns

by = NULL (default): use all variables that appear in both tables:

# same as: flights2 %>% left_join(weather)
left_join(flights2, weather)

## Joining, by = c("year", "month", "day", "hour", "origin")

## # A tibble: 336,776 x 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <dbl> <dbl> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013  1.00     1  5.00 EWR    IAH   N14228  UA       NA    NA    NA  
##  2  2013  1.00     1  5.00 LGA    IAH   N24211  UA       NA    NA    NA  
##  3  2013  1.00     1  5.00 JFK    MIA   N619AA  AA       NA    NA    NA  
##  4  2013  1.00     1  5.00 JFK    BQN   N804JB  B6       NA    NA    NA  
##  5  2013  1.00     1  6.00 LGA    ATL   N668DN  DL       39.9  26.1  57.3
##  6  2013  1.00     1  5.00 EWR    ORD   N39463  UA       NA    NA    NA  
##  7  2013  1.00     1  6.00 EWR    FLL   N516JB  B6       39.0  26.1  59.4
##  8  2013  1.00     1  6.00 LGA    IAD   N829AS  EV       39.9  26.1  57.3
##  9  2013  1.00     1  6.00 JFK    MCO   N593JB  B6       39.0  26.1  59.4
## 10  2013  1.00     1  6.00 LGA    ORD   N3ALAA  AA       39.9  26.1  57.3
## # ... with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

by = "x": use the common variable x:

# same as: flights2 %>% left_join(weather)
left_join(flights2, planes, by = "tailnum")

## # A tibble: 336,776 x 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type      
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>     
##  1   2013     1     1  5.00 EWR    IAH   N14228  UA        1999 Fixed win…
##  2   2013     1     1  5.00 LGA    IAH   N24211  UA        1998 Fixed win…
##  3   2013     1     1  5.00 JFK    MIA   N619AA  AA        1990 Fixed win…
##  4   2013     1     1  5.00 JFK    BQN   N804JB  B6        2012 Fixed win…
##  5   2013     1     1  6.00 LGA    ATL   N668DN  DL        1991 Fixed win…
##  6   2013     1     1  5.00 EWR    ORD   N39463  UA        2012 Fixed win…
##  7   2013     1     1  6.00 EWR    FLL   N516JB  B6        2000 Fixed win…
##  8   2013     1     1  6.00 LGA    IAD   N829AS  EV        1998 Fixed win…
##  9   2013     1     1  6.00 JFK    MCO   N593JB  B6        2004 Fixed win…
## 10   2013     1     1  6.00 LGA    ORD   N3ALAA  AA          NA <NA>      
## # ... with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>

by = c("a" = "b"): match variable a in table x to the variable b in table y.

# same as: flights2 %>% left_join(weather)
left_join(flights2, airports, by = c("dest" = "faa"))

## # A tibble: 336,776 x 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl>
##  1  2013     1     1  5.00 EWR    IAH   N14228  UA      Georg…  30.0 -95.3
##  2  2013     1     1  5.00 LGA    IAH   N24211  UA      Georg…  30.0 -95.3
##  3  2013     1     1  5.00 JFK    MIA   N619AA  AA      Miami…  25.8 -80.3
##  4  2013     1     1  5.00 JFK    BQN   N804JB  B6      <NA>    NA    NA  
##  5  2013     1     1  6.00 LGA    ATL   N668DN  DL      Harts…  33.6 -84.4
##  6  2013     1     1  5.00 EWR    ORD   N39463  UA      Chica…  42.0 -87.9
##  7  2013     1     1  6.00 EWR    FLL   N516JB  B6      Fort …  26.1 -80.2
##  8  2013     1     1  6.00 LGA    IAD   N829AS  EV      Washi…  38.9 -77.5
##  9  2013     1     1  6.00 JFK    MCO   N593JB  B6      Orlan…  28.4 -81.3
## 10  2013     1     1  6.00 LGA    ORD   N3ALAA  AA      Chica…  42.0 -87.9
## # ... with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>,
## #   dst <chr>, tzone <chr>

Combine cases (rows)

Top 10 most popular destinations:

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest

## # A tibble: 10 x 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705

How to filter the cases that fly to these destinations?

Semi-join

semi_join(x, y) keesp the rows in x that have a match in y.
Useful to see what will be joined.

semi_join(flights, top_dest)

## Joining, by = "dest"

## # A tibble: 141,145 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      542            540      2.00      923
##  2  2013     1     1      554            600     -6.00      812
##  3  2013     1     1      554            558     -4.00      740
##  4  2013     1     1      555            600     -5.00      913
##  5  2013     1     1      557            600     -3.00      838
##  6  2013     1     1      558            600     -2.00      753
##  7  2013     1     1      558            600     -2.00      924
##  8  2013     1     1      558            600     -2.00      923
##  9  2013     1     1      559            559      0         702
## 10  2013     1     1      600            600      0         851
## # ... with 141,135 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Anti-join

anti_join(x, y) keeps the rows that don’t have a match.
Useful to see what will not be joined.

flights %>% 
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)

## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ... with 712 more rows

Set operations

Generate two tables:

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)

bind_rows(x, y) stacks table x one on top of y.

bind_rows(df1, df2)

## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1  1.00  1.00
## 2  2.00  1.00
## 3  1.00  1.00
## 4  1.00  2.00

intersect(x, y) returns rows that appear in both x and y.

intersect(df1, df2)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1  1.00  1.00

union(x, y) returns unique observations in x and y.

union(df1, df2)

## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1  1.00  2.00
## 2  2.00  1.00
## 3  1.00  1.00

setdiff(x, y) returns rows that appear in x but not in y.

setdiff(df1, df2)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1  2.00  1.00

setdiff(df2, df1)

## # A tibble: 1 x 2
##       x     y
##   <dbl> <dbl>
## 1  1.00  2.00