A typical data science project:
Jan 30, 2018
A typical data science project:
Available from the nycflights13 package.
336,776 flights that departed from New York City in 2013:
library("tidyverse")
library("nycflights13") flights
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Pick observations by their values: filter()
.
Reorder the rows: arrange()
.
Pick variables by their names: select()
.
Create new variables with functions of existing variables: mutate()
.
Collapse many values down to a single summary: summarise()
.
filter()
Flights on Jan 1st:
# same as filter(flights, month == 1 & day == 1) filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Flights in Nov or Dec:
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 11 1 5 2359 6.00 352 ## 2 2013 11 1 35 2250 105 123 ## 3 2013 11 1 455 500 - 5.00 641 ## 4 2013 11 1 539 545 - 6.00 856 ## 5 2013 11 1 542 545 - 3.00 831 ## 6 2013 11 1 549 600 - 11.0 912 ## 7 2013 11 1 550 600 - 10.0 705 ## 8 2013 11 1 554 600 - 6.00 659 ## 9 2013 11 1 554 600 - 6.00 826 ## 10 2013 11 1 554 600 - 6.00 749 ## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
One row from each month:
distinct(flights, month, .keep_all = TRUE)
## # A tibble: 12 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 10 1 447 500 - 13.0 614 ## 3 2013 11 1 5 2359 6.00 352 ## 4 2013 12 1 13 2359 14.0 446 ## 5 2013 2 1 456 500 - 4.00 652 ## 6 2013 3 1 4 2159 125 318 ## 7 2013 4 1 454 500 - 6.00 636 ## 8 2013 5 1 9 1655 434 308 ## 9 2013 6 1 2 2359 3.00 341 ## 10 2013 7 1 1 2029 212 236 ## 11 2013 8 1 12 2130 162 257 ## 12 2013 9 1 9 2359 10.0 343 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
Randomly select n
rows:
sample_n(flights, 10, replace = TRUE)
## # A tibble: 10 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 11 8 1627 1630 - 3.00 1807 ## 2 2013 11 5 1651 1659 - 8.00 1844 ## 3 2013 12 12 1211 1220 - 9.00 1410 ## 4 2013 9 14 854 900 - 6.00 1001 ## 5 2013 11 29 1920 1930 - 10.0 2200 ## 6 2013 12 9 905 900 5.00 1106 ## 7 2013 7 9 1453 1500 - 7.00 1621 ## 8 2013 10 24 1114 1120 - 6.00 1218 ## 9 2013 4 13 1107 1110 - 3.00 1238 ## 10 2013 8 8 1930 1734 116 2239 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
Randomly select fraction of rows:
sample_frac(flights, 0.1, replace = TRUE)
## # A tibble: 33,678 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 3 7 1834 1833 1.00 2135 ## 2 2013 7 18 557 604 - 7.00 713 ## 3 2013 11 9 900 900 0 1224 ## 4 2013 8 30 1614 1620 - 6.00 1917 ## 5 2013 9 30 1240 1245 - 5.00 1525 ## 6 2013 1 29 1452 1455 - 3.00 1624 ## 7 2013 8 28 600 600 0 833 ## 8 2013 11 4 819 830 -11.0 1006 ## 9 2013 12 25 908 913 - 5.00 1154 ## 10 2013 8 8 658 700 - 2.00 835 ## # ... with 33,668 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Select rows by position:
slice(flights, 1:5)
## # A tibble: 5 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
Top n
rows:
top_n(flights, 5)
## Selecting by time_hour
## # A tibble: 5 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 12 31 13 2359 14.0 439 ## 2 2013 12 31 18 2359 19.0 449 ## 3 2013 12 31 2328 2330 - 2.00 412 ## 4 2013 12 31 2355 2359 - 4.00 430 ## 5 2013 12 31 2356 2359 - 3.00 436 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
arrange()
Sort in ascending order:
arrange(flights, year, month, day)
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Sort in descending order:
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 9 641 900 1301 1242 ## 2 2013 6 15 1432 1935 1137 1607 ## 3 2013 1 10 1121 1635 1126 1239 ## 4 2013 9 20 1139 1845 1014 1457 ## 5 2013 7 22 845 1600 1005 1044 ## 6 2013 4 10 1100 1900 960 1342 ## 7 2013 3 17 2321 810 911 135 ## 8 2013 7 22 2257 759 898 121 ## 9 2013 12 5 756 1700 896 1058 ## 10 2013 5 3 1133 2055 878 1250 ## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
select()
Select columns by variable name:
select(flights, year, month, day)
## # A tibble: 336,776 x 3 ## year month day ## <int> <int> <int> ## 1 2013 1 1 ## 2 2013 1 1 ## 3 2013 1 1 ## 4 2013 1 1 ## 5 2013 1 1 ## 6 2013 1 1 ## 7 2013 1 1 ## 8 2013 1 1 ## 9 2013 1 1 ## 10 2013 1 1 ## # ... with 336,766 more rows
Select columns between two variables:
select(flights, year:day)
## # A tibble: 336,776 x 3 ## year month day ## <int> <int> <int> ## 1 2013 1 1 ## 2 2013 1 1 ## 3 2013 1 1 ## 4 2013 1 1 ## 5 2013 1 1 ## 6 2013 1 1 ## 7 2013 1 1 ## 8 2013 1 1 ## 9 2013 1 1 ## 10 2013 1 1 ## # ... with 336,766 more rows
Select columns except those between two variables:
select(flights, -(year:day))
## # A tibble: 336,776 x 16 ## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay ## <int> <int> <dbl> <int> <int> <dbl> ## 1 517 515 2.00 830 819 11.0 ## 2 533 529 4.00 850 830 20.0 ## 3 542 540 2.00 923 850 33.0 ## 4 544 545 -1.00 1004 1022 -18.0 ## 5 554 600 -6.00 812 837 -25.0 ## 6 554 558 -4.00 740 728 12.0 ## 7 555 600 -5.00 913 854 19.0 ## 8 557 600 -3.00 709 723 -14.0 ## 9 557 600 -3.00 838 846 - 8.00 ## 10 558 600 -2.00 753 745 8.00 ## # ... with 336,766 more rows, and 10 more variables: carrier <chr>, ## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Select columns by positions:
select(flights, seq(1, 10, by = 2))
## # A tibble: 336,776 x 5 ## year day sched_dep_time arr_time arr_delay ## <int> <int> <int> <int> <dbl> ## 1 2013 1 515 830 11.0 ## 2 2013 1 529 850 20.0 ## 3 2013 1 540 923 33.0 ## 4 2013 1 545 1004 -18.0 ## 5 2013 1 600 812 -25.0 ## 6 2013 1 558 740 12.0 ## 7 2013 1 600 913 19.0 ## 8 2013 1 600 709 -14.0 ## 9 2013 1 600 838 - 8.00 ## 10 2013 1 600 753 8.00 ## # ... with 336,766 more rows
Move variables to the start of data frame:
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19 ## time_hour air_time year month day dep_time sched_dep_time ## <dttm> <dbl> <int> <int> <int> <int> <int> ## 1 2013-01-01 05:00:00 227 2013 1 1 517 515 ## 2 2013-01-01 05:00:00 227 2013 1 1 533 529 ## 3 2013-01-01 05:00:00 160 2013 1 1 542 540 ## 4 2013-01-01 05:00:00 183 2013 1 1 544 545 ## 5 2013-01-01 06:00:00 116 2013 1 1 554 600 ## 6 2013-01-01 05:00:00 150 2013 1 1 554 558 ## 7 2013-01-01 06:00:00 158 2013 1 1 555 600 ## 8 2013-01-01 06:00:00 53.0 2013 1 1 557 600 ## 9 2013-01-01 06:00:00 140 2013 1 1 557 600 ## 10 2013-01-01 06:00:00 138 2013 1 1 558 600 ## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>, ## # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, ## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>, ## # hour <dbl>, minute <dbl>
Helper functions.
starts_with("abc")
: matches names that begin with “abc”.
ends_with("xyz")
: matches names that end with “xyz”.
contains("ijk")
: matches names that contain “ijk”.
matches("(.)\\1")
: selects variables that match a regular expression.
num_range("x", 1:3)
: matches x1, x2 and x3.
one_of()
mutate()
Add variables gain
and speed
:
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time)
mutate(flights_sml, gain = arr_delay - dep_delay, speed = distance / air_time * 60 )
## # A tibble: 336,776 x 9 ## year month day dep_delay arr_delay distance air_time gain speed ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 2.00 11.0 1400 227 9.00 370 ## 2 2013 1 1 4.00 20.0 1416 227 16.0 374 ## 3 2013 1 1 2.00 33.0 1089 160 31.0 408 ## 4 2013 1 1 -1.00 -18.0 1576 183 -17.0 517 ## 5 2013 1 1 -6.00 -25.0 762 116 -19.0 394 ## 6 2013 1 1 -4.00 12.0 719 150 16.0 288 ## 7 2013 1 1 -5.00 19.0 1065 158 24.0 404 ## 8 2013 1 1 -3.00 -14.0 229 53.0 -11.0 259 ## 9 2013 1 1 -3.00 - 8.00 944 140 - 5.00 405 ## 10 2013 1 1 -2.00 8.00 733 138 10.0 319 ## # ... with 336,766 more rows
Refer to columns that you’ve just created:
mutate(flights_sml, gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours )
## # A tibble: 336,776 x 10 ## year month day dep_delay arr_delay distance air_time gain hours ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 2.00 11.0 1400 227 9.00 3.78 ## 2 2013 1 1 4.00 20.0 1416 227 16.0 3.78 ## 3 2013 1 1 2.00 33.0 1089 160 31.0 2.67 ## 4 2013 1 1 -1.00 -18.0 1576 183 -17.0 3.05 ## 5 2013 1 1 -6.00 -25.0 762 116 -19.0 1.93 ## 6 2013 1 1 -4.00 12.0 719 150 16.0 2.50 ## 7 2013 1 1 -5.00 19.0 1065 158 24.0 2.63 ## 8 2013 1 1 -3.00 -14.0 229 53.0 -11.0 0.883 ## 9 2013 1 1 -3.00 - 8.00 944 140 - 5.00 2.33 ## 10 2013 1 1 -2.00 8.00 733 138 10.0 2.30 ## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
Only keep the new variables by transmute()
:
transmute(flights, gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours )
## # A tibble: 336,776 x 3 ## gain hours gain_per_hour ## <dbl> <dbl> <dbl> ## 1 9.00 3.78 2.38 ## 2 16.0 3.78 4.23 ## 3 31.0 2.67 11.6 ## 4 -17.0 3.05 - 5.57 ## 5 -19.0 1.93 - 9.83 ## 6 16.0 2.50 6.40 ## 7 24.0 2.63 9.11 ## 8 -11.0 0.883 -12.5 ## 9 - 5.00 2.33 - 2.14 ## 10 10.0 2.30 4.35 ## # ... with 336,766 more rows
mutate_all()
: apply funs to all columns.
mutate_all(data, funs(log(.), log2(.)))
mutate_at()
: apply funs to specifc columns.
mutate_at(data, vars(-Species), funs(log(.)))
mutate_if()
: apply funs of one type
mutate_if(data, is.numeric, funs(log(.)))
summarise()
Mean of a variable:
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1 ## delay ## <dbl> ## 1 12.6
Convert a tibble into a grouped tibble:
(by_day <- group_by(flights, year, month, day))
## # A tibble: 336,776 x 19 ## # Groups: year, month, day [365] ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Grouped summaries:
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4 ## # Groups: year, month [?] ## year month day delay ## <int> <int> <int> <dbl> ## 1 2013 1 1 11.5 ## 2 2013 1 2 13.9 ## 3 2013 1 3 11.0 ## 4 2013 1 4 8.95 ## 5 2013 1 5 5.73 ## 6 2013 1 6 7.15 ## 7 2013 1 7 5.42 ## 8 2013 1 8 2.55 ## 9 2013 1 9 2.28 ## 10 2013 1 10 2.84 ## # ... with 355 more rows
Consider following analysis:
by_dest <- group_by(flights, dest) delay <- summarise(by_dest, count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) (delay <- filter(delay, count > 20, dest != "HNL"))
## # A tibble: 96 x 4 ## dest count dist delay ## <chr> <int> <dbl> <dbl> ## 1 ABQ 254 1826 4.38 ## 2 ACK 265 199 4.85 ## 3 ALB 439 143 14.4 ## 4 ATL 17215 757 11.3 ## 5 AUS 2439 1514 6.02 ## 6 AVL 275 584 8.00 ## 7 BDL 443 116 7.05 ## 8 BGR 375 378 8.03 ## 9 BHM 297 866 16.9 ## 10 BNA 6333 758 11.8 ## # ... with 86 more rows
Cleaner code using pipe %>%
:
(delays <- flights %>% group_by(dest) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter(count > 20, dest != "HNL"))
## # A tibble: 96 x 4 ## dest count dist delay ## <chr> <int> <dbl> <dbl> ## 1 ABQ 254 1826 4.38 ## 2 ACK 265 199 4.85 ## 3 ALB 439 143 14.4 ## 4 ATL 17215 757 11.3 ## 5 AUS 2439 1514 6.02 ## 6 AVL 275 584 8.00 ## 7 BDL 443 116 7.05 ## 8 BGR 375 378 8.03 ## 9 BHM 297 866 16.9 ## 10 BNA 6333 758 11.8 ## # ... with 86 more rows
Unfortunately ggplot2 does not use pipe.
ggplot(data = delays, mapping = aes(x = dist, y = delay)) + geom_point(aes(size = count), alpha = 1/3) + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess'
Location: mean(x)
, median(x)
.
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>% group_by(year, month, day) %>% summarise( avg_delay1 = mean(arr_delay), avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay )
## # A tibble: 365 x 5 ## # Groups: year, month [?] ## year month day avg_delay1 avg_delay2 ## <int> <int> <int> <dbl> <dbl> ## 1 2013 1 1 12.7 32.5 ## 2 2013 1 2 12.7 32.0 ## 3 2013 1 3 5.73 27.7 ## 4 2013 1 4 - 1.93 28.3 ## 5 2013 1 5 - 1.53 22.6 ## 6 2013 1 6 4.24 24.4 ## 7 2013 1 7 - 4.95 27.8 ## 8 2013 1 8 - 3.23 20.8 ## 9 2013 1 9 - 0.264 25.6 ## 10 2013 1 10 - 5.90 27.3 ## # ... with 355 more rows
Spread: sd(x)
, IQR(x)
, mad(x)
.
not_cancelled %>% group_by(dest) %>% summarise(distance_sd = sd(distance)) %>% arrange(desc(distance_sd))
## # A tibble: 104 x 2 ## dest distance_sd ## <chr> <dbl> ## 1 EGE 10.5 ## 2 SAN 10.4 ## 3 SFO 10.2 ## 4 HNL 10.0 ## 5 SEA 9.98 ## 6 LAS 9.91 ## 7 PDX 9.87 ## 8 PHX 9.86 ## 9 LAX 9.66 ## 10 IND 9.46 ## # ... with 94 more rows
Rank: min(x)
, quantile(x, 0.25)
, max(x)
.
# When do the first and last flights leave each day? not_cancelled %>% group_by(year, month, day) %>% summarise( first = min(dep_time), last = max(dep_time) )
## # A tibble: 365 x 5 ## # Groups: year, month [?] ## year month day first last ## <int> <int> <int> <dbl> <dbl> ## 1 2013 1 1 517 2356 ## 2 2013 1 2 42.0 2354 ## 3 2013 1 3 32.0 2349 ## 4 2013 1 4 25.0 2358 ## 5 2013 1 5 14.0 2357 ## 6 2013 1 6 16.0 2355 ## 7 2013 1 7 49.0 2359 ## 8 2013 1 8 454 2351 ## 9 2013 1 9 2.00 2252 ## 10 2013 1 10 3.00 2320 ## # ... with 355 more rows
Position: first(x)
, nth(x, 2)
, last(x)
.
not_cancelled %>% group_by(year, month, day) %>% summarise( first_dep = first(dep_time), last_dep = last(dep_time) )
## # A tibble: 365 x 5 ## # Groups: year, month [?] ## year month day first_dep last_dep ## <int> <int> <int> <int> <int> ## 1 2013 1 1 517 2356 ## 2 2013 1 2 42 2354 ## 3 2013 1 3 32 2349 ## 4 2013 1 4 25 2358 ## 5 2013 1 5 14 2357 ## 6 2013 1 6 16 2355 ## 7 2013 1 7 49 2359 ## 8 2013 1 8 454 2351 ## 9 2013 1 9 2 2252 ## 10 2013 1 10 3 2320 ## # ... with 355 more rows
Count: n(x)
, sum(!is.na(x))
, n_distinct(x)
.
# Which destinations have the most carriers? not_cancelled %>% group_by(dest) %>% summarise(carriers = n_distinct(carrier)) %>% arrange(desc(carriers))
## # A tibble: 104 x 2 ## dest carriers ## <chr> <int> ## 1 ATL 7 ## 2 BOS 7 ## 3 CLT 7 ## 4 ORD 7 ## 5 TPA 7 ## 6 AUS 6 ## 7 DCA 6 ## 8 DTW 6 ## 9 IAD 6 ## 10 MSP 6 ## # ... with 94 more rows
not_cancelled %>% count(dest)
## # A tibble: 104 x 2 ## dest n ## <chr> <int> ## 1 ABQ 254 ## 2 ACK 264 ## 3 ALB 418 ## 4 ANC 8 ## 5 ATL 16837 ## 6 AUS 2411 ## 7 AVL 261 ## 8 BDL 412 ## 9 BGR 358 ## 10 BHM 269 ## # ... with 94 more rows
not_cancelled %>% count(tailnum, wt = distance)
## # A tibble: 4,037 x 2 ## tailnum n ## <chr> <dbl> ## 1 D942DN 3418 ## 2 N0EGMQ 239143 ## 3 N10156 109664 ## 4 N102UW 25722 ## 5 N103US 24619 ## 6 N104UW 24616 ## 7 N10575 139903 ## 8 N105UW 23618 ## 9 N107US 21677 ## 10 N108UW 32070 ## # ... with 4,027 more rows
# How many flights left before 5am? (these usually indicate delayed # flights from the previous day) not_cancelled %>% group_by(year, month, day) %>% summarise(n_early = sum(dep_time < 500))
## # A tibble: 365 x 4 ## # Groups: year, month [?] ## year month day n_early ## <int> <int> <int> <int> ## 1 2013 1 1 0 ## 2 2013 1 2 3 ## 3 2013 1 3 4 ## 4 2013 1 4 3 ## 5 2013 1 5 3 ## 6 2013 1 6 2 ## 7 2013 1 7 2 ## 8 2013 1 8 1 ## 9 2013 1 9 3 ## 10 2013 1 10 3 ## # ... with 355 more rows
# What proportion of flights are delayed by more than an hour? not_cancelled %>% group_by(year, month, day) %>% summarise(hour_perc = mean(arr_delay > 60))
## # A tibble: 365 x 4 ## # Groups: year, month [?] ## year month day hour_perc ## <int> <int> <int> <dbl> ## 1 2013 1 1 0.0722 ## 2 2013 1 2 0.0851 ## 3 2013 1 3 0.0567 ## 4 2013 1 4 0.0396 ## 5 2013 1 5 0.0349 ## 6 2013 1 6 0.0470 ## 7 2013 1 7 0.0333 ## 8 2013 1 8 0.0213 ## 9 2013 1 9 0.0202 ## 10 2013 1 10 0.0183 ## # ... with 355 more rows
Find the worst members of each group:
flights_sml %>% group_by(year, month, day) %>% filter(rank(desc(arr_delay)) < 10)
## # A tibble: 3,306 x 7 ## # Groups: year, month, day [365] ## year month day dep_delay arr_delay distance air_time ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 853 851 184 41.0 ## 2 2013 1 1 290 338 1134 213 ## 3 2013 1 1 260 263 266 46.0 ## 4 2013 1 1 157 174 213 60.0 ## 5 2013 1 1 216 222 708 121 ## 6 2013 1 1 255 250 589 115 ## 7 2013 1 1 285 246 1085 146 ## 8 2013 1 1 192 191 199 44.0 ## 9 2013 1 1 379 456 1092 222 ## 10 2013 1 2 224 207 550 94.0 ## # ... with 3,296 more rows
Find all groups bigger than a threshold:
(popular_dests <- flights %>% group_by(dest) %>% filter(n() > 365))
## # A tibble: 332,577 x 19 ## # Groups: dest [77] ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 332,567 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
Standardise to compute per group metrics:
popular_dests %>% filter(arr_delay > 0) %>% mutate(prop_delay = arr_delay / sum(arr_delay)) %>% select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 x 6 ## # Groups: dest [77] ## year month day dest arr_delay prop_delay ## <int> <int> <int> <chr> <dbl> <dbl> ## 1 2013 1 1 IAH 11.0 0.000111 ## 2 2013 1 1 IAH 20.0 0.000201 ## 3 2013 1 1 MIA 33.0 0.000235 ## 4 2013 1 1 ORD 12.0 0.0000424 ## 5 2013 1 1 FLL 19.0 0.0000938 ## 6 2013 1 1 ORD 8.00 0.0000283 ## 7 2013 1 1 LAX 7.00 0.0000344 ## 8 2013 1 1 DFW 31.0 0.000282 ## 9 2013 1 1 ATL 12.0 0.0000400 ## 10 2013 1 1 DTW 16.0 0.000116 ## # ... with 131,096 more rows
nycflights13 has >1 tables:
We already know a lot about flights:
flights
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2.00 830 ## 2 2013 1 1 533 529 4.00 850 ## 3 2013 1 1 542 540 2.00 923 ## 4 2013 1 1 544 545 -1.00 1004 ## 5 2013 1 1 554 600 -6.00 812 ## 6 2013 1 1 554 558 -4.00 740 ## 7 2013 1 1 555 600 -5.00 913 ## 8 2013 1 1 557 600 -3.00 709 ## 9 2013 1 1 557 600 -3.00 838 ## 10 2013 1 1 558 600 -2.00 753 ## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
airlines:
airlines
## # A tibble: 16 x 2 ## carrier name ## <chr> <chr> ## 1 9E Endeavor Air Inc. ## 2 AA American Airlines Inc. ## 3 AS Alaska Airlines Inc. ## 4 B6 JetBlue Airways ## 5 DL Delta Air Lines Inc. ## 6 EV ExpressJet Airlines Inc. ## 7 F9 Frontier Airlines Inc. ## 8 FL AirTran Airways Corporation ## 9 HA Hawaiian Airlines Inc. ## 10 MQ Envoy Air ## 11 OO SkyWest Airlines Inc. ## 12 UA United Air Lines Inc. ## 13 US US Airways Inc. ## 14 VX Virgin America ## 15 WN Southwest Airlines Co. ## 16 YV Mesa Airlines Inc.
airports:
airports
## # A tibble: 1,458 x 8 ## faa name lat lon alt tz dst tzone ## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> <chr> ## 1 04G Lansdowne Airport 41.1 - 80.6 1044 -5.00 A America/New_… ## 2 06A Moton Field Municip… 32.5 - 85.7 264 -6.00 A America/Chic… ## 3 06C Schaumburg Regional 42.0 - 88.1 801 -6.00 A America/Chic… ## 4 06N Randall Airport 41.4 - 74.4 523 -5.00 A America/New_… ## 5 09J Jekyll Island Airpo… 31.1 - 81.4 11 -5.00 A America/New_… ## 6 0A9 Elizabethton Munici… 36.4 - 82.2 1593 -5.00 A America/New_… ## 7 0G6 Williams County Air… 41.5 - 84.5 730 -5.00 A America/New_… ## 8 0G7 Finger Lakes Region… 42.9 - 76.8 492 -5.00 A America/New_… ## 9 0P2 Shoestring Aviation… 39.8 - 76.6 1000 -5.00 U America/New_… ## 10 0S9 Jefferson County In… 48.1 -123 108 -8.00 A America/Los_… ## # ... with 1,448 more rows
planes:
planes
## # A tibble: 3,322 x 9 ## tailnum year type manufacturer model engines seats speed engine ## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr> ## 1 N10156 2004 Fixed wi… EMBRAER EMB-1… 2 55 NA Turbo… ## 2 N102UW 1998 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 3 N103US 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 4 N104UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 5 N10575 2002 Fixed wi… EMBRAER EMB-1… 2 55 NA Turbo… ## 6 N105UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 7 N107US 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 8 N108UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 9 N109UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## 10 N110UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo… ## # ... with 3,312 more rows
Weather:
weather
## # A tibble: 26,130 x 15 ## origin year month day hour temp dewp humid wind_dir wind_speed ## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 EWR 2013 1.00 1 0 37.0 21.9 54.0 230 10.4 ## 2 EWR 2013 1.00 1 1 37.0 21.9 54.0 230 13.8 ## 3 EWR 2013 1.00 1 2 37.9 21.9 52.1 230 12.7 ## 4 EWR 2013 1.00 1 3 37.9 23.0 54.5 230 13.8 ## 5 EWR 2013 1.00 1 4 37.9 24.1 57.0 240 15.0 ## 6 EWR 2013 1.00 1 6 39.0 26.1 59.4 270 10.4 ## 7 EWR 2013 1.00 1 7 39.0 27.0 61.6 250 8.06 ## 8 EWR 2013 1.00 1 8 39.0 28.0 64.4 240 11.5 ## 9 EWR 2013 1.00 1 9 39.9 28.0 62.2 250 12.7 ## 10 EWR 2013 1.00 1 10 39.0 28.0 64.4 260 12.7 ## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>, ## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
A primary key uniquely identifies an observation in its own table.
A foreign key uniquely identifies an observation in another table.
x <- tribble( ~key, ~val_x, 1, "x1", 2, "x2", 3, "x3" ) y <- tribble( ~key, ~val_y, 1, "y1", 2, "y2", 4, "y3" )
inner_join(x, y, by = "key")
## # A tibble: 2 x 3 ## key val_x val_y ## <dbl> <chr> <chr> ## 1 1.00 x1 y1 ## 2 2.00 x2 y2
Same as
x %>% inner_join(y, by = "key")
An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
A left join keeps all observations in x
.
A right join keeps all observations in y
.
A full join keeps all observations in x
or y
.
One table has duplicate keys.
x <- tribble( ~key, ~val_x, 1, "x1", 2, "x2", 2, "x3", 1, "x4" ) y <- tribble( ~key, ~val_y, 1, "y1", 2, "y2" ) left_join(x, y, by = "key")
## # A tibble: 4 x 3 ## key val_x val_y ## <dbl> <chr> <chr> ## 1 1.00 x1 y1 ## 2 2.00 x2 y2 ## 3 2.00 x3 y2 ## 4 1.00 x4 y1
Both tables have duplicate keys. You get all possible combinations, the Cartesian product:
x <- tribble( ~key, ~val_x, 1, "x1", 2, "x2", 2, "x3", 3, "x4" ) y <- tribble( ~key, ~val_y, 1, "y1", 2, "y2", 2, "y3", 3, "y4" ) left_join(x, y, by = "key")
## # A tibble: 6 x 3 ## key val_x val_y ## <dbl> <chr> <chr> ## 1 1.00 x1 y1 ## 2 2.00 x2 y2 ## 3 2.00 x2 y3 ## 4 2.00 x3 y2 ## 5 2.00 x3 y3 ## 6 3.00 x4 y4
Let's create a narrower table from the flights data:
flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier) flights2
## # A tibble: 336,776 x 8 ## year month day hour origin dest tailnum carrier ## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> ## 1 2013 1 1 5.00 EWR IAH N14228 UA ## 2 2013 1 1 5.00 LGA IAH N24211 UA ## 3 2013 1 1 5.00 JFK MIA N619AA AA ## 4 2013 1 1 5.00 JFK BQN N804JB B6 ## 5 2013 1 1 6.00 LGA ATL N668DN DL ## 6 2013 1 1 5.00 EWR ORD N39463 UA ## 7 2013 1 1 6.00 EWR FLL N516JB B6 ## 8 2013 1 1 6.00 LGA IAD N829AS EV ## 9 2013 1 1 6.00 JFK MCO N593JB B6 ## 10 2013 1 1 6.00 LGA ORD N3ALAA AA ## # ... with 336,766 more rows
by = NULL
(default): use all variables that appear in both tables:
# same as: flights2 %>% left_join(weather) left_join(flights2, weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18 ## year month day hour origin dest tailnum carrier temp dewp humid ## <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> ## 1 2013 1.00 1 5.00 EWR IAH N14228 UA NA NA NA ## 2 2013 1.00 1 5.00 LGA IAH N24211 UA NA NA NA ## 3 2013 1.00 1 5.00 JFK MIA N619AA AA NA NA NA ## 4 2013 1.00 1 5.00 JFK BQN N804JB B6 NA NA NA ## 5 2013 1.00 1 6.00 LGA ATL N668DN DL 39.9 26.1 57.3 ## 6 2013 1.00 1 5.00 EWR ORD N39463 UA NA NA NA ## 7 2013 1.00 1 6.00 EWR FLL N516JB B6 39.0 26.1 59.4 ## 8 2013 1.00 1 6.00 LGA IAD N829AS EV 39.9 26.1 57.3 ## 9 2013 1.00 1 6.00 JFK MCO N593JB B6 39.0 26.1 59.4 ## 10 2013 1.00 1 6.00 LGA ORD N3ALAA AA 39.9 26.1 57.3 ## # ... with 336,766 more rows, and 7 more variables: wind_dir <dbl>, ## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>, ## # visib <dbl>, time_hour <dttm>
by = "x"
: use the common variable x
:
# same as: flights2 %>% left_join(weather) left_join(flights2, planes, by = "tailnum")
## # A tibble: 336,776 x 16 ## year.x month day hour origin dest tailnum carrier year.y type ## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr> ## 1 2013 1 1 5.00 EWR IAH N14228 UA 1999 Fixed win… ## 2 2013 1 1 5.00 LGA IAH N24211 UA 1998 Fixed win… ## 3 2013 1 1 5.00 JFK MIA N619AA AA 1990 Fixed win… ## 4 2013 1 1 5.00 JFK BQN N804JB B6 2012 Fixed win… ## 5 2013 1 1 6.00 LGA ATL N668DN DL 1991 Fixed win… ## 6 2013 1 1 5.00 EWR ORD N39463 UA 2012 Fixed win… ## 7 2013 1 1 6.00 EWR FLL N516JB B6 2000 Fixed win… ## 8 2013 1 1 6.00 LGA IAD N829AS EV 1998 Fixed win… ## 9 2013 1 1 6.00 JFK MCO N593JB B6 2004 Fixed win… ## 10 2013 1 1 6.00 LGA ORD N3ALAA AA NA <NA> ## # ... with 336,766 more rows, and 6 more variables: manufacturer <chr>, ## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
by = c("a" = "b")
: match variable a
in table x
to the variable b
in table y
.
# same as: flights2 %>% left_join(weather) left_join(flights2, airports, by = c("dest" = "faa"))
## # A tibble: 336,776 x 15 ## year month day hour origin dest tailnum carrier name lat lon ## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> ## 1 2013 1 1 5.00 EWR IAH N14228 UA Georg… 30.0 -95.3 ## 2 2013 1 1 5.00 LGA IAH N24211 UA Georg… 30.0 -95.3 ## 3 2013 1 1 5.00 JFK MIA N619AA AA Miami… 25.8 -80.3 ## 4 2013 1 1 5.00 JFK BQN N804JB B6 <NA> NA NA ## 5 2013 1 1 6.00 LGA ATL N668DN DL Harts… 33.6 -84.4 ## 6 2013 1 1 5.00 EWR ORD N39463 UA Chica… 42.0 -87.9 ## 7 2013 1 1 6.00 EWR FLL N516JB B6 Fort … 26.1 -80.2 ## 8 2013 1 1 6.00 LGA IAD N829AS EV Washi… 38.9 -77.5 ## 9 2013 1 1 6.00 JFK MCO N593JB B6 Orlan… 28.4 -81.3 ## 10 2013 1 1 6.00 LGA ORD N3ALAA AA Chica… 42.0 -87.9 ## # ... with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>, ## # dst <chr>, tzone <chr>
Top 10 most popular destinations:
top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10) top_dest
## # A tibble: 10 x 2 ## dest n ## <chr> <int> ## 1 ORD 17283 ## 2 ATL 17215 ## 3 LAX 16174 ## 4 BOS 15508 ## 5 MCO 14082 ## 6 CLT 14064 ## 7 SFO 13331 ## 8 FLL 12055 ## 9 MIA 11728 ## 10 DCA 9705
How to filter the cases that fly to these destinations?
semi_join(x, y)
keesp the rows in x
that have a match in y
.
Useful to see what will be joined.
semi_join(flights, top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 542 540 2.00 923 ## 2 2013 1 1 554 600 -6.00 812 ## 3 2013 1 1 554 558 -4.00 740 ## 4 2013 1 1 555 600 -5.00 913 ## 5 2013 1 1 557 600 -3.00 838 ## 6 2013 1 1 558 600 -2.00 753 ## 7 2013 1 1 558 600 -2.00 924 ## 8 2013 1 1 558 600 -2.00 923 ## 9 2013 1 1 559 559 0 702 ## 10 2013 1 1 600 600 0 851 ## # ... with 141,135 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>
anti_join(x, y)
keeps the rows that don’t have a match.
Useful to see what will not be joined.
flights %>% anti_join(planes, by = "tailnum") %>% count(tailnum, sort = TRUE)
## # A tibble: 722 x 2 ## tailnum n ## <chr> <int> ## 1 <NA> 2512 ## 2 N725MQ 575 ## 3 N722MQ 513 ## 4 N723MQ 507 ## 5 N713MQ 483 ## 6 N735MQ 396 ## 7 N0EGMQ 371 ## 8 N534MQ 364 ## 9 N542MQ 363 ## 10 N531MQ 349 ## # ... with 712 more rows
Generate two tables:
df1 <- tribble( ~x, ~y, 1, 1, 2, 1 ) df2 <- tribble( ~x, ~y, 1, 1, 1, 2 )
bind_rows(x, y)
stacks table x
one on top of y
.
bind_rows(df1, df2)
## # A tibble: 4 x 2 ## x y ## <dbl> <dbl> ## 1 1.00 1.00 ## 2 2.00 1.00 ## 3 1.00 1.00 ## 4 1.00 2.00
intersect(x, y)
returns rows that appear in both x
and y
.
intersect(df1, df2)
## # A tibble: 1 x 2 ## x y ## <dbl> <dbl> ## 1 1.00 1.00
union(x, y)
returns unique observations in x
and y
.
union(df1, df2)
## # A tibble: 3 x 2 ## x y ## <dbl> <dbl> ## 1 1.00 2.00 ## 2 2.00 1.00 ## 3 1.00 1.00
setdiff(x, y)
returns rows that appear in x
but not in y
.
setdiff(df1, df2)
## # A tibble: 1 x 2 ## x y ## <dbl> <dbl> ## 1 2.00 1.00
setdiff(df2, df1)
## # A tibble: 1 x 2 ## x y ## <dbl> <dbl> ## 1 1.00 2.00