A typical data science project:
Jan 30, 2018
Tibbles extend data frames in R and form the core of the tidyverse.
library("tidyverse")
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
Convert a regular data frame to a tibble:
# a regular data frame
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
as_tibble(iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows
Convert a tibble back to a data frame:
as.data.frame(tb)
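For a complete round trip (a minimal sketch; tb and df2 are just illustrative names, using iris as the starting data frame):
tb <- as_tibble(iris)    # data frame -> tibble
df2 <- as.data.frame(tb) # tibble -> plain data frame
class(df2)               # "data.frame"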
Create a tibble from individual vectors:
tibble(
  x = 1:5,
  y = 1,
  z = x ^ 2 + y
)
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1.00 2.00
## 2 2 1.00 5.00
## 3 3 1.00 10.0
## 4 4 1.00 17.0
## 5 5 1.00 26.0
Transposed tibbles:
tribble(
  ~x, ~y, ~z,
  #--|--|----
  "a", 2, 3.6,
  "b", 1, 8.5
)
## # A tibble: 2 x 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2.00 3.60
## 2 b 1.00 8.50
By default, tibble prints the first 10 rows and all columns that fit on screen.
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2018-02-05 19:09:43 2018-02-21 1 0.299 n
## 2 2018-02-06 05:23:09 2018-02-05 2 0.444 z
## 3 2018-02-05 12:53:48 2018-02-20 3 0.412 k
## 4 2018-02-06 01:29:29 2018-02-08 4 0.0988 i
## 5 2018-02-06 00:35:26 2018-02-28 5 0.947 e
## 6 2018-02-05 20:09:45 2018-02-20 6 0.802 l
## 7 2018-02-06 00:58:43 2018-02-20 7 0.660 r
## 8 2018-02-05 19:35:04 2018-02-21 8 0.0958 r
## 9 2018-02-05 17:04:53 2018-02-24 9 0.658 a
## 10 2018-02-05 13:58:46 2018-02-08 10 0.380 a
## # ... with 990 more rows
To change the number of rows and columns to display:
nycflights13::flights %>% print(n = 10, width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 542 540 2.00 923
## 4 2013 1 1 544 545 -1.00 1004
## 5 2013 1 1 554 600 -6.00 812
## 6 2013 1 1 554 558 -4.00 740
## 7 2013 1 1 555 600 -5.00 913
## 8 2013 1 1 557 600 -3.00 709
## 9 2013 1 1 557 600 -3.00 838
## 10 2013 1 1 558 600 -2.00 753
## sched_arr_time arr_delay carrier flight tailnum origin dest air_time
## <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 819 11.0 UA 1545 N14228 EWR IAH 227
## 2 830 20.0 UA 1714 N24211 LGA IAH 227
## 3 850 33.0 AA 1141 N619AA JFK MIA 160
## 4 1022 -18.0 B6 725 N804JB JFK BQN 183
## 5 837 -25.0 DL 461 N668DN LGA ATL 116
## 6 728 12.0 UA 1696 N39463 EWR ORD 150
## 7 854 19.0 B6 507 N516JB EWR FLL 158
## 8 723 -14.0 EV 5708 N829AS LGA IAD 53.0
## 9 846 - 8.00 B6 79 N593JB JFK MCO 140
## 10 745 8.00 AA 301 N3ALAA LGA ORD 138
## distance hour minute time_hour
## <dbl> <dbl> <dbl> <dttm>
## 1 1400 5.00 15.0 2013-01-01 05:00:00
## 2 1416 5.00 29.0 2013-01-01 05:00:00
## 3 1089 5.00 40.0 2013-01-01 05:00:00
## 4 1576 5.00 45.0 2013-01-01 05:00:00
## 5 762 6.00 0 2013-01-01 06:00:00
## 6 719 5.00 58.0 2013-01-01 05:00:00
## 7 1065 6.00 0 2013-01-01 06:00:00
## 8 229 6.00 0 2013-01-01 06:00:00
## 9 944 6.00 0 2013-01-01 06:00:00
## 10 733 6.00 0 2013-01-01 06:00:00
## # ... with 3.368e+05 more rows
options(tibble.print_max = n, tibble.print_min = m): if more than n rows, print only m rows.
options(dplyr.print_min = Inf): print all rows.
options(tibble.width = Inf): print all columns.
To extract individual variables, first create a small tibble:
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
Extract by name:
df$x
## [1] 0.5649089 0.9478870 0.9984776 0.7911995 0.5125524
df[["x"]]
## [1] 0.5649089 0.9478870 0.9984776 0.7911995 0.5125524
Extract by position:
df[[1]]
## [1] 0.5649089 0.9478870 0.9984776 0.7911995 0.5125524
To extract in a pipe, use the special placeholder .:
df %>% .$x
## [1] 0.5649089 0.9478870 0.9984776 0.7911995 0.5125524
df %>% .[["x"]]
## [1] 0.5649089 0.9478870 0.9984776 0.7911995 0.5125524
The readr package implements functions that turn flat files into tibbles:
read_csv() (comma separated files), read_csv2() (semicolon separated files), read_tsv() (tab separated files), read_delim() (files with any delimiter).
read_fwf() (fixed width files), read_table() (fixed width files where columns are separated by white space).
read_log() (Apache style log files).
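The delimited readers share a common interface; a quick sketch using inline strings (these strings are just illustrations, not files from this post):
read_csv2("a;b;c\n1;2;3")               # semicolon separated, comma as decimal mark
read_tsv("a\tb\tc\n1\t2\t3")            # tab separated
read_delim("a|b|c\n1|2|3", delim = "|") # any delimiter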
An example file heights.csv:
head heights.csv
## "earn","height","sex","ed","age","race" ## 50000,74.4244387818035,"male",16,45,"white" ## 60000,65.5375428255647,"female",16,58,"white" ## 30000,63.6291977374349,"female",16,29,"white" ## 50000,63.1085616752971,"female",16,91,"other" ## 51000,63.4024835710879,"female",17,39,"white" ## 9000,64.3995075440034,"female",15,26,"white" ## 29000,61.6563258264214,"female",12,49,"white" ## 32000,72.6985437364783,"male",17,46,"white" ## 2000,72.0394668497611,"male",15,21,"hispanic"
Read from a local file heights.csv:
(heights <- read_csv("heights.csv"))
## Parsed with column specification:
## cols(
##   earn = col_double(),
##   height = col_double(),
##   sex = col_character(),
##   ed = col_integer(),
##   age = col_integer(),
##   race = col_character()
## )
## # A tibble: 1,192 x 6
## earn height sex ed age race
## <dbl> <dbl> <chr> <int> <int> <chr>
## 1 50000 74.4 male 16 45 white
## 2 60000 65.5 female 16 58 white
## 3 30000 63.6 female 16 29 white
## 4 50000 63.1 female 16 91 other
## 5 51000 63.4 female 17 39 white
## 6 9000 64.4 female 15 26 white
## 7 29000 61.7 female 12 49 white
## 8 32000 72.7 male 17 46 white
## 9 2000 72.0 male 15 21 hispanic
## 10 27000 72.2 male 12 26 white
## # ... with 1,182 more rows
Read from an inline csv string:
read_csv("a,b,c 1,2,3 4,5,6")
## # A tibble: 2 x 3
## a b c
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
Skip the first n lines:
read_csv("The first line of metadata The second line of metadata x,y,z 1,2,3", skip = 2)
## # A tibble: 1 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
Skip comment lines:
read_csv("# A comment I want to skip x,y,z 1,2,3", comment = "#")
## # A tibble: 1 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
No header line:
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## # A tibble: 2 x 3
## X1 X2 X3
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
No header line, with column names supplied:
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## # A tibble: 2 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
Specify the symbol representing missing values:
read_csv("a,b,c\n1,2,.", na = ".")
## # A tibble: 1 x 3
## a b c
## <int> <int> <chr>
## 1 1 2 <NA>
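na also accepts a vector of strings, so several markers can be treated as missing at once (a small illustration, not from the original example):
read_csv("a,b,c\n1,.,-", na = c(".", "-"))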
Write to csv:
write_csv(challenge, "challenge.csv")
Write (and read) RDS files:
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
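Here challenge stands for whatever tibble you want to save. A self-contained sketch of the round trip with a toy tibble (file names are placeholders):
tmp <- tibble(x = 1:3, y = c("a", "b", "c"))
write_csv(tmp, "tmp.csv")  # plain text; column types are re-guessed when read back
write_rds(tmp, "tmp.rds")  # R's binary format; column types survive the round trip
read_rds("tmp.rds")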
haven reads SPSS, Stata, and SAS files.
readxl reads Excel files (both .xls and .xlsx).
DBI, along with a database-specific backend (e.g. RMySQL, RSQLite, RPostgreSQL), allows you to run SQL queries against a database and return a data frame.
jsonlite reads JSON files.
xml2 reads XML files.
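A rough sketch of how a few of these are typically called (the file name, the in-memory SQLite database, and the JSON string are placeholders):
readxl::read_excel("data.xlsx", sheet = 1)       # first sheet of an Excel file

con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(con, "mtcars", mtcars)         # copy a data frame into the database
DBI::dbGetQuery(con, "SELECT cyl, AVG(mpg) AS mpg FROM mtcars GROUP BY cyl")
DBI::dbDisconnect(con)

jsonlite::fromJSON('{"x": [1, 2, 3]}')           # parse a JSON string into an R list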
There are three interrelated rules which make a dataset tidy:
Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
Example table1
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
is tidy.
Example table2
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
is not tidy.
Example table3
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
is not tidy.
Example table4a
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
is not tidy.
Example table4b
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
is not tidy.
gather() collects a set of columns into a new pair of key-value variables:
table4a %>% gather(`1999`, `2000`, key = "year", value = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
We can gather table4b too, and then join the two tables:
tidy4a <- table4a %>% gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>% gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Brazil 1999 37737 172006362
## 3 China 1999 212258 1272915272
## 4 Afghanistan 2000 2666 20595360
## 5 Brazil 2000 80488 174504898
## 6 China 2000 213766 1280428583
Spreading is the opposite of gathering.
spread(table2, key = type, value = count)
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate() pulls one column apart into multiple columns, splitting at any non-alphanumeric character by default:
table3 %>% separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Separate into numeric values:
table3 %>% separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Separate at a fixed position:
table3 %>% separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table5 stores century and year in separate columns:
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
unite() is the inverse of separate():
table5 %>% unite(new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## * <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
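Without sep, unite() places an underscore between the combined values by default (a small illustration of the sep argument, not part of the original example):
table5 %>% unite(new, century, year)
# new would contain values like "19_99" and "20_00"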