Feb 6, 2018
The stringr package, by Hadley Wickham, provides utilities for handling strings.
Included in tidyverse.
library("tidyverse")
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4 ## ✔ tibble 1.4.2 ✔ dplyr 0.7.4 ## ✔ tidyr 0.8.0 ✔ stringr 1.2.0 ## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ── ## ✖ dplyr::filter() masks stats::filter() ## ✖ dplyr::lag() masks stats::lag()
Strings are enclosed by double quotes or single quotes:
string1 <- "This is a string" string2 <- 'If I want to include a "quote" inside a string, I use single quotes'
Literal single or double quote:
double_quote <- "\"" # or '"' single_quote <- '\'' # or "'"
Printed representation:
x <- c("\"", "\\") x
## [1] "\"" "\\"
vs writeLines()
:
writeLines(x)
## " ## \
Other special characters: "\n"
(new line), "\t"
(tab), … Check
?"'"
for a complete list.
Unicode
x <- "\u00b5" x
## [1] "µ"
Character vector (vector of strings):
c("one", "two", "three")
## [1] "one" "two" "three"
Length of a single string:
str_length("R for data science")
## [1] 18
Lengths of a character vector:
str_length(c("a", "R for data science", NA))
## [1] 1 18 NA
Combine two or more strings
str_c("x", "y")
## [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
Separator:
str_c("x", "y", sep = ", ")
## [1] "x, y"
str_c()
is vectorised:
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
Objects of length 0 are silently dropped:
name <- "Hadley" time_of_day <- "morning" birthday <- FALSE str_c( "Good ", time_of_day, " ", name, if (birthday) " and HAPPY BIRTHDAY", "." )
## [1] "Good morning Hadley."
Combine a vector of strings:
str_c(c("x", "y", "z"))
## [1] "x" "y" "z"
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
By position:
str_sub("Apple", 1, 3)
## [1] "App"
x <- c("Apple", "Banana", "Pear") str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
Negative numbers count backwards from end:
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
Out of range:
str_sub("a", 1, 5)
## [1] "a"
str_sub("a", 2, 5)
## [1] ""
Assignment to a substring:
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1)) x
## [1] "apple" "banana" "pear"
str_view()
shows the first match; str_view_all()
shows all matches.
Match exact strings:
x <- c("apple", "banana", "pear") str_view(x, "an")
str_view_all(x, "an")
.
matches any character apart from a newline:
str_view(x, ".a.")
To match a literal .
:
str_view(c("abc", "a.c", "bef"), "a\\.c")
To match a literal \
:
str_view("a\\b", "\\\\")
^
matches the start of the string:
x <- c("apple", "banana", "pear") str_view(x, "^a")
$
matches the end of the string:
str_view(x, "a$")
To force a regular expression to only match a complete string:
x <- c("apple pie", "apple", "apple cake") str_view(x, "^apple$")
Other special matches:
\d
: matches any digit.
\s
: matches any whitespace (e.g. space, tab, newline).
[abc]
: matches a, b, or c.
[^abc]
: matches anything except a, b, or c.
alternation
str_view(c("grey", "gray"), "gr(e|a)y")
str_view(c("grey", "gray"), "gr[ea]y")
?
: 0 or 1
+
: 1 or more
*
: 0 or more
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" str_view(x, "CC?")
# greedy matches str_view(x, "CC+")
# greedy matches str_view(x, 'C[LX]+')
Specify number of matches:
{n}
: exactly n
{n,}
: n or more
{,m}
: at most m
{n,m}
: between n and m
str_view(x, "C{2}")
# greedy matches str_view(x, "C{2,}")
# greedy matches str_view(x, "C{2,3}")
Greedy (default) vs lazy (put ?
after repetition):
# lazy matches str_view(x, 'C{2,3}?')
# lazy matches str_view(x, 'C[LX]+?')
Parentheses define groups, which can be back-referenced as \1
, \2
, …
str_view(fruit, "(..)\\1", match = TRUE)
x <- c("apple", "banana", "pear") str_detect(x, "e")
## [1] TRUE FALSE TRUE
Vector words
contains 980 commonly used words:
length(words)
## [1] 980
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"
# How many common words start with t? sum(str_detect(words, "^t"))
## [1] 65
# What proportion of common words end with a vowel? mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
Find words that end with x
:
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
same as
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
Filter a data frame:
df <- tibble( word = words, i = seq_along(word) ) df %>% filter(str_detect(words, "x$"))
## # A tibble: 4 x 2 ## word i ## <chr> <int> ## 1 box 108 ## 2 sex 747 ## 3 six 772 ## 4 tax 841
str_count()
tells how many matches are found:
x <- c("apple", "banana", "pear") str_count(x, "a")
## [1] 1 3 1
# On average, how many vowels per word? mean(str_count(words, "[aeiou]"))
## [1] 1.991837
Matches never overlap:
str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")
Mutate a data frame:
df %>% mutate( vowels = str_count(word, "[aeiou]"), consonants = str_count(word, "[^aeiou]") )
## # A tibble: 980 x 4 ## word i vowels consonants ## <chr> <int> <int> <int> ## 1 a 1 1 0 ## 2 able 2 2 2 ## 3 about 3 3 2 ## 4 absolute 4 4 4 ## 5 accept 5 2 4 ## 6 account 6 3 4 ## 7 achieve 7 4 3 ## 8 across 8 2 4 ## 9 act 9 1 2 ## 10 active 10 3 3 ## # ... with 970 more rows
sentences
is a collection of 720 phrases:
length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks." ## [2] "Glue the sheet to the dark blue background." ## [3] "It's easy to tell the depth of a well." ## [4] "These days a chicken leg is a rare dish." ## [5] "Rice is often served in round bowls." ## [6] "The juice of lemons makes fine punch."
Suppose we want to find all sentences that contain a colour.
Create a collection of colours:
colours <- c("red", "orange", "yellow", "green", "blue", "purple") colour_match <- str_c(colours, collapse = "|") colour_match
## [1] "red|orange|yellow|green|blue|purple"
Select the sentences that contain a colour, and then extract the colour to figure out which one it is:
has_colour <- str_subset(sentences, colour_match) matches <- str_extract(has_colour, colour_match) head(matches)
## [1] "blue" "blue" "red" "red" "red" "blue"
str_extract()
only extracts the first match.
more <- sentences[str_count(sentences, colour_match) > 1] str_view_all(more, colour_match)
str_extract_all()
extracts all matches:
str_extract_all(more, colour_match)
## [[1]] ## [1] "blue" "red" ## ## [[2]] ## [1] "green" "red" ## ## [[3]] ## [1] "orange" "red"
Setting simplify = TRUE
in str_extract_all()
will return a matrix with short matches expanded to the same length as the longest:
str_extract_all(more, colour_match, simplify = TRUE)
## [,1] [,2] ## [1,] "blue" "red" ## [2,] "green" "red" ## [3,] "orange" "red"
x <- c("a", "a b", "a b c") str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3] ## [1,] "a" "" "" ## [2,] "a" "b" "" ## [3,] "a" "b" "c"
str_extract()
gives us the complete match:
noun <- "(a|the) ([^ ]+)" has_noun <- sentences %>% str_subset(noun) %>% head(10) has_noun %>% str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked" ## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
str_match()
gives each individual component:
has_noun %>% str_match(noun)
## [,1] [,2] [,3] ## [1,] "the smooth" "the" "smooth" ## [2,] "the sheet" "the" "sheet" ## [3,] "the depth" "the" "depth" ## [4,] "a chicken" "a" "chicken" ## [5,] "the parked" "the" "parked" ## [6,] "the sun" "the" "sun" ## [7,] "the huge" "the" "huge" ## [8,] "the ball" "the" "ball" ## [9,] "the woman" "the" "woman" ## [10,] "a helps" "a" "helps"
tidyr::extract()
works with tibbles:
tibble(sentence = sentences) %>% tidyr::extract( sentence, c("article", "noun"), "(a|the) ([^ ]+)", remove = FALSE )
## # A tibble: 720 x 3 ## sentence article noun ## <chr> <chr> <chr> ## 1 The birch canoe slid on the smooth planks. the smooth ## 2 Glue the sheet to the dark blue background. the sheet ## 3 It's easy to tell the depth of a well. the depth ## 4 These days a chicken leg is a rare dish. a chicken ## 5 Rice is often served in round bowls. <NA> <NA> ## 6 The juice of lemons makes fine punch. <NA> <NA> ## 7 The box was thrown beside the parked truck. the parked ## 8 The hogs were fed chopped corn and garbage. <NA> <NA> ## 9 Four hours of steady work faced us. <NA> <NA> ## 10 Large size in stockings is hard to sell. <NA> <NA> ## # ... with 710 more rows
Replace the first match:
x <- c("apple", "pear", "banana") str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
Replace all matches:
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
Multiple replacement:
x <- c("1 house", "2 cars", "3 people") str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
Back-reference:
# flip the order of the second and third words sentences %>% str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% head(5)
## [1] "The canoe birch slid on the smooth planks." ## [2] "Glue sheet the to the dark blue background." ## [3] "It's to easy tell the depth of a well." ## [4] "These a days chicken leg is a rare dish." ## [5] "Rice often is served in round bowls."
Split a string up into pieces:
sentences %>% head(5) %>% str_split(" ")
## [[1]] ## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth" ## [8] "planks." ## ## [[2]] ## [1] "Glue" "the" "sheet" "to" "the" ## [6] "dark" "blue" "background." ## ## [[3]] ## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well." ## ## [[4]] ## [1] "These" "days" "a" "chicken" "leg" "is" "a" ## [8] "rare" "dish." ## ## [[5]] ## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
Use simplify = TRUE
to return a matrix:
sentences %>% head(5) %>% str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] ## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" ## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" ## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" ## [4,] "These" "days" "a" "chicken" "leg" "is" "a" ## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ## [,8] [,9] ## [1,] "planks." "" ## [2,] "background." "" ## [3,] "a" "well." ## [4,] "rare" "dish." ## [5,] "" ""