There is a wealth of data on internet. How to scrape them and analyze them?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow instructions in a Blog by SAURAV KAUSHIK to find the most popular feature films of 2018.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2018 can be accessed at page https://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature.
#Loading the rvest and tidyverse package
library("rvest")
## Loading required package: xml2
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
#Specifying the url for desired website to be scraped
url <- 'http://www.imdb.com/search/title?count=100&release_date=2018,2018&title_type=feature'
#Reading the HTML code from the website
(webpage <- read_html(url))
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="styleguide-v2" class="fixed">\n\n <img height=" ...
Use the CSS selector to get the rankings
# Use CSS selectors to scrap the rankings section
(rank_data_html <- html_nodes(webpage, '.text-primary'))
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Convert the ranking data to text
(rank_data <- html_text(rank_data_html))
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Turn into numerical values
(rank_data <- as.integer(rank_data))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector .lister-item-header a
.
# Using CSS selectors to scrap the title section
(title_data_html <- html_nodes(webpage, '.lister-item-header a'))
## {xml_nodeset (100)}
## [1] <a href="/title/tt1727824/?ref_=adv_li_tt">Bohemian Rhapsody</a>
## [2] <a href="/title/tt4530422/?ref_=adv_li_tt">Overlord</a>
## [3] <a href="/title/tt6966692/?ref_=adv_li_tt">Green Book</a>
## [4] <a href="/title/tt1477834/?ref_=adv_li_tt">Aquaman</a>
## [5] <a href="/title/tt5083738/?ref_=adv_li_tt">The Favourite</a>
## [6] <a href="/title/tt1517451/?ref_=adv_li_tt">A Star Is Born</a>
## [7] <a href="/title/tt3606756/?ref_=adv_li_tt">Incredibles 2</a>
## [8] <a href="/title/tt1571234/?ref_=adv_li_tt">Mortal Engines</a>
## [9] <a href="/title/tt4218572/?ref_=adv_li_tt">Widows</a>
## [10] <a href="/title/tt4532826/?ref_=adv_li_tt">Robin Hood</a>
## [11] <a href="/title/tt6155172/?ref_=adv_li_tt">Roma</a>
## [12] <a href="/title/tt6266538/?ref_=adv_li_tt">Vice</a>
## [13] <a href="/title/tt4633694/?ref_=adv_li_tt">Spider-Man: Into the Spi ...
## [14] <a href="/title/tt4154756/?ref_=adv_li_tt">Avengers: Infinity War</a>
## [15] <a href="/title/tt7349662/?ref_=adv_li_tt">BlacKkKlansman</a>
## [16] <a href="/title/tt2737304/?ref_=adv_li_tt">Bird Box</a>
## [17] <a href="/title/tt4595882/?ref_=adv_li_tt">Can You Ever Forgive Me? ...
## [18] <a href="/title/tt1034415/?ref_=adv_li_tt">Suspiria</a>
## [19] <a href="/title/tt5095030/?ref_=adv_li_tt">Ant-Man and the Wasp</a>
## [20] <a href="/title/tt8359848/?ref_=adv_li_tt">Climax</a>
## ...
# Converting the title data to text
(title_data <- html_text(title_data_html))
## [1] "Bohemian Rhapsody"
## [2] "Overlord"
## [3] "Green Book"
## [4] "Aquaman"
## [5] "The Favourite"
## [6] "A Star Is Born"
## [7] "Incredibles 2"
## [8] "Mortal Engines"
## [9] "Widows"
## [10] "Robin Hood"
## [11] "Roma"
## [12] "Vice"
## [13] "Spider-Man: Into the Spider-Verse"
## [14] "Avengers: Infinity War"
## [15] "BlacKkKlansman"
## [16] "Bird Box"
## [17] "Can You Ever Forgive Me?"
## [18] "Suspiria"
## [19] "Ant-Man and the Wasp"
## [20] "Climax"
## [21] "The Mule"
## [22] "The Man Who Killed Hitler and Then The Bigfoot"
## [23] "First Man"
## [24] "Black Panther"
## [25] "Hunter Killer"
## [26] "The Girl in the Spider's Web"
## [27] "Venom"
## [28] "Bumblebee"
## [29] "Bad Times at the El Royale"
## [30] "The Ballad of Buster Scruggs"
## [31] "Mary Queen of Scots"
## [32] "Uncle Drew"
## [33] "Solo: A Star Wars Story"
## [34] "Dragon Ball Super: Broly"
## [35] "A Quiet Place"
## [36] "Fantastic Beasts: The Crimes of Grindelwald"
## [37] "Ready Player One"
## [38] "Arctic"
## [39] "A Simple Favor"
## [40] "A Private War"
## [41] "Deadpool 2"
## [42] "The Front Runner"
## [43] "The Grinch"
## [44] "Boy Erased"
## [45] "At Eternity's Gate"
## [46] "Todos lo saben"
## [47] "Tag"
## [48] "Prospect"
## [49] "Mary Poppins Returns"
## [50] "Beautiful Boy"
## [51] "Annihilation"
## [52] "Crazy Rich Asians"
## [53] "Cold War"
## [54] "Mission: Impossible - Fallout"
## [55] "If Beale Street Could Talk"
## [56] "Hereditary"
## [57] "The Nutcracker and the Four Realms"
## [58] "Instant Family"
## [59] "Halloween"
## [60] "Burning"
## [61] "Ralph Breaks the Internet"
## [62] "Ocean's 8"
## [63] "The Sisters Brothers"
## [64] "Creed II"
## [65] "The Predator"
## [66] "Hotel Transylvania 3: Summer Vacation"
## [67] "Replicas"
## [68] "Stan & Ollie"
## [69] "The Hate U Give"
## [70] "On the Basis of Sex"
## [71] "Destroyer"
## [72] "Upgrade"
## [73] "Red Sparrow"
## [74] "The Guilty"
## [75] "The House That Jack Built"
## [76] "Isle of Dogs"
## [77] "Searching"
## [78] "The Old Man & the Gun"
## [79] "Nobody's Fool"
## [80] "Game Night"
## [81] "Capharnaüm"
## [82] "Sicario: Day of the Soldado"
## [83] "Dumplin'"
## [84] "The Meg"
## [85] "Black Mirror: Bandersnatch"
## [86] "Rampage"
## [87] "The Nun"
## [88] "Untogether"
## [89] "To All the Boys I've Loved Before"
## [90] "Then Came You"
## [91] "Jurassic World: Fallen Kingdom"
## [92] "Little Italy"
## [93] "K.G.F: Chapter 1"
## [94] "Eighth Grade"
## [95] "Love, Simon"
## [96] "Peranbu"
## [97] "High Life"
## [98] "Mile 22"
## [99] "Mowgli: Legend of the Jungle"
## [100] "Mandy"
# Using CSS selectors to scrap the description section
(description_data_html <- html_nodes(webpage, '.ratings-bar+ .text-muted'))
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n The story of the legendary rock band <a ...
## [2] <p class="text-muted">\n A small group of American soldiers find ...
## [3] <p class="text-muted">\n A working-class Italian-American bounce ...
## [4] <p class="text-muted">\n Arthur Curry, the human-born heir to th ...
## [5] <p class="text-muted">\n In early 18th century England, a frail ...
## [6] <p class="text-muted">\n A musician helps a young singer find fa ...
## [7] <p class="text-muted">\n The Incredibles hero family takes on a ...
## [8] <p class="text-muted">\n In a post-apocalyptic world where citie ...
## [9] <p class="text-muted">\n Set in contemporary Chicago, amid a tim ...
## [10] <p class="text-muted">\n A war-hardened Crusader and his Moorish ...
## [11] <p class="text-muted">\n A year in the life of a middle-class fa ...
## [12] <p class="text-muted">\n The story of <a href="/name/nm0155515"> ...
## [13] <p class="text-muted">\n Teen Miles Morales becomes Spider-Man o ...
## [14] <p class="text-muted">\n The Avengers and their allies must be w ...
## [15] <p class="text-muted">\n Ron Stallworth, an African American pol ...
## [16] <p class="text-muted">\n Five years after an ominous unseen pres ...
## [17] <p class="text-muted">\n When Lee Israel falls out of step with ...
## [18] <p class="text-muted">\n A darkness swirls at the center of a wo ...
## [19] <p class="text-muted">\n As Scott Lang balances being both a Sup ...
## [20] <p class="text-muted">\n French dancers gather in a remote, empt ...
## ...
# Converting the description data to text
description_data <- html_text(description_data_html)
# take a look at first few
head(description_data)
## [1] "\n The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "\n A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "\n A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "\n Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "\n In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "\n A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# strip the '\n'
description_data <- str_replace(description_data, "^\\n\\s+", "")
head(description_data)
## [1] "The story of the legendary rock band Queen and lead singer Freddie Mercury, leading up to their famous performance at Live Aid (1985)."
## [2] "A small group of American soldiers find horror behind enemy lines on the eve of D-Day."
## [3] "A working-class Italian-American bouncer becomes the driver of an African-American classical pianist on a tour of venues through the 1960s American South."
## [4] "Arthur Curry, the human-born heir to the underwater kingdom of Atlantis, goes on a quest to prevent a war between the worlds of ocean and land."
## [5] "In early 18th century England, a frail Queen Anne occupies the throne and her close friend, Lady Sarah, governs the country in her stead. When a new servant, Abigail, arrives, her charm endears her to Sarah."
## [6] "A musician helps a young singer find fame, even as age and alcoholism send his own career into a downward spiral."
# Using CSS selectors to scrap the Movie runtime section
runtime_data_html <- html_nodes(webpage, '.text-muted .runtime')
# Converting the runtime data to text
runtime_data <- html_text(runtime_data_html)
# Let's have a look at the runtime
head(runtime_data)
## [1] "134 min" "110 min" "130 min" "143 min" "119 min" "136 min"
# Data-Preprocessing: removing mins and converting it to numerical
runtime_data <- str_replace(runtime_data, " min", "")
runtime_data <- as.numeric(runtime_data)
#Let's have another look at the runtime data
head(runtime_data)
## [1] 134 110 130 143 119 136
# Using CSS selectors to scrap the Movie genre section
genre_data_html <- html_nodes(webpage, '.genre')
# Converting the genre data to text
genre_data <- html_text(genre_data_html)
# Let's have a look at the genre data
head(genre_data)
## [1] "\nBiography, Drama, Music "
## [2] "\nAction, Adventure, Horror "
## [3] "\nBiography, Comedy, Drama "
## [4] "\nAction, Adventure, Fantasy "
## [5] "\nBiography, Comedy, Drama "
## [6] "\nDrama, Music, Romance "
# Data-Preprocessing: retrieve the first word
genre_data <- str_extract(genre_data, "[:alpha:]+")
# Convering each genre from text to factor
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Biography" "Action" "Biography" "Action" "Biography" "Drama"
# Using CSS selectors to scrap the IMDB rating section
rating_data_html <- html_nodes(webpage, '.ratings-imdb-rating strong')
# Converting the ratings data to text
rating_data <- html_text(rating_data_html)
# Let's have a look at the ratings
head(rating_data)
## [1] "8.2" "6.9" "8.3" "7.4" "7.8" "7.9"
# Data-Preprocessing: converting ratings to numerical
rating_data <- as.numeric(rating_data)
# Let's have another look at the ratings data
rating_data
## [1] 8.2 6.9 8.3 7.4 7.8 7.9 7.8 6.2 7.1 5.3 7.9 7.2 8.7 8.5 7.5 6.7 7.3
## [18] 7.0 7.1 7.4 7.2 5.6 7.4 7.4 6.6 6.1 6.8 7.1 7.2 7.3 6.5 5.7 7.0 8.3
## [35] 7.6 6.8 7.5 7.3 6.9 6.7 7.8 6.3 6.3 7.0 7.0 7.0 6.6 6.3 7.2 7.3 6.9
## [52] 7.0 7.7 7.8 7.6 7.3 5.5 7.6 6.7 7.7 7.2 6.2 7.0 7.6 5.4 6.3 5.4 7.6
## [69] 7.1 6.5 6.7 7.6 6.6 7.6 7.0 7.9 7.7 6.8 4.4 7.0 8.3 7.1 6.8 5.7 7.4
## [86] 6.1 5.4 6.0 7.3 7.0 6.2 5.7 8.6 7.5 7.7 9.8 6.7 6.1 6.5 6.6
# Using CSS selectors to scrap the votes section
votes_data_html <- html_nodes(webpage, '.sort-num_votes-visible span:nth-child(2)')
# Converting the votes data to text
votes_data <- html_text(votes_data_html)
# Let's have a look at the votes data
head(votes_data)
## [1] "246,470" "30,977" "62,443" "164,708" "53,107" "177,646"
# Data-Preprocessing: removing commas
votes_data <- str_replace(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
#Let's have another look at the votes data
votes_data
## [1] 246470 30977 62443 164708 53107 177646 175775 38561 42185 24190
## [11] 73780 31162 94876 584672 105254 181502 11941 23891 194191 12128
## [21] 18793 1478 94459 460433 23700 18439 232108 50412 65543 70123
## [31] 9858 8176 203858 11331 273750 118011 277710 1198 67893 4853
## [41] 347170 3104 23306 11519 6503 11282 71377 2288 32968 21556
## [51] 210985 78108 22384 214004 9922 130347 13666 10324 75263 13687
## [61] 47185 126827 19964 36849 82948 36372 9681 6170 12404 4050
## [71] 3681 89913 127667 17575 24854 96145 79219 18667 2278 149228
## [81] 4846 83167 14791 97311 79985 105826 78279 447 53964 744
## [91] 197518 4520 17953 33566 74886 8274 1246 40984 39406 37113
# Using CSS selectors to scrap the directors section
(directors_data_html <- html_nodes(webpage,'.text-muted+ p a:nth-child(1)'))
## {xml_nodeset (100)}
## [1] <a href="/name/nm0001741/?ref_=adv_li_dr_0">Bryan Singer</a>
## [2] <a href="/name/nm1170339/?ref_=adv_li_dr_0">Julius Avery</a>
## [3] <a href="/name/nm0268380/?ref_=adv_li_dr_0">Peter Farrelly</a>
## [4] <a href="/name/nm1490123/?ref_=adv_li_dr_0">James Wan</a>
## [5] <a href="/name/nm0487166/?ref_=adv_li_dr_0">Yorgos Lanthimos</a>
## [6] <a href="/name/nm0177896/?ref_=adv_li_dr_0">Bradley Cooper</a>
## [7] <a href="/name/nm0083348/?ref_=adv_li_dr_0">Brad Bird</a>
## [8] <a href="/name/nm0729514/?ref_=adv_li_dr_0">Christian Rivers</a>
## [9] <a href="/name/nm2588606/?ref_=adv_li_dr_0">Steve McQueen</a>
## [10] <a href="/name/nm1163264/?ref_=adv_li_dr_0">Otto Bathurst</a>
## [11] <a href="/name/nm0190859/?ref_=adv_li_dr_0">Alfonso Cuarón</a>
## [12] <a href="/name/nm0570912/?ref_=adv_li_dr_0">Adam McKay</a>
## [13] <a href="/name/nm2130108/?ref_=adv_li_dr_0">Bob Persichetti</a>
## [14] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
## [15] <a href="/name/nm0000490/?ref_=adv_li_dr_0">Spike Lee</a>
## [16] <a href="/name/nm0081540/?ref_=adv_li_dr_0">Susanne Bier</a>
## [17] <a href="/name/nm1716636/?ref_=adv_li_dr_0">Marielle Heller</a>
## [18] <a href="/name/nm0345174/?ref_=adv_li_dr_0">Luca Guadagnino</a>
## [19] <a href="/name/nm0715636/?ref_=adv_li_dr_0">Peyton Reed</a>
## [20] <a href="/name/nm0637615/?ref_=adv_li_dr_0">Gaspar Noé</a>
## ...
# Converting the directors data to text
directors_data <- html_text(directors_data_html)
# Let's have a look at the directors data
head(directors_data)
## [1] "Bryan Singer" "Julius Avery" "Peter Farrelly"
## [4] "James Wan" "Yorgos Lanthimos" "Bradley Cooper"
# Data-Preprocessing: converting directors data into factors
(directors_data <- as.factor(directors_data))
## [1] Bryan Singer Julius Avery Peter Farrelly
## [4] James Wan Yorgos Lanthimos Bradley Cooper
## [7] Brad Bird Christian Rivers Steve McQueen
## [10] Otto Bathurst Alfonso Cuarón Adam McKay
## [13] Bob Persichetti Anthony Russo Spike Lee
## [16] Susanne Bier Marielle Heller Luca Guadagnino
## [19] Peyton Reed Gaspar Noé Clint Eastwood
## [22] Robert D. Krzykowski Damien Chazelle Ryan Coogler
## [25] Donovan Marsh Fede Alvarez Ruben Fleischer
## [28] Travis Knight Drew Goddard Ethan Coen
## [31] Josie Rourke Charles Stone III Ron Howard
## [34] Tatsuya Nagamine John Krasinski David Yates
## [37] Steven Spielberg Joe Penna Paul Feig
## [40] Matthew Heineman David Leitch Jason Reitman
## [43] Yarrow Cheney Joel Edgerton Julian Schnabel
## [46] Asghar Farhadi Jeff Tomsic Christopher Caldwell
## [49] Rob Marshall Felix van Groeningen Alex Garland
## [52] Jon M. Chu Pawel Pawlikowski Christopher McQuarrie
## [55] Barry Jenkins Ari Aster Lasse Hallström
## [58] Sean Anders David Gordon Green Chang-dong Lee
## [61] Phil Johnston Gary Ross Jacques Audiard
## [64] Steven Caple Jr. Shane Black Genndy Tartakovsky
## [67] Jeffrey Nachmanoff Jon S. Baird George Tillman Jr.
## [70] Mimi Leder Karyn Kusama Leigh Whannell
## [73] Francis Lawrence Gustav Möller Lars von Trier
## [76] Wes Anderson Aneesh Chaganty David Lowery
## [79] Tyler Perry John Francis Daley Nadine Labaki
## [82] Stefano Sollima Anne Fletcher Jon Turteltaub
## [85] David Slade Brad Peyton Corin Hardy
## [88] Emma Forrest Susan Johnson Peter Hutchings
## [91] J.A. Bayona Donald Petrie Prashanth Neel
## [94] Bo Burnham Greg Berlanti Ram
## [97] Claire Denis Peter Berg Andy Serkis
## [100] Panos Cosmatos
## 100 Levels: Adam McKay Alex Garland Alfonso Cuarón ... Yorgos Lanthimos
# Using CSS selectors to scrap the actors section
(actors_data_html <- html_nodes(webpage, '.lister-item-content .ghost+ a'))
## {xml_nodeset (100)}
## [1] <a href="/name/nm1785339/?ref_=adv_li_st_0">Rami Malek</a>
## [2] <a href="/name/nm5381254/?ref_=adv_li_st_0">Jovan Adepo</a>
## [3] <a href="/name/nm0001557/?ref_=adv_li_st_0">Viggo Mortensen</a>
## [4] <a href="/name/nm0597388/?ref_=adv_li_st_0">Jason Momoa</a>
## [5] <a href="/name/nm1469236/?ref_=adv_li_st_0">Olivia Colman</a>
## [6] <a href="/name/nm3078932/?ref_=adv_li_st_0">Lady Gaga</a>
## [7] <a href="/name/nm0005266/?ref_=adv_li_st_0">Craig T. Nelson</a>
## [8] <a href="/name/nm2623492/?ref_=adv_li_st_0">Hera Hilmar</a>
## [9] <a href="/name/nm0205626/?ref_=adv_li_st_0">Viola Davis</a>
## [10] <a href="/name/nm5473782/?ref_=adv_li_st_0">Taron Egerton</a>
## [11] <a href="/name/nm8611957/?ref_=adv_li_st_0">Yalitza Aparicio</a>
## [12] <a href="/name/nm0000288/?ref_=adv_li_st_0">Christian Bale</a>
## [13] <a href="/name/nm4271336/?ref_=adv_li_st_0">Shameik Moore</a>
## [14] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
## [15] <a href="/name/nm0913475/?ref_=adv_li_st_0">John David Washington</a>
## [16] <a href="/name/nm0000113/?ref_=adv_li_st_0">Sandra Bullock</a>
## [17] <a href="/name/nm0565250/?ref_=adv_li_st_0">Melissa McCarthy</a>
## [18] <a href="/name/nm1631269/?ref_=adv_li_st_0">Chloë Grace Moretz</a>
## [19] <a href="/name/nm0748620/?ref_=adv_li_st_0">Paul Rudd</a>
## [20] <a href="/name/nm1154749/?ref_=adv_li_st_0">Sofia Boutella</a>
## ...
# Converting the gross actors data to text
actors_data <- html_text(actors_data_html)
# Let's have a look at the actors data
head(actors_data)
## [1] "Rami Malek" "Jovan Adepo" "Viggo Mortensen" "Jason Momoa"
## [5] "Olivia Colman" "Lady Gaga"
# Data-Preprocessing: converting actors data into factors
(actors_data <- as.factor(actors_data))
## [1] Rami Malek Jovan Adepo Viggo Mortensen
## [4] Jason Momoa Olivia Colman Lady Gaga
## [7] Craig T. Nelson Hera Hilmar Viola Davis
## [10] Taron Egerton Yalitza Aparicio Christian Bale
## [13] Shameik Moore Robert Downey Jr. John David Washington
## [16] Sandra Bullock Melissa McCarthy Chloë Grace Moretz
## [19] Paul Rudd Sofia Boutella Bradley Cooper
## [22] Sam Elliott Ryan Gosling Chadwick Boseman
## [25] Gerard Butler Claire Foy Tom Hardy
## [28] Hailee Steinfeld Jeff Bridges Tim Blake Nelson
## [31] Saoirse Ronan Kyrie Irving Alden Ehrenreich
## [34] Masako Nozawa Emily Blunt Eddie Redmayne
## [37] Tye Sheridan Mads Mikkelsen Anna Kendrick
## [40] Rosamund Pike Ryan Reynolds Hugh Jackman
## [43] Benedict Cumberbatch Lucas Hedges Willem Dafoe
## [46] Penélope Cruz Jeremy Renner Sophie Thatcher
## [49] Emily Blunt Steve Carell Natalie Portman
## [52] Constance Wu Joanna Kulig Tom Cruise
## [55] KiKi Layne Toni Collette Mackenzie Foy
## [58] Mark Wahlberg Jamie Lee Curtis Ah-in Yoo
## [61] John C. Reilly Sandra Bullock John C. Reilly
## [64] Michael B. Jordan Boyd Holbrook Adam Sandler
## [67] Alice Eve Shirley Henderson Amandla Stenberg
## [70] Felicity Jones Nicole Kidman Logan Marshall-Green
## [73] Jennifer Lawrence Jakob Cedergren Matt Dillon
## [76] Bryan Cranston John Cho Robert Redford
## [79] Tiffany Haddish Jason Bateman Zain Al Rafeea
## [82] Benicio Del Toro Danielle Macdonald Jason Statham
## [85] Fionn Whitehead Dwayne Johnson Demián Bichir
## [88] Alice Eve Lana Condor Asa Butterfield
## [91] Chris Pratt Hayden Christensen Yash
## [94] Elsie Fisher Nick Robinson Mammootty
## [97] Robert Pattinson Mark Wahlberg Christian Bale
## [100] Nicolas Cage
## 94 Levels: Adam Sandler Ah-in Yoo Alden Ehrenreich ... Zain Al Rafeea
Be careful with missing data.
# Using CSS selectors to scrap the metascore section
metascore_data_html <- html_nodes(webpage, '.metascore')
# Converting the runtime data to text
metascore_data <- html_text(metascore_data_html)
# Let's have a look at the metascore
head(metascore_data)
## [1] "49 " "60 " "69 " "55 " "90 "
## [6] "88 "
# Data-Preprocessing: removing extra space in metascore
metascore_data <- str_replace(metascore_data, "\\s*$", "")
metascore_data <- as.numeric(metascore_data)
metascore_data
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 69 78 67
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 81
## [70] 60 62 67 53 83 42 82 71 80 39 66 75 61 53 46 45 46 45 64 43 51 28 90
## [93] 72 81 38 51 81
# Lets check the length of metascore data
length(metascore_data)
## [1] 97
# Visual inspection finds 69, 74, 87 don't have metascore
ms <- rep(NA, 100)
ms[-c(85, 93, 96)] <- metascore_data
(metascore_data <- ms)
## [1] 49 60 69 55 90 88 80 44 84 32 96 61 87 68 83 51 87 64 70 83 58 50 84
## [24] 88 43 43 35 66 60 79 60 57 62 59 82 52 64 71 67 75 66 61 51 69 78 67
## [47] 56 68 66 62 79 74 90 86 87 87 39 57 67 90 71 61 78 66 48 54 19 75 81
## [70] 60 62 67 53 83 42 82 71 80 39 66 75 61 53 46 NA 45 46 45 64 43 51 28
## [93] NA 90 72 NA 81 38 51 81
Be careful with missing data.
# Using CSS selectors to scrap the gross revenue section
gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')
# Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)
# Let's have a look at the votes data
head(gross_data)
## [1] "$210.79M" "$21.70M" "$62.10M" "$329.00M" "$30.38M" "$208.85M"
# Data-Preprocessing: removing '$' and 'M' signs
gross_data <- str_replace(gross_data, "M", "")
gross_data <- str_sub(gross_data, 2, 10)
#(gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
gross_data <- as.numeric(gross_data)
# Let's check the length of gross data
length(gross_data)
## [1] 85
# Visual inspection finds below movies don't have gross
gs_data <- rep(NA, 100)
gs_data[-c(6, 12, 29, 40, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
(gross_data <- gs_data)
## [1] 210.79 21.70 62.10 329.00 30.38 NA 208.85 608.58 15.95 42.39
## [11] 30.82 NA 45.36 180.30 678.82 48.69 8.56 2.47 216.65 102.70
## [21] 44.94 700.06 15.77 14.84 213.52 125.93 17.84 16.47 NA 42.47
## [31] 213.77 30.38 188.02 159.45 137.69 0.17 53.54 1.63 324.59 NA
## [41] 2.00 270.60 6.79 2.29 0.08 54.55 169.91 7.65 32.73 174.53
## [51] 2.90 220.16 13.83 44.07 54.86 67.36 159.34 0.70 197.62 139.38
## [61] NA 3.14 115.62 51.02 167.51 4.04 4.30 29.72 NA 23.90
## [71] NA 1.48 11.87 NA 46.87 0.21 0.09 NA 32.02 26.02
## [81] 11.28 NA 31.71 NA NA NA NA 69.00 0.74 NA
## [91] 50.07 145.44 99.35 117.44 417.72 0.99 13.54 40.83 36.11 1.21
Following code programatically figures out missing entries for metascore.
# Use CSS selectors to scrap the rankings section
(rank_metascore_data_html <- html_nodes(webpage, '.unfavorable , .favorable , .mixed , .text-primary'))
## {xml_nodeset (197)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="metascore mixed">49 </span>
## [3] <span class="lister-item-index unbold text-primary">2.</span>
## [4] <span class="metascore mixed">60 </span>
## [5] <span class="lister-item-index unbold text-primary">3.</span>
## [6] <span class="metascore favorable">69 </span>
## [7] <span class="lister-item-index unbold text-primary">4.</span>
## [8] <span class="metascore mixed">55 </span>
## [9] <span class="lister-item-index unbold text-primary">5.</span>
## [10] <span class="metascore favorable">90 </span>
## [11] <span class="lister-item-index unbold text-primary">6.</span>
## [12] <span class="metascore favorable">88 </span>
## [13] <span class="lister-item-index unbold text-primary">7.</span>
## [14] <span class="metascore favorable">80 </span>
## [15] <span class="lister-item-index unbold text-primary">8.</span>
## [16] <span class="metascore mixed">44 </span>
## [17] <span class="lister-item-index unbold text-primary">9.</span>
## [18] <span class="metascore favorable">84 </span>
## [19] <span class="lister-item-index unbold text-primary">10.</span>
## [20] <span class="metascore unfavorable">32 </span>
## ...
# Convert the ranking data to text
(rank_metascore_data <- html_text(rank_metascore_data_html))
## [1] "1." "49 " "2." "60 " "3."
## [6] "69 " "4." "55 " "5." "90 "
## [11] "6." "88 " "7." "80 " "8."
## [16] "44 " "9." "84 " "10." "32 "
## [21] "11." "96 " "12." "61 " "13."
## [26] "87 " "14." "68 " "15." "83 "
## [31] "16." "51 " "17." "87 " "18."
## [36] "64 " "19." "70 " "20." "83 "
## [41] "21." "58 " "22." "50 " "23."
## [46] "84 " "24." "88 " "25." "43 "
## [51] "26." "43 " "27." "35 " "28."
## [56] "66 " "29." "60 " "30." "79 "
## [61] "31." "60 " "32." "57 " "33."
## [66] "62 " "34." "59 " "35." "82 "
## [71] "36." "52 " "37." "64 " "38."
## [76] "71 " "39." "67 " "40." "75 "
## [81] "41." "66 " "42." "61 " "43."
## [86] "51 " "44." "69 " "45." "78 "
## [91] "46." "67 " "47." "56 " "48."
## [96] "68 " "49." "66 " "50." "62 "
## [101] "51." "79 " "52." "74 " "53."
## [106] "90 " "54." "86 " "55." "87 "
## [111] "56." "87 " "57." "39 " "58."
## [116] "57 " "59." "67 " "60." "90 "
## [121] "61." "71 " "62." "61 " "63."
## [126] "78 " "64." "66 " "65." "48 "
## [131] "66." "54 " "67." "19 " "68."
## [136] "75 " "69." "81 " "70." "60 "
## [141] "71." "62 " "72." "67 " "73."
## [146] "53 " "74." "83 " "75." "42 "
## [151] "76." "82 " "77." "71 " "78."
## [156] "80 " "79." "39 " "80." "66 "
## [161] "81." "75 " "82." "61 " "83."
## [166] "53 " "84." "46 " "85." "86."
## [171] "45 " "87." "46 " "88." "45 "
## [176] "89." "64 " "90." "43 " "91."
## [181] "51 " "92." "28 " "93." "94."
## [186] "90 " "95." "72 " "96." "97."
## [191] "81 " "98." "38 " "99." "51 "
## [196] "100." "81 "
# Strip spaces
(rank_metascore_data <- str_replace(rank_metascore_data, "\\s+", ""))
## [1] "1." "49" "2." "60" "3." "69" "4." "55" "5." "90"
## [11] "6." "88" "7." "80" "8." "44" "9." "84" "10." "32"
## [21] "11." "96" "12." "61" "13." "87" "14." "68" "15." "83"
## [31] "16." "51" "17." "87" "18." "64" "19." "70" "20." "83"
## [41] "21." "58" "22." "50" "23." "84" "24." "88" "25." "43"
## [51] "26." "43" "27." "35" "28." "66" "29." "60" "30." "79"
## [61] "31." "60" "32." "57" "33." "62" "34." "59" "35." "82"
## [71] "36." "52" "37." "64" "38." "71" "39." "67" "40." "75"
## [81] "41." "66" "42." "61" "43." "51" "44." "69" "45." "78"
## [91] "46." "67" "47." "56" "48." "68" "49." "66" "50." "62"
## [101] "51." "79" "52." "74" "53." "90" "54." "86" "55." "87"
## [111] "56." "87" "57." "39" "58." "57" "59." "67" "60." "90"
## [121] "61." "71" "62." "61" "63." "78" "64." "66" "65." "48"
## [131] "66." "54" "67." "19" "68." "75" "69." "81" "70." "60"
## [141] "71." "62" "72." "67" "73." "53" "74." "83" "75." "42"
## [151] "76." "82" "77." "71" "78." "80" "79." "39" "80." "66"
## [161] "81." "75" "82." "61" "83." "53" "84." "46" "85." "86."
## [171] "45" "87." "46" "88." "45" "89." "64" "90." "43" "91."
## [181] "51" "92." "28" "93." "94." "90" "95." "72" "96." "97."
## [191] "81" "98." "38" "99." "51" "100." "81"
# a rank followed by another rank means the metascore for the 1st rank is missing
(isrank <- str_detect(rank_metascore_data, "\\.$"))
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [12] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [34] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [45] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [56] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [67] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [78] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [89] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [100] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [111] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [122] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [133] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [144] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [155] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [166] FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [177] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [188] FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
(ismissing <- isrank[1:length(rank_metascore_data)-1] &
isrank[2:length(rank_metascore_data)])
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [188] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
(missingpos <- as.integer(rank_metascore_data[ismissing]))
## [1] 85 93 96
#(rank_metascore_data <- as.integer(rank_metascore_data))
You (students) should work out the code for finding missing positions for gross.
Form a tibble:
# Combining all the lists to form a data frame
movies <- tibble(Rank = rank_data, Title = title_data,
Description = description_data, Runtime = runtime_data,
Genre = genre_data, Rating = rating_data,
Metascore = metascore_data, Votes = votes_data,
Gross_Earning_in_Mil = gross_data,
Director = directors_data, Actor = actors_data)
movies %>% print(width=Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 Bohemian Rhapsody
## 2 2 Overlord
## 3 3 Green Book
## 4 4 Aquaman
## 5 5 The Favourite
## 6 6 A Star Is Born
## 7 7 Incredibles 2
## 8 8 Mortal Engines
## 9 9 Widows
## 10 10 Robin Hood
## Description
## <chr>
## 1 The story of the legendary rock band Queen and lead singer Freddie Merc…
## 2 A small group of American soldiers find horror behind enemy lines on th…
## 3 A working-class Italian-American bouncer becomes the driver of an Afric…
## 4 Arthur Curry, the human-born heir to the underwater kingdom of Atlantis…
## 5 In early 18th century England, a frail Queen Anne occupies the throne a…
## 6 A musician helps a young singer find fame, even as age and alcoholism s…
## 7 The Incredibles hero family takes on a new mission, which involves a ch…
## 8 In a post-apocalyptic world where cities ride on wheels and consume eac…
## 9 Set in contemporary Chicago, amid a time of turmoil, four women with no…
## 10 A war-hardened Crusader and his Moorish commander mount an audacious re…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 134 Biography 8.2 49 246470 211.
## 2 110 Action 6.9 60 30977 21.7
## 3 130 Biography 8.3 69 62443 62.1
## 4 143 Action 7.4 55 164708 329
## 5 119 Biography 7.8 90 53107 30.4
## 6 136 Drama 7.9 88 177646 NA
## 7 118 Animation 7.8 80 175775 209.
## 8 128 Action 6.2 44 38561 609.
## 9 129 Crime 7.1 84 42185 16.0
## 10 116 Action 5.3 32 24190 42.4
## Director Actor
## <fct> <fct>
## 1 Bryan Singer Rami Malek
## 2 Julius Avery Jovan Adepo
## 3 Peter Farrelly Viggo Mortensen
## 4 James Wan Jason Momoa
## 5 Yorgos Lanthimos Olivia Colman
## 6 Bradley Cooper Lady Gaga
## 7 Brad Bird Craig T. Nelson
## 8 Christian Rivers Hera Hilmar
## 9 Steve McQueen Viola Davis
## 10 Otto Bathurst Taron Egerton
## # … with 90 more rows
How many top 100 movies are in each genre?
ggplot(movies) +
geom_bar(mapping = aes(x = Genre))
Which genre is most profitable in terms of average gross earnings?
(earn_by_genre <- movies %>%
group_by(Genre) %>%
summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm=TRUE)))
## # A tibble: 10 x 2
## Genre avg_earning
## <chr> <dbl>
## 1 Action 103.
## 2 Adventure 147.
## 3 Animation 70.8
## 4 Biography 132.
## 5 Comedy 130.
## 6 Crime 43.4
## 7 Drama 45.5
## 8 Fantasy 2.47
## 9 Horror 198.
## 10 Sci 4.3
ggplot(data = earn_by_genre) +
geom_col(mapping = aes(x = Genre, y = avg_earning)) +
labs(y = "avg earning in millions")
ggplot(data = movies) +
geom_boxplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
labs(y = "Gross earning in millions")
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? Find the best selling movie (by gross earning) in each genre
library("ggrepel")
(best_in_genre <- movies %>%
group_by(Genre) %>%
filter(row_number(desc(Gross_Earning_in_Mil)) == 1))
## # A tibble: 10 x 11
## # Groups: Genre [10]
## Rank Title Description Runtime Genre Rating Metascore Votes
## <int> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 7 Incr… The Incred… 118 Anim… 7.8 80 175775
## 2 8 Mort… In a post-… 128 Acti… 6.2 44 38561
## 3 15 Blac… Ron Stallw… 135 Biog… 7.5 83 105254
## 4 18 Susp… A darkness… 152 Fant… 7 64 23891
## 5 22 The … A legendar… 98 Adve… 5.6 50 1478
## 6 55 If B… A woman in… 119 Crime 7.6 87 9922
## 7 59 Hall… Laurie Str… 106 Horr… 6.7 67 75263
## 8 60 Burn… Jong-su bu… 148 Drama 7.7 90 13687
## 9 67 Repl… A scientis… 107 Sci 5.4 19 9681
## 10 95 Love… Simon Spie… 110 Come… 7.7 72 74886
## # … with 3 more variables: Gross_Earning_in_Mil <dbl>, Director <fct>,
## # Actor <fct>
ggplot(movies, mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
geom_point(mapping = aes(size = Votes, color = Genre)) +
ggrepel::geom_label_repel(aes(label = Title), data = best_in_genre) +
labs(y = "Gross earning in millions")
## Warning: Removed 15 rows containing missing values (geom_point).
searchTerm <- "ucla"
# tbm=isch (images), app (apps), bks (books), nws (news), pts (patents), vid (videos)
# <https://stenevang.wordpress.com/2013/02/22/google-advanced-power-search-url-request-parameters/>
(url <- paste0("https://www.google.com/search?q=", searchTerm,
"&source=lnms&tbm=isch&sa=X&tbs=isz:m"))
## [1] "https://www.google.com/search?q=ucla&source=lnms&tbm=isch&sa=X&tbs=isz:m"
webpage <- read_html(url)
(imageurl <- webpage %>% html_nodes("img") %>% html_attr("src"))
## [1] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPAHGngfC90w8TVGwxDtF7EOfHRruNfeHqVicOCZHfcFZ28aN9AGMPOrE7"
## [2] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRi4lynSHbXZ4Iw8g2dqSWIHUbwYlVAnCG8JmoJk0m5TDqv7u1A4DZXIXo"
## [3] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQgDqKCP2Uj7dmIqDKLCzwnO2Nxe4NKkxqi7yzULvq5yIvx9AM5Let8VSfF"
## [4] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS7PJWOnMu2-F2ZOcKYLiHF6EJM4ddYQas6W7_nEOqv4kmyo5ANyJK-wy1g"
## [5] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIwWQ2gn7Ae8lSfeTuK5QGLPv4v6O1uxD_RrVuDElsHEXOZQiPvIyEcWY"
## [6] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSuyXbGus89EayJHMdhKKrUcbPlPgwGSajIZzlWqrM8MVaHZYg6_2QLSe8"
## [7] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQE3k_c0rFd-J7mOLS19TZ4crq6Bx9vXI3gNGPklgoVArW3skuai1ljggub"
## [8] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTI-oSi_FSFDeqOxkX1sse5D2Q-yfDKE28MMq3lOh1B9LckMTTYi-JxJIC8"
## [9] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR_dm3YNqqvT8HV_c6-sYISjWDHpiEG3ivlntQAGkbub2gItlOcgFgSnrU5"
## [10] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQsP-A6JtaJjfe33HsF5Ng7yRa4HRalvfCBNjERoFxWhvFL_ALJ3UcYtCno"
## [11] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSi7wafHAE8gTQM7LWdMDi49sfwV9bp4n2-l6MdJ3pODMLa-z83zQZ4j2FY"
## [12] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcuXKyr-IO6FRGq4mKUqfaZMmVUKdqOFGkC74VFxLv3atIlywjhMvLay5Z"
## [13] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQtayay8IIEhDm5NyjKj_1Jir0j1YKdHHcUoUJaI0MYV5t7WcK7iK0e_bfN"
## [14] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTrkSgBdNkT44_cRvZvyqfAk-M-Y30qxLiiGD_KkqSIBEpuoSLPkI66YQQ2"
## [15] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSazvWH_KTumPPOEFTfJbYNt5KeCPCCoIAqi38Rxxh25D1vBeviJKgFTi8"
## [16] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSEIYZP7caAsQ8_KHJBDCeMCcNK62f4a3F6tbQMYU4e8PXliQnK6mhPhNY"
## [17] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRwAHVp8uKCO6RN_QPvUZd87r_qGz1gB6ywhSG6OmpEWihlV8OZloNQW-EzBw"
## [18] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQtr10_NL7Ko621GJPtfMkvky-r7PQN85p-7Y5YZ55oQsAX9GBYUEty5Q"
## [19] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS9BgYpoLO-hIH9QC1GUIhhPwy0T6XQe-WiTPJ18UbWhny6SikDHcCFHtz7"
## [20] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT6Vr40BZfh0xMbWCgMRLlHZgPIuZtrEsGMM4FyGu3kEcGLrauySTKCjZ2_"
downloadImages <- function(files, brand, outPath="images"){
for(i in 1:length(files)){
download.file(files[i], destfile = paste0(outPath, "/", brand, "_", i, ".jpg"), mode = 'wb')
}
}
downloadImages(imageurl, "ucla")
ls images/
## ucla_1.jpg
## ucla_10.jpg
## ucla_11.jpg
## ucla_12.jpg
## ucla_13.jpg
## ucla_14.jpg
## ucla_15.jpg
## ucla_16.jpg
## ucla_17.jpg
## ucla_18.jpg
## ucla_19.jpg
## ucla_2.jpg
## ucla_20.jpg
## ucla_3.jpg
## ucla_4.jpg
## ucla_5.jpg
## ucla_6.jpg
## ucla_7.jpg
## ucla_8.jpg
## ucla_9.jpg
quantmod
package contains many utility functions for retrieving and plotting finance data. E.g.,
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
stock <- getSymbols("AAPL", src = "yahoo", auto.assign = FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
head(stock)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03 12.32714 12.36857 11.70000 11.97143 309579900
## 2007-01-04 12.00714 12.27857 11.97429 12.23714 211815100
## 2007-01-05 12.25286 12.31428 12.05714 12.15000 208685400
## 2007-01-08 12.28000 12.36143 12.18286 12.21000 199276700
## 2007-01-09 12.35000 13.28286 12.16429 13.22429 837324600
## 2007-01-10 13.53571 13.97143 13.35000 13.85714 738220000
## AAPL.Adjusted
## 2007-01-03 7.982585
## 2007-01-04 8.159763
## 2007-01-05 8.101658
## 2007-01-08 8.141665
## 2007-01-09 8.817995
## 2007-01-10 9.239983
chartSeries(stock, theme = chartTheme("white"),
type = "line", log.scale = FALSE, TA = NULL)