I used rvest to scrape fuel (petrol) prices in the Czech Republic and then made a geographic visualization (using tmap) showing the most expensive pumps (the line of red dots happens to follow the main Czech motorway).
# Initialization ----
library(rvest)
library(tmap)
library(tmaptools)
library(raster)
library(RCzechia) # set of shapefiles for the Czech Republic - devtools::install_github("jlacko/RCzechia")
library(stringr)
library(dplyr)
library(RColorBrewer)
# Map bounding box: start from the extent of `republika` and pad it
# vertically so that the title and the legend fit around the map
bbox <- extent(republika)
slot(bbox, "ymin") <- slot(bbox, "ymin") - 0.15
slot(bbox, "ymax") <- slot(bbox, "ymax") + 0.35
# Scraper inputs: base address of the listing (the page number is appended
# to it) and the empty data frame that collects the scraped rows
url <- "http://benzin.impuls.cz/benzin.aspx?strana="
frmBenzin <- data.frame()
# Scraping data ----
# Download listing pages 1..56 (page count is hard-coded), decode each one
# from windows-1250 and keep the first HTML table found on the page.
# The per-page tables are collected in a list and bound together once,
# instead of growing the data frame with rbind() on every iteration.
frmBenzin <- do.call(rbind, lapply(seq_len(56), function(i) {
  page <- read_html(paste0(url, i), encoding = "windows-1250")
  html_table(page)[[1]]
}))
# Cleaning data ----
# Drop column X1 (not used anywhere below) and give the remaining six
# columns working names
frmBenzin$X1 <- NULL
colnames(frmBenzin) <- c("nazev", "obec", "okres", "smes", "datum", "cena")
# Strip the currency suffix before converting the price to a number;
# (*UCP) makes \s Unicode-aware, which requires the PCRE engine (perl = TRUE)
frmBenzin$cena <- gsub("(*UCP)\\s*Kč", "", frmBenzin$cena, perl = TRUE)
frmBenzin$cena <- as.double(frmBenzin$cena)
# Dates are parsed as day. month. year (with spaces after the dots)
frmBenzin$datum <- as.Date(frmBenzin$datum, "%d. %m. %Y")
# Remove the "Hlavní město " prefix from the district name
frmBenzin$okres <- gsub("Hlavní město\\s", "", frmBenzin$okres)
# Keep only the part of the town field that precedes the first comma
frmBenzin$obec <- str_split(frmBenzin$obec, ",", simplify = TRUE)[, 1]
# Composite town/district key, later used to bind prices to the shapefile
frmBenzin$key <- paste(frmBenzin$obec, frmBenzin$okres, sep = "/")
# Data wrangling ----
# Average price per town key, computed from the "natural95" rows only
# (gasoline - diesel is left out)
frmBenzinKey <- frmBenzin %>%
  filter(smes == "natural95") %>%
  select(key, cena) %>%
  group_by(key) %>%
  summarise(cena = mean(cena))
# Town points from package RCzechia, given the same town/district key as
# the price table so the two can be bound together
obce <- obce_body
obce$key <- paste(obce$Obec, obce$Okres, sep = "/")
obce <- append_data(obce, frmBenzinKey, key.shp = "key", key.data = "key")
# Big cities are drawn as polygon outlines instead of points
vObce <- c("Praha", "Brno", "Plzeň", "Ostrava")
# Keep only towns that have a known price and are not one of the big cities
obce <- subset(obce, !is.na(obce$cena) & !(obce$Obec %in% vObce))
wrkObce <- obce_polygony[obce_polygony$Obec %in% vObce, ]
# Visualization at last... ----
nadpis <- "Oil price in the Czech Republic" # Chart title
leyenda <- "Natural 95" # Legend title
# Credits line carries the most recent date found in the scraped data;
# na.rm = TRUE keeps one unparsable date from turning the stamp into "NA"
endCredits <- paste(
  "data source: Ráádio Impuls (http://benzin.impuls.cz), scraped as of",
  format(max(frmBenzin$datum, na.rm = TRUE), "%d.%m.%Y")
)
# Layers: price-coloured town bubbles, then the country border, then the
# outlines of the big cities; white style with a "Kč"-suffixed legend
tmBenzin <- tm_shape(obce, bbox = bbox) +
  tm_bubbles(
    size = 1 / 15, col = "cena", alpha = 0.85, border.alpha = 0,
    showNA = FALSE, palette = "YlOrRd", title.col = leyenda
  ) +
  tm_shape(republika, bbox = bbox) +
  tm_borders("grey30", lwd = 1) +
  tm_shape(wrkObce) +
  tm_borders("grey30", lwd = 0.5) +
  tm_style_white(
    nadpis,
    frame = FALSE, fontfamily = "Roboto", title.size = 2,
    legend.text.size = 0.6, legend.title.size = 1.2,
    legend.format = list(
      text.separator = "-",
      # Whole-number prices followed by the currency symbol
      fun = function(x) paste0(formatC(x, digits = 0, format = "f"), " Kč")
    )
  ) +
  tm_credits(endCredits, position = c("RIGHT", "BOTTOM"), size = 0.6, col = "grey30")
print(tmBenzin)