Thanks so much @HanOostdijk, I really appreciate your time. Could you please have a look at the code below? I have written functions for extracting the Executive Summary, but the code that defines the paths and names for the downloaded PDF files throws an error, which prevents all the subsequent code from producing any output. Kindly help out. Thank you!
pacman::p_load(
# Data Wrangling
tidyverse, lubridate, magrittr,
# Web scraping
rvest, xopen,
# Text data mining
readtext, tidytext,
quanteda, textclean
)
# Define the pages of interest
search_pages <-
c("https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=1&endrec=20&keyword=&from=&tod=",
"https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=21&endrec=40&keyword=&from=&tod=") %>%
tibble(page = .) %>%
print()
#' Scrape one CBN quarterly-reports search page into a tibble of metadata.
#'
#' @param page_url URL of one quarterlyecoreports.asp search-results page.
#' @return A tibble with columns `reference` (e.g. "Q1-2021-20210324"),
#'   `title` (cleaned report title), and `links` (absolute pdf URL).
get_links <- function(page_url) {
  page <- page_url %>% read_html()
  # Create a table of extracted data
  page_tbl <- tibble(
    # Get Title: strip boilerplate words and the "Published d/m/y" suffix
    title = page %>%
      html_nodes(".dbasetable a") %>%
      html_text2() %>%
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)") %>%
      str_squish(),
    # Get Published Date, normalised to a "yyyymmdd" string so it sorts
    # correctly and can be embedded in file names
    date = page %>%
      html_nodes("#publishedDt") %>%
      html_text2() %>%
      str_squish() %>%
      str_replace("Published ", "") %>%
      str_extract("\\d+/\\d+/\\d+") %>%
      mdy() %>%
      format(., format = "%Y%m%d"),
    # Get the download links, turning relative "../..." hrefs into
    # absolute URLs on the CBN domain
    links = page %>%
      html_nodes(".dbasetable a") %>%
      html_attr("href") %>%
      str_replace("^(\\.\\.)", "") %>%
      str_c("https://www.cbn.gov.ng", .)
  )
  # Build a "Qn-YYYY-yyyymmdd" reference key from the title and date;
  # titles that name no quarter fall through unchanged
  page_tbl %>%
    mutate(reference = case_when(
      str_detect(title, "First") ~ str_c("Q1-", parse_number(title)),
      str_detect(title, "Second") ~ str_c("Q2-", parse_number(title)),
      str_detect(title, "Third") ~ str_c("Q3-", parse_number(title)),
      str_detect(title, "Fourth") ~ str_c("Q4-", parse_number(title)),
      TRUE ~ title
    )) %>%
    relocate(reference, .before = "title") %>%
    unite(reference, c("reference", "date"), sep = "-")
  # NOTE: the original called closeAllConnections() here. That is a global
  # side effect that closes EVERY open connection in the session (files,
  # sinks, sockets), not just this page's, so it has been removed;
  # read_html() does not leave a connection open.
}
# Build the links table by scraping every search page ----
# BUG FIX: `page_tbl` previously existed only inside get_links(); it was
# never created at top level, so the glue_data() calls below errored with
# "object 'page_tbl' not found". Map get_links() over the pages and bind
# the per-page tibbles into one table.
page_tbl <- search_pages %>%
  pull(page) %>%
  map_dfr(get_links)

# Define the urls for downloading the files
pdf_urls <- page_tbl %>%
  glue::glue_data("{links}")

# Make sure the target directory exists; download.file() will not create it
if (!dir.exists("data/cbn")) {
  dir.create("data/cbn", recursive = TRUE)
}

# Define the path and names for the downloaded pdf files
pdf_names <- page_tbl %>%
  glue::glue_data("data/cbn/{reference}.pdf")

# Wrap download.file() with safely() so one failed download does not stop
# the whole walk2() loop
safe_download <- safely(~ download.file(.x, .y, mode = "wb"))

# Download and save the pdf files
walk2(pdf_urls, pdf_names, safe_download)

# Check the number of downloaded files.
# BUG FIX: list.files() takes a regex, not a glob — "*.pdf" is not a valid
# glob here; "\\.pdf$" matches names ending in ".pdf".
list.files("data/cbn", "\\.pdf$") %>% length()
# Extracting the Executive Summary parts from the pdf files ----
# Load every downloaded pdf into a readtext object. Each filename is
# expected to look like "Qn-YYYY-yyyymmdd.pdf"; splitting it on "-" yields
# the quarter / year / pubdate document variables.
cbn_quarterly_report <- readtext(
"data/cbn/*.pdf",
docvarsfrom = "filenames",
docvarnames = c("quarter", "year", "pubdate"),
dvsep = "-",
encoding = "UTF-8"
)
# Generate the corpus collection of the documents ----
cbn_quarterly_corpus <- corpus(cbn_quarterly_report)
# Strip the trailing "-<pubdate>.pdf" suffix from every document name
clean_names <- str_remove(
  docnames(cbn_quarterly_corpus),
  pattern = "(-\\d+).pdf"
)
docnames(cbn_quarterly_corpus) <- clean_names
docnames(cbn_quarterly_corpus)
head(docvars(cbn_quarterly_corpus), 5)
# Summarise the corpus (tokens, types, sentences per document) as a tibble
# for later use in plotting
cbn_quarterly_summary_tbl <- as_tibble(summary(cbn_quarterly_corpus))
print(cbn_quarterly_summary_tbl, n = 10)
# Convert the corpus to a tibble, then split each document into one row
# per line of text ----
cbn_quarterly_report_tbl <- convert(x = cbn_quarterly_corpus, to = "data.frame") %>%
  as_tibble() %>%
  mutate(pubdate = ymd(pubdate)) %>%
  arrange(year) %>%
  # Group "A" documents contain an "Executive Summary" heading; everything
  # else falls into group "B" (handled separately further down)
  mutate(docgrp = if_else(
    str_detect(str_to_lower(text), "executive summary"),
    "A", "B",
    missing = "B"
  )) %>%
  unnest_tokens(output = text, input = text, token = "lines") %>%
  mutate(text = str_squish(text)) %>%
  print()
glimpse(cbn_quarterly_report_tbl)
# Split and clean the text by groups ----
# The grpA and grpB pipelines were identical except for three regexes, so
# the shared logic is factored into one helper.

#' Extract the summary section from one document group.
#'
#' @param tbl Line-level report tibble (one row per text line).
#' @param grp docgrp label to keep ("A" or "B").
#' @param start_rx Regex for the section's opening heading.
#' @param end_rx Regex for the heading of the following section.
#' @param drop_rx Regex for residual heading lines to discard.
#' @return One row per document with the section collapsed into `text`.
extract_summary <- function(tbl, grp, start_rx, end_rx, drop_rx) {
  tbl %>%
    filter(str_detect(docgrp, grp)) %>%
    group_by(doc_id, quarter, year, pubdate) %>%
    # Keep only lines after the SECOND occurrence of the opening heading
    # (the first occurrence is the table-of-contents entry)
    mutate(chapter_id = cumsum(
      str_detect(text, regex(start_rx, ignore_case = TRUE)))) %>%
    filter(chapter_id > 1) %>%
    # ...and before the first occurrence of the next section's heading
    mutate(chapter_id = cumsum(
      str_detect(text, regex(end_rx, ignore_case = TRUE)))) %>%
    filter(chapter_id < 1) %>%
    # Drop the last two lines of each group (trailing boilerplate)
    slice_head(n = -2) %>%
    # Drop page-footer lines like "12 |" and any residual heading lines
    filter(!str_detect(text, "^\\d+\\s+\\|")) %>%
    filter(!str_detect(text, drop_rx)) %>%
    summarise(text = paste(text, collapse = ' '), .groups = "drop")
}

cbn_quarterly_report_grpA <- cbn_quarterly_report_tbl %>%
  extract_summary(
    grp = "A",
    start_rx = "^(executive summary)",
    end_rx = "^(1.0 global economic developments)",
    drop_rx = "executive summary"
  ) %>%
  print()

cbn_quarterly_report_grpB <- cbn_quarterly_report_tbl %>%
  extract_summary(
    grp = "B",
    start_rx = "^(1.0 summary)",
    end_rx = "^(2.0 financial sector developments)",
    drop_rx = "1.0 summary"
  ) %>%
  print()
# Combine both document groups into the final cleaned table ----
cbn_quarterly_report_clean <- bind_rows(
  cbn_quarterly_report_grpA,
  cbn_quarterly_report_grpB
)
print(cbn_quarterly_report_clean)