I am having issues returning a value from my function

Hi Community,

I am trying to scrape the executive summaries from the CBN files with a function and rvest, but I am having issues getting my function to return anything.
Below is my code:

pacman::p_load(
  # Data Wrangling
  tidyverse, lubridate, magrittr,
  
  # Web scraping
  rvest, xopen,
  
  # Text data mining
  readtext, tidytext,
  quanteda, textclean
)
search_pages <- c("https://www.cbn.gov.ng/Documents/quarterlyecoreports.aspbeginrec=1&endrec=20&keyword=&from=&tod= ", "https://www.cbn.gov.ng/Documents/quarterlyecoreports.aspbeginrec=21&endrec=40&keyword=&from=&tod= ") %>% tibble(page = .) %>%
print()
get_links <- function(page){
  page <- search_pages %>% read_html()
  page_tbl <- tibble(
    title = page %>% 
      html_nodes('.dbasetable a') %>% 
      html_text2() %>% 
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)") %>% 
      str_squish(),
    date = page %>% 
      html_nodes('#publishedDt') %>% #
      html_text2() %>% 
      str_squish() %>% 
      str_replace("Published ", "") %>% 
      str_extract("\\d+/\\d+/\\d+") %>% 
      mdy() %>% 
      format(., format = "%Y%m%d"),
    links = page %>%
      html_nodes('.dbasetable a') %>%
      html_attr("href") %>% 
      str_replace("^(\\.\\.)", "") %>% 
      str_c("https://www.cbn.gov.ng", .)
  )
  return(links)
}

Thank you!


You have probably mistakenly returned links instead of the object that contains the links; it's called page_tbl.
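
In code, the one-line fix inside get_links() is:

  # return the tibble that was built, not the bare name `links`
  return(page_tbl)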

@nirgrahamuk is right (as he always is), but there are some other little issues.
The 'difficult' part of the code, however, was okay.
I think the version below does the trick. If not, tell us where it fails.

library(tibble)
library(rvest) 
library(stringr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(purrr)


search_pages <- c("https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?&beginrec=1&endrec=20&keyword=&from=&tod= ", 
                  "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?&beginrec=21&endrec=40&keyword=&from=&tod= ") %>% 
     # tibble(page = .) %>%
      print()
#> [1] "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?&beginrec=1&endrec=20&keyword=&from=&tod= " 
#> [2] "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?&beginrec=21&endrec=40&keyword=&from=&tod= "
get_links <- function(page_url) {
  page <- read_html(page_url)
  page_tbl <- tibble(
    title = page %>%
      html_nodes('.dbasetable a') %>%
      html_text2() %>%
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)"
      ) %>%
      str_squish(),
    date = page %>%
      html_nodes('#publishedDt') %>% #
      html_text2() %>%
      str_squish() %>%
      str_replace("Published ", "") %>%
      str_extract("\\d+/\\d+/\\d+") %>%
      mdy() %>%
      format(., format = "%Y%m%d"),
    links = page %>%
      html_nodes('.dbasetable a') %>%
      html_attr("href") %>%
      str_replace("^(\\.\\.)", "") %>%
      str_c("https://www.cbn.gov.ng", .)
  )
  return(page_tbl)
}

my_cbn_links <- purrr::map_dfr(search_pages,get_links)
dim(my_cbn_links)
#> [1] 60  3
head(my_cbn_links)
#> # A tibble: 6 × 3
#>   title               date     links                                            
#>   <chr>               <chr>    <chr>                                            
#> 1 Third Quarter 2021  20221201 https://www.cbn.gov.ng/Out/2022/RSD/2021Q3 ECR.p…
#> 2 Second Quarter 2022 20220630 https://www.cbn.gov.ng/Out/2022/RSD/Second Quart…
#> 3 First Quarter 2022  20220331 https://www.cbn.gov.ng/Out/2022/RSD/2022Q1_poste…
#> 4 Fourth Quarter 2021 20211231 https://www.cbn.gov.ng/Out/2022/RSD/ECR 2021Q4.p…
#> 5 Fourth Quarter 2020 20201231 https://www.cbn.gov.ng/Out/2021/RSD/Fourth Quart…
#> 6 Third Quarter 2020  20200930 https://www.cbn.gov.ng/Out/2020/RSD/Third Quarte…
Created on 2022-12-01 with reprex v2.0.2

Thanks so much @HanOostdijk. How do I go about scraping the Executive Summary from the PDFs?

I am glad it helped you.
Regarding getting the contents of PDFs, I got good results using the pdftools::pdf_data function.
In my example below I read the text of the 55 pages into the list pdf_data1, which has 55 entries.
Each entry is a data.frame with the text of one page.
Using purrr::imap_dfr I insert the page number and combine them into one data.frame with the text
of all the pages (although here I actually do only the first two).

Then you have to analyze the data.frame df1 to get the data you need. The 'Executive summary' starts on the sixth page, but these words occur earlier in the table of contents. So it is a little puzzle to determine which rows of df1 you have to select for the 'Executive summary' text: the text between 'EXECUTIVE SUMMARY' and '1.0 GLOBAL ECONOMIC DEVELOPMENTS'.

I preferred using pdftools::pdf_data, but you can also try whether pdftools::pdf_text gives good results (see the sketch after the example below).
If that is the case, it is probably easier to extract the data.

pdf1 <- "https://www.cbn.gov.ng/Out/2022/RSD/2021Q3 ECR.pdf"
pdf_data1 <- pdftools::pdf_data(URLencode(pdf1))
df1 <- purrr::imap_dfr(pdf_data1[1:2], function(x,i) cbind(data.frame(page=i),x))
head(df1)
#>   page width height   x   y space     text
#> 1    1    99     27  73 474  TRUE  CENTRAL
#> 2    1    60     27 178 474  TRUE     BANK
#> 3    1    30     27 244 474  TRUE       OF
#> 4    1    90     27 280 474 FALSE  NIGERIA
#> 5    1   124     27 113 505  TRUE ECONOMIC
#> 6    1    86     27 243 505 FALSE   REPORT
tail(df1)
#>     page width height   x   y space     text
#> 176    2    29     11 193 335  TRUE   P.M.B.
#> 177    2    19     11 225 335  TRUE     187,
#> 178    2    26     11 247 335  TRUE   Garki,
#> 179    2    28     11 276 335  TRUE   Abuja,
#> 180    2    35     11 307 335 FALSE Nigeria.
#> 181    2     2      9 304 733 FALSE        i
Created on 2022-12-01 with reprex v2.0.2
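
If you want to try the pdftools::pdf_text route instead, a minimal sketch could look like the code below. It is only a sketch: it assumes that 'EXECUTIVE SUMMARY' and '1.0 GLOBAL ECONOMIC DEVELOPMENTS' each occur once in the table of contents and once as the real heading in the body text, and I have not tested it against all the reports.

library(pdftools)
library(stringr)

pdf1 <- "https://www.cbn.gov.ng/Out/2022/RSD/2021Q3 ECR.pdf"

pages     <- pdf_text(URLencode(pdf1))      # one character string per page
full_text <- paste(pages, collapse = "\n")  # the whole report as one string

start_all <- str_locate_all(
  full_text, regex("EXECUTIVE SUMMARY", ignore_case = TRUE))[[1]]
end_all   <- str_locate_all(
  full_text, regex("1\\.0\\s+GLOBAL ECONOMIC DEVELOPMENTS", ignore_case = TRUE))[[1]]

# take the second occurrence of each heading: the first one sits in the table of contents
exec_summary <- str_squish(
  str_sub(full_text, start_all[2, "end"] + 1, end_all[2, "start"] - 1))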

Thanks so much @HanOostdijk, I really appreciate your time. Please can you have a look at the code below? I have written some functions for extracting the Executive Summary, but the code that defines the path and names for the downloaded PDF files is giving an error, so none of the code after it produces any output. Kindly help out. Thank you!

pacman::p_load(
  # Data Wrangling
  tidyverse, lubridate, magrittr,
  
  # Web scraping
  rvest, xopen,
  
  # Text data mining
  readtext, tidytext,
  quanteda, textclean
  
)
# Define the pages of interest
search_pages <- 
  c("https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=1&endrec=20&keyword=&from=&tod=",
    "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=21&endrec=40&keyword=&from=&tod=") %>% 
  tibble(page = .) %>% 
  print()
get_links <- function(page_url){
  
  # page <- search_pages %>% pull(page) %>% .[1] %>% read_html() 
  page <- page_url %>% read_html()
  
  # Create a table of extracted data
  page_tbl <- tibble(
    
    # Get Title
    title = page %>% 
      html_nodes('.dbasetable a') %>% 
      html_text2() %>% 
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)") %>% 
      str_squish(),
    
    # Get Published Date
    date = page %>% 
      html_nodes('#publishedDt') %>% #
      html_text2() %>% 
      str_squish() %>% 
      str_replace("Published ", "") %>% 
      str_extract("\\d+/\\d+/\\d+") %>% 
      mdy() %>% 
      format(., format = "%Y%m%d"),
    
    # Get the download links
    links = page %>%
      html_nodes('.dbasetable a') %>%
      html_attr("href") %>% 
      str_replace("^(\\.\\.)", "") %>% 
      str_c("https://www.cbn.gov.ng", .)
    
  ) 
  
  page_tbl <- page_tbl %>% 
    mutate(reference = case_when(
      str_detect(title, "First") ~ str_c("Q1-", parse_number(title)),
      str_detect(title, "Second") ~ str_c("Q2-", parse_number(title)),
      str_detect(title, "Third") ~ str_c("Q3-", parse_number(title)),
      str_detect(title, "Fourth") ~ str_c("Q4-", parse_number(title)),
      TRUE ~ title
    )) %>% 
    # mutate(reference = zoo::as.yearqtr(date) %>% format(., format = "Q%q-%Y")) %>% 
    relocate(reference, .before = "title") %>% 
    unite(reference, c("reference", "date"), sep = "-")
  
  # close all connections to avoid errors
  closeAllConnections() 
   return(page_tbl)
}
# Define the urls for downloading the files
pdf_urls <- page_tbl %>% 
  glue::glue_data("{links}")
# Define the paths and names for the downloaded pdf files
pdf_names <- page_tbl %>%
  glue::glue_data("data/cbn/{reference}.pdf")  # This is giving an error!
# This prevents the code from stopping when a download encounters an error
safe_download <- safely(~ download.file(.x , .y, mode = "wb")) #This has refused to download
# Download and save the pdf files
walk2(pdf_urls, pdf_names, safe_download)
# Check the number of downloaded files
list.files("data/cbn", "*.pdf") %>% length()
# Extracting the Executive Summary parts from the pdf files
# Load the pdf files into R
cbn_quarterly_report <- readtext(
  "data/cbn/*.pdf", 
  docvarsfrom = "filenames", 
  docvarnames = c("quarter", "year", "pubdate"),
  dvsep = "-",
  encoding = "UTF-8"
)
# Generate the Corpus collection of the documents
cbn_quarterly_corpus <- corpus(cbn_quarterly_report)

# remove file extension from document names
docnames(cbn_quarterly_corpus) <- str_remove(
  docnames(cbn_quarterly_corpus), 
  pattern = "(-\\d+).pdf")

docnames(cbn_quarterly_corpus)

head(docvars(cbn_quarterly_corpus), 5)

# Extract data from the corpus, convert it to dataframe for use in plotting
cbn_quarterly_summary_tbl <- summary(cbn_quarterly_corpus) %>% 
  as_tibble() %>% 
  print(n = 10)


# Convert the corpus to tibble
cbn_quarterly_report_tbl <- convert(
  x = cbn_quarterly_corpus, 
  to = "data.frame"
) %>% 
  mutate(pubdate = ymd(pubdate)) %>% 
  as_tibble() %>% 
  arrange(year) %>% 
  mutate(docgrp = case_when(
    str_detect(str_to_lower(text), "executive summary") ~ "A",
    TRUE ~ "B"
  )) %>% 
  unnest_tokens(output = text, input = text, token = "lines") %>% 
  mutate(text = str_squish(text)) %>% 
  print()

cbn_quarterly_report_tbl %>% glimpse()

# Split and clean the text by groups
cbn_quarterly_report_grpA <- cbn_quarterly_report_tbl %>%
  filter(str_detect(docgrp, "A")) %>% 
  group_by(doc_id, quarter, year, pubdate) %>%
  mutate(chapter_id = cumsum(
    str_detect(text, regex("^(executive summary)", ignore_case = TRUE)))) %>% 
  filter(chapter_id > 1) %>%
  mutate(chapter_id = cumsum(
    str_detect(
      text, 
      regex("^(1.0 global economic developments)", ignore_case = TRUE)))) %>% 
  filter(chapter_id < 1) %>%
  slice_head(n = -2) %>% 
  filter(!str_detect(text, "^\\d+\\s+\\|")) %>% 
  filter(!str_detect(text, "executive summary")) %>% 
  summarise(text = paste(text, collapse = ' '), .groups = "drop") %>% 
  print()

cbn_quarterly_report_grpB <- cbn_quarterly_report_tbl %>% 
  filter(str_detect(docgrp, "B")) %>% 
  group_by(doc_id, quarter, year, pubdate) %>%
  mutate(chapter_id = cumsum(
    str_detect(text, regex("^(1.0 summary)", ignore_case = TRUE)))) %>% 
  filter(chapter_id > 1) %>% 
  mutate(chapter_id = cumsum(
    str_detect(
      text, 
      regex("^(2.0 financial sector developments)", ignore_case = TRUE)))) %>% 
  filter(chapter_id < 1) %>%
  slice_head(n = -2) %>%
  filter(!str_detect(text, "^\\d+\\s+\\|")) %>% 
  filter(!str_detect(text, "1.0 summary")) %>% 
  summarise(text = paste(text, collapse = ' '), .groups = "drop") %>%
  print()

cbn_quarterly_report_clean <- bind_rows(
  cbn_quarterly_report_grpA, cbn_quarterly_report_grpB
) %>% 
  print()

Hello @Techzill,
I am impressed by the R-code you show us and I learned some new things from it.
So that is nice. However, checking your code line by line looks like work to me, so ...
Reading your post gave me the following 'thoughts':

  • You show us all your code, including the part that does not run because of the error you indicated.
    That has the benefit that the reader knows where you are heading, but it could also scare off readers who would otherwise have taken the time to read a small piece of code.
  • You don't show us the code as you actually ran it: you manually copied pieces of the code into this post.
    I think many readers (and certainly myself) would prefer that you use the RStudio reprex addin that comes with the package reprex.
    Looking at its output, the reader can be sure that he/she can recreate your problem without having to guess what exactly you did.
    In this case I have to guess that somewhere you executed
    page_tbl <- purrr::map_dfr(search_pages,get_links) or a similar statement (see the sketch after this list).
    Also, in case of errors the reader will have the same information as you have.
    Now you only tell us that it did not work.
    Another advantage of running reprex before closing your session is that you can be sure your code still works after editing and deleting functions and variables (but restarting your R environment and rerunning the code would do the same).
  • When you only need to extract the summaries, there is no need to download the original files.
    But maybe you need them for another reason.
  • I do not recognize the 'corpus' functions, so I could not help you with those anyway.
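
For clarity, the statement I am guessing at would look something like this (only the names come from your code; the exact call is my assumption):

  # get_links() builds page_tbl inside the function, but nothing outside the
  # function ever creates an object called page_tbl, so the later pdf_urls and
  # pdf_names steps have nothing to work with. Because search_pages is a tibble
  # here, we map over its page column.
  page_tbl <- purrr::map_dfr(search_pages$page, get_links)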

Thanks so much @HanOostdijk. Please can you tell me why the code below refuses to run when I use mutate? It returns an error that page_tbl can't be found: "no applicable method for 'mutate' applied to an object of class 'function'". Thank you!

pacman::p_load(
  # Data Wrangling
  tidyverse, lubridate, magrittr,
  
  # Web scraping
  rvest, xopen,
  
  # Text data mining
  readtext, tidytext,
  quanteda, textclean
  
)
search_pages <- 
  c("https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=1&endrec=20&keyword=&from=&tod=",
    "https://www.cbn.gov.ng/Documents/quarterlyecoreports.asp?beginrec=21&endrec=40&keyword=&from=&tod=") %>% 
  tibble(page = .) %>% 
  print()
# Create a function to grab the links
get_links <- function(page_url){
  
  # page 
  page <- page_url %>% read_html()
  
  # Create a table of extracted data
  page_tbl <- tibble(
    # Get Title
    title = page %>% 
      html_nodes('.dbasetable a') %>% 
      html_text2() %>% 
      str_remove_all(
        "(CBN )|(Economic Report)|(for )|(the )|(Published\\s\\d+/\\d+/\\d+)|(of)") %>% 
      str_squish(),
    
    # Get Published Date
    date = page %>% 
      html_nodes('#publishedDt') %>% #
      html_text2() %>% 
      str_squish() %>% 
      str_replace("Published ", "") %>% 
      str_extract("\\d+/\\d+/\\d+") %>% 
      mdy() %>% 
      format(., format = "%Y%m%d"),
    
    # Get the download links
    links = page %>%
      html_nodes('.dbasetable a') %>%
      html_attr("href") %>% 
      str_replace("^(\\.\\.)", "") %>% 
      str_c("https://www.cbn.gov.ng", .)
    
  ) 
  return(page_tbl)
  
}
page_tblA<- page_tbl  %>% 
  mutate(reference = case_when(
    str_detect(title, "First") ~ str_c("Q1-", parse_number(title)),
    str_detect(title, "Second") ~ str_c("Q2-", parse_number(title)),
    str_detect(title, "Third") ~ str_c("Q3-", parse_number(title)),
    str_detect(title, "Fourth") ~ str_c("Q4-", parse_number(title)),
    TRUE ~ title
  )) %>% 
  # mutate(reference = zoo::as.yearqtr(date) %>% format(., format = "Q%q-%Y")) %>% 
  relocate(reference, .before = "title") %>% 
  unite(reference, c("reference", "date"), sep = "-")

# close all connections to avoid errors
closeAllConnections()

Did you read my latest answer?
Especially the part where I guessed that you had created the page_tbl variable somewhere.
Now I wonder: did you?

There is no need for you to take my suggestions seriously. I fully understand that.

Yes, I read it, and I have run the code you sent, which worked fine. I was just trying to figure out where I had specified page_tbl before. Please don't be pissed.
