Using Rvest and Function

Hi community

I am trying to scrape several pieces of information about the European Parliament deputies using a function and rvest. This is my code.


#' Scrape the contact links from a single profile page.
#'
#' @param infou URL (or path) of the profile page; passed to `read_html()`.
#' @return A one-row table with the character columns `Correos`,
#'   `Perfiles_Twitter`, `Perfiles_Youtube`, `Perfiles_Instagram`,
#'   `Perfiles_Facebook` and `Pagina_Web_Personal`. If anything fails while
#'   reading or parsing the page, a one-row data.frame with the same columns
#'   filled with `NA_character_` is returned and the error is reported with
#'   `message()`.
info_de_eurodiputados <- function(infou) {
  # Collapse the href of every node matching `selector` into one string;
  # paste() yields "" when no node matches.
  extraer_href <- function(pagina, selector) {
    pagina %>%
      html_nodes(selector) %>%
      html_attr("href") %>%
      paste(collapse = "")
  }

  tryCatch(
    {
      infoo <- read_html(infou)

      # Column names must be identical to the ones in the error fallback
      # below, otherwise row-binding the results creates extra columns.
      tibble(
        Correos = extraer_href(infoo, ".link_email .mr-2"),
        Perfiles_Twitter = extraer_href(infoo, ".link_twitt .mr-2"),
        Perfiles_Youtube = extraer_href(infoo, ".link_youtube .mr-2"),
        Perfiles_Instagram = extraer_href(infoo, ".link_instagram .mr-2"),
        Perfiles_Facebook = extraer_href(infoo, ".link_fb .mr-2"),
        Pagina_Web_Personal = extraer_href(infoo, ".link_website")
      )
    },
    error = function(e) {
      # Report which page failed instead of swallowing the error silently.
      message("Could not scrape ", infou, ": ", conditionMessage(e))
      # Typed NAs keep every column character, like the successful rows.
      data.frame(
        Correos = NA_character_,
        Perfiles_Twitter = NA_character_,
        Perfiles_Youtube = NA_character_,
        Perfiles_Instagram = NA_character_,
        Perfiles_Facebook = NA_character_,
        Pagina_Web_Personal = NA_character_,
        stringsAsFactors = FALSE
      )
    }
  )
}

# Scrape every profile URL, then stack the one-row results into one table.
result <- dplyr::bind_rows(
  purrr::map(url_europarlamentarios, info_de_eurodiputados)
)

My problem is that when I run the chunk it gets stuck loading.

When I stop the chunk I get this message

no loop for break/next, jumping to top level

Any advice?

Thanks

Checking my libraries, I realized that I have another problem

library(pdftools)
library(tidyverse)
library(readxl)
library(rvest)
library(magrittr)
library(purrr)
library(dplyr)
library(xml2)
library(tibble)
library(rebus)
library(lubridate)
library(stringr)
library(leaflet)
Warning message:
closing unused connection 3 (https://www.europarl.europa.eu/meps/es/197573)

You didn't provide url_europarlamentarios


#' Extract the link of every member entry on a listing page.
#'
#' @param eurodiputados URL of the listing page; passed to `read_html()`.
#' @return The `href` attribute of each node matching
#'   `.erpl_member-list-item-content`, as returned by `html_attr()`.
lista_de_eurodiputados <- function(eurodiputados) {
  pagina <- read_html(eurodiputados)
  entradas <- html_nodes(pagina, ".erpl_member-list-item-content")
  html_attr(entradas, "href")
}
# Collect the profile links found on the full member list page.
url_europarlamentarios <-
  "https://www.europarl.europa.eu/meps/es/full-list/all" %>%
  lista_de_eurodiputados()

OK, again your example contains over 700 entries, which to me is a lot.
I modified your code to at least show where it is in its execution. You should use this to zoom in on the relevant URLs of interest. I ran this up to about 70 and got impatient; everything seemed OK up to that point.
Perhaps run the whole thing and get back to us when you can reduce the URLs to a working one and a problematic one.

library(rvest)
library(tidyverse)
#' Scrape the contact links from a single profile page, printing progress.
#'
#' @param infou URL (or path) of the profile page; passed to `read_html()`.
#' @param urlnum Label printed with `cat()` before the page is fetched, so
#'   the caller can see which entry is being processed.
#' @return A one-row table with the character columns `Correos`,
#'   `Perfiles_Twitter`, `Perfiles_Youtube`, `Perfiles_Instagram`,
#'   `Perfiles_Facebook` and `Pagina_Web_Personal`. If anything fails while
#'   reading or parsing the page, a one-row data.frame with the same columns
#'   filled with `NA_character_` is returned and the error is reported with
#'   `message()`.
info_de_eurodiputados <- function(infou, urlnum) {
  cat("\n", urlnum)

  # Collapse the href of every node matching `selector` into one string;
  # paste() yields "" when no node matches.
  extraer_href <- function(pagina, selector) {
    pagina %>%
      html_nodes(selector) %>%
      html_attr("href") %>%
      paste(collapse = "")
  }

  tryCatch(
    {
      infoo <- read_html(infou)

      # Column names must be identical to the ones in the error fallback
      # below, otherwise row-binding the results creates extra columns.
      tibble(
        Correos = extraer_href(infoo, ".link_email .mr-2"),
        Perfiles_Twitter = extraer_href(infoo, ".link_twitt .mr-2"),
        Perfiles_Youtube = extraer_href(infoo, ".link_youtube .mr-2"),
        Perfiles_Instagram = extraer_href(infoo, ".link_instagram .mr-2"),
        Perfiles_Facebook = extraer_href(infoo, ".link_fb .mr-2"),
        Pagina_Web_Personal = extraer_href(infoo, ".link_website")
      )
    },
    error = function(e) {
      # Report which page failed instead of swallowing the error silently.
      message("Could not scrape ", infou, ": ", conditionMessage(e))
      # Typed NAs keep every column character, like the successful rows.
      data.frame(
        Correos = NA_character_,
        Perfiles_Twitter = NA_character_,
        Perfiles_Youtube = NA_character_,
        Perfiles_Instagram = NA_character_,
        Perfiles_Facebook = NA_character_,
        Pagina_Web_Personal = NA_character_,
        stringsAsFactors = FALSE
      )
    }
  )
}

#' Extract the link of every member entry on a listing page.
#'
#' @param eurodiputados URL of the listing page; passed to `read_html()`.
#' @return The `href` attribute of each node matching
#'   `.erpl_member-list-item-content`, as returned by `html_attr()`.
lista_de_eurodiputados <- function(eurodiputados) {
  documento <- read_html(eurodiputados)
  nodos <- html_nodes(documento, ".erpl_member-list-item-content")
  html_attr(nodos, "href")
}
# Collect the profile links found on the full member list page.
url_europarlamentarios <-
  "https://www.europarl.europa.eu/meps/es/full-list/all" %>%
  lista_de_eurodiputados()

# Scrape each profile and row-bind the results; imap_dfr() passes the
# element's name or position as the second argument, which the scraper prints.
result <- purrr::imap_dfr(
  url_europarlamentarios,
  function(enlace, posicion) info_de_eurodiputados(enlace, posicion)
)
1 Like

Thanks a lot, it works perfectly.

best regards

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.