I ran the code on a different computer and the problem remains.
As far as I can tell, the problem originates from using map_df(r) to map the function. If I use map and subsequently map_df(., bind_rows), there is no problem.
It seems that the (row-)binding which is - as far as I understand - 'included' in map_df(r) binds some data even if the mapped function produced a NULL. Evidently, there is no drawback to using map + map_df(., bind_rows), but I would be lying if I said I had fully understood why map_df produces this wrong result. In any case, this seems like quite dangerous behavior to me.
Below is a reprex which hopefully makes the problem clearer.
Many thanks again.
``` r
library(rvest)
#> Warning: Paket 'rvest' wurde unter R Version 3.5.3 erstellt
#> Lade nötiges Paket: xml2
library(tidyverse)
#> Warning: Paket 'ggplot2' wurde unter R Version 3.5.3 erstellt
#> Warning: Paket 'tibble' wurde unter R Version 3.5.3 erstellt
#> Warning: Paket 'tidyr' wurde unter R Version 3.5.3 erstellt
#> Warning: Paket 'readr' wurde unter R Version 3.5.2 erstellt
#> Warning: Paket 'purrr' wurde unter R Version 3.5.3 erstellt
#> Warning: Paket 'dplyr' wurde unter R Version 3.5.2 erstellt
#> Warning: Paket 'forcats' wurde unter R Version 3.5.2 erstellt
library(glue)
#> Warning: Paket 'glue' wurde unter R Version 3.5.3 erstellt
#>
#> Attache Paket: 'glue'
#> The following object is masked from 'package:dplyr':
#>
#> collapse
#here I create the vector; I know that category 358 has less than 5 pages;
# Categories and page numbers to scrape; every category is paired with
# every page number, pages varying fastest.
seq_categories <- c(350, 358, 366)
seq_pages <- seq(from = 1, to = 5, by = 1)
df_seq <- expand.grid(
  seq_pages = seq_pages,
  seq_categories = seq_categories
)

# Build one URL per (category, page) combination.
df_links <- mutate(
  df_seq,
  link = glue("http://www.ohr.int/?cat={seq_categories}&paged={seq_pages}")
)
#define function
# Scrape one listing page and return its entries as a tibble.
#
# link_input: URL of a single listing page.
#
# Returns a tibble with one row per entry found on the page and two character
# columns, `date.publish` and `decision.name`.
#
# read_html() signals an error when a page cannot be fetched (e.g. HTTP 404)
# rather than returning NA, so the parsed document is not NA-checked here.
# Callers that want to survive missing pages should wrap this function,
# e.g. with purrr::possibly().
#
# Side effects: ticks and prints the progress bar `pb` (looked up outside this
# function) and prints the URL being processed.
scr_bonn <- function(link_input) {
  pb$tick()$print()
  print(link_input)

  site <- read_html(link_input)

  date_publish <- site %>%
    html_nodes(".date-publish") %>%
    html_text() %>%
    enframe(name = NULL, value = "date.publish")

  decision_name <- site %>%
    html_nodes(".name") %>%
    html_text() %>%
    enframe(name = NULL, value = "decision.name")

  # Each tibble already carries its column name, so no argument names are
  # needed when placing them side by side.
  bind_cols(date_publish, decision_name)
}
# CORRECT RESULTS WITH MAP + MAP_DF ---------------------------------------
# One progress-bar tick per link.
pb <- progress_estimated(nrow(df_links))

# Scrape every link into a list named by URL; a failing page yields NULL
# (the error message is still printed because quiet = FALSE).
list_correct <- map(
  set_names(df_links$link),
  possibly(scr_bonn, otherwise = NULL, quiet = FALSE)
)
#> http://www.ohr.int/?cat=350&paged=1
#> http://www.ohr.int/?cat=350&paged=2
#> http://www.ohr.int/?cat=350&paged=3
#> http://www.ohr.int/?cat=350&paged=4
#> http://www.ohr.int/?cat=350&paged=5
#> http://www.ohr.int/?cat=358&paged=1
#> http://www.ohr.int/?cat=358&paged=2
#> http://www.ohr.int/?cat=358&paged=3
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=358&paged=4
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=358&paged=5
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=366&paged=1
#> http://www.ohr.int/?cat=366&paged=2
#> http://www.ohr.int/?cat=366&paged=3
#> http://www.ohr.int/?cat=366&paged=4
#> http://www.ohr.int/?cat=366&paged=5
#map_df(., possibly(scr_bonn, otherwise=NULL, quiet=FALSE), .id="link_to_page")
#bind tibbles in lists with bind_rows
# Row-bind the per-link tibbles, keeping the list names (the URLs) in
# `link_to_page`, then recover page and category numbers from each URL.
df_correct <- list_correct %>%
  map_df(., bind_rows, .id = "link_to_page") %>%
  mutate(
    page = as.numeric(str_extract(link_to_page, "(?<=paged\\=)[:digit:]+")),
    category = as.numeric(str_extract(link_to_page, "(?<=cat\\=)[:digit:]+"))
  )

# Category 358 has no pages beyond page 2, so this should be empty.
filter(df_correct, page > 2, category == "358")
#> # A tibble: 0 x 5
#> # ... with 5 variables: link_to_page <chr>, date.publish <chr>,
#> # decision.name <chr>, page <dbl>, category <dbl>
# WRONG RESULTS WHEN USING MAP_DF IMMEDIATELY -----------------------------
# Fresh progress bar for the second pass over the same links.
pb <- progress_estimated(nrow(df_links))
# Same scraper as above, but map_df() row-binds the per-link results in a
# single step, including the NULLs that possibly() substitutes for pages
# whose scrape raised an error.
# NOTE(review): the mislabelled rows shown below presumably come from how the
# underlying bind_rows() assigns `.id` values when the list contains several
# consecutive NULLs - confirm against the dplyr changelog / installed version.
df_false <- df_links$link %>%
set_names() %>%
map_df(., possibly(scr_bonn, otherwise=NULL, quiet=FALSE), .id="link_to_page") %>%
# Recover the page and category numbers from the URL stored in link_to_page.
mutate(page=str_extract(link_to_page, "(?<=paged\\=)[:digit:]+") %>% as.numeric) %>%
mutate(category=str_extract(link_to_page, "(?<=cat\\=)[:digit:]+") %>% as.numeric)
#> http://www.ohr.int/?cat=350&paged=1
#> http://www.ohr.int/?cat=350&paged=2
#> http://www.ohr.int/?cat=350&paged=3
#> http://www.ohr.int/?cat=350&paged=4
#> http://www.ohr.int/?cat=350&paged=5
#> http://www.ohr.int/?cat=358&paged=1
#> http://www.ohr.int/?cat=358&paged=2
#> http://www.ohr.int/?cat=358&paged=3
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=358&paged=4
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=358&paged=5
#> Error: HTTP error 404.
#> http://www.ohr.int/?cat=366&paged=1
#> http://www.ohr.int/?cat=366&paged=2
#> http://www.ohr.int/?cat=366&paged=3
#> http://www.ohr.int/?cat=366&paged=4
#> http://www.ohr.int/?cat=366&paged=5
# Rows are reported for links that actually have no data to extract,
# e.g. http://www.ohr.int/?cat=358&paged=4
wrong <- filter(df_false, page > 2, category == "358")
wrong
#> # A tibble: 12 x 5
#> link_to_page date.publish decision.name page category
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 http://www.ohr.in~ 01/05/2011 Order Suspending the App~ 4 358
#> 2 http://www.ohr.in~ 09/12/2009 OHR Inventory Team Estab~ 4 358
#> 3 http://www.ohr.in~ 06/25/2008 Decision Amending the La~ 4 358
#> 4 http://www.ohr.in~ 06/25/2008 Decision Amending the La~ 4 358
#> 5 http://www.ohr.in~ 06/25/2008 Decision Amending the La~ 4 358
#> 6 http://www.ohr.in~ 12/19/2007 Decision Amending the La~ 4 358
#> 7 http://www.ohr.in~ 12/19/2007 Decision Amending the La~ 4 358
#> 8 http://www.ohr.in~ 12/19/2007 Decision Amending the La~ 4 358
#> 9 http://www.ohr.in~ 09/28/2007 Decision Amending the La~ 4 358
#> 10 http://www.ohr.in~ 09/28/2007 Decision Amending the La~ 4 358
#> 11 http://www.ohr.in~ 09/28/2007 Decision Amending the La~ 4 358
#> 12 http://www.ohr.in~ 09/14/2007 Decision Withdrawing the~ 4 358
# Look up the wrongly assigned decisions in the correct results: they were
# actually retrieved from subsequent(!) links in the vector/loop.
wrong_in_correct_results <- filter(
  df_correct,
  decision.name %in% wrong$decision.name
)
select(wrong_in_correct_results, link_to_page, category, page)
#> # A tibble: 18 x 3
#> link_to_page category page
#> <chr> <dbl> <dbl>
#> 1 http://www.ohr.int/?cat=366&paged=1 366 1
#> 2 http://www.ohr.int/?cat=366&paged=1 366 1
#> 3 http://www.ohr.int/?cat=366&paged=1 366 1
#> 4 http://www.ohr.int/?cat=366&paged=1 366 1
#> 5 http://www.ohr.int/?cat=366&paged=1 366 1
#> 6 http://www.ohr.int/?cat=366&paged=1 366 1
#> 7 http://www.ohr.int/?cat=366&paged=1 366 1
#> 8 http://www.ohr.int/?cat=366&paged=1 366 1
#> 9 http://www.ohr.int/?cat=366&paged=1 366 1
#> 10 http://www.ohr.int/?cat=366&paged=1 366 1
#> 11 http://www.ohr.int/?cat=366&paged=1 366 1
#> 12 http://www.ohr.int/?cat=366&paged=1 366 1
#> 13 http://www.ohr.int/?cat=366&paged=2 366 2
#> 14 http://www.ohr.int/?cat=366&paged=2 366 2
#> 15 http://www.ohr.int/?cat=366&paged=2 366 2
#> 16 http://www.ohr.int/?cat=366&paged=2 366 2
#> 17 http://www.ohr.int/?cat=366&paged=2 366 2
#> 18 http://www.ohr.int/?cat=366&paged=2 366 2
Created on 2019-05-08 by the reprex package (v0.2.1)