Hi, I can't figure out the problem. Help is highly welcome!
What I want to do:
I want to compare two CSV files with SEO meta data to identify changes on a website over a week or a month, and create an XLSX report with those changes.
Problem: I don't get any error messages, but the XLSX file is never created.
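The reprex further down also shows that source("FUNS.R") fails, so maybe my working directory is simply wrong. A minimal check (paths taken from my script below):

## does R actually see the helper file and the two project folders?
getwd()
file.exists("FUNS.R")
dir.exists("/Users/u116346/Documents/R-Projekte/01_crawl_abgleiche/")
dir.exists("/Users/u116346/Documents/R-Projekte/02_sf_exports/")

My sessionInfo():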
R version 3.6.3 (2020-02-29)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Catalina 10.15.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] writexl_1.3.1 lubridate_1.7.9.2 slackr_1.4.2 xlsx_0.6.5
[5] reprex_0.3.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
[9] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.4
[13] ggplot2_3.3.2 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] xfun_0.19 tidyselect_1.1.0 rJava_0.9-13 haven_2.3.1
[5] colorspace_2.0-0 vctrs_0.3.5 generics_0.1.0 htmltools_0.5.0
[9] rlang_0.4.8 pillar_1.4.7 glue_1.4.2 withr_2.3.0
[13] DBI_1.1.0 dbplyr_2.0.0 splitstackshape_1.4.8 modelr_0.1.8
[17] readxl_1.3.1 lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0
[21] cellranger_1.1.0 rvest_0.3.6 evaluate_0.14 knitr_1.30
[25] callr_3.5.1 ps_1.4.0 fansi_0.4.1 xlsxjars_0.6.1
[29] broom_0.7.2 Rcpp_1.0.5 clipr_0.7.1 scales_1.1.1
[33] backports_1.2.0 jsonlite_1.7.1 fs_1.5.0 hms_0.5.3
[37] digest_0.6.27 stringi_1.5.3 processx_3.4.4 grid_3.6.3
[41] cli_2.1.0 tools_3.6.3 magrittr_2.0.1 whisker_0.4
[45] crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1 data.table_1.13.2
[49] xml2_1.3.2 assertthat_0.2.1 rmarkdown_2.5 httr_1.4.2
[53] rstudioapi_0.13 R6_2.5.0 compiler_3.6.3
Here are all files of the project (folders, CSVs) and also the two R files:
https://drive.google.com/drive/folders/1mYns7XjQYW8f8hR7zEAUSGGq2L0JQX5V?usp=sharing
I see in the console only:
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
    date, intersect, setdiff, union
Here is the full script, rendered with reprex:
# Libraries -------------------------------------------
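## must be set before library(xlsx) initialises rJava, otherwise the heap size option is ignored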
options(java.parameters = "-Xmx8g")
library(tidyverse)
library(reprex)
library(xlsx)
library(slackr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
source("FUNS.R")
#> Warning in file(filename, "r", encoding = encoding): cannot open file 'FUNS.R':
#> No such file or directory
#> Error in file(filename, "r", encoding = encoding): cannot open the connection
# VARS ------------------------------------------------
PATH_TO_XLSX_EXPORTS <- "/Users/u116346/Documents/R-Projekte/01_crawl_abgleiche/"
PATH_TO_SF_EXPORTS <- "/Users/u116346/Documents/R-Projekte/02_sf_exports/"
#TESTING_WEBSITE <- ""
#SLACK_API_KEY <- ""
# Slack -----------------------------------------------
# slackr_setup(api_token = SLACK_API_KEY,
#              channel = "#scripts",
#              username = "[R] Crawl-Abgleich",
#              echo = FALSE)
# Get-files -------------------------------------------
## Find all CSVs
files <- list.files(PATH_TO_SF_EXPORTS,
                    full.names = TRUE,
                    recursive = TRUE,
                    pattern = "\\.csv")
## Split paths
df_file_paths <- tibble(path = files) %>%
  splitstackshape::cSplit(splitCols = "path",
                          drop = FALSE,
                          sep = "/",
                          direction = "wide",
                          type.convert = FALSE) %>%
  as_tibble()
## Extract website, datetime
select_cols <- c(1, ncol(df_file_paths) - 2, ncol(df_file_paths) - 1)
df_file_paths <- df_file_paths[ , select_cols]
names(df_file_paths) <- c("path", "website", "datetime")
## Convert to date
df_file_paths <- df_file_paths %>%
  mutate(datetime = ymd_hms(datetime),
         date = date(datetime))
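## NOTE: ymd_hms() only warns (it does not error) when a folder name does not
## match the expected timestamp format and returns NA instead, which then
## propagates silently through the date filters below.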
## Check if two crawls exist
df_file_paths <- df_file_paths %>%
  group_by(website) %>%
  filter(n() > 1) %>%
  ungroup()
## Get last two crawls
df_file_paths <- df_file_paths %>%
  group_by(website) %>%
  mutate(date_rank = rank(desc(date))) %>%
  filter(date_rank %in% c(1, 2)) %>%
  ungroup()
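## NOTE (possible silent failure): rank() averages ties, so if the two most
## recent crawls share a date both get rank 1.5 and the %in% c(1, 2) filter
## drops the whole website without any error. dplyr::min_rank() or
## row_number() would avoid this.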
## Get websites for iteration
websites <- unique(df_file_paths$website)
## Filter for testing
if (exists("TESTING_WEBSITE")) websites <- TESTING_WEBSITE
# Compare crawls --------------------------------------
for (WEBSITE in websites) {
  try_res <- tryCatch({
    df_website_file_paths <- df_file_paths %>%
      filter(website == WEBSITE)
    CRAWL_NEW_DATE <- max(df_website_file_paths$date)
    CRAWL_OLD_DATE <- min(df_website_file_paths$date)
    message("\n\n[", WEBSITE, "] ", strrep("*", 40), "\n\n~ Read crawls\n")
    crawl_new <- read_crawl(df_website_file_paths[df_website_file_paths$date == CRAWL_NEW_DATE, "path"][[1, 1]]) %>%
      mutate(date = CRAWL_NEW_DATE)
    crawl_old <- read_crawl(df_website_file_paths[df_website_file_paths$date == CRAWL_OLD_DATE, "path"][[1, 1]]) %>%
      mutate(date = CRAWL_OLD_DATE)
    ## Create website-dir if not exists
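    ## NOTE: with showWarnings = FALSE a failed dir.create() (e.g. because the
    ## parent folder does not exist) stays completely silent; recursive = TRUE
    ## would also create missing parent directories.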
    dir.create(file.path(PATH_TO_XLSX_EXPORTS, WEBSITE),
               showWarnings = FALSE)
    ## Get new (linked) URLs ---------------------------
    message("~ Get new (linked) URLs\n")
    new_urls <- crawl_new %>%
      filter(str_detect(content, "html") | status == "Connection Timeout") %>%
      anti_join(crawl_old %>%
                  filter(str_detect(content, "html") | status == "Connection Timeout"),
                by = "address") %>%
      mutate(is_canonical = (address == canonical_link_element_1 | canonical_link_element_1 == "")) %>%
      select(date,
             address,
             status_code,
             status,
             indexability_status,
             meta_robots_1,
             canonical_link_element_1,
             title_1,
             meta_description_1,
             is_canonical) %>%
      mutate(date = as.character(date)) %>%
      as.data.frame() # convert to data.frame as xlsx:: cannot handle a tibble
    ## Get no-longer-linked / deleted URLs -------------
    message("~ Get no-longer-linked / deleted URLs\n")
    no_longer_linked_urls <- crawl_old %>%
      filter(str_detect(content, "html") | status == "Connection Timeout") %>%
      anti_join(crawl_new %>%
                  filter(str_detect(content, "html") | status == "Connection Timeout"),
                by = "address")
    ## get status code of no-longer-linked / deleted URLs
    message("~ Get status code of no-longer-linked / deleted URLs\n")
    urls <- no_longer_linked_urls$address
    if (length(urls) > 0) {
      r <- fetch_multi_urls(urls)
      r_df <- tibble(
        address = unlist(lapply(r, `[[`, "url")),
        status_code_current = unlist(lapply(r, `[[`, "status_code"))
      )
      no_longer_linked_urls <- no_longer_linked_urls %>%
        left_join(r_df, by = "address") %>%
        select(date,
               address,
               status_code,
               status,
               indexability_status,
               status_code_current,
               meta_robots_1,
               canonical_link_element_1,
               title_1,
               meta_description_1) %>%
        rename(status_code_old = status_code) %>%
        mutate(date = as.character(date)) %>%
        as.data.frame()
    }
    ## Get identical URLs ------------------------------
    message("~ Get identical URLs\n")
    identical_urls <- crawl_new %>%
      filter(str_detect(content, "html")) %>%
      inner_join(crawl_old %>%
                   filter(str_detect(content, "html")),
                 by = "address",
                 suffix = c("_new", "_old")) %>%
      mutate(change_status_code = status_code_new != status_code_old,
             change_canonical = canonical_link_element_1_new != canonical_link_element_1_old,
             change_index = meta_robots_1_new != meta_robots_1_old,
             change_title = title_1_new != title_1_old,
             change_description = meta_description_1_new != meta_description_1_old,
             change_content_type = content_new != content_old,
             change_crawl_depth = crawl_depth_new != crawl_depth_old,
             change_h1 = h1_1_new != h1_1_old,
             change_word_count = word_count_new != word_count_old,
             change_indexability = indexability_new != indexability_old,
             change_indexability_status = indexability_status_old != indexability_status_new,
             change_orphan_pages = (is.na(crawl_depth_old) & !is.na(crawl_depth_new)) | (!is.na(crawl_depth_old) & is.na(crawl_depth_new)))
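    ## NOTE: `!=` returns NA when either side is NA, and the later
    ## filter(change_x == TRUE) calls silently drop those NA rows, so a field
    ## that changes from NA to a value (or back) never shows up as a change.
    ## Comparing with coalesce(x, "") != coalesce(y, "") would be NA-safe
    ## for the character columns.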
    ## Get identical URLs with changes -----------------
    message("~ Get identical URLs with changes\n")
    ## orphan pages
    message("\tOrphan pages")
    change_orphan_pages <- identical_urls %>%
      filter(change_orphan_pages == TRUE) %>%
      mutate(change = case_when(!is.na(crawl_depth_old) & is.na(crawl_depth_new) ~ "neu verwaist",
                                is.na(crawl_depth_old) & !is.na(crawl_depth_new) ~ "nicht mehr verwaist")) %>%
      select(change,
             address,
             status_code_old,
             status_code_new,
             status_old,
             status_new,
             title_1_old,
             title_1_new) %>%
      as.data.frame()
    ## status code
    message("\tStatus code")
    change_status_code <- identical_urls %>%
      filter(change_status_code == TRUE) %>%
      select(address,
             status_code_new,
             status_code_old,
             status_new,
             status_old,
             indexability_status_old,
             indexability_status_new) %>%
      as.data.frame()
    ## canonical
    message("\tCanonical")
    change_canonical <- identical_urls %>%
      filter(change_canonical == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>% # drop URLs whose status code changed
      select(address,
             canonical_link_element_1_new,
             canonical_link_element_1_old) %>%
      mutate(new_is_canonical = (address == canonical_link_element_1_new | canonical_link_element_1_new == ""),
             old_was_canonical = (address == canonical_link_element_1_old | canonical_link_element_1_old == "")) %>%
      as.data.frame()
    ## meta robots
    message("\tMeta robots")
    change_index <- identical_urls %>%
      filter(change_index == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             meta_robots_1_new,
             meta_robots_1_old) %>%
      as.data.frame()
    ## title
    message("\tTitle")
    change_title <- identical_urls %>%
      filter(change_title == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             title_1_new,
             title_1_old) %>%
      as.data.frame()
    ## description
    message("\tDescription")
    change_description <- identical_urls %>%
      filter(change_description == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             meta_description_1_new,
             meta_description_1_old) %>%
      as.data.frame()
    ## content type
    message("\tContent type")
    change_content_type <- identical_urls %>%
      filter(change_content_type == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             content_new,
             content_old) %>%
      as.data.frame()
    ## crawl depth
    message("\tCrawl depth")
    change_crawl_depth <- identical_urls %>%
      filter(change_crawl_depth == TRUE | is.na(change_crawl_depth)) %>%
      mutate_at(c("crawl_depth_new", "crawl_depth_old"), as.numeric) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             crawl_depth_new,
             crawl_depth_old) %>%
      mutate(crawl_depth_change_absolut = crawl_depth_new - crawl_depth_old) %>%
      arrange(desc(crawl_depth_change_absolut)) %>%
      as.data.frame()
    ## h1
    message("\tH1")
    change_h1 <- identical_urls %>%
      filter(change_h1 == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             h1_1_new,
             h1_1_old) %>%
      as.data.frame()
    ## word count
    message("\tWord count")
    change_word_count <- identical_urls %>%
      filter(change_word_count == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             word_count_new,
             word_count_old) %>%
      mutate(word_count_change_absolut = word_count_new - word_count_old,
             word_count_change_perc = (word_count_new / word_count_old - 1) * 100) %>%
      arrange(desc(word_count_change_absolut)) %>%
      as.data.frame()
    ## indexability
    message("\tIndexability")
    change_indexability <- identical_urls %>%
      filter(change_indexability == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             indexability_new,
             indexability_old) %>%
      as.data.frame()
    ## indexability status
    message("\tIndexability status\n")
    change_indexability_status <- identical_urls %>%
      filter(change_indexability_status == TRUE) %>%
      .[!.$address %in% change_status_code$address, ] %>%
      select(address,
             indexability_status_new,
             indexability_status_old) %>%
      as.data.frame()
    ## Get biggest resources ---------------------------
    message("~ Get biggest resources\n")
    ## images
    message("\tImages")
    top_image <- crawl_new %>%
      filter(str_detect(content, "image")) %>%
      select(address, size_bytes) %>%
      arrange(desc(size_bytes)) %>%
      as.data.frame()
    ## JavaScript
    message("\tJavaScript")
    top_js <- crawl_new %>%
      filter(str_detect(content, "javascript")) %>%
      select(address, size_bytes) %>%
      arrange(desc(size_bytes)) %>%
      as.data.frame()
    ## CSS
    message("\tCSS\n")
    top_css <- crawl_new %>%
      filter(str_detect(content, "css")) %>%
      select(address, size_bytes) %>%
      arrange(desc(size_bytes)) %>%
      as.data.frame()
    ## Create Excel ------------------------------------
    message("~ Write Excel for: [ ", WEBSITE, " ] ", strrep("*", 40), "\n")
    ## set order of sheets here
    sheet_order <- c("new_urls",
                     "no_longer_linked_urls",
                     "change_orphan_pages",
                     "change_status_code",
                     "change_index",
                     "change_canonical",
                     # "change_indexability",
                     # "change_indexability_status",
                     "change_title",
                     "change_description",
                     "change_h1",
                     "change_word_count",
                     "change_content_type",
                     "change_crawl_depth",
                     "top_image",
                     "top_js",
                     "top_css")
    wb <- createWorkbook()
    for (sheet in sheet_order) {
      ## get() looks the data frame up by its name; only non-empty ones get a sheet
      if (nrow(get(sheet)) > 0) {
        add_sheets(wb, sheet)
      }
    }
    saveWorkbook(wb, file.path(PATH_TO_XLSX_EXPORTS,
                               WEBSITE,
                               paste0(CRAWL_NEW_DATE,
                                      "__",
                                      CRAWL_OLD_DATE,
                                      "__",
                                      WEBSITE,
                                      ".xlsx")))
  }, error = function(e) {
    print(e)
    ## send Slack notification if an error occurred
    # fail_msg <- paste0(":siren: Script failed! Website: *",
    #                    WEBSITE,
    #                    "*\n\nError Message:")
    # text_slackr(fail_msg,
    #             preformatted = FALSE)
    # err_msg <- as.character(e)
    # text_slackr(err_msg,
    #             preformatted = TRUE)
    return(e)
  })
  ## send Slack notification if script was successful
  if (!inherits(try_res, "error")) {
    # success_msg <- paste0(":aw_yeah: Script successful. Website: *",
    #                       WEBSITE,
    #                       "*")
    # text_slackr(success_msg,
    #             preformatted = FALSE)
  }
}
Created on 2020-11-23 by the reprex package (v0.3.0)
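In case it helps: the script depends on three helpers defined in FUNS.R — read_crawl(), fetch_multi_urls() and add_sheets(). The real definitions are in the Drive folder above; below is only a rough, simplified sketch of what they do (my stand-in bodies, not the actual code), so the main script can be read without opening FUNS.R:

## Rough stand-ins for the FUNS.R helpers (simplified; not the real code)
read_crawl <- function(path) {
  ## read a Screaming Frog CSV export and normalise column names to snake_case
  readr::read_csv(path) %>%
    dplyr::rename_with(~ gsub("_$", "", tolower(gsub("[^A-Za-z0-9]+", "_", .x))))
}

fetch_multi_urls <- function(urls) {
  ## request every URL once and record its current status code
  lapply(urls, function(u) {
    resp <- tryCatch(httr::HEAD(u), error = function(e) NULL)
    list(url = u,
         status_code = if (is.null(resp)) NA_integer_ else httr::status_code(resp))
  })
}

add_sheets <- function(wb, sheet) {
  ## add one worksheet per data frame, looked up by its name in the caller
  sh <- xlsx::createSheet(wb, sheetName = sheet)
  xlsx::addDataFrame(get(sheet, envir = parent.frame()), sh, row.names = FALSE)
}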