I have this dataset:
https://www.mediafire.com/file/o61f021p31j7abp/data.xlsx/file
I'm using this tutorial
https://towardsdatascience.com/auto-tagging-stack-overflow-questions-5426af692904
First I plotted the weekly frequency using this code and works fine
library(tidyverse)
library(lubridate)
library(readxl)
library(tidytext)
library(stringr)
library(tidyr)
library(dplyr)
library(scales)
stories <- read_xlsx("C:/User/data.xlsx")%>%
mutate(time = as.POSIXct(time, origin = "1970-01-01"),
week = round_date(time, "week"))
stories %>%
count(Week = round_date(time, "week")) %>%
ggplot(aes(Week, n)) +
scale_x_datetime(breaks = date_breaks("1 months"),labels = date_format("%Y-%m"))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
geom_line() +
ggtitle('The number of Titles posted per Week')
Now I'm trying to Compare the growth or shrinking of particular tags over time as in the tutorial:
title_words <- stories %>%
distinct(titles, .keep_all = TRUE) %>%
unnest_tokens(word, titles, drop = FALSE) %>%
distinct(ID, word, .keep_all = TRUE) %>%
anti_join(stop_words, by = "word") %>%
filter(str_detect(word, "[^\\d]")) %>%
group_by(word) %>%
mutate(word_total = n()) %>%
ungroup()
title_words
word_counts <- title_words %>%
count(word, sort = TRUE)
word_counts
tags <- c("coronavirus", "china")
q_per_year <- stories %>%
count(Week = week(time)) %>%
rename(WeekTotal = n)
head(q_per_year)
tags_per_year <- word_counts %>%
filter(word %in% tags)%>%
inner_join(stories)
count(Week = week(time), word)
inner_join(q_per_year)
ggplot(tags_per_year, aes(Week, n / WeekTotal, color = word)) +
geom_line() +
scale_y_continuous(labels = scales::percent_format()) +
ylab("% of Stack Overflow questions with this tag") +
ggtitle('Growth or Shrinking of Particular Tags Overtime')
But I get the error in the inner join
Error: `by` must be supplied when `x` and `y` have no common variables.
i use by = character()` to perform a cross-join.
I can't find out how to fix the error..