Thank you so much for this help, great advice.
My next query would be what is the best way and if i should remove that extreme outlier for June and re-run the test.
#Made a dataframe from the data and converted months and years into factors so you can group using these
dfsights <- data.frame(sights) %>%
convert_as_factor(Month, Year)
dfsights$Month <- factor(dfsights$Month, levels= c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
?levels
?factor
#Named the different variables for ease of coding
names(dfsights) <- c("Year", "Month", "Sightings")
dfsights
#Boxplot representing the raw data
bxp <- ggplot(data = dfsights, mapping = aes(x = Month, y = Sightings)) +
geom_boxplot(outlier.shape=NA)
bxp
#group by month
by_month <- dfsights %>%
group_by(Month)
view(by_month)
names(by_month) <- c("Year", "Month", "Sightings")
#one way anova showing that there is strong evidence for a difference in sightings between the months
anova_sights <- anova_test(data = by_month, dv= Sightings , wid = Year, within = Month)
get_anova_table(anova_sights)
#pariwise t-test to test difference in sightings between individual months- I think this compares
#all months to april atm but may change if january becomes the first month- check 'pwc' output
by_month <- group_by(dfsights, Month)
by_month
pwc <- by_month %>%
pairwise_t_test(Sightings ~ Month, paired = TRUE,
p.adjust.method = "bonferroni")
pwc
pwc <- pwc %>% add_xy_position(x = "Month")
#creating a boxplot graph that shows anova and pairwise t test values
bxp +
labs(subtitle = get_test_label(anova_sights, detailed = TRUE),
caption = get_pwc_label(pwc))
#attempting to include p values between months on graph but doesn't seem to be working...
library(ggpubr)
bxp +
labs(subtitle = get_test_label(anova_sights, detailed = TRUE),
caption = get_pwc_label(pwc)) +
geom_hline(yintercept = mean(dfsights$Sightings), linetype = 2) +
stat_compare_means(method ="anova", label.y = 700) +
stat_compare_means(label = "p.signif", method = "t.test",
ref.group = ".all.", label.y = 750) +
theme_classic()
This is where I have gotten so far and just wondering why i have two different p-values: is one for the anova, and another for the t-test etc and what do these mean. I am sorry for these basic questions I am very new to this analysis.
