I am trying to study wildfire impacts on air quality. I downloaded the EPA's AQI data (https://aqs.epa.gov/aqsweb/airdata/download_files.html#AQI) and I need help with getting the averages.
Table looks like this. I made a reprex with some examples (thanks to andresrcs). The actual dataset has almost a million pieces of data in there.
data.frame(AQI = c(67L, 84L, 61L, 49L, 58L, 67L, 49L, 44L, 43L,
44L, 44L, 42L, 37L, 26L, 35L, 43L, 42L, 39L, 40L, 39L), State.Name = as.factor(c("Alabama",
"Alabama", "Alaska", "Alaska", "Washington", "Washington",
"Oregon", "Oregon", "Alabama", "Alabama", "Wyoming", "Wyoming",
"Wyoming", "Wyoming", "Wyoming", "Utah", "Washington", "Wyoming",
"Idaho", "Wyoming")), county.Name = as.factor(c("Autauga",
"Fayette", "Ada", "Fayette", "Clark", "Washington", "Clark",
"Cowlitz", "Clark", "Autauga", "Weston", "Fayette", "Weston",
"Cowlitz", "Weston", "Weston", "Weston", "Weston", "Clark",
"Weston")), Date = as.factor(c("1980-06-05", "1985-09-06",
"1989-04-07", "2007-11-08", "1980-10-09", "1990-11-10", "1980-10-11",
"2000-04-12", "1980-09-13", "1980-04-14", "2016-12-22", "2016-12-23",
"2016-12-24", "2012-10-25", "2013-12-26", "2010-12-27", "2016-12-28",
"2016-10-29", "2016-09-30", "2016-12-31")))
#> AQI State.Name county.Name Date
#> 1 67 Alabama Autauga 1980-06-05
#> 2 84 Alabama Fayette 1985-09-06
#> 3 61 Alaska Ada 1989-04-07
#> 4 49 Alaska Fayette 2007-11-08
#> 5 58 Washington Clark 1980-10-09
#> 6 67 Washington Washington 1990-11-10
#> 7 49 Oregon Clark 1980-10-11
#> 8 44 Oregon Cowlitz 2000-04-12
#> 9 43 Alabama Clark 1980-09-13
#> 10 44 Alabama Autauga 1980-04-14
#> 11 44 Wyoming Weston 2016-12-22
#> 12 42 Wyoming Fayette 2016-12-23
#> 13 37 Wyoming Weston 2016-12-24
#> 14 26 Wyoming Cowlitz 2012-10-25
#> 15 35 Wyoming Weston 2013-12-26
#> 16 43 Utah Weston 2010-12-27
#> 17 42 Washington Weston 2016-12-28
#> 18 39 Wyoming Weston 2016-10-29
#> 19 40 Idaho Clark 2016-09-30
#> 20 39 Wyoming Weston 2016-12-31
I need the average AQI for September-November per county per state (different states share same county names)
I need to know how many days or years of data there are for each county
Then I'll need to learn how to figure out how to find the difference between that info and a separate dataset for 2017 with the same type of data
My goal is to do this with 2017 as well and map the difference between the two data sets per county to see if there's any trends during the Eagle Creek Fire (occurred Sept-Nov 2017) and I know there's going to be a large potential error or other factors. I'm told I can do this in R but also no idea how.
I tried to do loops but I'm a bit confused on how to do it properly. I'm told I could use tidyverse and aggregate but also not sure if that would actually work for this.
So far, I was able to combine 1980-2016 files into one .csv file. I have been having issues with getting the loops to work. This is what I have so far:
#1980-2016 Air Quality Index Data
install.packages("dplyr")
library(dplyr)
#Combinate datasets for Air Quality Index Data for 1980-2016
AQIData <- rbind(AQI1980,AQI1981,AQI1982,AQI1983,AQI1984,AQI1985,AQI1986,AQI1987,AQI1988,AQI1989,AQI1990,AQI1991,AQI1992,AQI1993,AQI1994,AQI1995,AQI1996,AQI1997,AQI1998,AQI1999,AQI2000,AQI2001,AQI2002,AQI2003,AQI2004,AQI2005,AQI2006,AQI2007,AQI2008,AQI2009,AQI2010,AQI2011,AQI2012,AQI2013,AQI2014,AQI2015,AQI2016)
write.csv(AQIData, file = "AQIData.csv", row.names = FALSE)
#checkdata
str(AQIData)
#check how many counties there are
length(unique(AQIData$County.Code))
#county code is inconsistent, 200 codes, 3007 counties in US, over 1000 counties in dataset
#use county name/state
levels(AQIData$State.Name)
#remove canada, mexio, guam, virgin islands, puerto rico,
#to get rid of states: tell R to remove rows where state is state.name
#which command, in a dataset, which are equal to ... which(v==1), will give location in vector equal to 1, finds row names
#Leave main one untouched
AQIData51 <- AQIData
#Mexico deleted observation, coded it weirdly. find it StateName <- levels(AQIData$State.Name); then length(which(AQIData$State.Name==StateName[9]))
StateName <- levels(AQIData$State.Name)
StateName
length(which(AQIData$State.Name==StateName[9]))
#Remove Country of Mexico, Canada, Virgin Islands, Puerto Rico, and Guam
AQIData51 <- AQIData51[-which(AQIData51$State.Name==StateName[9]),]
AQIData51 <- AQIData51[-which(AQIData51$State.Name=="Canada"),]
AQIData51 <- AQIData51[-which(AQIData51$State.Name=="Virgin Islands"),]
AQIData51 <- AQIData51[-which(AQIData51$State.Name=="Puerto Rico"),]
AQIData51 <- AQIData51[-which(AQIData51$State.Name=="Guam"),]
#factors... empty spots, instead of gone. tell R to get a different set of dividers
AQIData51 <- droplevels(AQIData51)
#always check structure to verify it's all good, such as if NA appears
str(AQIData51)
#51 states instead of 56
StateName2 <- levels(AQIData51$State.Name)
StateName2
#ask R to relate state to county to find AQI
#loops!
#how to map it? Don't have Lat/Long in the same dataset. test it., example: test <- merge(AQIData51, LatLong, By County)
#extract it? merging has to have identical names,
#ex. merge(both, seeds,by.x=c("genus", "species"), by.y-c("name1,"name2") ........... ?merge
#assuming no spelling errors
AQIData51$date <- as.Date(AQIData51$Date,format="%Y/%m/%d")
#check if it worked
str(AQIData51)
#contains NAs, didn't work
AQIData51$date <- as.Date(AQIData$Date,format="%Y/%m/%d")
#dates messed up
csvdat <- read.csv("C:/Users/keson/Desktop/Climate Change Lab/AQI data by year and county/AQI1980.csv")
str(csvdat)
#dates still different
#strptime function? didnt work
#change dates to character
#%d is day of the month as 0-31, but we have 1-31, day of month is 0-11 (?)
#anytime package?
#lubridate?
#try and split, date column is character, possible to split into day, month, year, columns
#only need months
#grep
test1 <- as.Date(AQIData51$Date, formate="%m/%d/%Y")
head(test1)
#error na na na, reader is...
test <- as.Date(csvdat$Date, formate="%m/%d/%Y")
head(test)
#formate is format.date
test <- as.Date(csvdat$Date,format.Date="%m/%d/%Y")
head(test)
#works for Y-m-d
test <- as.Date(AQIData51$Date,format.Date="%m/%d/%Y")
head(test)
#works
#sub data for states and county names based on certain months
state <- unique(AQIData51$State.Name)
#number of states
nS <- 51
#year variable
#November
#Took about 2 hours + to run
#for loop, do something for the number of times I tell you, indexes across that
meanmonth <- numeric()
#array of start and end dates to contain how many years of data for each county
#state name, county name, start date of record, end date of record
#tmp.m3 contains all info, subset
st.end.dates = list()
k=1
for(i in 1:nS){
tmp <- subset(AQIData51, State.Name==state[i])
county <- unique(tmp$county.Name)
nC <- length(county)
CM <- numeric()
for(t in 1:nC){
# x.tmp = which(tmp$county.Name==county[t])
tmp.c <- subset(tmp,county.Name==county[t])
tmp.m3 <-subset(tmp.c,format.Date(Date,'%m')%in%c('11'))
#tmp.m3 want to extract date variable .. get year .. from year, find unique years
#print every step to make sure its doing it right, add to st.end.dates. unique function
size.tmp3 = dim(tmp.m3)[1]
st.end.dates[[k]] <- cbind(tmp.m3[1,c(1,2,5)],tmp.m3[size.tmp3,5])
k=k+1
print(size.tmp3)
cm <- mean(tmp.m3[,"AQI"])
CM <- c(CM,cm)
print(CM)
}
names(CM) <- rep(state[i],nC)
meanmonth <- c(meanmonth,CM)
}
I don't know how to finish the start-end date part of the loop. Can anyone please help with this? I am new to R and this has been the bane of my existence all semester.