Process a database of air quality stations considering the period 2000-2017 for each station

Hello! I am developing a script to process ozone data and other meteorological variables from a database of 62 stations of measurement. The period studied is 2000-2017. The database is structured in 62 folders and each folder has 18 more folders (the 18 years for the period). Inside of these folders there are 12 DAT archives (one for each month). However, some stations do not present information for this whole period (years and months are missing). And this is the problem: I need to get a data frame for each station considering the 2000-2017 period. Those years not presented in the folders must be filled with NA in the data frame.

In order to solve the problem, I've tried to manually change the number of years available in those stations that not have all the data. But I get a dataframe that only considers the years available in the folders and not the 2000-2017 with NA in those years missing.

Thanks in advance!!
This is the script:

## This script allows the user to obtain clean data related to ozone for its later statistic analysis
rm(list=ls())

# Define both paths and set as working directory to files pane location
path.in <- "C:/Users/Oriol/OneDrive/Escritorio/Dades_corregides/"
path.in.metadata <- "C:/Users/Oriol/OneDrive/Escritorio/"
path.out<-"C:/Users/Oriol/Documents/"

# Read metadata
metadata <- read.csv(paste0(path.in.metadata, "XVPCA_modal_metadata.csv"))
vector.station.code <- "ES0584A"

# Define both station codes
for (f in 1:length(vector.station.code)){
  
 station.code<- vector.station.code[f]
 station.code2<- as.character(metadata$COD_DTES[match(station.code, metadata$COD_EUROPEU)])
 station.area<- as.character(metadata$ID_TIPUS_AREA[match(station.code, metadata$COD_EUROPEU)])
 station.type<- as.character(metadata$ID_TIPUS_ESTACIO[match(station.code, metadata$COD_EUROPEU)])
 station.aqz<- as.character(metadata$AQZ[match(station.code, metadata$COD_EUROPEU)])
 station.lat<- as.character(metadata$LATITUT[match(station.code, metadata$COD_EUROPEU)])
 station.lon<- as.character(metadata$LONGITUT[match(station.code, metadata$COD_EUROPEU)])
 
 # Define all the dates, from the beginning to the end. We are considering the 2000 - 2017 period and every month of the year
 nyears = c("00", "01","02","03","04","05","06","07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17")
 nmonth = c("01","02","03","04","05","06","07","08","09","10","11","12")
 date.ini<-20000101
 date.fin<-20180101


 # The function as.POSIXct converts the dates into the proper format (year, month and day) for its manipulation with R. The dates are found in Greenwich Mean Time (GMT)
 # The function strptime indicates in which format are the dates that will be used
 ff_ini = as.POSIXct(strptime(date.ini, format = "%Y%m%d"), tz = "GMT")
 ff_fin = as.POSIXct(strptime(date.fin, format = "%Y%m%d"), tz = "GMT")
 days = (as.integer(ff_fin - ff_ini))

 # We work with semihourly data. Therefore, for each day information will be received 48 times per day
 date.vec = as.character(seq(as.POSIXct(ff_ini, tz = "Europe/Madrid"), by = "30 mins", length = days*48))

 # Data is in local time
 a <- strptime(date.vec,format = "%Y-%m-%d %H:%M:%S", tz = "Europe/Madrid")
 b <- as.POSIXct(a, tz = "Europe/Madrid")
 date <- b

 # Define ozone matrix. For the moment, this matrix is filled with nule data (NA) related to ozone and the meteorological variables
 df.o3<-data.frame(date = date, o3 = NA, wd = NA, ws = NA, temp = NA, rh = NA, pressure = NA, rain = NA, solar = NA)

 h<-1 # Counter for the number of hours

  for (yy in 1:length(nyears)){ # Loop for years

    for (mm in 1:length(nmonth)){ # Loop for months
       year<- nyears[yy]
       month<-nmonth[mm]
      # Read the file, which is in comma separated values (csv) format
      # NEW WAY OF READING FILES TO AVOID THE POBLEM WITH .dat or .Dat
      a<-list.files(path=paste0(path.in,station.code,"/",year,"/"))
      filename <- paste0(path.in,station.code,"/",year,"/",a[which(paste0(station.code2,month,year) == substr(a,1,6))]) 
      print(filename)
      data <- read.csv(filename)

      # Number of parameters
      nparam<-as.numeric(substr(colnames(data),2,3))

      # Change the name of the matrix data for "mesures"
      # The matrix data has 589 rows and one column
      # The function gsub is able to separate the V state of validation from the nule data
      colnames(data)[1]<-"mesures"
      data$mesures <- gsub("V9999", "V 999", data$mesures)

      #Define the number of counts for each row
      cont<-0
      for (i in 1:round(dim(data)[1]/nparam)){ # Loop for days
        for (j in 1:nparam){ # Loop for parameters
          day<-as.numeric (substr(data[cont+j,],1,2))
          param<-as.numeric (substr(data[cont+j,],3,4))
          conc<-as.numeric()
          conc<-strsplit(data[cont+j,], "\\s+") [[1]][2:49]
          # print(paste0("***CHECKS*** station.code: ", station.code, " year: ", year, " month: ",
          #             month, " day: ", i, " Param: ",j, " hours: ", h, " - ",(h+48-1), " line: ", (cont+j)
          #             ))
          for (k in 1:48){
              if (grepl("V", conc[k]) == TRUE){
               conc[k] <- sub("V", "", conc[k])
              }
              if (grepl("N", conc[k]) == TRUE){
               conc[k] <- sub("N", "NA", conc[k])
              }
              if (grepl("T", conc[k]) == TRUE){
               conc[k] <- sub("T", "NA", conc[k])
              }
              if (grepl("P", conc[k]) == TRUE){
               conc[k] <- sub("P", "", conc[k])
              }
              if (grepl("9999", conc[k]) == TRUE){
               conc[k] <- sub("9999", "NA", conc[k])
              }
              if (grepl("999", conc[k]) == TRUE){
               conc[k] <- sub("999", "NA", conc[k])
              }
          }  
          conc <- as.numeric(conc)
          # For ozone, param == 5
          if (param == 5) {
            print(paste0("Saving data for ozone at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$o3 <- conc
          }
          # For wind direction, param == 30
          if (param == 30) {
            print(paste0("Saving data for wind direction at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$wd <- conc
          }
          # For wind speed, param == 31
          if (param == 31) {
            print(paste0("Saving data for wind speed at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$ws <- conc
          }
          # For temperature, param == 32
          if (param == 32) {
            print(paste0("Saving data for temperature at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$temp <- conc
          }
          # For relative humidity, param == 33
          if (param == 33) {
            print(paste0("Saving data for relative humidity at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$rh <- conc
          }
          # For atmospheric pressure, param == 34
          if (param == 34) {
            print(paste0("Saving data for atmospheric pressure at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$pressure <- conc
          }
          # For rainfall, param == 35
          if (param == 35) {
            print(paste0("Saving data for rainfall at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$rain <- conc
          }
          # For solar radiation, param == 36
          if (param == 36) {
            print(paste0("Saving data for solar radiation at for the hour: ", h, "-", (h+48-1), "....."))
            df.o3[h: (h+48-1),]$solar <- conc
          }
  
        } # End loop for parameters
        print(paste0(cont, " ", cont + nparam))
        cont <- cont + nparam
        h = h + 48
  
      } # End loop for days

    } # End loop for months

  } # End loop for years
 
 # Convert date from local time to UTC
 df.o3$date <- as.POSIXct(format(df.o3$date, tz = "UTC"),tz= "UTC")
 
 # Convert semihourly to hourly data
 df.o3 <- openair::timeAverage(df.o3, avg.time = "hour")
 
 df.o3$o3[df.o3$o3=="NaN"] <- NA
 df.o3$wd[df.o3$wd=="NaN"] <- NA
 df.o3$ws[df.o3$ws=="NaN"] <- NA
 df.o3$temp[df.o3$temp=="NaN"] <- NA
 df.o3$rh[df.o3$rh=="NaN"] <- NA
 df.o3$pressure[df.o3$pressure=="NaN"] <- NA
 df.o3$rain[df.o3$rain=="NaN"] <- NA
 df.o3$solar[df.o3$solar=="NaN"] <- NA
 df.o3$code <- station.code
 df.o3$code2 <- station.code2
 df.o3$area <- station.area
 df.o3$type <- station.type
 df.o3$aqz <- station.aqz
 df.o3$lat <- station.lat
 df.o3$lon <- station.lon
 
 
 fileout <- print(paste0(station.code, ".RData"))
 save(df.o3, file = fileout)
 
} # End loop for stations

Hi forlornfortress,

It looks like the padr package is designed to address this problem of missing observations in time series. Check it out here

https://cran.r-project.org/web/packages/padr/vignettes/padr.html

Let us know if this helps.

1 Like

Thanks for the reply! The package padr may be useful, but I don't know which function is the best option to solve my problem...

It looks like pad()
addresses the missing observations

pad

The second workhorse of padr is pad . It does date padding:

account <- data.frame(day     = as.Date(c('2016-10-21', '2016-10-23', '2016-10-26')),
                      balance = c(304.46, 414.76, 378.98))
account %>% pad()
## pad applied on the interval: day
##          day balance
## 1 2016-10-21  304.46
## 2 2016-10-22      NA
## 3 2016-10-23  414.76
## 4 2016-10-24      NA
## 5 2016-10-25      NA
## 6 2016-10-26  378.98

The account dataframe has three observations on different days. Like thicken , the pad function figures out what the datetime variable in the data frame is, and then assesses its interval. Next it notices that within the interval, day in this case, rows are lacking between the first and last observation. It inserts a row in the data frame for every time point that is lacking from the data set. All non-datetime values will get missing values at the padded rows.

It is up to the user what to do with the missing records. In the case of the balance of an account we want to carry the last observation forward. It needs tidyr::fill to arrive at the tidy data set.

account %>% pad() %>% tidyr::fill(balance)
## pad applied on the interval: day
##          day balance
## 1 2016-10-21  304.46
## 2 2016-10-22  304.46
## 3 2016-10-23  414.76
## 4 2016-10-24  414.76
## 5 2016-10-25  414.76
## 6 2016-10-26  378.98

Also pad allows for deviations from its default behavior. By default it pads all observations between the first and the last observation, but you can use start_val and end_val to deviate from this. You can also specify a lower interval than the one of the variable, using pad as the inverse of thicken .

account %>% pad('hour', start_val = as.POSIXct('2016-10-20 22:00:00')) %>% head()
##                   day balance
## 1 2016-10-20 22:00:00      NA
## 2 2016-10-20 23:00:00      NA
## 3 2016-10-21 00:00:00  304.46
## 4 2016-10-21 01:00:00      NA
## 5 2016-10-21 02:00:00      NA
## 6 2016-10-21 03:00:00      NA
1 Like