Data processing

Hello.
I have a data processing question:

-Data are water quality samples collected from many different locations and many different dates

  • Data are in wide format and I want to get them into long format
  • Each parameter (column) has a qualifier code ("_q")

I need to adjust the data based on the code:
- If "U" , take 1/2 the value
- If "V" or "K", remove the row of data

I could do this for each paramater using my script below, but I am thinking there has to be a more efficient method?

Thank you.

Some of the data:

test <-tibble::tribble(
         ~stream, ~site,  ~date,   ~DO, ~DO_q, ~Cond, ~Cond_q,  ~Sal, ~Sal_q, ~TKN, ~TKN_q,  ~NH3, ~NH3_q, ~NOX, ~NOX_q,
             10L,    2L, 37635L,  9.34,    NA, 492.4,      NA,  0.25,     NA,  0.8,     NA,  0.05,     NA, 0.66,     NA,
             19L,   10L, 37637L,  6.06,    NA, 723.2,      NA,  0.37,     NA, 0.93,     NA,  0.12,     NA, 0.34,     NA,
             19L,    2L, 37637L,  8.93,    NA, 16450,      NA,  9.62,     NA, 0.87,    "U", 0.005,    "U", 0.01,    "U",
             19L,    9L, 37637L,  9.22,  "VU", 477.4,      NA,  0.24,     NA, 0.61,     NA, 0.005,   "UI", 0.32,    "U",
             19L,    5L, 37637L,   7.7,    NA, 546.3,      NA,  0.28,     NA, 0.56,     NA,  0.05,     NA, 1.39,     NA,
             19L,    8L, 37637L,  8.86,  "VU",   525,     "V",  0.27,     NA, 0.81,     NA,  0.04,    "I", 1.49,    "U",
             19L,    7L, 37637L,  9.82,    NA, 479.2,      NA,  0.24,     NA, 0.56,   "VU",  0.01,    "I", 0.41,     NA,
             27L,    9L, 37637L,  9.12,    NA, 427.4,      NA,  0.21,    "U", 0.95,     NA,   0.1,     NA, 0.27,     NA,
             27L,    8L, 37637L, 10.24,  "UL", 622.9,      NA,  0.32,     NA,  1.1,     NA, 0.005,   "UV", 1.12,     NA,
             27L,   10L, 37637L, 10.03,    NA, 353.4,      NA,  0.17,     NA, 0.71,     NA, 0.005,    "U", 0.13,     NA,
             27L,    3L, 37637L,  7.02,    NA,   382,      NA,  0.19,     NA, 0.94,     NA,   0.1,     NA, 0.15,     NA,
             10L,    2L, 37670L,  8.48,    NA, 435.3,      NA,  0.22,     NA, 0.84,    "U",  0.04,    "I", 0.27,     NA,
             19L,   10L, 37676L,  6.25,  "KU", 625.7,     "U",  0.32,     NA, 0.81,     NA,  0.08,     NA, 0.15,     NA,
             19L,    2L, 37676L,  5.84,    NA, 14230,      NA,  8.23,    "K", 0.85,     NA,   0.1,     NA, 0.07,    "I",
             19L,    9L, 37676L,  7.38,   "U", 312.1,      NA,  0.15,     NA, 0.51,     NA,  0.06,     NA, 0.12,     NA,
             19L,    8L, 37676L,  7.93,   "U", 512.6,      NA,  0.26,     NA, 0.56,     NA,  0.09,     NA, 1.02,     NA,
             19L,    7L, 37676L,  8.62,    NA, 440.1,      NA,  0.22,     NA, 0.62,     NA,  0.04,    "I", 0.17,     NA,
             27L,    9L, 37676L,  8.35,    NA, 449.5,      NA,  0.23,     NA, 0.82,     NA,  0.04,    "I", 0.08,    "I",
             27L,    8L, 37676L,  8.44,    NA,   506,      NA,  0.26,     NA, 0.89,     NA,  0.08,     NA, 0.47,     NA,
             27L,    3L, 37676L,  6.96,    NA, 416.8,      NA,  0.21,     NA,  0.9,     NA,  0.09,     NA, 0.01,    "U",
             27L,   10L, 37676L,  9.68,    NA, 391.5,      NA,  0.19,     NA, 0.58,     NA, 0.005,    "U", 0.01,    "U",
             10L,    2L, 37705L,  7.78,  "KU", 487.8,      NA,  0.25,     NA, 0.71,     NA,  0.04,    "I", 0.35,     NA,
             19L,    7L, 37711L,  8.39,    NA,   526,      NA,   0.3,     NA, 0.53,     NA,  0.05,     NA, 0.21,     NA,
             19L,    8L, 37711L,  7.31,    NA,   591,      NA,   0.3,    "U", 0.61,     NA,  0.07,     NA, 0.88,     NA,
             19L,    9L, 37711L,  7.49,    NA,   468,      NA,   0.2,     NA, 0.58,     NA,  0.03,    "I", 0.13,     NA,
             19L,   10L, 37711L,  6.82,    NA,   707,      NA,   0.4,     NA, 0.76,     NA,   0.1,     NA, 0.19,     NA,
             19L,    2L, 37711L,  9.56,    NA,   125,      NA,   0.1,     NA, 1.12,     NA, 0.005,    "U", 0.01,    "U",
             27L,    9L, 37711L,  8.16,    NA,   498,      NA,   0.3,     NA, 0.89,     NA,  0.11,     NA,  0.2,     NA,
             27L,    3L, 37711L,   6.3,    NA,   163,      NA,   0.1,     NA,  0.8,     NA,  0.11,     NA, 0.04,    "I",
             27L,   10L, 37711L,   8.7,    NA,   415,      NA,   0.2,     NA, 0.56,     NA, 0.005,    "U", 0.01,   "UX",
             27L,    8L, 37711L,  8.36,    NA,   639,      NA,   0.3,     NA, 0.71,     NA,  0.07,     NA, 0.81,     NA,
             10L,    2L, 37748L,   6.5,    NA, 561.6,      NA,  0.29,     NA, 0.79,     NA,  0.22,     NA, 0.33,     NA,
             19L,    7L, 37753L,  6.85,    NA, 563.9,      NA,  0.29,     NA,  0.4,    "I", 0.005,    "U", 0.09,     NA,
             19L,    8L, 37753L,  6.02,    NA, 760.8,      NA,  0.39,     NA,  0.4,    "I", 0.005,    "U", 0.73,     NA,
             19L,    9L, 37753L,  4.95,    NA,   513,      NA,  0.26,     NA, 0.56,     NA, 0.005,    "U", 0.16,     NA,
             19L,    2L, 37753L,  4.16,    NA, 26710,      NA, 16.33,     NA, 0.57,     NA, 0.005,    "U", 0.01,    "U",
             27L,    9L, 37753L,  6.85,    NA, 460.8,      NA,  0.23,     NA, 0.64,     NA,  0.06,     NA, 0.23,     NA
         )
head(test)
#> # A tibble: 6 × 15
#>   stream  site  date    DO DO_q    Cond Cond_q   Sal Sal_q   TKN TKN_q   NH3
#>    <int> <int> <int> <dbl> <chr>  <dbl> <chr>  <dbl> <chr> <dbl> <chr> <dbl>
#> 1     10     2 37635  9.34 <NA>    492. <NA>    0.25 <NA>   0.8  <NA>  0.05 
#> 2     19    10 37637  6.06 <NA>    723. <NA>    0.37 <NA>   0.93 <NA>  0.12 
#> 3     19     2 37637  8.93 <NA>  16450  <NA>    9.62 <NA>   0.87 U     0.005
#> 4     19     9 37637  9.22 VU      477. <NA>    0.24 <NA>   0.61 <NA>  0.005
#> 5     19     5 37637  7.7  <NA>    546. <NA>    0.28 <NA>   0.56 <NA>  0.05 
#> 6     19     8 37637  8.86 VU      525  V       0.27 <NA>   0.81 <NA>  0.04 
#> # … with 3 more variables: NH3_q <chr>, NOX <dbl>, NOX_q <chr>

Created on 2023-01-24 by the reprex package (v2.0.1)

My super short script, based on my data being long with one column of values, and another column with the qualifiers:

## Based on data being in long format:
test1 <-test %>%
  #select(lake, date, QACode, parm, adj) %>%
  filter(str_detect(QACode, "K|V")== FALSE) %>%  ### Add qualifiers here ("?"does not work)
  ##  mutate(adj2 = if_else(QACode == "Q", adj/2, adj))  Works!!!
  mutate(adj2 = if_else(str_detect(QACode, "Q"), adj/2, adj))  ### Works!!!
#> Error in test %>% filter(str_detect(QACode, "K|V") == FALSE) %>% mutate(adj2 = if_else(str_detect(QACode, : could not find function "%>%"

Created on 2023-01-24 by the reprex package (v2.0.1)

There is likely a more elegant solution, but below is one way to transform the data into a long format, halving QACode "U" and removing QACodes "V" and "K".

library(tidyverse)

test1 =
  test %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(cols = c(-'stream', -'site', -'date'), 
               names_to = 'parameter',
               values_to = 'adj') %>%
  arrange(stream, site, date, parameter) %>%
  mutate(parameter = str_replace(parameter, '_q', '')) %>%
  group_by(stream, site, date, parameter) %>%
  mutate(QACode = ifelse(row_number() == 1, lead(adj), NA)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  # halve the value
  mutate(adj2 = ifelse(!is.na(QACode) & QACode == 'U', 
                        as.numeric(adj)/2, 
                        as.numeric(adj))
         ) %>%
  # remove 'V' and 'K'
  filter(!(QACode %in% c('V', 'K'))) 

head(test1, 15) # notice line #12 below is 1/2
#> # A tibble: 15 × 7
#>    stream site  date  parameter adj   QACode   adj2
#>    <chr>  <chr> <chr> <chr>     <chr> <chr>   <dbl>
#>  1 10     2     37635 Cond      492.4 <NA>   492.  
#>  2 10     2     37635 DO        9.34  <NA>     9.34
#>  3 10     2     37635 NH3       0.05  <NA>     0.05
#>  4 10     2     37635 NOX       0.66  <NA>     0.66
#>  5 10     2     37635 Sal       0.25  <NA>     0.25
#>  6 10     2     37635 TKN       0.8   <NA>     0.8 
#>  7 10     2     37670 Cond      435.3 <NA>   435.  
#>  8 10     2     37670 DO        8.48  <NA>     8.48
#>  9 10     2     37670 NH3       0.04  I        0.04
#> 10 10     2     37670 NOX       0.27  <NA>     0.27
#> 11 10     2     37670 Sal       0.22  <NA>     0.22
#> 12 10     2     37670 TKN       0.84  U        0.42   
#> 13 10     2     37705 Cond      487.8 <NA>   488.  
#> 14 10     2     37705 DO        7.78  KU       7.78
#> 15 10     2     37705 NH3       0.04  I        0.04

Created on 2023-01-24 with reprex v2.0.2.9000

1 Like

@scottyd22 Thank you for your help on this!

  • Why the mutate across everything-as.character? what does this do?
  • Why the "arrange(stream, site, date, parameter) %>%"--what does this do?
  • Why the "as.numeric" statement?

Finally if one of those codes ( 'I') states that value from another column should be used ('MDL'), how would I nest that into the mutate statement? As follows::

mutate(adj2 = ifelse(!is.na(QACode) & QACode == 'U',
as.numeric(adj)/2,
as.numeric(adj),

ifelse(QACode == 'I', as.numeric(MDL), as.numeric(adj)
) %>%

Thank you!!!!

Let me answer each of your questions.

Since the values are going to be stacked into one column, we can't have a mix of character and numeric values. However, since creating the example, I've learned the mutate can be removed and the following argument can be added within the pivot statement: values_transform = as.character

In the event any parameters get out of sequence (i.e. DO_q does not follow after DO), this ensures they are in the proper sequence prior to eliminating the "_q".

Since I converted the numbers to character in order to stack them in one column, as.numeric() is needed to carry out the division by 2. Since this results in a numeric value, cases that do not require division also need to be converted to numeric because they will be in the same column.

Assuming MDL follows suit and there are columns for MDL and MDL_q, I would create a separate data frame of just MDL values that gets joined within the transformation flow, and then change the adj2 determination into a case_when() statement.

All of this is illustrated in the updated example below.

library(tidyverse)

# add sample MDL columns
test = test %>%
  mutate(MDL = 99,
         MDL_q = NA)

# MDL values by stream, site, date
mdl_vals = test %>%
  distinct(stream, site, date, mdl_value = MDL)

test1 =
  test %>%
  pivot_longer(cols = c(-'stream', -'site', -'date'), 
               names_to = 'parameter',
               values_to = 'adj',
               values_transform = as.character) %>%
  arrange(stream, site, date, parameter) %>%
  mutate(parameter = str_replace(parameter, '_q', '')) %>%
  group_by(stream, site, date, parameter) %>%
  mutate(QACode = ifelse(row_number() == 1, lead(adj), NA)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  # join in MDL values
  left_join(mdl_vals) %>%
  # halve the value or assign the MDL value
  mutate(adj2 = case_when(
    !is.na(QACode) & QACode == 'U' ~  as.numeric(adj)/2, 
    !is.na(QACode) & QACode == 'I' ~ mdl_value, # which is already numeric
    TRUE ~ as.numeric(adj))
    ) %>%
  select(-mdl_value) %>%
  # remove 'V' and 'K'
  filter(!(QACode %in% c('V', 'K')))
#> Joining, by = c("stream", "site", "date")

head(test1, 15)
#> # A tibble: 15 × 7
#>    stream  site  date parameter adj   QACode   adj2
#>     <int> <int> <int> <chr>     <chr> <chr>   <dbl>
#>  1     10     2 37635 Cond      492.4 <NA>   492.  
#>  2     10     2 37635 DO        9.34  <NA>     9.34
#>  3     10     2 37635 MDL       99    <NA>    99   
#>  4     10     2 37635 NH3       0.05  <NA>     0.05
#>  5     10     2 37635 NOX       0.66  <NA>     0.66
#>  6     10     2 37635 Sal       0.25  <NA>     0.25
#>  7     10     2 37635 TKN       0.8   <NA>     0.8 
#>  8     10     2 37670 Cond      435.3 <NA>   435.  
#>  9     10     2 37670 DO        8.48  <NA>     8.48
#> 10     10     2 37670 MDL       99    <NA>    99   
#> 11     10     2 37670 NH3       0.04  I       99   
#> 12     10     2 37670 NOX       0.27  <NA>     0.27
#> 13     10     2 37670 Sal       0.22  <NA>     0.22
#> 14     10     2 37670 TKN       0.84  U        0.42
#> 15     10     2 37705 Cond      487.8 <NA>   488.

Created on 2023-01-25 with reprex v2.0.2.9000

1 Like

@scottyd22 Thank you for the excellent explanations!!

I am going to start adjusting my scripts--Today, I had two incidences, where I had to go back and do the adjustment in Excel--second or third time around on these data. The R script should be more efficient!

thanks again

1 Like