Calculating values of multiple columns into one

cactus · January 14, 2022, 4:08pm

I have a data frame - dfchildlong

The 11 people (subjectid) have up to 5 children each. The columns childone to childfive record each child's status (birth = 1, living = 2, dead = 4) for the years 2000 - 2006.

# Example data in wide format: one row per subjectid, one column per child and
# year (2000-2006). Codes: 1 = birth, 2 = living, 4 = dead, NA = no record.
dfchild <- data.frame(
  subjectid = letters[1:11],
  location = c(
    "NY", "NC", "WA", "WA", "OR", "CA", "AR", "KS", "AZ", "VT", "MA"
  ),
  # First child
  childone_2000 = c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childone_2001 = c(NA, 1, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childone_2002 = c(1, 2, 2, NA, 1, NA, 1, 2, NA, NA, 2),
  childone_2003 = c(2, 2, 2, NA, 2, NA, 2, 2, 1, NA, 2),
  childone_2004 = c(2, 2, 2, NA, 2, NA, 2, 2, 2, NA, 2),
  childone_2005 = c(2, 2, 2, 1, 2, NA, 2, 2, 2, NA, 2),
  childone_2006 = c(4, 2, 2, 2, 2, NA, 2, 2, 2, 1, 2),
  # Second child
  childtwo_2000 = rep(NA, 11),
  childtwo_2001 = rep(NA, 11),
  childtwo_2002 = c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childtwo_2003 = c(NA, NA, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childtwo_2004 = c(1, NA, 4, NA, NA, NA, NA, 2, NA, NA, 2),
  childtwo_2005 = c(2, 1, NA, NA, 1, NA, NA, 2, NA, NA, 2),
  childtwo_2006 = c(2, 2, NA, 1, 2, NA, NA, 2, NA, NA, 2),
  # Third child
  childthree_2000 = rep(NA, 11),
  childthree_2001 = rep(NA, 11),
  childthree_2002 = rep(NA, 11),
  childthree_2003 = rep(NA, 11),
  childthree_2004 = rep(NA, 11),
  childthree_2005 = c(1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA),
  childthree_2006 = c(2, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA),
  # Fourth child
  childfour_2000 = rep(NA, 11),
  childfour_2001 = rep(NA, 11),
  childfour_2002 = rep(NA, 11),
  childfour_2003 = rep(NA, 11),
  childfour_2004 = rep(NA, 11),
  childfour_2005 = rep(NA, 11),
  childfour_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
  # Fifth child
  childfive_2000 = rep(NA, 11),
  childfive_2001 = rep(NA, 11),
  childfive_2002 = rep(NA, 11),
  childfive_2003 = rep(NA, 11),
  childfive_2004 = rep(NA, 11),
  childfive_2005 = rep(NA, 11),
  childfive_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
)
  

library(tidyr)
#> Warning: package 'tidyr' was built under R version 3.6.2
# Reshape to long format: one row per subjectid and year, one column per child.
# Each column name is split at the underscore: the part before it becomes the
# output column name (".value") and the part after it becomes `year`.
# Splitting on "_" keeps the child columns named childone, childtwo, ...; a
# pattern such as "([^\\d]+)(\\d+)" would keep the underscore in the first
# group and produce columns named childone_, childtwo_, ...
# `year` is left as a character column.
dfchildlong <- pivot_longer(
  data = dfchild,
  cols = childone_2000:childfive_2006,
  names_to = c(".value", "year"),
  names_sep = "_"
)
Created on 2022-01-14 by the reprex package (v2.0.1)

Now, I want to calculate the number of children each subjectid has in each year. Here is an example I created manually. I want the column "Total children". For years in which a child dies (i.e. the value for that child = 4), I need to count down the number of children.

Thank you for the help!

0sigmas · January 14, 2022, 4:38pm

Hi!

I have focused on an easy thought process over an efficient operation.

library(magrittr)

# Same example data as in the question, in wide format: one row per subjectid,
# one column per child and year (2000-2006); NA means no record.
dfchild <- data.frame(
  subjectid = letters[1:11],
  location = c(
    "NY", "NC", "WA", "WA", "OR", "CA", "AR", "KS", "AZ", "VT", "MA"
  ),
  # childone
  childone_2000 = c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childone_2001 = c(NA, 1, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childone_2002 = c(1, 2, 2, NA, 1, NA, 1, 2, NA, NA, 2),
  childone_2003 = c(2, 2, 2, NA, 2, NA, 2, 2, 1, NA, 2),
  childone_2004 = c(2, 2, 2, NA, 2, NA, 2, 2, 2, NA, 2),
  childone_2005 = c(2, 2, 2, 1, 2, NA, 2, 2, 2, NA, 2),
  childone_2006 = c(4, 2, 2, 2, 2, NA, 2, 2, 2, 1, 2),
  # childtwo
  childtwo_2000 = rep(NA, 11),
  childtwo_2001 = rep(NA, 11),
  childtwo_2002 = c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childtwo_2003 = c(NA, NA, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childtwo_2004 = c(1, NA, 4, NA, NA, NA, NA, 2, NA, NA, 2),
  childtwo_2005 = c(2, 1, NA, NA, 1, NA, NA, 2, NA, NA, 2),
  childtwo_2006 = c(2, 2, NA, 1, 2, NA, NA, 2, NA, NA, 2),
  # childthree
  childthree_2000 = rep(NA, 11),
  childthree_2001 = rep(NA, 11),
  childthree_2002 = rep(NA, 11),
  childthree_2003 = rep(NA, 11),
  childthree_2004 = rep(NA, 11),
  childthree_2005 = c(1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA),
  childthree_2006 = c(2, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA),
  # childfour
  childfour_2000 = rep(NA, 11),
  childfour_2001 = rep(NA, 11),
  childfour_2002 = rep(NA, 11),
  childfour_2003 = rep(NA, 11),
  childfour_2004 = rep(NA, 11),
  childfour_2005 = rep(NA, 11),
  childfour_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
  # childfive
  childfive_2000 = rep(NA, 11),
  childfive_2001 = rep(NA, 11),
  childfive_2002 = rep(NA, 11),
  childfive_2003 = rep(NA, 11),
  childfive_2004 = rep(NA, 11),
  childfive_2005 = rep(NA, 11),
  childfive_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
)

# Reshape to one row per subjectid and year with one column per child, then
# count the children present in that year.
dfchild %>%
  # Stack every child*_year column into name/value pairs.
  tidyr::pivot_longer(tidyr::starts_with("child")) %>%
  dplyr::mutate(
    # Digits in the column name are the year; everything except the last five
    # characters ("_" plus a four-digit year) is the child label.
    year = as.numeric(stringr::str_match(name, "[0-9]+")),
    child = stringr::str_sub(name, start=1, end=-6)
  ) %>%
  dplyr::select(-name) %>%
  # Spread the child labels back out: one column per child.
  tidyr::pivot_wider(
    names_from = "child",
    values_from = "value"
  ) %>%
  dplyr::mutate(
    # A child is counted when it has a record (not NA) that is not a death (4).
    # across() picks up every child* column, so the count does not depend on
    # there being exactly five of them.
    total_children = rowSums(
      dplyr::across(tidyr::starts_with("child"), ~ !is.na(.x) & .x != 4)
    )
  )

Best!

cactus · January 14, 2022, 5:29pm

Thank you @0sigmas
Could you please explain this part

dplyr::mutate(
    year = as.numeric(stringr::str_match(name, "[0-9]+")),
    child = stringr::str_sub(name, start=1, end=-6)
  ) %>% 
  dplyr::select(-name) %>%

What do [0-9]+ and start = 1, end = -6 mean?
Thank you

0sigmas · January 14, 2022, 5:43pm

The first part, [0-9]+, is a regular expression that matches a run of one or more digits. In your case I am making the assumption that there are no other numbers in the child_* column names apart from the year 20**.

The other one, stringr::str_sub(name, start = 1, end = -6), just takes, for each entry in the name column, the substring that starts at the first character and ends at the sixth character counting from the end of the string (that is why it is negative).

Best!

cactus · January 14, 2022, 6:39pm

Thank you @0sigmas . This is awesome for learners like me.

Just a tweak from what I wanted before, is it possible to

continue having the same number of children for the rest of the years for those individuals whose data are not available after a given year for a particular child (i.e. value = NA)? I tweaked the previous data frame a little to show that. See subjectid "c" for child one for years 2004:2006 and subjectid "e" for child one for years 2005:2006. This would be a case where the child has not died but is, let's say, lost to follow-up.
If no data are available at all for the individual from some year onward (e.g. subjectid "b" starting in 2004), keep the same number of children for the rest of the available years (i.e. 2, since the total number of children is 2 in 2003).

# Modified example data in wide format: one row per subjectid, one column per
# child and year (2000-2006); NA means no record.
dfchildV1 <- data.frame(
  subjectid = letters[1:11],
  location = c(
    "NY", "NC", "WA", "WA", "OR", "CA", "AR", "KS", "AZ", "VT", "MA"
  ),
  # First child
  childone_2000 = c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childone_2001 = c(NA, 1, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childone_2002 = c(1, 2, 2, NA, 1, NA, 1, 2, NA, NA, 2),
  childone_2003 = c(2, 2, 2, NA, 2, NA, 2, 2, 1, NA, 2),
  childone_2004 = c(2, NA, NA, NA, 2, NA, 2, 2, 2, NA, 2),
  childone_2005 = c(2, NA, NA, 1, NA, NA, 2, 2, 2, NA, 2),
  childone_2006 = c(4, NA, NA, 2, NA, NA, 2, 2, 2, 1, 2),
  # Second child
  childtwo_2000 = rep(NA, 11),
  childtwo_2001 = rep(NA, 11),
  childtwo_2002 = c(NA, 1, 1, NA, NA, NA, NA, 1, NA, NA, NA),
  childtwo_2003 = c(NA, 2, 2, NA, NA, NA, NA, 2, NA, NA, 1),
  childtwo_2004 = c(1, NA, 4, NA, NA, NA, NA, 2, NA, NA, 2),
  childtwo_2005 = c(2, NA, NA, NA, 1, NA, NA, NA, NA, NA, 2),
  childtwo_2006 = c(2, NA, NA, 1, 2, NA, NA, NA, NA, NA, 2),
  # Third child
  childthree_2000 = rep(NA, 11),
  childthree_2001 = rep(NA, 11),
  childthree_2002 = rep(NA, 11),
  childthree_2003 = rep(NA, 11),
  childthree_2004 = rep(NA, 11),
  childthree_2005 = c(1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA),
  childthree_2006 = c(2, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA),
  # Fourth child
  childfour_2000 = rep(NA, 11),
  childfour_2001 = rep(NA, 11),
  childfour_2002 = rep(NA, 11),
  childfour_2003 = rep(NA, 11),
  childfour_2004 = rep(NA, 11),
  childfour_2005 = rep(NA, 11),
  childfour_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
  # Fifth child
  childfive_2000 = rep(NA, 11),
  childfive_2001 = rep(NA, 11),
  childfive_2002 = rep(NA, 11),
  childfive_2003 = rep(NA, 11),
  childfive_2004 = rep(NA, 11),
  childfive_2005 = rep(NA, 11),
  childfive_2006 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
)
Created on 2022-01-14 by the reprex package (v2.0.1)

So the data would be something like this
Screen Shot 2022-01-14 at 1.31.52 PM

Thank you for your time and help !

system · January 27, 2022, 3:27pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.