Selecting indexes based on nth appearance of string

I have a ton of messy excel sheets to wrangle, where the list of names appear twice in the same sheet for different groups of statistics, with the two 'tables' being separated by an arbitrary number of rows.

df <- tibble::tribble(
                   ~Name,            ~col1, ~col2,      ~col3, ~col_blank, ~col6,      ~col7, ~col8, ~col9, ~col10,        ~col_null,    ~col12,
         "Participant 1",              "2", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 2",            "0.5", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 3",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 4",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 5",              "1", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 6",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 7",              "2", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 8",          "0.125", "0.5",         NA,         NA,   "1",         NA,    4L,    1L,     1L,               NA,        NA,
         "Participant 9",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 10",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 11",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 12",              "3", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 13",          "0.125", "0.5",         NA,         NA,   "1",         NA,    4L,    1L,     1L,               NA,        NA,
        "Participant 14",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 15",           "0.33", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 16",          "0.333", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 17",          "0.125", "0.5",         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 18",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 19",          "0.125", "0.5",         NA,         NA,   "1",        "1",    4L,    1L,     NA,               NA,        NA,
        "Participant 20",              "3", "0.5",         NA,         NA,   "0",        "3",    4L,    1L,     NA,               NA,        NA,
                      NA,               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
                      NA,               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
                      NA,         "TrialA",    NA,         NA,   "TrialB",    NA,         NA,    NA,    NA,     NA,               NA,        NA,
                      NA,           "gain", "pax", "gain_pct",     "gain", "pax", "gain_pct",    NA,    NA,     NA, "total_gain_pct", "forfeit",
         "Participant 1",              "1",   "4",     "0.25",        "1",   "4",     "0.25",    NA,    NA,     NA,            "0.5",        NA,
         "Participant 2",              "1",   "4",     "0.25",        "1",   "4",     "0.25",    NA,    NA,     NA,            "0.5",        NA,
         "Participant 3",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 4",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 5",              "0",   "4",        "0",      "0.5",   "4",    "0.125",    NA,    NA,     NA,          "0.125",        NA,
         "Participant 6",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
         "Participant 7",              "0",   "4",        "0",      "0.5",   "4",    "0.125",    NA,    NA,     NA,          "0.125",        NA,
         "Participant 8",            "0.5",   "4",    "0.125",        "0",   "4",        "0",    NA,    NA,     NA,          "0.125",        NA,
         "Participant 9",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 10",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 11",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 12",              "0",   "4",        "0",      "0.5",   "4",    "0.125",    NA,    NA,     NA,          "0.125",        NA,
        "Participant 13",            "0.5",   "4",    "0.125",        "0",   "4",        "0",    NA,    NA,     NA,          "0.125",        NA,
        "Participant 14",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 15",              "1",   "4",     "0.25",        "1",   "4",     "0.25",    NA,    NA,     NA,            "0.5",        NA,
        "Participant 16",              "1",   "4",     "0.25",        "1",   "4",     "0.25",    NA,    NA,     NA,            "0.5",        NA,
        "Participant 17",              "0",   "4",        "0",      "0.5",   "4",    "0.125",    NA,    NA,     NA,          "0.125",        NA,
        "Participant 18",               NA,    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
        "Participant 19",            "0.5",   "4",    "0.125",        "0",   "4",        "0",    NA,    NA,     NA,          "0.125",        NA,
        "Participant 20",            "0.5",   "4",    "0.125",        "0",   "4",        "0",    NA,    NA,     NA,          "0.125",        NA,
                      NA,  "Some footnote",    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA,
                      NA, "More footnotes",    NA,         NA,         NA,    NA,         NA,    NA,    NA,     NA,               NA,        NA
        )

I have written the code to extract the top section of statistics from the first to last name (renamed as 'Participant (number)' in the reprex for anonimity).

df_top <- df %>% mutate(
  check = str_detect(Name, "^[A-Z]")) 
# slice the data from the top to the first NA row
df_top <- df_top %>% slice(1:min(
  which(is.na(df_top$check))) - 1) %>%  # how do I use detect_index() instead?
  select(-check)

I would like to extract the bottom section, with the end result looking like this, but with all the positioning rectified (e.g., the columns would be Name, gainA, paxA, gain_pctA, gainB, paxB, gain_pctB, total_gain_pct, forfeit):

> df %>% slice(23:44) %>% print(n = Inf, width = Inf)
# A tibble: 22 x 12
   Name           col1   col2  col3     col_blank col6  col7      col8  col9 col10
   <chr>          <chr>  <chr> <chr>    <chr>     <chr> <chr>    <int> <int> <int>
 1 NA             TrialA NA    NA       TrialB    NA    NA          NA    NA    NA
 2 NA             gain   pax   gain_pct gain      pax   gain_pct    NA    NA    NA
 3 Participant 1  1      4     0.25     1         4     0.25        NA    NA    NA
 4 Participant 2  1      4     0.25     1         4     0.25        NA    NA    NA
 5 Participant 3  NA     NA    NA       NA        NA    NA          NA    NA    NA
 6 Participant 4  NA     NA    NA       NA        NA    NA          NA    NA    NA
 7 Participant 5  0      4     0        0.5       4     0.125       NA    NA    NA
 8 Participant 6  NA     NA    NA       NA        NA    NA          NA    NA    NA
 9 Participant 7  0      4     0        0.5       4     0.125       NA    NA    NA
10 Participant 8  0.5    4     0.125    0         4     0           NA    NA    NA
11 Participant 9  NA     NA    NA       NA        NA    NA          NA    NA    NA
12 Participant 10 NA     NA    NA       NA        NA    NA          NA    NA    NA
13 Participant 11 NA     NA    NA       NA        NA    NA          NA    NA    NA
14 Participant 12 0      4     0        0.5       4     0.125       NA    NA    NA
15 Participant 13 0.5    4     0.125    0         4     0           NA    NA    NA
16 Participant 14 NA     NA    NA       NA        NA    NA          NA    NA    NA
17 Participant 15 1      4     0.25     1         4     0.25        NA    NA    NA
18 Participant 16 1      4     0.25     1         4     0.25        NA    NA    NA
19 Participant 17 0      4     0        0.5       4     0.125       NA    NA    NA
20 Participant 18 NA     NA    NA       NA        NA    NA          NA    NA    NA
21 Participant 19 0.5    4     0.125    0         4     0           NA    NA    NA
22 Participant 20 0.5    4     0.125    0         4     0           NA    NA    NA
   col_null       col12  
   <chr>          <chr>  
 1 NA             NA     
 2 total_gain_pct forfeit
 3 0.5            NA     
 4 0.5            NA     
 5 NA             NA     
 6 NA             NA     
 7 0.125          NA     
 8 NA             NA     
 9 0.125          NA     
10 0.125          NA     
11 NA             NA     
12 NA             NA     
13 NA             NA     
14 0.125          NA     
15 0.125          NA     
16 NA             NA     
17 0.5            NA     
18 0.5            NA     
19 0.125          NA     
20 NA             NA     
21 0.125          NA     
22 0.125          NA  

What I thought of was to get the index of the 2nd match of "Participant 1" and the index of the 2nd match of last participant so as to get the relevant rows, but I am unsure how to make this happen e.g., use stringr to create a new column where the number of occurrences of each name is counted/increased as they are encountered down the first column. Then do something like "detect index where Name = Participant 1 & occurrence = 2".

The first and last names can be identified from df_top, as each sheet has a different number of participants.

Appreciate any help!

Hi @bayesian,
You may have already solved this problem after 3 days but, if not, here is what I did with a simplified version of your data:

suppressPackageStartupMessages(library(tidyverse))

df <- tibble::tribble(
~Name, ~col1,
"Fred", "2",
"Mary", "0.5",
"Simon", NA,
"Susan", NA,
NA, NA,
NA, NA,
NA, "TrialA",
NA, "gain",
"Fred", "1",
"Mary", "1",
"Simon", NA,
"Susan", NA,
NA, "Some footnote",
NA, "More footnotes")

subject_num <- length(levels(factor(df$Name)))

# Add index column
df %>%
  rownames_to_column(var="index") -> df

# So, top section has 4 rows
df %>%
  slice(1:subject_num) %>%
  as.data.frame() -> df_top

df_top
#>   index  Name col1
#> 1     1  Fred    2
#> 2     2  Mary  0.5
#> 3     3 Simon <NA>
#> 4     4 Susan <NA>

# Find start of bottom section
# This only works if all subjects are in the same order in the two sections
# Find second occurrence of first factor level using index column
bottom_start <- as.numeric(df$index[(df$Name == levels(factor(df$Name))[1]) &
                                   !is.na(df$Name)][2])

# Get bottom section (ignoring any footnotes)
df %>%
  slice(bottom_start:(subject_num + bottom_start - 1)) %>%
  as.data.frame() -> df_bottom

df_bottom
#>   index  Name col1
#> 1     9  Fred    1
#> 2    10  Mary    1
#> 3    11 Simon <NA>
#> 4    12 Susan <NA>

Created on 2021-07-25 by the reprex package (v2.0.0)

You can then select the desired columns, join the data frames, and fix the column labels.

This is a very elegant solution, especially with using factor levels to count. Great tip!

What I did was a lot clunkier: I created an index column, but instead of using the name I got the start index of where the df_bottom column header (in your example, the text gain) appeared till the last participant's name extracted from df_top.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.