I have a ton of messy excel sheets to wrangle, where the list of names appear twice in the same sheet for different groups of statistics, with the two 'tables' being separated by an arbitrary number of rows.
df <- tibble::tribble(
~Name, ~col1, ~col2, ~col3, ~col_blank, ~col6, ~col7, ~col8, ~col9, ~col10, ~col_null, ~col12,
"Participant 1", "2", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 2", "0.5", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 3", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 4", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 5", "1", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 6", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 7", "2", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 8", "0.125", "0.5", NA, NA, "1", NA, 4L, 1L, 1L, NA, NA,
"Participant 9", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 11", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 12", "3", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 13", "0.125", "0.5", NA, NA, "1", NA, 4L, 1L, 1L, NA, NA,
"Participant 14", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 15", "0.33", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 16", "0.333", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 17", "0.125", "0.5", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 18", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 19", "0.125", "0.5", NA, NA, "1", "1", 4L, 1L, NA, NA, NA,
"Participant 20", "3", "0.5", NA, NA, "0", "3", 4L, 1L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "TrialA", NA, NA, "TrialB", NA, NA, NA, NA, NA, NA, NA,
NA, "gain", "pax", "gain_pct", "gain", "pax", "gain_pct", NA, NA, NA, "total_gain_pct", "forfeit",
"Participant 1", "1", "4", "0.25", "1", "4", "0.25", NA, NA, NA, "0.5", NA,
"Participant 2", "1", "4", "0.25", "1", "4", "0.25", NA, NA, NA, "0.5", NA,
"Participant 3", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 4", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 5", "0", "4", "0", "0.5", "4", "0.125", NA, NA, NA, "0.125", NA,
"Participant 6", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 7", "0", "4", "0", "0.5", "4", "0.125", NA, NA, NA, "0.125", NA,
"Participant 8", "0.5", "4", "0.125", "0", "4", "0", NA, NA, NA, "0.125", NA,
"Participant 9", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 11", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 12", "0", "4", "0", "0.5", "4", "0.125", NA, NA, NA, "0.125", NA,
"Participant 13", "0.5", "4", "0.125", "0", "4", "0", NA, NA, NA, "0.125", NA,
"Participant 14", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 15", "1", "4", "0.25", "1", "4", "0.25", NA, NA, NA, "0.5", NA,
"Participant 16", "1", "4", "0.25", "1", "4", "0.25", NA, NA, NA, "0.5", NA,
"Participant 17", "0", "4", "0", "0.5", "4", "0.125", NA, NA, NA, "0.125", NA,
"Participant 18", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Participant 19", "0.5", "4", "0.125", "0", "4", "0", NA, NA, NA, "0.125", NA,
"Participant 20", "0.5", "4", "0.125", "0", "4", "0", NA, NA, NA, "0.125", NA,
NA, "Some footnote", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "More footnotes", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)
I have written the code to extract the top section of statistics from the first to last name (renamed as 'Participant (number)' in the reprex for anonimity).
df_top <- df %>% mutate(
check = str_detect(Name, "^[A-Z]"))
# slice the data from the top to the first NA row
df_top <- df_top %>% slice(1:min(
which(is.na(df_top$check))) - 1) %>% # how do I use detect_index() instead?
select(-check)
I would like to extract the bottom section, with the end result looking like this, but with all the positioning rectified (e.g., the columns would be Name, gainA, paxA, gain_pctA, gainB, paxB, gain_pctB, total_gain_pct, forfeit
):
> df %>% slice(23:44) %>% print(n = Inf, width = Inf)
# A tibble: 22 x 12
Name col1 col2 col3 col_blank col6 col7 col8 col9 col10
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int>
1 NA TrialA NA NA TrialB NA NA NA NA NA
2 NA gain pax gain_pct gain pax gain_pct NA NA NA
3 Participant 1 1 4 0.25 1 4 0.25 NA NA NA
4 Participant 2 1 4 0.25 1 4 0.25 NA NA NA
5 Participant 3 NA NA NA NA NA NA NA NA NA
6 Participant 4 NA NA NA NA NA NA NA NA NA
7 Participant 5 0 4 0 0.5 4 0.125 NA NA NA
8 Participant 6 NA NA NA NA NA NA NA NA NA
9 Participant 7 0 4 0 0.5 4 0.125 NA NA NA
10 Participant 8 0.5 4 0.125 0 4 0 NA NA NA
11 Participant 9 NA NA NA NA NA NA NA NA NA
12 Participant 10 NA NA NA NA NA NA NA NA NA
13 Participant 11 NA NA NA NA NA NA NA NA NA
14 Participant 12 0 4 0 0.5 4 0.125 NA NA NA
15 Participant 13 0.5 4 0.125 0 4 0 NA NA NA
16 Participant 14 NA NA NA NA NA NA NA NA NA
17 Participant 15 1 4 0.25 1 4 0.25 NA NA NA
18 Participant 16 1 4 0.25 1 4 0.25 NA NA NA
19 Participant 17 0 4 0 0.5 4 0.125 NA NA NA
20 Participant 18 NA NA NA NA NA NA NA NA NA
21 Participant 19 0.5 4 0.125 0 4 0 NA NA NA
22 Participant 20 0.5 4 0.125 0 4 0 NA NA NA
col_null col12
<chr> <chr>
1 NA NA
2 total_gain_pct forfeit
3 0.5 NA
4 0.5 NA
5 NA NA
6 NA NA
7 0.125 NA
8 NA NA
9 0.125 NA
10 0.125 NA
11 NA NA
12 NA NA
13 NA NA
14 0.125 NA
15 0.125 NA
16 NA NA
17 0.5 NA
18 0.5 NA
19 0.125 NA
20 NA NA
21 0.125 NA
22 0.125 NA
What I thought of was to get the index of the 2nd match of "Participant 1" and the index of the 2nd match of last participant so as to get the relevant rows, but I am unsure how to make this happen e.g., use stringr to create a new column where the number of occurrences of each name is counted/increased as they are encountered down the first column. Then do something like "detect index where Name = Participant 1 & occurrence = 2".
The first and last names can be identified from df_top
, as each sheet has a different number of participants.
Appreciate any help!