concatenating multiple lines based on regex group - help please

Hi,

I was pretty sure that assuming three lines per record would be problematic for the record problem, unless the notes were captured in an app that enforced that layout through the way in which it captured and exported them. That problem is solved, although I suggest a simplification in the reprex below.

For the field problem, I've assumed the first four fields are numeric.

suppressPackageStartupMessages({
  library(purrr)
  library(stringr)
})

# Regex marking the first line of a record: one or more digits at the very
# start of the line, followed by a tab. Anchoring with ^ stops a digit+tab
# sequence inside a free-text continuation line from opening a new record,
# and [0-9]+ matches multi-digit ids from their first digit.
note_id <- "^[0-9]+\\t"

####################################
# took out first element of faketext
####################################

# Sample input, one element per raw line. In this sample every record opens
# with a line of tab-separated fields, may continue over several free-text
# lines, and closes with a tab-prefixed date line. The number of lines per
# record varies (3, 4, 6 and 8), and record 2 carries its date line twice.
faketext <- c(
  "1\t8\t2\t5012312343\tphysician note\tdude is not doing so well, having issues smelling. will recommmend ct scan.",
  "plan will be to call back patient 1 week after scan.",
  "\t2019-09-25",
  "2\t8\t2\t5053245235\temergency visit\tkid in his 20s came in having a high fever.  prescribed antibiotics.",
  "recommended to come see after 3 days.",
  "\t2019-04-10",
  "\t2019-04-10",
  "3\t8\t2\t5053345677\tclinical note\tadult in 40s, bmi over 30, multiple co-morbidities.  recommended 30 minutes of walking daily,",
  "reduction of calorie intake. along with advise to ween off the number of cigarettes daily. ",
  "will be tracking progress once a month.",
  "recommended to come see after 30 days.",
  "-md",
  "\t2019-07-20",
  "4\t8\t2\t5053784567\tclinical note\tvariation in text.",
  "fakeone",
  "faketwo",
  "fakethree",
  "fakefour",
  "fakefive",
  "fake6",
  "\t2019-07-20"
)

# Return the record indices 1, 2, ..., n for the records found in `x`.
#
# A line matching the global `note_id` pattern opens a new record; every
# following line up to the next match belongs to the same record. Lines that
# come before the first match form a run of their own and are counted too.
#
# x: character vector of raw input lines.
# Returns an integer vector with one index per record (integer(0) if `x` is
# empty).
count_records <- function(x) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  # One run of equal values per record. seq_along() gives integer(0) when
  # there are no runs, whereas seq(1:length(.)) would give 1:2.
  seq_along(rle(record_of_line)$lengths)
}

# Extract the lines that make up the i-th record.
#
# i: position of the wanted record (1 = first run of lines).
# x: character vector of raw input lines; defaults to the global `faketext`,
#    so existing one-argument calls keep working.
# Returns the character vector of lines belonging to record `i`.
# Stops with an error if `i` does not refer to an existing record.
prep_records <- function(i, x = faketext) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  run_lengths <- rle(record_of_line)$lengths
  # Last and first line number of every record
  last_line <- cumsum(run_lengths)
  first_line <- last_line - run_lengths + 1
  if (is.na(i) || i < 1 || i > length(run_lengths)) {
    stop("record index ", i, " is out of range (", length(run_lengths),
         " records found)", call. = FALSE)
  }
  x[first_line[i]:last_line[i]]
}

# Split raw lines into a list holding one character vector per record.
#
# A line matching the global `note_id` pattern opens a new record. The lines
# are taken from `x` itself, so the function works on any input rather than
# only on the global sample data, and the record boundaries are computed once
# instead of once per record.
#
# x: character vector of raw input lines.
# Returns an unnamed list; element k holds the lines of the k-th record.
mk_records <- function(x) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  unname(split(x, record_of_line))
}

# Split the sample lines into records and print the resulting list
mk_records(faketext)
#> [[1]]
#> [1] "1\t8\t2\t5012312343\tphysician note\tdude is not doing so well, having issues smelling. will recommmend ct scan."
#> [2] "plan will be to call back patient 1 week after scan."                                                            
#> [3] "\t2019-09-25"                                                                                                    
#> 
#> [[2]]
#> [1] "2\t8\t2\t5053245235\temergency visit\tkid in his 20s came in having a high fever.  prescribed antibiotics."
#> [2] "recommended to come see after 3 days."                                                                     
#> [3] "\t2019-04-10"                                                                                              
#> [4] "\t2019-04-10"                                                                                              
#> 
#> [[3]]
#> [1] "3\t8\t2\t5053345677\tclinical note\tadult in 40s, bmi over 30, multiple co-morbidities.  recommended 30 minutes of walking daily,"
#> [2] "reduction of calorie intake. along with advise to ween off the number of cigarettes daily. "                                      
#> [3] "will be tracking progress once a month."                                                                                          
#> [4] "recommended to come see after 30 days."                                                                                           
#> [5] "-md"                                                                                                                              
#> [6] "\t2019-07-20"                                                                                                                     
#> 
#> [[4]]
#> [1] "4\t8\t2\t5053784567\tclinical note\tvariation in text."
#> [2] "fakeone"                                               
#> [3] "faketwo"                                               
#> [4] "fakethree"                                             
#> [5] "fakefour"                                              
#> [6] "fakefive"                                              
#> [7] "fake6"                                                 
#> [8] "\t2019-07-20"

Created on 2020-12-12 by the reprex package (v0.3.0.9001)

Part two parses the records created above. The result contains the numeric fields, the note text combined into a single character string in the middle, and the date as a character string (use lubridate::ymd(), e.g. lubridate::ymd("2020-12-12"), to convert it to a date object).

suppressPackageStartupMessages({
  library(purrr)
  library(stringr)
})

# Regex marking the first line of a record: one or more digits at the very
# start of the line, followed by a tab. Anchoring with ^ stops a digit+tab
# sequence inside a free-text continuation line from opening a new record,
# and [0-9]+ matches multi-digit ids from their first digit.
note_id <- "^[0-9]+\\t"

####################################
# took out first element of faketext
####################################

# Sample input, one element per raw line. In this sample every record opens
# with a line of tab-separated fields, may continue over several free-text
# lines, and closes with a tab-prefixed date line. The number of lines per
# record varies (3, 4, 6 and 8), and record 2 carries its date line twice.
faketext <- c(
  "1\t8\t2\t5012312343\tphysician note\tdude is not doing so well, having issues smelling. will recommmend ct scan.",
  "plan will be to call back patient 1 week after scan.",
  "\t2019-09-25",
  "2\t8\t2\t5053245235\temergency visit\tkid in his 20s came in having a high fever.  prescribed antibiotics.",
  "recommended to come see after 3 days.",
  "\t2019-04-10",
  "\t2019-04-10",
  "3\t8\t2\t5053345677\tclinical note\tadult in 40s, bmi over 30, multiple co-morbidities.  recommended 30 minutes of walking daily,",
  "reduction of calorie intake. along with advise to ween off the number of cigarettes daily. ",
  "will be tracking progress once a month.",
  "recommended to come see after 30 days.",
  "-md",
  "\t2019-07-20",
  "4\t8\t2\t5053784567\tclinical note\tvariation in text.",
  "fakeone",
  "faketwo",
  "fakethree",
  "fakefour",
  "fakefive",
  "fake6",
  "\t2019-07-20"
)

# Return the record indices 1, 2, ..., n for the records found in `x`.
#
# A line matching the global `note_id` pattern opens a new record; every
# following line up to the next match belongs to the same record. Lines that
# come before the first match form a run of their own and are counted too.
#
# x: character vector of raw input lines.
# Returns an integer vector with one index per record (integer(0) if `x` is
# empty).
count_records <- function(x) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  # One run of equal values per record. seq_along() gives integer(0) when
  # there are no runs, whereas seq(1:length(.)) would give 1:2.
  seq_along(rle(record_of_line)$lengths)
}


# Label the first six elements of a parsed record with their field names.
#
# x: vector holding, in order, row id, patient id, an unidentified numeric
#    field, note id, note text and date.
# Returns the first six elements of `x` named row_id, patient_id, unknown,
# note_id, note and date. Names already present on `x` are replaced instead
# of being fused into names such as "row_id.a"; positions missing from a
# shorter `x` come back as NA, as before.
mk_entry <- function(x) {
  fields <- c("row_id", "patient_id", "unknown", "note_id", "note", "date")
  setNames(x[seq_along(fields)], fields)
}

# Extract the lines that make up the i-th record.
#
# i: position of the wanted record (1 = first run of lines).
# x: character vector of raw input lines; defaults to the global `faketext`,
#    so existing one-argument calls keep working.
# Returns the character vector of lines belonging to record `i`.
# Stops with an error if `i` does not refer to an existing record.
prep_records <- function(i, x = faketext) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  run_lengths <- rle(record_of_line)$lengths
  # Last and first line number of every record
  last_line <- cumsum(run_lengths)
  first_line <- last_line - run_lengths + 1
  if (is.na(i) || i < 1 || i > length(run_lengths)) {
    stop("record index ", i, " is out of range (", length(run_lengths),
         " records found)", call. = FALSE)
  }
  x[first_line[i]:last_line[i]]
}

# Split raw lines into a list holding one character vector per record.
#
# A line matching the global `note_id` pattern opens a new record. The lines
# are taken from `x` itself, so the function works on any input rather than
# only on the global sample data, and the record boundaries are computed once
# instead of once per record.
#
# x: character vector of raw input lines.
# Returns an unnamed list; element k holds the lines of the k-th record.
mk_records <- function(x) {
  # Running count of record-start lines: constant within a record
  record_of_line <- cumsum(str_detect(x, note_id))
  unname(split(x, record_of_line))
}


# Label the first five elements of `x` as row_id, patient_id, note_id, note
# and date (a five-field layout without the `unknown` field).
#
# NOTE(review): nothing in the visible script calls this function, and the
# example result has six elements - confirm whether it is still needed.
#
# x: vector whose first five elements are the field values, in order.
# Returns those five elements with the field names attached. Names already
# present on `x` are replaced instead of being fused into names such as
# "row_id.a"; positions missing from a shorter `x` come back as NA.
mk_record <- function(x) {
  fields <- c("row_id", "patient_id", "note_id", "note", "date")
  setNames(x[seq_along(fields)], fields)
}
# Split the sample lines into one character vector per record
records <- mk_records(faketext)

records
#> [[1]]
#> [1] "1\t8\t2\t5012312343\tphysician note\tdude is not doing so well, having issues smelling. will recommmend ct scan."
#> [2] "plan will be to call back patient 1 week after scan."                                                            
#> [3] "\t2019-09-25"                                                                                                    
#> 
#> [[2]]
#> [1] "2\t8\t2\t5053245235\temergency visit\tkid in his 20s came in having a high fever.  prescribed antibiotics."
#> [2] "recommended to come see after 3 days."                                                                     
#> [3] "\t2019-04-10"                                                                                              
#> [4] "\t2019-04-10"                                                                                              
#> 
#> [[3]]
#> [1] "3\t8\t2\t5053345677\tclinical note\tadult in 40s, bmi over 30, multiple co-morbidities.  recommended 30 minutes of walking daily,"
#> [2] "reduction of calorie intake. along with advise to ween off the number of cigarettes daily. "                                      
#> [3] "will be tracking progress once a month."                                                                                          
#> [4] "recommended to come see after 30 days."                                                                                           
#> [5] "-md"                                                                                                                              
#> [6] "\t2019-07-20"                                                                                                                     
#> 
#> [[4]]
#> [1] "4\t8\t2\t5053784567\tclinical note\tvariation in text."
#> [2] "fakeone"                                               
#> [3] "faketwo"                                               
#> [4] "fakethree"                                             
#> [5] "fakefour"                                              
#> [6] "fakefive"                                              
#> [7] "fake6"                                                 
#> [8] "\t2019-07-20"

# Worked example: parse the last record, which is also the longest one here
last_record <- records[[length(records)]]
tab <- "\\t"
# The final line of a record is the tab-prefixed date
the_date <- str_remove(last_record[length(last_record)], tab)
first_fields <- last_record[1]
# Split the leading line once and reuse the pieces
first_split <- str_split(first_fields, tab)[[1]]
numerics <- first_split[1:4]
# Everything after the four numeric fields is text (note type + first note
# line). Negative indexing yields an empty vector, not a reversed range,
# when no text fields are present.
uppertext <- first_split[-seq_len(4)]
# Continuation lines sit between the first line and the date line. Dropping
# both ends by negative index stays correct for a two-line record, where
# 2:(length - 1) would count backwards and pick up the wrong lines.
lowertext <- last_record[-c(1, length(last_record))]
texts <- paste(c(uppertext, lowertext), collapse = " ")

result <- c(numerics, texts, the_date)

pander::pander(mk_entry(result))
Table continues below
row_id patient_id unknown note_id note
4 8 2 5053784567 clinical note variation in text. fakeone faketwo fakethree fakefour fakefive fake6

Table continues below

date
2019-07-20

Created on 2020-12-12 by the reprex package (v0.3.0.9001)