Problems working with multiple tables in one worksheet and multiple excel files

Carlos_Cajas · July 16, 2022, 4:53am

Hey everybody!! looking for some help again.

I have multiple excel files that have in sheet 1 multiple tables so I want to get one data frame of all imported excel files joining the multiple tables in sheet 1.

I found this post How to efficiently import multiple excel tables located in one sheet into an R list? - Stack Overflow that I was trying to reproduce. Basically, this imports an excel file, map the tables in the sheet then split them in different elements of a list.

I tried to do the same but I don´t get clean tables.

the first row of each table has a name that I want to pass it to a new column so in the final data frame I would know from whom the table is. I try to get the name that is in position [1,1] and goes like UNIDAD 1 then I delete that row and 2 more so I get the table just for the data that I need but sometimes I don't get the name from the position [1,1].

Something to tell is that not always all the tables are fill with data.

Another thing is that the column HORA is supposed to show hours from 2 to 24 but when files are loaded, the format is changed to values form 0 to 1. I don't know how to get hours again. I tried with hm() from lubridate but it didn't work.

I hope you can help me with a better solution, I got stuck with this one.

I'm sharing 3 files for you to test.

Data

This is my code. (Not so clean)

pacman::p_load(readxl, tidyverse, janitor)

# UBICACION ARCHIVOS----

path <- "D:/Downloads/DATOS_LUCY"

# FUNCIONES----
# carga de datos
input_files_excel <- function(path) {
  require(stringr)
  table_raw <- read_excel(path = path, 
                    sheet = 1, #col_types = "text",
                    col_names = F)
  table_raw <- clean_names(table_raw)
  
  return(table_raw)
}

# utility function to get rle as a named vector
vec_rle <- function(v){
  temp <- rle(v)
  out <- temp$values
  names(out) <- temp$lengths
  return(out)
}

# utility function to map table with their columns/rows in a bigger table
make_df_index <- function(v){
  table_rle <- vec_rle(v)
  divide_points <- c(0,cumsum(names(table_rle)))
  table_index <- map2((divide_points + 1)[1:length(divide_points)-1],
                      divide_points[2:length(divide_points)],
                      ~.x:.y)
  return(table_index[table_rle])
}

# split a large table in one direction if there are blank columns or rows
split_direction <- function(df,direction = "col"){
  if(direction == "col"){
    col_has_data <- unname(map_lgl(df,~!all(is.na(.x))))
    df_mapping <- make_df_index(col_has_data)
    out <- map(df_mapping,~df[,.x])
  } else if(direction == "row"){
    row_has_data <- df %>% 
      mutate_all(~!is.na(.x)) %>%
      as.matrix() %>% 
      apply(1,any)
    df_mapping <- make_df_index(row_has_data)
    out <- map(df_mapping,~df[.x,])
  }
  return(out)
}

# split a large table into smaller tables if there are blank columns or rows
# if you still see entire rows or columns missing. Please increase complexity
split_df <- function(df,showWarnig = TRUE,complexity = 1){
  if(showWarnig){
    warning("Please don't use first row as column names.")
  }
  
  out <- split_direction(df,"col")
  
  for(i in 1 :complexity){
    out <- out %>%
      map(~split_direction(.x,"row")) %>%
      flatten() %>%
      map(~split_direction(.x,"col")) %>%
      flatten()
  }
  return(out)
  
}

# funciones para nombres de columnas
nom_col_tab <- function(df){
  names(df) <- col_tab
  return(df)
}

# This function is to take the name in position [1,1] of each table and pass it to column named UNIDAD
# Crear columna unidad
c_unidad <- function(i){
  uni <- i[1,1]
  i$UNIDAD <- uni
  return(i)
}

# Eliminar filas superiores
rm_filas <- function(i){
  i <- i[-c(1:3),]
}


# CODIGO----

# Columns names
col_tab <- c("HORA", "TANQUE_EXPAN", "POTENCIA_ACTIVA", "ENERGIA", 
             "AA_INY1", "AA_INY2", "AA_INY3", "AA_INY4", "AA_INY5", 
             "AA_INY6", "AD_DEF1", "AD_DEF2", "AD_DEF3", "AD_DEF4",
             "AD_DEF5", "AD_DEF6", "CE_TEM_MAX_PATIN_EMPUJE_C",
             "CE_TEM_ACEITE_ENTRADA_C", "CE_TEM_ACEITE_SALIDA_C",
             "CGS_TEM_MAX_PATIN_C_SUPERIOR_C", "CGS_TEM_ACEITE_ENTRADA_C",
             "CGS_TEM_ACEITE_SALIDA_C", "CGI_TEM_MAX_PATIN_C_INFERIOR_C",
             "CGI_TEM_ACEITE_ENTRADA_C", "CGI_TEM_ACEITE_SALIDA_C",
             "CGT_TEM_MAX_PATIN_C_TURBINA_C", "CGT_TEM_ACEITE_ENTRADA_C",
             "CGT_TEM_ACEITE_SALIDA_C", "ENTRADA_AIRE")

# PATH COMPLETO DE ARCHIVOS----
pathsx <- list.files(path=path, 
                     pattern="\\.xls",
                     full.names=TRUE)


table_raw <- lapply(pathsx, input_files_excel) # load files
#> New names:
#> New names:
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
#> • `` -> `...13`
#> • `` -> `...14`
#> • `` -> `...15`
#> • `` -> `...16`
#> • `` -> `...17`
#> • `` -> `...18`
#> • `` -> `...19`
#> • `` -> `...20`
#> • `` -> `...21`
#> • `` -> `...22`
#> • `` -> `...23`
#> • `` -> `...24`
#> • `` -> `...25`
#> • `` -> `...26`
#> • `` -> `...27`
#> • `` -> `...28`
#> • `` -> `...29`

# create a df with all mapped tables
dd <- bind_rows(table_raw) 

# Split each table as an element of a list
split_table <- dd %>%    
  split_df(complexity = 1) # another custom function I wrote
#> Warning in split_df(., complexity = 1): Please don't use first row as column
#> names.

# name columns
split_table <- lapply(split_table, nom_col_tab)

# creating column UNIDAD with the name of position [1,1] of each table before deleting unuseful rows.
split_table <- lapply(split_table, c_unidad)

# Deleting unuseful rows
split_table <- lapply(split_table, rm_filas)

# Creating final df
datos <- bind_rows(split_table)

^{Created on 2022-07-15 by the reprex package (v2.0.1)}

mrjoh3 · July 17, 2022, 1:34am

to index the tables on each sheet, you can use the row headers for each table to create a new index. And then use tidyr::fill to identify the table "blocks". This at least gives you a clean start for processing each table. When you split on this column you end up with a named list of tables.

tables <- readxl::read_excel('~/Downloads/14-05-2016.xls', col_names = FALSE) %>%
  dplyr::mutate(index = ifelse(stringr::str_starts(`...1`, 'UNIDAD'), `...1`, NA)) %>%
  tidyr::fill(index, .direction = 'down') %>%
  split(f = .$index)

Each table then looks very consistent, so it might be easiest to just pass a vector to name the columns. This will depend on how variable they are.

For the second part of you question regarding the HORA column. It look like decimal time, so HORA * 24 should give you the desired hour.

Hope this helps

Carlos_Cajas · July 20, 2022, 3:39am

@mrjoh3 , thanks a lot.

I have now a nested list but I just realize that I also need the date.

So my code for my list goes like this

pacman::p_load(readxl, tidyverse, janitor)

# UBICACION ARCHIVOS----

#path <- "D:/Downloads/DATOS_LUCY"
path <- "G:/Mi unidad/DATOS_LUCY"

# FUNCIONES----
# carga de datos
input_files_excel <- function(path) {
  #require(stringr)
  tables <- readxl::read_excel(path = path, col_names = FALSE) %>%
    dplyr::mutate(index = ifelse(stringr::str_starts(`...1`, 'UNIDAD'), 
                                 `...1`, NA)) %>%
    tidyr::fill(index, .direction = 'down') %>%
    split(f = .$index)
  
  return(tables)
}

# CODIGO ----
## PATH COMPLETO DE ARCHIVOS----
pathsx <- list.files(path=path, 
                     pattern="\\.xls",
                     full.names=TRUE)

## DATOS----
tables <- lapply(pathsx, input_files_excel)
#> New names:
#> New names:
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
#> • `` -> `...13`
#> • `` -> `...14`
#> • `` -> `...15`
#> • `` -> `...16`
#> • `` -> `...17`
#> • `` -> `...18`
#> • `` -> `...19`
#> • `` -> `...20`
#> • `` -> `...21`
#> • `` -> `...22`
#> • `` -> `...23`
#> • `` -> `...24`
#> • `` -> `...25`
#> • `` -> `...26`
#> • `` -> `...27`
#> • `` -> `...28`
#> • `` -> `...29`

^{Created on 2022-07-19 by the reprex package (v2.0.1)}

But the dates are missing inside of each dataframe.

I got the dates in other list like this

## FECHAS----
fechasx <- as.list(str_extract(pathsx,
                               pattern = "[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]"))

^{Created on 2022-07-19 by the reprex package (v2.0.1)}

But I don't know how to add them to each data frame inside the lists.

So what I want to get besides the last column index, is a date column.

Each list is created from one file, so each list with the names UNIDAD 1, UNIDAD 2, UNIDAD 3 and UNIDAD 4 inside that list should have a date column with the date of the file.

Please some advice.

nirgrahamuk · July 20, 2022, 9:36am

I don't wish to download your excel files to attempt to reproduce what you have from first principles. but perhaps I can help.

library(tidyverse)

(list_of_3_tables <- split(tibble(iris), ~Species))

(list_of_3_dates <- seq.Date(Sys.Date(), by = 1, length.out = 3))

# assuming the order of tables matches the order of dates ...

(list_of_3_tables_with_date_cols <- map2(
  .x = list_of_3_tables,
  .y = list_of_3_dates,
  .f = ~ mutate(.x,
    date = .y
  )
))

mrjoh3 · July 20, 2022, 11:48am

Carlos_Cajas:

input_files_excel <- function(path) {
  #require(stringr)
  tables <- readxl::read_excel(path = path, col_names = FALSE) %>%
    dplyr::mutate(index = ifelse(stringr::str_starts(`...1`, 'UNIDAD'), 
                                 `...1`, NA)) %>%
    tidyr::fill(index, .direction = 'down') %>%
    split(f = .$index)
  
  return(tables)
}

In this case you are already passing the filename into the input_files_excel function, so you can convert that and create the date field. You need two functions for this, one to get the filename and one to drop the extension tools::file_path_sans_ext(basename(path))

input_files_excel <- function(path) {
  #require(stringr)
  date <- tools::file_path_sans_ext(basename(path))
  tables <- readxl::read_excel(path = path, col_names = FALSE) %>%
    dplyr::mutate(index = ifelse(stringr::str_starts(`...1`, 'UNIDAD'),  `...1`, NA),
                                  date = lubridate::dmy(date)) %>%
    tidyr::fill(index, .direction = 'down') %>%
    split(f = .$index)
  
  return(tables)
}

then you can still pass this function to lapply or a map function. I would use map_dfr and the final result would look like

list.files(path=path, 
           pattern="\\.xls",
           full.names=TRUE) |>
map_dfr(~ input_files_excel(.x))

Of course you still need the clean up the headers of the individual tables, but I think you already have that. You just need the put that code in the function before return(tables) and merge the table list back into a single data.frame.

Carlos_Cajas · July 20, 2022, 9:26pm

Thanks a lot.

I couldn´t make it with the map_dfr() because one dataframe has less rows than the others, so I was getting an error.But I got my final df with this

bind_rows(as.list(do.call(rbind, tables)))

After that I just worked as a simple df.

system · July 27, 2022, 9:27pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.