csv import problem

arthur.t · June 3, 2021, 3:13am

I think I figured it out! You'll need to install the tidyverse and janitor packages. I had made a silly error with matrix(): I didn't set byrow = TRUE.

library(tidyverse)

# Read each line of the file as one string so that the records, which carry a
# variable number of fields, can be parsed manually.
election0 <- 
  read_delim(
    "C:\\Users\\st2516\\Downloads\\muni-2014-resultats-com-1000-et-plus-t1.txt", 
    "\n",
    col_names = FALSE,
    # the export appears to be Latin-1 encoded: decoded as UTF-8 (the default)
    # its accented characters come out garbled in the header and the data
    locale = locale(encoding = "latin1")) %>%
  setNames("line_text") %>%
  mutate(
    # split each line on the field delimiter
    split_text  = strsplit(line_text, ";"),
    # assume the first 17 elements are common to the whole line
    split_df    = map(split_text, ~.[1:17]),
    # and everything past this is a repeating group of 11 elements
    split_names = map(split_text, ~.[-c(1:17)]),
    # total number of elements on the line
    columns     = map_dbl(split_text, length),
    # the number of repeating 11-element groups on the line
    n_names     = (columns - 17)/11)
#> 
#> -- Column specification --------------------------------------------------------
#> cols(
#>   X1 = col_character()
#> )

# column names: the fields of the first line, cleaned into snake_case names
header <- janitor::make_clean_names(election0$split_text[[1]])

make_df <- function(split_text, n_rows, col_names) {
  # Reshape a flat vector of field values into a tibble.
  #
  # split_text: values laid out record by record (all fields of the first
  #             record, then all fields of the second, and so on)
  # n_rows:     number of records contained in split_text
  # col_names:  one name per field; its length fixes the number of columns
  #
  # Returns a tibble with n_rows rows and length(col_names) columns.
  cells <- matrix(
    split_text,
    nrow = n_rows,
    # fixing ncol keeps the shape well defined when there are zero records
    ncol = length(col_names),
    # fill row by row so that each record stays on its own row
    byrow = TRUE,
    # name the columns up front: as_tibble() warns on an unnamed matrix
    dimnames = list(NULL, col_names))
  # "minimal" keeps col_names exactly as supplied
  as_tibble(cells, .name_repair = "minimal")
}

# Parse every data line into two nested tibbles, then flatten them:
#   df      - the first 17 fields, which appear once per line
#   name_df - the subsequent sets of 11 fields, one row per set
election <- election0[-1, ] %>% # (first row had the header)
  transmute(
    # row_number() is safe with zero data lines, where 1:n() gives c(1, 0)
    id      = row_number(),
    df      = pmap(list(split_df,          1, list(!!header[1:17])),     make_df),
    name_df = pmap(list(split_names, n_names, list(!!header[-c(1:17)])), make_df)) %>%
  # expand the single-row common fields, then one row per repeating set
  unnest(df) %>%
  unnest(name_df)
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
  
# print a compact preview of the result, one line per column
glimpse(election, width = 80)
#> Rows: 22,899
#> Columns: 29
#> $ id                                   <int> 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, ...
#> $ date_de_lexport                      <chr> "25/03/2014 12:50:21", "25/03/...
#> $ code_du_d_gan_liepartement           <chr> "01", "01", "01", "01", "01", ...
#> $ type_de_scrutin                      <chr> "LI2", "LI2", "LI2", "LI2", "L...
#> $ libell_gan_lie_du_d_gan_liepartement <chr> "AIN", "AIN", "AIN", "AIN", "A...
#> $ code_de_la_commune                   <chr> "004", "004", "004", "004", "0...
#> $ libell_gan_lie_de_la_commune         <chr> "Amb<U+653C><U+3E39>rieu-en-Bugey", "Amb<U+653C><U+3E39>ri...
#> $ inscrits                             <chr> "00008198", "00008198", "00008...
#> $ abstentions                          <chr> "00003422", "00003422", "00003...
#> $ percent_abs_ins                      <chr> "41,74", "41,74", "41,74", "41...
#> $ votants                              <chr> "00004776", "00004776", "00004...
#> $ percent_vot_ins                      <chr> "58,26", "58,26", "58,26", "58...
#> $ blancs_et_nuls                       <chr> "00000191", "00000191", "00000...
#> $ percent_bl_nuls_ins                  <chr> "2,33", "2,33", "2,33", "2,33"...
#> $ percent_bl_nuls_vot                  <chr> "4,00", "4,00", "4,00", "4,00"...
#> $ exprim_gan_lies                      <chr> "00004585", "00004585", "00004...
#> $ percent_exp_ins                      <chr> "55,93", "55,93", "55,93", "55...
#> $ percent_exp_vot                      <chr> "96,00", "96,00", "96,00", "96...
#> $ code_nuance                          <chr> "LDVG", "LDVG", "LUMP", "LDVD"...
#> $ sexe                                 <chr> "F", "F", "M", "M", "M", "F", ...
#> $ nom                                  <chr> "EXPOSITO", "PIDOUX", "FORTIN"...
#> $ pr_gan_lienom                        <chr> "Josiane", "Catherine", "Chris...
#> $ liste                                <chr> "AMBERIEU AMBITION", "VIVONS N...
#> $ si_gan_houges_elu                    <chr> "0", "0", "0", "0", "19", "0",...
#> $ si_gan_houges_secteur                <chr> "0", "0", "0", "0", "0", "0", ...
#> $ si_gan_houges_cc                     <chr> "0", "0", "0", "0", "2", "0", ...
#> $ voix                                 <chr> "00000954", "00000822", "00001...
#> $ percent_voix_ins                     <chr> "11,64", "10,03", "16,87", "17...
#> $ percent_voix_exp                     <chr> "20,81", "17,93", "30,16", "31...

Created on 2021-06-02 by the reprex package (v1.0.0)