future::multiprocess + purrr XML parse fails with 'Error: external pointer is not valid' on Windows

xml2
future

#1

I'm parsing some rather large XML files into tibbles and, in an attempt to speed up the process, I'm using a combination of purrr::map and future. I've been able to complete a successful end-to-end test in a Linux environment, but the same test fails on Windows. Here's the reprex:

# Large XML future::multiprocess Parse Test - Windows Error
library(httr)
library(xml2)
library(tidyverse)
library(RCurl)
#> Loading required package: bitops
#> 
#> Attaching package: 'RCurl'
#> The following object is masked from 'package:tidyr':
#> 
#>     complete
library(future)

sessionInfo()
#> R version 3.4.3 (2017-11-30)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 16299)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] future_1.7.0    RCurl_1.95-4.10 bitops_1.0-6    forcats_0.3.0  
#>  [5] stringr_1.3.0   dplyr_0.7.4     purrr_0.2.4     readr_1.1.1    
#>  [9] tidyr_0.8.0     tibble_1.4.2    ggplot2_2.2.1   tidyverse_1.2.1
#> [13] xml2_1.2.0      httr_1.3.1     
#> 
#> loaded via a namespace (and not attached):
#>  [1] listenv_0.7.0    reshape2_1.4.3   haven_1.1.1      lattice_0.20-35 
#>  [5] colorspace_1.3-2 htmltools_0.3.6  yaml_2.1.18      rlang_0.2.0     
#>  [9] pillar_1.2.1     foreign_0.8-69   glue_1.2.0       modelr_0.1.1    
#> [13] readxl_1.0.0     bindrcpp_0.2     bindr_0.1        plyr_1.8.4      
#> [17] munsell_0.4.3    gtable_0.2.0     cellranger_1.1.0 rvest_0.3.2     
#> [21] codetools_0.2-15 psych_1.7.8      evaluate_0.10.1  knitr_1.20      
#> [25] parallel_3.4.3   broom_0.4.3      Rcpp_0.12.15     backports_1.1.2 
#> [29] scales_0.5.0     jsonlite_1.5     mnormt_1.5-5     hms_0.4.2       
#> [33] digest_0.6.15    stringi_1.1.6    grid_3.4.3       rprojroot_1.3-2 
#> [37] cli_1.0.0        tools_3.4.3      magrittr_1.5     lazyeval_0.2.1  
#> [41] crayon_1.3.4     pkgconfig_2.0.1  lubridate_1.7.2  assertthat_0.2.0
#> [45] rmarkdown_1.9    R6_2.2.2         globals_0.11.0   nlme_3.1-131.1  
#> [49] compiler_3.4.3

# New Mexico Oil and Gas FTP Root
url <- "ftp://164.64.106.6/Public/OCD/OCD%20Data/"

# Line separator for the FTP directory listing. Accept both CRLF and LF so the
# split does not depend on a local OS check; str_split() treats this pattern
# as a regular expression.
split_on_char <- "\r?\n"

# Get Core Data .zip file
# List the directory, split the listing into one entry per line, and keep the
# entry shaped like OCDCoreData<8 digits>.zip (the dot is escaped so that only
# a literal ".zip" suffix matches).
listing <- getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE)
file_name <- str_split(listing, split_on_char)[[1]] %>%
    str_subset("OCDCoreData(\\d{8})\\.zip$")

# Fail early: with zero matches paste0() would return the bare directory URL,
# and with several matches the download step would receive a vector of URLs.
stopifnot(length(file_name) == 1L)

file_path <- paste0(url, file_name)
# Build the archive path once with file.path() instead of a hard-coded "\\"
# separator, so download and extraction use the same location on any platform.
zip_path <- file.path(tempdir(), "nm.zip")
GET(file_path, write_disk(zip_path, overwrite = TRUE))
#> Warning in parse_http_status(lines[[1]]): NAs introduced by coercion
#> Warning: Failed to parse headers:
#> 331 Password required for anonymous
#> 230 Logged on
#> 257 "/" is current directory.
#> 250 CWD successful. "/Public" is current directory.
#> 250 CWD successful. "/Public/OCD" is current directory.
#> 250 CWD successful. "/Public/OCD/OCD Data" is current directory.
#> 213 20180319063106
#> 229 Entering Extended Passive Mode (|||50028|)
#> 200 Type set to I
#> 213 20518739
#> 150 Opening data channel for file download from server of "/Public/OCD/OCD Data/OCDCoreData20180319.zip"
#> 226 Successfully transferred "/Public/OCD/OCD Data/OCDCoreData20180319.zip"
#> Response [ftp://164.64.106.6/Public/OCD/OCD%20Data/OCDCoreData20180319.zip]
#>   Date: 2018-03-20 15:26
#>   Status: 226
#>   Content-Type: <unknown>
#>   Size: 20.5 MB
#> <ON DISK>  C:\Users\gmccomas\AppData\Local\Temp\RtmpgR0Jc1\nm.zip
# "TEMP\\T_OGRID.xml" is passed as the name of the entry to extract from the
# archive (presumably recorded with a backslash inside the zip), so it is left
# exactly as written; junkpaths = TRUE extracts it directly into tempdir().
unzip(zip_path, files = "TEMP\\T_OGRID.xml", exdir = tempdir(), junkpaths = TRUE, overwrite = TRUE)

# Generate NM operator xml document
# The extracted file is located by pattern rather than by an exact path.
nm_xml <- read_xml(list.files(tempdir(), pattern = "T_OGRID.xml", full.names = TRUE))

# Column names are taken from the child elements of the first <Table> node;
# the names are resolved against the namespaces reported for those children.
cols <- local({
    first_record <- xml_find_first(nm_xml, "./Table")
    fields <- xml_children(first_record)
    xml_name(fields, ns = xml_ns(fields))
})

# future::sequential xml parse - no error in any environment
# Control case: identical to the multiprocess run below except for the plan.
plan(sequential)

# For every column name in `cols` a future is created that:
#   1. finds all "./Table/<column>" nodes in nm_xml,
#   2. extracts their text and trims surrounding whitespace,
#   3. wraps the character vector in a one-element list named after the
#      column (everything up to the last ":" removed, then lower-cased).
# values() then collects the results of the futures and flatten_df() combines
# the named one-element lists into a single data frame.
system.time(
    nm_df <- cols %>%
        map(~ future({
            nm_xml %>%
                xml_find_all(sprintf("./Table/%s", .x)) %>%
                xml_text() %>%
                trimws() %>%
                list() %>%
                set_names(tolower(gsub(".*:","",.x)))
        })) %>%
        values() %>%
        flatten_df()
)
#>    user  system elapsed 
#>   26.89    0.19   27.08

# future::multiprocess xml parse - Generates 'Error: external pointer is not valid' on Windows but completes on Linux
# Same pipeline as the sequential run; only the plan differs.
# NOTE(review): likely cause, to be confirmed against the future documentation
# on non-exportable objects: on Windows plan(multiprocess) presumably runs the
# futures in separate background R sessions rather than forked processes, and
# nm_xml (an xml2 document) is presumably backed by an external pointer that
# is no longer valid once the object is exported to another process.
# NOTE(review): a possible workaround is to pass the file path and call
# read_xml() inside the future so each worker parses its own document - verify.
plan(multiprocess)

system.time(
    nm_df <- cols %>%
        map(~ future({
            nm_xml %>%
                xml_find_all(sprintf("./Table/%s", .x)) %>%
                xml_text() %>%
                trimws() %>%
                list() %>%
                set_names(tolower(gsub(".*:","",.x)))
        })) %>%
        values() %>%
        flatten_df()
)
#> Error: external pointer is not valid
#> Timing stopped at: 1.86 0.05 8.42

# Clean up only the files this script created. unlink() does not remove a
# directory unless recursive = TRUE, so unlink(tempdir()) deleted nothing, and
# the session temp directory itself should remain for the rest of the session.
unlink(c(
    file.path(tempdir(), "nm.zip"),
    list.files(tempdir(), pattern = "T_OGRID.xml", full.names = TRUE)
))

Created on 2018-03-20 by the reprex package (v0.2.0).

Any idea why this fails to complete on Windows?