I have a data frame of values which varies in size each week when I run my code but it's typically ~3000 rows
I need to pass each of these values into a function I've created (a whole world of pain for me!) to get data back from an API.
I've found that I get an "parse error: premature EOF" error if I pass too many values to the API at once so I've broken the original data frame of values into chunks of size 500 (not a good idea, I know).
Is there a more elegant solution to passing all ~3000 values to the API so that I don't overload the API and get a HTTP 429 message?
Here is a reprex to help explain the problem, however in this example lets assume the API limit is 2 and I have 10 values i wish to pass to the API call.
install.packages("pacman")
library(pacman)
pacman::p_load(tidyverse,data.table,httr,jsonlite)
values <- c(1598727430,
1083632731,
1710983663,
1033159769,
1659312593,
1720037021,
1538669445,
1508835828,
1033154026,
1003819475,
1811299944)
df <- data.frame(values)
Here is the function that I created:
# URL OF THE NPI API
path <- "https://npiregistry.cms.hhs.gov/api/?"
# CREATE A FUNCTION TO PULL NPI INFORMATION FROM THE NPI REGISTRY
getNPI <- function(object) {
request <- httr::GET(
url = path,
query = list(
version = "2.0",
number = object
)
)
Sys.sleep(0.25)
warn_for_status(request)
npi_details <- content(request,
as = "text",
encoding = "UTF-8"
) %>%
fromJSON(.,
flatten = TRUE
) %>%
data.frame()
# IF THE API THROWS BACK A RESULT WHERE THE COLUMN NAMES CONTAIN 'ERROR'
# THEN ASSIGN ALL THE ROW VALUES TO NA AND ADD THE NPI VALUE TO THE FIRST
# COLUMN
if (any(grepl("ERROR", toupper(colnames(npi_details))))) {
npi_details <- as.data.frame(matrix("error", ncol = 6, nrow = 1), stringsAsFactors = FALSE) %>%
dplyr::rename(
`NPI NUMBER` = V1,
`CMS REF ADDRESS 1` = V2,
`CMS REF ADDRESS 2` = V3,
`CMS REF CITY` = V4,
`CMS REF STATE` = V5,
`CMS REF ZIP` = V6
) %>% as_tibble()
npi_details[1,1] <- as.character(object)
return(npi_details)
# ELSE IF THE DATA FRAME DOES NOT CONTAIN 'ERROR' THEN RUN THIS CHUNK
} else {
select(npi_details, contains(c("addresses", "number"))) %>%
unnest(c(contains("address"))) %>%
filter(address_purpose == "MAILING") %>%
rename_all(.funs = toupper) %>%
select(
`NPI NUMBER` = RESULTS.NUMBER,
-COUNTRY_CODE,
-COUNTRY_NAME,
-ADDRESS_PURPOSE,
-ADDRESS_TYPE,
`CMS REF ADDRESS 1` = ADDRESS_1,
`CMS REF ADDRESS 2` = ADDRESS_2,
`CMS REF CITY` = CITY,
`CMS REF STATE` = STATE,
`CMS REF ZIP` = POSTAL_CODE
) %>%
mutate(`NPI NUMBER` = as.character(`NPI NUMBER`))
}
}
Right now this is how I run my function against my values:
# apply the getNPI function to each chunk of the data frame
data1_2 <- apply(df[1:2,], 1, function(x) list(getNPI(x)))
data3_4 <- apply(df[3:4], 1, function(x) list(getNPI(x)))
....
data9_10 <- apply(df[9:10], 1, function(x) list(getNPI(x)))
# convert the list of lists to a single list
npi_list <- do.call(c,list(data1_2 ,
data3_4 ,
data5_6 ,
data7_8 ,
data9_10))
# convert data table
dt_list <- map(npi_list, as.data.table)
# bind them together to get a data frame
provider_npi_data <- rbindlist(dt_list, fill = TRUE, idcol = FALSE) %>%
as.data.frame() %>%
drop_na()