I am working with the R programming language.
I have this problem where I am trying to find out the pairwise intersection between two shapefiles (i.e. the percentage that each polygon from the first shapefile is intersected by all polygons in the second shapefile).
I think I figured out a basic way to solve this problem:
# Load libraries
library(sf)
library(dplyr)
# Download zip files
url_1 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lada000b21a_e.zip&k=%20%20%20151162&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip"
url_2 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lpc_000b21a_e.zip&k=%20%20%20%20%207778&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lpc_000b21a_e.zip"
download.file(url_1, destfile = "lada000b21a_e.zip")
download.file(url_2, destfile = "lpc_000b21a_e.zip")
# Extract zip files
unzip("lada000b21a_e.zip")
unzip("lpc_000b21a_e.zip")
# Read shapefiles
file_1 <- st_read("lada000b21a_e.shp")
file_2 <- st_read("lpc_000b21a_e.shp")
# Reproject if necessary
file_1 <- st_transform(file_1, "+proj=longlat +datum=WGS84")
file_2 <- st_transform(file_2, "+proj=longlat +datum=WGS84")
# Repair invalid geometries in file_1
file_1$geometry <- st_make_valid(file_1$geometry)
# Repair invalid geometries in file_2
file_2$geometry <- st_make_valid(file_2$geometry)
# Use st_intersection to find overlapping polygons
intersections <- st_intersection(file_1, file_2)
# Check for invalid features
invalid_features <- st_is_valid(intersections)
# Remove invalid features
intersections <- intersections[invalid_features, ]
# Calculate area of overlapping polygons
area_intersections <- st_area(intersections)
# Calculate area of all polygons in file_1 and file_2
area_file_1 <- st_area(file_1)
area_file_2 <- st_area(file_2)
# Calculate percentage coverage for each pair of polygons
coverage_matrix <- matrix(0, nrow(file_1), nrow(file_2))
for (i in seq_len(nrow(file_1))) {
for (j in seq_len(nrow(file_2))) {
# Calculate area of intersection between polygons i and j
intersection_area <- sum(area_intersections[which(intersections$ADAUID == file_1$ADAUID[i] &
intersections$PCUID == file_2$PCUID[j])])
# Calculate percentage coverage
coverage_matrix[i, j] <- intersection_area / (area_file_1[i] + area_file_2[j] - intersection_area)
}
}
# View coverage matrix
coverage_matrix
# Set row and column names for the coverage matrix
rownames(coverage_matrix) <- file_1$ADAUID
colnames(coverage_matrix) <- file_2$PCUID
# Convert matrix to data frame
coverage_df <- as.data.frame.table(coverage_matrix)
# Rename columns
names(coverage_df) <- c("adauid", "pcuid", "coverage_value")
# Filter non-zero coverage values
coverage_df <- coverage_df[coverage_df$coverage_value != 0, c("adauid", "coverage_value")]
coverage_df$coverage_value = coverage_df$coverage_value * 100
My Question: Now, I am trying to speed up this code using parallel computing
I know that in order to do this, I have to convert the above FOR LOOP into a function:
calculate_coverage <- function(file_1, file_2) {
coverage_matrix <- matrix(0, nrow(file_1), nrow(file_2))
for (i in seq_len(nrow(file_1))) {
for (j in seq_len(nrow(file_2))) {
# Calculate area of intersection between polygons i and j
intersection_area <- st_area(st_intersection(file_1[i, ], file_2[j, ]))
# Calculate percentage coverage
coverage_matrix[i, j] <- intersection_area / (st_area(file_1[i, ]) + st_area(file_2[j, ]) - intersection_area)
}
}
return(coverage_matrix)
}
Then, I have to "export" all intermediate objects that are required during the processing to the parallel clusters, and then run the function I defined above:
library(parallel)
# Export necessary libraries and functions to the cluster
ncores <- detectCores()
cl <- makePSOCKcluster(ncores - 1L) # reserve 1 cpu core
clusterExport(cl, "calculate_coverage")
clusterExport(cl, "file_1")
clusterExport(cl, "file_2")
clusterEvalQ(cl, library(sf))
# Apply the function in parallel
result <- parLapply(
cl = cl,
X = 1:nrow(file_1),
function(i){
calculate_coverage(file_1[i,], file_2)
}
)
# Stop the cluster
stopCluster(cl)
# Convert the result to a matrix
result_matrix <- do.call(rbind, result)
In both cases, the code runs - but am I doing this correctly?
Thanks!