Converting Loops into Functions

omario · April 2, 2023, 2:16am

I am working with the R programming language.

I have this problem where I am trying to find out the pairwise intersection between two shapefiles (i.e. the percentage that each polygon from the first shapefile is intersected by all polygons in the second shapefile).

I think I figured out a basic way to solve this problem:

# Load libraries
library(sf)
library(dplyr)


# Download zip files
url_1 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lada000b21a_e.zip&k=%20%20%20151162&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip"
url_2 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lpc_000b21a_e.zip&k=%20%20%20%20%207778&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lpc_000b21a_e.zip"

download.file(url_1, destfile = "lada000b21a_e.zip")
download.file(url_2, destfile = "lpc_000b21a_e.zip")

# Extract zip files
unzip("lada000b21a_e.zip")
unzip("lpc_000b21a_e.zip")

# Read shapefiles
file_1 <- st_read("lada000b21a_e.shp")
file_2 <- st_read("lpc_000b21a_e.shp")

# Reproject if necessary
file_1 <- st_transform(file_1, "+proj=longlat +datum=WGS84")
file_2 <- st_transform(file_2, "+proj=longlat +datum=WGS84")


# Repair invalid geometries in file_1
file_1$geometry <- st_make_valid(file_1$geometry)
# Repair invalid geometries in file_2
file_2$geometry <- st_make_valid(file_2$geometry)

# Use st_intersection to find overlapping polygons
intersections <- st_intersection(file_1, file_2)

# Check for invalid features
invalid_features <- st_is_valid(intersections)

# Remove invalid features
intersections <- intersections[invalid_features, ]


# Calculate area of overlapping polygons
area_intersections <- st_area(intersections)

# Calculate area of all polygons in file_1 and file_2
area_file_1 <- st_area(file_1)
area_file_2 <- st_area(file_2)

# Calculate percentage coverage for each pair of polygons
coverage_matrix <- matrix(0, nrow(file_1), nrow(file_2))
for (i in seq_len(nrow(file_1))) {
    for (j in seq_len(nrow(file_2))) {
        # Calculate area of intersection between polygons i and j
        intersection_area <- sum(area_intersections[which(intersections$ADAUID == file_1$ADAUID[i] & 
                                                              intersections$PCUID == file_2$PCUID[j])])
        # Calculate percentage coverage
        coverage_matrix[i, j] <- intersection_area / (area_file_1[i] + area_file_2[j] - intersection_area)


    }
}

# View coverage matrix
coverage_matrix

# Set row and column names for the coverage matrix
rownames(coverage_matrix) <- file_1$ADAUID
colnames(coverage_matrix) <- file_2$PCUID

# Convert matrix to data frame
coverage_df <- as.data.frame.table(coverage_matrix)

# Rename columns
names(coverage_df) <- c("adauid", "pcuid", "coverage_value")

# Filter non-zero coverage values
coverage_df <- coverage_df[coverage_df$coverage_value != 0, c("adauid", "coverage_value")]
coverage_df$coverage_value = coverage_df$coverage_value * 100

My Question: Now, I am trying to speed up this code using parallel computing

I know that in order to do this, I have to convert the above FOR LOOP into a function:

calculate_coverage <- function(file_1, file_2) {
  coverage_matrix <- matrix(0, nrow(file_1), nrow(file_2))
  for (i in seq_len(nrow(file_1))) {
    for (j in seq_len(nrow(file_2))) {
      # Calculate area of intersection between polygons i and j
      intersection_area <- st_area(st_intersection(file_1[i, ], file_2[j, ]))
      # Calculate percentage coverage
      coverage_matrix[i, j] <- intersection_area / (st_area(file_1[i, ]) + st_area(file_2[j, ]) - intersection_area)
    }
  }
  return(coverage_matrix)
}

Then, I have to "export" all intermediate objects that are required during the processing to the parallel clusters, and then run the function I defined above:

library(parallel)

# Export necessary libraries and functions to the cluster
ncores <- detectCores()
cl <- makePSOCKcluster(ncores - 1L) # reserve 1 cpu core
clusterExport(cl, "calculate_coverage")
clusterExport(cl, "file_1")
clusterExport(cl, "file_2")
clusterEvalQ(cl, library(sf))

# Apply the function in parallel
result <- parLapply(
    cl = cl,
    X = 1:nrow(file_1),
    function(i){
        calculate_coverage(file_1[i,], file_2)
    }
)

# Stop the cluster
stopCluster(cl)

# Convert the result to a matrix
result_matrix <- do.call(rbind, result)

In both cases, the code runs - but am I doing this correctly?

Thanks!

nirgrahamuk · April 2, 2023, 6:35am

You would know if the result of your parallel code was equivalent to the result of your original serial code

Its also worth verifying that its faster and not slower for a typical czse

system · May 14, 2023, 6:36am

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.