From ced34c929b5e86b15cac550a92b3fec69d64da54 Mon Sep 17 00:00:00 2001
From: Martin Machyna
Date: Sat, 8 Jan 2022 19:54:02 -0500
Subject: [PATCH] Extending query split vignette

---
 DESCRIPTION               |   1 +
 vignettes/query-split.Rmd | 147 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 13da55ba..6fc570bf 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,6 +11,7 @@ Authors@R: c(
     person("Andrea", "Gilardi", role = "ctb"),
     person("Enrico", "Spinielli", role = "ctb"),
     person("Anthony", "North", role = "ctb"),
+    person("Martin", "Machyna", role = "ctb"),
     person("Marcin", "Kalicinski", role = c("ctb", "cph"),
         comment = "Author of included RapidXML code"),
     person("Finkelstein", "Noam", role = c("ctb", "cph"),
diff --git a/vignettes/query-split.Rmd b/vignettes/query-split.Rmd
index 7bb26996..af19ac46 100644
--- a/vignettes/query-split.Rmd
+++ b/vignettes/query-split.Rmd
@@ -2,6 +2,7 @@
 title: "4. Splitting large queries"
 author:
   - "Mark Padgham"
+  - "Martin Machyna"
 date: "`r Sys.Date()`"
 bibliography: osmdata-refs.bib
 output:
@@ -25,5 +26,149 @@
bounding coordinates and specific OSM key-value pairs. The server has internal
routines to limit delivery rates on queries for excessively large data sets,
and may ultimately fail for large queries. This vignette describes one approach
for breaking overly large queries into a set of smaller queries, and for
-re-combining the resultant data sets into a single `osmdata` object reflecting
+re-combining the resulting data sets into a single `osmdata` object reflecting
the desired, large query.


## 2. Query splitting

Some more complex or data-heavy queries will eventually exhaust the time or memory
limit of the `overpass` server. One way to get around the problem is to split
the bounding box (bbox) of the query into several smaller fragments and afterwards
combine the data while removing duplicate objects.

```{r get-bbox, eval = FALSE}
library(osmdata)

bb <- getbb("Southeastern Connecticut COG", featuretype = "boundary")
bb
```
```{out1, eval = FALSE}
        min       max
x -72.46677 -71.79315
y  41.27591  41.75617
```

Now let's split the bbox into two smaller areas:

```{r bbox-split, eval = FALSE}
dx <- (bb["x", "max"] - bb["x", "min"]) / 2

bbs <- list(bb, bb)

bbs[[1]]["x", "max"] <- bb["x", "max"] - dx
bbs[[2]]["x", "min"] <- bb["x", "min"] + dx

bbs
```
```{out2, eval = FALSE}
[[1]]
        min       max
x -72.46677 -72.12996
y  41.27591  41.75617

[[2]]
        min       max
x -72.12996 -71.79315
y  41.27591  41.75617
```

We can use them to create two separate overpass queries:

```{r opq-2x, eval = FALSE}
towns <- list()

towns[[1]] <- opq(bbox = bbs[[1]]) |>
    add_osm_feature(key="admin_level", value="8") |>
    osmdata_sf()
towns[[2]] <- opq(bbox = bbs[[2]]) |>
    add_osm_feature(key="admin_level", value="8") |>
    osmdata_sf()
```

The retrieved `osmdata` objects can then be merged using the `c(...)` function,
which automatically removes duplicate objects that span across both bboxes.

```{r opq-merge, eval = FALSE}
towns2 <- c(towns[[1]], towns[[2]])
```


## 3. Automatic bbox splitting

Sometimes even the split bboxes can still be too large for a successful query and
additional splitting is needed. We can automate this process by monitoring the exit
status of `opq() |> osmdata_sf()` and in case of a failed query we can keep recursively
splitting the current bbox into increasingly smaller fragments until the correct size
is reached.

Let's make a function that will split the provided bbox into a list of four equal-size
bboxes.

```{r bbox-auto-split, eval = FALSE}
split_bbox <- function(bbox, grid = 2) {
    xmin <- bbox["x", "min"]
    ymin <- bbox["y", "min"]
    dx <- (bbox["x", "max"] - bbox["x", "min"]) / grid
    dy <- (bbox["y", "max"] - bbox["y", "min"]) / grid

    bboxl <- list()

    for (i in 1:grid) {
        for (j in 1:grid) {
            b <- matrix(c(xmin + ((i-1) * dx),
                          ymin + ((j-1) * dy),
                          xmin + (i * dx),
                          ymin + (j * dy)),
                        nrow = 2,
                        dimnames = dimnames(bbox))

            bboxl <- append(bboxl, list(b))
        }
    }
    bboxl
}
```

We pre-split our area and create a queue of bboxes that we will use for
submitting queries.

```{r bbox-pre-split, eval = FALSE}
bb <- getbb("Connecticut", featuretype = NULL)
queue <- split_bbox(bb)
result <- list()
```

Now we can create a loop that will monitor the exit status of our query and in
case of success removes the bbox from the queue. If our query fails for
some reason, we split the failed bbox into four fragments and add them to
our queue.

```{r auto-query, eval = FALSE}
while (length(queue) > 0) {

    print(queue[[1]])

    opres <- NULL
    opres <- try({
        opq(bbox = queue[[1]], timeout = 25) |>
            add_osm_feature(key="natural", value="tree") |>
            osmdata_sf()
    })

    if (class(opres)[1] != "try-error") {
        result <- append(result, list(opres))
        queue <- queue[-1]
    } else {
        bboxnew <- split_bbox(queue[[1]])
        queue <- append(bboxnew, queue[-1])
    }
}
```

All retrieved `osmdata` objects stored in the `result` list can be combined using
`c(...)`. Note that for large datasets this process can be quite time-consuming.

```{r merge-result-list, eval = FALSE}
final <- do.call(c, result)
```