Skip to content

Commit

Permalink
Raster api tweaks (#578)
Browse files Browse the repository at this point in the history
* formatter, added rst_setsrid to docs

* standardize on rst_tooverlappingtiles, mark rst_to_overlapping_tiles as deprecated

* fixed error in RST_WorldToRasterCoordY (was returning x)

* marked rst_to_overlapping_tiles deprecated in MosaicContext

* updated RST_PixelCount to take extra params

* fixed failing python test

* added rst_write expression for writing rasters

* fixed R test

* added cutline option to RST_Clip

* minor docs tweak

* `MosaicRasterGDAL` does not need `memsize` at instantiation. Will be estimated from band data types and sizes if not known

* formatter run on the python bindings
  • Loading branch information
sllynn authored Sep 25, 2024
1 parent 0a9d8bd commit a019c23
Show file tree
Hide file tree
Showing 29 changed files with 805 additions and 151 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ test_that("scalar raster functions behave as intended", {
sdf <- withColumn(sdf, "rst_worldtorastercoordx", rst_worldtorastercoordx(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_worldtorastercoordy", rst_worldtorastercoordy(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_worldtorastercoord", rst_worldtorastercoord(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_write", rst_write(column("tile"), lit("/tmp/mosaic_tmp/")))

expect_no_error(write.df(sdf, source = "noop", mode = "overwrite"))
})
Expand All @@ -67,7 +68,7 @@ test_that("raster flatmap functions behave as intended", {
expect_equal(nrow(tessellate_sdf), 63)

overlap_sdf <- generate_singleband_raster_df()
overlap_sdf <- withColumn(overlap_sdf, "rst_to_overlapping_tiles", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L)))
overlap_sdf <- withColumn(overlap_sdf, "rst_tooverlappingtiles", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L)))

expect_no_error(write.df(overlap_sdf, source = "noop", mode = "overwrite"))
expect_equal(nrow(overlap_sdf), 87)
Expand All @@ -76,7 +77,7 @@ test_that("raster flatmap functions behave as intended", {
test_that("raster aggregation functions behave as intended", {
collection_sdf <- generate_singleband_raster_df()
collection_sdf <- withColumn(collection_sdf, "extent", st_astext(rst_boundingbox(column("tile"))))
collection_sdf <- withColumn(collection_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L)))
collection_sdf <- withColumn(collection_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L)))

merge_sdf <- summarize(
groupBy(collection_sdf, "path"),
Expand Down Expand Up @@ -127,7 +128,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", {
raster_sdf <- withColumn(raster_sdf, "timestep", element_at(rst_metadata(column("tile")), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX"))
raster_sdf <- where(raster_sdf, "timestep = 21")
raster_sdf <- withColumn(raster_sdf, "tile", rst_setsrid(column("tile"), lit(4326L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(20L), lit(20L), lit(10L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(20L), lit(20L), lit(10L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_tessellate(column("tile"), lit(target_resolution)))

clipped_sdf <- join(raster_sdf, census_sdf, raster_sdf$tile.index_id == census_sdf$index_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ test_that("scalar raster functions behave as intended", {
mutate(rst_width = rst_width(tile)) %>%
mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0)))
mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_write = rst_write(tile, "/tmp/mosaic_tmp"))

expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite"))
})
Expand All @@ -95,7 +96,7 @@ test_that("raster flatmap functions behave as intended", {
expect_equal(sdf_nrow(tessellate_sdf), 63)

overlap_sdf <- generate_singleband_raster_df() %>%
mutate(rst_to_overlapping_tiles = rst_to_overlapping_tiles(tile, 200L, 200L, 10L))
mutate(rst_tooverlappingtiles = rst_tooverlappingtiles(tile, 200L, 200L, 10L))

expect_no_error(spark_write_source(overlap_sdf, "noop", mode = "overwrite"))
expect_equal(sdf_nrow(overlap_sdf), 87)
Expand All @@ -105,7 +106,7 @@ test_that("raster flatmap functions behave as intended", {
test_that("raster aggregation functions behave as intended", {
collection_sdf <- generate_singleband_raster_df() %>%
mutate(extent = st_astext(rst_boundingbox(tile))) %>%
mutate(tile = rst_to_overlapping_tiles(tile, 200L, 200L, 10L))
mutate(tile = rst_tooverlappingtiles(tile, 200L, 200L, 10L))

merge_sdf <- collection_sdf %>%
group_by(path) %>%
Expand Down Expand Up @@ -167,7 +168,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", {
indexed_raster_sdf <- sdf_sql(sc, "SELECT tile, element_at(rst_metadata(tile), 'NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX') as timestep FROM raster") %>%
filter(timestep == 21L) %>%
mutate(tile = rst_setsrid(tile, 4326L)) %>%
mutate(tile = rst_to_overlapping_tiles(tile, 20L, 20L, 10L)) %>%
mutate(tile = rst_tooverlappingtiles(tile, 20L, 20L, 10L)) %>%
mutate(tile = rst_tessellate(tile, target_resolution))

clipped_sdf <- indexed_raster_sdf %>%
Expand Down
168 changes: 151 additions & 17 deletions docs/source/api/raster-functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ rst_avg

Returns an array containing mean values for each band.

:param tile: A column containing the raster tile.
:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:rtype: Column: ArrayType(DoubleType)

Expand Down Expand Up @@ -92,7 +92,7 @@ rst_bandmetadata
Extract the metadata describing the raster band.
Metadata is return as a map of key value pairs.

:param tile: A column containing the raster tile.
:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param band: The band number to extract metadata for.
:type band: Column (IntegerType)
Expand Down Expand Up @@ -193,29 +193,43 @@ rst_boundingbox
rst_clip
********

.. function:: rst_clip(tile, geometry)
.. function:: rst_clip(tile, geometry, cutline_all_touched)

Clips :code:`tile` with :code:`geometry`, provided in a supported encoding (WKB, WKT or GeoJSON).

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param geometry: A column containing the geometry to clip the raster to.
:type geometry: Column (GeometryType)
:param cutline_all_touched: A column to specify pixels boundary behavior.
:type cutline_all_touched: Column (BooleanType)
:rtype: Column: RasterTileType

.. note::
Notes
**Notes**
Geometry input
The :code:`geometry` parameter is expected to be a polygon or a multipolygon.

:code:`geometry` is expected to be:
- in the same coordinate reference system as the raster.
- a polygon or a multipolygon.
Cutline handling
The :code:`cutline_all_touched` parameter:

The output raster tiles will have:
- the same extent as the input geometry.
- the same number of bands as the input raster.
- the same pixel data type as the input raster.
- the same pixel size as the input raster.
- the same coordinate reference system as the input raster.
- Optional: default is true. This is a GDAL warp config for the operation.
- If set to true, the pixels touching the geometry are included in the clip,
regardless of half-in or half-out; this is important for tessellation behaviors.
- If set to false, only at least half-in pixels are included in the clip.
- More information can be found `here <https://gdal.org/en/latest/api/gdalwarp_cpp.html>`__

The actual GDAL command employed to perform the clipping operation is as follows:
:code:`"gdalwarp -wo CUTLINE_ALL_TOUCHED=<TRUE|FALSE> -cutline <GEOMETRY> -crop_to_cutline"`

Output
Output raster tiles will have:

- the same extent as the input geometry.
- the same number of bands as the input raster.
- the same pixel data type as the input raster.
- the same pixel size as the input raster.
- the same coordinate reference system as the input raster.
..
:example:
Expand Down Expand Up @@ -1737,16 +1751,36 @@ rst_numbands
+---------------------+

rst_pixelcount
***************
**************

.. function:: rst_pixelcount(tile)
.. function:: rst_pixelcount(tile, count_nodata, count_all)

Returns an array containing valid pixel count values for each band.
Returns an array containing pixel count values for each band; default excludes mask and nodata pixels.

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param count_nodata: A column to specify whether to count nodata pixels.
:type count_nodata: Column (BooleanType)
:param count_all: A column to specify whether to count all pixels.
:type count_all: Column (BooleanType)
:rtype: Column: ArrayType(LongType)

.. note::

Notes:

If pixel value is noData or mask value is 0.0, the pixel is not counted by default.

:code:`count_nodata`
- This is an optional param.
- if specified as true, include the noData (not mask) pixels in the count (default is false).

:code:`count_all`
- This is an optional param; as a positional arg, must also pass :code:`count_nodata`
(value of :code:`count_nodata` is ignored).
- if specified as true, simply return bandX * bandY in the count (default is false).
..
:example:

.. tabs::
Expand Down Expand Up @@ -2616,7 +2650,7 @@ rst_separatebands
+--------------------------------------------------------------------------------------------------------------------------------+

rst_setnodata
**********************
*************

.. function:: rst_setnodata(tile, nodata)

Expand Down Expand Up @@ -2668,6 +2702,49 @@ rst_setnodata
| {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

rst_setsrid
***********

.. function:: rst_setsrid(tile, srid)

Set the SRID of the raster tile as an EPSG code.

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param srid: The SRID to set
:type srid: Column (IntegerType)
:rtype: Column: (RasterTileType)

:example:

.. tabs::
.. code-tab:: py

df.select(mos.rst_setsrid('tile', F.lit(9122))).display()
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

.. code-tab:: scala

df.select(rst_setsrid(col("tile"), lit(9122))).show
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

.. code-tab:: sql

SELECT rst_setsrid(tile, 9122) FROM table
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

rst_skewx
*********

Expand Down Expand Up @@ -3517,3 +3594,60 @@ rst_worldtorastercoordy
+------------------------------------------------------------------------------------------------------------------+
| 997 |
+------------------------------------------------------------------------------------------------------------------+

rst_write
*********

.. function:: rst_write(input, dir)

Writes raster tiles from the input column to a specified directory.

:param input: A column containing the raster tile.
:type input: Column
:param dir: The directory, e.g. fuse, to write the tile's raster as file.
:type dir: Column(StringType)
:rtype: Column: RasterTileType

.. note::
**Notes**
- Use :code:`RST_Write` to save a 'tile' column to a specified directory (e.g. fuse) location using its
already populated GDAL driver and tile information.
- Useful for formalizing the tile 'path' when writing a Lakehouse table. An example might be to turn on checkpointing
for internal data pipeline phase operations in which multiple interim tiles are populated, but at the end of the phase
use this function to set the final path to be used in the phase's persisted table. Then, you are free to delete
the internal tiles that accumulated in the configured checkpointing directory.
..
:example:

.. tabs::
.. code-tab:: py

df.select(rst_write("tile", <write_dir>).alias("tile")).limit(1).display()
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

.. code-tab:: scala

df.select(rst_write(col("tile", <write_dir>)).as("tile)).limit(1).show
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

.. code-tab:: sql

SELECT rst_write(tile, <write_dir>) as tile FROM table LIMIT 1
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

Loading

0 comments on commit a019c23

Please sign in to comment.