Skip to content

Commit

Permalink
Gdalinfo cleanup (#966)
Browse files Browse the repository at this point in the history
* Isolate logic in parse_json_from_output.

* Test if gdalinfo outputs are same

* Figure out difference in Jenkins.

* Explicitly make job fail.

* better message. Make gdalinfo_use_subprocess new default. #906

* remove 'assert False'
  • Loading branch information
EmileSonneveld authored Dec 5, 2024
1 parent ec85949 commit 69f85bb
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 40 deletions.
4 changes: 2 additions & 2 deletions openeogeotrellis/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,6 @@ class GpsBackendConfig(OpenEoBackendConfig):
"/opt/tensorflow:/opt/tensorflow:ro"
)
batch_user_docker_mounts: dict[str, List[str]] = {}
gdalinfo_python_call: bool = True
gdalinfo_use_subprocess: bool = False
gdalinfo_python_call: bool = False
gdalinfo_use_subprocess: bool = True # TODO: Only keep one gdalinfo on true
gdalinfo_use_python_subprocess: bool = False
88 changes: 50 additions & 38 deletions openeogeotrellis/integrations/gdal.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from openeo_driver.utils import smart_bool
from openeogeotrellis.config import get_backend_config
from openeogeotrellis.utils import stream_s3_binary_file_contents, _make_set_for_key
from openeogeotrellis.utils import stream_s3_binary_file_contents, _make_set_for_key, parse_json_from_output


def poorly_log(message: str, level=logging.INFO):
Expand Down Expand Up @@ -476,63 +476,75 @@ def read_gdal_info(asset_uri: str) -> GDALInfo:
# See https://gdal.org/api/python_gotchas.html
gdal.UseExceptions()

try:
data_gdalinfo = None
# TODO: Choose a version, and remove others
backend_config = get_backend_config()
if (
data_gdalinfo = {}
# TODO: Choose a version, and remove others
backend_config = get_backend_config()
if (
not backend_config.gdalinfo_python_call
and not backend_config.gdalinfo_use_subprocess
and not backend_config.gdalinfo_use_python_subprocess
):
poorly_log(
"Neither gdalinfo_python_call nor gdalinfo_use_subprocess nor gdalinfo_use_python_subprocess is True. Avoiding gdalinfo."
)
data_gdalinfo = {}
):
poorly_log(
"Neither gdalinfo_python_call nor gdalinfo_use_subprocess nor gdalinfo_use_python_subprocess is True. Avoiding gdalinfo."
)

if backend_config.gdalinfo_python_call:
start = time.time()
if backend_config.gdalinfo_python_call:
start = time.time()
try:
data_gdalinfo = gdal.Info(asset_uri, options=gdal.InfoOptions(format="json", stats=True))
end = time.time()
poorly_log(f"gdal.Info() took {(end - start) * 1000}ms for {asset_uri}") # ~10ms
# This can throw a segfault on empty netcdf bands:
poorly_log(f"gdal.Info() took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG) # ~10ms
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'.", level=logging.WARNING
)

if backend_config.gdalinfo_use_subprocess:
start = time.time()
# Ignore errors like "band 2: Failed to compute statistics, no valid pixels found in sampling."
cmd = ["gdalinfo", asset_uri, "-json", "-stats", "--config", "GDAL_IGNORE_ERRORS", "ALL"]
if backend_config.gdalinfo_use_subprocess:
start = time.time()
# Ignore errors like "band 2: Failed to compute statistics, no valid pixels found in sampling."
# use "--debug ON" to print more logging to cerr
cmd = ["gdalinfo", asset_uri, "-json", "-stats", "--config", "GDAL_IGNORE_ERRORS", "ALL"]
try:
out = subprocess.check_output(cmd, timeout=60, text=True)
data_gdalinfo_from_subprocess = json.loads(out)
data_gdalinfo_from_subprocess = parse_json_from_output(out)
end = time.time()
poorly_log(f"gdalinfo took {(end - start) * 1000}ms for {asset_uri}") # ~30ms
poorly_log(f"gdalinfo took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG) # ~30ms
if data_gdalinfo:
assert data_gdalinfo_from_subprocess == data_gdalinfo
else:
data_gdalinfo = data_gdalinfo_from_subprocess
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'. Command: {subprocess.list2cmdline(cmd)}",
level=logging.WARNING,
)

if backend_config.gdalinfo_use_python_subprocess:
start = time.time()
cmd = [
sys.executable,
"-c",
f"""from osgeo import gdal; import json; gdal.UseExceptions(); print(json.dumps(gdal.Info({asset_uri!r}, options=gdal.InfoOptions(format="json", stats=True))))""",
]
print("\n" + subprocess.list2cmdline(cmd) + "\n")
if backend_config.gdalinfo_use_python_subprocess:
start = time.time()
cmd = [
sys.executable,
"-c",
f"""from osgeo import gdal; import json; gdal.UseExceptions(); print(json.dumps(gdal.Info({asset_uri!r}, options=gdal.InfoOptions(format="json", stats=True))))""",
]
try:
out = subprocess.check_output(cmd, timeout=60, text=True)
last_json_line = next(reversed(list(filter(lambda x: x.startswith("{"), out.split("\n")))))
data_gdalinfo_from_subprocess = json.loads(last_json_line)
data_gdalinfo_from_subprocess = parse_json_from_output(out)
end = time.time()
poorly_log(f"gdal.Info() subprocess took {(end - start) * 1000}ms for {asset_uri}") # ~130ms
poorly_log(
f"gdal.Info() subprocess took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG
) # ~130ms
if data_gdalinfo:
assert data_gdalinfo_from_subprocess == data_gdalinfo
else:
data_gdalinfo = data_gdalinfo_from_subprocess
except Exception as exc:
poorly_log(f"gdalinfo Exception {exc}", level=logging.WARNING)
# TODO: Specific exception type(s) would be better, but wasn't able to find what
# specific exceptions gdal.Info might raise.
return {}
else:
return data_gdalinfo
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'. Command: {subprocess.list2cmdline(cmd)}",
level=logging.WARNING,
)

return data_gdalinfo


def _get_raster_statistics(gdal_info: GDALInfo, band_name: Optional[str] = None) -> RasterStatistics:
Expand Down
16 changes: 16 additions & 0 deletions openeogeotrellis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,22 @@ def json_default(obj: Any) -> Any:
raise TypeError("%r is not JSON serializable" % obj)


def parse_json_from_output(output_str: str) -> Dict[str, Any]:
    """Extract and parse the *last* JSON object found in a noisy output string.

    Intended for tool output (e.g. ``gdalinfo``) where warnings or log lines may
    surround the JSON payload. The object is located heuristically: scanning the
    lines in reverse, collection starts at the first line that ends with ``}``
    and stops (inclusive) at the first line that starts with ``{``.

    :param output_str: raw (possibly multi-line) stdout/stderr text.
    :return: the parsed JSON object as a dict.
    :raises json.JSONDecodeError: when no parseable JSON object is found
        (an empty candidate string is passed to ``json.loads``).
    """
    collected_lines: List[str] = []
    collecting = False
    # Reverse order so that the last JSON object in the output wins.
    for line in reversed(output_str.split("\n")):
        if not collecting and line.endswith("}"):
            collecting = True
        if collecting:
            collected_lines.append(line)
            if line.startswith("{"):
                break
    # Re-join with newlines to reconstruct the original JSON text exactly
    # (valid JSON tokens never span lines, so this is always safe).
    return json.loads("\n".join(reversed(collected_lines)))


def calculate_rough_area(geoms: Iterable[BaseGeometry]):
"""
For every geometry, roughly estimate its area using its bounding box and return their sum.
Expand Down
11 changes: 11 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
stream_s3_binary_file_contents,
to_s3_url,
utcnow,
parse_json_from_output,
)


Expand Down Expand Up @@ -210,6 +211,16 @@ def test_json_default(value, expected):
assert out == expected


def test_parse_json_from_output():
    # Minimal case: the whole output is a single empty JSON object.
    assert parse_json_from_output("{}") == {}


def test_parse_json_from_output_complex():
    # Noisy output with two JSON candidates: the last (multi-line) one wins.
    noisy_output = "prefix\n{}\nmiddle\n{\"num\":\n 5}\n"
    assert parse_json_from_output(noisy_output) == {"num": 5}


class TestStatsReporter:
def test_basic(self, caplog):
caplog.set_level(logging.INFO)
Expand Down

0 comments on commit 69f85bb

Please sign in to comment.