Skip to content

Commit

Permalink
Gdalinfo cleanup (#966)
Browse files Browse the repository at this point in the history
* Isolate logic in parse_json_from_output.

* Test if gdalinfo outputs are same

* Figure out difference in Jenkins.

* Explicitly make job fail.

* better message. Make gdalinfo_use_subprocess new default. #906

* remove 'assert False'
  • Loading branch information
EmileSonneveld authored Dec 5, 2024
1 parent ec85949 commit 69f85bb
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 40 deletions.
4 changes: 2 additions & 2 deletions openeogeotrellis/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,6 @@ class GpsBackendConfig(OpenEoBackendConfig):
"/opt/tensorflow:/opt/tensorflow:ro"
)
batch_user_docker_mounts: dict[str, List[str]] = {}
gdalinfo_python_call: bool = True
gdalinfo_use_subprocess: bool = False
gdalinfo_python_call: bool = False
gdalinfo_use_subprocess: bool = True # TODO: Only keep one gdalinfo on true
gdalinfo_use_python_subprocess: bool = False
88 changes: 50 additions & 38 deletions openeogeotrellis/integrations/gdal.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from openeo_driver.utils import smart_bool
from openeogeotrellis.config import get_backend_config
from openeogeotrellis.utils import stream_s3_binary_file_contents, _make_set_for_key
from openeogeotrellis.utils import stream_s3_binary_file_contents, _make_set_for_key, parse_json_from_output


def poorly_log(message: str, level=logging.INFO):
Expand Down Expand Up @@ -476,63 +476,75 @@ def read_gdal_info(asset_uri: str) -> GDALInfo:
# See https://gdal.org/api/python_gotchas.html
gdal.UseExceptions()

try:
data_gdalinfo = None
# TODO: Choose a version, and remove others
backend_config = get_backend_config()
if (
data_gdalinfo = {}
# TODO: Choose a version, and remove others
backend_config = get_backend_config()
if (
not backend_config.gdalinfo_python_call
and not backend_config.gdalinfo_use_subprocess
and not backend_config.gdalinfo_use_python_subprocess
):
poorly_log(
"Neither gdalinfo_python_call nor gdalinfo_use_subprocess nor gdalinfo_use_python_subprocess is True. Avoiding gdalinfo."
)
data_gdalinfo = {}
):
poorly_log(
"Neither gdalinfo_python_call nor gdalinfo_use_subprocess nor gdalinfo_use_python_subprocess is True. Avoiding gdalinfo."
)

if backend_config.gdalinfo_python_call:
start = time.time()
if backend_config.gdalinfo_python_call:
start = time.time()
try:
data_gdalinfo = gdal.Info(asset_uri, options=gdal.InfoOptions(format="json", stats=True))
end = time.time()
poorly_log(f"gdal.Info() took {(end - start) * 1000}ms for {asset_uri}") # ~10ms
# This can throw a segfault on empty netcdf bands:
poorly_log(f"gdal.Info() took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG) # ~10ms
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'.", level=logging.WARNING
)

if backend_config.gdalinfo_use_subprocess:
start = time.time()
# Ignore errors like "band 2: Failed to compute statistics, no valid pixels found in sampling."
cmd = ["gdalinfo", asset_uri, "-json", "-stats", "--config", "GDAL_IGNORE_ERRORS", "ALL"]
if backend_config.gdalinfo_use_subprocess:
start = time.time()
# Ignore errors like "band 2: Failed to compute statistics, no valid pixels found in sampling."
# use "--debug ON" to print more logging to cerr
cmd = ["gdalinfo", asset_uri, "-json", "-stats", "--config", "GDAL_IGNORE_ERRORS", "ALL"]
try:
out = subprocess.check_output(cmd, timeout=60, text=True)
data_gdalinfo_from_subprocess = json.loads(out)
data_gdalinfo_from_subprocess = parse_json_from_output(out)
end = time.time()
poorly_log(f"gdalinfo took {(end - start) * 1000}ms for {asset_uri}") # ~30ms
poorly_log(f"gdalinfo took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG) # ~30ms
if data_gdalinfo:
assert data_gdalinfo_from_subprocess == data_gdalinfo
else:
data_gdalinfo = data_gdalinfo_from_subprocess
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'. Command: {subprocess.list2cmdline(cmd)}",
level=logging.WARNING,
)

if backend_config.gdalinfo_use_python_subprocess:
start = time.time()
cmd = [
sys.executable,
"-c",
f"""from osgeo import gdal; import json; gdal.UseExceptions(); print(json.dumps(gdal.Info({asset_uri!r}, options=gdal.InfoOptions(format="json", stats=True))))""",
]
print("\n" + subprocess.list2cmdline(cmd) + "\n")
if backend_config.gdalinfo_use_python_subprocess:
start = time.time()
cmd = [
sys.executable,
"-c",
f"""from osgeo import gdal; import json; gdal.UseExceptions(); print(json.dumps(gdal.Info({asset_uri!r}, options=gdal.InfoOptions(format="json", stats=True))))""",
]
try:
out = subprocess.check_output(cmd, timeout=60, text=True)
last_json_line = next(reversed(list(filter(lambda x: x.startswith("{"), out.split("\n")))))
data_gdalinfo_from_subprocess = json.loads(last_json_line)
data_gdalinfo_from_subprocess = parse_json_from_output(out)
end = time.time()
poorly_log(f"gdal.Info() subprocess took {(end - start) * 1000}ms for {asset_uri}") # ~130ms
poorly_log(
f"gdal.Info() subprocess took {(end - start) * 1000}ms for {asset_uri}", level=logging.DEBUG
) # ~130ms
if data_gdalinfo:
assert data_gdalinfo_from_subprocess == data_gdalinfo
else:
data_gdalinfo = data_gdalinfo_from_subprocess
except Exception as exc:
poorly_log(f"gdalinfo Exception {exc}", level=logging.WARNING)
# TODO: Specific exception type(s) would be better, but wasn't able to find what
# specific exceptions gdal.Info might raise.
return {}
else:
return data_gdalinfo
except Exception as exc:
poorly_log(
f"gdalinfo Exception. Statistics won't be added to STAC metadata. '{exc}'. Command: {subprocess.list2cmdline(cmd)}",
level=logging.WARNING,
)

return data_gdalinfo


def _get_raster_statistics(gdal_info: GDALInfo, band_name: Optional[str] = None) -> RasterStatistics:
Expand Down
16 changes: 16 additions & 0 deletions openeogeotrellis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,22 @@ def json_default(obj: Any) -> Any:
raise TypeError("%r is not JSON serializable" % obj)


def parse_json_from_output(output_str: str) -> Dict[str, Any]:
    """Extract and parse the *last* JSON object found in a noisy output string.

    Intended for tool output (e.g. ``gdalinfo``) where warnings or log lines may
    surround the JSON payload. The object is located heuristically: scanning the
    lines in reverse, collection starts at the first line that ends with ``}``
    and stops (inclusive) at the first line that starts with ``{``.

    :param output_str: raw (possibly multi-line) stdout/stderr text.
    :return: the parsed JSON object as a dict.
    :raises json.JSONDecodeError: when no parseable JSON object is found
        (an empty candidate string is passed to ``json.loads``).
    """
    collected_lines: List[str] = []
    collecting = False
    # Reverse order so that the last JSON object in the output wins.
    for line in reversed(output_str.split("\n")):
        if not collecting and line.endswith("}"):
            collecting = True
        if collecting:
            collected_lines.append(line)
            if line.startswith("{"):
                break
    # Re-join with newlines to reconstruct the original JSON text exactly
    # (valid JSON tokens never span lines, so this is always safe).
    return json.loads("\n".join(reversed(collected_lines)))


def calculate_rough_area(geoms: Iterable[BaseGeometry]):
"""
For every geometry, roughly estimate its area using its bounding box and return their sum.
Expand Down
11 changes: 11 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
stream_s3_binary_file_contents,
to_s3_url,
utcnow,
parse_json_from_output,
)


Expand Down Expand Up @@ -210,6 +211,16 @@ def test_json_default(value, expected):
assert out == expected


def test_parse_json_from_output():
    # Minimal case: the whole output is a single empty JSON object.
    assert parse_json_from_output("{}") == {}


def test_parse_json_from_output_complex():
    # Noisy output with two JSON candidates: the last (multi-line) one wins.
    noisy_output = "prefix\n{}\nmiddle\n{\"num\":\n 5}\n"
    assert parse_json_from_output(noisy_output) == {"num": 5}


class TestStatsReporter:
def test_basic(self, caplog):
caplog.set_level(logging.INFO)
Expand Down

0 comments on commit 69f85bb

Please sign in to comment.