From 552de9d25a7232cd63f6df2f793bd95300bb92f1 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 31 Aug 2023 17:01:10 -0400
Subject: [PATCH 01/13] drop pydatetime

---
 ocean_data_parser/parsers/nmea.py     | 2 +-
 ocean_data_parser/parsers/sunburst.py | 4 +---
 ocean_data_parser/process/process.py  | 2 +-
 tests/test_process.py                 | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/ocean_data_parser/parsers/nmea.py b/ocean_data_parser/parsers/nmea.py
index b63acda9..7d774471 100644
--- a/ocean_data_parser/parsers/nmea.py
+++ b/ocean_data_parser/parsers/nmea.py
@@ -298,7 +298,7 @@ def rename_variable(name):
         if nmea_dtype_mapping.get(col) != datetime:
             continue
         df[col] = (
-            pd.to_datetime(df[col], utc=True).dt.tz_convert(None).dt.to_pydatetime()
+            pd.to_datetime(df[col], utc=True).dt.tz_convert(None).dt
         )
 
     df = df.replace({np.nan: None, "": None, "None": None})
diff --git a/ocean_data_parser/parsers/sunburst.py b/ocean_data_parser/parsers/sunburst.py
index 6bcb2b94..c30b8f66 100644
--- a/ocean_data_parser/parsers/sunburst.py
+++ b/ocean_data_parser/parsers/sunburst.py
@@ -100,7 +100,6 @@ def superCO2(path: str, output: str = None) -> xarray.Dataset:
             (df["Date"] + " " + df["Time"]), format="%Y%m%d %H%M%S", utc=True
         )
         .dt.tz_convert(None)
-        .dt.to_pydatetime()
     )
 
     # Review day of the year variable
@@ -112,7 +111,6 @@ def superCO2(path: str, output: str = None) -> xarray.Dataset:
             utc=True,
         )
         .dt.tz_convert(None)
-        .dt.to_pydatetime()
     )
 
     # Compare DOY_UTC vs Date + Time
@@ -176,7 +174,7 @@ def superCO2_notes(path: str) -> xarray.Dataset:
         notes += [{**note_ensemble, **dict(zip(columns, data))}]
     # Convert notes to a dataframe
     df = pd.DataFrame.from_dict(notes)
-    df["time"] = pd.to_datetime(df["time"]).dt.to_pydatetime()
+    df["time"] = pd.to_datetime(df["time"])
    df = df.astype(dtype=notes_dtype_mapping, errors="ignore")
 
     ds = df.to_xarray()
diff --git a/ocean_data_parser/process/process.py b/ocean_data_parser/process/process.py
index 72ced366..2a349586 100644
--- a/ocean_data_parser/process/process.py
+++ b/ocean_data_parser/process/process.py
@@ -38,7 +38,7 @@ def load_cnv(file: str):
     ds = seabird.cnv(file)
     ds["time"] = (
         ds["timeK"].dims,
-        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s").to_pydatetime(),
+        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s")
     )
     return ds.swap_dims({"index": "time"}).drop("index")
 
diff --git a/tests/test_process.py b/tests/test_process.py
index 17c714e8..f1a3f4e2 100644
--- a/tests/test_process.py
+++ b/tests/test_process.py
@@ -56,7 +56,7 @@ def load_test_dataset():
     ds = seabird.cnv(TEST_SEABIRD_FILE)
     ds["time"] = (
         ds["timeK"].dims,
-        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s").to_pydatetime(),
+        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s")
     )
     ds.process.time = "time"
     ds.process.lat = "latitude"

From f26b45f1ed851857ed5244e856e8f39db9de9f38 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 31 Aug 2023 17:04:16 -0400
Subject: [PATCH 02/13] replace applymap by map

---
 ocean_data_parser/parsers/onset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocean_data_parser/parsers/onset.py b/ocean_data_parser/parsers/onset.py
index da12c45a..8b689bd9 100644
--- a/ocean_data_parser/parsers/onset.py
+++ b/ocean_data_parser/parsers/onset.py
@@ -211,7 +211,7 @@ def csv(
         usecols=[id for id, name in enumerate(column_names)],
         **read_csv_kwargs,
     )
-    df[header["time_variables"]] = df[header["time_variables"]].applymap(
+    df[header["time_variables"]] = df[header["time_variables"]].map(
         lambda x: _parse_onset_time(x, header["timezone"])
     )
 

From 548677e2cdfcfa6aa28d5b172583e9547ca098d7 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 11:03:54 -0500
Subject: [PATCH 03/13] fix nmea parser

---
 ocean_data_parser/parsers/nmea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocean_data_parser/parsers/nmea.py b/ocean_data_parser/parsers/nmea.py
index c2fc8736..8e99f2aa 100644
--- a/ocean_data_parser/parsers/nmea.py
+++ b/ocean_data_parser/parsers/nmea.py
@@ -298,7 +298,7 @@ def rename_variable(name):
         if nmea_dtype_mapping.get(col) != datetime:
             continue
         df[col] = (
-            pd.to_datetime(df[col], utc=True).dt.tz_convert(None).dt
+            pd.to_datetime(df[col], utc=True).dt.tz_convert(None)
         )
 
     df = df.replace({np.nan: None, "": None, "None": None})

From 2ad9ab7d7e7eb4d446d9ff1abfac3d2f40228eb2 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 11:33:03 -0500
Subject: [PATCH 04/13] fix typo

---
 ocean_data_parser/parsers/nmea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocean_data_parser/parsers/nmea.py b/ocean_data_parser/parsers/nmea.py
index 8e99f2aa..ddd46953 100644
--- a/ocean_data_parser/parsers/nmea.py
+++ b/ocean_data_parser/parsers/nmea.py
@@ -61,7 +61,7 @@
     "heading": float,
     "heading_true": float,
     "heading_magnetic": float,
-    "" "hdg_true": str,
+    "hdg_true": str,
     "wind_angle": float,
     "reference": str,
     "wind_speed": float,
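Patches 01-04 track two pandas API changes: handing tz-naive datetime64 values straight to xarray removes the need for `Series.dt.to_pydatetime()` (deprecated in pandas 2.x), and `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1. A minimal sketch of both patterns, assuming pandas >= 2.1; the column names and sample values below are illustrative (modeled on Onset-style exports), not taken from the repository's test data:

    import pandas as pd

    df = pd.DataFrame({"Date Time, GMT-07:00": ["08/31/23 05:00:00 PM"]})
    time_variables = ["Date Time, GMT-07:00"]

    # Element-wise transforms now go through DataFrame.map, the renamed
    # replacement for the deprecated DataFrame.applymap:
    df[time_variables] = df[time_variables].map(
        lambda x: pd.to_datetime(x, format="%m/%d/%y %I:%M:%S %p")
    )

    # Parsing as UTC and then dropping the timezone yields naive
    # datetime64[ns] values, which xarray stores directly, so the old
    # .dt.to_pydatetime() round trip through Python datetimes goes away:
    utc = pd.to_datetime(pd.Series(["2023-08-31T17:01:10-04:00"]), utc=True)
    print(utc.dt.tz_convert(None))  # 2023-08-31 21:01:10, dtype datetime64[ns]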
From 5fa4c130c643f0954b29f9abcc079e65c13034ac Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 11:34:07 -0500
Subject: [PATCH 05/13] isort . black .

---
 ocean_data_parser/parsers/amundsen.py | 12 +++++++-----
 ocean_data_parser/parsers/nmea.py     | 10 ++++++----
 ocean_data_parser/parsers/sunburst.py | 24 +++++++++---------------
 ocean_data_parser/process/process.py  |  2 +-
 tests/test_process.py                 |  2 +-
 5 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/ocean_data_parser/parsers/amundsen.py b/ocean_data_parser/parsers/amundsen.py
index cd4d8234..2292e144 100644
--- a/ocean_data_parser/parsers/amundsen.py
+++ b/ocean_data_parser/parsers/amundsen.py
@@ -109,15 +109,17 @@ def int_format(
             key, value = line.strip()[1:].split(":", 1)
             metadata[key.strip()] = value.strip()
         elif line == "% Fluorescence [ug/L]":
-            metadata['Fluo'] = "Fluorescence [ug/L]"
+            metadata["Fluo"] = "Fluorescence [ug/L]"
         elif line == "% Conservative Temperature (TEOS-10) [deg C]":
-            metadata['CONT'] = "Conservative Temperature (TEOS-10) [deg C]"
+            metadata["CONT"] = "Conservative Temperature (TEOS-10) [deg C]"
         elif line == "% In situ density TEOS10 ((s, t, p) - 1000) [kg/m^3]":
-            metadata['D_CT'] = "In situ density TEOS10 ((s, t, p) - 1000) [kg/m^3]"
+            metadata["D_CT"] = "In situ density TEOS10 ((s, t, p) - 1000) [kg/m^3]"
         elif line == "% Potential density TEOS10 ((s, t, 0) - 1000) [kg/m^3]":
-            metadata['D0CT'] = "Potential density TEOS10 ((s, t, 0) - 1000) [kg/m^3]"
+            metadata[
+                "D0CT"
+            ] = "Potential density TEOS10 ((s, t, 0) - 1000) [kg/m^3]"
         elif line == "% Potential density TEOS10 (s, t, 0) [kg/m^3]":
-            metadata['D0CT'] = "Potential density TEOS10 (s, t, 0) [kg/m^3]"
+            metadata["D0CT"] = "Potential density TEOS10 (s, t, 0) [kg/m^3]"
         elif re.match(r"% .* \[.+\]", line):
             logger.warning(
                 "Unknown variable name will be saved to unknown_variables_information: %s",
diff --git a/ocean_data_parser/parsers/nmea.py b/ocean_data_parser/parsers/nmea.py
index ddd46953..504f88c6 100644
--- a/ocean_data_parser/parsers/nmea.py
+++ b/ocean_data_parser/parsers/nmea.py
@@ -179,7 +179,11 @@ def _generate_extra_terms(nmea):
             f"{nmea['year']}-{nmea['month']}-{nmea['day']}T{nmea['timestamp']} UTC",
             f"%Y-%m-%dT%H%M%S{'.%f' if len(nmea['timestamp'])>6 else''} %Z",
         )
-    if nmea["sentence_type"] == "RMC" and nmea.get('timestamp') and nmea.get('datestamp'):
+    if (
+        nmea["sentence_type"] == "RMC"
+        and nmea.get("timestamp")
+        and nmea.get("datestamp")
+    ):
         extra[("GPS Time", "gps_datetime")] = datetime.strptime(
             f"{nmea['datestamp']}T{nmea['timestamp']} UTC",
             f"%d%m%yT%H%M%S{'.%f' if len(nmea['timestamp'])>6 else''} %Z",
@@ -297,9 +301,7 @@ def rename_variable(name):
     for col in df:
         if nmea_dtype_mapping.get(col) != datetime:
             continue
-        df[col] = (
-            pd.to_datetime(df[col], utc=True).dt.tz_convert(None)
-        )
+        df[col] = pd.to_datetime(df[col], utc=True).dt.tz_convert(None)
 
     df = df.replace({np.nan: None, "": None, "None": None})
 
diff --git a/ocean_data_parser/parsers/sunburst.py b/ocean_data_parser/parsers/sunburst.py
index c30b8f66..d265dced 100644
--- a/ocean_data_parser/parsers/sunburst.py
+++ b/ocean_data_parser/parsers/sunburst.py
@@ -95,23 +95,17 @@ def superCO2(path: str, output: str = None) -> xarray.Dataset:
     df.columns = [_format_variables(var) for var in df.columns]
 
     # Generate time variable from Date and Time columns
-    df["time"] = (
-        pd.to_datetime(
-            (df["Date"] + " " + df["Time"]), format="%Y%m%d %H%M%S", utc=True
-        )
-        .dt.tz_convert(None)
-    )
+    df["time"] = pd.to_datetime(
+        (df["Date"] + " " + df["Time"]), format="%Y%m%d %H%M%S", utc=True
+    ).dt.tz_convert(None)
 
     # Review day of the year variable
-    df["time_doy_utc"] = (
-        pd.to_datetime(
-            df["DOY_UTC"] - 1,
-            unit="D",
-            origin=pd.Timestamp(collected_beginning_date.year, 1, 1),
-            utc=True,
-        )
-        .dt.tz_convert(None)
-    )
+    df["time_doy_utc"] = pd.to_datetime(
+        df["DOY_UTC"] - 1,
+        unit="D",
+        origin=pd.Timestamp(collected_beginning_date.year, 1, 1),
+        utc=True,
+    ).dt.tz_convert(None)
 
     # Compare DOY_UTC vs Date + Time
     dt = (df["time"] - df["time_doy_utc"]).mean().total_seconds()
diff --git a/ocean_data_parser/process/process.py b/ocean_data_parser/process/process.py
index 2a349586..3def06a7 100644
--- a/ocean_data_parser/process/process.py
+++ b/ocean_data_parser/process/process.py
@@ -38,7 +38,7 @@ def load_cnv(file: str):
     ds = seabird.cnv(file)
     ds["time"] = (
         ds["timeK"].dims,
-        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s")
+        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s"),
     )
     return ds.swap_dims({"index": "time"}).drop("index")
 
diff --git a/tests/test_process.py b/tests/test_process.py
index f1a3f4e2..1eaa2c92 100644
--- a/tests/test_process.py
+++ b/tests/test_process.py
@@ -56,7 +56,7 @@ def load_test_dataset():
     ds = seabird.cnv(TEST_SEABIRD_FILE)
     ds["time"] = (
         ds["timeK"].dims,
-        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s")
+        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s"),
     )
     ds.process.time = "time"
     ds.process.lat = "latitude"
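Patch 05 is a pure `isort`/`black` pass, but the sunburst hunk it reflows is worth unpacking: `DOY_UTC` is a 1-based fractional day of year, so the parser subtracts 1 and offsets from January 1 of the collection year before stripping the timezone. A minimal sketch of that decoding, with made-up sample values and 2023 standing in for `collected_beginning_date.year`:

    import pandas as pd

    df = pd.DataFrame({"DOY_UTC": [1.5, 32.25]})  # made-up sample values

    # DOY_UTC is 1-based, so day 1.0 is January 1 00:00; subtract 1
    # before offsetting, parse as UTC, then drop the timezone:
    df["time_doy_utc"] = pd.to_datetime(
        df["DOY_UTC"] - 1,
        unit="D",
        origin=pd.Timestamp(2023, 1, 1),
        utc=True,
    ).dt.tz_convert(None)

    print(df["time_doy_utc"].tolist())
    # [Timestamp('2023-01-01 12:00:00'), Timestamp('2023-02-01 06:00:00')]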
From 9a436947a21901a7093f2caa7edc683ff3d43d9b Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 11:36:29 -0500
Subject: [PATCH 06/13] drop seabird code from process module

---
 ocean_data_parser/process/process.py | 101 +--------------------------
 1 file changed, 1 insertion(+), 100 deletions(-)

diff --git a/ocean_data_parser/process/process.py b/ocean_data_parser/process/process.py
index 3def06a7..bbb5d307 100644
--- a/ocean_data_parser/process/process.py
+++ b/ocean_data_parser/process/process.py
@@ -6,110 +6,11 @@
 import pandas as pd
 import xarray as xr
 
-from ocean_data_parser.parsers import seabird, utils
+from ocean_data_parser.parsers import utils
 
 logger = logging.getLogger(__name__)
 
 
-def generate_dataset_file_name(ds: xr.Dataset, suffix: str = "") -> str:
-    return (
-        "_".join(
-            [
-                "Hakai",
-                f"{ds.attrs['instrument_manufacturer']}-{ds.attrs['instrument_type']}-{ds.attrs['instrument_sub_type']}",
-                f"SN{ds.attrs['serial_number']}",
-                f"{ds.attrs['region']}-{ds.attrs['site']}",
-                f"{pd.to_datetime(ds['time'].min().values):%y%m%d}-{pd.to_datetime(ds['time'].max().values):%y%m%d}",
-            ]
-        )
-        + suffix
-    )
-
-
-def load_cnv(file: str):
-    """Load seabird cnv and apply conversion
-
-    Args:
-        file (str): _description_
-
-    Returns:
-        _type_: _description_
-    """
-    ds = seabird.cnv(file)
-    ds["time"] = (
-        ds["timeK"].dims,
-        pd.to_datetime(ds["timeK"], origin="2000-01-01", unit="s"),
-    )
-    return ds.swap_dims({"index": "time"}).drop("index")
-
-
-def match_metadata(file, log):
-    file = Path(file)
-    ds = load_cnv(file)
-
-    # Append lat/lon/station/file_id variables
-    # Find any matching records in log
-    is_matching_sn = log["Serial Number"] == int(ds.attrs["temperature_sn"])
-    is_within_deploymen_time = (
-        log["Deployment Time"].dt.tz_localize(None) < ds["time"].mean().values
-    ) & (
-        (log["Retrieval Time"].dt.tz_localize(None) > ds["time"].mean().values)
-        | (log["Retrieval Time"].isna())
-    )
-    selected_log_record = log.loc[is_matching_sn & is_within_deploymen_time]
-
-    if len(selected_log_record) != 1:
-        raise RuntimeError("Failed to match an appropriate record")
-    elif selected_log_record.empty:
-        raise RuntimeError("Failed to match record to any deployments")
-
-    return selected_log_record.iloc[0]
-
-
-def get_L0_file(
-    file: str, selected_log_record: pd.Series = None, ds: xr.Dataset = None
-):
-    # Load raw file
-    file = Path(file)
-    if not ds:
-        ds = load_cnv(file)
-
-    # Try to match metadata if not given
-    if selected_log_record is None:
-        selected_log_record = match_metadata
-
-    # Add Log metadata to record
-    ds["file_id"] = file.name
-    ds["instrument_model"] = ds.attrs["instrument_type"].strip()
-    ds["instrument_serialnumber"] = f"037{ds.attrs['temperature_sn']}"
-    ds["station"] = selected_log_record["Site"]
-    ds["latitude"] = selected_log_record["Target Latitude (dd°mm.mmm'N) "]
-    ds["latitude"].attrs = {
-        "long_name": "Latitude",
-        "standard_name": "latitude",
-        "units": "degrees_north",
-    }
-    ds["longitude"] = selected_log_record["Target Longitude (dd°mm.mmm'W)"]
-    ds["longitude"].attrs = {
-        "long_name": "Longitude",
-        "standard_name": "longitude",
-        "units": "degrees_east",
-    }
-
-    # Standardize units to match previous data
-    ds["sal00"].attrs["units"] = "1e-3"
-    ds["sbeopoxMLPerL"].attrs["units"] = "mL/L"
-    ds["prdM"].attrs["units"] = "dbar"
-
-    # Dump all log as global attributes
-    ds.attrs.update(
-        {
-            key.lower().replace(" ", "_").split("(")[0]: value
-            for key, value in selected_log_record.to_dict().items()
-        }
-    )
-    return ds
-
-
 @xr.register_dataset_accessor("process")
 class Processing:
     def __init__(

From f0daed7e95e82e3de12d2c9b1f4c7aec7b74fcec Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:29:58 -0500
Subject: [PATCH 07/13] fix registry types

---
 tests/test_file_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_file_registry.py b/tests/test_file_registry.py
index 9e272f07..14cc9e50 100644
--- a/tests/test_file_registry.py
+++ b/tests/test_file_registry.py
@@ -133,7 +133,7 @@ def test_update(self):
 
         # Replace registry parameters
         file_registry.data["mtime"] = 0
-        file_registry.data["hash"] = 0
+        file_registry.data["hash"] = "0"
         assert (
             file_registry != self._get_test_registry()
         ), "local test registry wasn't modify"
@@ -150,7 +150,7 @@ def test_update_specific_source(self):
         file_registry = self._get_test_registry()
         # Replace registry parameters
         file_registry.data["mtime"] = 0
-        file_registry.data["hash"] = 0
+        file_registry.data["hash"] = "0"
         file_registry.update([file_registry.data.index[0]])
         assert (
             file_registry.data.iloc[0]["mtime"] != 0

From 63d79d7db81620716770f4f7ad3113fdab5a4747 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:33:02 -0500
Subject: [PATCH 08/13] define registry dtypes

---
 ocean_data_parser/batch/registry.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ocean_data_parser/batch/registry.py b/ocean_data_parser/batch/registry.py
index 27cfb8c7..6a768ccd 100644
--- a/ocean_data_parser/batch/registry.py
+++ b/ocean_data_parser/batch/registry.py
@@ -16,6 +16,15 @@
 ).set_index("source")
 
 
+REGSITRY_DTYPE = {
+    "source": str,
+    "mtime": float,
+    "hash": str,
+    "error_message": str,
+    "output_path": str,
+}
+
+
 class FileConversionRegistry:
     def __init__(
         self,
@@ -42,9 +51,9 @@ def load(self, overwrite=False):
         elif self.path is None or not self.path.exists():
             self.data = pd.DataFrame()
         elif self.path.suffix == ".csv":
-            self.data = pd.read_csv(self.path)
+            self.data = pd.read_csv(self.path, dtype=REGSITRY_DTYPE)
         elif self.path.suffix == ".parquet":
-            self.data = pd.read_parquet(self.path)
+            self.data = pd.read_parquet(self.path).astype(REGSITRY_DTYPE)
         else:
             raise TypeError("Unknown registry type")
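Patch 08's dtype mapping guards against lossy CSV round trips, which is also why patch 07 switches the test placeholders from the integer 0 to the string "0": a hash column read back without an explicit dtype gets coerced to integers and drops leading zeros. A minimal sketch of the failure mode, with a made-up registry row (the dict mirrors the patch's `REGSITRY_DTYPE`, spelling normalized):

    import io

    import pandas as pd

    REGISTRY_DTYPE = {"mtime": float, "hash": str}
    csv_text = "source,mtime,hash\na.cnv,1700000000.0,0042\n"

    # Without an explicit dtype, a numeric-looking hash is coerced:
    naive = pd.read_csv(io.StringIO(csv_text))
    print(naive["hash"].iloc[0])  # 42 (int64) -- leading zeros lost

    # With the dtype mapping, the hash survives as the string "0042":
    typed = pd.read_csv(io.StringIO(csv_text), dtype=REGISTRY_DTYPE)
    print(typed["hash"].iloc[0])  # '0042' (object)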
From 559a6555374ed522c556ab0cb5afa86146233504 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:41:29 -0500
Subject: [PATCH 09/13] fix registry tests hash and mtime placeholders

---
 tests/test_file_registry.py | 41 +++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/tests/test_file_registry.py b/tests/test_file_registry.py
index 14cc9e50..db1d24d2 100644
--- a/tests/test_file_registry.py
+++ b/tests/test_file_registry.py
@@ -19,7 +19,8 @@
 with open(output_path, "w") as file:
     file.write("test file from registry")
 
-
+MTIME_PLACEHOLDER = 0.0
+HASH_PLACEHOLDER = "0"
 class FileRegistryTests(unittest.TestCase):
     def _get_test_registry(self, update=True):
         registry = FileConversionRegistry(path=TEST_REGISTRY_PATH)
@@ -95,19 +96,19 @@ def test_registry(self):
 
     def test_registry_copy(self):
         deep_copied_file_registry = self._get_test_registry()
-        deep_copied_file_registry.data["hash"] = 0
+        deep_copied_file_registry.data["hash"] = HASH_PLACEHOLDER
         assert (
             deep_copied_file_registry != self._get_test_registry()
         ), "Deed copied registry after modification changed the original"
 
         copied_file_registry = deep_copied_file_registry.copy()
-        copied_file_registry.data["hash"] = 2
+        copied_file_registry.data["hash"] = "2"
         assert (
             deep_copied_file_registry != copied_file_registry
         ), "Copied registry after modification changed the original"
 
         not_copied_registry = deep_copied_file_registry
-        not_copied_registry.data["hash"] = 2
+        not_copied_registry.data["hash"] = "2"
         assert (
             deep_copied_file_registry == not_copied_registry
         ), "Registry after modification didn't changed the original"
@@ -115,53 +116,53 @@ def test_registry_copy(self):
 
     def test_load(self):
         file_registry = self._get_test_registry()
         # Replace registry parameters
-        file_registry.data["mtime"] = 0
-        file_registry.data["hash"] = 0
+        file_registry.data["mtime"] = MTIME_PLACEHOLDER
+        file_registry.data["hash"] = HASH_PLACEHOLDER
         file_registry.load()
-        assert (file_registry.data["mtime"] == 0).all(), "mtime was updated with load()"
-        assert (file_registry.data["hash"] == 0).all(), "hash was updated with load()"
+        assert (file_registry.data["mtime"] == MTIME_PLACEHOLDER).all(), "mtime was updated with load()"
+        assert (file_registry.data["hash"] == HASH_PLACEHOLDER).all(), "hash was updated with load()"
         file_registry.load(overwrite=True)
         assert (
-            file_registry.data["mtime"] != 0
+            file_registry.data["mtime"] != MTIME_PLACEHOLDER
         ).all(), "mtime wasn't updated with load(overwrite=Trues)"
         assert (
-            file_registry.data["hash"] != 0
+            file_registry.data["hash"] != HASH_PLACEHOLDER
         ).all(), "hash wasn't updated with load()"
 
     def test_update(self):
         file_registry = self._get_test_registry(update=False)
         # Replace registry parameters
-        file_registry.data["mtime"] = 0
-        file_registry.data["hash"] = "0"
+        file_registry.data["mtime"] = MTIME_PLACEHOLDER
+        file_registry.data["hash"] = HASH_PLACEHOLDER
         assert (
             file_registry != self._get_test_registry()
         ), "local test registry wasn't modify"
         file_registry.update()
         assert (
-            file_registry.data["mtime"] != 0
+            file_registry.data["mtime"] != MTIME_PLACEHOLDER
         ).all(), "mtime wasn't updated wiht update()"
         assert (
-            file_registry.data["hash"] != 0
+            file_registry.data["hash"] != MTIME_PLACEHOLDER
         ).all(), "hash wasn't updated wiht update()"
 
     def test_update_specific_source(self):
         file_registry = self._get_test_registry()
         # Replace registry parameters
-        file_registry.data["mtime"] = 0
-        file_registry.data["hash"] = "0"
+        file_registry.data["mtime"] = MTIME_PLACEHOLDER
+        file_registry.data["hash"] = HASH_PLACEHOLDER
         file_registry.update([file_registry.data.index[0]])
         assert (
-            file_registry.data.iloc[0]["mtime"] != 0
+            file_registry.data.iloc[0]["mtime"] != MTIME_PLACEHOLDER
         ), "mtime wasn't updated wiht update(source)"
         assert (
-            file_registry.data.iloc[1:]["mtime"] == 0
+            file_registry.data.iloc[1:]["mtime"] == MTIME_PLACEHOLDER
         ).all(), "mtime source!=source shouldn't be updated with update(source)"
         assert (
-            file_registry.data.iloc[0]["hash"] != 0
+            file_registry.data.iloc[0]["hash"] != HASH_PLACEHOLDER
         ), "hash wasn't updated wiht update(source)"
         assert (
-            file_registry.data.iloc[1:]["hash"] == 0
+            file_registry.data.iloc[1:]["hash"] == HASH_PLACEHOLDER
         ).all(), "hash source!=source shouldn't be updated with update(source)"
 
     def test_update_field_for_all_sources_with_missing_field(self):

From 862f959cc9078dda1cee1b98af6c1c24acc4fa4c Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 13:00:08 -0500
Subject: [PATCH 10/13] ignore self.data if empty when registry.add

---
 ocean_data_parser/batch/registry.py |  9 +++++++--
 tests/test_file_registry.py         | 10 ++++++++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ocean_data_parser/batch/registry.py b/ocean_data_parser/batch/registry.py
index 6a768ccd..e041b85f 100644
--- a/ocean_data_parser/batch/registry.py
+++ b/ocean_data_parser/batch/registry.py
@@ -136,11 +136,16 @@ def add(self, sources: list):
             new_data["mtime"] = new_data["source"].progress_apply(self._get_mtime)
             logger.info("Get new files hash")
             new_data["hash"] = new_data["source"].progress_apply(self._get_hash)
+
+        new_data = new_data.set_index(["source"])
+
         self.data = (
-            pd.concat(
+            new_data
+            if self.data.empty
+            else pd.concat(
                 [
                     self.data,
-                    new_data.set_index(["source"]),
+                    new_data,
                 ],
             )
             .groupby(level=0)
diff --git a/tests/test_file_registry.py b/tests/test_file_registry.py
index db1d24d2..84dd39e4 100644
--- a/tests/test_file_registry.py
+++ b/tests/test_file_registry.py
@@ -21,6 +21,8 @@
 
 MTIME_PLACEHOLDER = 0.0
 HASH_PLACEHOLDER = "0"
+
+
 class FileRegistryTests(unittest.TestCase):
     def _get_test_registry(self, update=True):
         registry = FileConversionRegistry(path=TEST_REGISTRY_PATH)
@@ -119,8 +121,12 @@ def test_load(self):
         file_registry.data["mtime"] = MTIME_PLACEHOLDER
         file_registry.data["hash"] = HASH_PLACEHOLDER
         file_registry.load()
-        assert (file_registry.data["mtime"] == MTIME_PLACEHOLDER).all(), "mtime was updated with load()"
-        assert (file_registry.data["hash"] == HASH_PLACEHOLDER).all(), "hash was updated with load()"
+        assert (
+            file_registry.data["mtime"] == MTIME_PLACEHOLDER
+        ).all(), "mtime was updated with load()"
+        assert (
+            file_registry.data["hash"] == HASH_PLACEHOLDER
+        ).all(), "hash was updated with load()"
         file_registry.load(overwrite=True)
         assert (
             file_registry.data["mtime"] != MTIME_PLACEHOLDER

From d453dd43a4d42a9c7412225c2a9adf052a5e30d8 Mon Sep 17 00:00:00 2001
From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com>
Date: Thu, 16 Nov 2023 15:15:10 -0500
Subject: [PATCH 11/13] fix file registry data type

---
 ocean_data_parser/batch/registry.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/ocean_data_parser/batch/registry.py b/ocean_data_parser/batch/registry.py
index e041b85f..2b0bf377 100644
--- a/ocean_data_parser/batch/registry.py
+++ b/ocean_data_parser/batch/registry.py
@@ -17,7 +17,6 @@
 
 
 REGSITRY_DTYPE = {
-    "source": str,
     "mtime": float,
     "hash": str,
     "error_message": str,
@@ -25,11 +24,22 @@
 }
 
 
+def generate_registry(sources=None, **kwargs):
+    return (
+        pd.DataFrame(
+            data={"source": [Path(source) for source in sources or []], **kwargs},
+            columns=list(REGSITRY_DTYPE.keys()) + ["source"],
+        )
+        .astype(REGSITRY_DTYPE)
+        .set_index("source")
+    )
+
+
 class FileConversionRegistry:
     def __init__(
         self,
         path: str = None,
-        data: pd.DataFrame = EMPTY_FILE_REGISTRY,
+        data: pd.DataFrame = generate_registry(),
         hashtype: str = "sha256",
         block_size: int = 65536,
     ):
@@ -128,16 +138,15 @@ def add(self, sources: list):
         sources = [source for source in sources if source not in self.data.index]
         if not sources:
             return
-        new_data = pd.DataFrame({"source": sources})
+        new_data = generate_registry(sources)
 
         # Retrieve mtime and hash only if a registry is actually saved
         if self.path:
             logger.info("Get new files mtime")
-            new_data["mtime"] = new_data["source"].progress_apply(self._get_mtime)
-            logger.info("Get new files hash")
-            new_data["hash"] = new_data["source"].progress_apply(self._get_hash)
-
-        new_data = new_data.set_index(["source"])
+            new_data = new_data.assign(
+                mtime=new_data.index.to_series().progress_map(self._get_mtime),
+                hash=new_data.index.to_series().progress_map(self._get_hash),
+            )
 
         self.data = (
             new_data
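Patches 11 and 12 converge on a registry whose index holds `pathlib.Path` objects over explicitly typed columns. A minimal sketch of that shape, assuming the same column set as the patches' dtype mapping; the helper below mirrors the idea, not the repository's exact `generate_registry`, and the file names are made up:

    from pathlib import Path

    import pandas as pd

    REGISTRY_DTYPE = {
        "mtime": float,
        "hash": str,
        "error_message": str,
        "output_path": str,
    }


    def generate_registry(sources=None):
        # Empty, typed frame indexed by source path.
        return pd.DataFrame(
            data={"source": [Path(s) for s in sources or []]},
            columns=list(REGISTRY_DTYPE.keys()) + ["source"],
        ).set_index("source")


    registry = generate_registry(["raw/a.cnv", "raw/b.cnv"])
    print(registry.index[0])       # PosixPath('raw/a.cnv') on POSIX systems
    print(list(registry.columns))  # ['mtime', 'hash', 'error_message', 'output_path']

A `Path` index normalizes separators and redundant segments, so lookups stay consistent however callers spell the path, at the cost of converting back to `str` before serializing to CSV or parquet.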
"source" in self.data: - self.data = self.data.set_index(["source"]) + self.data.index = self.data.index.map(Path) + self.data["output_path"] = self.data["output_path"].apply(_as_path) return self def save(self): @@ -144,8 +143,8 @@ def add(self, sources: list): if self.path: logger.info("Get new files mtime") new_data = new_data.assign( - mtime=new_data.index.to_series().progress_map(self._get_mtime), - hash=new_data.index.to_series().progress_map(self._get_hash), + mtime=new_data.index.map(self._get_mtime), + hash=new_data.index.map(self._get_hash), ) self.data = ( From 652b395eba477f1c3f04e86a36a585895db8501e Mon Sep 17 00:00:00 2001 From: Jessy Barrette <30420025+JessyBarrette@users.noreply.github.com> Date: Mon, 20 Nov 2023 11:47:34 -0500 Subject: [PATCH 13/13] fix test path handling --- tests/test_batch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_batch.py b/tests/test_batch.py index fe96b1e5..ab3e13da 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -198,16 +198,15 @@ def test_batch_failed_cli_conversion_with_argument_inputs(self): ), f"Unexpected output {result.output=}" def test_failed_cli_batch_conversion_with_ignore_errors(self, tmp_path): - test_file_path = str(tmp_path / "failed_cli_test_file.cnv") + test_file_path = tmp_path / "failed_cli_test_file.cnv" config = _get_config( cwd=tmp_path, - input_path=test_file_path, + input_path=str(test_file_path), parser="seabird.cnv", overwrite=True, multiprocessing=1, errors="ignore", ) - config["registry"]["path"] = str(tmp_path / "registry.csv") config_path = _save_config(tmp_path, config) assert config_path.exists()