From 1cf8d5872741dda163857423685288bc896d09ae Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 10:42:45 +0100 Subject: [PATCH 01/12] Made all models, datasets, tasks and flows importable through top-level omnipy. Moved runtime to omnipy.hub Testing out dynamic module import for omnipy --- src/omnipy/__init__.py | 151 ++++++++++++++++++++++--- src/omnipy/compute/mixins/serialize.py | 2 +- src/omnipy/data/dataset.py | 2 +- src/omnipy/hub/runtime.py | 16 +++ src/omnipy/util/helpers.py | 24 +++- src/omnipy/util/mako_helpers.py | 25 +--- 6 files changed, 179 insertions(+), 41 deletions(-) diff --git a/src/omnipy/__init__.py b/src/omnipy/__init__.py index d4f198cd..3b37e278 100644 --- a/src/omnipy/__init__.py +++ b/src/omnipy/__init__.py @@ -1,27 +1,148 @@ __version__ = '0.12.3' +import importlib import os import sys from typing import Optional from omnipy.data.dataset import Dataset from omnipy.data.model import Model -from omnipy.hub.runtime import Runtime - -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) - -# TODO: The check disabling runtime for tests also trigger for tests that are run outside of Omnipy, -# breaking tests on the user side. -# Find a better way to disable the global runtime object for Omnipy tests +from omnipy.hub.runtime import runtime +# from omnipy.util.helpers import recursive_module_import +from omnipy.modules.general.tasks import import_directory, split_dataset +from omnipy.modules.json.datasets import (JsonDataset, + JsonDictDataset, + JsonDictOfDictsDataset, + JsonDictOfDictsOfScalarsDataset, + JsonDictOfListsDataset, + JsonDictOfListsOfDictsDataset, + JsonDictOfListsOfScalarsDataset, + JsonDictOfNestedListsDataset, + JsonDictOfScalarsDataset, + JsonListDataset, + JsonListOfDictsDataset, + JsonListOfDictsOfScalarsDataset, + JsonListOfListsDataset, + JsonListOfListsOfScalarsDataset, + JsonListOfNestedDictsDataset, + JsonListOfScalarsDataset, + JsonNestedDictsDataset, + JsonNestedListsDataset, + JsonOnlyDictsDataset, + JsonOnlyListsDataset, + JsonScalarDataset) +from omnipy.modules.json.flows import flatten_nested_json +from omnipy.modules.json.models import (JsonDictModel, + JsonDictOfDictsModel, + JsonDictOfDictsOfScalarsModel, + JsonDictOfListsModel, + JsonDictOfListsOfDictsModel, + JsonDictOfListsOfScalarsModel, + JsonDictOfNestedListsModel, + JsonDictOfScalarsModel, + JsonListModel, + JsonListOfDictsModel, + JsonListOfDictsOfScalarsModel, + JsonListOfListsModel, + JsonListOfListsOfScalarsModel, + JsonListOfNestedDictsModel, + JsonListOfScalarsModel, + JsonModel, + JsonNestedDictsModel, + JsonNestedListsModel, + JsonOnlyDictsModel, + JsonOnlyListsModel, + JsonScalarModel) +from omnipy.modules.json.tasks import (transpose_dict_of_dicts_2_list_of_dicts, + transpose_dicts_2_lists, + transpose_dicts_of_lists_of_dicts_2_lists_of_dicts) +from omnipy.modules.pandas.models import (ListOfPandasDatasetsWithSameNumberOfFiles, + PandasDataset, + PandasModel) +from omnipy.modules.pandas.tasks import (concat_dataframes_across_datasets, + convert_dataset_csv_to_pandas, + convert_dataset_list_of_dicts_to_pandas, + convert_dataset_pandas_to_csv, + extract_columns_as_files) +from omnipy.modules.raw.models import JoinLinesModel, SplitAndStripLinesModel, SplitLinesModel +from omnipy.modules.raw.tasks import modify_all_lines, modify_datafile_contents, modify_each_line +from omnipy.modules.tables.tasks import remove_columns +# from omnipy.util.helpers import recursive_module_import -def _get_runtime() -> Optional['Runtime']: - if 'pytest' not in sys.modules: - return Runtime() - else: - return None - +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -runtime: Optional['Runtime'] = _get_runtime() +__all__ = [ + 'runtime', + 'Dataset', + 'Model', + 'JsonDataset', + 'JsonDictDataset', + 'JsonDictOfDictsDataset', + 'JsonDictOfDictsOfScalarsDataset', + 'JsonDictOfListsDataset', + 'JsonDictOfListsOfDictsDataset', + 'JsonDictOfListsOfScalarsDataset', + 'JsonDictOfNestedListsDataset', + 'JsonDictOfScalarsDataset', + 'JsonListDataset', + 'JsonListOfDictsDataset', + 'JsonListOfDictsOfScalarsDataset', + 'JsonListOfListsDataset', + 'JsonListOfListsOfScalarsDataset', + 'JsonListOfNestedDictsDataset', + 'JsonListOfScalarsDataset', + 'JsonNestedDictsDataset', + 'JsonNestedListsDataset', + 'JsonOnlyDictsDataset', + 'JsonOnlyListsDataset', + 'JsonScalarDataset', + 'JsonDictModel', + 'JsonDictOfDictsModel', + 'JsonDictOfDictsOfScalarsModel', + 'JsonDictOfListsModel', + 'JsonDictOfListsOfDictsModel', + 'JsonDictOfListsOfScalarsModel', + 'JsonDictOfNestedListsModel', + 'JsonDictOfScalarsModel', + 'JsonListModel', + 'JsonListOfDictsModel', + 'JsonListOfDictsOfScalarsModel', + 'JsonListOfListsModel', + 'JsonListOfListsOfScalarsModel', + 'JsonListOfNestedDictsModel', + 'JsonListOfScalarsModel', + 'JsonModel', + 'JsonNestedDictsModel', + 'JsonNestedListsModel', + 'JsonOnlyDictsModel', + 'JsonOnlyListsModel', + 'JsonScalarModel', + 'ListOfPandasDatasetsWithSameNumberOfFiles', + 'PandasModel', + 'PandasDataset', + 'SplitLinesModel', + 'SplitAndStripLinesModel', + 'JoinLinesModel', + 'import_directory', + 'split_dataset', + 'flatten_nested_json', + 'transpose_dicts_2_lists', + 'transpose_dict_of_dicts_2_list_of_dicts', + 'transpose_dicts_of_lists_of_dicts_2_lists_of_dicts', + 'concat_dataframes_across_datasets', + 'convert_dataset_csv_to_pandas', + 'convert_dataset_pandas_to_csv', + 'convert_dataset_list_of_dicts_to_pandas', + 'modify_all_lines', + 'modify_datafile_contents', + 'modify_each_line', + 'remove_columns', +] -__all__ = [Model, Dataset] +# +# def __getattr__(attr_name: str) -> object: +# omnipy = importlib.import_module(__name__) +# all_modules = [] +# recursive_module_import(omnipy, all_modules) +# print(all_modules) diff --git a/src/omnipy/compute/mixins/serialize.py b/src/omnipy/compute/mixins/serialize.py index 6aaf2631..417a9d1d 100644 --- a/src/omnipy/compute/mixins/serialize.py +++ b/src/omnipy/compute/mixins/serialize.py @@ -24,7 +24,7 @@ def _setup_serializer_registry() -> IsSerializerRegistry: - from omnipy import runtime + from omnipy.hub.runtime import runtime if runtime is not None: return runtime.objects.serializers else: diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index f317b890..7eb7aeef 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -362,7 +362,7 @@ def load(self, directory: str): @staticmethod def _get_serializer_registry(): - from omnipy import runtime + from omnipy.hub.runtime import runtime if len(runtime.objects.serializers.serializers) == 0: from omnipy.modules import register_serializers register_serializers(runtime.objects.serializers) diff --git a/src/omnipy/hub/runtime.py b/src/omnipy/hub/runtime.py index a2571807..99c194b2 100644 --- a/src/omnipy/hub/runtime.py +++ b/src/omnipy/hub/runtime.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +import sys from typing import Any from omnipy.api.enums import EngineChoice @@ -112,3 +113,18 @@ def _update_prefect_engine_config(self, prefect_engine: IsEngine): def _update_job_creator_engine(self, _item_changed: Any): self.objects.job_creator.set_engine(self._get_engine(self.config.engine)) + + +# TODO: The check disabling runtime for tests also trigger for tests that are run outside of Omnipy, +# breaking tests on the user side. +# Find a better way to disable the global runtime object for Omnipy tests + + +def _get_runtime() -> 'Runtime | None': + if 'pytest' not in sys.modules: + return Runtime() + else: + return None + + +runtime: 'Runtime | None' = _get_runtime() diff --git a/src/omnipy/util/helpers.py b/src/omnipy/util/helpers.py index d780b7d6..020c254a 100644 --- a/src/omnipy/util/helpers.py +++ b/src/omnipy/util/helpers.py @@ -1,8 +1,9 @@ from collections.abc import Hashable, Iterable from copy import copy, deepcopy import inspect +from inspect import getmodule, isclass import locale as pkg_locale -from types import GenericAlias, UnionType +from types import GenericAlias, ModuleType, UnionType from typing import (Annotated, Any, cast, @@ -192,3 +193,24 @@ def last_snapshot_taken_of_same_obj(self, obj: object) -> bool: def differs_from_last_snapshot(self, obj: object) -> bool: self._assert_not_empty() return not all_equals(self._last_snapshot.obj_copy, obj) + + +def _is_internal_module(module: ModuleType, imported_modules: list[ModuleType]): + return module not in imported_modules and module.__name__.startswith('omnipy') + + +def recursive_module_import(module: ModuleType, imported_modules: list[ModuleType] = []): + module_vars = vars(module) + imported_modules.append(module) + + for val in module_vars.values(): + if isclass(val): + for base_cls in val.__bases__: + base_cls_module = getmodule(base_cls) + if base_cls_module and _is_internal_module(base_cls_module, imported_modules): + module_vars = create_merged_dict( + recursive_module_import(base_cls_module, imported_modules), + module_vars, + ) + + return module_vars diff --git a/src/omnipy/util/mako_helpers.py b/src/omnipy/util/mako_helpers.py index 0729b5df..9a19403d 100644 --- a/src/omnipy/util/mako_helpers.py +++ b/src/omnipy/util/mako_helpers.py @@ -1,5 +1,5 @@ import ast -from inspect import formatannotation, getmodule, isclass, isgeneratorfunction, Signature +from inspect import formatannotation, isgeneratorfunction, Signature import os from types import ModuleType from typing import Any, get_type_hints @@ -7,7 +7,7 @@ from docstring_parser import DocstringParam, DocstringReturns from pdocs.doc import Doc, External, Function, Module -from omnipy.util.helpers import create_merged_dict +from omnipy.util.helpers import recursive_module_import IGNORED = None IGNORE_PARAMS = ['cls', 'self'] @@ -109,27 +109,6 @@ def get_type_name_from_annotation(module: ModuleType, annotation, empty_obj): return type_name -def _is_internal_module(module: ModuleType, imported_modules: list[ModuleType]): - return module not in imported_modules and module.__name__.startswith('omnipy') - - -def recursive_module_import(module: ModuleType, imported_modules: list[ModuleType] = []): - module_vars = vars(module) - imported_modules.append(module) - - for val in module_vars.values(): - if isclass(val): - for base_cls in val.__bases__: - base_cls_module = getmodule(base_cls) - if base_cls_module and _is_internal_module(base_cls_module, imported_modules): - module_vars = create_merged_dict( - recursive_module_import(base_cls_module, imported_modules), - module_vars, - ) - - return module_vars - - def convert_to_qual_name_type_hint_str(module: ModuleType, type_hint: Any) -> str: def fixed_get_type_hints(obj: Any) -> str: """ From fba77f49140b9730a2230255aaed7ca75382cb14 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Mon, 18 Dec 2023 09:55:56 +0100 Subject: [PATCH 02/12] New helper funcs get_calling_module_name() and ensure_non_str_byte_iterable() --- src/omnipy/util/helpers.py | 21 +++++++++++++++++---- tests/util/__init__.py | 4 ++-- tests/util/helpers/__init__.py | 6 ++++++ tests/util/test_helpers.py | 30 +++++++++++++++++++++++++++++- 4 files changed, 54 insertions(+), 7 deletions(-) diff --git a/src/omnipy/util/helpers.py b/src/omnipy/util/helpers.py index 020c254a..1b285915 100644 --- a/src/omnipy/util/helpers.py +++ b/src/omnipy/util/helpers.py @@ -89,6 +89,14 @@ def transfer_generic_args_to_cls(to_cls, from_generic_type): return to_cls +def ensure_plain_type(in_type: type | GenericAlias) -> type | GenericAlias | None | Any: + return get_origin(in_type) if get_args(in_type) else in_type + + +def ensure_non_str_byte_iterable(value): + return value if is_iterable(value) and not type(value) in (str, bytes) else (value,) + + def is_iterable(obj: object) -> bool: try: iter(obj) @@ -132,10 +140,6 @@ class IsDataclass(Protocol): __dataclass_fields__: ClassVar[dict] -def ensure_plain_type(in_type: type | GenericAlias) -> type | GenericAlias | None | Any: - return get_origin(in_type) if get_args(in_type) else in_type - - def remove_annotated_plus_optional_if_present( type_or_class: Type | UnionType | object) -> Type | UnionType | object: if get_origin(type_or_class) == Annotated: @@ -214,3 +218,12 @@ def recursive_module_import(module: ModuleType, imported_modules: list[ModuleTyp ) return module_vars + + +def get_calling_module_name() -> str | None: + stack = inspect.stack() + if len(stack) >= 3: + grandparent_frame = inspect.stack()[2][0] + module = inspect.getmodule(grandparent_frame) + if module is not None: + return module.__name__ diff --git a/tests/util/__init__.py b/tests/util/__init__.py index e8da7faa..b4bb73e2 100644 --- a/tests/util/__init__.py +++ b/tests/util/__init__.py @@ -2,7 +2,7 @@ from omnipy.util.param_key_mapper import ParamKeyMapper -# For test_mako_helpers.test_internally_inherited +# For test_mako_helpers::test_internally_inherited T = TypeVar('T') @@ -61,4 +61,4 @@ def inherited_parent_staticmethod() -> bool: return True def inherited_parent_method(self) -> bool: - return True + return True \ No newline at end of file diff --git a/tests/util/helpers/__init__.py b/tests/util/helpers/__init__.py index e69de29b..225b93d9 100644 --- a/tests/util/helpers/__init__.py +++ b/tests/util/helpers/__init__.py @@ -0,0 +1,6 @@ +from omnipy.util.helpers import get_calling_module_name + + +# For test_helpers::test_get_calling_module_name +def other_module_call_get_calling_module_name() -> str: + return get_calling_module_name() \ No newline at end of file diff --git a/tests/util/test_helpers.py b/tests/util/test_helpers.py index 366d2812..09391c06 100644 --- a/tests/util/test_helpers.py +++ b/tests/util/test_helpers.py @@ -5,7 +5,9 @@ import pytest from typing_inspect import get_generic_type -from omnipy.util.helpers import (ensure_plain_type, +from omnipy.util.helpers import (ensure_non_str_byte_iterable, + ensure_plain_type, + get_calling_module_name, is_iterable, is_optional, is_strict_subclass, @@ -14,6 +16,8 @@ RestorableContents, transfer_generic_args_to_cls) +from .helpers import other_module_call_get_calling_module_name + T = TypeVar('T') U = TypeVar('U') @@ -55,6 +59,22 @@ def test_ensure_plain_type() -> None: assert ensure_plain_type(list[str]) == list +def test_ensure_non_str_byte_iterable() -> None: + assert ensure_non_str_byte_iterable((1, 2, 3)) == (1, 2, 3) + assert ensure_non_str_byte_iterable([1, 2, 3]) == [1, 2, 3] + assert ensure_non_str_byte_iterable({'a': 1, 'b': 2}) == {'a': 1, 'b': 2} + assert ensure_non_str_byte_iterable({'a', 'b'}) == {'a', 'b'} + + assert ensure_non_str_byte_iterable(123) == (123,) + assert ensure_non_str_byte_iterable('abc') == ('abc',) + assert ensure_non_str_byte_iterable(b'abc') == (b'abc',) + assert ensure_non_str_byte_iterable(None) == (None,) + assert ensure_non_str_byte_iterable(True) == (True,) + + x = object() + assert ensure_non_str_byte_iterable(x) == (x,) + + def test_is_iterable() -> None: assert is_iterable(None) is False assert is_iterable(False) is False @@ -268,3 +288,11 @@ def test_restorable_contents(): assert my_dict == {1: 2, 3: 4} assert contents.last_snapshot_taken_of_same_obj(my_dict) is True assert contents.differs_from_last_snapshot(my_dict) is True + + +def test_get_calling_module_name() -> None: + def local_call_get_calling_module_name() -> str: + return get_calling_module_name() + + assert local_call_get_calling_module_name() == 'tests.util.test_helpers' + assert other_module_call_get_calling_module_name() == 'tests.util.test_helpers' From 54eb2411bd1f2e5b448cab50ef1f837131debad8 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Mon, 18 Dec 2023 09:57:45 +0100 Subject: [PATCH 03/12] JsonNoDicts -> JsonOnlyLists, JsonNoLists -> JsonOnlyDicts --- src/omnipy/modules/json/datasets.py | 8 +- src/omnipy/modules/json/models.py | 6 +- src/omnipy/modules/json/typedefs.py | 8 +- tests/modules/json/cases/json_data.py | 20 ++--- tests/modules/json/test_json_types.yml | 104 ++++++++++++------------- 5 files changed, 73 insertions(+), 73 deletions(-) diff --git a/src/omnipy/modules/json/datasets.py b/src/omnipy/modules/json/datasets.py index 9f5d7856..8fea2fd3 100644 --- a/src/omnipy/modules/json/datasets.py +++ b/src/omnipy/modules/json/datasets.py @@ -21,8 +21,8 @@ JsonModel, JsonNestedDictsModel, JsonNestedListsModel, - JsonNoDictsModel, - JsonNoListsModel, + JsonOnlyDictsModel, + JsonOnlyListsModel, JsonScalarModel) # TODO: switch from plural to singular for names of modules in omnipy modules @@ -110,7 +110,7 @@ class JsonDictOfDictsOfScalarsDataset(JsonBaseDataset[JsonDictOfDictsOfScalarsMo # Nested datasets -class JsonNoDictsDataset(JsonBaseDataset[JsonNoDictsModel]): +class JsonOnlyListsDataset(JsonBaseDataset[JsonOnlyListsModel]): ... @@ -118,7 +118,7 @@ class JsonNestedListsDataset(JsonBaseDataset[JsonNestedListsModel]): ... -class JsonNoListsDataset(JsonBaseDataset[JsonNoListsModel]): +class JsonOnlyDictsDataset(JsonBaseDataset[JsonOnlyDictsModel]): ... diff --git a/src/omnipy/modules/json/models.py b/src/omnipy/modules/json/models.py index d0540975..899c3eb6 100644 --- a/src/omnipy/modules/json/models.py +++ b/src/omnipy/modules/json/models.py @@ -257,7 +257,7 @@ class JsonDictOfDictsOfScalarsModel(Model[JsonDictM[_JsonDictOfScalarsM]]): # Nested models -class JsonNoDictsModel(Model[_JsonOnlyListsUnion]): +class JsonOnlyListsModel(Model[_JsonOnlyListsUnion]): ... @@ -265,7 +265,7 @@ class JsonNestedListsModel(Model[JsonOnlyListsM]): ... -class JsonNoListsModel(Model[_JsonOnlyDictsUnion]): +class JsonOnlyDictsModel(Model[_JsonOnlyDictsUnion]): ... @@ -328,7 +328,7 @@ class JsonCustomDictModel(Model[JsonDictM[_JsonBaseT]], Generic[_JsonBaseT]): JsonDictOfListsOfScalarsModel, JsonDictOfDictsModel, JsonDictOfDictsOfScalarsModel, - JsonNoListsModel, + JsonOnlyDictsModel, JsonNestedListsModel, JsonListOfNestedDictsModel, JsonDictOfNestedListsModel, diff --git a/src/omnipy/modules/json/typedefs.py b/src/omnipy/modules/json/typedefs.py index 84d5022d..b05b7ac4 100644 --- a/src/omnipy/modules/json/typedefs.py +++ b/src/omnipy/modules/json/typedefs.py @@ -23,11 +23,11 @@ # Exclusion variants -JsonNoDicts: TypeAlias = 'JsonScalar | JsonNestedLists' -JsonNestedLists: TypeAlias = list[JsonNoDicts] +JsonOnlyLists: TypeAlias = 'JsonScalar | JsonNestedLists' +JsonNestedLists: TypeAlias = list[JsonOnlyLists] -JsonNoLists: TypeAlias = 'JsonScalar | JsonNestedDicts' -JsonNestedDicts: TypeAlias = dict[str, JsonNoLists] +JsonOnlyDicts: TypeAlias = 'JsonScalar | JsonNestedDicts' +JsonNestedDicts: TypeAlias = dict[str, JsonOnlyDicts] # More specific types diff --git a/tests/modules/json/cases/json_data.py b/tests/modules/json/cases/json_data.py index 2a00b6fc..f7175504 100644 --- a/tests/modules/json/cases/json_data.py +++ b/tests/modules/json/cases/json_data.py @@ -20,8 +20,8 @@ JsonListOfScalarsDataset, JsonNestedDictsDataset, JsonNestedListsDataset, - JsonNoDictsDataset, - JsonNoListsDataset, + JsonOnlyDictsDataset, + JsonOnlyListsDataset, JsonScalarDataset) from omnipy.modules.json.models import (JsonDictModel, JsonDictOfDictsModel, @@ -41,8 +41,8 @@ JsonModel, JsonNestedDictsModel, JsonNestedListsModel, - JsonNoDictsModel, - JsonNoListsModel, + JsonOnlyDictsModel, + JsonOnlyListsModel, JsonScalarModel) from omnipy.modules.json.typedefs import JsonScalar as JS @@ -549,7 +549,7 @@ def case_json_nested_lists() -> CaseInfo: class JsonNestedListsDataPoints: # - # JsonNoDictsModel + # JsonOnlyListsModel # v_no_dicts_none: None = b_none @@ -615,11 +615,11 @@ class JsonNestedListsDataPoints: return CaseInfo( name='test_json_nested_lists', prefix2model_classes={ - 'v_no_dicts': (JsonNoDictsModel,), + 'v_no_dicts': (JsonOnlyListsModel,), 'v_nested_lists': (JsonNestedListsModel,), }, prefix2dataset_classes={ - 'v_no_dicts': (JsonNoDictsDataset,), + 'v_no_dicts': (JsonOnlyListsDataset,), 'v_nested_lists': (JsonNestedListsDataset,), }, data_points=JsonNestedListsDataPoints(), @@ -631,7 +631,7 @@ def case_json_nested_dicts() -> CaseInfo: @dataclass class JsonNestedDictsDataPoints: # - # JsonNoListsModel + # JsonOnlyDictsModel # v_no_lists_none: None = b_none @@ -718,11 +718,11 @@ class JsonNestedDictsDataPoints: return CaseInfo( name='test_json_nested_dicts', prefix2model_classes={ - 'v_no_lists': (JsonNoListsModel,), + 'v_no_lists': (JsonOnlyDictsModel,), 'v_nested_dicts': (JsonNestedDictsModel,), }, prefix2dataset_classes={ - 'v_no_lists': (JsonNoListsDataset,), + 'v_no_lists': (JsonOnlyDictsDataset,), 'v_nested_dicts': (JsonNestedDictsDataset,), }, data_points=JsonNestedDictsDataPoints(), diff --git a/tests/modules/json/test_json_types.yml b/tests/modules/json/test_json_types.yml index 9bc7d796..951c40e5 100644 --- a/tests/modules/json/test_json_types.yml +++ b/tests/modules/json/test_json_types.yml @@ -248,26 +248,26 @@ - case: test_json_nested_lists main: | - from omnipy.modules.json.typedefs import JsonNoDicts, JsonNestedLists + from omnipy.modules.json.typedefs import JsonOnlyLists, JsonNestedLists from tests.modules.json.cases.raw.examples import b_none, b_int, b_float, b_str, b_bool, b_list, b_dict, b_tuple, b_set - v_no_dicts_none: JsonNoDicts = b_none - v_no_dicts_int: JsonNoDicts = b_int - v_no_dicts_float: JsonNoDicts = b_float - v_no_dicts_str: JsonNoDicts = b_str - v_no_dicts_bool: JsonNoDicts = b_bool - v_no_dicts_list: JsonNoDicts = list(b_list) - err_v_no_dicts_dict: JsonNoDicts = dict(b_dict) # E: Incompatible types in assignment (expression has type "dict[str, str | float | int | None]", variable has type "JsonNoDicts") [assignment] - err_v_no_dicts_tuple: JsonNoDicts = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonNoDicts") [assignment] - err_v_no_dicts_set: JsonNoDicts = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonNoDicts") [assignment] - - v_no_dicts_list_of_none: JsonNoDicts = [b_none] - err_v_no_dicts_dict_of_none: JsonNoDicts = {'a': b_none} # E: Incompatible types in assignment (expression has type "dict[str, None]", variable has type "JsonNoDicts") [assignment] - v_no_dicts_two_levels: JsonNoDicts = list(b_list + [list(b_list)]) - v_no_dicts_three_levels: JsonNoDicts = list(b_list + [list(b_list), [list(b_list)]]) - err_v_no_dicts_with_dict_of_none_level_two: JsonNoDicts = [{'a': b_none}] # E: List item 0 has incompatible type "dict[str, None]"; expected "int | float | str | JsonNestedLists | None" [list-item] - err_v_no_dicts_with_dict_level_two: JsonNoDicts = list(b_list + [dict(b_dict)]) # E: List item 0 has incompatible type "dict[str, str | float | int | None]"; expected "str | float | int | None" [list-item] - err_v_no_dicts_with_dict_level_three: JsonNoDicts = list(b_list + [list(b_list + [dict(b_dict)])]) # E: List item 0 has incompatible type "list[dict[str, str | float | int | None] | str | float | int | None]"; expected "str | float | int | None" [list-item] + v_no_dicts_none: JsonOnlyLists = b_none + v_no_dicts_int: JsonOnlyLists = b_int + v_no_dicts_float: JsonOnlyLists = b_float + v_no_dicts_str: JsonOnlyLists = b_str + v_no_dicts_bool: JsonOnlyLists = b_bool + v_no_dicts_list: JsonOnlyLists = list(b_list) + err_v_no_dicts_dict: JsonOnlyLists = dict(b_dict) # E: Incompatible types in assignment (expression has type "dict[str, str | float | int | None]", variable has type "JsonOnlyLists") [assignment] + err_v_no_dicts_tuple: JsonOnlyLists = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonOnlyLists") [assignment] + err_v_no_dicts_set: JsonOnlyLists = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonOnlyLists") [assignment] + + v_no_dicts_list_of_none: JsonOnlyLists = [b_none] + err_v_no_dicts_dict_of_none: JsonOnlyLists = {'a': b_none} # E: Incompatible types in assignment (expression has type "dict[str, None]", variable has type "JsonOnlyLists") [assignment] + v_no_dicts_two_levels: JsonOnlyLists = list(b_list + [list(b_list)]) + v_no_dicts_three_levels: JsonOnlyLists = list(b_list + [list(b_list), [list(b_list)]]) + err_v_no_dicts_with_dict_of_none_level_two: JsonOnlyLists = [{'a': b_none}] # E: List item 0 has incompatible type "dict[str, None]"; expected "int | float | str | JsonNestedLists | None" [list-item] + err_v_no_dicts_with_dict_level_two: JsonOnlyLists = list(b_list + [dict(b_dict)]) # E: List item 0 has incompatible type "dict[str, str | float | int | None]"; expected "str | float | int | None" [list-item] + err_v_no_dicts_with_dict_level_three: JsonOnlyLists = list(b_list + [list(b_list + [dict(b_dict)])]) # E: List item 0 has incompatible type "list[dict[str, str | float | int | None] | str | float | int | None]"; expected "str | float | int | None" [list-item] err_v_nested_lists_none: JsonNestedLists = b_none # E: Incompatible types in assignment (expression has type "None", variable has type "JsonNestedLists") [assignment] err_v_nested_lists_int: JsonNestedLists = b_int # E: Incompatible types in assignment (expression has type "int", variable has type "JsonNestedLists") [assignment] @@ -289,31 +289,31 @@ - case: test_json_nested_dicts main: | - from omnipy.modules.json.typedefs import JsonNoLists, JsonNestedDicts + from omnipy.modules.json.typedefs import JsonOnlyDicts, JsonNestedDicts from tests.modules.json.cases.raw.examples import (b_none, b_int, b_float, b_str, b_bool, b_list, b_dict, b_tuple, b_set, e_int_key_dict, e_float_key_dict, e_bool_key_dict, e_none_key_dict) - v_no_lists_none: JsonNoLists = b_none - v_no_lists_int: JsonNoLists = b_int - v_no_lists_float: JsonNoLists = b_float - v_no_lists_str: JsonNoLists = b_str - v_no_lists_bool: JsonNoLists = b_bool - err_v_no_lists_list: JsonNoLists = list(b_list) # E: Incompatible types in assignment (expression has type "list[str | float | int | None]", variable has type "JsonNoLists") [assignment] - v_no_lists_dict: JsonNoLists = dict(b_dict) - err_v_no_lists_int_key_dict: JsonNoLists = dict(e_int_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[int, None]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] - err_v_no_lists_float_key_dict: JsonNoLists = dict(e_float_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[float, int]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] - err_v_no_lists_bool_key_dict: JsonNoLists = dict(e_bool_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[bool, str]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] - err_v_no_lists_none_key_dict: JsonNoLists = dict(e_none_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[None, bool]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] - err_v_no_lists_tuple: JsonNoLists = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonNoLists") [assignment] - err_v_no_lists_set: JsonNoLists = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonNoLists") [assignment] - - err_v_no_lists_list_of_none: JsonNoLists = [b_none] # E: Incompatible types in assignment (expression has type "list[None]", variable has type "JsonNoLists") [assignment] - v_no_lists_dict_of_none: JsonNoLists = {'a': b_none} - v_no_lists_two_levels: JsonNoLists = {'a': dict(b_dict), 'b': dict(b_dict)} - v_no_lists_three_levels: JsonNoLists = {'a': dict(b_dict), 'b': {'x': dict(b_dict)}} - err_v_no_lists_with_list_of_none_level_two: JsonNoLists = {'a': [b_none]} # E: Dict entry 0 has incompatible type "str": "list[None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] - err_v_no_lists_with_list_level_two: JsonNoLists = {'a': dict(b_dict), 'b': list(b_list)} # E: Dict entry 1 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] - err_v_no_lists_with_list_level_three: JsonNoLists = {'a': dict(b_dict), 'b': {'x': list(b_list)}} # E: Dict entry 0 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] + v_no_lists_none: JsonOnlyDicts = b_none + v_no_lists_int: JsonOnlyDicts = b_int + v_no_lists_float: JsonOnlyDicts = b_float + v_no_lists_str: JsonOnlyDicts = b_str + v_no_lists_bool: JsonOnlyDicts = b_bool + err_v_no_lists_list: JsonOnlyDicts = list(b_list) # E: Incompatible types in assignment (expression has type "list[str | float | int | None]", variable has type "JsonOnlyDicts") [assignment] + v_no_lists_dict: JsonOnlyDicts = dict(b_dict) + err_v_no_lists_int_key_dict: JsonOnlyDicts = dict(e_int_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[int, None]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] + err_v_no_lists_float_key_dict: JsonOnlyDicts = dict(e_float_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[float, int]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] + err_v_no_lists_bool_key_dict: JsonOnlyDicts = dict(e_bool_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[bool, str]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] + err_v_no_lists_none_key_dict: JsonOnlyDicts = dict(e_none_key_dict) # E: Argument 1 to "dict" has incompatible type "dict[None, bool]"; expected "SupportsKeysAndGetItem[str, int | float | str | JsonNestedDicts | None]" [arg-type] + err_v_no_lists_tuple: JsonOnlyDicts = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonOnlyDicts") [assignment] + err_v_no_lists_set: JsonOnlyDicts = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonOnlyDicts") [assignment] + + err_v_no_lists_list_of_none: JsonOnlyDicts = [b_none] # E: Incompatible types in assignment (expression has type "list[None]", variable has type "JsonOnlyDicts") [assignment] + v_no_lists_dict_of_none: JsonOnlyDicts = {'a': b_none} + v_no_lists_two_levels: JsonOnlyDicts = {'a': dict(b_dict), 'b': dict(b_dict)} + v_no_lists_three_levels: JsonOnlyDicts = {'a': dict(b_dict), 'b': {'x': dict(b_dict)}} + err_v_no_lists_with_list_of_none_level_two: JsonOnlyDicts = {'a': [b_none]} # E: Dict entry 0 has incompatible type "str": "list[None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] + err_v_no_lists_with_list_level_two: JsonOnlyDicts = {'a': dict(b_dict), 'b': list(b_list)} # E: Dict entry 1 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] + err_v_no_lists_with_list_level_three: JsonOnlyDicts = {'a': dict(b_dict), 'b': {'x': list(b_list)}} # E: Dict entry 0 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] err_v_nested_dicts_none: JsonNestedDicts = b_none # E: Incompatible types in assignment (expression has type "None", variable has type "JsonNestedDicts") [assignment] err_v_nested_dicts_int: JsonNestedDicts = b_int # E: Incompatible types in assignment (expression has type "int", variable has type "JsonNestedDicts") [assignment] @@ -347,20 +347,20 @@ err_m_list_of_nested_dicts_float: JsonListOfNestedDicts = b_float # E: Incompatible types in assignment (expression has type "float", variable has type "JsonListOfNestedDicts") [assignment] err_m_list_of_nested_dicts_str: JsonListOfNestedDicts = b_str # E: Incompatible types in assignment (expression has type "str", variable has type "JsonListOfNestedDicts") [assignment] err_m_list_of_nested_dicts_bool: JsonListOfNestedDicts = b_bool # E: Incompatible types in assignment (expression has type "bool", variable has type "JsonListOfNestedDicts") [assignment] - err_m_list_of_nested_dicts_list: JsonListOfNestedDicts = list(b_list) # E: Argument 1 to "list" has incompatible type "list[str | float | int | None]"; expected "Iterable[dict[str, JsonNoLists]]" [arg-type] + err_m_list_of_nested_dicts_list: JsonListOfNestedDicts = list(b_list) # E: Argument 1 to "list" has incompatible type "list[str | float | int | None]"; expected "Iterable[dict[str, JsonOnlyDicts]]" [arg-type] err_m_list_of_nested_dicts_dict: JsonListOfNestedDicts = dict(b_dict) # E: Incompatible types in assignment (expression has type "dict[str, str | float | int | None]", variable has type "JsonListOfNestedDicts") [assignment] err_m_list_of_nested_dicts_tuple: JsonListOfNestedDicts = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonListOfNestedDicts") [assignment] err_m_list_of_nested_dicts_set: JsonListOfNestedDicts = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonListOfNestedDicts") [assignment] - err_m_list_of_nested_dicts_list_of_none: JsonListOfNestedDicts = [b_none] # E: List item 0 has incompatible type "None"; expected "dict[str, JsonNoLists]" [list-item] + err_m_list_of_nested_dicts_list_of_none: JsonListOfNestedDicts = [b_none] # E: List item 0 has incompatible type "None"; expected "dict[str, JsonOnlyDicts]" [list-item] err_m_list_of_nested_dicts_dict_of_none: JsonListOfNestedDicts = {'a': b_none} # E: Incompatible types in assignment (expression has type "dict[str, None]", variable has type "JsonListOfNestedDicts") [assignment] m_list_of_nested_dicts_list_of_dict_of_none: JsonListOfNestedDicts = [{'a': b_none}] m_list_of_nested_dicts_two_levels: JsonListOfNestedDicts = [dict(b_dict)] m_list_of_nested_dicts_list_of_dict_of_dict_of_none: JsonListOfNestedDicts = [{'a': {'b': b_none}}] m_list_of_nested_dicts_three_levels: JsonListOfNestedDicts = [{'a': dict(b_dict), 'b': dict(b_dict)}] m_list_of_nested_dicts_four_levels: JsonListOfNestedDicts = [{'a': dict(b_dict), 'b': {'x': dict(b_dict)}}] - err_m_list_of_nested_dicts_with_list_of_none_level_two: JsonListOfNestedDicts = [[b_none]] # E: List item 0 has incompatible type "list[None]"; expected "dict[str, JsonNoLists]" [list-item] - err_m_list_of_nested_dicts_with_list_level_two: JsonListOfNestedDicts = [list(b_list)] # E: List item 0 has incompatible type "list[str | float | int | None]"; expected "dict[str, JsonNoLists]" [list-item] + err_m_list_of_nested_dicts_with_list_of_none_level_two: JsonListOfNestedDicts = [[b_none]] # E: List item 0 has incompatible type "list[None]"; expected "dict[str, JsonOnlyDicts]" [list-item] + err_m_list_of_nested_dicts_with_list_level_two: JsonListOfNestedDicts = [list(b_list)] # E: List item 0 has incompatible type "list[str | float | int | None]"; expected "dict[str, JsonOnlyDicts]" [list-item] err_m_list_of_nested_dicts_with_list_of_none_level_three: JsonListOfNestedDicts = [{'a': {'b': [b_none]}}] # E: Dict entry 0 has incompatible type "str": "list[None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] err_m_list_of_nested_dicts_with_list_level_three: JsonListOfNestedDicts = [{'a': dict(b_dict), 'b': list(b_list)}] # E: Dict entry 1 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] err_m_list_of_nested_dicts_with_list_level_four: JsonListOfNestedDicts = [{'a': dict(b_dict), 'b': {'x': list(b_list)}}] # E: Dict entry 0 has incompatible type "str": "list[str | float | int | None]"; expected "str": "int | float | str | JsonNestedDicts | None" [dict-item] @@ -371,23 +371,23 @@ err_m_dict_of_nested_lists_str: JsonDictOfNestedLists = b_str # E: Incompatible types in assignment (expression has type "str", variable has type "JsonDictOfNestedLists") [assignment] err_m_dict_of_nested_lists_bool: JsonDictOfNestedLists = b_bool # E: Incompatible types in assignment (expression has type "bool", variable has type "JsonDictOfNestedLists") [assignment] err_m_dict_of_nested_lists_list: JsonDictOfNestedLists = list(b_list) # E: Incompatible types in assignment (expression has type "list[str | float | int | None]", variable has type "JsonDictOfNestedLists") [assignment] - err_m_dict_of_nested_lists_dict: JsonDictOfNestedLists = dict(b_dict) # E: Argument 1 to "dict" has incompatible type "dict[str, str | float | int | None]"; expected "SupportsKeysAndGetItem[str, list[JsonNoDicts]]" [arg-type] + err_m_dict_of_nested_lists_dict: JsonDictOfNestedLists = dict(b_dict) # E: Argument 1 to "dict" has incompatible type "dict[str, str | float | int | None]"; expected "SupportsKeysAndGetItem[str, list[JsonOnlyLists]]" [arg-type] err_m_dict_of_nested_lists_tuple: JsonDictOfNestedLists = b_tuple # E: Incompatible types in assignment (expression has type "tuple[None, int, float, str, bool]", variable has type "JsonDictOfNestedLists") [assignment] err_m_dict_of_nested_lists_set: JsonDictOfNestedLists = set(b_set) # E: Incompatible types in assignment (expression has type "set[str | float | int | None]", variable has type "JsonDictOfNestedLists") [assignment] err_m_dict_of_nested_lists_list_of_none: JsonDictOfNestedLists = [b_none] # E: Incompatible types in assignment (expression has type "list[None]", variable has type "JsonDictOfNestedLists") [assignment] - err_m_dict_of_nested_lists_dict_of_none: JsonDictOfNestedLists = {'a': b_none} # E: Dict entry 0 has incompatible type "str": "None"; expected "str": "list[JsonNoDicts]" [dict-item] + err_m_dict_of_nested_lists_dict_of_none: JsonDictOfNestedLists = {'a': b_none} # E: Dict entry 0 has incompatible type "str": "None"; expected "str": "list[JsonOnlyLists]" [dict-item] m_dict_of_nested_lists_dict_of_list_of_none: JsonDictOfNestedLists = {'a': [b_none]} m_dict_of_nested_lists_two_levels: JsonDictOfNestedLists = {'a': list(b_list), 'b': list(b_list)} - err_m_dict_of_nested_lists_int_key_dict: JsonDictOfNestedLists = {b_int: list(b_list)} # E: Dict entry 0 has incompatible type "int": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonNoDicts]" [dict-item] - err_m_dict_of_nested_lists_float_key_dict: JsonDictOfNestedLists = {b_float: list(b_list)} # E: Dict entry 0 has incompatible type "float": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonNoDicts]" [dict-item] - err_m_dict_of_nested_lists_bool_key_dict: JsonDictOfNestedLists = {b_bool: list(b_list)} # E: Dict entry 0 has incompatible type "bool": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonNoDicts]" [dict-item] - err_m_dict_of_nested_lists_none_key_dict: JsonDictOfNestedLists = {b_none: list(b_list)} # E: Dict entry 0 has incompatible type "None": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonNoDicts]" [dict-item] + err_m_dict_of_nested_lists_int_key_dict: JsonDictOfNestedLists = {b_int: list(b_list)} # E: Dict entry 0 has incompatible type "int": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonOnlyLists]" [dict-item] + err_m_dict_of_nested_lists_float_key_dict: JsonDictOfNestedLists = {b_float: list(b_list)} # E: Dict entry 0 has incompatible type "float": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonOnlyLists]" [dict-item] + err_m_dict_of_nested_lists_bool_key_dict: JsonDictOfNestedLists = {b_bool: list(b_list)} # E: Dict entry 0 has incompatible type "bool": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonOnlyLists]" [dict-item] + err_m_dict_of_nested_lists_none_key_dict: JsonDictOfNestedLists = {b_none: list(b_list)} # E: Dict entry 0 has incompatible type "None": "list[int | float | str | JsonNestedLists | None]"; expected "str": "list[JsonOnlyLists]" [dict-item] m_dict_of_nested_lists_dict_of_list_of_list_of_none: JsonDictOfNestedLists = {'a': [[b_none]]} m_dict_of_nested_lists_three_levels: JsonDictOfNestedLists = {'a': list(b_list), 'b': [list(b_list)]} m_dict_of_nested_lists_four_levels: JsonDictOfNestedLists = {'a': list(b_list), 'b': [list(b_list), [list(b_list)]]} - err_m_dict_of_nested_lists_with_dict_of_none_level_two: JsonDictOfNestedLists = {'a': {'b': b_none}} # E: Dict entry 0 has incompatible type "str": "dict[str, None]"; expected "str": "list[JsonNoDicts]" [dict-item] - err_m_dict_of_nested_lists_with_dict_level_two: JsonDictOfNestedLists = {'a': dict(b_dict)} # E: Dict entry 0 has incompatible type "str": "dict[str, str | float | int | None]"; expected "str": "list[JsonNoDicts]" [dict-item] + err_m_dict_of_nested_lists_with_dict_of_none_level_two: JsonDictOfNestedLists = {'a': {'b': b_none}} # E: Dict entry 0 has incompatible type "str": "dict[str, None]"; expected "str": "list[JsonOnlyLists]" [dict-item] + err_m_dict_of_nested_lists_with_dict_level_two: JsonDictOfNestedLists = {'a': dict(b_dict)} # E: Dict entry 0 has incompatible type "str": "dict[str, str | float | int | None]"; expected "str": "list[JsonOnlyLists]" [dict-item] err_m_dict_of_nested_lists_with_dict_of_none_level_three: JsonDictOfNestedLists = {'a': [{'b': b_none}]} # E: List item 0 has incompatible type "dict[str, None]"; expected "int | float | str | JsonNestedLists | None" [list-item] err_m_dict_of_nested_lists_with_dict_level_three: JsonDictOfNestedLists = {'a': list(b_list), 'b': [dict(b_dict)]} # E: List item 0 has incompatible type "dict[str, str | float | int | None]"; expected "int | float | str | JsonNestedLists | None" [list-item] err_m_dict_of_nested_lists_with_dict_level_four: JsonDictOfNestedLists = {'a': list(b_list), 'b': [list(b_list), [dict(b_dict)]]} # E: List item 0 has incompatible type "dict[str, str | float | int | None]"; expected "int | float | str | JsonNestedLists | None" [list-item] From b6ad81aba59d1d1d73117a029e11f7f71f07e235 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Mon, 18 Dec 2023 12:19:44 +0100 Subject: [PATCH 04/12] Serialization for Dataset.load() is now basing choice of serializer on dataset cls instead of file suffix. Added RawBytesDatasetToTarFileSerializer --- src/omnipy/api/protocols/public/data.py | 19 ++++++- src/omnipy/compute/mixins/serialize.py | 2 +- src/omnipy/data/dataset.py | 13 ++++- src/omnipy/data/serializer.py | 43 ++++++++++++-- src/omnipy/modules/__init__.py | 6 +- src/omnipy/modules/json/serializers.py | 11 ++-- src/omnipy/modules/pandas/serializers.py | 12 ++-- src/omnipy/modules/raw/serializers.py | 56 +++++++++++++++++-- tests/data/helpers/mocks.py | 14 +++-- .../serialize/test_serializer_registry.py | 12 ++-- 10 files changed, 150 insertions(+), 38 deletions(-) diff --git a/src/omnipy/api/protocols/public/data.py b/src/omnipy/api/protocols/public/data.py index a14cc196..799f509b 100644 --- a/src/omnipy/api/protocols/public/data.py +++ b/src/omnipy/api/protocols/public/data.py @@ -83,7 +83,7 @@ def serialize(cls, dataset: IsDataset) -> bytes | memoryview: pass @classmethod - def deserialize(cls, serialized: bytes) -> IsDataset: + def deserialize(cls, serialized: bytes, any_file_suffix=False) -> IsDataset: pass @@ -100,7 +100,8 @@ def create_dataset_from_tarfile(cls, tarfile_bytes: bytes, data_decode_func: Callable[[IO[bytes]], Any], dictify_object_func: Callable[[str, Any], dict | str], - import_method='from_data'): + import_method='from_data', + any_file_suffix: bool = False): ... @@ -130,8 +131,20 @@ def auto_detect_tar_file_serializer(self, dataset: IsDataset): def _autodetect_serializer(cls, dataset, serializers): ... + def detect_tar_file_serializers_from_dataset_cls(self, dataset: IsDataset): + ... + def detect_tar_file_serializers_from_file_suffix(self, file_suffix: str): ... - def load_from_tar_file_path(self, log_obj: CanLog, tar_file_path: str, to_dataset: IsDataset): + def load_from_tar_file_path_based_on_file_suffix(self, + log_obj: CanLog, + tar_file_path: str, + to_dataset: IsDataset): + ... + + def load_from_tar_file_path_based_on_dataset_cls(self, + log_obj: CanLog, + tar_file_path: str, + to_dataset: IsDataset): ... diff --git a/src/omnipy/compute/mixins/serialize.py b/src/omnipy/compute/mixins/serialize.py index 417a9d1d..5ae5ac7c 100644 --- a/src/omnipy/compute/mixins/serialize.py +++ b/src/omnipy/compute/mixins/serialize.py @@ -227,7 +227,7 @@ def _deserialize_and_restore_outputs(self) -> Dataset: for tar_file_path in self._all_job_output_file_paths_in_reverse_order_for_last_run( persist_data_dir_path, self._job_name()): to_dataset = cast(Type[Dataset], self._return_type) - return self._serializer_registry.load_from_tar_file_path( + return self._serializer_registry.load_from_tar_file_path_based_on_file_suffix( self, tar_file_path, to_dataset()) raise RuntimeError('No persisted output') diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index 7eb7aeef..181d3abf 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -356,9 +356,18 @@ def save(self, directory: str): tar.extractall(path=directory) tar.close() - def load(self, directory: str): + def load(self, tar_gz_file_path: str): serializer_registry = self._get_serializer_registry() - return serializer_registry.load_from_tar_file_path(self, directory, self) + + loaded_dataset = serializer_registry.load_from_tar_file_path_based_on_dataset_cls( + self, tar_gz_file_path, self) + if loaded_dataset is not None: + self.absorb(loaded_dataset) + return + + self.absorb( + serializer_registry.load_from_tar_file_path_based_on_file_suffix( + self, tar_gz_file_path, self)) @staticmethod def _get_serializer_registry(): diff --git a/src/omnipy/data/serializer.py b/src/omnipy/data/serializer.py index 2264ef62..80709453 100644 --- a/src/omnipy/data/serializer.py +++ b/src/omnipy/data/serializer.py @@ -35,7 +35,7 @@ def serialize(cls, dataset: IsDataset) -> bytes | memoryview: @classmethod @abstractmethod - def deserialize(cls, serialized: bytes) -> IsDataset: + def deserialize(cls, serialized: bytes, any_file_suffix=False) -> IsDataset: pass @@ -61,11 +61,13 @@ def create_dataset_from_tarfile(cls, tarfile_bytes: bytes, data_decode_func: Callable[[IO[bytes]], Any], dictify_object_func: Callable[[str, Any], dict | str], - import_method='from_data'): + import_method: str = 'from_data', + any_file_suffix: bool = False): with tarfile.open(fileobj=BytesIO(tarfile_bytes), mode='r:gz') as tarfile_stream: for filename in tarfile_stream.getnames(): data_file = tarfile_stream.extractfile(filename) - assert filename.endswith(f'.{cls.get_output_file_suffix()}') + if not any_file_suffix: + assert filename.endswith(f'.{cls.get_output_file_suffix()}') data_file_name = '.'.join(filename.split('.')[:-1]) getattr(dataset, import_method)( dictify_object_func(data_file_name, data_decode_func(data_file))) @@ -131,11 +133,19 @@ def _to_data_from_data_if_direct(dataset: IsDataset, serializer: IsSerializer): return None, None + def detect_tar_file_serializers_from_dataset_cls(self, dataset: IsDataset): + return tuple( + serializer_cls for serializer_cls in self.tar_file_serializers + if serializer_cls.is_dataset_directly_supported(dataset)) + def detect_tar_file_serializers_from_file_suffix(self, file_suffix: str): return tuple(serializer_cls for serializer_cls in self.tar_file_serializers if serializer_cls.get_output_file_suffix() == file_suffix) - def load_from_tar_file_path(self, log_obj: CanLog, tar_file_path: str, to_dataset: IsDataset): + def load_from_tar_file_path_based_on_file_suffix(self, + log_obj: CanLog, + tar_file_path: str, + to_dataset: IsDataset): if hasattr(log_obj, 'log'): log = log_obj.log else: @@ -174,3 +184,28 @@ def load_from_tar_file_path(self, log_obj: CanLog, tar_file_path: str, to_datase return to_dataset except Exception: return auto_dataset + + def load_from_tar_file_path_based_on_dataset_cls(self, + log_obj: CanLog, + tar_file_path: str, + to_dataset: IsDataset): + if hasattr(log_obj, 'log'): + log = log_obj.log + else: + log = print + + with tarfile.open(tar_file_path, 'r:gz') as tarfile_obj: + serializers = self.detect_tar_file_serializers_from_dataset_cls(to_dataset) + if len(serializers) == 0: + log(f'No serializer for Dataset with type "{type(to_dataset)}" can be ' + f'determined. Aborting load.') + else: + for serializer in serializers: + log(f'Reading dataset from a gzipped tarpack at' + f' "{os.path.abspath(tar_file_path)}" with serializer type: ' + f'"{type(serializer)}"') + + with open(tar_file_path, 'rb') as tarfile_binary: + out_dataset = serializer.deserialize(tarfile_binary.read(), any) + + return out_dataset diff --git a/src/omnipy/modules/__init__.py b/src/omnipy/modules/__init__.py index b7fa2ee2..37d82614 100644 --- a/src/omnipy/modules/__init__.py +++ b/src/omnipy/modules/__init__.py @@ -4,10 +4,12 @@ def register_serializers(registry: IsSerializerRegistry): from .json.serializers import JsonDatasetToTarFileSerializer from .pandas.serializers import PandasDatasetToTarFileSerializer - from .raw.serializers import RawDatasetToTarFileSerializer + from .raw.serializers import (RawBytesDatasetToTarFileSerializer, + RawStrDatasetToTarFileSerializer) registry.register(PandasDatasetToTarFileSerializer) - registry.register(RawDatasetToTarFileSerializer) + registry.register(RawStrDatasetToTarFileSerializer) + registry.register(RawBytesDatasetToTarFileSerializer) registry.register(JsonDatasetToTarFileSerializer) diff --git a/src/omnipy/modules/json/serializers.py b/src/omnipy/modules/json/serializers.py index 69757dcf..e36a78c1 100644 --- a/src/omnipy/modules/json/serializers.py +++ b/src/omnipy/modules/json/serializers.py @@ -3,6 +3,7 @@ from omnipy.data.dataset import Dataset from omnipy.data.serializer import TarFileSerializer +from ...api.protocols.public.data import IsDataset from .datasets import JsonBaseDataset, JsonDataset from .models import JsonModel @@ -10,11 +11,11 @@ class JsonDatasetToTarFileSerializer(TarFileSerializer): """""" @classmethod - def is_dataset_directly_supported(cls, dataset: Dataset) -> bool: + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: return isinstance(dataset, JsonBaseDataset) @classmethod - def get_dataset_cls_for_new(cls) -> Type[Dataset]: + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: return JsonDataset @classmethod @@ -29,7 +30,7 @@ def json_encode_func(json_data: JsonModel) -> bytes: return cls.create_tarfile_from_dataset(json_dataset, data_encode_func=json_encode_func) @classmethod - def deserialize(cls, tarfile_bytes: bytes) -> JsonDataset: + def deserialize(cls, tarfile_bytes: bytes, any_file_suffix=False) -> JsonDataset: json_dataset = JsonDataset() def json_decode_func(file_stream: IO[bytes]) -> str: @@ -43,6 +44,8 @@ def json_dictify_object(data_file: str, obj_val: str) -> dict[str, str]: tarfile_bytes, data_decode_func=json_decode_func, dictify_object_func=json_dictify_object, - import_method='from_json') + import_method='from_json', + any_file_suffix=any_file_suffix, + ) return json_dataset diff --git a/src/omnipy/modules/pandas/serializers.py b/src/omnipy/modules/pandas/serializers.py index fa5f021c..9f94e0f9 100644 --- a/src/omnipy/modules/pandas/serializers.py +++ b/src/omnipy/modules/pandas/serializers.py @@ -4,18 +4,18 @@ from omnipy.data.serializer import TarFileSerializer from . import pd -from ...data.dataset import Dataset +from ...api.protocols.public.data import IsDataset from .models import PandasDataset class PandasDatasetToTarFileSerializer(TarFileSerializer): """""" @classmethod - def is_dataset_directly_supported(cls, dataset: Dataset) -> bool: + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: return isinstance(dataset, PandasDataset) @classmethod - def get_dataset_cls_for_new(cls) -> Type[Dataset]: + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: return PandasDataset @classmethod @@ -34,7 +34,7 @@ def pandas_encode_func(pandas_data: pd.DataFrame) -> memoryview: return cls.create_tarfile_from_dataset(pandas_dataset, data_encode_func=pandas_encode_func) @classmethod - def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset: + def deserialize(cls, tarfile_bytes: bytes, any_file_suffix=False) -> PandasDataset: pandas_dataset = PandasDataset() def csv_decode_func(file_stream: IO[bytes]) -> pd.DataFrame: @@ -48,6 +48,8 @@ def python_dictify_object(data_file: str, obj_val: Any) -> dict: tarfile_bytes, data_decode_func=csv_decode_func, dictify_object_func=python_dictify_object, - import_method='from_data') # noqa + import_method='from_data', + any_file_suffix=any_file_suffix, + ) # noqa return pandas_dataset diff --git a/src/omnipy/modules/raw/serializers.py b/src/omnipy/modules/raw/serializers.py index 8eb7cb2b..ea2e7978 100644 --- a/src/omnipy/modules/raw/serializers.py +++ b/src/omnipy/modules/raw/serializers.py @@ -1,5 +1,6 @@ from typing import Any, IO, Type +from omnipy.api.protocols.public.data import IsDataset from omnipy.data.dataset import Dataset from omnipy.data.model import Model from omnipy.data.serializer import TarFileSerializer @@ -7,14 +8,14 @@ # from typing_inspect import get_generic_bases, get_generic_type, get_origin, get_parameters -class RawDatasetToTarFileSerializer(TarFileSerializer): +class RawStrDatasetToTarFileSerializer(TarFileSerializer): """""" @classmethod - def is_dataset_directly_supported(cls, dataset: Dataset) -> bool: + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: return type(dataset) is Dataset[Model[str]] @classmethod - def get_dataset_cls_for_new(cls) -> Type[Dataset]: + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: return Dataset[Model[str]] @classmethod @@ -29,7 +30,7 @@ def raw_encode_func(contents: str) -> bytes: return cls.create_tarfile_from_dataset(dataset, data_encode_func=raw_encode_func) @classmethod - def deserialize(cls, tarfile_bytes: bytes) -> Dataset[Model[str]]: + def deserialize(cls, tarfile_bytes: bytes, any_file_suffix=False) -> Dataset[Model[str]]: dataset = Dataset[Model[str]]() def raw_decode_func(file_stream: IO[bytes]) -> str: @@ -43,6 +44,51 @@ def python_dictify_object(data_file: str, obj_val: Any) -> dict: tarfile_bytes, data_decode_func=raw_decode_func, dictify_object_func=python_dictify_object, - import_method='from_data') # noqa + import_method='from_data', + any_file_suffix=any_file_suffix, + ) # noqa + + return dataset + + +class RawBytesDatasetToTarFileSerializer(TarFileSerializer): + """""" + @classmethod + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: + return type(dataset) is Dataset[Model[bytes]] + + @classmethod + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: + return Dataset[Model[bytes]] + + @classmethod + def get_output_file_suffix(cls) -> str: + return 'bytes.raw' + + @classmethod + def serialize(cls, dataset: Dataset[Model[bytes]]) -> bytes: + def raw_encode_func(contents: bytes) -> bytes: + return contents + + return cls.create_tarfile_from_dataset(dataset, data_encode_func=raw_encode_func) + + @classmethod + def deserialize(cls, tarfile_bytes: bytes, any_file_suffix=False) -> Dataset[Model[bytes]]: + dataset = Dataset[Model[bytes]]() + + def raw_decode_func(file_stream: IO[bytes]) -> bytes: + return file_stream.read() + + def python_dictify_object(data_file: str, obj_val: Any) -> dict: + return {data_file: obj_val} + + cls.create_dataset_from_tarfile( + dataset, + tarfile_bytes, + data_decode_func=raw_decode_func, + dictify_object_func=python_dictify_object, + import_method='from_data', + any_file_suffix=any_file_suffix, + ) # noqa return dataset diff --git a/tests/data/helpers/mocks.py b/tests/data/helpers/mocks.py index 0e5ecadf..822df459 100644 --- a/tests/data/helpers/mocks.py +++ b/tests/data/helpers/mocks.py @@ -1,6 +1,7 @@ import sys from typing import Any, IO, Type +from omnipy.api.protocols.public.data import IsDataset from omnipy.data.dataset import Dataset from omnipy.data.model import Model from omnipy.data.serializer import Serializer, TarFileSerializer @@ -12,11 +13,11 @@ class NumberDataset(Dataset[Model[int]]): class MockNumberSerializer(Serializer): @classmethod - def is_dataset_directly_supported(cls, dataset: Dataset) -> bool: + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: return isinstance(dataset, NumberDataset) @classmethod - def get_dataset_cls_for_new(cls) -> Type[Dataset]: + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: return NumberDataset @classmethod @@ -28,7 +29,7 @@ def serialize(cls, number_dataset: NumberDataset) -> bytes | memoryview: return ','.join(':'.join([k, str(v)]) for (k, v) in number_dataset.items()).encode('utf8') @classmethod - def deserialize(cls, serialized_bytes: bytes) -> NumberDataset: + def deserialize(cls, serialized_bytes: bytes, any_file_suffix=False) -> NumberDataset: number_dataset = NumberDataset() for key, val in [_.split(':') for _ in serialized_bytes.decode('utf8').split(',')]: number_dataset[key] = int(val) @@ -37,11 +38,11 @@ def deserialize(cls, serialized_bytes: bytes) -> NumberDataset: class MockNumberToTarFileSerializer(TarFileSerializer): @classmethod - def is_dataset_directly_supported(cls, dataset: Dataset) -> bool: + def is_dataset_directly_supported(cls, dataset: IsDataset) -> bool: return isinstance(dataset, NumberDataset) @classmethod - def get_dataset_cls_for_new(cls) -> Type[Dataset]: + def get_dataset_cls_for_new(cls) -> Type[IsDataset]: return NumberDataset @classmethod @@ -56,7 +57,7 @@ def number_encode_func(number_data: int) -> bytes: return cls.create_tarfile_from_dataset(number_dataset, data_encode_func=number_encode_func) @classmethod - def deserialize(cls, tarfile_bytes: bytes) -> NumberDataset: + def deserialize(cls, tarfile_bytes: bytes, any_file_suffix=False) -> NumberDataset: number_dataset = NumberDataset() def number_decode_func(file_stream: IO[bytes]) -> int: @@ -70,6 +71,7 @@ def python_dictify_object(data_file: str, obj_val: Any) -> dict: tarfile_bytes, data_decode_func=number_decode_func, dictify_object_func=python_dictify_object, + any_file_suffix=any_file_suffix, ) return number_dataset diff --git a/tests/integration/novel/serialize/test_serializer_registry.py b/tests/integration/novel/serialize/test_serializer_registry.py index 09c9ca6b..1b3cd1b6 100644 --- a/tests/integration/novel/serialize/test_serializer_registry.py +++ b/tests/integration/novel/serialize/test_serializer_registry.py @@ -3,7 +3,7 @@ from omnipy.data.serializer import SerializerRegistry from omnipy.modules.json.serializers import JsonDatasetToTarFileSerializer from omnipy.modules.pandas.serializers import PandasDatasetToTarFileSerializer -from omnipy.modules.raw.serializers import RawDatasetToTarFileSerializer +from omnipy.modules.raw.serializers import RawStrDatasetToTarFileSerializer from .cases.datasets import (csv_dataset, json_dataset, @@ -21,7 +21,7 @@ def registry(): registry = SerializerRegistry() registry.register(PandasDatasetToTarFileSerializer) - registry.register(RawDatasetToTarFileSerializer) + registry.register(RawStrDatasetToTarFileSerializer) registry.register(JsonDatasetToTarFileSerializer) return registry @@ -44,22 +44,22 @@ def test_serializer_registry_auto_detect_json_dataset(registry): def test_serializer_registry_auto_detect_json_table_as_str_dataset(registry): dataset, serializer = registry.auto_detect_tar_file_serializer(json_table_as_str_dataset) - assert serializer is RawDatasetToTarFileSerializer + assert serializer is RawStrDatasetToTarFileSerializer def test_serializer_registry_auto_detect_json_str_dataset(registry): dataset, serializer = registry.auto_detect_tar_file_serializer(json_str_dataset) - assert serializer is RawDatasetToTarFileSerializer + assert serializer is RawStrDatasetToTarFileSerializer def test_serializer_registry_auto_detect_csv_dataset(registry): dataset, serializer = registry.auto_detect_tar_file_serializer(csv_dataset) - assert serializer is RawDatasetToTarFileSerializer + assert serializer is RawStrDatasetToTarFileSerializer def test_serializer_registry_auto_detect_str_dataset(registry): dataset, serializer = registry.auto_detect_tar_file_serializer(str_dataset) - assert serializer is RawDatasetToTarFileSerializer + assert serializer is RawStrDatasetToTarFileSerializer def test_serializer_registry_auto_detect_python_dataset(registry): From 20924c5a935c9b7bec59b297efcce0c0ffe3aef2 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Mon, 18 Dec 2023 23:37:19 +0100 Subject: [PATCH 05/12] Implemented decode_bytes() task with automatic decoding using chardet --- pyproject.toml | 1 + src/omnipy/__init__.py | 6 ++++- src/omnipy/modules/raw/tasks.py | 30 +++++++++++++++++++++++++ tests/modules/raw/test_tasks.py | 39 +++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tests/modules/raw/test_tasks.py diff --git a/pyproject.toml b/pyproject.toml index ca545192..86667154 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ typing-inspect = "^0.8.0" #orjson = "^3.8.0" #python-slugify = "^7.0.0" isort = "^5.12.0" +chardet = "^5.2.0" [tool.poetry.group.dev.dependencies] deepdiff = "^6.2.1" diff --git a/src/omnipy/__init__.py b/src/omnipy/__init__.py index 3b37e278..696dba57 100644 --- a/src/omnipy/__init__.py +++ b/src/omnipy/__init__.py @@ -65,7 +65,10 @@ convert_dataset_pandas_to_csv, extract_columns_as_files) from omnipy.modules.raw.models import JoinLinesModel, SplitAndStripLinesModel, SplitLinesModel -from omnipy.modules.raw.tasks import modify_all_lines, modify_datafile_contents, modify_each_line +from omnipy.modules.raw.tasks import (decode_bytes, + modify_all_lines, + modify_datafile_contents, + modify_each_line) from omnipy.modules.tables.tasks import remove_columns # from omnipy.util.helpers import recursive_module_import @@ -134,6 +137,7 @@ 'convert_dataset_csv_to_pandas', 'convert_dataset_pandas_to_csv', 'convert_dataset_list_of_dicts_to_pandas', + 'decode_bytes', 'modify_all_lines', 'modify_datafile_contents', 'modify_each_line', diff --git a/src/omnipy/modules/raw/tasks.py b/src/omnipy/modules/raw/tasks.py index 028905f5..6aaa04de 100644 --- a/src/omnipy/modules/raw/tasks.py +++ b/src/omnipy/modules/raw/tasks.py @@ -1,12 +1,42 @@ from io import StringIO import os +from chardet import UniversalDetector + from omnipy.compute.task import TaskTemplate from omnipy.compute.typing import mypy_fix_task_template from .protocols import IsModifyAllLinesCallable, IsModifyContentsCallable, IsModifyEachLineCallable +@mypy_fix_task_template +@TaskTemplate(iterate_over_data_files=True) +def decode_bytes(data: bytes, encoding: str | None = None) -> str: + if encoding is None: + detector = UniversalDetector() + for line in data.splitlines(): + detector.feed(line) + if detector.done: break + detector.close() + result = detector.result + + encoding = result['encoding'] + confidence = result['confidence'] + language = result['language'] + + # TODO: Implement simple solution to log from a task/flow. + # TODO: Implement solution to add information to the dataset metadata and apply this to + # decode_bytes() for storing detected encoding etc. + print(f'Automatically detected text encoding to be "{encoding}" with confidence ' + f'"{confidence}". The language is predicted to be "{language}". ' + f'(All predictions have been made by the "chardet" library.)') + + if encoding is None: + encoding = 'ascii' + + return data.decode(encoding) + + @mypy_fix_task_template @TaskTemplate(iterate_over_data_files=True) def modify_datafile_contents( diff --git a/tests/modules/raw/test_tasks.py b/tests/modules/raw/test_tasks.py new file mode 100644 index 00000000..5c3a5bb2 --- /dev/null +++ b/tests/modules/raw/test_tasks.py @@ -0,0 +1,39 @@ +from typing import Annotated, NamedTuple + +import pytest + +from omnipy import Dataset, decode_bytes, Model +from omnipy.api.protocols.public.hub import IsRuntime + + +def test_decode_bytes(runtime: Annotated[IsRuntime, pytest.fixture]) -> None: + class DecodeCaseInfo(NamedTuple): + bytes_data: bytes + target_str: str + encoding: str | None + + test_cases = [ + DecodeCaseInfo(b'', '', 'ascii'), + DecodeCaseInfo(b'ASCII string', 'ASCII string', 'ascii'), + DecodeCaseInfo(b'\xc3\xa6\xc3\xb8\xc3\xa5\xc3\x97\xe2\x80\xa0', 'æøå׆', 'utf-8'), + DecodeCaseInfo(b'\xff\xfe\xe6\x00\xf8\x00\xe5\x00\xd7\x00 ', 'æøå׆', 'utf-16'), + DecodeCaseInfo(b'\xd7 \xc6re v\xe6re \xf8let v\xe5rt! \xd7', + '× Ære være ølet vårt! ×', + 'latin-1'), + DecodeCaseInfo(b'\xd7 \xc6re v\xe6re \xf8let\x86 v\xe5rt! \xd7', + '× Ære være ølet† vårt! ×', + 'windows-1252'), + ] + for case in test_cases: + assert decode_bytes.run( + Dataset[Model[bytes]](a=case.bytes_data), encoding=case.encoding)['a'].contents == \ + case.target_str + + for case in test_cases: + assert decode_bytes.run( + Dataset[Model[bytes]](a=case.bytes_data), encoding=None)['a'].contents == \ + case.target_str + + assert decode_bytes.run( + Dataset[Model[bytes]](dict([(case.encoding, case.bytes_data) for case in test_cases])), + encoding=None).to_data() == dict([(case.encoding, case.target_str) for case in test_cases]) From d9699145205aa8489d7565ca3b75b1a3a27fb8a2 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 10:43:21 +0100 Subject: [PATCH 06/12] Initial prototype of info tables for Dataset and Model for use in interactive_mode (terminal, Jupyter, ...) --- pyproject.toml | 6 ++ src/omnipy/api/protocols/public/hub.py | 1 + src/omnipy/data/dataset.py | 45 +++++++++-- src/omnipy/data/model.py | 105 ++++++++++++++++++++++--- src/omnipy/hub/runtime.py | 1 + 5 files changed, 142 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 86667154..21d52f21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,11 @@ typing-inspect = "^0.8.0" #python-slugify = "^7.0.0" isort = "^5.12.0" chardet = "^5.2.0" +pathspec = "0.12.1" +tabulate = "^0.9.0" +devtools = "^0.12.2" +objsize = "^0.7.0" +humanize = "^4.9.0" [tool.poetry.group.dev.dependencies] deepdiff = "^6.2.1" @@ -49,6 +54,7 @@ pandas-stubs = "^2.1.1.230928" pre-commit = "^2.15.0" pytest-mypy-plugins = "^3.0.0" devtools = "^0.12.2" +ipython = "^8.18.1" [tool.poetry.group.docs.dependencies] furo = "^2022.12.7" diff --git a/src/omnipy/api/protocols/public/hub.py b/src/omnipy/api/protocols/public/hub.py index e1c50b37..c5d9c386 100644 --- a/src/omnipy/api/protocols/public/hub.py +++ b/src/omnipy/api/protocols/public/hub.py @@ -56,6 +56,7 @@ class IsRuntimeObjects(Protocol): registry: IsRunStateRegistry serializers: IsSerializerRegistry root_log: IsRootLogObjects + waiting_for_terminal_repr: bool def __init__( self, diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index 181d3abf..7fc48b6c 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -4,14 +4,25 @@ import tarfile from typing import Annotated, Any, Generic, get_args, get_origin, Iterator, Optional, Type, TypeVar +import humanize +import objsize # from orjson import orjson from pydantic import Field, PrivateAttr, root_validator, ValidationError from pydantic.fields import Undefined, UndefinedType from pydantic.generics import GenericModel from pydantic.utils import lenient_issubclass - -from omnipy.data.model import _cleanup_name_qualname_and_module, Model -from omnipy.util.helpers import is_optional, is_strict_subclass, remove_forward_ref_notation +from tabulate import tabulate + +from omnipy.data.model import (_cleanup_name_qualname_and_module, + _is_interactive_mode, + _waiting_for_terminal_repr, + INTERACTIVE_MODULES, + Model) +from omnipy.util.helpers import (get_calling_module_name, + is_iterable, + is_optional, + is_strict_subclass, + remove_forward_ref_notation) ModelT = TypeVar('ModelT', bound=Model) DATA_KEY = 'data' @@ -391,12 +402,34 @@ def __eq__(self, other: object) -> bool: and self.data == other.data \ and self.to_data() == other.to_data() # last is probably unnecessary, but just in case - def __repr__(self) -> str: - return super().__repr__() - def __repr_args__(self): return [(k, v.contents) for k, v in self.data.items()] + def __repr__(self): + if _is_interactive_mode() and not _waiting_for_terminal_repr(): + print(get_calling_module_name()) + if get_calling_module_name() in INTERACTIVE_MODULES: + _waiting_for_terminal_repr(True) + return _table_repr(self) + return self._trad_repr() + + def _trad_repr(self) -> str: + return super().__repr__() + + def _table_repr(self) -> str: + ret = tabulate( + ((i, + k, + type(v).__name__, + len(v) if hasattr(v, '__len__') else 'N/A', + humanize.naturalsize(objsize.get_deep_size(v))) + for i, (k, v) in enumerate(self.items())), + ('#', 'Data file name', 'Type', 'Length', 'Size (in memory)'), + tablefmt="rounded_outline", + ) + _waiting_for_terminal_repr(False) + return ret + # TODO: Use json serializer package from the pydantic config instead of 'json' diff --git a/src/omnipy/data/model.py b/src/omnipy/data/model.py index 7eca6478..85ec468c 100644 --- a/src/omnipy/data/model.py +++ b/src/omnipy/data/model.py @@ -2,6 +2,9 @@ import functools import inspect import json +import os +import shutil +from textwrap import dedent from types import UnionType from typing import (Annotated, Any, @@ -15,7 +18,7 @@ TypeVar, Union) -# from orjson import orjson +from devtools import debug, PrettyFormat from pydantic import NoneIsNotAllowedError from pydantic import Protocol as PydanticProtocol from pydantic import root_validator, ValidationError @@ -24,6 +27,7 @@ from pydantic.main import ModelMetaclass, validate_model from pydantic.typing import display_as_type, is_none_type from pydantic.utils import lenient_isinstance, lenient_issubclass +from tabulate import tabulate from omnipy.data.methodinfo import MethodInfo, SPECIAL_METHODS_INFO from omnipy.util.contexts import AttribHolder, LastErrorHolder, nothing @@ -31,6 +35,7 @@ from omnipy.util.helpers import (all_equals, ensure_plain_type, generate_qualname, + get_calling_module_name, is_optional, is_union, remove_annotated_plus_optional_if_present, @@ -44,6 +49,9 @@ ROOT_KEY = '__root__' +# TODO: Refactor Dataset and Model using mixins (including below functions) +INTERACTIVE_MODULES = ['__main__', 'IPython.lib.pretty', 'IPython.core.interactiveshell'] + def _cleanup_name_qualname_and_module(cls, created_model_or_dataset, model, orig_model): if isinstance(model, str): # ForwardRef @@ -60,6 +68,23 @@ def _cleanup_name_qualname_and_module(cls, created_model_or_dataset, model, orig created_model_or_dataset.__module__ = cls.__module__ +def _is_interactive_mode() -> bool: + from omnipy.hub.runtime import runtime + return runtime.config.data.interactive_mode if runtime else True + + +def _waiting_for_terminal_repr(new_value: bool | None = None) -> bool: + from omnipy.hub.runtime import runtime + if runtime is None: + return False + + if new_value is not None: + runtime.objects.waiting_for_terminal_repr = new_value + return new_value + else: + return runtime.objects.waiting_for_terminal_repr + + # def orjson_dumps(v, *, default): # # orjson.dumps returns bytes, to match standard json.dumps we need to decode # return orjson.dumps(v, default=default).decode() @@ -357,15 +382,10 @@ def _get_restorable_contents(self): return _restorable_content_cache.get(id(self)) def _take_snapshot_of_validated_contents(self): - interactive_mode = self._is_interactive_mode() + interactive_mode = _is_interactive_mode() if interactive_mode: self._get_restorable_contents().take_snapshot(self.contents) - def _is_interactive_mode(self): - from omnipy import runtime - interactive_mode = runtime.config.data.interactive_mode if runtime else True - return interactive_mode - @classmethod def _parse_data(cls, data: RootT) -> Any: return data @@ -523,7 +543,7 @@ def _special_method(self, name: str, info: MethodInfo, *args: object, if info.state_changing: restorable = self._get_restorable_contents() - if self._is_interactive_mode(): + if _is_interactive_mode(): if restorable.has_snapshot() \ and restorable.last_snapshot_taken_of_same_obj(self.contents) \ and restorable.differs_from_last_snapshot(self.contents): @@ -545,7 +565,7 @@ def _special_method(self, name: str, info: MethodInfo, *args: object, with reset_solution: ret = method(*args, **kwargs) - if self._is_interactive_mode(): + if _is_interactive_mode(): needs_validation = restorable.differs_from_last_snapshot(self.contents) \ if restorable.has_snapshot() else True else: @@ -596,7 +616,7 @@ def __getattr__(self, attr: str) -> Any: ret = self._getattr_from_contents(attr) if callable(ret): contents_holder = AttribHolder(self, 'contents', copy_attr=True) - context = contents_holder if self._is_interactive_mode() else nothing() + context = contents_holder if _is_interactive_mode() else nothing() ret = add_callback_after_call(ret, self.validate_contents, with_context=context) return ret @@ -613,7 +633,72 @@ def __eq__(self, other: object) -> bool: and self.to_data() == other.to_data() # last is probably unnecessary, but just in case def __repr__(self) -> str: + if _is_interactive_mode() and not _waiting_for_terminal_repr(): + if get_calling_module_name() in INTERACTIVE_MODULES: + _waiting_for_terminal_repr(True) + return self._table_repr + return self._trad_repr() + + def _trad_repr(self) -> str: return super().__repr__() def __repr_args__(self): return [(None, self.contents)] + + def _table_repr(self) -> str: + tabulate.PRESERVE_WHITESPACE = True # Does not seem to work together with 'maxcolwidths' + + terminal_size = shutil.get_terminal_size() + header_column_width = len('(bottom') + num_columns = 2 + table_chars_width = 3 * num_columns + 1 + data_column_width = terminal_size.columns - table_chars_width - header_column_width + + data_indent = 2 + + inspect.getmodule(debug).pformat = PrettyFormat( + indent_step=data_indent, + simple_cutoff=20, + width=data_column_width - data_indent, + yield_from_generators=True, + ) + + structure = str(debug.format(self)) + structure_lines = structure.splitlines() + new_structure_lines = dedent(os.linesep.join(structure_lines[1:])).splitlines() + if new_structure_lines[0].startswith('self: '): + new_structure_lines[0] = new_structure_lines[0][5:] + max_section_height = (terminal_size.lines - 8) // 2 + structure_len = len(new_structure_lines) + + if structure_len > max_section_height * 2 + 1: + top_structure_end = max_section_height + bottom_structure_start = structure_len - max_section_height + + top_structure = os.linesep.join(new_structure_lines[:top_structure_end]) + bottom_structure = os.linesep.join(new_structure_lines[bottom_structure_start:]) + + out = tabulate( + ( + ('#', self.__class__.__name__), + (os.linesep.join(str(i) for i in range(top_structure_end)), top_structure), + (os.linesep.join(str(i) for i in range(bottom_structure_start, structure_len)), + bottom_structure), + ), + maxcolwidths=[header_column_width, data_column_width], + tablefmt="rounded_grid", + ) + else: + out = tabulate( + ( + ('#', self.__class__.__name__), + (os.linesep.join(str(i) for i in range(structure_len)), + os.linesep.join(new_structure_lines)), + ), + maxcolwidths=[header_column_width, data_column_width], + tablefmt="rounded_grid", + ) + + _waiting_for_terminal_repr(False) + + return out diff --git a/src/omnipy/hub/runtime.py b/src/omnipy/hub/runtime.py index 99c194b2..7e7368a6 100644 --- a/src/omnipy/hub/runtime.py +++ b/src/omnipy/hub/runtime.py @@ -48,6 +48,7 @@ class RuntimeObjects(RuntimeEntryPublisher): registry: IsRunStateRegistry = field(default_factory=RunStateRegistry) serializers: IsSerializerRegistry = field(default_factory=SerializerRegistry) root_log: IsRootLogObjects = field(default_factory=RootLogObjects) + waiting_for_terminal_repr: bool = False @dataclass From 67d2705c66cea97f2202c98d8828b9868e12486a Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 08:35:32 +0100 Subject: [PATCH 07/12] Misc Bugfixes --- src/omnipy/data/dataset.py | 12 +++++++----- src/omnipy/data/serializer.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index 7fc48b6c..5f4dc34c 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -382,11 +382,13 @@ def load(self, tar_gz_file_path: str): @staticmethod def _get_serializer_registry(): + from omnipy.data.serializer import SerializerRegistry from omnipy.hub.runtime import runtime - if len(runtime.objects.serializers.serializers) == 0: - from omnipy.modules import register_serializers - register_serializers(runtime.objects.serializers) - serializer_registry = runtime.objects.serializers + from omnipy.modules import register_serializers + serializer_registry = SerializerRegistry() if runtime is None else \ + runtime.objects.serializers + if len(serializer_registry.serializers) == 0: + register_serializers(serializer_registry) return serializer_registry def as_multi_model_dataset(self) -> 'MultiModelDataset[ModelT]': @@ -421,7 +423,7 @@ def _table_repr(self) -> str: ((i, k, type(v).__name__, - len(v) if hasattr(v, '__len__') else 'N/A', + v.__len__() if hasattr(v, '__len__') else 'N/A', humanize.naturalsize(objsize.get_deep_size(v))) for i, (k, v) in enumerate(self.items())), ('#', 'Data file name', 'Type', 'Length', 'Size (in memory)'), diff --git a/src/omnipy/data/serializer.py b/src/omnipy/data/serializer.py index 80709453..71069f75 100644 --- a/src/omnipy/data/serializer.py +++ b/src/omnipy/data/serializer.py @@ -48,7 +48,7 @@ def create_tarfile_from_dataset(cls, bytes_io = BytesIO() with tarfile.open(fileobj=bytes_io, mode='w:gz') as tarfile_stream: for data_file, data in dataset.items(): - json_data_bytestream = BytesIO(data_encode_func(data)) + json_data_bytestream = BytesIO(data_encode_func(data.contents)) json_data_bytestream.seek(0) tarinfo = TarInfo(name=f'{data_file}.{cls.get_output_file_suffix()}') tarinfo.size = len(json_data_bytestream.getbuffer()) From f7aa5d74430e869f0897c7648a1230df9d76d0ef Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 09:25:48 +0100 Subject: [PATCH 08/12] Dataset now supports almost every selection method imaginable --- src/omnipy/data/dataset.py | 30 +++++++++++++++++------ tests/data/test_dataset.py | 50 +++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index 5f4dc34c..61c087e9 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -1,4 +1,5 @@ from collections import UserDict +from collections.abc import Iterable, Mapping import json import os import tarfile @@ -121,9 +122,9 @@ def __class_getitem__(cls, model: ModelT) -> ModelT: def __init__( self, - value: dict[str, object] | Iterator[tuple[str, object]] | UndefinedType = Undefined, + value: Mapping[str, object] | Iterable[tuple[str, object]] | UndefinedType = Undefined, *, - data: dict[str, object] | UndefinedType = Undefined, + data: Mapping[str, object] | UndefinedType = Undefined, **input_data: object, ) -> None: # TODO: Error message when forgetting parenthesis when creating Dataset should be improved. @@ -228,11 +229,26 @@ def __setitem__(self, data_file: str, data_obj: Any) -> None: del self.data[data_file] raise - def __getitem__(self, data_file: str) -> Any: - if data_file in self.data: - return self.data[data_file] + def __getitem__(self, selector: str | int | slice | Iterable[str | int]) -> Any: + if isinstance(selector, str): + if selector in self.data: + return self.data[selector] + else: + return self.data[selector] else: - return self.data[data_file] + data_keys = tuple(self.data.keys()) + + if isinstance(selector, int): + return self.data[data_keys[selector]] + elif isinstance(selector, slice): + return self.__class__({key: self.data[key] for key in data_keys[selector]}) + elif is_iterable(selector): + selected_keys = [data_keys[_] if isinstance(_, int) else _ for _ in selector] + return self.__class__({key: self.data[key] for key in selected_keys}) + else: + raise KeyError( + 'Selector is of incorrect type. Must be a string, a positive integer,' + 'or a slice (e.g. `dataset[2:5]`).') @classmethod def update_forward_refs(cls, **localns: Any) -> None: @@ -297,7 +313,7 @@ def to_json(self, pretty=True) -> dict[str, str]: return result def from_json(self, - data: dict[str, str] | Iterator[tuple[str, str]], + data: Mapping[str, str] | Iterable[tuple[str, str]], update: bool = True) -> None: if not isinstance(data, dict): data = dict(data) diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index f71b2725..2a9f0b31 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -36,7 +36,7 @@ class MyDataset(Dataset[list[str]]): MyDataset() -def test_init_with_basic_parsing(): +def test_init_with_basic_parsing() -> None: dataset_1 = Dataset[Model[int]]() dataset_1['data_file_1'] = 123 @@ -169,6 +169,9 @@ def test_more_dict_methods_with_parsing(): dataset.setdefault('data_file_4', 789) assert dataset.get('data_file_4').contents == '789' + assert dataset.fromkeys(['data_file_1', 'data_file_2'], 321) == \ + Dataset[Model[str]](data_file_1='321', data_file_2='321') + assert len(dataset) == 3 dataset.pop('data_file_3') @@ -185,6 +188,51 @@ def test_more_dict_methods_with_parsing(): assert dataset.to_data() == {} +def test_get_item_with_int_and_slice() -> None: + dataset = Dataset[Model[int]](data_file_1=123, data_file_2=456, data_file_3=789) + assert dataset[0] == dataset['data_file_1'] == Model[int](123) + assert dataset[1] == dataset['data_file_2'] == Model[int](456) + assert dataset[2] == dataset[-1] == dataset['data_file_3'] == Model[int](789) + + assert dataset[0:2] == Dataset[Model[int]](data_file_1=dataset['data_file_1'], + data_file_2=dataset['data_file_2']) \ + == Dataset[Model[int]](data_file_1=123, data_file_2=456) + assert dataset[-1:] == dataset[2:3] == Dataset[Model[int]](data_file_3=dataset['data_file_3']) \ + == Dataset[Model[int]](data_file_3=789) + assert dataset[:] == dataset + assert dataset[1:1] == Dataset[Model[int]]() + + with pytest.raises(IndexError): + dataset[3] + + assert dataset[2:4] == Dataset[Model[int]](data_file_3=dataset['data_file_3']) \ + == Dataset[Model[int]](data_file_3=789) + + +def test_get_items_with_tuple_or_list() -> None: + dataset = Dataset[Model[int]](data_file_1=123, data_file_2=456, data_file_3=789) + + assert dataset[()] == dataset[[]] == Dataset[Model[int]]() + assert dataset[0,] == dataset[(0,)] == dataset[[0]] \ + == dataset['data_file_1',] == dataset[('data_file_1',)] == dataset[['data_file_1']] \ + == Dataset[Model[int]](data_file_1=123) + assert dataset[0, 2] == dataset[(0, 2)] == dataset[[0, 2]] \ + == dataset['data_file_1','data_file_3'] == dataset[('data_file_1', 'data_file_3')] \ + == dataset[['data_file_1', 'data_file_3']] == dataset[[0, 'data_file_3']] \ + == Dataset[Model[int]](data_file_1=dataset['data_file_1'], + data_file_3=dataset['data_file_3']) \ + == Dataset[Model[int]](data_file_1=123, data_file_3=789) + + with pytest.raises(IndexError): + dataset[0, 3] + + with pytest.raises(KeyError): + dataset[0, 'data_file_4'] + + with pytest.raises(IndexError): + dataset[[0, 3]] + + def test_equality() -> None: assert Dataset[Model[list[int]]]({'data_file_1': [1, 2, 3], 'data_file_2': [1.0, 2.0, 3.0]}) \ == Dataset[Model[list[int]]]({'data_file_1': [1.0, 2.0, 3.0], 'data_file_2': [1, 2, 3]}) From 96b3793b20a3e2ee4a6ff577b8bb11570c9e0c50 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 09:31:38 +0100 Subject: [PATCH 09/12] File suffix '.raw' -> '.txt' and '.bytes.raw' -> '.bytes' --- src/omnipy/modules/raw/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omnipy/modules/raw/serializers.py b/src/omnipy/modules/raw/serializers.py index ea2e7978..2df40206 100644 --- a/src/omnipy/modules/raw/serializers.py +++ b/src/omnipy/modules/raw/serializers.py @@ -20,7 +20,7 @@ def get_dataset_cls_for_new(cls) -> Type[IsDataset]: @classmethod def get_output_file_suffix(cls) -> str: - return 'raw' + return 'txt' @classmethod def serialize(cls, dataset: Dataset[Model[str]]) -> bytes: @@ -63,7 +63,7 @@ def get_dataset_cls_for_new(cls) -> Type[IsDataset]: @classmethod def get_output_file_suffix(cls) -> str: - return 'bytes.raw' + return 'bytes' @classmethod def serialize(cls, dataset: Dataset[Model[bytes]]) -> bytes: From 9af63d84951d893e88ee3e7d9f6dbb3d51da33d7 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 09:33:07 +0100 Subject: [PATCH 10/12] PandasModel now allows empty objects and non-str col names --- src/omnipy/modules/pandas/models.py | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/omnipy/modules/pandas/models.py b/src/omnipy/modules/pandas/models.py index f99fbbb7..033ba958 100644 --- a/src/omnipy/modules/pandas/models.py +++ b/src/omnipy/modules/pandas/models.py @@ -7,21 +7,21 @@ from . import pd -class PandasModel(Model[pd.DataFrame]): - @classmethod - def _parse_data(cls, data: pd.DataFrame) -> pd.DataFrame: - cls._data_column_names_are_strings(data) - cls._data_not_empty_object(data) - return data - - @staticmethod - def _data_column_names_are_strings(data: pd.DataFrame) -> None: - for column in data.columns: - assert isinstance(column, str) - - @staticmethod - def _data_not_empty_object(data: pd.DataFrame) -> None: - assert not any(data.isna().all(axis=1)) +class PandasModel(Model[pd.DataFrame | pd.Series]): + # @classmethod + # def _parse_data(cls, data: pd.DataFrame | pd.Series) -> pd.DataFrame | pd.Series: + # # cls._data_column_names_are_strings(data) + # cls._data_not_empty_object(data) + # return data + + # @staticmethod + # def _data_column_names_are_strings(data: pd.DataFrame) -> None: + # for column in data.columns: + # assert isinstance(column, str) + + # @staticmethod + # def _data_not_empty_object(data: pd.DataFrame) -> None: + # assert not any(data.isna().all(axis=1)) def dict(self, *args, **kwargs) -> dict[str, dict[Any, Any]]: df = super().dict(*args, **kwargs)[ROOT_KEY] From 0d96fb14ebe3ef5e736c7110c29274b5eef96fb3 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 10:29:43 +0100 Subject: [PATCH 11/12] Bugfixes due to recent Dataset changes --- src/omnipy/modules/__init__.py | 2 +- tests/data/helpers/mocks.py | 3 ++- tests/integration/novel/serialize/test_serializer_registry.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/omnipy/modules/__init__.py b/src/omnipy/modules/__init__.py index 37d82614..94d3a7c0 100644 --- a/src/omnipy/modules/__init__.py +++ b/src/omnipy/modules/__init__.py @@ -7,10 +7,10 @@ def register_serializers(registry: IsSerializerRegistry): from .raw.serializers import (RawBytesDatasetToTarFileSerializer, RawStrDatasetToTarFileSerializer) - registry.register(PandasDatasetToTarFileSerializer) registry.register(RawStrDatasetToTarFileSerializer) registry.register(RawBytesDatasetToTarFileSerializer) registry.register(JsonDatasetToTarFileSerializer) + registry.register(PandasDatasetToTarFileSerializer) # TODO: Add module with helper classes/functions/takss to make it simpler to contact REST apis diff --git a/tests/data/helpers/mocks.py b/tests/data/helpers/mocks.py index 822df459..a6f59141 100644 --- a/tests/data/helpers/mocks.py +++ b/tests/data/helpers/mocks.py @@ -26,7 +26,8 @@ def get_output_file_suffix(cls) -> str: @classmethod def serialize(cls, number_dataset: NumberDataset) -> bytes | memoryview: - return ','.join(':'.join([k, str(v)]) for (k, v) in number_dataset.items()).encode('utf8') + return ','.join( + ':'.join([k, str(v.contents)]) for (k, v) in number_dataset.items()).encode('utf8') @classmethod def deserialize(cls, serialized_bytes: bytes, any_file_suffix=False) -> NumberDataset: diff --git a/tests/integration/novel/serialize/test_serializer_registry.py b/tests/integration/novel/serialize/test_serializer_registry.py index 1b3cd1b6..ac5e9331 100644 --- a/tests/integration/novel/serialize/test_serializer_registry.py +++ b/tests/integration/novel/serialize/test_serializer_registry.py @@ -20,9 +20,9 @@ def registry(): registry = SerializerRegistry() - registry.register(PandasDatasetToTarFileSerializer) registry.register(RawStrDatasetToTarFileSerializer) registry.register(JsonDatasetToTarFileSerializer) + registry.register(PandasDatasetToTarFileSerializer) return registry @@ -34,7 +34,7 @@ def test_serializer_registry_auto_detect_pandas_dataset(registry): def test_serializer_registry_auto_detect_json_table_dataset(registry): dataset, serializer = registry.auto_detect_tar_file_serializer(json_table_dataset) - assert serializer is PandasDatasetToTarFileSerializer + assert serializer is JsonDatasetToTarFileSerializer def test_serializer_registry_auto_detect_json_dataset(registry): From 9158357c50bb5e8d6052e16b56c5c61d1f01c8b6 Mon Sep 17 00:00:00 2001 From: Sveinung Gundersen Date: Tue, 19 Dec 2023 10:36:46 +0100 Subject: [PATCH 12/12] Cleanup --- src/omnipy/__init__.py | 4 +--- src/omnipy/api/protocols/public/hub.py | 2 +- src/omnipy/data/dataset.py | 4 ++-- src/omnipy/data/model.py | 4 ++-- src/omnipy/data/serializer.py | 25 ++++++++++++------------- src/omnipy/modules/json/serializers.py | 1 - src/omnipy/modules/raw/tasks.py | 3 ++- tests/data/test_dataset.py | 2 +- tests/modules/raw/test_tasks.py | 4 ++-- tests/util/__init__.py | 2 +- tests/util/helpers/__init__.py | 2 +- 11 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/omnipy/__init__.py b/src/omnipy/__init__.py index 696dba57..7ff32881 100644 --- a/src/omnipy/__init__.py +++ b/src/omnipy/__init__.py @@ -1,9 +1,6 @@ __version__ = '0.12.3' -import importlib import os -import sys -from typing import Optional from omnipy.data.dataset import Dataset from omnipy.data.model import Model @@ -137,6 +134,7 @@ 'convert_dataset_csv_to_pandas', 'convert_dataset_pandas_to_csv', 'convert_dataset_list_of_dicts_to_pandas', + 'extract_columns_as_files', 'decode_bytes', 'modify_all_lines', 'modify_datafile_contents', diff --git a/src/omnipy/api/protocols/public/hub.py b/src/omnipy/api/protocols/public/hub.py index c5d9c386..86f2b684 100644 --- a/src/omnipy/api/protocols/public/hub.py +++ b/src/omnipy/api/protocols/public/hub.py @@ -64,7 +64,7 @@ def __init__( local: IsEngine | None = None, # noqa prefect: IsEngine | None = None, # noqa registry: IsRunStateRegistry | None = None, # noqa - serializers: IsSerializerRegistry | None = None, + serializers: IsSerializerRegistry | None = None, # noqa root_log: IsRootLogObjects | None = None, # noqa *args: object, **kwargs: object) -> None: diff --git a/src/omnipy/data/dataset.py b/src/omnipy/data/dataset.py index 61c087e9..b0907dc6 100644 --- a/src/omnipy/data/dataset.py +++ b/src/omnipy/data/dataset.py @@ -428,7 +428,7 @@ def __repr__(self): print(get_calling_module_name()) if get_calling_module_name() in INTERACTIVE_MODULES: _waiting_for_terminal_repr(True) - return _table_repr(self) + return self._table_repr() return self._trad_repr() def _trad_repr(self) -> str: @@ -443,7 +443,7 @@ def _table_repr(self) -> str: humanize.naturalsize(objsize.get_deep_size(v))) for i, (k, v) in enumerate(self.items())), ('#', 'Data file name', 'Type', 'Length', 'Size (in memory)'), - tablefmt="rounded_outline", + tablefmt='rounded_outline', ) _waiting_for_terminal_repr(False) return ret diff --git a/src/omnipy/data/model.py b/src/omnipy/data/model.py index 85ec468c..9451b789 100644 --- a/src/omnipy/data/model.py +++ b/src/omnipy/data/model.py @@ -686,7 +686,7 @@ def _table_repr(self) -> str: bottom_structure), ), maxcolwidths=[header_column_width, data_column_width], - tablefmt="rounded_grid", + tablefmt='rounded_grid', ) else: out = tabulate( @@ -696,7 +696,7 @@ def _table_repr(self) -> str: os.linesep.join(new_structure_lines)), ), maxcolwidths=[header_column_width, data_column_width], - tablefmt="rounded_grid", + tablefmt='rounded_grid', ) _waiting_for_terminal_repr(False) diff --git a/src/omnipy/data/serializer.py b/src/omnipy/data/serializer.py index 71069f75..b07c0244 100644 --- a/src/omnipy/data/serializer.py +++ b/src/omnipy/data/serializer.py @@ -194,18 +194,17 @@ def load_from_tar_file_path_based_on_dataset_cls(self, else: log = print - with tarfile.open(tar_file_path, 'r:gz') as tarfile_obj: - serializers = self.detect_tar_file_serializers_from_dataset_cls(to_dataset) - if len(serializers) == 0: - log(f'No serializer for Dataset with type "{type(to_dataset)}" can be ' - f'determined. Aborting load.') - else: - for serializer in serializers: - log(f'Reading dataset from a gzipped tarpack at' - f' "{os.path.abspath(tar_file_path)}" with serializer type: ' - f'"{type(serializer)}"') + serializers = self.detect_tar_file_serializers_from_dataset_cls(to_dataset) + if len(serializers) == 0: + log(f'No serializer for Dataset with type "{type(to_dataset)}" can be ' + f'determined. Aborting load.') + else: + for serializer in serializers: + log(f'Reading dataset from a gzipped tarpack at' + f' "{os.path.abspath(tar_file_path)}" with serializer type: ' + f'"{type(serializer)}"') - with open(tar_file_path, 'rb') as tarfile_binary: - out_dataset = serializer.deserialize(tarfile_binary.read(), any) + with open(tar_file_path, 'rb') as tarfile_binary: + out_dataset = serializer.deserialize(tarfile_binary.read(), any) - return out_dataset + return out_dataset diff --git a/src/omnipy/modules/json/serializers.py b/src/omnipy/modules/json/serializers.py index e36a78c1..a4cab647 100644 --- a/src/omnipy/modules/json/serializers.py +++ b/src/omnipy/modules/json/serializers.py @@ -1,6 +1,5 @@ from typing import IO, Type -from omnipy.data.dataset import Dataset from omnipy.data.serializer import TarFileSerializer from ...api.protocols.public.data import IsDataset diff --git a/src/omnipy/modules/raw/tasks.py b/src/omnipy/modules/raw/tasks.py index 6aaa04de..15a528eb 100644 --- a/src/omnipy/modules/raw/tasks.py +++ b/src/omnipy/modules/raw/tasks.py @@ -16,7 +16,8 @@ def decode_bytes(data: bytes, encoding: str | None = None) -> str: detector = UniversalDetector() for line in data.splitlines(): detector.feed(line) - if detector.done: break + if detector.done: + break detector.close() result = detector.result diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index 2a9f0b31..37fc4ded 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -217,7 +217,7 @@ def test_get_items_with_tuple_or_list() -> None: == dataset['data_file_1',] == dataset[('data_file_1',)] == dataset[['data_file_1']] \ == Dataset[Model[int]](data_file_1=123) assert dataset[0, 2] == dataset[(0, 2)] == dataset[[0, 2]] \ - == dataset['data_file_1','data_file_3'] == dataset[('data_file_1', 'data_file_3')] \ + == dataset['data_file_1', 'data_file_3'] == dataset[('data_file_1', 'data_file_3')] \ == dataset[['data_file_1', 'data_file_3']] == dataset[[0, 'data_file_3']] \ == Dataset[Model[int]](data_file_1=dataset['data_file_1'], data_file_3=dataset['data_file_3']) \ diff --git a/tests/modules/raw/test_tasks.py b/tests/modules/raw/test_tasks.py index 5c3a5bb2..e9e72f46 100644 --- a/tests/modules/raw/test_tasks.py +++ b/tests/modules/raw/test_tasks.py @@ -27,12 +27,12 @@ class DecodeCaseInfo(NamedTuple): for case in test_cases: assert decode_bytes.run( Dataset[Model[bytes]](a=case.bytes_data), encoding=case.encoding)['a'].contents == \ - case.target_str + case.target_str for case in test_cases: assert decode_bytes.run( Dataset[Model[bytes]](a=case.bytes_data), encoding=None)['a'].contents == \ - case.target_str + case.target_str assert decode_bytes.run( Dataset[Model[bytes]](dict([(case.encoding, case.bytes_data) for case in test_cases])), diff --git a/tests/util/__init__.py b/tests/util/__init__.py index b4bb73e2..1e67b9f0 100644 --- a/tests/util/__init__.py +++ b/tests/util/__init__.py @@ -61,4 +61,4 @@ def inherited_parent_staticmethod() -> bool: return True def inherited_parent_method(self) -> bool: - return True \ No newline at end of file + return True diff --git a/tests/util/helpers/__init__.py b/tests/util/helpers/__init__.py index 225b93d9..2eee8efb 100644 --- a/tests/util/helpers/__init__.py +++ b/tests/util/helpers/__init__.py @@ -3,4 +3,4 @@ # For test_helpers::test_get_calling_module_name def other_module_call_get_calling_module_name() -> str: - return get_calling_module_name() \ No newline at end of file + return get_calling_module_name()