From bb81fdb9fb4af18263fd98c697ea2bf3b98a1498 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 13 Nov 2024 08:25:58 +0100 Subject: [PATCH 1/2] Added tar archive extraction -> extended reader constructor to consume archived files --- converter_app/models.py | 62 ++++++++++++++++++++++++- converter_app/readers/asc_zip.py | 4 +- converter_app/readers/csv_reader.py | 4 +- converter_app/readers/ebl.py | 4 +- converter_app/readers/excel.py | 4 +- converter_app/readers/gcd.py | 4 +- converter_app/readers/helper/base.py | 19 ++++++-- converter_app/readers/helper/reader.py | 31 +++++++++---- converter_app/readers/jasco.py | 4 +- converter_app/readers/json_reader.py | 4 +- converter_app/readers/old_excel.py | 4 +- converter_app/readers/pdf.py | 4 +- converter_app/readers/sec.py | 4 +- converter_app/readers/tif.py | 4 +- converter_app/readers/uxd_reader.py | 4 +- converter_app/readers/xml_reader.py | 4 +- converter_app/readers/xrdml.py | 4 +- test_static/a/a.tar.gz | Bin 0 -> 161 bytes test_static/a/a.txt.0 | 15 ++++++ test_static/tests.py | 23 +++++++++ 20 files changed, 164 insertions(+), 42 deletions(-) create mode 100644 test_static/a/a.tar.gz create mode 100644 test_static/a/a.txt.0 diff --git a/converter_app/models.py b/converter_app/models.py index 74384ebb..b365b028 100644 --- a/converter_app/models.py +++ b/converter_app/models.py @@ -1,6 +1,9 @@ import json import logging +import os import pathlib +import tarfile +import tempfile import uuid from collections import defaultdict @@ -8,6 +11,7 @@ import magic from flask import current_app +from werkzeug.datastructures import FileStorage from converter_app.utils import check_uuid @@ -226,7 +230,15 @@ def name(self): """ :return: The origin file name """ - return self.fp.filename + return os.path.basename(self.fp.filename) + + @property + def file_path(self): + """ + This is only required for subfiles of a tar archive file + :return: The origin file path + """ + return os.path.basename(self.fp.filename) def 
features(self, name): """ @@ -246,3 +258,51 @@ def set_features(self, name, feature_content): :param feature_content: Feature content """ self._features[name] = feature_content + + @property + def is_tar_archive(self) -> bool: + """ + Checks if the file is a tar archive + :return: True if the file is a tar archive + """ + return self.name.endswith(".gz") or self.name.endswith(".xz") or self.name.endswith(".tar") + +def extract_tar_archive(file: File, temp_dir: str) -> list[File]: + """ + If the file is a tar archive, this function extracts it and returns a list of all files + :param file: Input file from the client + :return: A list of all files extracted + """ + if not file.is_tar_archive: + return [] + file_list = [] + with tempfile.NamedTemporaryFile(delete=True) as temp_pdf: + try: + # Save the contents of FileStorage to the temporary file + file.fp.save(temp_pdf.name) + if file.name.endswith(".gz"): + mode = "r:gz" + elif file.name.endswith(".xz"): + mode = "r:xz" + elif file.name.endswith(".tar"): + mode = "r:" + else: + return [] + with tarfile.open(temp_pdf.name, mode) as tar: + tar.extractall(temp_dir) + tar.close() + except ValueError: + return [] + + for root, _, files in os.walk(temp_dir, topdown=False): + for name in files: + path_file_name = os.path.join(root, name) + content_type = magic.Magic(mime=True).from_file(path_file_name) + f = open(path_file_name, 'rb') + fs = FileStorage(stream=f, filename=path_file_name, + content_type=content_type) + file_list.append(File(fs)) + + return file_list + + diff --git a/converter_app/readers/asc_zip.py b/converter_app/readers/asc_zip.py index 7b189716..1b7db7bd 100644 --- a/converter_app/readers/asc_zip.py +++ b/converter_app/readers/asc_zip.py @@ -16,8 +16,8 @@ class AscZipReader(Reader): identifier = 'asc_zip_reader' priority = 10 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.filedata = {} # two or more chars in row 
diff --git a/converter_app/readers/csv_reader.py b/converter_app/readers/csv_reader.py index 18cb0082..e88557d8 100644 --- a/converter_app/readers/csv_reader.py +++ b/converter_app/readers/csv_reader.py @@ -15,8 +15,8 @@ class CSVReader(Reader): identifier = 'csv_reader' priority = 100 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.lines = None self.rows = None self.table_min_rows = 20 diff --git a/converter_app/readers/ebl.py b/converter_app/readers/ebl.py index eb3d7307..1ea0a330 100644 --- a/converter_app/readers/ebl.py +++ b/converter_app/readers/ebl.py @@ -22,8 +22,8 @@ class _State(Enum): POS_TABLE_D = 4 END = 5 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.lines = None self.pre_header = None self.pre_script = None diff --git a/converter_app/readers/excel.py b/converter_app/readers/excel.py index 7fd3c51c..5d49e866 100644 --- a/converter_app/readers/excel.py +++ b/converter_app/readers/excel.py @@ -16,8 +16,8 @@ class ExcelReader(Reader): identifier = 'excel_reader' priority = 15 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.wb = None def check(self): diff --git a/converter_app/readers/gcd.py b/converter_app/readers/gcd.py index cbfee9d9..19f7d20d 100644 --- a/converter_app/readers/gcd.py +++ b/converter_app/readers/gcd.py @@ -15,8 +15,8 @@ class GcdReader(Reader): identifier = 'gcd_reader' priority = 5 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.lines = None self._number_of_ch = 0 diff --git a/converter_app/readers/helper/base.py b/converter_app/readers/helper/base.py index 106778b8..744ee50f 100644 --- a/converter_app/readers/helper/base.py +++ b/converter_app/readers/helper/base.py @@ 
-55,17 +55,28 @@ def add_unique(self, key: str, value: any): class Reader: """ Base reader. Any reader needs to extend this abstract reader. + + Attributes: + identifier (str): Identifier of the reader. + metadata (dict): Auto-generated based on the conversion results. + tables (int): Auto-generated based on the conversion results. + file (converter_app.models.File): Received File from the client (Chemotion) + file_content ([]converter_app.models.File): file_content contains all files archived in the 'file' if it is a tarball file. + is_tar_ball (bool): True if 'file' is a tarball file. """ + float_pattern = re.compile(r'[-+]?[0-9]*[.,]?[0-9]+(?:[eE][-+]?[0-9]+)?\s*') float_de_pattern = re.compile(r'(-?[\d.]+,\d*[eE+\-\d]*)') float_us_pattern = re.compile(r'(-?[\d,]+.\d*[eE+\-\d]*)') - def __init__(self, file): - self.empty_values = ['', 'n.a.'] + def __init__(self, file, *tar_content): + self._empty_values = ['', 'n.a.'] self.identifier = None self.metadata = None self.tables = None self.file = file + self.file_content = tar_content + self.is_tar_ball = len(tar_content) > 0 @property def as_dict(self): @@ -79,7 +90,7 @@ def as_dict(self): def check(self): """ - Abstract method check if the reader matches a file + Abstract method check if the reader matches a file list :return: [bool] true if the Reader checks a file """ raise NotImplementedError @@ -175,7 +186,7 @@ def get_shape(self, row) -> list: shape.append(None) else: cell = str(cell).strip() - if cell in self.empty_values: + if cell in self._empty_values: shape.append('') elif self.float_pattern.fullmatch(cell): shape.append('f') diff --git a/converter_app/readers/helper/reader.py b/converter_app/readers/helper/reader.py index 47d5291b..8f89736d 100644 --- a/converter_app/readers/helper/reader.py +++ b/converter_app/readers/helper/reader.py @@ -1,7 +1,9 @@ +import inspect +import tempfile from collections import OrderedDict from converter_app.converters import logger -from converter_app.models import File
+from converter_app.models import File, extract_tar_archive class Readers: @@ -57,15 +59,26 @@ def match_reader(self, file: File): logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s', file.name, file.content_type, file.mime_type, file.encoding) - for _identifier, reader in self.readers.items(): - reader = reader(file) - result = reader.check() + with tempfile.TemporaryDirectory() as tmpdir: - logger.debug('For reader %s -> result=%s', reader.__class__.__name__, result) + archive_file_list = extract_tar_archive(file, tmpdir) - # reset file pointer and return the reader it is the one - file.fp.seek(0) - if result: - return reader + for _identifier, reader in self.readers.items(): + params = inspect.signature(reader).parameters + if len(params) > 1: + reader = reader(file, *archive_file_list) + else: + reader = reader(file) + + result = reader.check() + + logger.debug('For reader %s -> result=%s', reader.__class__.__name__, result) + + # reset file pointer and return the reader it is the one + file.fp.seek(0) + for archive_file in archive_file_list: + archive_file.fp.seek(0) + if result: + return reader return None diff --git a/converter_app/readers/jasco.py b/converter_app/readers/jasco.py index 08649647..223ca657 100644 --- a/converter_app/readers/jasco.py +++ b/converter_app/readers/jasco.py @@ -13,8 +13,8 @@ class JascoReader(Reader): identifier = 'jasco_reader' priority = 99 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.lines = None self.header_length = 8 diff --git a/converter_app/readers/json_reader.py b/converter_app/readers/json_reader.py index b0f81127..2604c71b 100644 --- a/converter_app/readers/json_reader.py +++ b/converter_app/readers/json_reader.py @@ -14,8 +14,8 @@ class JsonReader(Reader): identifier = 'json_reader' priority = 20 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + 
super().__init__(file, *tar_content) self.file_as_dict = {} self._all_tables = {} self.table = None diff --git a/converter_app/readers/old_excel.py b/converter_app/readers/old_excel.py index 2d5d3f9c..e8efdf17 100644 --- a/converter_app/readers/old_excel.py +++ b/converter_app/readers/old_excel.py @@ -14,8 +14,8 @@ class OldExcelReader(Reader): identifier = 'old_excel_reader' priority = 16 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.wb = None def check(self): diff --git a/converter_app/readers/pdf.py b/converter_app/readers/pdf.py index 869d0965..e2b707e3 100644 --- a/converter_app/readers/pdf.py +++ b/converter_app/readers/pdf.py @@ -14,8 +14,8 @@ class PdfReader(Reader): identifier = 'pdf_reader' priority = 100 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self.text_data = None def check(self): diff --git a/converter_app/readers/sec.py b/converter_app/readers/sec.py index 3de76f91..fa6995e5 100644 --- a/converter_app/readers/sec.py +++ b/converter_app/readers/sec.py @@ -15,8 +15,8 @@ class SecReader(Reader): identifier = 'sec_reader' priority = 95 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self._has_header = False self._has_first_value = False self._is_table_empty = True diff --git a/converter_app/readers/tif.py b/converter_app/readers/tif.py index 0e63b114..47de418e 100644 --- a/converter_app/readers/tif.py +++ b/converter_app/readers/tif.py @@ -16,8 +16,8 @@ class TifReader(Reader): priority = 96 - def __init__(self, file): - super().__init__(file) + def __init__(self, file, *tar_content): + super().__init__(file, *tar_content) self._parsed_values = None def check(self): diff --git a/converter_app/readers/uxd_reader.py b/converter_app/readers/uxd_reader.py index 90351d58..3a1e5dc3 100644 --- 
a/converter_app/readers/uxd_reader.py +++ b/converter_app/readers/uxd_reader.py @@ -17,8 +17,8 @@ class UXDReader(Reader): identifier = 'uxd_reader' priority = 10 - def __init__(self, file: File): - super().__init__(file) + def __init__(self, file: File, *tar_content): + super().__init__(file, *tar_content) self._file_extensions = ['.uxd'] self._table = None self._version = 2 diff --git a/converter_app/readers/xml_reader.py b/converter_app/readers/xml_reader.py index 95b31122..8479b188 100644 --- a/converter_app/readers/xml_reader.py +++ b/converter_app/readers/xml_reader.py @@ -17,8 +17,8 @@ class XMLReader(Reader): identifier = 'xml_reader' priority = 10 - def __init__(self, file: File): - super().__init__(file) + def __init__(self, file: File, *tar_content): + super().__init__(file, *tar_content) self._file_extensions = ['.xml'] self._table = None self._data_tables = [] diff --git a/converter_app/readers/xrdml.py b/converter_app/readers/xrdml.py index cfac3985..e4364b4e 100644 --- a/converter_app/readers/xrdml.py +++ b/converter_app/readers/xrdml.py @@ -12,8 +12,8 @@ class XRDMLReader(XMLReader): identifier = 'xrdml_reader' - def __init__(self, file: File): - super().__init__(file) + def __init__(self, file: File, *tar_content): + super().__init__(file, *tar_content) self._file_extensions = ['.xrdml'] self._step_sizes = {} diff --git a/test_static/a/a.tar.gz b/test_static/a/a.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..fbb6a4880c75603e446455a153884118a3ebb4ab GIT binary patch literal 161 zcmV;S0ABweiwFP!00000|Lo7f3c@f92H>9a6nld4$Hi_}d-5$tAryqEuvGBzWm04p z5f4LzewRPAIpmLRzULNhO90@#EXn;I>wIE($I`QS>39bR?jYitZLcbBmgDSzbDZ<( z@_z5rf9dkdKcOLnK&nYKnU#TP7>LsHKy-}LbZrh}7$ebYHce`_Y!UssMO9T*^{qz$ P00960YvQer00;m87tl>F literal 0 HcmV?d00001 diff --git a/test_static/a/a.txt.0 b/test_static/a/a.txt.0 new file mode 100644 index 00000000..7bedf43e --- /dev/null +++ b/test_static/a/a.txt.0 @@ -0,0 +1,15 @@ +dBBBB +ddfddfss +ddfddf + +88 + +ffss + + 
+Hallo A_B_a +allo A_B_a +sdfdss +ffs + diff --git a/test_static/tests.py b/test_static/tests.py index 1328162e..6efb0ebc 100644 --- a/test_static/tests.py +++ b/test_static/tests.py @@ -4,8 +4,10 @@ import pytest from flask import Flask from flask.testing import FlaskClient +from werkzeug.datastructures import FileStorage from converter_app.app import create_app +from converter_app.models import File, extract_tar_archive res_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../test_files/ConverterAutoResults')) src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../test_files/ChemConverter')) @@ -50,3 +52,24 @@ def test_settings_list(client: FlaskClient): response = client.get('/') assert response.status_code == 200 + + + +def test_tar_unzip(client: FlaskClient): + """ + Test tar archive extraction + :param client: Flask client + """ + + with open(os.path.join(os.path.dirname(__file__), 'a/a.tar.gz'), 'rb') as tar: + fs = FileStorage(stream=tar, filename='a/a.tar.gz', + content_type='application/tar') + file = File(fs) + assert file.is_tar_archive + assert file.name == 'a.tar.gz' + with tempfile.TemporaryDirectory() as tmpdirname: + with open(os.path.join(os.path.dirname(__file__), 'a/a.txt.0'), 'r') as tf: + archive = extract_tar_archive(file, tmpdirname) + assert len(archive) == 1 + assert archive[0].name == 'a.txt.0' + assert archive[0].string == tf.read() From 38963974b2047e6fa6d59e4b7b1d48f284de7197 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 17 Dec 2024 12:15:19 +0100 Subject: [PATCH 2/2] Fixed file_path property of models.File Object --- converter_app/models.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/converter_app/models.py b/converter_app/models.py index b365b028..7fdb4d76 100644 --- a/converter_app/models.py +++ b/converter_app/models.py @@ -238,7 +238,7 @@ def file_path(self): This is only required for subfiles of a tar archive file :return: The origin file path """ - return
os.path.basename(self.fp.filename) + return os.path.dirname(self.fp.filename) def features(self, name): """ @@ -267,6 +267,7 @@ def is_tar_archive(self) -> bool: """ return self.name.endswith(".gz") or self.name.endswith(".xz") or self.name.endswith(".tar") + def extract_tar_archive(file: File, temp_dir: str) -> list[File]: """ If the file is a tar archive, this function extracts it and returns a list of all files @@ -276,10 +277,10 @@ def extract_tar_archive(file: File, temp_dir: str) -> list[File]: if not file.is_tar_archive: return [] file_list = [] - with tempfile.NamedTemporaryFile(delete=True) as temp_pdf: + with tempfile.NamedTemporaryFile(delete=True) as temp_archive: try: # Save the contents of FileStorage to the temporary file - file.fp.save(temp_pdf.name) + file.fp.save(temp_archive.name) if file.name.endswith(".gz"): mode = "r:gz" elif file.name.endswith(".xz"): @@ -288,7 +289,7 @@ def extract_tar_archive(file: File, temp_dir: str) -> list[File]: mode = "r:" else: return [] - with tarfile.open(temp_pdf.name, mode) as tar: + with tarfile.open(temp_archive.name, mode) as tar: tar.extractall(temp_dir) tar.close() except ValueError: @@ -304,5 +305,3 @@ def extract_tar_archive(file: File, temp_dir: str) -> list[File]: file_list.append(File(fs)) return file_list - -