Skip to content

Commit

Permalink
Merge pull request #132 from ComPlat/124-handle-tar-files-as-a-list-o…
Browse files Browse the repository at this point in the history
…f-files

124 handle tar files as a list of files
  • Loading branch information
StarmanMartin authored Dec 17, 2024
2 parents 8ec09e5 + 3896397 commit a9ad4e7
Show file tree
Hide file tree
Showing 20 changed files with 163 additions and 42 deletions.
61 changes: 60 additions & 1 deletion converter_app/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import json
import logging
import os
import pathlib
import tarfile
import tempfile
import uuid
from collections import defaultdict

from pathlib import Path

import magic
from flask import current_app
from werkzeug.datastructures import FileStorage

from converter_app.utils import check_uuid

Expand Down Expand Up @@ -226,7 +230,15 @@ def name(self):
"""
:return: The origin file name
"""
return self.fp.filename
return os.path.basename(self.fp.filename)

@property
def file_path(self):
    """
    Directory part of the original file name.

    Only sub-files of a tar archive carry a directory component here.
    :return: The origin file path (everything before the last path separator)
    """
    directory, _base_name = os.path.split(self.fp.filename)
    return directory

def features(self, name):
"""
Expand All @@ -246,3 +258,50 @@ def set_features(self, name, feature_content):
:param feature_content: Feature content
"""
self._features[name] = feature_content

@property
def is_tar_archive(self) -> bool:
    """
    Tell whether the file name carries one of the suffixes treated as a tar archive.

    :return: True if the name ends with ".gz", ".xz" or ".tar"
    """
    archive_suffixes = (".gz", ".xz", ".tar")
    return self.name.endswith(archive_suffixes)


def extract_tar_archive(file: File, temp_dir: str) -> list[File]:
    """
    If the file is a tar archive, this function extracts it and returns a list of all files.

    The upload is first written to a temporary file, unpacked into ``temp_dir`` and every
    file found below ``temp_dir`` afterwards is wrapped in a ``File`` object.

    :param file: Input file from the client
    :param temp_dir: Directory the archive is extracted into. The returned files are
        opened from this directory, so it has to outlive their use.
    :return: A list of all files extracted. The list is empty if ``file`` is not named
        like a tar archive or cannot be read as one.
    """
    if not file.is_tar_archive:
        return []

    # Map the file name suffix to the matching tarfile open mode.
    suffix_modes = ((".gz", "r:gz"), (".xz", "r:xz"), (".tar", "r:"))
    mode = next((tar_mode for suffix, tar_mode in suffix_modes
                 if file.name.endswith(suffix)), None)
    if mode is None:
        return []

    with tempfile.NamedTemporaryFile(delete=True) as temp_archive:
        # Save the contents of FileStorage to the temporary file
        file.fp.save(temp_archive.name)
        # save() consumed the upload stream; rewind it so that the readers
        # checked afterwards still see the complete archive.
        file.fp.seek(0)
        try:
            with tarfile.open(temp_archive.name, mode) as tar:
                if hasattr(tarfile, "data_filter"):
                    # The archive comes from the client: the 'data' filter rejects
                    # members that would be written outside of temp_dir.
                    tar.extractall(temp_dir, filter="data")
                else:
                    # NOTE(review): this Python version has no extraction filters,
                    # archive members are extracted without path checks.
                    tar.extractall(temp_dir)
        except (ValueError, tarfile.TarError):
            # A name like 'data.gz' passes the suffix check without being a tarball;
            # tarfile reports that as ReadError (a TarError), not as ValueError.
            return []

    file_list = []
    # One detector is enough for all extracted files.
    mime_detector = magic.Magic(mime=True)
    for root, _, files in os.walk(temp_dir, topdown=False):
        for name in files:
            path_file_name = os.path.join(root, name)
            content_type = mime_detector.from_file(path_file_name)
            # The stream is handed over to FileStorage and intentionally left open;
            # closing it is up to whoever consumes the returned File objects.
            stream = open(path_file_name, 'rb')
            fs = FileStorage(stream=stream, filename=path_file_name,
                             content_type=content_type)
            file_list.append(File(fs))

    return file_list
4 changes: 2 additions & 2 deletions converter_app/readers/asc_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ class AscZipReader(Reader):
identifier = 'asc_zip_reader'
priority = 10

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.filedata = {}

# two or more chars in row
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class CSVReader(Reader):
identifier = 'csv_reader'
priority = 100

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.lines = None
self.rows = None
self.table_min_rows = 20
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/ebl.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class _State(Enum):
POS_TABLE_D = 4
END = 5

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.lines = None
self.pre_header = None
self.pre_script = None
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ class ExcelReader(Reader):
identifier = 'excel_reader'
priority = 15

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.wb = None
self._table_row_meta = Table()
self._table_col_meta = Table()
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/gcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class GcdReader(Reader):
identifier = 'gcd_reader'
priority = 5

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.lines = None

self._number_of_ch = 0
Expand Down
19 changes: 15 additions & 4 deletions converter_app/readers/helper/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,28 @@ def add_unique(self, key: str, value: any):
class Reader:
"""
Base reader. Any reader needs to extend this abstract reader.
Attributes:
identifier (str): Name identifying the reader (e.g. 'csv_reader').
metadata (dict): Auto generated based on the conversion results.
tables (int): Auto generated based on the conversion results.
file (converter_app.models.File): Received File from the client (Chemotion)
file_content ([]converter_app.models.File): file_content contains all files archived in the 'file' if it is a tarball file.
is_tar_ball (bool): True if 'file' is a tarball file.
"""

float_pattern = re.compile(r'[-+]?[0-9]*[.,]?[0-9]+(?:[eE][-+]?[0-9]+)?\s*')
float_de_pattern = re.compile(r'(-?[\d.]+,\d*[eE+\-\d]*)')
float_us_pattern = re.compile(r'(-?[\d,]+.\d*[eE+\-\d]*)')

def __init__(self, file):
self.empty_values = ['', 'n.a.']
def __init__(self, file, *tar_content):
self._empty_values = ['', 'n.a.']
self.identifier = None
self.metadata = None
self.tables = None
self.file = file
self.file_content = tar_content
self.is_tar_ball = len(tar_content) > 0

@property
def as_dict(self):
Expand All @@ -79,7 +90,7 @@ def as_dict(self):

def check(self):
"""
Abstract method check if the reader matches a file
Abstract method check if the reader matches a filelist
:return: [bool] true if the Reader checks a file
"""
raise NotImplementedError
Expand Down Expand Up @@ -178,7 +189,7 @@ def get_shape(self, row) -> list:
shape.append('f')
continue
cell = str(cell).strip()
if cell in self.empty_values:
if cell in self._empty_values:
shape.append('')
elif self.float_pattern.fullmatch(cell):
shape.append('f')
Expand Down
31 changes: 22 additions & 9 deletions converter_app/readers/helper/reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import inspect
import tempfile
from collections import OrderedDict

from converter_app.converters import logger
from converter_app.models import File
from converter_app.models import File, extract_tar_archive


class Readers:
Expand Down Expand Up @@ -57,15 +59,26 @@ def match_reader(self, file: File):
logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s',
file.name, file.content_type, file.mime_type, file.encoding)

for _identifier, reader in self.readers.items():
reader = reader(file)
result = reader.check()
with tempfile.TemporaryDirectory() as tmpdir:

logger.debug('For reader %s -> result=%s', reader.__class__.__name__, result)
archive_file_list = extract_tar_archive(file, tmpdir)

# reset file pointer and return the reader it is the one
file.fp.seek(0)
if result:
return reader
for _identifier, reader in self.readers.items():
params = inspect.signature(reader).parameters
if len(params) > 1:
reader = reader(file, *archive_file_list)
else:
reader = reader(file)

result = reader.check()

logger.debug('For reader %s -> result=%s', reader.__class__.__name__, result)

# reset file pointer and return the reader it is the one
file.fp.seek(0)
for archive_file in archive_file_list:
archive_file.fp.seek(0)
if result:
return reader

return None
4 changes: 2 additions & 2 deletions converter_app/readers/jasco.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class JascoReader(Reader):
identifier = 'jasco_reader'
priority = 99

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.lines = None
self.header_length = 8

Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/json_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class JsonReader(Reader):
identifier = 'json_reader'
priority = 20

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.file_as_dict = {}
self._all_tables = {}
self.table = None
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/old_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class OldExcelReader(Reader):
identifier = 'old_excel_reader'
priority = 16

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.wb = None

def check(self):
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class PdfReader(Reader):
identifier = 'pdf_reader'
priority = 100

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self.text_data = None

def check(self):
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/sec.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class SecReader(Reader):
identifier = 'sec_reader'
priority = 95

def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self._has_header = False
self._has_first_value = False
self._is_table_empty = True
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/tif.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ class TifReader(Reader):
priority = 96


def __init__(self, file):
super().__init__(file)
def __init__(self, file, *tar_content):
super().__init__(file, *tar_content)
self._parsed_values = None

def check(self):
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/uxd_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class UXDReader(Reader):
identifier = 'uxd_reader'
priority = 10

def __init__(self, file: File):
super().__init__(file)
def __init__(self, file: File, *tar_content):
super().__init__(file, *tar_content)
self._file_extensions = ['.uxd']
self._table = None
self._version = 2
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/xml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class XMLReader(Reader):
identifier = 'xml_reader'
priority = 10

def __init__(self, file: File):
super().__init__(file)
def __init__(self, file: File, *tar_content):
super().__init__(file, *tar_content)
self._file_extensions = ['.xml']
self._table = None
self._data_tables = []
Expand Down
4 changes: 2 additions & 2 deletions converter_app/readers/xrdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class XRDMLReader(XMLReader):

identifier = 'xrdml_reader'

def __init__(self, file: File):
super().__init__(file)
def __init__(self, file: File, *tar_content):
super().__init__(file, *tar_content)
self._file_extensions = ['.xrdml']
self._step_sizes = {}

Expand Down
Binary file added test_static/a/a.tar.gz
Binary file not shown.
15 changes: 15 additions & 0 deletions test_static/a/a.txt.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
dBBBB
ddfddfss
ddfddf

88

ffss


Hallo A_B_a

allo A_B_a
sdfdss
ffs

23 changes: 23 additions & 0 deletions test_static/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
import pytest
from flask import Flask
from flask.testing import FlaskClient
from werkzeug.datastructures import FileStorage

from converter_app.app import create_app
from converter_app.models import File, extract_tar_archive

res_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../test_files/ConverterAutoResults'))
src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../test_files/ChemConverter'))
Expand Down Expand Up @@ -50,3 +52,24 @@ def test_settings_list(client: FlaskClient):

response = client.get('/')
assert response.status_code == 200



def test_tar_unzip(client: FlaskClient):
    """
    Test that a tar.gz file is recognised as archive and unpacked into its member files
    :param client: Flask client
    """
    static_dir = os.path.dirname(__file__)

    # Read the reference content up front with an explicit encoding so that the
    # comparison does not depend on the platform default.
    with open(os.path.join(static_dir, 'a/a.txt.0'), 'r', encoding='utf-8') as tf:
        expected_content = tf.read()

    with open(os.path.join(static_dir, 'a/a.tar.gz'), 'rb') as tar:
        fs = FileStorage(stream=tar, filename='a/a.tar.gz',
                         content_type='application/tar')
        file = File(fs)
        assert file.is_tar_archive
        assert file.name == 'a.tar.gz'
        with tempfile.TemporaryDirectory() as tmpdirname:
            archive = extract_tar_archive(file, tmpdirname)
            try:
                assert len(archive) == 1
                assert archive[0].name == 'a.txt.0'
                assert archive[0].string == expected_content
            finally:
                # extract_tar_archive leaves the extracted files open; close them
                # before the temporary directory is removed.
                for extracted in archive:
                    extracted.fp.close()

0 comments on commit a9ad4e7

Please sign in to comment.