Skip to content

Commit

Permalink
fix lint
Browse files Browse the repository at this point in the history
  • Loading branch information
jiashenC committed Sep 8, 2023
1 parent f8dde26 commit 50cc8b1
Show file tree
Hide file tree
Showing 12 changed files with 60 additions and 38 deletions.
2 changes: 1 addition & 1 deletion evadb/readers/decord_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
import numpy as np

from evadb.catalog.catalog_type import VideoColumnName
from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.constants import AUDIORATE, IFRAMES
from evadb.expression.abstract_expression import AbstractExpression
from evadb.expression.expression_utils import extract_range_list_from_predicate
from evadb.readers.abstract_reader import AbstractReader
from evadb.utils.generic_utils import try_to_import_decord
from evadb.utils.logging_manager import logger
from evadb.catalog.sql_config import ROW_NUM_COLUMN


class DecordReader(AbstractReader):
Expand Down
8 changes: 6 additions & 2 deletions evadb/readers/document/document_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
from pathlib import Path
from typing import Dict, Iterator

from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.readers.abstract_reader import AbstractReader
from evadb.readers.document.registry import (
_lazy_import_loader,
_lazy_import_text_splitter,
)
from evadb.catalog.sql_config import ROW_NUM_COLUMN


class DocumentReader(AbstractReader):
Expand Down Expand Up @@ -50,5 +50,9 @@ def _read(self) -> Iterator[Dict]:
for chunk_id, row in enumerate(
langchain_text_splitter.split_documents([data])
):
yield {"chunk_id": chunk_id, "data": row.page_content, ROW_NUM_COLUMN: row_num}
yield {
"chunk_id": chunk_id,
"data": row.page_content,
ROW_NUM_COLUMN: row_num,
}
row_num += 1
2 changes: 1 addition & 1 deletion evadb/readers/pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
# limitations under the License.
from typing import Dict, Iterator

from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.readers.abstract_reader import AbstractReader
from evadb.utils.generic_utils import try_to_import_fitz
from evadb.catalog.sql_config import ROW_NUM_COLUMN


class PDFReader(AbstractReader):
Expand Down
6 changes: 4 additions & 2 deletions evadb/storage/document_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from typing import Iterator

from evadb.catalog.models.table_catalog import TableCatalogEntry
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC
from evadb.database import EvaDBDatabase
from evadb.models.storage.batch import Batch
from evadb.readers.document.document_reader import DocumentReader
from evadb.storage.abstract_media_storage_engine import AbstractMediaStorageEngine
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC


class DocumentStorageEngine(AbstractMediaStorageEngine):
Expand All @@ -39,5 +39,7 @@ def read(self, table: TableCatalogEntry, chunk_params: dict) -> Iterator[Batch]:
for batch in reader.read():
batch.frames[table.columns[0].name] = row_id
batch.frames[table.columns[1].name] = str(file_name)
batch.frames[ROW_NUM_COLUMN] = row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
batch.frames[ROW_NUM_COLUMN] = (
row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
)
yield batch
2 changes: 1 addition & 1 deletion evadb/storage/image_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from typing import Iterator

from evadb.catalog.models.table_catalog import TableCatalogEntry
from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.database import EvaDBDatabase
from evadb.models.storage.batch import Batch
from evadb.readers.image.opencv_image_reader import CVImageReader
from evadb.storage.abstract_media_storage_engine import AbstractMediaStorageEngine
from evadb.catalog.sql_config import ROW_NUM_COLUMN


class ImageStorageEngine(AbstractMediaStorageEngine):
Expand Down
6 changes: 4 additions & 2 deletions evadb/storage/pdf_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
from typing import Iterator

from evadb.catalog.models.table_catalog import TableCatalogEntry
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC
from evadb.database import EvaDBDatabase
from evadb.models.storage.batch import Batch
from evadb.readers.pdf_reader import PDFReader
from evadb.storage.abstract_media_storage_engine import AbstractMediaStorageEngine
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC


class PDFStorageEngine(AbstractMediaStorageEngine):
Expand All @@ -37,5 +37,7 @@ def read(self, table: TableCatalogEntry) -> Iterator[Batch]:
for batch in reader.read():
batch.frames[table.columns[0].name] = row_id
batch.frames[table.columns[1].name] = str(file_name)
batch.frames[ROW_NUM_COLUMN] = row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
batch.frames[ROW_NUM_COLUMN] = (
row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
)
yield batch
16 changes: 13 additions & 3 deletions evadb/storage/sqlite_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ def create(self, table: TableCatalogEntry, **kwargs):

# During table creation, assume row_id is automatically handled by
# the sqlalchemy engine.
table_columns = [col for col in table.columns if (col.name != IDENTIFIER_COLUMN and col.name != ROW_NUM_COLUMN)]
table_columns = [
col
for col in table.columns
if (col.name != IDENTIFIER_COLUMN and col.name != ROW_NUM_COLUMN)
]
sqlalchemy_schema = SchemaUtils.xform_to_sqlalchemy_schema(table_columns)
attr_dict.update(sqlalchemy_schema)

Expand Down Expand Up @@ -149,12 +153,18 @@ def write(self, table: TableCatalogEntry, rows: Batch):
# the sqlalchemy engine. Another assumption we make here is the
# updated data need not to take care of row_id.
table_columns = [
col for col in table.columns if (col.name != IDENTIFIER_COLUMN and col.name != ROW_NUM_COLUMN)
col
for col in table.columns
if (col.name != IDENTIFIER_COLUMN and col.name != ROW_NUM_COLUMN)
]

# Todo: validate the data type before inserting into the table
for record in rows.frames.values:
row_data = {col: record[idx] for idx, col in enumerate(columns) if col != ROW_NUM_COLUMN}
row_data = {
col: record[idx]
for idx, col in enumerate(columns)
if col != ROW_NUM_COLUMN
}
data.append(self._dict_to_sql_row(row_data, table_columns))
self._sql_session.execute(table_to_update.insert(), data)
self._sql_session.commit()
Expand Down
6 changes: 4 additions & 2 deletions evadb/storage/video_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
from typing import Iterator

from evadb.catalog.models.table_catalog import TableCatalogEntry
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC
from evadb.database import EvaDBDatabase
from evadb.expression.abstract_expression import AbstractExpression
from evadb.models.storage.batch import Batch
from evadb.readers.decord_reader import DecordReader
from evadb.storage.abstract_media_storage_engine import AbstractMediaStorageEngine
from evadb.catalog.sql_config import ROW_NUM_COLUMN, ROW_NUM_MAGIC


class DecordStorageEngine(AbstractMediaStorageEngine):
Expand Down Expand Up @@ -59,5 +59,7 @@ def read(
for batch in reader.read():
batch.frames[table.columns[0].name] = row_id
batch.frames[table.columns[1].name] = str(video_file_name)
batch.frames[ROW_NUM_COLUMN] = row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
batch.frames[ROW_NUM_COLUMN] = (
row_id * ROW_NUM_MAGIC + batch.frames[ROW_NUM_COLUMN]
)
yield batch
14 changes: 2 additions & 12 deletions test/integration_tests/long/test_single_document_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,11 @@
import os
import unittest
from copy import deepcopy
from test.markers import gpu_skip_marker, qdrant_skip_marker
from test.util import (
create_sample_image,
get_evadb_for_testing,
load_functions_for_testing,
shutdown_ray,
)
from test.util import get_evadb_for_testing, load_functions_for_testing

import numpy as np
import pandas as pd
import pytest

from evadb.models.storage.batch import Batch
from evadb.server.command_handler import execute_query_fetch_all
from evadb.storage.storage_engine import StorageEngine
from evadb.utils.generic_utils import try_to_import_fitz


Expand Down Expand Up @@ -97,7 +87,7 @@ def test_single_pdf_should_work(self):

# Ensure index scan is used.
query = """
SELECT data
SELECT data
FROM MyPDF
ORDER BY Similarity(SentenceFeatureExtractor('{}'), SentenceFeatureExtractor(data))
LIMIT 1
Expand Down
28 changes: 19 additions & 9 deletions test/unit_tests/readers/test_decord_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,11 @@ def test_should_sample_only_iframe(self):
batches = list(video_loader.read())

expected = self._batches_to_reader_convertor(
create_dummy_batches(filters=[i for i in range(0, NUM_FRAMES, k)], is_from_storage=True)
create_dummy_batches(
filters=[i for i in range(0, NUM_FRAMES, k)], is_from_storage=True
)
)

print(batches[0].frames)
print(expected[0].frames)

self.assertEqual(batches, expected)

def test_should_sample_every_k_frame_with_predicate(self):
Expand All @@ -96,7 +95,10 @@ def test_should_sample_every_k_frame_with_predicate(self):
value = NUM_FRAMES // 2
start = value + k - (value % k) if value % k else value
expected = self._batches_to_reader_convertor(
create_dummy_batches(filters=[i for i in range(start, NUM_FRAMES, k)], is_from_storage=True)
create_dummy_batches(
filters=[i for i in range(start, NUM_FRAMES, k)],
is_from_storage=True,
)
)
self.assertEqual(batches, expected)

Expand All @@ -123,7 +125,9 @@ def test_should_sample_every_k_frame_with_predicate(self):
batches = list(video_loader.read())
start = value + k - (value % k) if value % k else value
expected = self._batches_to_reader_convertor(
create_dummy_batches(filters=[i for i in range(start, 8, k)], is_from_storage=True)
create_dummy_batches(
filters=[i for i in range(start, 8, k)], is_from_storage=True
)
)
self.assertEqual(batches, expected)

Expand All @@ -132,7 +136,9 @@ def test_should_return_one_batch(self):
file_url=self.video_file_url,
)
batches = list(video_loader.read())
expected = self._batches_to_reader_convertor(create_dummy_batches(is_from_storage=True))
expected = self._batches_to_reader_convertor(
create_dummy_batches(is_from_storage=True)
)
self.assertEqual(batches, expected)

def test_should_return_batches_equivalent_to_number_of_frames(self):
Expand All @@ -141,7 +147,9 @@ def test_should_return_batches_equivalent_to_number_of_frames(self):
batch_mem_size=self.frame_size,
)
batches = list(video_loader.read())
expected = self._batches_to_reader_convertor(create_dummy_batches(batch_size=1, is_from_storage=True))
expected = self._batches_to_reader_convertor(
create_dummy_batches(batch_size=1, is_from_storage=True)
)
self.assertEqual(batches, expected)

def test_should_sample_every_k_frame(self):
Expand All @@ -152,7 +160,9 @@ def test_should_sample_every_k_frame(self):
)
batches = list(video_loader.read())
expected = self._batches_to_reader_convertor(
create_dummy_batches(filters=[i for i in range(0, NUM_FRAMES, k)], is_from_storage=True)
create_dummy_batches(
filters=[i for i in range(0, NUM_FRAMES, k)], is_from_storage=True
)
)
self.assertEqual(batches, expected)

Expand Down
6 changes: 4 additions & 2 deletions test/unit_tests/storage/test_sqlite_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
from evadb.catalog.catalog_type import ColumnType, NdArrayType, TableType
from evadb.catalog.models.column_catalog import ColumnCatalogEntry
from evadb.catalog.models.table_catalog import TableCatalogEntry
from evadb.storage.sqlite_storage_engine import SQLStorageEngine
from evadb.catalog.sql_config import IDENTIFIER_COLUMN
from evadb.storage.sqlite_storage_engine import SQLStorageEngine


@pytest.mark.notparallel
Expand All @@ -41,7 +41,9 @@ def create_sample_table(self):
str(suffix_pytest_xdist_worker_id_to_dir("dataset")),
table_type=TableType.VIDEO_DATA,
)
column_pk = ColumnCatalogEntry(IDENTIFIER_COLUMN, ColumnType.INTEGER, is_nullable=False)
column_pk = ColumnCatalogEntry(
IDENTIFIER_COLUMN, ColumnType.INTEGER, is_nullable=False
)
column_0 = ColumnCatalogEntry("name", ColumnType.TEXT, is_nullable=False)
column_1 = ColumnCatalogEntry("id", ColumnType.INTEGER, is_nullable=False)
column_2 = ColumnCatalogEntry(
Expand Down
2 changes: 1 addition & 1 deletion test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ def create_dummy_batches(
batch_size=10,
start_id=0,
video_dir=None,
is_from_storage=False, # if cover test directly from storage, it needs to append a _row_number
is_from_storage=False, # if cover test directly from storage, it needs to append a _row_number
):
video_dir = video_dir or get_tmp_dir()

Expand Down

0 comments on commit 50cc8b1

Please sign in to comment.