diff --git a/evadb/parser/lark_visitor/_functions.py b/evadb/parser/lark_visitor/_functions.py
index a3b5a868a..2b2c18095 100644
--- a/evadb/parser/lark_visitor/_functions.py
+++ b/evadb/parser/lark_visitor/_functions.py
@@ -151,6 +151,6 @@ def aggregate_windowed_function(self, tree):
                 # Support for COUNT(*)
                 if token != "*":
                     agg_func_name = token
                 else:
-                    agg_func_arg = TupleValueExpression(name="id")
+                    agg_func_arg = TupleValueExpression(name="_row_id")
 
diff --git a/test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf b/test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf
new file mode 100644
index 000000000..4205f9ec9
Binary files /dev/null and b/test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf differ
diff --git a/test/integration_tests/short/test_select_executor.py b/test/integration_tests/short/test_select_executor.py
index c2ac348c7..6baafc00d 100644
--- a/test/integration_tests/short/test_select_executor.py
+++ b/test/integration_tests/short/test_select_executor.py
@@ -293,6 +293,39 @@ def test_select_and_groupby_with_sample(self):
             expected_batch.project(["FIRST.id", "SEGMENT.data"]),
         )
 
+    def test_select_and_groupby_and_aggregate_with_pdf(self):
+        """COUNT(*) with a paragraph GROUP BY counts the rows of each group."""
+        GROUPBY_SIZE = 8
+        drop_query = "DROP TABLE IF EXISTS MyPDFs;"
+        pdf_path = (
+            "test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf"
+        )
+
+        execute_query_fetch_all(self.evadb, drop_query)
+        try:
+            load_query = f"LOAD PDF '{pdf_path}' INTO MyPDFs;"
+            execute_query_fetch_all(self.evadb, load_query)
+            select_all_query = "SELECT * FROM MyPDFs;"
+            all_pdf_batch = execute_query_fetch_all(self.evadb, select_all_query)
+
+            select_query = (
+                f"SELECT COUNT(*) FROM MyPDFs GROUP BY '{GROUPBY_SIZE} paragraphs';"
+            )
+            actual_batch = execute_query_fetch_all(self.evadb, select_query)
+
+            # The grouped total must match the row count to within one group.
+            self.assertAlmostEqual(
+                len(all_pdf_batch),
+                len(actual_batch) * actual_batch.frames.iloc[0, 0],
+                delta=GROUPBY_SIZE,
+            )
+            self.assertEqual(len(actual_batch), 99)
+            for group_count in actual_batch.frames.iloc[:, 0]:
+                self.assertEqual(group_count, GROUPBY_SIZE)
+        finally:
+            # Drop the table even if a query or assertion above fails.
+            execute_query_fetch_all(self.evadb, drop_query)
+
     def test_lateral_join_with_unnest_and_sample(self):
         query = """SELECT id, label
                   FROM MyVideo SAMPLE 2 JOIN LATERAL