From d62458101f6efeeb3081586386a3aed6d07096de Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Mon, 26 Aug 2024 12:51:45 -0600 Subject: [PATCH 1/8] Added YAML to SQLite reader which creates a db file for all ingested YAML files --- dsi/backends/sqlite.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index 7e5d86a9..ec3b121f 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -1,5 +1,9 @@ import csv import sqlite3 +import yaml +import re +import subprocess +import os from dsi.backends.filesystem import Filesystem @@ -494,3 +498,38 @@ def query_fctime(self, operator, ctime, isVerbose=False): print(resout) return resout + +class YamlReader(Sqlite): + + def __init__(self, filename): + super().__init__(filename) + + def yamlToSqlite(self, filename, db_name): + + with open(filename, 'r') as yaml_file, open(db_name+".sql", "w") as sql_file: + editedString = yaml_file.read() + editedString = re.sub('specification', r'columns:\n specification', editedString) #indent specification and put all under a columns dictionary + editedString = re.sub(r'(!.+)\n', r"'\1'\n", editedString) #make ! into a string + yml_data = yaml.safe_load_all(editedString) + + for table in yml_data: + cols = table['columns'].keys() + vals = table['columns'].values() + tableName = table["segment"] + + if not os.path.isfile(db_name+".db"): + data_types = {float: "REAL", str: "TEXT", int: "INTEGER"} + + createStmt = f"CREATE TABLE {tableName} ( " + for key,val in table['columns'].items(): + createStmt += f"{key} {data_types[type(val)]}, " + createStmt = createStmt[:-2] + createStmt+= ");\n\n" + + sql_file.write(createStmt) + + sql = f"INSERT INTO {tableName} {tuple(cols)} VALUES {tuple(vals)};\n\n" + sql_file.write(sql) + + subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) + os.remove(db_name+".sql") \ No newline at end of file From 5944fc741c7298200bed365f94de6139783cd77d Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 29 Aug 2024 10:14:17 -0600 Subject: [PATCH 2/8] Updated YamlReader to handle numerical data stored as strings by creating a unit table for each table. Created a test for this class --- dsi/backends/sqlite.py | 87 +++++++++++++++++++++++++------ dsi/backends/tests/test_sqlite.py | 16 +++++- examples/data/compare-yml.sql | 24 +++++++++ examples/data/schema.yml | 29 +++++++++++ 4 files changed, 137 insertions(+), 19 deletions(-) create mode 100644 examples/data/compare-yml.sql create mode 100644 examples/data/schema.yml diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index ec3b121f..b855a120 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -499,17 +499,37 @@ def query_fctime(self, operator, ctime, isVerbose=False): return resout -class YamlReader(Sqlite): +class YamlReader(): - def __init__(self, filename): - super().__init__(filename) + def __init__(self): + pass def yamlToSqlite(self, filename, db_name): + """ + Function that should be called externally to create the sqlite database file and delete temporary sql file used to ingest data + + `filename`: name of YAML file that is ingested + + `db_name`: name of database that YAML file should be added to. Database will be created if it does not exist in local directory. + """ + self.yaml_to_db(filename, db_name) + os.remove(db_name+".sql") + + def yaml_to_db(self, filename, db_name): + """ + DO NOT CALL EXTERNALLY EXCEPT FOR TESTING + + Function creates/adds to a sqlite db file from a given YAML file with specified database name + + `filename`: name of YAML file that is ingested + + `db_name`: name of database that YAML file should be added to. Database will be created if it does not exist in local directory. + """ with open(filename, 'r') as yaml_file, open(db_name+".sql", "w") as sql_file: editedString = yaml_file.read() - editedString = re.sub('specification', r'columns:\n specification', editedString) #indent specification and put all under a columns dictionary - editedString = re.sub(r'(!.+)\n', r"'\1'\n", editedString) #make ! into a string + editedString = re.sub('specification', r'columns:\n specification', editedString) + editedString = re.sub(r'(!.+)\n', r"'\1'\n", editedString) yml_data = yaml.safe_load_all(editedString) for table in yml_data: @@ -517,19 +537,52 @@ def yamlToSqlite(self, filename, db_name): vals = table['columns'].values() tableName = table["segment"] + data_types = {float: "REAL", str: "TEXT", int: "INTEGER"} if not os.path.isfile(db_name+".db"): - data_types = {float: "REAL", str: "TEXT", int: "INTEGER"} - createStmt = f"CREATE TABLE {tableName} ( " - for key,val in table['columns'].items(): - createStmt += f"{key} {data_types[type(val)]}, " - createStmt = createStmt[:-2] - createStmt+= ");\n\n" + createUnitStmt = f"CREATE TABLE {tableName}_units ( " + insertUnitStmt = f"INSERT INTO {tableName}_units {tuple(cols)} VALUES( " + + for key, val in table['columns'].items(): + createUnitStmt+= f"{key} TEXT, " + if data_types[type(val)] == "TEXT" and self.check_type(val[:val.find(" ")]) in ["INTEGER", "REAL"]: + createStmt += f"{key} {self.check_type(val[:val.find(" ")])}, " + insertUnitStmt+= f"'{val[val.find(" ")+1:]}', " + else: + createStmt += f"{key} {data_types[type(val)]}, " + insertUnitStmt+= "NULL, " + + sql_file.write(createStmt[:-2] + ");\n\n") + sql_file.write(createUnitStmt[:-2] + ");\n\n") + sql_file.write(insertUnitStmt[:-2] + ");\n\n") + + insertStmt = f"INSERT INTO {tableName} {tuple(cols)} VALUES( " + for val in vals: + if data_types[type(val)] == "TEXT" and self.check_type(val[:val.find(" ")]) in ["INTEGER", "REAL"]: + insertStmt+= f"{val[:val.find(" ")]}, " + elif data_types[type(val)] == "TEXT": + insertStmt+= f"'{val}', " + else: + insertStmt+= f"{val}, " + + sql_file.write(insertStmt[:-2] + ");\n\n") + + subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) - sql_file.write(createStmt) + def check_type(self, text): + """ + Tests input text and returns a predicted compatible SQL Type - sql = f"INSERT INTO {tableName} {tuple(cols)} VALUES {tuple(vals)};\n\n" - sql_file.write(sql) - - subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) - os.remove(db_name+".sql") \ No newline at end of file + `text`: text string + + `return`: string description of a SQL data type + """ + try: + value = int(text) + return "INTEGER" + except ValueError: + try: + value = float(text) + return "REAL" + except ValueError: + return "TEXT" \ No newline at end of file diff --git a/dsi/backends/tests/test_sqlite.py b/dsi/backends/tests/test_sqlite.py index 29f17ff3..81d3ae26 100644 --- a/dsi/backends/tests/test_sqlite.py +++ b/dsi/backends/tests/test_sqlite.py @@ -1,7 +1,8 @@ import git from collections import OrderedDict - -from dsi.backends.sqlite import Sqlite, DataType +from dsi.backends.sqlite import Sqlite, DataType, YamlReader +import os +import subprocess isVerbose = True @@ -59,3 +60,14 @@ def test_artifact_query(): store.close() # No error implies success assert True + +def test_yaml_reader(): + reader = YamlReader() + reader.yaml_to_db("../../../examples/data/schema.yml", "vedant-test") + subprocess.run(["diff", "../../../examples/data/compare-yml.sql", "vedant-test.sql"], stdout=open("output.txt", "w")) + file_size = os.path.getsize("output.txt") + os.remove("output.txt") + os.remove("vedant-test.sql") + os.remove("vedant-test.db") + + assert file_size == 0 #difference between sql files should be 0 characters \ No newline at end of file diff --git a/examples/data/compare-yml.sql b/examples/data/compare-yml.sql new file mode 100644 index 00000000..77b96b98 --- /dev/null +++ b/examples/data/compare-yml.sql @@ -0,0 +1,24 @@ +CREATE TABLE math ( specification TEXT, a INTEGER, b TEXT, c REAL, d INTEGER, e REAL, f TEXT); + +CREATE TABLE math_units ( specification TEXT, a TEXT, b TEXT, c TEXT, d TEXT, e TEXT, f TEXT); + +INSERT INTO math_units ('specification', 'a', 'b', 'c', 'd', 'e', 'f') VALUES( NULL, NULL, NULL, 'cm', NULL, NULL, NULL); + +INSERT INTO math ('specification', 'a', 'b', 'c', 'd', 'e', 'f') VALUES( '!jack', 1, 'there is CM', 45.98, 2, 34.8, '89e4'); + +CREATE TABLE address ( specification TEXT, fileLoc TEXT, g TEXT, h TEXT, i INTEGER, j INTEGER, k INTEGER, l TEXT, m INTEGER); + +CREATE TABLE address_units ( specification TEXT, fileLoc TEXT, g TEXT, h TEXT, i TEXT, j TEXT, k TEXT, l TEXT, m TEXT); + +INSERT INTO address_units ('specification', 'fileLoc', 'g', 'h', 'i', 'j', 'k', 'l', 'm') VALUES( NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + +INSERT INTO address ('specification', 'fileLoc', 'g', 'h', 'i', 'j', 'k', 'l', 'm') VALUES( '!sam', '/home/sam/lib/data', 'good memories', '556place street', 2, 3, 4, '10000e-4', 99); + +CREATE TABLE physics ( specification TEXT, n REAL, o TEXT, p INTEGER, q TEXT, r INTEGER, s TEXT); + +CREATE TABLE physics_units ( specification TEXT, n TEXT, o TEXT, p TEXT, q TEXT, r TEXT, s TEXT); + +INSERT INTO physics_units ('specification', 'n', 'o', 'p', 'q', 'r', 's') VALUES( NULL, 'm / s / s', NULL, 's', NULL, 'million grams', NULL); + +INSERT INTO physics ('specification', 'n', 'o', 'p', 'q', 'r', 's') VALUES( '!amy', 9.8, 'gravity', 23, 'home 23', 1, '-12e-4'); + diff --git a/examples/data/schema.yml b/examples/data/schema.yml new file mode 100644 index 00000000..eb02dac4 --- /dev/null +++ b/examples/data/schema.yml @@ -0,0 +1,29 @@ +--- +segment: math +specification: !jack + a: 1 + b: "there is CM" + c: "45.98 cm" + d: 2 + e: 34.8 + f: 89e4 +--- +segment: address +specification: !sam + fileLoc: '/home/sam/lib/data' + g: "good memories" + h: "556place street" + i: 2 + j: 3 + k: 4 + l: 10000e-4 + m: 99 +--- +segment: physics +specification: !amy + n: "9.8 m / s / s" + o: "gravity" + p: "23 s" + q: "home 23" + r: '1 million grams' + s: -12e-4 \ No newline at end of file From 25b9a38aa4c6c872afe6e5b8f246e8671e9ebd20 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 29 Aug 2024 10:21:40 -0600 Subject: [PATCH 3/8] Removed the intermediate .dot file which generates the ER diagram from being saved --- dsi/plugins/file_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 8524e418..b02ed0f8 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -5,6 +5,7 @@ from math import isnan import sqlite3 import subprocess +import os from dsi.plugins.metadata import StructuredMetadata @@ -129,6 +130,7 @@ def export_erd(self, dbname, fname): dot_file.close() subprocess.run(["dot", "-T", file_type[1:], "-o", fname + file_type, fname + ".dot"]) + os.remove(fname + ".dot") class Csv(FileWriter): """ From ad5b47b89c7436a65b0a783a1466e50468322c68 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 3 Sep 2024 11:28:29 -0600 Subject: [PATCH 4/8] Created CI files for a few test files in backends and plugins --- .github/workflows/test_file_reader.yml | 33 +++++++++++++++++++++++++ .github/workflows/test_file_writer.yml | 33 +++++++++++++++++++++++++ .github/workflows/test_plugin.yml | 33 +++++++++++++++++++++++++ .github/workflows/test_sqlite.yml | 34 ++++++++++++++++++++++++++ 4 files changed, 133 insertions(+) create mode 100644 .github/workflows/test_file_reader.yml create mode 100644 .github/workflows/test_file_writer.yml create mode 100644 .github/workflows/test_plugin.yml create mode 100644 .github/workflows/test_sqlite.yml diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml new file mode 100644 index 00000000..d3ad4018 --- /dev/null +++ b/.github/workflows/test_file_reader.yml @@ -0,0 +1,33 @@ +name: file_reader test + +on: + push: + branches: + - gh-pages + + +jobs: + linux: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + python -m pip install gitpython + - name: Test reader + run: | + pip install pytest + pytest dsi/plugins/tests/test_file_reader.py + + + diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml new file mode 100644 index 00000000..8affad87 --- /dev/null +++ b/.github/workflows/test_file_writer.yml @@ -0,0 +1,33 @@ +name: file_writer test + +on: + push: + branches: + - gh-pages + + +jobs: + linux: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + python -m pip install gitpython + - name: Test reader + run: | + pip install pytest + pytest dsi/plugins/tests/test_file_writer.py + + + diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml new file mode 100644 index 00000000..6963bb8f --- /dev/null +++ b/.github/workflows/test_plugin.yml @@ -0,0 +1,33 @@ +name: test_plugin.py test + +on: + push: + branches: + - gh-pages + + +jobs: + linux: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + python -m pip install opencv-python + - name: Test reader + run: | + pip install pytest + pytest dsi/tests/test_plugin.py + + + diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml new file mode 100644 index 00000000..82d912ea --- /dev/null +++ b/.github/workflows/test_sqlite.yml @@ -0,0 +1,34 @@ +name: sqlite.py test + +on: + push: + branches: + - gh-pages + + +jobs: + linux: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + python -m pip install gitpython + python -m pip install pyyaml + - name: Test reader + run: | + pip install pytest + pytest dsi/backends/tests/test_sqlite.py + + + From 7ebe144144e27f7ccaeba8a1b2bc0e2ca12fcbf4 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 3 Sep 2024 11:35:34 -0600 Subject: [PATCH 5/8] Edited CI files for github actions --- .github/workflows/test_file_reader.yml | 1 - .github/workflows/test_file_writer.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index d3ad4018..8cb73f52 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -23,7 +23,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - python -m pip install gitpython - name: Test reader run: | pip install pytest diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 8affad87..95910dbc 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -23,7 +23,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - python -m pip install gitpython - name: Test reader run: | pip install pytest From 55027d9c9207d3c344ba223334ca7770c7caa636 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 3 Sep 2024 11:36:54 -0600 Subject: [PATCH 6/8] Edit test_sqlite.yml CI file --- .github/workflows/test_sqlite.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index 82d912ea..082fb32a 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -23,7 +23,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - python -m pip install gitpython python -m pip install pyyaml - name: Test reader run: | From e4d1699cb374d4b64746ead1a0fdc2bf12bed20a Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 3 Sep 2024 13:29:19 -0600 Subject: [PATCH 7/8] Created files to set up CI on the open for file_writer/reader.py and sqlite.py --- .github/workflows/test_file_reader.yml | 5 +---- .github/workflows/test_file_writer.yml | 5 +---- .github/workflows/test_plugin.yml | 5 +---- .github/workflows/test_sqlite.yml | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index 8cb73f52..2a3faba7 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -26,7 +26,4 @@ jobs: - name: Test reader run: | pip install pytest - pytest dsi/plugins/tests/test_file_reader.py - - - + pytest dsi/plugins/tests/test_file_reader.py \ No newline at end of file diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 95910dbc..103edc39 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -26,7 +26,4 @@ jobs: - name: Test reader run: | pip install pytest - pytest dsi/plugins/tests/test_file_writer.py - - - + pytest dsi/plugins/tests/test_file_writer.py \ No newline at end of file diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml index 6963bb8f..0bb8d79c 100644 --- a/.github/workflows/test_plugin.yml +++ b/.github/workflows/test_plugin.yml @@ -27,7 +27,4 @@ jobs: - name: Test reader run: | pip install pytest - pytest dsi/tests/test_plugin.py - - - + pytest dsi/tests/test_plugin.py \ No newline at end of file diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index 082fb32a..f7d99d21 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -27,7 +27,4 @@ jobs: - name: Test reader run: | pip install pytest - pytest dsi/backends/tests/test_sqlite.py - - - + pytest dsi/backends/tests/test_sqlite.py \ No newline at end of file From 28f97e47263d86931f920e6b7cd8290425220aaa Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Wed, 4 Sep 2024 15:17:39 -0600 Subject: [PATCH 8/8] Updated when CI files are run- now on push and pull requests --- .github/workflows/test_file_reader.yml | 6 +++++- .github/workflows/test_file_writer.yml | 5 ++++- .github/workflows/test_plugin.yml | 5 ++++- .github/workflows/test_sqlite.yml | 5 ++++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index 2a3faba7..7989b6e2 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -3,7 +3,11 @@ name: file_reader test on: push: branches: - - gh-pages + - main + pull_request: + branches: + - main + jobs: diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 103edc39..1137d20f 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -3,7 +3,10 @@ name: file_writer test on: push: branches: - - gh-pages + - main + pull_request: + branches: + - main jobs: diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml index 0bb8d79c..ea8a9813 100644 --- a/.github/workflows/test_plugin.yml +++ b/.github/workflows/test_plugin.yml @@ -3,7 +3,10 @@ name: test_plugin.py test on: push: branches: - - gh-pages + - main + pull_request: + branches: + - main jobs: diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index f7d99d21..2fe1fd1f 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -3,7 +3,10 @@ name: sqlite.py test on: push: branches: - - gh-pages + - main + pull_request: + branches: + - main jobs: