diff --git a/.github/workflows/linter-code-check.sh b/.github/workflows/linter-code-check.sh
new file mode 100644
index 00000000..3fd65e63
--- /dev/null
+++ b/.github/workflows/linter-code-check.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+eval "$(conda shell.bash hook)"
+dir=$1
+echo "Checking directory: $dir"
+
+# Skip the directory if no .mypy.ini file is found
+if [ ! -f "$dir/.mypy.ini" ]; then
+    echo "No .mypy.ini file found in $dir, skipping..."
+    exit 0
+fi
+
+if [ -f "$dir/environment.yml" ]; then
+    echo "Setting up conda environment for $dir"
+    conda env create -n $(basename "$dir") -f "$dir/environment.yml"
+    echo "Created conda environment"
+    conda activate $(basename "$dir")
+    pip install mypy
+elif [ -f "$dir/requirements.txt" ]; then
+    echo "Setting up venv for $dir"
+    python3.9 -m venv "$dir/venv"
+    echo "activate venv"
+    source "$dir/venv/bin/activate"
+    echo "install requirements"
+    pip install --upgrade pip
+    pip install -r "$dir/requirements.txt"
+    pip install mypy
+fi
+
+echo "Running mypy in $dir"
+set +e # Disable exit on error
+mypy_output=$(mypy --config-file "$dir/.mypy.ini" "$dir" 2>&1)
+set -e # Re-enable exit on error
+
+echo "$mypy_output"
+if echo "$mypy_output" | grep -q 'error:'; then
+    echo "Running install-types in $dir"
+    mypy --install-types --non-interactive --config-file "$dir/.mypy.ini" "$dir"
+fi
+
+if [ -f "$dir/environment.yml" ]; then
+    conda deactivate
+    conda remove -y -n $(basename "$dir") --all
+elif [ -f "$dir/requirements.txt" ]; then
+    deactivate
+    rm -rf "$dir/venv"
+fi
+
+# done
diff --git a/.github/workflows/linter-code-check.yml b/.github/workflows/linter-code-check.yml
new file mode 100644
index 00000000..173c09d6
--- /dev/null
+++ b/.github/workflows/linter-code-check.yml
@@ -0,0 +1,43 @@
+name: Linter code check
+
+on:
+  push:
+    branches:
+      - main
+      - stage
+      - develop
+  pull_request:
+    types: [opened, reopened, synchronize]
+    branches:
+      - main
+      - stage
+      - develop
+
+jobs:
+  define-dirs:
+    runs-on: ubuntu-latest
+    outputs:
+      dirs: ${{ steps.dirs.outputs.dirs }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Define Dirs
+        id: dirs
+        run: result=$(echo tasks/*/ | sed 's/\([^ ]*\)/"\1",/g') && result="${result%,}" && echo "dirs=[$result]" >> "$GITHUB_OUTPUT"
+  build:
+    runs-on: ubuntu-latest
+    needs: define-dirs
+    strategy:
+      matrix:
+        dirs: ${{ fromJSON(needs.define-dirs.outputs.dirs) }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.9"
+      - name: Install mypy globally
+        run: |
+          pip install mypy
+      - name: Analysing templates with mypy
+        run: |
+          bash .github/workflows/linter-code-check.sh ${{matrix.dirs}}
diff --git a/tasks/annotated-image-extractor/.mypy.ini b/tasks/annotated-image-extractor/.mypy.ini
new file mode 100644
index 00000000..d3e1fa10
--- /dev/null
+++ b/tasks/annotated-image-extractor/.mypy.ini
@@ -0,0 +1,18 @@
+# Global options:
+
+[mypy]
+python_version = 3.9
+pretty = True
+warn_return_any = True
+warn_no_return = True
+warn_redundant_casts = True
+warn_unused_configs = True
+warn_unused_ignores = True
+warn_unreachable = True
+disallow_subclassing_any = True
+disallow_untyped_calls = True
+disallow_untyped_defs = True
+disallow_incomplete_defs = True
+no_implicit_optional = True
+strict_optional = True
+allow_redefinition = False
diff --git a/tasks/audio-analytics/main.py b/tasks/audio-analytics/main.py
index 1c79ff3b..40ba7cf9 100644
--- a/tasks/audio-analytics/main.py
+++ b/tasks/audio-analytics/main.py
@@ -3,7 +3,7 @@ import logging from 
coretex import CustomDataset, TaskRun, currentTaskRun -from coretex.nlp import AudioTranscriber +from coretex.nlp import AudioTranscriber # type: ignore[attr-defined] from src import text_search from src.utils import createTranscriptionArtfacts, fetchModelFile diff --git a/tasks/audio-analytics/src/text_search.py b/tasks/audio-analytics/src/text_search.py index f58f8fe0..4057c949 100644 --- a/tasks/audio-analytics/src/text_search.py +++ b/tasks/audio-analytics/src/text_search.py @@ -1,4 +1,4 @@ -from coretex.nlp import Token +from coretex.nlp import Token # type: ignore[attr-defined] from .occurence import EntityOccurrence diff --git a/tasks/audio-analytics/src/utils.py b/tasks/audio-analytics/src/utils.py index 9cd92f41..645e58ea 100644 --- a/tasks/audio-analytics/src/utils.py +++ b/tasks/audio-analytics/src/utils.py @@ -7,7 +7,7 @@ import logging from coretex import CustomSample, cache, TaskRun, folder_manager -from coretex.nlp import Token +from coretex.nlp import Token # type: ignore[attr-defined] from .occurence import NamedEntityRecognitionResult diff --git a/tasks/bio-bodysite-prediction-nn/.mypy.ini b/tasks/bio-bodysite-prediction-nn/.mypy.ini new file mode 100644 index 00000000..4c38c077 --- /dev/null +++ b/tasks/bio-bodysite-prediction-nn/.mypy.ini @@ -0,0 +1,30 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-tensorflow.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True + +[mypy-sklearn.*] +ignore_missing_imports = True diff --git a/tasks/bio-bodysite-prediction-nn/resources/function/function.py b/tasks/bio-bodysite-prediction-nn/resources/function/function.py index b1ac9f77..c2cd0fb3 100644 --- a/tasks/bio-bodysite-prediction-nn/resources/function/function.py +++ b/tasks/bio-bodysite-prediction-nn/resources/function/function.py @@ -6,6 +6,8 @@ from coretex import folder_manager, functions +import numpy as np + from load_data import loadDataAtlas from load_data_std import loadDataStd @@ -29,7 +31,7 @@ def unzip(inputPath: Path, dataFormat: int) -> Path: return inputPath -def inference(modelInput: Path, model: Model, uniqueTaxons: dict[str, int]) -> list[str]: +def inference(modelInput: Path, model: Model, uniqueTaxons: dict[str, int]) -> np.ndarray: BATCHE_SIZE = 562 sampleCount = len(list(modelInput.iterdir())) @@ -45,7 +47,11 @@ def response(requestData: dict[str, Any]) -> dict[str, Any]: with open(modelDir / "model_descriptor.json", "r") as jsonFile: modelDescriptor = json.load(jsonFile) - dataFormat = int(requestData.get("dataFormat")) # 0 - MBA, 1 - Microbiome Forensics Institute Zuric + dataFormatRaw = requestData.get("dataFormat") + if not isinstance(dataFormatRaw, str) and not isinstance(dataFormatRaw, int): + return functions.badRequest("Invalid dataFormat. 
(0 - MBA, 1 - Microbiome Forensics Institute Zuric)") + + dataFormat = int(dataFormatRaw) # 0 - MBA, 1 - Microbiome Forensics Institute Zuric inputPath = requestData.get("inputFile") if not isinstance(inputPath, Path): diff --git a/tasks/bio-bodysite-prediction-nn/resources/function/load_data.py b/tasks/bio-bodysite-prediction-nn/resources/function/load_data.py index 104588f0..fac6d629 100644 --- a/tasks/bio-bodysite-prediction-nn/resources/function/load_data.py +++ b/tasks/bio-bodysite-prediction-nn/resources/function/load_data.py @@ -73,6 +73,9 @@ def loadDataAtlas( ) -> tuple[Path, dict[str, int], dict[str, int], list[str]]: workerCount = os.cpu_count() # This value should not exceed the total number of CPU cores + if workerCount is None: + workerCount = 1 + logging.info(f">> [MicrobiomeForensics] Using {workerCount} CPU cores to read the file") fileSize = inputPath.stat().st_size @@ -89,8 +92,9 @@ def loadDataAtlas( uniqueBodySites = pickle.load(f) def onProcessingFinished(future: Future) -> None: - if future.exception() is not None: - raise future.exception() + exception = future.exception() + if exception is not None: + raise exception logging.info(f">> [MicrobiomeForensics] Reading: {inputPath}") diff --git a/tasks/bio-bodysite-prediction-nn/resources/function/load_data_std.py b/tasks/bio-bodysite-prediction-nn/resources/function/load_data_std.py index 8956ff99..a986dc50 100644 --- a/tasks/bio-bodysite-prediction-nn/resources/function/load_data_std.py +++ b/tasks/bio-bodysite-prediction-nn/resources/function/load_data_std.py @@ -9,12 +9,12 @@ from objects import Sample, Taxon -def loadDataStd(inputPath: Path, modelDir: Path, level: int) -> tuple[int, int, dict[str, int], list[int]]: +def loadDataStd(inputPath: Path, modelDir: Path, level: int) -> tuple[Path, dict[str, int], dict[str, int], list[str]]: with open(modelDir / "uniqueTaxons.pkl", "rb") as f: - uniqueTaxons = pickle.load(f) + uniqueTaxons: dict[str, int] = pickle.load(f) with open(modelDir / "uniqueBodySites.pkl", "rb") as f: - uniqueBodySites = pickle.load(f) + uniqueBodySites: dict[str, int] = pickle.load(f) datasetPath = folder_manager.createTempFolder("dataset") diff --git a/tasks/bio-bodysite-prediction-nn/resources/function/model.py b/tasks/bio-bodysite-prediction-nn/resources/function/model.py index 88ad4676..6ca81653 100644 --- a/tasks/bio-bodysite-prediction-nn/resources/function/model.py +++ b/tasks/bio-bodysite-prediction-nn/resources/function/model.py @@ -16,7 +16,7 @@ from utils import convertFromOneHot -class GatingLayer(tf.keras.layers.Layer): +class GatingLayer(tf.keras.layers.Layer): # type: ignore[misc] def __init__( self, @@ -83,7 +83,7 @@ def hard_sigmoid(self, x: Tensor, a: Tensor) -> Tensor: return x -class Model(tf.keras.Model): +class Model(tf.keras.Model): # type: ignore[misc] def __init__( self, @@ -148,7 +148,7 @@ def __init__( self.lam = lam self._activation_gating = activation_gating - self.activation_gating = activation_gating # will overwrite _activation_gating + self.activation_gating = activation_gating # type: ignore[assignment] self.activation_pred = activation_pred @@ -325,7 +325,7 @@ def _valid_step(self, X: Tensor, y: Tensor) -> Tensor: return y_pred_hot - def predict(self, data: tf.data.Dataset, batches: int): + def predict(self, data: tf.data.Dataset, batches: int) -> np.ndarray: y_pred: list[list[int]] = [] for i, batch in enumerate(data): @@ -337,7 +337,7 @@ def predict(self, data: tf.data.Dataset, batches: int): return convertFromOneHot(np.array(y_pred)) - def test(self, data: 
tf.data.Dataset, batches: int) -> tuple[np.ndarray, np.ndarray, float]: + def test(self, data: tf.data.Dataset, batches: int) -> tuple[np.ndarray, np.ndarray]: y_pred: list[list[int]] = [] # List of one hot vectors y_true: list[list[int]] = [] @@ -363,7 +363,7 @@ def test_from_array(self, X: ArrayLike) -> np.ndarray: if type(X) == sparse.csr_matrix: X = X.toarray().astype(np.float32) - return self.soft_to_hot(self._predict_from_array(X)).numpy() + return self.soft_to_hot(self._predict_from_array(X)).numpy() # type: ignore[no-any-return] @tf.function @@ -374,11 +374,11 @@ def _predict_from_array(self, X: ArrayLike) -> Tensor: @property def activation_gating(self) -> Callable: - return self._activation_gating + return self._activation_gating # type: ignore[return-value] @activation_gating.setter - def activation_gating(self, value: str) -> Callable: + def activation_gating(self, value: str) -> Callable: # type: ignore[return] if value == 'relu': self._activation_gating = tf.nn.relu elif value == 'l_relu': @@ -388,7 +388,7 @@ def activation_gating(self, value: str) -> Callable: elif value == 'tanh': self._activation_gating = tf.nn.tanh elif value == 'none': - self._activation_gating = lambda x: x + self._activation_gating = lambda x: x # type: ignore[assignment] else: raise NotImplementedError('activation for the gating network not recognized') diff --git a/tasks/bio-bodysite-prediction-nn/resources/function/utils.py b/tasks/bio-bodysite-prediction-nn/resources/function/utils.py index 130b52b6..bbdbbe98 100644 --- a/tasks/bio-bodysite-prediction-nn/resources/function/utils.py +++ b/tasks/bio-bodysite-prediction-nn/resources/function/utils.py @@ -1,10 +1,9 @@ -from typing import Optional - -from numpy.typing import ArrayLike +from typing import Optional, Union import numpy as np -def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.ndarray: + +def oneHotEncoding(vector: Union[np.ndarray, int], numClasses: Optional[int] = None) -> np.ndarray: """ Converts an input 1-D vector of integers into an output @@ -16,7 +15,7 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n ---------- vector : ArrayLike A vector of integers - num_classes : int + numClasses : int Optionally declare the number of classes (can not exceed the maximum value of the vector) Returns @@ -26,7 +25,7 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n Example ------- - >>> v = np.array((1, 0, 4)) + >>> v = np.array([1, 0, 4]) >>> one_hot_v = oneHotEncoding(v) >>> print one_hot_v [[0 1 0 0 0] @@ -34,15 +33,21 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n [0 0 0 0 1]] """ - vecLen = 1 if isinstance(vector, int) else len(vector) + if isinstance(vector, int): + vector = np.array([vector]) + + vecLen = vector.shape[0] + + if numClasses is None: + numClasses = vector.max() + 1 - result = np.zeros(shape = (vecLen, num_classes)) + result = np.zeros(shape = (vecLen, numClasses)) result[np.arange(vecLen), vector] = 1 return result.astype(int) def convertFromOneHot(matrix: np.ndarray) -> np.ndarray: - numOfRows = len(matrix) if isinstance(matrix, list) else matrix.shape[0] + numOfRows = matrix.shape[0] if not numOfRows > 0: raise RuntimeError(f">> [MicrobiomeForensics] Encountered array with {numOfRows} rows when decoding one hot vector") diff --git a/tasks/bio-bodysite-prediction-nn/src/cache.py b/tasks/bio-bodysite-prediction-nn/src/cache.py index fe3acbd9..3da8c16b 100644 --- 
a/tasks/bio-bodysite-prediction-nn/src/cache.py +++ b/tasks/bio-bodysite-prediction-nn/src/cache.py @@ -69,7 +69,7 @@ def cacheDataset( logging.info(">> [MicrobiomeForensics] Successfuly cached assembled dataset") -def loadCache(taskRun: TaskRun[CustomDataset], cacheName: str) -> tuple[Path, dict[str, int], dict[str, int]]: +def loadCache(taskRun: TaskRun[CustomDataset], cacheName: str) -> tuple[dict[str, int], dict[str, int], int]: logging.info(">> [MicrobiomeForensics] Loading assembled dataset to cache") start = time.time() @@ -77,6 +77,9 @@ def loadCache(taskRun: TaskRun[CustomDataset], cacheName: str) -> tuple[Path, di datasetPath.mkdir(parents = True, exist_ok = True) cache = getCache(cacheName) + if cache is None: + raise ValueError(">> [MicrobiomeForensics] Failed to retrieve cache") + cache.download() samples = cache.getSamples(lambda sample: sample.name != "taxonDistribution" and sample.name != "classDistribution") @@ -89,16 +92,24 @@ def loadCache(taskRun: TaskRun[CustomDataset], cacheName: str) -> tuple[Path, di with datasetPath.joinpath(sample.name).open("wb") as file: pickle.dump(content, file) - taxonDistribution = cache.getSample("taxonDistribution") - classDistribution = cache.getSample("classDistribution") + taxonDistributionCache = cache.getSample("taxonDistribution") + classDistributionCache = cache.getSample("classDistribution") - if taxonDistribution is None and classDistribution is None: + if taxonDistributionCache is None and classDistributionCache is None: raise RuntimeError(">> [MicrobiomeForensics] Could not find taxonDistribution and classDistribution files in cache") - elif taxonDistribution is None: + elif taxonDistributionCache is None: raise RuntimeError(">> [MicrobiomeForensics] Could not find taxonDistribution file in cache") - elif classDistribution is None: + elif classDistributionCache is None: raise RuntimeError(">> [MicrobiomeForensics] Could not find classDistribution file in cache") + taxonDistributionCache.unzip() + with taxonDistributionCache.path.joinpath("taxonDistribution.pkl").open("rb") as file: + taxonDistribution: dict[str, int] = pickle.load(file) + + classDistributionCache.unzip() + with classDistributionCache.path.joinpath("classDistribution.pkl").open("rb") as file: + classDistribution: dict[str, int] = pickle.load(file) + uniqueTaxons = generateTaxonEncoding(taxonDistribution) uniqueBodySites = generateClassEncodings(classDistribution) @@ -108,7 +119,7 @@ def loadCache(taskRun: TaskRun[CustomDataset], cacheName: str) -> tuple[Path, di plots(taskRun, classDistribution, taxonDistribution, datasetLen) - return datasetPath, uniqueBodySites, uniqueTaxons, datasetLen + return uniqueBodySites, uniqueTaxons, datasetLen def generateTaxonEncoding(taxonDistribution: dict[str, int]) -> dict[str, int]: diff --git a/tasks/bio-bodysite-prediction-nn/src/dataset.py b/tasks/bio-bodysite-prediction-nn/src/dataset.py index 94cf6655..db858a1d 100644 --- a/tasks/bio-bodysite-prediction-nn/src/dataset.py +++ b/tasks/bio-bodysite-prediction-nn/src/dataset.py @@ -43,12 +43,12 @@ def generatorFunc() -> Generator: for taxon in sample.taxons: x[uniqueTaxons[taxon.taxonId]] = np.log(taxon.count + 0.5) - y = oneHotEncoding(y, len(uniqueBodySites)) - y = y.reshape(len(uniqueBodySites), ) + yOneHot = oneHotEncoding(y, len(uniqueBodySites)) + yOneHot = yOneHot.reshape(len(uniqueBodySites), ) yield { "features": tf.convert_to_tensor(x, dtype = tf.float32), - "labels": tf.convert_to_tensor(y, dtype = tf.float32) + "labels": tf.convert_to_tensor(yOneHot, dtype = 
tf.float32) } return tf.data.Dataset.from_generator( diff --git a/tasks/bio-bodysite-prediction-nn/src/load_data.py b/tasks/bio-bodysite-prediction-nn/src/load_data.py index efe73ca4..6a510d98 100644 --- a/tasks/bio-bodysite-prediction-nn/src/load_data.py +++ b/tasks/bio-bodysite-prediction-nn/src/load_data.py @@ -187,9 +187,9 @@ def loadDataAtlas( sampleOrigin: list[str], sequencingTechnique: list[str], useCache: bool, - validBodySites: dict[str, int] = None, - validTaxons: dict[str, int] = None -) -> tuple[Path, dict[str, int], dict[str, int], int]: + validBodySites: Optional[dict[str, int]] = None, + validTaxons: Optional[dict[str, int]] = None +) -> tuple[dict[str, int], dict[str, int], int]: """ Loads the dataset and returns it ready for training. @@ -230,6 +230,9 @@ def loadDataAtlas( sampleInfoObj = readEnvInfo(infoPath, sampleOrigin, sequencingTechnique) workerCount = os.cpu_count() # This value should not exceed the total number of CPU cores + if workerCount is None: + workerCount = 1 + logging.info(f">> [MicrobiomeForensics] Using {workerCount} CPU cores to read the dataset") fileSize = mappedPath.stat().st_size @@ -261,8 +264,9 @@ def onProcessingFinished(future: Future) -> None: The future object of the process from ProcessPoolExecutor """ - if future.exception() is not None: - raise future.exception() + exception = future.exception() + if exception is not None: + raise exception processClassDistribution, processTaxonDistribution = future.result() @@ -341,4 +345,4 @@ def onProcessingFinished(future: Future) -> None: taskRun.projectId ) - return uniqueBodySite, uniqueTaxons, datasetLen + return uniqueBodySite, uniqueTaxons, datasetLen diff --git a/tasks/bio-bodysite-prediction-nn/src/load_data_std.py b/tasks/bio-bodysite-prediction-nn/src/load_data_std.py index 0addfca4..ec7f4b1f 100644 --- a/tasks/bio-bodysite-prediction-nn/src/load_data_std.py +++ b/tasks/bio-bodysite-prediction-nn/src/load_data_std.py @@ -21,7 +21,7 @@ def loadDataStd( level: int, validBodySites: Optional[dict[str, int]] = None, validTaxons: Optional[dict[str, int]] = None -) -> tuple[int, int, dict[str, int], dict[str, int], list[int]]: +) -> tuple[dict[str, int], dict[str, int], int]: logging.info(">> [MicrobiomeForensics] Downloading dataset...") taskRun.updateStatus(TaskRunStatus.inProgress, "Downloading dataset...") @@ -50,19 +50,19 @@ def loadDataStd( samplePath = glob.glob(os.path.join(sample.path, f"*.json"))[0] with open(samplePath, "r") as f: - sample = json.load(f) + sampleDict = json.load(f) - if validBodySites is not None and sample["body_site"] not in validBodySites: + if validBodySites is not None and sampleDict["body_site"] not in validBodySites: continue - sampleObj = Sample(sample["_id"]["$oid"], sample["body_site"], None, []) + sampleObj = Sample(sampleDict["_id"]["$oid"], sampleDict["body_site"], None, []) - if not sample["body_site"] in classDistribution: - classDistribution[sample["body_site"]] = 1 + if not sampleDict["body_site"] in classDistribution: + classDistribution[sampleDict["body_site"]] = 1 else: - classDistribution[sample["body_site"]] += 1 + classDistribution[sampleDict["body_site"]] += 1 - taxons = loadTaxons(sample, level) + taxons = loadTaxons(sampleDict, level) if validTaxons is not None and any(taxon not in validTaxons for taxon in taxons.keys()): continue @@ -78,7 +78,7 @@ def loadDataStd( with datasetPath.joinpath(sampleObj.sampleId).open("wb") as file: pickle.dump(sampleObj, file) - if validBodySites is not None and uniqueTaxons is not None: + if validBodySites is not 
None and validTaxons is not None: uniqueBodySites = validBodySites uniqueTaxons = validTaxons else: diff --git a/tasks/bio-bodysite-prediction-nn/src/model.py b/tasks/bio-bodysite-prediction-nn/src/model.py index 6ecc0d30..dddd194a 100644 --- a/tasks/bio-bodysite-prediction-nn/src/model.py +++ b/tasks/bio-bodysite-prediction-nn/src/model.py @@ -16,7 +16,7 @@ from .utils import convertFromOneHot -class GatingLayer(tf.keras.layers.Layer): +class GatingLayer(tf.keras.layers.Layer): # type: ignore[misc] def __init__( self, @@ -83,7 +83,7 @@ def hard_sigmoid(self, x: Tensor, a: Tensor) -> Tensor: return x -class Model(tf.keras.Model): +class Model(tf.keras.Model): # type: ignore[misc] def __init__( self, @@ -148,7 +148,7 @@ def __init__( self.lam = lam self._activation_gating = activation_gating - self.activation_gating = activation_gating # will overwrite _activation_gating + self.activation_gating = activation_gating # type: ignore[assignment] self.activation_pred = activation_pred @@ -325,7 +325,19 @@ def _valid_step(self, X: Tensor, y: Tensor) -> Tensor: return y_pred_hot - def test(self, data: tf.data.Dataset, batches: int) -> tuple[np.ndarray, np.ndarray, float]: + def predict(self, data: tf.data.Dataset, batches: int) -> np.ndarray: + y_pred: list[list[int]] = [] + + for i, batch in enumerate(data): + if i == batches: + break + + y_pred.extend(list(self._test_step(batch["features"]))) + + return convertFromOneHot(np.array(y_pred)) + + + def test(self, data: tf.data.Dataset, batches: int) -> tuple[np.ndarray, np.ndarray]: y_pred: list[list[int]] = [] # List of one hot vectors y_true: list[list[int]] = [] @@ -351,7 +363,7 @@ def test_from_array(self, X: ArrayLike) -> np.ndarray: if type(X) == sparse.csr_matrix: X = X.toarray().astype(np.float32) - return self.soft_to_hot(self._predict_from_array(X)).numpy() + return self.soft_to_hot(self._predict_from_array(X)).numpy() # type: ignore[no-any-return] @tf.function @@ -362,11 +374,11 @@ def _predict_from_array(self, X: ArrayLike) -> Tensor: @property def activation_gating(self) -> Callable: - return self._activation_gating + return self._activation_gating # type: ignore[return-value] @activation_gating.setter - def activation_gating(self, value: str) -> Callable: + def activation_gating(self, value: str) -> Callable: # type: ignore[return] if value == 'relu': self._activation_gating = tf.nn.relu elif value == 'l_relu': @@ -376,7 +388,7 @@ def activation_gating(self, value: str) -> Callable: elif value == 'tanh': self._activation_gating = tf.nn.tanh elif value == 'none': - self._activation_gating = lambda x: x + self._activation_gating = lambda x: x # type: ignore[assignment] else: raise NotImplementedError('activation for the gating network not recognized') diff --git a/tasks/bio-bodysite-prediction-nn/src/objects.py b/tasks/bio-bodysite-prediction-nn/src/objects.py index 39cfc0ca..8cd25880 100644 --- a/tasks/bio-bodysite-prediction-nn/src/objects.py +++ b/tasks/bio-bodysite-prediction-nn/src/objects.py @@ -10,7 +10,7 @@ def __init__(self, taxonId: str, count: int): class Sample: - def __init__(self, sampleId: str, bodySite: str, associationSite: str, taxons: Optional[list[Taxon]] = None) -> None: + def __init__(self, sampleId: str, bodySite: str, associationSite: Optional[str], taxons: Optional[list[Taxon]] = None) -> None: self.sampleId = sampleId self.bodySite = bodySite self.associationSite = associationSite diff --git a/tasks/bio-bodysite-prediction-nn/src/train.py b/tasks/bio-bodysite-prediction-nn/src/train.py index 476469f0..363e0918 
100644 --- a/tasks/bio-bodysite-prediction-nn/src/train.py +++ b/tasks/bio-bodysite-prediction-nn/src/train.py @@ -108,4 +108,4 @@ def train(taskRun: TaskRun[CustomDataset], datasetPath: Path, uniqueBodySites: d with open(modelPath / "uniqueBodySites.pkl", "wb") as f: pickle.dump(uniqueBodySites, f) - return accuracy + return float(accuracy) diff --git a/tasks/bio-bodysite-prediction-nn/src/utils.py b/tasks/bio-bodysite-prediction-nn/src/utils.py index 6fe19c5b..d30a0252 100644 --- a/tasks/bio-bodysite-prediction-nn/src/utils.py +++ b/tasks/bio-bodysite-prediction-nn/src/utils.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union, Any from pathlib import Path import csv @@ -6,15 +6,13 @@ import shutil import logging -from numpy.typing import ArrayLike - import numpy as np import matplotlib.pyplot as plt from coretex import CustomDataset, TaskRun, Model, folder_manager -def jsonPretty(data, savePath) -> None: +def jsonPretty(data: dict[str, Any], savePath: Path) -> None: with open(savePath, "w") as write_file: json.dump(data, write_file, indent=4) @@ -97,7 +95,7 @@ def saveFeatureTable(taskRun: TaskRun[CustomDataset], featureTablePath: str, tab def savePlotFig( taskRun: TaskRun[CustomDataset], distributionDict: dict, - savePath: str, + savePath: Path, fileName: str, xLabelRotation: bool, xLabel: str, @@ -127,15 +125,15 @@ def savePlotFig( def savePredictionFile( taskRun: TaskRun[CustomDataset], - savePath: str, + savePath: Path, trainCount: int, testCount: int, sampleIds: list, uniqueBodySite: dict, - yTrain: list, - yTest: list, - yPred: list, - zPred: list + yTrain: np.ndarray, + yTest: np.ndarray, + yPred: np.ndarray, + zPred: np.ndarray ) -> None: with folder_manager.temp.joinpath("body_site_predictions.csv").open("a+") as f: @@ -220,7 +218,7 @@ def plots(taskRun: TaskRun[CustomDataset], classDistribution: dict[str, int], ta logging.info(f">> [MicrobiomeForensics] Loading data and matching finished. 
Successfully matched {datasetLen} samples") -def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.ndarray: +def oneHotEncoding(vector: Union[np.ndarray, int], numClasses: Optional[int] = None) -> np.ndarray: """ Converts an input 1-D vector of integers into an output @@ -232,7 +230,7 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n ---------- vector : ArrayLike A vector of integers - num_classes : int + numClasses : int Optionally declare the number of classes (can not exceed the maximum value of the vector) Returns @@ -242,7 +240,7 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n Example ------- - >>> v = np.array((1, 0, 4)) + >>> v = np.array([1, 0, 4]) >>> one_hot_v = oneHotEncoding(v) >>> print one_hot_v [[0 1 0 0 0] @@ -250,15 +248,21 @@ def oneHotEncoding(vector: ArrayLike, num_classes: Optional[int] = None) -> np.n [0 0 0 0 1]] """ - vecLen = 1 if isinstance(vector, int) else len(vector) + if isinstance(vector, int): + vector = np.array([vector]) + + vecLen = vector.shape[0] + + if numClasses is None: + numClasses = vector.max() + 1 - result = np.zeros(shape = (vecLen, num_classes)) + result = np.zeros(shape = (vecLen, numClasses)) result[np.arange(vecLen), vector] = 1 return result.astype(int) def convertFromOneHot(matrix: np.ndarray) -> np.ndarray: - numOfRows = len(matrix) if isinstance(matrix, list) else matrix.shape[0] + numOfRows = matrix.shape[0] if not numOfRows > 0: raise RuntimeError(f">> [MicrobiomeForensics] Encountered array with {numOfRows} rows when decoding one hot vector") diff --git a/tasks/bio-bodysite-prediction/.mypy.ini b/tasks/bio-bodysite-prediction/.mypy.ini new file mode 100644 index 00000000..ebb9536e --- /dev/null +++ b/tasks/bio-bodysite-prediction/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-sklearn.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True diff --git a/tasks/bio-bodysite-prediction/main.py b/tasks/bio-bodysite-prediction/main.py index 43f5fb3e..e874e639 100644 --- a/tasks/bio-bodysite-prediction/main.py +++ b/tasks/bio-bodysite-prediction/main.py @@ -10,8 +10,8 @@ def validation(taskRun: TaskRun[CustomDataset]) -> None: - trainedModel: Model = taskRun.parameters.get("trainedModel") - if trainedModel is None: + trainedModel= taskRun.parameters.get("trainedModel") + if not isinstance(trainedModel, Model): raise RuntimeError(">> [MicrobiomeForensics] In order to start the validation process You have to type in \"trainedModel\" in TaskRun parameters") trainedModel.download() diff --git a/tasks/bio-bodysite-prediction/resources/function/function.py b/tasks/bio-bodysite-prediction/resources/function/function.py index 890cc160..c1e80067 100644 --- a/tasks/bio-bodysite-prediction/resources/function/function.py +++ b/tasks/bio-bodysite-prediction/resources/function/function.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Union from pathlib import Path from zipfile import ZipFile, is_zipfile @@ 
-36,13 +36,17 @@ def response(requestData: dict[str, Any]) -> dict[str, Any]: with open(modelDir / "model_descriptor.json", "r") as jsonFile: modelDescriptor = json.load(jsonFile) - dataFormat = int(requestData.get("dataFormat")) # 0 - MBA, 1 - Microbiome Forensics Institute Zuric + dataFormatRaw = requestData.get("dataFormat") + if not isinstance(dataFormatRaw, str) and not isinstance(dataFormatRaw, int): + return functions.badRequest("Invalid dataFormat. (0 - MBA, 1 - Microbiome Forensics Institute Zuric)") + + dataFormat = int(dataFormatRaw) # 0 - MBA, 1 - Microbiome Forensics Institute Zuric inputPath = requestData.get("inputFile") if not isinstance(inputPath, Path): return functions.badRequest("Invalid input data") - inputPath = unzip(inputPath) + inputPath = unzip(inputPath, dataFormat) if dataFormat == 0 and inputPath.is_file(): percentile = modelDescriptor.get("percentile") diff --git a/tasks/bio-bodysite-prediction/resources/function/load_data.py b/tasks/bio-bodysite-prediction/resources/function/load_data.py index a6ad5272..dffa4507 100644 --- a/tasks/bio-bodysite-prediction/resources/function/load_data.py +++ b/tasks/bio-bodysite-prediction/resources/function/load_data.py @@ -10,6 +10,8 @@ from scipy import sparse from sklearn.feature_selection import SelectPercentile +import numpy as np + from objects import Sample, Taxon @@ -93,14 +95,17 @@ def loadDataAtlas( inputPath: Path, modelDir: Path, percentile: int -) -> tuple[list[Sample], dict[str, int], dict[str, int]]: +) -> tuple[np.ndarray, dict[str, int], list[str]]: workerCount = os.cpu_count() # This value should not exceed the total number of CPU cores + if workerCount is None: + workerCount = 1 + logging.info(f">> [MicrobiomeForensics] Using {workerCount} CPU cores to read the file") fileSize = inputPath.stat().st_size # Smaller file size - used for testing - # fileSize = 100 * 1024 * 1024 + fileSize = 100 * 1024 * 1024 step = fileSize // workerCount remainder = fileSize % workerCount @@ -124,8 +129,9 @@ def onProcessingFinished(future: Future) -> None: The future object of the process from ProcessPoolExecutor """ - if future.exception() is not None: - raise future.exception() + exception = future.exception() + if exception is not None: + raise exception processSampleData = future.result() sampleData.extend(processSampleData) @@ -160,7 +166,7 @@ def prepareForInferenceAtlas( uniqueTaxons: dict[str, int], uniqueBodySites: dict[str, int], percentile: Optional[int] -) -> tuple[sparse.csr_matrix, dict[str, int], list[str]]: +) -> tuple[np.ndarray, dict[str, int], list[str]]: sampleIdList: list[str] = [] rowIndices: list[int] = [] diff --git a/tasks/bio-bodysite-prediction/resources/function/load_data_std.py b/tasks/bio-bodysite-prediction/resources/function/load_data_std.py index c686460d..78ad542b 100644 --- a/tasks/bio-bodysite-prediction/resources/function/load_data_std.py +++ b/tasks/bio-bodysite-prediction/resources/function/load_data_std.py @@ -6,7 +6,7 @@ import numpy as np -def loadDataStd(inputPath: Path, modelDir: Path, level: int) -> tuple[int, int, dict[str, int], list[int]]: +def loadDataStd(inputPath: Path, modelDir: Path, level: int) -> tuple[np.ndarray, dict[str, int], list[str]]: with open(modelDir / "uniqueTaxons.pkl", "rb") as f: uniqueTaxons = pickle.load(f) diff --git a/tasks/bio-bodysite-prediction/src/cache_json.py b/tasks/bio-bodysite-prediction/src/cache_json.py index edee9b22..77f9ed04 100644 --- a/tasks/bio-bodysite-prediction/src/cache_json.py +++ b/tasks/bio-bodysite-prediction/src/cache_json.py @@ 
-25,6 +25,9 @@ def loadJsonCache(cacheName: str) -> JsonTuple: logging.info(">> [MicrobiomeForensics] Loading assembled dataset from cache") cache = getJsonCache(cacheName) + if cache is None: + raise ValueError(">> [MicrobiomeForensics] Failed to retrieve cache") + cache.download() cache.samples[0].unzip() cachePath = Path(cache.samples[0].path) diff --git a/tasks/bio-bodysite-prediction/src/cache_matrix.py b/tasks/bio-bodysite-prediction/src/cache_matrix.py index 060e12f2..7db387ea 100644 --- a/tasks/bio-bodysite-prediction/src/cache_matrix.py +++ b/tasks/bio-bodysite-prediction/src/cache_matrix.py @@ -30,13 +30,16 @@ def getMatrixName( suffix = f"{origins}-{techniques}-{percentile}-{quantize}" - return hashCacheName(datasetName, suffix) + return hashCacheName(datasetName, suffix)[:20] def loadMatrixCache(cacheName: str, validation: bool) -> MatrixTuple: logging.info(">> [MicrobiomeForensics] Loading processed data from cache") cache = getMatrixCache(cacheName) + if cache is None: + raise ValueError(">> [MicrobiomeForensics] Failed to retrieve cache") + cache.download() cache.samples[0].unzip() cachePath = Path(cache.samples[0].path) @@ -104,7 +107,7 @@ def cacheMatrix( archive.write(cachePath.joinpath(f"{item}.pkl"), f"{item}.pkl") with createDataset(CustomDataset, cacheName, projectId) as cacheDataset: - if CustomSample().createCustomSample("zipedCache", cacheDataset.id, zipPath): + if cacheDataset.add(zipPath, "zipedCache"): logging.info(">> [MicrobiomeForensics] Successfuly cached processed data") else: logging.warning(">> [MicrobiomeForensics] Failed to cache processed data") diff --git a/tasks/bio-bodysite-prediction/src/load_data.py b/tasks/bio-bodysite-prediction/src/load_data.py index a7fe8692..a3a970b8 100644 --- a/tasks/bio-bodysite-prediction/src/load_data.py +++ b/tasks/bio-bodysite-prediction/src/load_data.py @@ -70,7 +70,7 @@ def readByteBlockUntilNewLine(file: BinaryIO, blockSize: int) -> Optional[bytes] return content + remainder -def processByteBatch(envInfoData: dict[str, str], filePath: Path, start: int, end: int) -> JsonTuple: +def processByteBatch(envInfoData: dict[str, str], filePath: Path, start: int, end: int) -> tuple[list[Sample], set[str], set[str]]: """ Called as a process by ProcessPoolExecutor for parallel processing. 
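Several `load_data.py` hunks in this patch replace `raise future.exception()` with a local binding before the raise: `Future.exception()` is annotated as returning `Optional[BaseException]`, so without the binding mypy cannot prove the raised value is non-`None`. A minimal, self-contained sketch of the same pattern (the executor setup and the `work` function are illustrative only, not part of the patch):

```python
from concurrent.futures import Future, ProcessPoolExecutor


def work(value: int) -> int:
    return value * 2


def onProcessingFinished(future: "Future[int]") -> None:
    # Bind once so mypy narrows Optional[BaseException] to BaseException
    exception = future.exception()
    if exception is not None:
        raise exception

    print(future.result())


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers = 1) as executor:
        future = executor.submit(work, 21)
        future.add_done_callback(onProcessingFinished)
```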
@@ -149,7 +149,7 @@ def processByteBatch(envInfoData: dict[str, str], filePath: Path, start: int, en int(count) )) - return JsonTuple(sampleData, uniqueBodySites, uniqueTaxons) + return sampleData, uniqueBodySites, uniqueTaxons def removeBadSamples(sampleData: list[Sample], uniqueTaxons: dict[str, int], uniqueBodySites: dict[str, int]) -> list[Sample]: @@ -321,7 +321,7 @@ def loadDataAtlas( validate = taskRun.parameters["validation"] cacheNameMatrix = getMatrixName( - dataset.name, + dataset.name[:42], sampleOrigin, sequencingTechnique, taskRun.parameters["percentile"], @@ -358,6 +358,9 @@ def loadDataAtlas( sampleInfoObj = readEnvInfo(infoPath, sampleOrigin, sequencingTechnique) workerCount = os.cpu_count() # This value should not exceed the total number of CPU cores + if workerCount is None: + workerCount = 1 + logging.info(f">> [MicrobiomeForensics] Using {workerCount} CPU cores to read the dataset") fileSize = mappedPath.stat().st_size @@ -367,11 +370,11 @@ def loadDataAtlas( step = fileSize // workerCount remainder = fileSize % workerCount - sampleData: list[Sample] = [] + sampleData: list[Sample] = [] # type: ignore[no-redef] # These two dictionaries represent the mapping between the names and encoded integers of the bodysites and taxons respectively - uniqueBodySite: dict[str, int] = {} - uniqueTaxons: dict[str, int] = {} + uniqueBodySite: dict[str, int] = {} # type: ignore[no-redef] + uniqueTaxons: dict[str, int] = {} # type: ignore[no-redef] if validate: # In the case of validation the same dictionaries will be used as during training @@ -395,8 +398,9 @@ def onProcessingFinished(future: Future) -> None: The future object of the process from ProcessPoolExecutor """ - if future.exception() is not None: - raise future.exception() + exception = future.exception() + if exception is not None: + raise exception processSampleData, processUniqueBodySite, processUniqueTaxons = future.result() @@ -510,10 +514,11 @@ def prepareForTrainingAtlas( if quantize: for i, num in enumerate(matrixData): if num > 65535: matrixData[i] = 65535 - matrixData = np.array(matrixData).astype(np.ushort) + + matrixDataU16 = np.array(matrixData).astype(np.ushort) # Assemble the input matrix in a sparse representation - inputMatrix = sparse.csr_matrix((matrixData, (rowIndices, columnIndices)), inputMatrixShape, dtype = np.ushort) + inputMatrix = sparse.csr_matrix((matrixDataU16, (rowIndices, columnIndices)), inputMatrixShape, dtype = np.ushort) else: inputMatrix = sparse.csr_matrix((matrixData, (rowIndices, columnIndices)), inputMatrixShape, dtype = np.int32) diff --git a/tasks/bio-bodysite-prediction/src/load_data_std.py b/tasks/bio-bodysite-prediction/src/load_data_std.py index 347bb4ae..96b2b310 100644 --- a/tasks/bio-bodysite-prediction/src/load_data_std.py +++ b/tasks/bio-bodysite-prediction/src/load_data_std.py @@ -10,7 +10,7 @@ from .utils import savePlotFig -def loadDataStd(dataset: CustomDataset, taskRun: TaskRun[CustomDataset]) -> tuple[int, int, dict[str, int], dict[str, int], list[int]]: +def loadDataStd(dataset: CustomDataset, taskRun: TaskRun[CustomDataset]) -> tuple[int, int, dict[str, int], dict[str, int]]: logging.info(">> [MicrobiomeForensics] Downloading dataset...") taskRun.updateStatus(TaskRunStatus.inProgress, "Downloading dataset...") dataset.download() @@ -37,15 +37,15 @@ def loadDataStd(dataset: CustomDataset, taskRun: TaskRun[CustomDataset]) -> tupl samplePath = glob.glob(os.path.join(sample.path, f"*.json"))[0] with open(samplePath, "r") as f: - sample = json.load(f) + sampleDict = 
json.load(f) - if not sample["body_site"] in uniqueBodySites: - uniqueBodySites[sample["body_site"]] = len(uniqueBodySites) - classDistribution[sample["body_site"]] = 1 + if not sampleDict["body_site"] in uniqueBodySites: + uniqueBodySites[sampleDict["body_site"]] = len(uniqueBodySites) + classDistribution[sampleDict["body_site"]] = 1 else: - classDistribution[sample["body_site"]] += 1 + classDistribution[sampleDict["body_site"]] += 1 - for bacteria in sample["97"]: + for bacteria in sampleDict["97"]: taxons = bacteria["taxon"].split(";") taxon = taxons[level] @@ -83,7 +83,7 @@ def loadDataStd(dataset: CustomDataset, taskRun: TaskRun[CustomDataset]) -> tupl return level, datasetLen, uniqueTaxons, uniqueBodySites -def prepareForTrainingStd(level: int, datasetLen: int, uniqueTaxons: dict, uniqueBodySites: dict, taskRun: TaskRun[CustomDataset]) -> tuple[np.ndarray, np.ndarray]: +def prepareForTrainingStd(level: int, datasetLen: int, uniqueTaxons: dict, uniqueBodySites: dict, taskRun: TaskRun[CustomDataset]) -> tuple[np.ndarray, np.ndarray, list[str]]: inputMatrix = np.zeros((datasetLen, len(uniqueTaxons))) outputMatrix = np.zeros((datasetLen, 1)) @@ -97,10 +97,10 @@ def prepareForTrainingStd(level: int, datasetLen: int, uniqueTaxons: dict, uniqu samplePath = glob.glob(os.path.join(sample.path, f"*.json"))[0] with open(samplePath, "r") as f: - sample = json.load(f) + sampleDict = json.load(f) - for bacteria in sample["97"]: - sampleIdList.append(sample["_id"]["$oid"]) + for bacteria in sampleDict["97"]: + sampleIdList.append(sampleDict["_id"]["$oid"]) taxons = bacteria["taxon"].split(";") taxon = taxons[level] @@ -109,6 +109,6 @@ def prepareForTrainingStd(level: int, datasetLen: int, uniqueTaxons: dict, uniqu c = bacteria["count"] inputMatrix[i, encodedTaxon] += c - outputMatrix[i, 0] = uniqueBodySites[sample["body_site"]] + outputMatrix[i, 0] = uniqueBodySites[sampleDict["body_site"]] return inputMatrix, outputMatrix, sampleIdList diff --git a/tasks/bio-bodysite-prediction/src/objects.py b/tasks/bio-bodysite-prediction/src/objects.py index 4b4d6776..9e573fc7 100644 --- a/tasks/bio-bodysite-prediction/src/objects.py +++ b/tasks/bio-bodysite-prediction/src/objects.py @@ -14,7 +14,7 @@ def __init__(self, taxonId: str, count: int): class Sample: - def __init__(self, sampleId: str, bodySite: str, associationSite: str, taxons: Optional[list[Taxon]] = []) -> None: + def __init__(self, sampleId: str, bodySite: str, associationSite: str, taxons: list[Taxon] = []) -> None: self.sampleId = sampleId self.bodySite = bodySite self.associationSite = associationSite diff --git a/tasks/bio-bodysite-prediction/src/train.py b/tasks/bio-bodysite-prediction/src/train.py index 61c5986c..b68f7d87 100644 --- a/tasks/bio-bodysite-prediction/src/train.py +++ b/tasks/bio-bodysite-prediction/src/train.py @@ -1,3 +1,5 @@ +from typing import Any + import logging import pickle import time @@ -34,7 +36,7 @@ def __init__( self.eval = xgb.DMatrix(evalSet[0][0], label = evalSet[0][1]) self.yEval = evalSet[0][1] - def after_iteration(self, model: XGBClassifier, epoch: int, evals_log) -> bool: + def after_iteration(self, model: XGBClassifier, epoch: int, evals_log: Any) -> bool: for data, metric in evals_log.items(): for metricName, log in metric.items(): loss = log[-1] @@ -134,4 +136,4 @@ def train( with open(modelPath / "uniqueBodySites.pkl", "wb") as f: pickle.dump(uniqueBodySites, f) - return accuracy + return float(accuracy) diff --git a/tasks/bio-bodysite-prediction/src/utils.py b/tasks/bio-bodysite-prediction/src/utils.py 
index 0de490a7..ae3a82e6 100644 --- a/tasks/bio-bodysite-prediction/src/utils.py +++ b/tasks/bio-bodysite-prediction/src/utils.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Any from pathlib import Path import csv @@ -16,7 +16,7 @@ from .objects import Sample -def jsonPretty(data, savePath) -> None: +def jsonPretty(data: dict[str, Any], savePath: Path) -> None: with open(savePath, "w") as write_file: json.dump(data, write_file, indent=4) @@ -103,7 +103,7 @@ def saveFeatureTable(featureTablePath: str, tableInput: np.ndarray, taskRun: Tas def savePlotFig( taskRun: TaskRun[CustomDataset], distributionDict: dict, - savePath: str, + savePath: Path, fileName: str, xLabelRotation: bool, xLabel: str, @@ -133,7 +133,7 @@ def savePlotFig( def savePredictionFile( taskRun: TaskRun[CustomDataset], - savePath: str, + savePath: Path, xTrain: csr_matrix, xTest: csr_matrix, sampleIdList: list, diff --git a/tasks/bio-primer-removal/.mypy.ini b/tasks/bio-primer-removal/.mypy.ini new file mode 100644 index 00000000..44cabdf1 --- /dev/null +++ b/tasks/bio-primer-removal/.mypy.ini @@ -0,0 +1,22 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports diff --git a/tasks/bio-primer-removal/main.py b/tasks/bio-primer-removal/main.py index c7d376d0..384f4572 100644 --- a/tasks/bio-primer-removal/main.py +++ b/tasks/bio-primer-removal/main.py @@ -8,7 +8,7 @@ from coretex.bioinformatics import cutadaptTrim -def uploadTrimmedReads(sampleName: str, dataset: SequenceDataset, forwardFile: Path, reverseFile: Optional[Path] = None): +def uploadTrimmedReads(sampleName: str, dataset: SequenceDataset, forwardFile: Path, reverseFile: Optional[Path] = None) -> None: zipPath = folder_manager.temp / f"{sampleName}.zip" with ZipFile(zipPath, 'w', ZIP_DEFLATED) as archive: archive.write(forwardFile, forwardFile.name) diff --git a/tasks/audio-analytics/.mypy.ini b/tasks/bio-read-quality/.mypy.ini similarity index 93% rename from tasks/audio-analytics/.mypy.ini rename to tasks/bio-read-quality/.mypy.ini index a9714b0e..9b7a6fef 100644 --- a/tasks/audio-analytics/.mypy.ini +++ b/tasks/bio-read-quality/.mypy.ini @@ -16,11 +16,9 @@ disallow_incomplete_defs = True no_implicit_optional = True strict_optional = True allow_redefinition = False -exclude = venv # Per-module options: # https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports - -[mypy-deepspeech.*] +[mypy-Bio.*] ignore_missing_imports = True diff --git a/tasks/bio-read-quality/main.py b/tasks/bio-read-quality/main.py index e84d65c4..f9a91494 100644 --- a/tasks/bio-read-quality/main.py +++ b/tasks/bio-read-quality/main.py @@ -9,7 +9,7 @@ from coretex import currentTaskRun, SequenceDataset, folder_manager, TaskRun -def calculateAverageScores(qualityScores: list[list[int]]) -> list[float]: +def calculateAverageScores(qualityScores: list[list[int]]) -> list[int]: maxLength = max(len(readScores) for readScores in qualityScores) totalScores = [0] * maxLength @@ -17,19 +17,19 @@ def calculateAverageScores(qualityScores: list[list[int]]) -> list[float]: for i in 
range(len(readScores)): totalScores[i] += readScores[i] - return [score / len(qualityScores) for score in totalScores] + return [int(score / len(qualityScores)) for score in totalScores] -def analyzeFastq(sequencePath: Path) -> list[float]: +def analyzeFastq(sequencePath: Path) -> list[int]: qualityScores: list[list[int]] = [] with sequencePath.open("r") as file: - for record in SeqIO.parse(file, "fastq"): + for record in SeqIO.parse(file, "fastq"): # type: ignore[no-untyped-call] qualityScores.append(record.letter_annotations["phred_quality"]) return calculateAverageScores(qualityScores) -def createPlot(scores: list[float], title: str, plotPath: Path) -> Path: +def createPlot(scores: list[int], title: str, plotPath: Path) -> Path: fig, ax = plt.subplots(figsize = (10, 6)) ax.plot(range(len(scores)), scores, linestyle = "-", color = "b", linewidth = 2, label = "Phred Scores") @@ -52,8 +52,8 @@ def main() -> None: taskRun.setDatasetType(SequenceDataset) taskRun.dataset.download() - forwardScores: list[list[float]] = [] - reverseScores: list[list[float]] = [] + forwardScores: list[list[int]] = [] + reverseScores: list[list[int]] = [] for sample in taskRun.dataset.samples: logging.info(f">> [Quality Scores] Analysing sample \"{sample.name}\"") diff --git a/tasks/bio-region-seperation/mypy.ini b/tasks/bio-region-seperation/mypy.ini index 21446fac..6f60cbe8 100644 --- a/tasks/bio-region-seperation/mypy.ini +++ b/tasks/bio-region-seperation/mypy.ini @@ -2,7 +2,7 @@ [mypy] exclude = venv -python_version = 3.8 +python_version = 3.9 pretty = True warn_return_any = True warn_no_return = True diff --git a/tasks/bio-region-seperation/src/__init__.py b/tasks/bio-region-seperation/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tasks/bio-region-seperation/src/separate.py b/tasks/bio-region-seperation/src/separate.py index a3b2ae93..52674a4b 100644 --- a/tasks/bio-region-seperation/src/separate.py +++ b/tasks/bio-region-seperation/src/separate.py @@ -1,5 +1,5 @@ from pathlib import Path -from io import BufferedWriter +from io import TextIOWrapper import logging @@ -13,7 +13,7 @@ def argmax(array: list) -> int: def splitToFiles(inputFile: Path, readClasses: list[int], groups: list[Path]) -> None: - outFiles: list[BufferedWriter] = [] + outFiles: list[TextIOWrapper] = [] for group in groups: outFiles.append(open(group / inputFile.name, "a")) diff --git a/tasks/body-tracking/.mypy.ini b/tasks/body-tracking/.mypy.ini new file mode 100644 index 00000000..43a9b317 --- /dev/null +++ b/tasks/body-tracking/.mypy.ini @@ -0,0 +1,30 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-tensorflow.*] +ignore_missing_imports = True + +[mypy-tensorflowjs.*] +ignore_missing_imports = True + +[mypy-coremltools.*] +ignore_missing_imports = True diff --git a/tasks/body-tracking/requirements.txt b/tasks/body-tracking/requirements.txt index cebefd48..2db665a9 100644 --- a/tasks/body-tracking/requirements.txt +++ b/tasks/body-tracking/requirements.txt @@ -1,4 +1,5 @@ -tensorflowjs>=3.9.0 -tensorflow==2.8 +tensorflowjs 
+protobuf +tensorflow coremltools==6.3.0 coretex diff --git a/tasks/contextual-targeting/.mypy.ini b/tasks/contextual-targeting/.mypy.ini index a9714b0e..1b535120 100644 --- a/tasks/contextual-targeting/.mypy.ini +++ b/tasks/contextual-targeting/.mypy.ini @@ -24,3 +24,6 @@ exclude = venv [mypy-deepspeech.*] ignore_missing_imports = True + +[mypy-sentence_transformers.*] +ignore_missing_imports = True diff --git a/tasks/contextual-targeting/main.py b/tasks/contextual-targeting/main.py index 89880950..ff76299a 100644 --- a/tasks/contextual-targeting/main.py +++ b/tasks/contextual-targeting/main.py @@ -4,7 +4,7 @@ import logging from coretex import CustomDataset, TaskRun, CustomSample, currentTaskRun, folder_manager -from coretex.nlp import AudioTranscriber, Transcription +from coretex.nlp import AudioTranscriber, Transcription # type: ignore[attr-defined] import matplotlib.pyplot as plt @@ -17,7 +17,7 @@ MODEL_SCORER_NAME = "deepspeech-0.8.2-model.scorer" -def transcribe(dataset: CustomDataset, parameters: Dict[str, Any]) -> List[Tuple[CustomSample, Transcription]]: +def transcribe(dataset: CustomDataset, parameters: Dict[str, Any]) -> Transcription: modelFile = fetchModelFile(parameters["modelUrl"], MODEL_NAME, ".pbmm") modelScorerFile = fetchModelFile(parameters["modelScorerUrl"], MODEL_SCORER_NAME, ".scorer") diff --git a/tasks/dataset-split/.mypy.ini b/tasks/dataset-split/.mypy.ini index d3e1fa10..138bfe20 100644 --- a/tasks/dataset-split/.mypy.ini +++ b/tasks/dataset-split/.mypy.ini @@ -16,3 +16,9 @@ disallow_incomplete_defs = True no_implicit_optional = True strict_optional = True allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-chardet.*] +ignore_missing_imports = True diff --git a/tasks/dataset-split/main.py b/tasks/dataset-split/main.py index 0b8c279c..cc53bc00 100644 --- a/tasks/dataset-split/main.py +++ b/tasks/dataset-split/main.py @@ -1,11 +1,12 @@ +from typing import Sequence + import logging -from coretex import currentTaskRun, ImageDataset, CustomDataset, SequenceDataset +from coretex import currentTaskRun, ImageDataset, CustomDataset, SequenceDataset, NetworkDataset from src.split_custom_dataset import splitCustomDataset from src.split_image_dataset import splitImageDataset from src.split_sequence_dataset import splitSequenceDataset -from src.utils import DatasetType def main() -> None: @@ -21,7 +22,7 @@ def main() -> None: if datasetParts < 2: raise ValueError("Dataset can be divided into at least two parts") - splitDatasets: list[DatasetType] + splitDatasets: Sequence[NetworkDataset] if isinstance(originalDataset, ImageDataset): logging.info(f">> [Dataset Split] Splitting ImageDataset {originalDataset.name}...") diff --git a/tasks/dataset-split/src/utils.py b/tasks/dataset-split/src/utils.py index cd7ce974..c43dcc49 100644 --- a/tasks/dataset-split/src/utils.py +++ b/tasks/dataset-split/src/utils.py @@ -1,10 +1,9 @@ from typing import TypeVar -from coretex import NetworkSample, NetworkDataset +from coretex import NetworkSample SampleType = TypeVar("SampleType", bound = NetworkSample) -DatasetType = TypeVar("DatasetType", bound = NetworkDataset) def splitOriginalSamples(originalSamples: list[SampleType], datasetCount: int) -> list[list[SampleType]]: diff --git a/tasks/image-augmentation/.mypy.ini b/tasks/image-augmentation/.mypy.ini new file mode 100644 index 00000000..5d213555 --- /dev/null +++ b/tasks/image-augmentation/.mypy.ini @@ -0,0 +1,24 @@ +# Global options: + +[mypy] 
+python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-imgaug.*] +ignore_missing_imports = True diff --git a/tasks/image-augmentation/src/augmentation.py b/tasks/image-augmentation/src/augmentation.py index 52cff683..f3326640 100644 --- a/tasks/image-augmentation/src/augmentation.py +++ b/tasks/image-augmentation/src/augmentation.py @@ -1,3 +1,5 @@ +from typing import Optional + import logging import cv2 @@ -10,7 +12,7 @@ from .utils import uploadAugmentedImage -def mask2poly(mask: np.ndarray) -> list[int]: +def mask2poly(mask: np.ndarray) -> Optional[list[int]]: contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) if len(contours) == 0: logging.warning(">> [Image Augmentation] Could not find annotated area on augmented image") @@ -27,13 +29,17 @@ def mask2poly(mask: np.ndarray) -> list[int]: return segmentation -def transformAnnotationInstances(sampleData: AnnotatedImageSampleData, pipeline: iaa.Sequential) -> CoretexImageAnnotation: +def transformAnnotationInstances(sampleData: AnnotatedImageSampleData, pipeline: iaa.Sequential) -> Optional[list[CoretexSegmentationInstance]]: augmentedInstances: list[CoretexSegmentationInstance] = [] - for instance in sampleData.annotation.instances: + annotation = sampleData.annotation + if annotation is None: + return None + + for instance in annotation.instances: mask = instance.extractSegmentationMask( - sampleData.annotation.width, - sampleData.annotation.height + annotation.width, + annotation.height ) mask = np.repeat(mask[..., None] * 255, 3, axis = -1) @@ -74,13 +80,15 @@ def augmentImage( augmentedImage = firstPipeline_.augment_image(image) augmentedImage = secondPipeline.augment_image(augmentedImage) augmentedInstances = transformAnnotationInstances(sampleData, firstPipeline_) - - annotation = CoretexImageAnnotation.create( - sample.name, - augmentedImage.shape[1], - augmentedImage.shape[0], - augmentedInstances - ) + if augmentedInstances is not None: + annotation = CoretexImageAnnotation.create( + sample.name, + augmentedImage.shape[1], + augmentedImage.shape[0], + augmentedInstances + ) + else: + annotation = None augmentedImageName = f"{sample.name}-{i}" + sample.imagePath.suffix uploadAugmentedImage(augmentedImageName, augmentedImage, annotation, sample, outputDataset) diff --git a/tasks/image-augmentation/src/utils.py b/tasks/image-augmentation/src/utils.py index faf17ea2..f7c163db 100644 --- a/tasks/image-augmentation/src/utils.py +++ b/tasks/image-augmentation/src/utils.py @@ -1,3 +1,5 @@ +from typing import Optional + import logging from numpy import ndarray @@ -10,7 +12,7 @@ def uploadAugmentedImage( imageName: str, augmentedImage: ndarray, - annotation: CoretexImageAnnotation, + annotation: Optional[CoretexImageAnnotation], originalSample: ImageSample, outputDataset: ImageDataset ) -> None: @@ -24,8 +26,9 @@ def uploadAugmentedImage( logging.error(f">> [Image Augmentation] Failed to upload sample {imagePath} - \"{ex}\"") return - if not augmentedSample.saveAnnotation(annotation): - logging.error(f">> [Image Augmentation] Failed to update sample 
annotation {imagePath}") + if annotation is not None: + if not augmentedSample.saveAnnotation(annotation): + logging.error(f">> [Image Augmentation] Failed to update sample annotation {imagePath}") try: metadata = originalSample.loadMetadata() diff --git a/tasks/image-extractor/.mypy.ini b/tasks/image-extractor/.mypy.ini index 3fe41755..d3e1fa10 100644 --- a/tasks/image-extractor/.mypy.ini +++ b/tasks/image-extractor/.mypy.ini @@ -16,18 +16,3 @@ disallow_incomplete_defs = True no_implicit_optional = True strict_optional = True allow_redefinition = False - - -# Per-module options: -# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports -[mypy-shapely.*] -ignore_missing_imports = True - -[mypy-pytesseract.*] -ignore_missing_imports = True - -[mypy-easyocr.*] -ignore_missing_imports = True - -[mypy-transformers.*] -ignore_missing_imports = True diff --git a/tasks/image-extractor/src/sample_generator.py b/tasks/image-extractor/src/sample_generator.py index 7f0b24e1..be0fa084 100644 --- a/tasks/image-extractor/src/sample_generator.py +++ b/tasks/image-extractor/src/sample_generator.py @@ -42,7 +42,7 @@ def generateSample(sample: ImageSample, parentClass: Optional[ImageDatasetClass] try: sample.metadataPath.link_to(samplePath / "metadata.json") except AttributeError as e: - samplePath.joinpath("metadata.json").hardlink_to(sample.metadataPath) + samplePath.joinpath("metadata.json").hardlink_to(sample.metadataPath) # type: ignore[attr-defined] imagePaths.append(samplePath) diff --git a/tasks/image-orientation/.mypy.ini b/tasks/image-orientation/.mypy.ini new file mode 100644 index 00000000..e1a70fa8 --- /dev/null +++ b/tasks/image-orientation/.mypy.ini @@ -0,0 +1,30 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-torchvision.*] +ignore_missing_imports = True + +[mypy-sklearn.*] +ignore_missing_imports = True + +[mypy-seaborn.*] +ignore_missing_imports = True diff --git a/tasks/image-orientation/src/dataset.py b/tasks/image-orientation/src/dataset.py index 41539606..7ba4e1ab 100644 --- a/tasks/image-orientation/src/dataset.py +++ b/tasks/image-orientation/src/dataset.py @@ -29,17 +29,19 @@ def __init__( def __len__(self) -> int: return len(self.sampleIds) - def __getitem__(self, idx) -> dict[str, Any]: + def __getitem__(self, idx: int) -> dict[str, Any]: imagePath = self.imagesDir / f"{self.sampleIds[idx]}.png" metadataPath = self.imagesDir / f"{self.sampleIds[idx]}.json" image = ImageOps.exif_transpose(Image.open(imagePath).convert("RGB")) + if image is None: + raise ValueError(f">> [ImageOrientation] Failed to open image {imagePath.name}") + with metadataPath.open("r") as file: meta = json.load(file) flipped = meta.get(self.labelColumn, False) - label = [1, 0] if flipped else [0, 1] - label = torch.tensor(label).type(torch.float) + label = torch.tensor([1, 0] if flipped else [0, 1]).type(torch.float) if self.transform is not None: image = self.transform(image) @@ -73,9 +75,9 @@ def prepareDataset(dataset: ImageDataset) -> tuple[Path, list[int]]: return imagesDir, sampleIds -def 
splitDataset(dataset: OrientedDataset, validSplit: float) -> tuple["OrientedDataset", "OrientedDataset"]: +def splitDataset(dataset: OrientedDataset, validSplit: float) -> tuple[OrientedDataset, OrientedDataset]: totalSize = len(dataset) trainSize = int((1.0 - validSplit) * totalSize) validationSize = totalSize - trainSize - return random_split(dataset, [trainSize, validationSize]) + return random_split(dataset, [trainSize, validationSize]) # type: ignore[return-value] diff --git a/tasks/image-orientation/src/model.py b/tasks/image-orientation/src/model.py index 5ac99c2a..820cd3ac 100644 --- a/tasks/image-orientation/src/model.py +++ b/tasks/image-orientation/src/model.py @@ -1,10 +1,12 @@ +from torch import Tensor + import torch.nn as nn import torch.nn.functional as F class OrientationClassifier(nn.Module): - def __init__(self): + def __init__(self) -> None: super(OrientationClassifier, self).__init__() # Convolutional layers self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 3, padding = 1) @@ -26,7 +28,7 @@ def __init__(self): # Dropout self.dropout = nn.Dropout(0.25) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: x = self.pool1(F.leaky_relu(self.conv1(x))) x = self.pool2(F.leaky_relu(self.conv2(x))) x = self.pool3(F.leaky_relu(self.conv3(x))) diff --git a/tasks/image-orientation/src/train.py b/tasks/image-orientation/src/train.py index ca9f15a6..0bd1146f 100644 --- a/tasks/image-orientation/src/train.py +++ b/tasks/image-orientation/src/train.py @@ -18,7 +18,7 @@ def trainEpoch( trainLoader: DataLoader, model: OrientationClassifier, optimizer: optim.Adam, - criterion: nn.MSELoss, + criterion: nn.CrossEntropyLoss, device: torch.device ) -> tuple[float, float]: @@ -90,7 +90,7 @@ def runTraining( scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.3, patience = max(5, int(epochs * 0.05))) earlyStopping = EarlyStopping(max(10, int(epochs * 0.1))) - bestLoss: Optional[torch.Tensor] = None + bestLoss: Optional[float] = None exampleInput = torch.randn(1, 3, imageSize, imageSize) for epoch in range(epochs): @@ -117,13 +117,13 @@ def runTraining( bestLoss = validationLoss # Save the best model - tsModel = torch.jit.trace(model, exampleInput) + tsModel = torch.jit.trace(model, exampleInput) # type: ignore[no-untyped-call] tsModel.save(modelPath / "best.pt") # Save the latest model - tsModel = torch.jit.trace(model, exampleInput) + tsModel = torch.jit.trace(model, exampleInput) # type: ignore[no-untyped-call] tsModel.save(modelPath / "last.pt") if not modelPath.joinpath("best.pt").exists(): - tsModel = torch.jit.trace(model, exampleInput) + tsModel = torch.jit.trace(model, exampleInput) # type: ignore[no-untyped-call] tsModel.save(modelPath / "best.pt") diff --git a/tasks/image-orientation/src/utils.py b/tasks/image-orientation/src/utils.py index 1fa6c113..e2bae647 100644 --- a/tasks/image-orientation/src/utils.py +++ b/tasks/image-orientation/src/utils.py @@ -27,14 +27,18 @@ def __call__(self, bestLoss: Union[float, torch.Tensor], latestLoss: Union[float def getMeanAndStd(directory: Path) -> tuple[list[float], list[float]]: channelsSum, channelsSquaredSum, numImages = 0, 0, 0 - for image_file in directory.glob("*.png"): - image = np.array(ImageOps.exif_transpose(Image.open(image_file)).convert("RGB"), dtype = np.float32) / 255.0 # Normalize pixel values to [0, 1] + for imagePath in directory.glob("*.png"): + image = ImageOps.exif_transpose(Image.open(imagePath)) + if image is None: + raise ValueError(f"Failed to read 
image {imagePath.name}") - channelsSum += np.mean(image, axis = (0, 1)) - channelsSquaredSum += np.mean(np.square(image), axis = (0, 1)) + imageArray = np.array(image.convert("RGB"), dtype = np.float32) / 255.0 # Normalize pixel values to [0, 1] + + channelsSum += np.mean(imageArray, axis = (0, 1)) + channelsSquaredSum += np.mean(np.square(imageArray), axis = (0, 1)) numImages += 1 - mean = channelsSum / numImages + mean = np.array(channelsSum / numImages) std = np.sqrt(channelsSquaredSum / numImages - np.square(mean)) return mean.tolist(), std.tolist() diff --git a/tasks/image-orientation/src/validation.py b/tasks/image-orientation/src/validation.py index b950b369..8e2fb8d2 100644 --- a/tasks/image-orientation/src/validation.py +++ b/tasks/image-orientation/src/validation.py @@ -56,7 +56,7 @@ def runValidation( taskRun: TaskRun ) -> float: - model = torch.jit.load(modelPath) + model = torch.jit.load(modelPath) # type: ignore[no-untyped-call] model.to(device) model.eval() diff --git a/tasks/image-quality-predictor/.mypy.ini b/tasks/image-quality-predictor/.mypy.ini index 173ed2cc..2496755a 100644 --- a/tasks/image-quality-predictor/.mypy.ini +++ b/tasks/image-quality-predictor/.mypy.ini @@ -20,8 +20,5 @@ allow_redefinition = False # Per-module options: # https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports -[mypy-torch.*] -ignore_missing_imports = True - [mypy-torchvision.*] ignore_missing_imports = True diff --git a/tasks/image-quality-predictor/main.py b/tasks/image-quality-predictor/main.py index 6a52a212..4e354d01 100644 --- a/tasks/image-quality-predictor/main.py +++ b/tasks/image-quality-predictor/main.py @@ -97,12 +97,15 @@ def train(taskRun: TaskRun, dataset: list[tuple[ImageSample, float]]) -> None: # Calculate model accuracy logging.info(">> [ImageQuality] Validating model...") - sampleResultsCsvPath, accuracy = validation.run(modelPath / "best.pt", trainData + validData, transform) + sampleResultsCsvPath, datasetResultsCsvPath, accuracy = validation.run(modelPath / "best.pt", trainData + validData, transform) logging.info(f">> [ImageQuality] Model accuracy: {accuracy:.2f}%") if taskRun.createArtifact(sampleResultsCsvPath, sampleResultsCsvPath.name) is None: logging.error(f">> [ImageQuality] Failed to create artifact \"{sampleResultsCsvPath.name}\"") + if taskRun.createArtifact(datasetResultsCsvPath, datasetResultsCsvPath.name) is None: + logging.error(f">> [ImageQuality] Failed to create artifact \"{datasetResultsCsvPath.name}\"") + logging.info(">> [ImageQuality] Uploading model...") ctxModel = Model.createModel(taskRun.generateEntityName(), taskRun.projectId, accuracy) ctxModel.upload(modelPath) diff --git a/tasks/image-quality-predictor/src/data.py b/tasks/image-quality-predictor/src/data.py index 11fbc68e..f5786d4e 100644 --- a/tasks/image-quality-predictor/src/data.py +++ b/tasks/image-quality-predictor/src/data.py @@ -65,7 +65,7 @@ def loadDataset(artifacts: list[Artifact]) -> list[tuple[ImageSample, float]]: sample.download() sample.unzip() - value = (sample, float(row["total_iou"])) + value = (sample, float(row["total"])) dataset.append(value) return dataset @@ -88,7 +88,11 @@ def __len__(self) -> int: def __getitem__(self, idx: int) -> tuple[Any, float]: sample, quality = self.data[idx] - image = ImageOps.exif_transpose(Image.open(sample.imagePath)).convert("RGB") + image = ImageOps.exif_transpose(Image.open(sample.imagePath)) + if image is None: + raise ValueError(f">> [ImageQuality] Failed to open image {sample.name}") + + image = 
image.convert("RGB") if self.transform: image = self.transform(image) diff --git a/tasks/image-quality-predictor/src/validation.py b/tasks/image-quality-predictor/src/validation.py index 135ca96e..0ccf1f62 100644 --- a/tasks/image-quality-predictor/src/validation.py +++ b/tasks/image-quality-predictor/src/validation.py @@ -38,6 +38,8 @@ def run(modelPath: Path, dataset: list[tuple[ImageSample, float]], transform: tr for sample, quality in dataset: logging.info(f">> [ImageQuality] Validating sample \"{sample.name}\"...") image = ImageOps.exif_transpose(Image.open(sample.imagePath).convert("RGB")) + if image is None: + raise ValueError(f">> [ImageQuality] Failed to open image {sample.name}") if quality == 0: logging.warning("\tSample has quality == 0") diff --git a/tasks/image-segmentation/requirements.txt b/tasks/image-segmentation/requirements.txt index 74d06ead..0b82c337 100644 --- a/tasks/image-segmentation/requirements.txt +++ b/tasks/image-segmentation/requirements.txt @@ -1,7 +1,7 @@ -tensorflow==2.8 +tensorflow numpy matplotlib -protobuf~=3.19.0 +protobuf opencv-python coremltools==6.3.0 coretex diff --git a/tasks/image-segmentation/src/callbacks.py b/tasks/image-segmentation/src/callbacks.py index d60e7d66..0292e5fc 100644 --- a/tasks/image-segmentation/src/callbacks.py +++ b/tasks/image-segmentation/src/callbacks.py @@ -7,14 +7,14 @@ from coretex import currentTaskRun -class DisplayCallback(Callback): +class DisplayCallback(Callback): # type: ignore[misc] def __init__(self, epochs: int) -> None: super().__init__() self.epochs = epochs - def on_epoch_end(self, epoch: int, logs: Optional[dict[str, Any]] = None): + def on_epoch_end(self, epoch: int, logs: Optional[dict[str, Any]] = None) -> None: if logs is None: return diff --git a/tasks/image-segmentation/src/dataset.py b/tasks/image-segmentation/src/dataset.py index 6f265862..eac3314a 100644 --- a/tasks/image-segmentation/src/dataset.py +++ b/tasks/image-segmentation/src/dataset.py @@ -12,9 +12,9 @@ from .utils import hasDotAnnotation -class Augment(tf.keras.layers.Layer): +class Augment(tf.keras.layers.Layer): # type: ignore[misc] - def __init__(self, seed=42): + def __init__(self, seed: int = 42) -> None: super().__init__() self.augmentInputs = RandomFlip( @@ -27,7 +27,7 @@ def __init__(self, seed=42): seed=seed ) - def call(self, inputs, labels): + def call(self, inputs: tf.Tensor, labels: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]: inputs = self.augmentInputs(inputs) labels = self.augmentLabels(labels) diff --git a/tasks/image-segmentation/src/detect.py b/tasks/image-segmentation/src/detect.py index 71b31df5..9f91c96f 100644 --- a/tasks/image-segmentation/src/detect.py +++ b/tasks/image-segmentation/src/detect.py @@ -18,7 +18,12 @@ def run(taskRun: TaskRun, model: KerasModel, dataset: ImageDataset) -> None: sampleData = sample.load() - if hasDotAnnotation(sampleData.annotation): + annotation = sampleData.annotation + if annotation is None: + logging.warning(f">> [Image Segmentation] Sample \"{sample.name}\" (ID: {sample.id}) has no annotation. Skipping Sample") + continue + + if hasDotAnnotation(annotation): logging.warning(f">> [Image Segmentation] Sample \"{sample.name}\" (ID: {sample.id}) has invalid annotation (too few coordinates). 
Skipping Sample") continue diff --git a/tasks/image-segmentation/src/model.py b/tasks/image-segmentation/src/model.py index 3e458331..c528ac53 100644 --- a/tasks/image-segmentation/src/model.py +++ b/tasks/image-segmentation/src/model.py @@ -7,7 +7,7 @@ from keras.applications.mobilenet_v2 import MobileNetV2 -class UpSampler(Sequential): +class UpSampler(Sequential): # type: ignore[misc] def __init__(self, filters: int, size: int): super(UpSampler, self).__init__() diff --git a/tasks/image-segmentation/src/utils.py b/tasks/image-segmentation/src/utils.py index eca7f28a..20f34376 100644 --- a/tasks/image-segmentation/src/utils.py +++ b/tasks/image-segmentation/src/utils.py @@ -28,7 +28,7 @@ def createMask(predictionMask: np.ndarray) -> tf.Tensor: def saveDatasetPredictions(group: str, model: KerasModel, dataset: tf.data.Dataset, classes: ImageDatasetClasses) -> None: predictions = model.predict(dataset) for index, prediction in enumerate(predictions): - mask: np.ndarray = createMask([prediction]).numpy() + mask: np.ndarray = createMask(np.array([prediction])).numpy() coloredMask = np.empty(shape = (mask.shape[0], mask.shape[1], 3)) for h, row in enumerate(mask): diff --git a/tasks/llama2-lora/.mypy.ini b/tasks/llama2-lora/.mypy.ini new file mode 100644 index 00000000..a05e3f3b --- /dev/null +++ b/tasks/llama2-lora/.mypy.ini @@ -0,0 +1,33 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-transformers.*] +ignore_missing_imports = True + +[mypy-datasets.*] +ignore_missing_imports = True + +[mypy-trl.*] +ignore_missing_imports = True + +[mypy-peft.*] +ignore_missing_imports = True diff --git a/tasks/llama2-lora/main.py b/tasks/llama2-lora/main.py index 54a45972..b48d43db 100644 --- a/tasks/llama2-lora/main.py +++ b/tasks/llama2-lora/main.py @@ -31,7 +31,7 @@ def loadData(dataset: CustomDataset) -> Dataset: def runInference(trainer: SFTTrainer, tokenizer: AutoTokenizer, prompt: str) -> str: textGenerator = pipeline(task = "text-generation", model = trainer.model, tokenizer = tokenizer, max_length = 200) output = textGenerator(f"[INST] {prompt} [/INST]") - return output[0]['generated_text'] + return str(output[0]['generated_text']) def main() -> None: diff --git a/tasks/llama2-lora/src/configurations.py b/tasks/llama2-lora/src/configurations.py index 6b8fa5bf..f4503d94 100644 --- a/tasks/llama2-lora/src/configurations.py +++ b/tasks/llama2-lora/src/configurations.py @@ -6,7 +6,7 @@ def getPeftParameters(loraAlpha: float, loraDropout: float, rank: int) -> LoraConfig: return LoraConfig( - lora_alpha = loraAlpha, + lora_alpha = int(loraAlpha), lora_dropout = loraDropout, r = rank, bias = "none", diff --git a/tasks/llama2-lora/src/model.py b/tasks/llama2-lora/src/model.py index 1031130c..ad32407e 100644 --- a/tasks/llama2-lora/src/model.py +++ b/tasks/llama2-lora/src/model.py @@ -17,6 +17,8 @@ def getModelName(modelVersion: str) -> str: if modelVersion == "70b-chat": return "NousResearch/Llama-2-70b-chat-hf" + raise ValueError("Invalid model version") + def loadTokenizer(modelName: str, context: Optional[int] = None) 
-> AutoTokenizer: tokenizer = AutoTokenizer.from_pretrained(modelName, trust_remote_code=True, model_max_length = context) diff --git a/tasks/llm-text-processing/.mypy.ini b/tasks/llm-text-processing/.mypy.ini new file mode 100644 index 00000000..930ede5f --- /dev/null +++ b/tasks/llm-text-processing/.mypy.ini @@ -0,0 +1,24 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-ollama.*] +ignore_missing_imports = True diff --git a/tasks/llm-text-processing/main.py b/tasks/llm-text-processing/main.py index 99657843..df79ad5d 100644 --- a/tasks/llm-text-processing/main.py +++ b/tasks/llm-text-processing/main.py @@ -1,3 +1,5 @@ +from typing import Optional + import time import logging import subprocess @@ -38,7 +40,7 @@ def checkOllamaServer() -> bool: return False -def launchOllamaServer() -> subprocess.Popen[bytes]: +def launchOllamaServer() -> Optional[subprocess.Popen[bytes]]: if not isOllamaInstalled(): installOllama() diff --git a/tasks/model-comparison/.mypy.ini b/tasks/model-comparison/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/model-comparison/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/model-transfer/.mypy.ini b/tasks/model-transfer/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/model-transfer/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/model-transfer/main.py b/tasks/model-transfer/main.py index b74883b0..a1b89ee2 100644 --- a/tasks/model-transfer/main.py +++ b/tasks/model-transfer/main.py @@ -41,7 +41,7 @@ def main() -> None: modelName = model.name logging.info(">> [Coretex] Creating Model...") - destinationModel = Model.createProjectModel( + destinationModel = Model.createModel( modelName, taskRun.parameters["destinationProject"], model.accuracy, diff --git a/tasks/object-detection-yolov10/src/predict.py b/tasks/object-detection-yolov10/src/predict.py index e512d03b..bdcdf95b 100644 --- a/tasks/object-detection-yolov10/src/predict.py +++ b/tasks/object-detection-yolov10/src/predict.py @@ -19,7 +19,7 @@ def classByLabelId(labelId: int, classes: ImageDatasetClasses) -> Optional[Image return classes.classByLabel(classes.labels[labelId]) -def processResult(result: Results, classes: 
list[ImageDatasetClasses], savePath: Path) -> None: +def processResult(result: Results, classes: ImageDatasetClasses, savePath: Path) -> None: fig = plt.figure(num = 1, clear = True) plt.imshow(result.orig_img) @@ -45,11 +45,11 @@ def processResult(result: Results, classes: list[ImageDatasetClasses], savePath: def isSampleValid(sample: ImageSample) -> bool: try: - instances = sample.load().annotation.instances - if instances is None: + annotation = sample.load().annotation + if annotation is None: return False - for instance in instances: + for instance in annotation.instances: if any(len(segmentation) < DIMENSION_THRESHOLD for segmentation in instance.segmentations): return False except Exception as e: diff --git a/tasks/object-detection-yolov8/src/predict.py b/tasks/object-detection-yolov8/src/predict.py index 8d03efd0..854454e8 100644 --- a/tasks/object-detection-yolov8/src/predict.py +++ b/tasks/object-detection-yolov8/src/predict.py @@ -19,7 +19,7 @@ def classByLabelId(labelId: int, classes: ImageDatasetClasses) -> Optional[Image return classes.classByLabel(classes.labels[labelId]) -def processResult(result: Results, classes: list[ImageDatasetClasses], savePath: Path): +def processResult(result: Results, classes: ImageDatasetClasses, savePath: Path) -> None: fig = plt.figure(num = 1, clear = True) plt.imshow(result.orig_img) @@ -45,11 +45,11 @@ def processResult(result: Results, classes: list[ImageDatasetClasses], savePath: def isSampleValid(sample: ImageSample) -> bool: try: - instances = sample.load().annotation.instances - if instances is None: + annotation = sample.load().annotation + if annotation is None: return False - for instance in instances: + for instance in annotation.instances: if any(len(segmentation) < DIMENSION_THRESHOLD for segmentation in instance.segmentations): return False except Exception as e: @@ -59,7 +59,7 @@ def isSampleValid(sample: ImageSample) -> bool: return True -def predictBatch(model: YOLO, dataset: ImageDataset, startIdx: int, endIdx: int, resultPath: Path): +def predictBatch(model: YOLO, dataset: ImageDataset, startIdx: int, endIdx: int, resultPath: Path) -> None: batch = [sample for sample in dataset.samples[startIdx:endIdx] if isSampleValid(sample)] results: Results = model.predict([sample.imagePath for sample in batch], save = True, project = "./results") diff --git a/tasks/ollama-chatbot-fn/.mypy.ini b/tasks/ollama-chatbot-fn/.mypy.ini new file mode 100644 index 00000000..13cd8bc6 --- /dev/null +++ b/tasks/ollama-chatbot-fn/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-faiss.*] +ignore_missing_imports = True + +[mypy-ollama.*] +ignore_missing_imports = True diff --git a/tasks/ollama-chatbot-fn/main.py b/tasks/ollama-chatbot-fn/main.py index 3b07b809..9fdaace2 100644 --- a/tasks/ollama-chatbot-fn/main.py +++ b/tasks/ollama-chatbot-fn/main.py @@ -11,14 +11,14 @@ def copyDir(src: Path, dst: Path, directoryName: str) -> None: shutil.copytree(src, dst / directoryName, copy_function = os.link) -def getIndexPath(dataset: CustomDataset): +def 
getIndexPath(dataset: CustomDataset) -> Path: sample = dataset.samples[0] sample.unzip() return sample.path / "embeddings.index" -def main(): +def main() -> None: taskRun = currentTaskRun() model = Model.createModel(f"{taskRun.id}-rag-chatbot", taskRun.id, 1.0) diff --git a/tasks/ollama-chatbot-fn/resources/function/function.py b/tasks/ollama-chatbot-fn/resources/function/function.py index f64362ff..233a344d 100644 --- a/tasks/ollama-chatbot-fn/resources/function/function.py +++ b/tasks/ollama-chatbot-fn/resources/function/function.py @@ -30,8 +30,8 @@ def response(requestData: dict[str, Any]) -> dict[str, Any]: sessionPath = memoryFolder / f"{sessionId}.json" query = requestData.get("query") - if query == None: - functions.badRequest("Query cannot be empty") + if not isinstance(query, str): + return functions.badRequest("Query cannot be empty") if inputSessionId is None or not sessionPath.exists(): logging.debug(">>> Creating new session") @@ -55,7 +55,7 @@ def response(requestData: dict[str, Any]) -> dict[str, Any]: }] else: with sessionPath.open("r") as file: - messages: list[dict[str, str]] = json.load(file) + messages: list[dict[str, str]] = json.load(file) # type: ignore[no-redef] messages.append({ "role": "user", diff --git a/tasks/ollama-chatbot-fn/resources/function/model.py b/tasks/ollama-chatbot-fn/resources/function/model.py index 38d2a319..4dd3eabd 100644 --- a/tasks/ollama-chatbot-fn/resources/function/model.py +++ b/tasks/ollama-chatbot-fn/resources/function/model.py @@ -1,3 +1,5 @@ +from typing import Optional + import logging import subprocess @@ -50,7 +52,7 @@ def checkOllamaServer() -> bool: return False -def launchOllamaServer() -> subprocess.Popen[bytes]: +def launchOllamaServer() -> Optional[subprocess.Popen[bytes]]: if not isOllamaInstalled(): installOllama() diff --git a/tasks/ollama-chatbot-fn/resources/function/utils.py b/tasks/ollama-chatbot-fn/resources/function/utils.py index 6fc27952..1012f8af 100644 --- a/tasks/ollama-chatbot-fn/resources/function/utils.py +++ b/tasks/ollama-chatbot-fn/resources/function/utils.py @@ -15,6 +15,6 @@ def loadCorpusAndIndex(dirPath: Path) -> tuple[np.ndarray, Any]: return corpus, index -def retrieveDocuments(queryEmbedding: np.ndarray, index: faiss.IndexFlatL2, corpus: np.ndarray, k: int): +def retrieveDocuments(queryEmbedding: np.ndarray, index: faiss.IndexFlatL2, corpus: np.ndarray, k: int) -> list[tuple[str, int]]: distances, indices = index.search(queryEmbedding, k) return [(corpus[i], distances[0][j]) for j, i in enumerate(indices[0])] diff --git a/tasks/ollama-rag-index/.mypy.ini b/tasks/ollama-rag-index/.mypy.ini new file mode 100644 index 00000000..7b6c87aa --- /dev/null +++ b/tasks/ollama-rag-index/.mypy.ini @@ -0,0 +1,30 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-ollama.*] +ignore_missing_imports = True + +[mypy-faiss.*] +ignore_missing_imports = True + +[mypy-fitz.*] +ignore_missing_imports = True diff --git a/tasks/ollama-rag-index/main.py b/tasks/ollama-rag-index/main.py index 9f29f382..4de5e1bd 100644 --- 
a/tasks/ollama-rag-index/main.py +++ b/tasks/ollama-rag-index/main.py @@ -64,10 +64,10 @@ def loadCorpus(dataset: CustomDataset) -> np.ndarray: with open(txtPath, "r") as f: corpus.append(f.read()) - return np.array(corpus) + return np.array(corpus) -def main(): +def main() -> None: taskRun = currentTaskRun() taskRun.dataset.download() diff --git a/tasks/ollama-rag-index/src/model.py b/tasks/ollama-rag-index/src/model.py index 38d2a319..4dd3eabd 100644 --- a/tasks/ollama-rag-index/src/model.py +++ b/tasks/ollama-rag-index/src/model.py @@ -1,3 +1,5 @@ +from typing import Optional + import logging import subprocess @@ -50,7 +52,7 @@ def checkOllamaServer() -> bool: return False -def launchOllamaServer() -> subprocess.Popen[bytes]: +def launchOllamaServer() -> Optional[subprocess.Popen[bytes]]: if not isOllamaInstalled(): installOllama() diff --git a/tasks/ollama-sentiment-analysis-fn/.mypy.ini b/tasks/ollama-sentiment-analysis-fn/.mypy.ini new file mode 100644 index 00000000..9de5e299 --- /dev/null +++ b/tasks/ollama-sentiment-analysis-fn/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-ollama.*] +ignore_missing_imports = True + +[mypy-bs4.*] +ignore_missing_imports = True diff --git a/tasks/ollama-sentiment-analysis-fn/main.py b/tasks/ollama-sentiment-analysis-fn/main.py index 691f7c52..0430e124 100644 --- a/tasks/ollama-sentiment-analysis-fn/main.py +++ b/tasks/ollama-sentiment-analysis-fn/main.py @@ -1,6 +1,6 @@ from coretex import currentTaskRun, Model -def main(): +def main() -> None: taskRun = currentTaskRun() model = Model.createModel(taskRun.generateEntityName(), taskRun.projectId, 1.0, {}) diff --git a/tasks/qiime-alpha-beta-diversity/.mypy.ini b/tasks/qiime-alpha-beta-diversity/.mypy.ini new file mode 100644 index 00000000..9f2db0ac --- /dev/null +++ b/tasks/qiime-alpha-beta-diversity/.mypy.ini @@ -0,0 +1,24 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-py3nvml.*] +ignore_missing_imports = True diff --git a/tasks/qiime-alpha-beta-diversity/main.py b/tasks/qiime-alpha-beta-diversity/main.py index 797ed85a..edd6ee0b 100644 --- a/tasks/qiime-alpha-beta-diversity/main.py +++ b/tasks/qiime-alpha-beta-diversity/main.py @@ -1,3 +1,4 @@ +from typing import Optional from pathlib import Path from zipfile import ZipFile @@ -24,7 +25,7 @@ def diversityCoreMetricsPhylogeneticSample( samplingDepth: int, metadataPath: Path, outputDir: Path, - threads + threads: Optional[int] ) -> Path: phylogenyPath = sample.joinPath("rooted-tree.qza") @@ -54,7 +55,7 @@ def diversityAlphaGroupSignificance( 
outputDataset: CustomDataset, outputPath: Path, taskRun: TaskRun -): +) -> None: ctx_qiime2.diversityAlphaGroupSignificance( str(alphaDiversityPath), @@ -79,7 +80,7 @@ def diversityBetaGroupSignificance( outputDataset: CustomDataset, outputPath: Path, taskRun: TaskRun -): +) -> None: ctx_qiime2.diversityBetaGroupSignificance( str(distanceMatrixPath), @@ -105,7 +106,7 @@ def emperorPlot( outputDataset: CustomDataset, outputPath: Path, taskRun: TaskRun -): +) -> None: ctx_qiime2.emperorPlot( str(pcoaPath), @@ -132,7 +133,7 @@ def diversityAlphaRarefaction( outputDataset: CustomDataset, outputPath: Path, taskRun: TaskRun -): +) -> None: ctx_qiime2.diversityAlphaRarefaction( str(tablePath), str(phylogenyPath), @@ -158,7 +159,7 @@ def processSample( taskRun: TaskRun, outputDataset: CustomDataset, outputDir: Path -): +) -> None: sample.unzip() metadataSample.unzip() diff --git a/tasks/qiime-dada2/.mypy.ini b/tasks/qiime-dada2/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/qiime-dada2/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/qiime-demultiplexing/.mypy.ini b/tasks/qiime-demultiplexing/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/qiime-demultiplexing/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/qiime-import/mypy.ini b/tasks/qiime-import/mypy.ini index 21446fac..c9224ebb 100644 --- a/tasks/qiime-import/mypy.ini +++ b/tasks/qiime-import/mypy.ini @@ -17,3 +17,9 @@ disallow_incomplete_defs = True no_implicit_optional = True strict_optional = True allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-cchardet.*] +ignore_missing_imports = True diff --git a/tasks/qiime-import/src/__init__.py b/tasks/qiime-import/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tasks/qiime-import/src/utils.py b/tasks/qiime-import/src/utils.py index 050f92f0..01a87707 100644 --- a/tasks/qiime-import/src/utils.py +++ b/tasks/qiime-import/src/utils.py @@ -27,7 +27,7 @@ def detectFileEncoding(path: Path) -> Optional[str]: raise ValueError(">> [Qiime: Import] Metadate file is too small") with path.open("rb") as file: - encoding = cchardet.detect(file.read())["encoding"] + encoding: Optional[str] = cchardet.detect(file.read())["encoding"] if encoding is None: logging.warning(">> [Qiime: Import] Could not determine metadata encoding") diff --git a/tasks/qiime-taxonomic-analysis/.mypy.ini b/tasks/qiime-taxonomic-analysis/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/qiime-taxonomic-analysis/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 
+pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/qiime-taxonomic-analysis/main.py b/tasks/qiime-taxonomic-analysis/main.py index 9bc9109e..3dbad1d8 100644 --- a/tasks/qiime-taxonomic-analysis/main.py +++ b/tasks/qiime-taxonomic-analysis/main.py @@ -37,7 +37,7 @@ def processSample( outputDataset: CustomDataset, outputDir: Path, threads: Optional[int] -): +) -> None: sample.unzip() metadataSample.unzip() diff --git a/tasks/qiime-tree-phylogenetic-diversity-analysis/.mypy.ini b/tasks/qiime-tree-phylogenetic-diversity-analysis/.mypy.ini new file mode 100644 index 00000000..d3e1fa10 --- /dev/null +++ b/tasks/qiime-tree-phylogenetic-diversity-analysis/.mypy.ini @@ -0,0 +1,18 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False diff --git a/tasks/s3-model-upload/.mypy.ini b/tasks/s3-model-upload/.mypy.ini index 3682b4f5..ac48fe36 100644 --- a/tasks/s3-model-upload/.mypy.ini +++ b/tasks/s3-model-upload/.mypy.ini @@ -20,3 +20,8 @@ allow_redefinition = False # Per-module options: # https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-boto3.*] +ignore_missing_imports = True + +[mypy-botocore.*] +ignore_missing_imports = True diff --git a/tasks/sql-connector/main.py b/tasks/sql-connector/main.py index 42fff26f..7917bb19 100644 --- a/tasks/sql-connector/main.py +++ b/tasks/sql-connector/main.py @@ -19,18 +19,18 @@ def connectMysqlDatabase(connectionConfig: dict[str, str]) -> CMySQLConnection: try: conn = mysql.connector.connect(**connectionConfig) except mysql.connector.errors.Error as e: - logging.error(f">> [SQL Connector] Error while connecting to database: {e}") + raise mysql.connector.errors.Error(f">> [SQL Connector] Error while connecting to database: {e}") - return conn + return conn # type: ignore[return-value] def connectPostgresqlDatabase(connectionConfig: dict[str, str]) -> connection: logging.info(f">> [SQL Connector] Connecting with PostgreSQL database \"{connectionConfig['database']}\"...") try: - conn = psycopg2.connect(**connectionConfig) + conn: connection = psycopg2.connect(**connectionConfig) # type: ignore[call-overload] except psycopg2._psycopg.Error as e: - logging.error(f">> [SQL Connector] Error while connecting to database: {e}") + raise psycopg2._psycopg.Error(f">> [SQL Connector] Error while connecting to database: {e}") return conn @@ -42,28 +42,26 @@ def fetchAllData(conn: Union[CMySQLConnection, connection], dataset: CustomDatas if len(tables) < 1: raise RuntimeError("There are no tables in the database") - tables = [table[0] for table in tables] - - for table in tables: + for table in [table[0] for table in tables]: tableData: list[dict[str, str]] = [] - cursor.execute(queryGetRows + f"'{table}'") + cursor.execute(queryGetRows + f"'{table}'") # type: ignore[str-bytes-safe] columnNames = list(cursor.fetchall()) columnNames = 
[name[0] for name in columnNames] - cursor.execute(f"SELECT * FROM {table}") + cursor.execute(f"SELECT * FROM {table}") # type: ignore[str-bytes-safe] rows = list(cursor.fetchall()) for row in rows: - tableData.append(dict(zip(columnNames, list(row)))) + tableData.append(dict(zip(columnNames, list(row)))) # type: ignore[arg-type] - sampleNameCsv = f"{table}.csv" + sampleNameCsv = f"{table}.csv" # type: ignore[str-bytes-safe] with open(sampleNameCsv, "w", newline = "") as file: writer = csv.DictWriter(file, fieldnames = columnNames) writer.writeheader() - writer.writerows(tableData) + writer.writerows(tableData) # type: ignore[arg-type] - sampleNameZip = f"{table}.zip" + sampleNameZip = f"{table}.zip" # type: ignore[str-bytes-safe] with zipfile.ZipFile(sampleNameZip, "w") as zipFile: zipFile.write(sampleNameCsv) @@ -92,23 +90,23 @@ def main() -> None: } if databaseType == "MySQL": - conn: Union[CMySQLConnection, connection] = connectMysqlDatabase(connectionConfig) + mySqlConn = connectMysqlDatabase(connectionConfig) - if conn.is_connected(): + if mySqlConn.is_connected(): dataset = CustomDataset.createDataset(f"{taskRun.id}-{database}", taskRun.projectId) queryGetTables = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{database}'" queryGetRows = f"SELECT column_name FROM information_schema.columns WHERE table_schema = '{database}' AND table_name = " - fetchAllData(conn, dataset, queryGetTables, queryGetRows) + fetchAllData(mySqlConn, dataset, queryGetTables, queryGetRows) else: logging.warning(">> [SQL Connector] Problem with the database connection") elif databaseType == "PostgreSQL": - conn = connectPostgresqlDatabase(connectionConfig) + postgreSqlConn = connectPostgresqlDatabase(connectionConfig) - if conn: + if postgreSqlConn: dataset = CustomDataset.createDataset(f"{taskRun.id}-{database}", taskRun.projectId) queryGetTables = f"SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'" queryGetRows = f"SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = " - fetchAllData(conn, dataset, queryGetTables, queryGetRows) + fetchAllData(postgreSqlConn, dataset, queryGetTables, queryGetRows) else: logging.warning(">> [SQL Connector] Problem with the database connection") diff --git a/tasks/stable-diffusion-fn/.mypy.ini b/tasks/stable-diffusion-fn/.mypy.ini new file mode 100644 index 00000000..c5af1153 --- /dev/null +++ b/tasks/stable-diffusion-fn/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-diffusers.*] +ignore_missing_imports = True + +[mypy-torch.*] +ignore_missing_imports = True diff --git a/tasks/stable-diffusion-fn/main.py b/tasks/stable-diffusion-fn/main.py index e4164c35..b87ad7a4 100644 --- a/tasks/stable-diffusion-fn/main.py +++ b/tasks/stable-diffusion-fn/main.py @@ -3,7 +3,7 @@ from coretex import currentTaskRun, Model -def main(): +def main() -> None: taskRun = currentTaskRun() logging.info(">> [StableDiffusion] Creating Coretex model") diff --git 
a/tasks/stable-diffusion/.mypy.ini b/tasks/stable-diffusion/.mypy.ini new file mode 100644 index 00000000..c5af1153 --- /dev/null +++ b/tasks/stable-diffusion/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-diffusers.*] +ignore_missing_imports = True + +[mypy-torch.*] +ignore_missing_imports = True diff --git a/tasks/stable-diffusion/main.py b/tasks/stable-diffusion/main.py index 70decc4b..c3c45dc7 100644 --- a/tasks/stable-diffusion/main.py +++ b/tasks/stable-diffusion/main.py @@ -18,9 +18,9 @@ def loadModel(device: str) -> StableDiffusionPipeline: dtype = torch.float16 if device == "cuda" else torch.float32 - pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype = dtype) - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - return pipe.to(device) + pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype = dtype) # type: ignore[no-untyped-call] + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) # type: ignore[no-untyped-call] + return pipe.to(device) # type: ignore[no-any-return] def getDefault(taskRun: TaskRun, name: str, default: Any) -> Any: @@ -53,14 +53,14 @@ def generateImages( # Create an array equal to number of input prompts negativePrompts = [negativePrompt] * len(prompts) - images = model( + images = model( # type: ignore[operator] prompts, negative_prompt = negativePrompts, num_inference_steps = steps, width = width, height = height, seed = seed - ).images # type: ignore + ).images imagePaths: list[Path] = [] diff --git a/tasks/synthetic-image-generator/.mypy.ini b/tasks/synthetic-image-generator/.mypy.ini index 3fe41755..d3e1fa10 100644 --- a/tasks/synthetic-image-generator/.mypy.ini +++ b/tasks/synthetic-image-generator/.mypy.ini @@ -16,18 +16,3 @@ disallow_incomplete_defs = True no_implicit_optional = True strict_optional = True allow_redefinition = False - - -# Per-module options: -# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports -[mypy-shapely.*] -ignore_missing_imports = True - -[mypy-pytesseract.*] -ignore_missing_imports = True - -[mypy-easyocr.*] -ignore_missing_imports = True - -[mypy-transformers.*] -ignore_missing_imports = True diff --git a/tasks/synthetic-image-generator/main.py b/tasks/synthetic-image-generator/main.py index b6decbf0..62c56d1d 100644 --- a/tasks/synthetic-image-generator/main.py +++ b/tasks/synthetic-image-generator/main.py @@ -1,3 +1,4 @@ +from typing import Any, Optional from pathlib import Path from contextlib import ExitStack from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, Future @@ -19,7 +20,11 @@ def getRandomSamples(dataset: ImageDataset, count: int) -> list[ImageSample]: return [dataset.samples[i] for i in indexes] -def didGenerateSample(dataset: ImageDataset, future: Future[tuple[Path, CoretexImageAnnotation]]) -> None: +def didGenerateSample( + dataset: ImageDataset, + future: Future[tuple[Path, CoretexImageAnnotation, Optional[dict[str, Any]]]] +) -> None: + try: imagePath, annotation, 
metadata = future.result() generatedSample = dataset.add(imagePath) diff --git a/tasks/synthetic-image-generator/src/sample_generator.py b/tasks/synthetic-image-generator/src/sample_generator.py index df84db11..14bc27b4 100644 --- a/tasks/synthetic-image-generator/src/sample_generator.py +++ b/tasks/synthetic-image-generator/src/sample_generator.py @@ -52,6 +52,8 @@ def generateSample( image = Image.fromarray(data.image) backgroundImage = ImageOps.exif_transpose(Image.open(backgroundImagePath)) + if backgroundImage is None: + raise ValueError(f"Failed to open background image. ID: {backgroundImagePath.parent.name}") # Resize image parentAnnotationWidth = int(backgroundImage.width * random.uniform(minImageSize, maxImageSize)) diff --git a/tasks/tabular-data-diagnostics/.mypy.ini b/tasks/tabular-data-diagnostics/.mypy.ini new file mode 100644 index 00000000..ebb9536e --- /dev/null +++ b/tasks/tabular-data-diagnostics/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-sklearn.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True diff --git a/tasks/tabular-data-diagnostics/main.py b/tasks/tabular-data-diagnostics/main.py index bf85c228..947ee3d5 100644 --- a/tasks/tabular-data-diagnostics/main.py +++ b/tasks/tabular-data-diagnostics/main.py @@ -9,8 +9,8 @@ from src.dataset import extractTestTrainData, loadDataset -def saveModel(taskRun: TaskRun[CustomDataset], accuracy: float, trainColumnCount: int, labels: list[str]): - model = Model.createModel(taskRun.generateEntityName(), taskRun.projectId, accuracy, {}) +def saveModel(taskRun: TaskRun[CustomDataset], accuracy: float, trainColumnCount: int, labels: list[str]) -> None: + model = Model.createModel(taskRun.generateEntityName(), taskRun.id, accuracy, {}) modelPath = folder_manager.temp / "model" model.saveModelDescriptor(modelPath, { diff --git a/tasks/tabular-data-diagnostics/src/dataset.py b/tasks/tabular-data-diagnostics/src/dataset.py index e42945d2..2bd3b0f4 100644 --- a/tasks/tabular-data-diagnostics/src/dataset.py +++ b/tasks/tabular-data-diagnostics/src/dataset.py @@ -5,7 +5,7 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import LabelEncoder -from pandas import DataFrame +from pandas import DataFrame, Series import pandas as pd @@ -68,14 +68,14 @@ def loadDataset(coretexDataset: CustomDataset, validationSplit: float, labelColu return train, test, labels -def extractXY(df: DataFrame, labelColumn: str) -> tuple[DataFrame, DataFrame]: +def extractXY(df: DataFrame, labelColumn: str) -> tuple[DataFrame, Series]: x = df.drop(labelColumn, axis = 1) y = df[labelColumn] return x, y -def extractTestTrainData(train: DataFrame, test: DataFrame, labelColumn: str) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]: +def extractTestTrainData(train: DataFrame, test: DataFrame, labelColumn: str) -> tuple[DataFrame, DataFrame, Series, Series]: logging.info(f">> [Tabular Data Diagnostics] Extracting test and train data...") xTrain, 
yTrain = extractXY(train, labelColumn) xTest, yTest = extractXY(test, labelColumn) diff --git a/tasks/translation-ollama/.mypy.ini b/tasks/translation-ollama/.mypy.ini new file mode 100644 index 00000000..b2b4f473 --- /dev/null +++ b/tasks/translation-ollama/.mypy.ini @@ -0,0 +1,27 @@ +# Global options: + +[mypy] +python_version = 3.9 +pretty = True +warn_return_any = True +warn_no_return = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True +warn_unreachable = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_untyped_defs = True +disallow_incomplete_defs = True +no_implicit_optional = True +strict_optional = True +allow_redefinition = False + + +# Per-module options: +# https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports +[mypy-ollama.*] +ignore_missing_imports = True + +[mypy-fitz.*] +ignore_missing_imports = True diff --git a/tasks/translation-ollama/main.py b/tasks/translation-ollama/main.py index 09b23f25..d57bbccf 100644 --- a/tasks/translation-ollama/main.py +++ b/tasks/translation-ollama/main.py @@ -13,7 +13,7 @@ def readPDF(filePath: Path) -> list[str]: pagesText: list[str] = [] - + with fitz.open(filePath) as doc: for page in doc: paragraphs = page.get_text().split("\n") @@ -24,14 +24,14 @@ def readPDF(filePath: Path) -> list[str]: def loadCorpus(dataset: CustomDataset) -> list[list[str]]: corpus: list[list[str]] = [] - + for sample in dataset.samples: sample.unzip() pdfPaths = list(sample.path.rglob("*.pdf")) if len(pdfPaths) == 0: raise ValueError(">> [LLM Translate] The provided dataset does not contain any .pdf documents") - + for pdfPath in pdfPaths: if not "__MACOSX" in str(pdfPath): corpus.append(readPDF(pdfPath)) @@ -45,13 +45,13 @@ def main() -> None: dataset.download() launchOllamaServer() - + logging.info(">> [OllamaRAG] Pulling model") pullModel(LLM) - + logging.info(">> [OllamaRAG] Loading text corpus") corpus = loadCorpus(taskRun.dataset) - + translatedDataset = CustomDataset.createDataset(f"{taskRun.id}-translated", taskRun.projectId) language = taskRun.parameters["language"] @@ -59,7 +59,7 @@ def main() -> None: for counter, document in enumerate(corpus, start = 1): document = [x.strip() for x in document] document = [line for line in document if line != ""] - + translatedText = "" for paragraph in document: logging.info(">> [OllamaRAG] Translating paragraph") @@ -70,10 +70,10 @@ def main() -> None: "role": "user", "content": query } - response = ollama.chat(model = LLM, messages = [msg]) + response = ollama.chat(model = LLM, messages = [msg]) # type: ignore[list-item] answer = response["message"]["content"] translatedText += answer + "\n" - + txtFileName = f"file-{counter}.txt" txtFile = folder_manager.temp / txtFileName with open(txtFile, "w") as f: @@ -83,7 +83,7 @@ def main() -> None: zipFile = folder_manager.temp / zipFileName with zipfile.ZipFile(zipFile, "w") as zf: zf.write(txtFile, txtFileName) - + translatedDataset.add(zipFile) taskRun.submitOutput("translatedDataset", translatedDataset)