coretex-ai · igorperic17 · Sep 13, 2023 · Sep 13, 2023 · Sep 13, 2023 · Sep 13, 2023
diff --git a/coretex/bioinformatics/sequence_alignment/__init__.py b/coretex/bioinformatics/sequence_alignment/__init__.py
@@ -19,7 +19,7 @@
 from pathlib import Path
 
 import subprocess
-import logging
+import os
 
 from ..utils import command, logProcessOutput, CommandException
 from ...coretex import CustomDataset
@@ -61,7 +61,14 @@ def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:
     ])
 
 
-def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None:
+def alignCommand(
+    bwaPath: Path,
+    prefix: Path,
+    sequencePath: Path,
+    outputPath: Path,
+    multithreading: bool = True
+) -> None:
+
     """
         This function acts as a wrapper for the mem command of BWA
         (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read
@@ -92,15 +99,29 @@ def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Pa
 
     args = [
         str(bwaPath.absolute()), "mem",
-        "-o", str(outputPath.absolute()),
+        "-o", str(outputPath.absolute())
+    ]
+
+    if multithreading:
+        threads = os.cpu_count()
+        if threads is not None:
+            args.extend(["-t", str(threads)])
+
+    args.extend([
         str(prefix.absolute()),
         str(sequencePath.absolute())
-    ]
+    ])
 
     command(args, True)
 
 
-def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
+def sam2bamCommand(
+    samtoolsPath: Path,
+    samPath: Path,
+    outputPath: Path,
+    multithreading: bool = True
+) -> None:
+
     """
         This function uses the CLI tool "samtools" to convert SAM files into their binary
         version, BAM.
@@ -125,13 +146,23 @@ def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
         Link to samtools: http://htslib.org/
     """
 
-    command([
+    args = [
         str(samtoolsPath.absolute()), "view",
-        "-b", "-S", "-o",
-        str(outputPath.absolute()),
+        "-b", "-S"
+    ]
+
+    if multithreading:
+        threads = os.cpu_count()
+        if threads is not None:
+            args.extend(["--threads", str(threads - 1)])
+
+    args.extend([
+        "-o", str(outputPath.absolute()),
         str(samPath.absolute())
     ])
 
+    command(args)
+
 
 def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]:
     """

diff --git a/coretex/coretex/dataset/network_dataset.py b/coretex/coretex/dataset/network_dataset.py
@@ -112,8 +112,7 @@ def createDataset(
     ) -> Optional[Self]:
 
         """
-            Creates a new dataset with the provided name, type
-            and samples (if present, samples are not required)
+            Creates a new dataset with the provided name and type
 
             Parameters
             ----------

diff --git a/coretex/coretex/dataset/sequence_dataset/sequence_dataset.py b/coretex/coretex/dataset/sequence_dataset/sequence_dataset.py
@@ -15,10 +15,15 @@
 #     You should have received a copy of the GNU Affero General Public License
 #     along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from typing import Dict
+from typing import Dict, Optional, Any, Union
+from typing_extensions import Self
+from pathlib import Path
+
+import logging
 
 from .base import BaseSequenceDataset
 from ..network_dataset import NetworkDataset
+from ..custom_dataset import CustomDataset
 from ...sample import SequenceSample, CustomSample
 from ....codable import KeyDescriptor
 
@@ -50,6 +55,58 @@ def onDecode(self) -> None:
             if sample.id != self.metadata.id
         ]
 
+    @classmethod
+    def createSequenceDataset(
+        cls,
+        name: str,
+        spaceId: int,
+        metadataPath: Union[Path, str],
+        meta: Optional[Dict[str, Any]] = None
+    ) -> Optional[Self]:
+
+        """
+            Creates a new sequence dataset with the provided name and metadata
+
+            Parameters
+            ----------
+            name : str
+                dataset name
+            spaceId : int
+                space for which the dataset will be created
+            metadataPath : Union[Path, str]
+                path to the zipped metadata file
+
+            Returns
+            -------
+            The created sequence dataset object or None if creation failed
+
+            Example
+            -------
+            >>> from coretex import SequenceDataset
+            \b
+            >>> dummyDataset = SequenceDataset.createSequenceDataset("dummyDataset", 123, pathToMetadata)
+            >>> if dummyDataset is not None:
+                    print("Dataset created successfully")
+        """
+
+        if isinstance(metadataPath, str):
+            metadataPath = Path(metadataPath)
+
+        dataset = CustomDataset.createDataset(name, spaceId, meta)
+        if dataset is None:
+            return None
+
+        if CustomSample.createCustomSample(
+            "_metadata",
+            dataset.id,
+            metadataPath
+        ) is None:
+
+            logging.getLogger("coretexpylib").warning(">> [Coretex] Failed to create _metadata sample")
+            return None
+
+        return cls.fetchById(dataset.id)
+
     def download(self, ignoreCache: bool = False) -> None:
         super().download(ignoreCache)
 
@@ -80,4 +137,3 @@ def isPairedEnd(self) -> bool:
             return False
 
         raise ValueError(">> [Coretex] Dataset contains a mix of paired-end and single-end sequences. It should contain either one or the other")
-