Skip to content

Commit

Permalink
Add VCF create and register functions (#457)
Browse files Browse the repository at this point in the history
  • Loading branch information
gspowley authored Sep 7, 2023
1 parent 4fbddae commit 5bc791d
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/tiledb/cloud/vcf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from .allele_frequency import read_allele_frequency
from .ingestion import Contigs
from .ingestion import create_dataset_udf as create_dataset
from .ingestion import ingest
from .ingestion import register_dataset_udf as register_dataset
from .query import build_read_dag
from .query import read
from .utils import bgzip_and_index
Expand All @@ -12,7 +14,9 @@

__all__ = [
"Contigs",
"create_dataset",
"ingest",
"register_dataset",
"build_read_dag",
"read",
"read_allele_frequency",
Expand Down
65 changes: 65 additions & 0 deletions src/tiledb/cloud/vcf/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,58 @@ def create_dataset_udf(
return dataset_uri


def register_dataset_udf(
    dataset_uri: str,
    *,
    register_name: str,
    namespace: Optional[str] = None,
    config: Optional[Mapping[str, Any]] = None,
    verbose: bool = False,
) -> None:
    """
    Register the dataset on TileDB Cloud.

    The target location is ``tiledb://{namespace}/{register_name}``. If a
    group is already registered there, this is logged and nothing else is
    done; otherwise the dataset is registered as a group.

    :param dataset_uri: dataset URI
    :param register_name: name to register the dataset with on TileDB Cloud
    :param namespace: TileDB Cloud namespace, defaults to the user's default namespace
    :param config: config dictionary, defaults to None
    :param verbose: verbose logging, defaults to False
    :raises ValueError: if an object that is not a group is already
        registered at the target location
    """

    logger = get_logger_wrapper(verbose)

    namespace = namespace or tiledb.cloud.user_profile().default_namespace_charged
    tiledb_uri = f"tiledb://{namespace}/{register_name}"

    with tiledb.scope_ctx(config):
        # Keep the try body limited to the lookup so the "Bad namespace?" hint
        # is logged only when the lookup itself fails, and not for the
        # ValueError raised further down.
        try:
            object_type = tiledb.object_type(tiledb_uri)
        except Exception:
            # tiledb.object_type raises an exception if the namespace does not exist
            logger.error(
                "Error checking if %r is registered. Bad namespace?", tiledb_uri
            )
            raise

        if object_type == "group":
            logger.info("Dataset already registered at %r.", tiledb_uri)
            return

        if object_type is not None:
            # Something other than a group occupies the name; do not overwrite.
            raise ValueError(
                f"Another object is already registered at '{tiledb_uri}'."
            )

        logger.info("Registering dataset at %r.", tiledb_uri)
        tiledb.cloud.groups.register(
            dataset_uri,
            name=register_name,
            namespace=namespace,
        )


def read_uris_udf(
dataset_uri: str,
list_uri: str,
Expand Down Expand Up @@ -1380,6 +1432,7 @@ def ingest(
*,
config=None,
namespace: Optional[str] = None,
register_name: Optional[str] = None,
search_uri: Optional[str] = None,
pattern: Optional[str] = None,
ignore: Optional[str] = None,
Expand Down Expand Up @@ -1414,6 +1467,8 @@ def ingest(
:param dataset_uri: dataset URI
:param config: config dictionary, defaults to None
:param namespace: TileDB-Cloud namespace, defaults to None
:param register_name: name to register the dataset with on TileDB Cloud,
defaults to None
:param search_uri: URI to search for VCF files, defaults to None
:param pattern: Unix shell style pattern to match when searching for VCF files,
defaults to None
Expand Down Expand Up @@ -1520,4 +1575,14 @@ def ingest(
consolidate_stats=consolidate_stats,
)

# Register the dataset on TileDB Cloud
if register_name:
register_dataset_udf(
dataset_uri,
namespace=namespace,
register_name=register_name,
config=config,
verbose=verbose,
)

return dag, sample_uris

0 comments on commit 5bc791d

Please sign in to comment.