From 6acc24093ff648265f20cc7aa93d8ff3583b9800 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 4 Dec 2024 22:18:43 -0500 Subject: [PATCH] Issue 260 classification content (#261) * Add content to vcv and rcv classification. Fix bug where 'classifications' were being included in the disassembled rcv_accession as well as in their own table * Add content column to vcv/rcv classification bq schemas --- .../rcv_accession_classification.bq.json | 4 ++++ .../variation_archive_classification.bq.json | 4 ++++ .../cloud/bigquery/processing_history.py | 2 +- clinvar_ingest/model/variation_archive.py | 16 ++++++++++++---- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/clinvar_ingest/cloud/bigquery/bq_json_schemas/rcv_accession_classification.bq.json b/clinvar_ingest/cloud/bigquery/bq_json_schemas/rcv_accession_classification.bq.json index 68a0764..6f8d7db 100644 --- a/clinvar_ingest/cloud/bigquery/bq_json_schemas/rcv_accession_classification.bq.json +++ b/clinvar_ingest/cloud/bigquery/bq_json_schemas/rcv_accession_classification.bq.json @@ -34,5 +34,9 @@ { "name": "clinical_impact_clinical_significance", "type": "STRING" + }, + { + "name": "content", + "type": "STRING" } ] diff --git a/clinvar_ingest/cloud/bigquery/bq_json_schemas/variation_archive_classification.bq.json b/clinvar_ingest/cloud/bigquery/bq_json_schemas/variation_archive_classification.bq.json index 6540b8b..c16bd07 100644 --- a/clinvar_ingest/cloud/bigquery/bq_json_schemas/variation_archive_classification.bq.json +++ b/clinvar_ingest/cloud/bigquery/bq_json_schemas/variation_archive_classification.bq.json @@ -42,5 +42,9 @@ { "name": "clinical_impact_clinical_significance", "type": "STRING" + }, + { + "name": "content", + "type": "STRING" } ] diff --git a/clinvar_ingest/cloud/bigquery/processing_history.py b/clinvar_ingest/cloud/bigquery/processing_history.py index e063c2f..ad42710 100644 --- a/clinvar_ingest/cloud/bigquery/processing_history.py +++ b/clinvar_ingest/cloud/bigquery/processing_history.py @@ -252,7 +252,7 @@ def write_started( # noqa: PLR0913 f"release_tag={release_tag}, bucket_dir={bucket_dir}" ) _logger.warning( - f"Expected 0 rows to exist for the started event, but found {row.c}." + f"Expected 0 rows to exist for the started event, but found {row.c}. " f"file_type={file_type}, release_date={release_date}, " f"release_tag={release_tag}, bucket_dir={bucket_dir}" ) diff --git a/clinvar_ingest/model/variation_archive.py b/clinvar_ingest/model/variation_archive.py index 0a8dda9..f0ba28b 100644 --- a/clinvar_ingest/model/variation_archive.py +++ b/clinvar_ingest/model/variation_archive.py @@ -787,9 +787,11 @@ class RcvAccessionClassification(Model): clinical_impact_assertion_type: str clinical_impact_clinical_significance: str + content: dict + @staticmethod def jsonifiable_fields() -> list[str]: - return [] + return ["content"] def __post_init__(self): self.entity_type = "rcv_accession_classification" @@ -802,7 +804,9 @@ def from_xml_single(inp: dict, statement_type: StatementType, rcv_id: str): or OncogenicityClassification entry. The statement_type is the key from the original `Classifications` XML/dict, indicating the type. """ - raw_description = extract(inp, "Description") + # TODO is there a chance they add fields to Description? Maybe don't extract. + # raw_description = extract(inp, "Description") + raw_description = get(inp, "Description") or {} return RcvAccessionClassification( rcv_id=rcv_id, statement_type=statement_type, @@ -819,6 +823,7 @@ def from_xml_single(inp: dict, statement_type: StatementType, rcv_id: str): raw_description, "@ClinicalImpactClinicalSignificance", ), + content=inp, ) @staticmethod @@ -945,7 +950,7 @@ def disassemble(self): yield from c.disassemble() del self_copy.classifications - yield self + yield self_copy @dataclasses.dataclass @@ -965,9 +970,11 @@ class VariationArchiveClassification(Model): clinical_impact_assertion_type: str clinical_impact_clinical_significance: str + content: dict + @staticmethod def jsonifiable_fields() -> list[str]: - return [] + return ["content"] def __post_init__(self): self.entity_type = "variation_archive_classification" @@ -998,6 +1005,7 @@ def from_xml_single(inp: dict, statement_type: StatementType, vcv_id: str): interp_description, "@ClinicalImpactClinicalSignificance", ), + content=inp, ) @staticmethod