Allow duplicated columns

Allows the single accession column to be duplicated so that it represents both accession and strain, ensuring that the final tree has strain names.
nextstrain · May 28, 2024 · e2b0886 · e2b0886
1 parent f2fcf37
commit e2b0886
Showing 1 changed file with 31 additions and 19 deletions.
diff --git a/phylogenetic/scripts/curate_private_data.py b/phylogenetic/scripts/curate_private_data.py
@@ -43,42 +43,54 @@ def convert(accession: str, k: str, v: Any) -> str:
         return f"{v.year}-{v.month:02}-{v.day:02}"
     return str(v)
 
-def parse_excel(fname: str, remap: dict[str, str]) -> tuple[Metadata, MetadataHeader]:
+
+def column_map(names: tuple[str], remap: list[tuple[str, str]]) -> list[tuple[str, str]]:
+    remap_idx_used = []
+    columns = []
+    for name in names:
+        # any  matching renames / duplications?
+        changes = [(idx, name_map) for idx, name_map in enumerate(remap) if name_map[0]==name.lower()]
+        if len(changes):
+            for idx, name_map in changes:
+                remap_idx_used.append(idx)
+                columns.append((name, name_map[1]))
+        else:
+            columns.append((name, name.lower()))
+
+    assert len(set([n[1] for n in columns]))==len(columns), "Requested column names aren't unique!"
+
+    for i,name_map in enumerate(remap):
+        if i not in remap_idx_used:
+            print(f"WARNING: You asked to remap column {name_map[0]!r} but that column doesn't exist!")
+    return columns
+
+def parse_excel(fname: str, remap: list[tuple[str, str]]) -> tuple[Metadata, MetadataHeader]:
     workbook = load_workbook(filename=fname)
     worksheet = workbook.active
     n_rows = 0
 
     rows = worksheet.values # type: ignore
     assert rows is not None, f"The metadata file {fname!r} seemed to be empty!"
 
-    column_names: tuple[Any] = next(rows) # type: ignore
-    new_column_names: MetadataHeader = list([str(n).lower() for n in column_names])
-    column_names_lower = new_column_names[:]
-
-    for old_name, new_name in remap.items():
-        if old_name not in column_names_lower:
-            print(f"WARNING: You asked to remap column {old_name!r} but it's not found in {fname!r}")
-            continue
-        new_column_names[column_names_lower.index(old_name)] = new_name
+    existing_column_names: tuple[str] = next(rows) # type: ignore
+    column_names = column_map(existing_column_names, remap)
 
     for name in REQUIRED_COLUMNS:
-        assert name in new_column_names, f"Metadata didn't have an {name!r} column (after column names were remapped)"
+        assert name in [c[1] for c in column_names], f"Metadata didn't have an {name!r} column (after column names were remapped)"
     for name in RECOMMENDED_COLUMNS:
-        if name not in new_column_names:
+        if name not in [c[1] for c in column_names]:
             print(f"Warning: Metadata didn't have an {name!r} column (after column names were remapped) which is recommended ")
 
-    accession_idx = new_column_names.index(ACCESSION)
-    # accessions = set()
+    accession_idx = [c[1] for c in column_names].index(ACCESSION)
 
     metadata: Metadata = {}
     for row in rows:
         n_rows+=1
         accession = str(row[accession_idx])
-        # accessions.add(accession)
-        metadata[accession] = {k:convert(accession, k,v) for k,v in zip(new_column_names, row)}
+        metadata[accession] = {new_name:convert(accession, new_name, row[existing_column_names.index(old_name)]) for old_name,new_name in column_names}
 
     print(f"Parsed {n_rows} metadata rows (excluding header) from xlsx file")
-    return (metadata, new_column_names)
+    return (metadata, [c[1] for c in column_names])
 
 
 def compare_ids(sequences: Sequences, metadata: Metadata) -> tuple[Sequences, Metadata]:
@@ -141,9 +153,9 @@ def write_metadata(fname: str|None, metadata: Metadata, header: MetadataHeader)
         for _, value in metadata.items():
             print("\t".join([value[field] for field in header]), file=fh)
 
-def parse_remap_columns(arg: list[str]) -> dict[str, str]:
+def parse_remap_columns(arg: list[str]) -> list[tuple[str, str]]:
     try:
-        return {x[0].lower():x[1].lower() for x in [a.split(':') for a in arg]}
+        return [(x[0].lower(),x[1].lower()) for x in [a.split(':') for a in arg]]
     except:
         print("Error while parsing the remap-columns argument. Each entry must be two column names with a ':' between them.")
         print("For instance: \"--remap-columns 'collection date:date' 'province:division'\"")