Skip to content

Commit

Permalink
Allow duplicated columns
Browse files Browse the repository at this point in the history
Allows the single accession column to be duplicated so that it represents both accession and strain, which means the final tree has strain names.
  • Loading branch information
jameshadfield committed May 28, 2024
1 parent f2fcf37 commit e2b0886
Showing 1 changed file with 31 additions and 19 deletions.
50 changes: 31 additions & 19 deletions phylogenetic/scripts/curate_private_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,42 +43,54 @@ def convert(accession: str, k: str, v: Any) -> str:
return f"{v.year}-{v.month:02}-{v.day:02}"
return str(v)

def parse_excel(fname: str, remap: dict[str, str]) -> tuple[Metadata, MetadataHeader]:

def column_map(names: tuple[str], remap: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """
    Pair every existing column name with the name(s) it should be given in the output.

    Each entry of *remap* is an ``(old_name, new_name)`` pair, where ``old_name``
    is compared against the lower-cased existing column name. An existing column
    which matches several entries is emitted once per entry, which is how a
    single column can be duplicated under different names. A column matching no
    entry keeps its own name, lower-cased.

    Returns a list of ``(existing_name, new_name)`` tuples, where ``existing_name``
    is the untouched value from *names*. Columns appear in their order in *names*,
    and duplicates of one column appear in their order in *remap*.

    Raises ``AssertionError`` if the resulting new names are not unique.
    Prints a warning for each *remap* entry whose ``old_name`` matched no column.
    """
    used_remap_indices: set[int] = set()
    columns = []
    for name in names:
        # Header values are stringified before lower-casing so that a non-string
        # header cell does not raise AttributeError.
        # NOTE(review): presumably the spreadsheet reader can yield None / numbers here -- confirm.
        name_lower = str(name).lower()
        # any matching renames / duplications?
        matches = [(idx, new_name) for idx, (old_name, new_name) in enumerate(remap) if old_name == name_lower]
        if matches:
            for idx, new_name in matches:
                used_remap_indices.add(idx)
                columns.append((name, new_name))
        else:
            columns.append((name, name_lower))

    assert len({new_name for _, new_name in columns}) == len(columns), "Requested column names aren't unique!"

    for idx, (old_name, _) in enumerate(remap):
        if idx not in used_remap_indices:
            print(f"WARNING: You asked to remap column {old_name!r} but that column doesn't exist!")
    return columns

def parse_excel(fname: str, remap: list[tuple[str, str]]) -> tuple[Metadata, MetadataHeader]:
workbook = load_workbook(filename=fname)
worksheet = workbook.active
n_rows = 0

rows = worksheet.values # type: ignore
assert rows is not None, f"The metadata file {fname!r} seemed to be empty!"

column_names: tuple[Any] = next(rows) # type: ignore
new_column_names: MetadataHeader = list([str(n).lower() for n in column_names])
column_names_lower = new_column_names[:]

for old_name, new_name in remap.items():
if old_name not in column_names_lower:
print(f"WARNING: You asked to remap column {old_name!r} but it's not found in {fname!r}")
continue
new_column_names[column_names_lower.index(old_name)] = new_name
existing_column_names: tuple[str] = next(rows) # type: ignore
column_names = column_map(existing_column_names, remap)

for name in REQUIRED_COLUMNS:
assert name in new_column_names, f"Metadata didn't have an {name!r} column (after column names were remapped)"
assert name in [c[1] for c in column_names], f"Metadata didn't have an {name!r} column (after column names were remapped)"
for name in RECOMMENDED_COLUMNS:
if name not in new_column_names:
if name not in [c[1] for c in column_names]:
print(f"Warning: Metadata didn't have an {name!r} column (after column names were remapped) which is recommended ")

accession_idx = new_column_names.index(ACCESSION)
# accessions = set()
accession_idx = [c[1] for c in column_names].index(ACCESSION)

metadata: Metadata = {}
for row in rows:
n_rows+=1
accession = str(row[accession_idx])
# accessions.add(accession)
metadata[accession] = {k:convert(accession, k,v) for k,v in zip(new_column_names, row)}
metadata[accession] = {new_name:convert(accession, new_name, row[existing_column_names.index(old_name)]) for old_name,new_name in column_names}

print(f"Parsed {n_rows} metadata rows (excluding header) from xlsx file")
return (metadata, new_column_names)
return (metadata, [c[1] for c in column_names])


def compare_ids(sequences: Sequences, metadata: Metadata) -> tuple[Sequences, Metadata]:
Expand Down Expand Up @@ -141,9 +153,9 @@ def write_metadata(fname: str|None, metadata: Metadata, header: MetadataHeader)
for _, value in metadata.items():
print("\t".join([value[field] for field in header]), file=fh)

def parse_remap_columns(arg: list[str]) -> dict[str, str]:
def parse_remap_columns(arg: list[str]) -> list[tuple[str, str]]:
try:
return {x[0].lower():x[1].lower() for x in [a.split(':') for a in arg]}
return [(x[0].lower(),x[1].lower()) for x in [a.split(':') for a in arg]]
except:
print("Error while parsing the remap-columns argument. Each entry must be two column names with a ':' between them.")
print("For instance: \"--remap-columns 'collection date:date' 'province:division'\"")
Expand Down

0 comments on commit e2b0886

Please sign in to comment.