From 0c0448b9a3566edbc09641c8d3fda67ac8d8d71b Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Wed, 7 Dec 2022 23:39:13 +0100 Subject: [PATCH] Include year_letter_clade & more in metadata We currently output only composite clade names, e.g. `21L (Omicron)`. Nextclade now also produces atomic clades that are Nextstrain-only or WHO-only, e.g. `21L` and `Omicron`. Nextclade will at some point switch "clade" from `legacy` to `Nextstrain`. This commit insulates ingest-metadata from this upcoming change by using `clade_legacy` in place of `clade` in the column_map as the source of `Nextstrain_clade`. This commit also adds: - `year_letter_clade` sourced from `clade_nextstrain` - `who_variant` sourced from `clade_who` - `clade_display_name`: calculated as `21L (Omicron)`/`20C` --- bin/join-metadata-and-clades | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bin/join-metadata-and-clades b/bin/join-metadata-and-clades index 37344fc7..279fa528 100755 --- a/bin/join-metadata-and-clades +++ b/bin/join-metadata-and-clades @@ -14,8 +14,11 @@ rate_per_day = 0.0007 * 29903 / 365 reference_day = datetime(2020,1,1).toordinal() column_map = { - "clade": "Nextstrain_clade", + "clade_legacy": "Nextstrain_clade", "Nextclade_pango": "Nextclade_pango", + "clade_nextstrain": "year_letter_clade", + "clade_who": "who_variant", + "clade_display_name": "clade_display_name", "immune_escape": "immune_escape", "ace2_binding": "ace2_binding", "totalMissing": "missing_data", @@ -53,9 +56,9 @@ def reorder_columns(result: pd.DataFrame): Moves the new clade column after a specified column """ columns = list(result.columns) - columns.remove(column_map['clade']) + columns.remove(column_map['clade_legacy']) insert_at = columns.index(INSERT_BEFORE_THIS_COLUMN) - columns.insert(insert_at, column_map['clade']) + columns.insert(insert_at, column_map['clade_legacy']) return result[columns] @@ -107,6 +110,11 @@ def main(): # Remove immune_escape and ace2_binding when clade <21L and not 
recombinant clades.loc[clades.Nextstrain_clade < "21L",["immune_escape","ace2_binding"]] = float('nan') + # Calculate `clade_display_name` column, we can make this more sophisticated later + clades["clade_display_name"] = clades.apply( + lambda x: f"{x.year_letter_clade}" + (f" ({x.who_variant})" if x.who_variant.istitle() else ""), + axis=1 + ) clades = clades[list(column_map.values())]