From 26a511da386ba9fc4274bbbf9780582c06f47206 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 21 Jun 2021 13:01:28 -0400 Subject: [PATCH 1/2] add perturbation count summary --- perturbation-count-summary.ipynb | 229 ++++++++++++++++++ ..._batch1_metadata_cell_count_summary.tsv.gz | 3 + ..._Batch2_metadata_cell_count_summary.tsv.gz | 3 + 3 files changed, 235 insertions(+) create mode 100644 perturbation-count-summary.ipynb create mode 100644 profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz create mode 100644 profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz diff --git a/perturbation-count-summary.ipynb b/perturbation-count-summary.ipynb new file mode 100644 index 00000000..77cfeb87 --- /dev/null +++ b/perturbation-count-summary.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "posted-ottawa", + "metadata": {}, + "source": [ + "## Count LINCS Cell Painting data\n", + "\n", + "This notebook provides counting statistics for two batches of LINCS Cell Painting set in this repository.\n", + "\n", + "* Count profiles\n", + "* Count perturbation treatments\n", + "* Count single cells (with an output cell count summary file)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "civic-festival", + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "offensive-german", + "metadata": {}, + "outputs": [], + "source": [ + "batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n", + "\n", + "consensus_dir = pathlib.Path(\"consensus\")\n", + "batch_effect_dir = pathlib.Path(\"spherized_profiles\")\n", + "cell_count_dir = pathlib.Path(\"profiles/cell_count\")\n", + "platemap_dir = pathlib.Path(\"metadata/platemaps\")\n", + "\n", + "consensus_suffix = \"_consensus_modz_feature_select_dmso.csv.gz\"\n", + "batch_effect_suffix = \"_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz\"" + ] + }, + { + "cell_type": "markdown", + "id": "4ba958c5", + "metadata": {}, + "source": [ + "## Profile and perturbation count" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b905b954", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n", + "There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n", + "There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n", + "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1\n", + "There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n", + "There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n", + "There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n", + "\n", + "There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n", + "There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n", + "There are 349 unique compounds assayed in 2017_12_05_Batch2\n", + "There are 3 unique time points assayed in 2017_12_05_Batch2\n", + "There are 6 unique doses assayed in 2017_12_05_Batch2\n", + "There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n", + "There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n", + "\n" + ] + } + ], + "source": [ + "for batch in batches:\n", + " consensus_file = pathlib.Path(consensus_dir, batch, f\"{batch}{consensus_suffix}\")\n", + " batch_effect_file = pathlib.Path(batch_effect_dir, \"profiles\", f\"{batch}{batch_effect_suffix}\")\n", + " \n", + " profiles_df = pd.read_csv(batch_effect_file, low_memory=False)\n", + " print(f\"There are a total of {profiles_df.shape[0]:,} image-based profiles assayed in {batch}\")\n", + "\n", + " consensus_df = pd.read_csv(consensus_file)\n", + " print(f\"There are a total of {consensus_df.shape[0]:,} image-based consensus profiles assayed in {batch}\")\n", + " \n", + " num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n", + " print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n", + " \n", + " time_points = len(consensus_df.Metadata_time_point.unique())\n", + " print(f\"There are {time_points} unique time points assayed in {batch}\")\n", + " \n", + " doses = len(consensus_df.Metadata_dose_recode.unique())\n", + " print(f\"There are {doses} unique doses assayed in {batch}\")\n", + " \n", + " cell_lines = consensus_df.Metadata_cell_id.unique()\n", + " print(f\"There are {len(cell_lines)} unique cell lines assayed in {batch} ({cell_lines})\")\n", + " \n", + " unique_perturbations = (\n", + " consensus_df\n", + " .groupby(\n", + " [\"Metadata_cell_id\", \"Metadata_broad_sample\", \"Metadata_time_point\", \"Metadata_dose_recode\"]\n", + " )[\"Metadata_pert_well\"]\n", + " .count()\n", + " .reset_index()\n", + " .rename(\n", + " columns={\"Metadata_pert_well\": \"num_replicates\"}\n", + " )\n", + " )\n", + " print(f\"There is a total of {len(unique_perturbations):,} unique perturbations assayed in {batch}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "30171986", + "metadata": {}, + "source": [ + "### Cell Count" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "verbal-vaccine", + "metadata": {}, + "outputs": [], + "source": [ + "# Count cells\n", + "all_cell_count_df = {x: [] for x in batches}\n", + "for batch in batches:\n", + " # Load barcode platemap\n", + " barcode_platemap_file = platemap_dir / batch / \"barcode_platemap.csv\"\n", + " barcode_platemap_df = pd.read_csv(barcode_platemap_file)\n", + "\n", + " # Setup other path variables\n", + " batch_count_dir = pathlib.Path(cell_count_dir, batch)\n", + " plate_dirs = [x for x in batch_count_dir.iterdir() if \".DS_Store\" not in x.name]\n", + " \n", + " for plate_dir in plate_dirs:\n", + " plate_name = plate_dir.name\n", + " platemap_name = barcode_platemap_df.query(\"Assay_Plate_Barcode == @plate_name\").Plate_Map_Name.values[0]\n", + " platemap_file = platemap_dir / batch / \"platemap\" / f\"{platemap_name}.txt\"\n", + " platemap_df = pd.read_csv(platemap_file, sep=\"\\t\")\n", + "\n", + " cell_count_file = plate_dir / f\"{plate_name}_cell_count.csv\"\n", + " cell_count_df = (\n", + " pd.read_csv(cell_count_file)\n", + " .assign(batch=batch)\n", + " .rename({\n", + " \"Image_Metadata_Well\": \"Metadata_Well\",\n", + " \"Image_Metadata_Plate\": \"Metadata_Plate\"\n", + " }, axis=\"columns\")\n", + " )\n", + " \n", + " cell_count_df = (\n", + " cell_count_df\n", + " .merge(platemap_df, left_on=\"Metadata_Well\", right_on=\"well_position\")\n", + " )\n", + " \n", + " all_cell_count_df[batch].append(cell_count_df)\n", + " \n", + " # Combine batch-specific cell count summary\n", + " all_cell_count_df[batch] = pd.concat(all_cell_count_df[batch]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "complex-polls", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In batch 2016_04_01_a549_48hr_batch1, we profiled 110,012,425 cells\n", + "In batch 2017_12_05_Batch2, we profiled 49,705,063 cells\n", + "\n", + "We profiled a total of 159,717,488 cells in the LINCS Cell Painting dataset\n" + ] + } + ], + "source": [ + "# Output metadata summary files\n", + "total_cells = 0\n", + "for batch in batches:\n", + " batch_metadata_df = all_cell_count_df[batch]\n", + " batch_cell_count = batch_metadata_df.cell_count.sum()\n", + " \n", + " print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n", + " \n", + " output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n", + " batch_metadata_df.to_csv(output_cell_count_summary_file, index=False)\n", + " \n", + " total_cells += batch_cell_count\n", + " \n", + "print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz new file mode 100644 index 00000000..b65e0154 --- /dev/null +++ b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6929a0c4f09ee3b97f4b794f5ab338dc01208bb6f0dfd8347220107181a8a748 +size 1058707 diff --git a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz new file mode 100644 index 00000000..ffea6b7c --- /dev/null +++ b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15b968c5bc457484dc131fba6fc410e374ccbbc4c4624160df951e1045e5d32 +size 950856 From 2de5827e700fcbfa06e861fae4863fb532ba0b19 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 21 Jun 2021 13:08:26 -0400 Subject: [PATCH 2/2] output time and make compression mtime=0 --- perturbation-count-summary.ipynb | 14 ++++++++------ ..._48hr_batch1_metadata_cell_count_summary.tsv.gz | 2 +- ...12_05_Batch2_metadata_cell_count_summary.tsv.gz | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/perturbation-count-summary.ipynb b/perturbation-count-summary.ipynb index 77cfeb87..ce32067b 100644 --- a/perturbation-count-summary.ipynb +++ b/perturbation-count-summary.ipynb @@ -64,7 +64,7 @@ "There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n", "There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n", "There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n", - "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1\n", + "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1 (['48H'])\n", "There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n", "There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n", "There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n", @@ -72,7 +72,7 @@ "There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n", "There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n", "There are 349 unique compounds assayed in 2017_12_05_Batch2\n", - "There are 3 unique time points assayed in 2017_12_05_Batch2\n", + "There are 3 unique time points assayed in 2017_12_05_Batch2 (['24H' '48H' '6H'])\n", "There are 6 unique doses assayed in 2017_12_05_Batch2\n", "There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n", "There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n", @@ -94,8 +94,8 @@ " num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n", " print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n", " \n", - " time_points = len(consensus_df.Metadata_time_point.unique())\n", - " print(f\"There are {time_points} unique time points assayed in {batch}\")\n", + " time_points = consensus_df.Metadata_time_point.unique()\n", + " print(f\"There are {len(time_points)} unique time points assayed in {batch} ({time_points})\")\n", " \n", " doses = len(consensus_df.Metadata_dose_recode.unique())\n", " print(f\"There are {doses} unique doses assayed in {batch}\")\n", @@ -197,8 +197,10 @@ " print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n", " \n", " output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n", - " batch_metadata_df.to_csv(output_cell_count_summary_file, index=False)\n", - " \n", + " batch_metadata_df.to_csv(\n", + " output_cell_count_summary_file, index=False, compression={\"method\": \"gzip\", \"mtime\": 0}\n", + " )\n", + " \n", " total_cells += batch_cell_count\n", " \n", "print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")" diff --git a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz index b65e0154..58cc6383 100644 --- a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz +++ b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6929a0c4f09ee3b97f4b794f5ab338dc01208bb6f0dfd8347220107181a8a748 +oid sha256:28a429d4bf68c014278886cb66f838338a60532334c214f1a8649956afc35ad2 size 1058707 diff --git a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz index ffea6b7c..157068fc 100644 --- a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz +++ b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c15b968c5bc457484dc131fba6fc410e374ccbbc4c4624160df951e1045e5d32 +oid sha256:6aaceed38cad4f050b64324859c087e31dae13babd02b5a53942a0a454ec1e4f size 950856