From 26a511da386ba9fc4274bbbf9780582c06f47206 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 21 Jun 2021 13:01:28 -0400
Subject: [PATCH 1/2] add perturbation count summary

---
 perturbation-count-summary.ipynb              | 229 ++++++++++++++++++
 ..._batch1_metadata_cell_count_summary.tsv.gz |   3 +
 ..._Batch2_metadata_cell_count_summary.tsv.gz |   3 +
 3 files changed, 235 insertions(+)
 create mode 100644 perturbation-count-summary.ipynb
 create mode 100644 profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
 create mode 100644 profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz

diff --git a/perturbation-count-summary.ipynb b/perturbation-count-summary.ipynb
new file mode 100644
index 00000000..77cfeb87
--- /dev/null
+++ b/perturbation-count-summary.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "posted-ottawa",
+   "metadata": {},
+   "source": [
+    "## Count LINCS Cell Painting data\n",
+    "\n",
+    "This notebook provides counting statistics for the two batches of LINCS Cell Painting data in this repository.\n",
+    "\n",
+    "* Count profiles\n",
+    "* Count perturbation treatments\n",
+    "* Count single cells (with an output cell count summary file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "civic-festival",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "offensive-german",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n",
+    "\n",
+    "consensus_dir = pathlib.Path(\"consensus\")\n",
+    "batch_effect_dir = pathlib.Path(\"spherized_profiles\")\n",
+    "cell_count_dir = pathlib.Path(\"profiles/cell_count\")\n",
+    "platemap_dir = pathlib.Path(\"metadata/platemaps\")\n",
+    "\n",
+    "consensus_suffix = \"_consensus_modz_feature_select_dmso.csv.gz\"\n",
+    "batch_effect_suffix = \"_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ba958c5",
+   "metadata": {},
+   "source": [
+    "### Profile and perturbation count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b905b954",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n",
+      "There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n",
+      "\n",
+      "There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n",
+      "There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n",
+      "There are 349 unique compounds assayed in 2017_12_05_Batch2\n",
+      "There are 3 unique time points assayed in 2017_12_05_Batch2\n",
+      "There are 6 unique doses assayed in 2017_12_05_Batch2\n",
+      "There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n",
+      "There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "for batch in batches:\n",
+    "    consensus_file = pathlib.Path(consensus_dir, batch, f\"{batch}{consensus_suffix}\")\n",
+    "    batch_effect_file = pathlib.Path(batch_effect_dir, \"profiles\", f\"{batch}{batch_effect_suffix}\")\n",
+    "    \n",
+    "    profiles_df = pd.read_csv(batch_effect_file, low_memory=False)\n",
+    "    print(f\"There are a total of {profiles_df.shape[0]:,} image-based profiles assayed in {batch}\")\n",
+    "\n",
+    "    consensus_df = pd.read_csv(consensus_file)\n",
+    "    print(f\"There are a total of {consensus_df.shape[0]:,} image-based consensus profiles assayed in {batch}\")\n",
+    "    \n",
+    "    num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n",
+    "    print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n",
+    "    \n",
+    "    time_points = len(consensus_df.Metadata_time_point.unique())\n",
+    "    print(f\"There are {time_points} unique time points assayed in {batch}\")\n",
+    "    \n",
+    "    doses = len(consensus_df.Metadata_dose_recode.unique())\n",
+    "    print(f\"There are {doses} unique doses assayed in {batch}\")\n",
+    "    \n",
+    "    cell_lines = consensus_df.Metadata_cell_id.unique()\n",
+    "    print(f\"There are {len(cell_lines)} unique cell lines assayed in {batch} ({cell_lines})\")\n",
+    "    \n",
+    "    unique_perturbations = (\n",
+    "        consensus_df\n",
+    "        .groupby(\n",
+    "            [\"Metadata_cell_id\", \"Metadata_broad_sample\", \"Metadata_time_point\", \"Metadata_dose_recode\"]\n",
+    "        )[\"Metadata_pert_well\"]\n",
+    "        .count()\n",
+    "        .reset_index()\n",
+    "        .rename(\n",
+    "            columns={\"Metadata_pert_well\": \"num_replicates\"}\n",
+    "        )\n",
+    "    )\n",
+    "    print(f\"There is a total of {len(unique_perturbations):,} unique perturbations assayed in {batch}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30171986",
+   "metadata": {},
+   "source": [
+    "### Cell Count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "verbal-vaccine",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count cells\n",
+    "all_cell_count_df = {x: [] for x in batches}\n",
+    "for batch in batches:\n",
+    "    # Load barcode platemap\n",
+    "    barcode_platemap_file = platemap_dir / batch / \"barcode_platemap.csv\"\n",
+    "    barcode_platemap_df = pd.read_csv(barcode_platemap_file)\n",
+    "\n",
+    "    # Setup other path variables; sort plates (iterdir order is arbitrary) so the combined row order is reproducible\n",
+    "    batch_count_dir = pathlib.Path(cell_count_dir, batch)\n",
+    "    plate_dirs = sorted(x for x in batch_count_dir.iterdir() if \".DS_Store\" not in x.name)\n",
+    "    \n",
+    "    for plate_dir in plate_dirs:\n",
+    "        plate_name = plate_dir.name\n",
+    "        platemap_name = barcode_platemap_df.query(\"Assay_Plate_Barcode == @plate_name\").Plate_Map_Name.values[0]\n",
+    "        platemap_file = platemap_dir / batch / \"platemap\" / f\"{platemap_name}.txt\"\n",
+    "        platemap_df = pd.read_csv(platemap_file, sep=\"\\t\")\n",
+    "\n",
+    "        cell_count_file = plate_dir / f\"{plate_name}_cell_count.csv\"\n",
+    "        cell_count_df = (\n",
+    "            pd.read_csv(cell_count_file)\n",
+    "            .assign(batch=batch)\n",
+    "            .rename({\n",
+    "                \"Image_Metadata_Well\": \"Metadata_Well\",\n",
+    "                \"Image_Metadata_Plate\": \"Metadata_Plate\"\n",
+    "            }, axis=\"columns\")\n",
+    "        )\n",
+    "        \n",
+    "        cell_count_df = (\n",
+    "            cell_count_df\n",
+    "            .merge(platemap_df, left_on=\"Metadata_Well\", right_on=\"well_position\")\n",
+    "        )\n",
+    "        \n",
+    "        all_cell_count_df[batch].append(cell_count_df)\n",
+    "    \n",
+    "    # Combine batch-specific cell count summary\n",
+    "    all_cell_count_df[batch] = pd.concat(all_cell_count_df[batch]).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "complex-polls",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "In batch 2016_04_01_a549_48hr_batch1, we profiled 110,012,425 cells\n",
+      "In batch 2017_12_05_Batch2, we profiled 49,705,063 cells\n",
+      "\n",
+      "We profiled a total of 159,717,488 cells in the LINCS Cell Painting dataset\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Output metadata summary files\n",
+    "total_cells = 0\n",
+    "for batch in batches:\n",
+    "    batch_metadata_df = all_cell_count_df[batch]\n",
+    "    batch_cell_count = batch_metadata_df.cell_count.sum()\n",
+    "    \n",
+    "    print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n",
+    "    \n",
+    "    output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n",
+    "    batch_metadata_df.to_csv(output_cell_count_summary_file, index=False)\n",
+    "    \n",
+    "    total_cells += batch_cell_count\n",
+    "    \n",
+    "print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
new file mode 100644
index 00000000..b65e0154
--- /dev/null
+++ b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6929a0c4f09ee3b97f4b794f5ab338dc01208bb6f0dfd8347220107181a8a748
+size 1058707
diff --git a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
new file mode 100644
index 00000000..ffea6b7c
--- /dev/null
+++ b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c15b968c5bc457484dc131fba6fc410e374ccbbc4c4624160df951e1045e5d32
+size 950856

From 2de5827e700fcbfa06e861fae4863fb532ba0b19 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 21 Jun 2021 13:08:26 -0400
Subject: [PATCH 2/2] output time and make compression mtime=0

---
 perturbation-count-summary.ipynb                   | 14 ++++++++------
 ..._48hr_batch1_metadata_cell_count_summary.tsv.gz |  2 +-
 ...12_05_Batch2_metadata_cell_count_summary.tsv.gz |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/perturbation-count-summary.ipynb b/perturbation-count-summary.ipynb
index 77cfeb87..ce32067b 100644
--- a/perturbation-count-summary.ipynb
+++ b/perturbation-count-summary.ipynb
@@ -64,7 +64,7 @@
       "There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n",
       "There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n",
       "There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n",
-      "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1\n",
+      "There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1 (['48H'])\n",
       "There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n",
       "There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n",
       "There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n",
@@ -72,7 +72,7 @@
       "There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n",
       "There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n",
       "There are 349 unique compounds assayed in 2017_12_05_Batch2\n",
-      "There are 3 unique time points assayed in 2017_12_05_Batch2\n",
+      "There are 3 unique time points assayed in 2017_12_05_Batch2 (['24H' '48H' '6H'])\n",
       "There are 6 unique doses assayed in 2017_12_05_Batch2\n",
       "There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n",
       "There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n",
@@ -94,8 +94,8 @@
     "    num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n",
     "    print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n",
     "    \n",
-    "    time_points = len(consensus_df.Metadata_time_point.unique())\n",
-    "    print(f\"There are {time_points} unique time points assayed in {batch}\")\n",
+    "    time_points = consensus_df.Metadata_time_point.unique()\n",
+    "    print(f\"There are {len(time_points)} unique time points assayed in {batch} ({time_points})\")\n",
     "    \n",
     "    doses = len(consensus_df.Metadata_dose_recode.unique())\n",
     "    print(f\"There are {doses} unique doses assayed in {batch}\")\n",
@@ -197,8 +197,10 @@
     "    print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n",
     "    \n",
     "    output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n",
-    "    batch_metadata_df.to_csv(output_cell_count_summary_file, index=False)\n",
-    "    \n",
+    "    batch_metadata_df.to_csv(  # tab-delimited so the contents match the .tsv.gz extension\n",
+    "        output_cell_count_summary_file, sep=\"\\t\", index=False, compression={\"method\": \"gzip\", \"mtime\": 0}\n",
+    "    )\n",
+    "        \n",
     "    total_cells += batch_cell_count\n",
     "    \n",
     "print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")"
diff --git a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
index b65e0154..58cc6383 100644
--- a/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
+++ b/profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6929a0c4f09ee3b97f4b794f5ab338dc01208bb6f0dfd8347220107181a8a748
+oid sha256:28a429d4bf68c014278886cb66f838338a60532334c214f1a8649956afc35ad2
 size 1058707
diff --git a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
index ffea6b7c..157068fc 100644
--- a/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
+++ b/profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c15b968c5bc457484dc131fba6fc410e374ccbbc4c4624160df951e1045e5d32
+oid sha256:6aaceed38cad4f050b64324859c087e31dae13babd02b5a53942a0a454ec1e4f
 size 950856