From 1438c0250e3325088a35f274698eb501f561b47a Mon Sep 17 00:00:00 2001 From: Quarto_GHA_Runner Date: Thu, 12 Sep 2024 15:36:55 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- explanations/FAQ.ipynb | 2 +- explanations/Resources.ipynb | 2 +- explanations/glossary.ipynb | 2 +- howto/1_retrieve_profiles.html | 16 +- howto/1_retrieve_profiles.ipynb | 32 +- howto/2_add_metadata.html | 12 +- howto/2_add_metadata.ipynb | 24 +- howto/3_calculate_activity.html | 18 +- howto/3_calculate_activity.ipynb | 494 +++++++++--------- .../figure-html/cell-7-output-1.png | Bin 48591 -> 48550 bytes howto/4_display_perturbation_images.html | 16 +- howto/4_display_perturbation_images.ipynb | 38 +- .../figure-html/cell-7-output-1.png | Bin 750377 -> 1002732 bytes .../figure-html/cell-8-output-1.png | Bin 1027432 -> 1080049 bytes .../figure-html/cell-9-output-1.png | Bin 980879 -> 993809 bytes howto/5_explore_distance_clusters.html | 8 +- howto/5_explore_distance_clusters.ipynb | 18 +- howto/6_query_genes_externally.html | 12 +- howto/6_query_genes_externally.ipynb | 20 +- index.html | 2 +- index.ipynb | 4 +- readme.ipynb | 2 +- sitemap.xml | 22 +- 24 files changed, 373 insertions(+), 373 deletions(-) diff --git a/.nojekyll b/.nojekyll index 199f92a..14bfb61 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -681c0219 \ No newline at end of file +0bb361a4 \ No newline at end of file diff --git a/explanations/FAQ.ipynb b/explanations/FAQ.ipynb index 37952bf..dc8a8b4 100644 --- a/explanations/FAQ.ipynb +++ b/explanations/FAQ.ipynb @@ -152,7 +152,7 @@ " of these replicates’ value was in turn the mean of all the sites\n", " and cells in a given well." 
], - "id": "8d5696cb-d7b6-4d19-9daf-316acf4fb80e" + "id": "76f1d9ce-f912-41b2-bcba-c93d55b1541d" } ], "nbformat": 4, diff --git a/explanations/Resources.ipynb b/explanations/Resources.ipynb index e83aa9a..3c3a285 100644 --- a/explanations/Resources.ipynb +++ b/explanations/Resources.ipynb @@ -28,7 +28,7 @@ " [website](https://www.springscience.com/jump-cp) for data\n", " exploration (account needed)." ], - "id": "929910cb-1283-4ad0-b8b1-01ca932ba8c2" + "id": "9638699e-e48e-4448-a927-06063f27081c" } ], "nbformat": 4, diff --git a/explanations/glossary.ipynb b/explanations/glossary.ipynb index 37d5681..a70005e 100644 --- a/explanations/glossary.ipynb +++ b/explanations/glossary.ipynb @@ -63,7 +63,7 @@ "for compound probes). q-value: Expected False Discovery Rate (FDR): the\n", "proportion of false positives among all positive results." ], - "id": "8a5d959e-1f0a-43a9-ad87-b95ef603e94e" + "id": "c06bf953-fe68-450b-a7a6-c8fd8e4c9a2b" } ], "nbformat": 4, diff --git a/howto/1_retrieve_profiles.html b/howto/1_retrieve_profiles.html index 3c105c0..8a52f55 100644 --- a/howto/1_retrieve_profiles.html +++ b/howto/1_retrieve_profiles.html @@ -260,7 +260,7 @@

Retrieve JUMP profiles

This is a tutorial on how to access profiles from the JUMP Cell Painting datasets. We will use polars to fetch the data frames lazily, with the help of s3fs and pyarrow. We prefer lazy loading because the data can be too big to be handled in memory.

-
+
import polars as pl

The shapes of the available datasets are:

@@ -270,11 +270,11 @@

Retrieve JUMP profiles

  • cpg0016-jump[compound]: Chemical perturbations.
  • Their explicit location is determined by the transformations that produce the datasets. The aws paths of the dataframes are built from a prefix below:

    -
    +
    INDEX_FILE = "https://raw.githubusercontent.com/jump-cellpainting/datasets/50cd2ab93749ccbdb0919d3adf9277c14b6343dd/manifests/profile_index.csv"

    We use a version-controlled csv to release the latest corrected profiles

    -
    +
    profile_index = pl.read_csv(INDEX_FILE)
     profile_index.head()
    @@ -333,7 +333,7 @@

    Retrieve JUMP profiles

    We do not need the ‘etag’ column (used to check file integrity) nor the ‘interpretable’ profiles (i.e., before major modifications)

    -
    +
    selected_profiles = profile_index.filter(
         pl.col("subset").is_in(("crispr", "orf", "compound"))
     ).select(pl.exclude("etag"))
    @@ -344,7 +344,7 @@ 

    Retrieve JUMP profiles

    We will lazy-load the dataframes and print the number of rows and columns

    -
    +
    info = {k: [] for k in ("dataset", "#rows", "#cols", "#Metadata cols", "Size (MB)")}
     for name, path in filepaths.items():
         data = pl.scan_parquet(path)
    @@ -414,7 +414,7 @@ 

    Retrieve JUMP profiles

    Let us now focus on the crispr dataset and use a regex to select the metadata columns. We will then sample rows and display the overview. Note that the collect() method enforces loading some data into memory.

    -
    +
    data = pl.scan_parquet(filepaths["crispr"])
     data.select(pl.col("^Metadata.*$").sample(n=5, seed=1)).collect()
    @@ -480,7 +480,7 @@

    Retrieve JUMP profiles

    The following line excludes the metadata columns:

    -
    +
    data_only = data.select(pl.all().exclude("^Metadata.*$").sample(n=5, seed=1)).collect()
     data_only
    @@ -1043,7 +1043,7 @@

    Retrieve JUMP profiles

    Finally, we can convert this to pandas if we want to perform analyses with that tool. Keep in mind that this loads the entire dataframe into memory.

    -
    +
    data_only.to_pandas()
    diff --git a/howto/1_retrieve_profiles.ipynb b/howto/1_retrieve_profiles.ipynb index 26e7170..a81c1a5 100644 --- a/howto/1_retrieve_profiles.ipynb +++ b/howto/1_retrieve_profiles.ipynb @@ -12,7 +12,7 @@ "and `pyarrow`. We prefer lazy loading because the data can be too big to\n", "be handled in memory." ], - "id": "e97fd426-67e8-4ede-a761-020bf876ed25" + "id": "c207e014-f2db-4136-8569-5fac72a40e20" }, { "cell_type": "code", @@ -24,7 +24,7 @@ "source": [ "import polars as pl" ], - "id": "bda87b61" + "id": "050fdbe0" }, { "cell_type": "markdown", @@ -40,7 +40,7 @@ "produce the datasets. The aws paths of the dataframes are built from a\n", "prefix below:" ], - "id": "12964272-caf6-4d5a-bffd-79e266306ec4" + "id": "456b6441-9b59-4668-a3f6-6ca9fd0638e2" }, { "cell_type": "code", @@ -52,7 +52,7 @@ "source": [ "INDEX_FILE = \"https://raw.githubusercontent.com/jump-cellpainting/datasets/50cd2ab93749ccbdb0919d3adf9277c14b6343dd/manifests/profile_index.csv\"" ], - "id": "40c00f57" + "id": "dff5e9f9" }, { "cell_type": "markdown", @@ -60,7 +60,7 @@ "source": [ "We use a version-controlled csv to release the latest corrected profiles" ], - "id": "eaedc4b3-e2ff-497e-8b06-fa8ccad23f15" + "id": "294aca9f-26b0-4db9-b3ae-4cd328db8c6c" }, { "cell_type": "code", @@ -81,7 +81,7 @@ "profile_index = pl.read_csv(INDEX_FILE)\n", "profile_index.head()" ], - "id": "da1585b1" + "id": "be3d7bc9" }, { "cell_type": "markdown", @@ -90,7 +90,7 @@ "We do not need the ‘etag’ (used to check file integrity) column nor the\n", "‘interpretable’ (i.e., before major modifications)" ], - "id": "fb519c55-606c-44a5-9de3-9646b3904a9b" + "id": "8748e5de-4a85-4aec-a40e-17fe629c4da9" }, { "cell_type": "code", @@ -112,7 +112,7 @@ "filepaths = dict(selected_profiles.iter_rows())\n", "print(filepaths)" ], - "id": "fa128411" + "id": "f0d2b220" }, { "cell_type": "markdown", @@ -121,7 +121,7 @@ "We will lazy-load the dataframes and print the number of rows and\n", "columns" ], - "id": 
"0f9f9d9b-5fad-4c4d-848c-917f996870b2" + "id": "8211888a-f29b-4140-8ada-2eb7fd26b9e2" }, { "cell_type": "code", @@ -153,7 +153,7 @@ "\n", "pl.DataFrame(info)" ], - "id": "be9cab4b" + "id": "110f0995" }, { "cell_type": "markdown", @@ -163,7 +163,7 @@ "metadata columns. We will then sample rows and display the overview.\n", "Note that the collect() method enforces loading some data into memory." ], - "id": "ce4f1a2f-13ee-4154-aca2-d1337ae63131" + "id": "1ba2c4e6-32d7-490c-8ae4-ea2ed71af70a" }, { "cell_type": "code", @@ -184,7 +184,7 @@ "data = pl.scan_parquet(filepaths[\"crispr\"])\n", "data.select(pl.col(\"^Metadata.*$\").sample(n=5, seed=1)).collect()" ], - "id": "c6ee77bb" + "id": "6c420d21" }, { "cell_type": "markdown", @@ -192,7 +192,7 @@ "source": [ "The following line excludes the metadata columns:" ], - "id": "8722a143-fe76-4580-a487-3299ad49840f" + "id": "1be03951-55c7-4e1c-bda9-0be92a0083d4" }, { "cell_type": "code", @@ -213,7 +213,7 @@ "data_only = data.select(pl.all().exclude(\"^Metadata.*$\").sample(n=5, seed=1)).collect()\n", "data_only" ], - "id": "1c4f6b4c" + "id": "7ac64f6f" }, { "cell_type": "markdown", @@ -223,7 +223,7 @@ "with that tool. Keep in mind that this loads the entire dataframe into\n", "memory." ], - "id": "0bdebe1f-9cdd-457a-a46c-f1b07966f120" + "id": "14c7a423-e8c6-4b18-b3f2-ee46f7350fa7" }, { "cell_type": "code", @@ -245,7 +245,7 @@ "source": [ "data_only.to_pandas()" ], - "id": "80e1a977" + "id": "c8a420b3" } ], "nbformat": 4, diff --git a/howto/2_add_metadata.html b/howto/2_add_metadata.html index 7251c28..939a124 100644 --- a/howto/2_add_metadata.html +++ b/howto/2_add_metadata.html @@ -260,12 +260,12 @@

    Incorporate metadata into profiles

    A very common task when processing morphological profiles is knowing which ones are treatments and which ones are controls. Here we will explore how we can use broad-babel to accomplish this task.

    -
    +
    import polars as pl
     from broad_babel.query import get_mapper

    We will be using the CRISPR dataset specified in our index csv.

    -
    +
    INDEX_FILE = "https://raw.githubusercontent.com/jump-cellpainting/datasets/50cd2ab93749ccbdb0919d3adf9277c14b6343dd/manifests/profile_index.csv"
     CRISPR_URL = pl.read_csv(INDEX_FILE).filter(pl.col("subset") == "crispr").item(0, "url")
     profiles = pl.scan_parquet(CRISPR_URL)
    @@ -275,7 +275,7 @@ 

    Incorporate metadata into profiles

    For simplicity the contents of our processed profiles are minimal: “The profile origin” (source, plate and well) and the unique JUMP identifier for that perturbation. We will use broad-babel to further expand on this metadata, but for simplicity’s sake let us sample a subset of the data.

    -
    +
    jcp_ids = (
         profiles.select(pl.col("Metadata_JCP2022")).unique().collect().to_series().sort()
     )
    @@ -298,7 +298,7 @@ 

    Incorporate metadata into profiles

    We will use these JUMP ids to obtain a mapper that indicates the perturbation type (trt, negcon or, rarely, poscon)

    -
    +
    pert_mapper = get_mapper(
         subsample, input_column="JCP2022", output_columns="JCP2022,pert_type"
     )
    @@ -319,7 +319,7 @@ 

    Incorporate metadata into profiles

    A couple of important notes about broad_babel’s get_mapper and other functions: - these must be fed tuples, as these are cached and provide significant speed-ups for repeated calls - ‘get_mapper’ works for datasets of up to a few tens of thousands of samples. If you try to use it to get a mapper for the entirety of the ‘compounds’ dataset it is likely to fail. For these cases we suggest the more general function ‘run_query’. You can read more on this and other use-cases on Babel’s readme.

    We will now repeat the process to get their ‘standard’ name

    -
    +
    name_mapper = get_mapper(
         (*subsample, "JCP2022_800002"),
         input_column="JCP2022",
    @@ -341,7 +341,7 @@ 

    Incorporate metadata into profiles

    To wrap up, we will fetch all the available profiles for these perturbations and use the mappers to add the missing metadata. We also select a few features to showcase how selection can be performed in polars.

    -
    +
    subsample_profiles = profiles.filter(
         pl.col("Metadata_JCP2022").is_in(subsample)
     ).collect()
    diff --git a/howto/2_add_metadata.ipynb b/howto/2_add_metadata.ipynb
    index a182273..9a41af7 100644
    --- a/howto/2_add_metadata.ipynb
    +++ b/howto/2_add_metadata.ipynb
    @@ -10,7 +10,7 @@
             "which ones are treatments and which ones are controls. Here we will\n",
             "explore how we can use broad-babel to accomplish this task."
           ],
    -      "id": "9b0c4aca-4139-4e19-9963-54860e5b2c26"
    +      "id": "b7c23e83-3a1c-4867-a24a-5ea1e6619f45"
         },
         {
           "cell_type": "code",
    @@ -23,7 +23,7 @@
             "import polars as pl\n",
             "from broad_babel.query import get_mapper"
           ],
    -      "id": "3e7e9095"
    +      "id": "50c8f860"
         },
         {
           "cell_type": "markdown",
    @@ -31,7 +31,7 @@
           "source": [
             "We will be using the CRISPR dataset specificed in our index csv."
           ],
    -      "id": "f46085f0-e839-4061-81fc-8d4c59de2596"
    +      "id": "83e02149-8f7c-42be-829a-e1d1dad3fae8"
         },
         {
           "cell_type": "code",
    @@ -54,7 +54,7 @@
             "profiles = pl.scan_parquet(CRISPR_URL)\n",
             "print(profiles.collect_schema().names()[:6])"
           ],
    -      "id": "cb0495b5"
    +      "id": "b8bcb0fb"
         },
         {
           "cell_type": "markdown",
    @@ -65,7 +65,7 @@
             "for that perturbation. We will use broad-babel to further expand on this\n",
             "metadata, but for simplicity’s sake let us sample subset of data."
           ],
    -      "id": "46c6af9b-0680-4ac4-809e-a7d904d742b1"
    +      "id": "e5f8e5e6-8079-436b-a039-a61440ef6896"
         },
         {
           "cell_type": "code",
    @@ -103,7 +103,7 @@
             "subsample = (*subsample, \"JCP2022_800002\")\n",
             "subsample"
           ],
    -      "id": "4359c0f0"
    +      "id": "de261922"
         },
         {
           "cell_type": "markdown",
    @@ -112,7 +112,7 @@
             "We will use these JUMP ids to obtain a mapper that indicates the\n",
             "perturbation type (trt, negcon or, rarely, poscon)"
           ],
    -      "id": "bd58da00-1aa9-4910-973d-cf774b093503"
    +      "id": "c6761bb3-3f38-4ff6-9a02-d60723934736"
         },
         {
           "cell_type": "code",
    @@ -147,7 +147,7 @@
             ")\n",
             "pert_mapper"
           ],
    -      "id": "7c3e462f"
    +      "id": "635c1625"
         },
         {
           "cell_type": "markdown",
    @@ -164,7 +164,7 @@
             "\n",
             "We will now repeat the process to get their ‘standard’ name"
           ],
    -      "id": "59a9634c-eab8-4a6a-8ea4-c274639d9d8a"
    +      "id": "9b30d97e-8c92-49c2-8a55-dfe7654be290"
         },
         {
           "cell_type": "code",
    @@ -201,7 +201,7 @@
             ")\n",
             "name_mapper"
           ],
    -      "id": "eb65c06c"
    +      "id": "3c645adf"
         },
         {
           "cell_type": "markdown",
    @@ -212,7 +212,7 @@
             "select a few features to showcase how how selection can be performed in\n",
             "polars."
           ],
    -      "id": "e2d6af67-0def-4485-9caf-20cd06890c96"
    +      "id": "89e60df7-2aee-4de5-9e15-10f34b936e14"
         },
         {
           "cell_type": "code",
    @@ -243,7 +243,7 @@
             "    pl.col((\"name\", \"pert_type\", \"^Metadata.*$\", \"^X_[0-3]$\"))\n",
             ").sort(by=\"pert_type\")"
           ],
    -      "id": "ae5ef7e7"
    +      "id": "499181c8"
         }
       ],
       "nbformat": 4,
    diff --git a/howto/3_calculate_activity.html b/howto/3_calculate_activity.html
    index c6f8c62..ea97c3b 100644
    --- a/howto/3_calculate_activity.html
    +++ b/howto/3_calculate_activity.html
    @@ -261,7 +261,7 @@ 

    Calculate phenotypic activity

    A common first analysis for morphological datasets is the activity of the cells’ phenotypes. We will use the copairs package, which makes use of mean average precision to obtain a metric of replicability for any set of morphological profiles. In other words, it indicates how similar a given set of compounds are, relative to their negative controls, which are usually cells that have experienced no perturbation.

    -
    +
    import polars as pl
     import polars.selectors as cs
     import seaborn as sns
    @@ -269,13 +269,13 @@ 

    Calculate phenotypic activity

    from copairs.map import average_precision

    We will be using the CRISPR dataset specified in our index csv, but we will select a subset of perturbations and the controls present.

    -
    +
    INDEX_FILE = "https://raw.githubusercontent.com/jump-cellpainting/datasets/50cd2ab93749ccbdb0919d3adf9277c14b6343dd/manifests/profile_index.csv"
     CRISPR_URL = pl.read_csv(INDEX_FILE).filter(pl.col("subset") == "crispr").item(0, "url")
     profiles = pl.scan_parquet(CRISPR_URL)

    Sample perturbations and add known negative control.

    -
    +
    jcp_ids = (
         profiles.select(pl.col("Metadata_JCP2022")).unique().collect().to_series().sort()
     )
    @@ -291,7 +291,7 @@ 

    Calculate phenotypic activity

    perts_controls.head()

    Now we create a mapper to label treatments and controls. See the previous tutorial for details on fetching metadata.

    -
    +
    pert_mapper = get_mapper(
         subsample, input_column="JCP2022", output_columns="JCP2022,pert_type"
     )
    @@ -300,7 +300,7 @@ 

    Calculate phenotypic activity

    )

    Finally we use the parameters from . See the copairs wiki for more details on the parameters that copairs requires.

    -
    +
    pos_sameby = ["Metadata_JCP2022"]  # We want to match perturbations
     pos_diffby = []
     neg_sameby = []
    @@ -326,12 +326,12 @@ 

    Calculate phenotypic activity

    result.head()
    @@ -426,7 +426,7 @@

    Calculate phenotypic activity

    The result of copairs is a dataframe containing, in addition to the original metadata, the average precision with which perturbations were retrieved. Perturbations that look more similar to each other than to the negative controls present in the same plates will have a higher average precision. Perturbations that do not differentiate themselves against negative controls will have values closer to zero.

    To wrap up we pull the standard gene symbol and plot the distribution of average precision.

    -
    +
    name_mapper = get_mapper(
         subsample, input_column="JCP2022", output_columns="JCP2022,standard_key"
     )
    @@ -451,7 +451,7 @@ 

    Calculate phenotypic activity