Merge pull request #25 from lincc-frameworks/lsdb-docs
Update LSDB notebook with science!
hombit authored May 31, 2024
2 parents 6af844c + 408e861 commit 3500be8
Showing 2 changed files with 249 additions and 37 deletions.
1 change: 1 addition & 0 deletions docs/tutorials.rst
@@ -4,4 +4,5 @@ Tutorials
.. toctree::

Loading Data into Nested-Dask <tutorials/loading_data>
Work with HiPSCat catalogs via LSDB <tutorials/work_with_lsdb>
Using the `nest` Accessor <tutorials/nest_accessor>
285 changes: 248 additions & 37 deletions docs/tutorials/work_with_lsdb.ipynb
Expand Up @@ -5,19 +5,25 @@
"id": "6171e5bbd47ce869",
"metadata": {},
"source": [
"# Load large catalog data from the LSDB\n",
"# Load large catalog data with LSDB\n",
"\n",
"Here we load a small part of ZTF DR14 stored as HiPSCat catalog using the [LSDB](https://lsdb.readthedocs.io/)."
"Here we load a small part of ZTF DR14 stored as HiPSCat catalog using [LSDB](https://lsdb.readthedocs.io/).\n",
"\n",
"The notebook is an adaptation of [the tutorial](https://github.com/lincc-frameworks/Rare_Gems_Demo/blob/main/Notebook_2_Basic_Time_Domain.ipynb) presented by Neven Caplar at the Rare Gems in Big Data conference, May 2024. "
]
},
{
"cell_type": "markdown",
"id": "c055a44b8ce3b34",
"metadata": {},
"source": [
"## Install LSDB and its dependencies and import the necessary modules\n",
"## Install dependencies for the notebook\n",
"\n",
"We also need `aiohttp`, which is an optional LSDB's dependency, needed to access the catalog data from the web."
"The notebook requires `nested-dask` and few other packages to be installed.\n",
"- `lsdb` to load and join \"object\" (pointing) and \"source\" (detection) ZTF catalogs\n",
"- `aiohttp` is `lsdb`'s optional dependency to download the data via web\n",
"- `light-curve` to extract features from light curves\n",
"- `matplotlib` to plot the results"
]
},
{
@@ -26,16 +32,17 @@
"id": "af1710055600582",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:00.759441Z",
"start_time": "2024-05-24T12:53:58.854875Z"
"end_time": "2024-05-30T21:15:57.356540Z",
"start_time": "2024-05-30T21:15:55.782198Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"# Uncomment the following line to install nested-dask\n",
"# %pip install nested-dask\n",
"\n",
"# Comment the following line to skip LSDB installation\n",
"%pip install aiohttp lsdb"
"# Comment the following line to skip dependencies installation\n",
"%pip install --quiet lsdb aiohttp light-curve matplotlib"
]
},
{
@@ -44,16 +51,23 @@
"id": "e4d03aa76aeeb1c0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:03.834087Z",
"start_time": "2024-05-24T12:54:00.761330Z"
"end_time": "2024-05-30T21:15:59.778253Z",
"start_time": "2024-05-30T21:15:57.358002Z"
}
},
"outputs": [],
"source": [
"import dask.array\n",
"import dask.distributed\n",
"import light_curve as licu\n",
"import matplotlib.pyplot as plt\n",
"import nested_pandas as npd\n",
"import numpy as np\n",
"import pandas as pd\n",
"from dask_expr import from_legacy_dataframe\n",
"from lsdb import read_hipscat\n",
"from nested_dask import NestedFrame\n",
"from nested_pandas.series.packer import pack"
"from matplotlib.colors import LogNorm\n",
"from nested_dask import NestedFrame"
]
},
{
@@ -72,8 +86,8 @@
"id": "a403e00e2fd8d081",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:06.745405Z",
"start_time": "2024-05-24T12:54:03.834904Z"
"end_time": "2024-05-30T21:16:02.906421Z",
"start_time": "2024-05-30T21:15:59.779007Z"
}
},
"outputs": [],
@@ -87,7 +101,8 @@
"lsdb_source = read_hipscat(\n",
" f\"{catalogs_dir}/ztf_source\",\n",
" columns=[\"mjd\", \"mag\", \"magerr\", \"band\", \"ps1_objid\", \"catflags\"],\n",
")"
")\n",
"lc_columns = [\"mjd\", \"mag\", \"magerr\", \"band\", \"catflags\"]"
]
},
{
Expand All @@ -105,8 +120,8 @@
"id": "b9b57bced7f810c0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:06.770931Z",
"start_time": "2024-05-24T12:54:06.746786Z"
"end_time": "2024-05-30T21:16:02.931031Z",
"start_time": "2024-05-30T21:16:02.907834Z"
}
},
"outputs": [],
@@ -144,8 +159,8 @@
"id": "9522ce0977ff9fdf",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:06.789983Z",
"start_time": "2024-05-24T12:54:06.772721Z"
"end_time": "2024-05-30T21:16:02.951964Z",
"start_time": "2024-05-30T21:16:02.931819Z"
}
},
"outputs": [],
@@ -159,18 +174,14 @@
"\n",
" source_df = df[nested_columns]\n",
" # lc is for light curve\n",
" # https://github.com/lincc-frameworks/nested-pandas/issues/88\n",
" # nested_frame.add_nested(source_df, 'lc')\n",
" nested_frame[\"lc\"] = pack(source_df, name=\"lc\")\n",
"\n",
" return nested_frame\n",
" return nested_frame.add_nested(source_df, \"lc\")\n",
"\n",
"\n",
"ddf = joined_ddf.map_partitions(\n",
" lambda df: convert_to_nested_frame(df, nested_columns=lsdb_source.columns),\n",
" meta=convert_to_nested_frame(joined_ddf._meta, nested_columns=lsdb_source.columns),\n",
" lambda df: convert_to_nested_frame(df, nested_columns=lc_columns),\n",
" meta=convert_to_nested_frame(joined_ddf._meta, nested_columns=lc_columns),\n",
")\n",
"nested_ddf = NestedFrame.from_dask_dataframe(ddf)\n",
"nested_ddf = NestedFrame.from_dask_dataframe(from_legacy_dataframe(ddf))\n",
"nested_ddf"
]
},
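A minimal sketch of the `map_partitions` + `meta` pattern used in the cell above, on plain pandas/Dask data rather than the notebook's catalogs (all names and values here are made up for illustration):

import dask.dataframe as dd
import pandas as pd

# A toy frame standing in for joined_ddf.
pdf = pd.DataFrame({"a": [1, 2, 3, 4]})
toy_ddf = dd.from_pandas(pdf, npartitions=2)

# `meta` declares the output schema, so Dask can build the task graph
# without calling the function on real data first.
out = toy_ddf.map_partitions(
    lambda df: df.assign(b=df["a"] * 2),
    meta={"a": int, "b": int},
)
out.compute()  # only now is the lambda applied to each partition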
@@ -179,7 +190,8 @@
"id": "de6820724bcd4781",
"metadata": {},
"source": [
"Second, we compute the NestedFrame."
"Now we filter our dataframe by the `catflags` column (0 flags correspond to the perfect observational conditions) and the `band` column to be equal to `r`.\n",
"After filtering the detections, we are going to count the number of detections per object and keep only those objects with more than 10 detections."
]
},
{
Expand All @@ -188,28 +200,227 @@
"id": "a82bd831fc1f6f92",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:19.282406Z",
"start_time": "2024-05-24T12:54:06.790699Z"
"end_time": "2024-05-30T21:16:02.963360Z",
"start_time": "2024-05-30T21:16:02.952708Z"
}
},
"outputs": [],
"source": [
"ndf = nested_ddf.compute()\n",
"ndf"
"r_band = nested_ddf.query(\"lc.catflags == 0 and lc.band == 'r'\")\n",
"nobs = r_band.reduce(np.size, \"lc.mjd\", meta={0: int}).rename(columns={0: \"nobs\"})\n",
"r_band = r_band[nobs[\"nobs\"] > 10]\n",
"r_band"
]
},
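For intuition, `reduce` applies the given function to the flat arrays of each object's nested values; a rough per-object equivalent of the `np.size` call above, with made-up numbers:

import numpy as np

# The nested "lc.mjd" values of a single object, as a flat array (made-up data).
mjd_of_one_object = np.array([60001.1, 60002.2, 60003.3])
np.size(mjd_of_one_object)  # -> 3, the number of detections for this object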
{
"cell_type": "markdown",
"id": "f99f2d9052f5843f",
"metadata": {},
"source": [
"Later we are going to extract features, so we need to prepare light-curve data to be in the same float format."
]
},
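As the comments in the feature-extraction cell below note, `light-curve` requires all input arrays to share one dtype; a sketch of the kind of cast involved (made-up values; in this notebook the cast happens inside `extract_features`):

import numpy as np

mjd = np.array([60001.1, 60002.2, 60003.3])  # float64 by default
mag = np.array([15.1, 15.3, 15.2], dtype=np.float32)
magerr = np.array([0.01, 0.02, 0.01], dtype=np.float32)

t = np.asarray(mjd, dtype=np.float32)  # now all three arrays are float32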
{
"cell_type": "markdown",
"id": "92d6ec9f5b0889e1",
"metadata": {},
"source": [
"### Extract features from ZTF light curves\n",
"\n",
"Now we are going to extract some features:\n",
"- Top periodogram peak\n",
"- Mean magnitude\n",
"- Von Neumann's eta statistics\n",
"- Excess variance statistics\n",
"- Number of observations\n",
"\n",
"We are going to use [`light-curve`](https://github.com/light-curve/light-curve-python) package for this purposes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c82f593efca9d30",
"id": "672b9dbe67192b2d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-24T12:54:19.284710Z",
"start_time": "2024-05-24T12:54:19.283179Z"
"end_time": "2024-05-30T21:16:02.978308Z",
"start_time": "2024-05-30T21:16:02.964225Z"
}
},
"outputs": [],
"source": []
"source": [
"extractor = licu.Extractor(\n",
" licu.Periodogram(\n",
" peaks=1, max_freq_factor=50.0, fast=True\n",
" ), # Would give two features: peak period and signa-to-noise ratio of the peak\n",
" licu.WeightedMean(), # Mean magnitude\n",
" licu.Eta(), # Von Neumann's eta statistics\n",
" licu.ExcessVariance(), # Excess variance statistics\n",
" licu.ObservationCount(), # Number of observations\n",
")\n",
"\n",
"\n",
"# light-curve requires all arrays to be the same dtype.\n",
"# It also requires the time array to be ordered and to have no duplicates.\n",
"def extract_features(mjd, mag, magerr, **kwargs):\n",
" # We offset date, so we still would have <1 second precision\n",
" t = np.asarray(mjd - 60000, dtype=np.float32)\n",
" _, sort_index = np.unique(t, return_index=True)\n",
" features = extractor(\n",
" t[sort_index],\n",
" mag[sort_index],\n",
" magerr[sort_index],\n",
" **kwargs,\n",
" )\n",
" # Return the features as a dictionary\n",
" return dict(zip(extractor.names, features))\n",
"\n",
"\n",
"features = r_band.reduce(\n",
" extract_features,\n",
" \"lc.mjd\",\n",
" \"lc.mag\",\n",
" \"lc.magerr\",\n",
" meta={name: np.float32 for name in extractor.names},\n",
")\n",
"\n",
"df_w_features = r_band.join(features)\n",
"df_w_features"
]
},
{
"cell_type": "markdown",
"id": "6f2db5bc7d5f23c",
"metadata": {},
"source": [
"Before we are going next and actually run the computation, let's create a Dask client which would allow us to run the computation in parallel."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8f0e5c9816cd9f9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-30T21:16:03.687894Z",
"start_time": "2024-05-30T21:16:02.979043Z"
}
},
"outputs": [],
"source": [
"client = dask.distributed.Client()"
]
},
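With no arguments, `dask.distributed.Client()` starts a local cluster sized from the machine's resources; they can also be set explicitly, for example (arbitrary values, adjust for your machine):

# client = dask.distributed.Client(
#     n_workers=4,           # number of worker processes
#     threads_per_worker=2,  # threads within each worker
#     memory_limit="4GB",    # memory budget per worker
# )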
{
"cell_type": "markdown",
"id": "fb55750bd55f324f",
"metadata": {},
"source": [
"Now we can collect some statistics and plot it. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb6fb4857b9dc97d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-30T21:16:30.171468Z",
"start_time": "2024-05-30T21:16:03.688907Z"
}
},
"outputs": [],
"source": [
"lg_period_bins = np.linspace(-1, 2, 101)\n",
"lg_snr_bins = np.linspace(0, 2, 101)\n",
"\n",
"lg_period = dask.array.log10(df_w_features[\"period_0\"].to_dask_array())\n",
"lg_snr = dask.array.log10(df_w_features[\"period_s_to_n_0\"].to_dask_array())\n",
"\n",
"hist2d = dask.array.histogram2d(\n",
" lg_period,\n",
" lg_snr,\n",
" bins=[lg_period_bins, lg_snr_bins],\n",
")\n",
"# Run the computation\n",
"hist2d = hist2d[0].compute()\n",
"\n",
"# Plot the 2D histogram\n",
"plt.imshow(\n",
" hist2d.T,\n",
" extent=(lg_period_bins[0], lg_period_bins[-1], lg_snr_bins[0], lg_snr_bins[-1]),\n",
" origin=\"lower\",\n",
" norm=LogNorm(vmin=1, vmax=hist2d.max()),\n",
")\n",
"plt.colorbar(label=\"Number of stars\")\n",
"plt.xlabel(\"lg Period/day\")\n",
"plt.ylabel(\"lg S/N\")"
]
},
{
"cell_type": "markdown",
"id": "88dfbcbd324f9ac2",
"metadata": {},
"source": [
"Let's select a bright star with a high signal-to-noise ratio and plot its phase light curve. We also filter periods ~1 day, because they are very likely to be bogus."
]
},
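The folding in the next cell maps each observation time onto [0, 1) in units of the period, so `mjd % period / period` is the phase; a tiny worked example with made-up numbers:

mjd, period = 60010.0, 0.55    # made-up time and period
phase = mjd % period / period  # (60010.0 % 0.55) / 0.55 ≈ 0.09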
{
"cell_type": "code",
"execution_count": null,
"id": "8f8adf145ef471fb",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-30T21:16:47.074714Z",
"start_time": "2024-05-30T21:16:30.174392Z"
}
},
"outputs": [],
"source": [
"bright_periodic_stars = df_w_features.query(\n",
" \"period_s_to_n_0 > 20 and weighted_mean < 15 and (period_0 < 0.9 or period_0 > 1.1)\"\n",
")\n",
"df = bright_periodic_stars.compute()\n",
"\n",
"obj = df.iloc[0]\n",
"lc = obj[\"lc\"]\n",
"period = obj[\"period_0\"]\n",
"ra = obj[\"ra\"]\n",
"dec = obj[\"dec\"]\n",
"\n",
"plt.errorbar(lc[\"mjd\"] % period / period, lc[\"mag\"], lc[\"magerr\"], fmt=\"o\")\n",
"plt.gca().invert_yaxis()\n",
"plt.xlabel(\"Phase\")\n",
"plt.ylabel(\"Magnitude\")\n",
"plt.xlim([0, 1])\n",
"plt.title(f\"RA={ra:.4f}, Dec={dec:.4f} period={period:.4f} days\")\n",
"\n",
"print(\"Search this object for on the SNAD ZTF Viewer:\")\n",
"print(f\"https://ztf.snad.space/search/{ra}%20{dec}/1\")"
]
},
{
"cell_type": "markdown",
"id": "5922a307eb316c33",
"metadata": {},
"source": [
"It looks like a nice RR Lyrae star!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76cfeb23acf497cd",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-30T21:16:47.499116Z",
"start_time": "2024-05-30T21:16:47.075603Z"
}
},
"outputs": [],
"source": [
"client.close()"
]
}
],
"metadata": {
