diff --git a/docs/notebooks/opsim_notebook.ipynb b/docs/notebooks/opsim_notebook.ipynb new file mode 100644 index 00000000..418fbb3e --- /dev/null +++ b/docs/notebooks/opsim_notebook.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Rubin OpSims in Simulations\n", + "\n", + "A critical aspect to producing realistic simulations of time varying phenomena is correctly modeling the cadence at which the objects will be observed and capturing the relevant physical details about the observation. TDAstro provides an `OpSim` module that can load a Rubin opsim file and use that to filter observations based on location, associate observations with their filter, apply detector noise, etc.\n", + "\n", + "This notebook provides an introduction to using opsim data in the simulations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## OpSim Class\n", + "\n", + "The `OpSim` class provides a wrapper for loading and querying survey-specific information including pointings and survey information. The required information is:\n", + " * the pointing data (RA, dec, and times for each pointing), and\n", + " * the zero point information for each pointing (a zero point column or the information needed to derive it)\n", + "\n", + "Other common information includes the airmass, exposure time, and filter used (the information needed to derive the ero points).\n", + "\n", + "Internally, the `OpSim` class uses a table with the column names are given by the imported data. For Rubin opsim files, this means columns will have names such as \"observationStartMJD\" or \"fieldRA\". Since different inputs may have different columns, the class provides the ability to map simple column names such as \"ra\" and \"time\" to the corresponding table names. The constructor allows the user to pass a column-mapping dictionary that maps the short column name to the name used within the database. The default column-mapper corresponds to the Rubin opsim format:\n", + " * \"airmass\" -> \"airmass\"\n", + " * \"dec\" -> \"fieldDec\"\n", + " * \"exptime\" -> \"visitExposureTime\"\n", + " * \"filter\" -> \"filter\"\n", + " * \"ra\" -> \"fieldRA\"\n", + " * \"time\" -> \"observationStartMJD\"\n", + "By default the `OpSim` class will look for Rubin column names, such as \"observationStartMJD\" and \"fieldRA\".\n", + "\n", + "We can create a simple `OpSim` by manually specifying the data as a dict or a pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from tdastro.astro_utils.opsim import OpSim\n", + "\n", + "input_data = {\n", + " \"observationStartMJD\": np.array([0.0, 1.0, 2.0, 3.0, 4.0]),\n", + " \"fieldRA\": np.array([15.0, 30.0, 15.0, 0.0, 60.0]),\n", + " \"fieldDec\": np.array([-10.0, -5.0, 0.0, 5.0, 10.0]),\n", + " \"zp_nJy\": np.ones(5),\n", + "}\n", + "ops_data = OpSim(input_data)\n", + "print(ops_data.table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have an `OpSim` object, we can use the `[]` notation to access data directly. `OpSim` has the same mechanics as a pandas data frame. The columns are named and correspond to input attributes. Each row provides the information for a single pointing.\n", + "\n", + "As noted above, the column mapping allows us to access column by either their given name (\"observationStartMJD\") or their shortened name (\"time\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ops_data[\"observationStartMJD\"] # Same as ops_data[\"time\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading an OpSim\n", + "\n", + "Most users will want to load a pre-existing `OpSim` from a database file using the `from_db()` function. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from tdastro import _TDASTRO_TEST_DATA_DIR\n", + "\n", + "opsim_file = Path(_TDASTRO_TEST_DATA_DIR) / \"opsim_shorten.db\"\n", + "ops_data = OpSim.from_db(opsim_file)\n", + "\n", + "print(f\"Loaded an opsim database with {len(ops_data)} entries.\")\n", + "print(f\"Columns: {ops_data.columns}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spatial Matching\n", + "\n", + "The `OpSim` class provides a framework for efficiently determining when an object was observed given its (RA, dec). We use the `range_search()` function to the indices of all pointings that are within a given radius of the query point.\n", + "\n", + "We start by taking the (RA, dec) of the first observation in the table and using that to determine all times this position was observed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_ra = ops_data[\"ra\"][0]\n", + "query_dec = ops_data[\"dec\"][0]\n", + "print(f\"Searching for ({query_ra}, {query_dec}).\")\n", + "\n", + "# Find everything within 0.5 degrees of the query point.\n", + "matches = ops_data.range_search(query_ra, query_dec, 0.5)\n", + "print(f\"Found {len(matches)} matches at {matches}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have the indices, we can use those to find the other information about the pointings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ops_data[\"time\"][matches]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can run the spatial search in batch mode by providing a lists of RA and dec. The `range_search()` will return a list of numpy arrays where each element in the top-level list represents the matches for a single query (RA, dec). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_queries = 10\n", + "query_ra = ops_data[\"ra\"][0:num_queries]\n", + "query_dec = ops_data[\"dec\"][0:num_queries]\n", + "\n", + "matches = ops_data.range_search(query_ra, query_dec, 0.5)\n", + "for idx, m_ids in enumerate(matches):\n", + " print(f\"{idx}: ({query_ra[idx]}, {query_dec[idx]}) matched {m_ids}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Applying Noise\n", + "\n", + "The `OpSim` object use the information provided in its table to compute the noise properties for each observation and use those to inject synthetic noise into the observations. The `bandflux_error_point_source` function takes the pre-noise flux densities (bandflux of the point source in nJy) and returns the simulated bandflux noise in nJy.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fluxes = np.array([100.0, 200.0, 300.0])\n", + "flux_err = ops_data.bandflux_error_point_source(fluxes, [0, 1, 2])\n", + "flux_err" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This can then be feed into the `apply_noise` utility function to apply it to the flux." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tdastro.astro_utils.noise_model import apply_noise\n", + "\n", + "noisy_fluxes = apply_noise(fluxes, flux_err)\n", + "noisy_fluxes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering\n", + "\n", + "Depending on the analysis we run, we might want to need to use only a subset of the data. The `OpSim` class provides a helper function `filter_rows()` that can be used to filter the data in an `OpSim` and thus speed up subsequent operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_data[\"filter\"] = np.array([\"r\", \"g\", \"g\", \"r\", \"g\"])\n", + "ops_data2 = OpSim(input_data)\n", + "\n", + "print(\"BEFORE:\")\n", + "print(ops_data2.table)\n", + "\n", + "ops_data3 = ops_data2.filter_rows(ops_data2[\"filter\"] == \"g\")\n", + "\n", + "print(\"\\n\\nAFTER:\")\n", + "print(ops_data3.table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to filtering the rows in the table, the function will also update the cached data structures within the `OpSim` object, such as the KD-tree." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scalability and Testing\n", + "\n", + "To test the scalability of the OpSim we can user the `create_random_opsim()` helper function to generate completely random data by sampling uniformly over the surface of a sphere. This will not be a realistic survey since it will cover both the Northern and Southern hemisphere, but can provide a good test set for timing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "from tdastro.astro_utils.opsim import create_random_opsim\n", + "\n", + "# Create an opsim with 1,000,000 observations.\n", + "num_obs = 1_000_000\n", + "opsim_data2 = create_random_opsim(num_obs)\n", + "\n", + "# Use the first 10,000 samples as queries.\n", + "num_queries = 100_000\n", + "query_ra = np.asarray(opsim_data2[\"ra\"][0:num_queries])\n", + "query_dec = np.asarray(opsim_data2[\"dec\"][0:num_queries])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the large OpSim data, we can test where batching speeds up the range search." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit\n", + "for i in range(num_queries):\n", + " _ = opsim_data2.range_search(query_ra[i], query_dec[i], 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit\n", + "_ = opsim_data2.range_search(query_ra, query_dec, 0.5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tdastro", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/tdastro/astro_utils/opsim.py b/src/tdastro/astro_utils/opsim.py index 0b2f2986..c72d33c0 100644 --- a/src/tdastro/astro_utils/opsim.py +++ b/src/tdastro/astro_utils/opsim.py @@ -87,12 +87,18 @@ class OpSim: # noqa: D101 _kd_tree : `scipy.spatial.KDTree` or None A kd_tree of the OpSim pointings for fast spatial queries. We use the scipy kd-tree instead of astropy's functions so we can directly control caching. - _pixel_scale : `float` or None, optional + pixel_scale : `float` or None, optional The pixel scale for the LSST camera in arcseconds per pixel. - _read_noise : `float` or None, optional + read_noise : `float` or None, optional The readout noise for the LSST camera in electrons per pixel. - _dark_current : `float` or None, optional + dark_current : `float` or None, optional The dark current for the LSST camera in electrons per second per pixel. + ext_coeff : `dict` or None, optional + Mapping of filter names to extinction coefficients. Defaults to + the Rubin OpSim values. + zp_per_sec : `dict` or None, optional + Mapping of filter names to zeropoints at zenith. Defaults to + the Rubin OpSim values. """ _required_names = ["ra", "dec", "time"] @@ -125,6 +131,8 @@ def __init__( self.pixel_scale = LSSTCAM_PIXEL_SCALE if pixel_scale is None else pixel_scale self.read_noise = _lsstcam_readout_noise if read_noise is None else read_noise self.dark_current = _lsstcam_dark_current if dark_current is None else dark_current + self.ext_coeff = _lsstcam_extinction_coeff if ext_coeff is None else ext_coeff + self.zp_per_sec = _lsstcam_zeropoint_per_sec_zenith if zp_per_sec is None else zp_per_sec # Build the kd-tree. self._kd_tree = None @@ -133,7 +141,7 @@ def __init__( # If we are not given zero point data, try to derive it from the other columns. if not self.has_columns("zp"): if self.has_columns(["filter", "airmass", "exptime"]): - self._assign_zero_points(ext_coeff=ext_coeff, zp_per_sec=zp_per_sec) + self._assign_zero_points(ext_coeff=self.ext_coeff, zp_per_sec=self.zp_per_sec) else: raise ValueError( "OpSim must include either a zero point column or the columns " @@ -301,6 +309,59 @@ def write_opsim_table(self, filename, tablename="observations", overwrite=False) con.close() + def time_bounds(self): + """Returns the min and max times for all observations in the OpSim. + + Returns + ------- + t_min, t_max : float, float + The min and max times for all observations in the OpSim. + """ + t_min = self.table[self.colmap["time"]].min() + t_max = self.table[self.colmap["time"]].max() + return t_min, t_max + + def filter_rows(self, rows): + """Filter the rows in the OpSim to only include those indices that are provided + in a list of row indices (integers) or marked True in a mask. + + Parameters + ---------- + rows : numpy.ndarray + Either a Boolean array of the same length as the table or list of integer + row indices to keep. + + Returns + ------- + new_opsim : OpSim + A new OpSim object with the reduced rows. + """ + # Check if we are dealing with a mask of a list of indices. + rows = np.asarray(rows) + if rows.dtype == bool: + if len(rows) != len(self.table): + raise ValueError( + f"Mask length mismatch. Expected {len(self.table)} rows, but found {len(rows)}." + ) + mask = rows + else: + mask = np.full((len(self.table),), False) + mask[rows] = True + + # Do the actual filtering and generate a new OpSim. This automatically creates + # the cached data, such as the KD-tree. + new_table = self.table[mask] + new_opsim = OpSim( + new_table, + colmap=self.colmap, + ext_coeff=self.ext_coeff, + zp_per_sec=self.zp_per_sec, + pixel_scale=self.pixel_scale, + read_noise=self.read_noise, + dark_current=self.dark_current, + ) + return new_opsim + def range_search(self, query_ra, query_dec, radius): """Return the indices of the opsim pointings that fall within the field of view of the query point(s). @@ -403,6 +464,58 @@ def bandflux_error_point_source(self, bandflux, index): ) +def create_random_opsim(num_obs, seed=None): + """Create a random OpSim pointings drawn uniformly from (RA, dec). + + Parameters + ---------- + num_obs : int + The size of the OpSim to generate. + seed : int + The seed to used for random number generation. If None then + uses a default random number generator. + Default: None + + Returns + ------- + opsim_data : OpSim + The OpSim data structure. + seed : int, optional + The seed for the random number generator. + """ + if num_obs <= 0: + raise ValueError("Number of observations must be greater than zero.") + + rng = np.random.default_rng() if seed is None else np.random.default_rng(seed=seed) + + # Generate the (RA, dec) pairs uniformly on the surface of a sphere. + ra = np.degrees(rng.uniform(0.0, 2.0 * np.pi, size=num_obs)) + dec = np.degrees(np.arccos(2.0 * rng.uniform(0.0, 1.0, size=num_obs) - 1.0) - (np.pi / 2.0)) + + # Generate the information needed to compute zeropoint. + airmass = rng.uniform(1.3, 1.7, size=num_obs) + filter = rng.choice(["u", "g", "r", "i", "z", "y"], size=num_obs) + + input_data = { + "observationStartMJD": 0.05 * np.arange(num_obs), + "fieldRA": ra, + "fieldDec": dec, + "airmass": airmass, + "filter": filter, + "visitExposureTime": 29.0 * np.ones(num_obs), + } + + opsim = OpSim( + input_data, + ext_coeff=_lsstcam_extinction_coeff, + zp_per_sec=_lsstcam_zeropoint_per_sec_zenith, + pixel_scale=LSSTCAM_PIXEL_SCALE, + read_noise=_lsstcam_readout_noise, + dark_current=_lsstcam_dark_current, + ) + return opsim + + def opsim_add_random_data(opsim_data, colname, min_val=0.0, max_val=1.0): """Add a column composed of random uniform data. Used for testing. diff --git a/tests/tdastro/astro_utils/test_opsim.py b/tests/tdastro/astro_utils/test_opsim.py index a2a4aff2..85294880 100644 --- a/tests/tdastro/astro_utils/test_opsim.py +++ b/tests/tdastro/astro_utils/test_opsim.py @@ -7,6 +7,7 @@ from tdastro.astro_utils.mag_flux import mag2flux from tdastro.astro_utils.opsim import ( OpSim, + create_random_opsim, opsim_add_random_data, oversample_opsim, ) @@ -26,6 +27,11 @@ def test_create_opsim(): assert len(ops_data) == 5 assert len(ops_data.columns) == 4 + # Check that we can extract the time bounds. + t_min, t_max = ops_data.time_bounds() + assert t_min == 0.0 + assert t_max == 4.0 + # We can query which columns the OpSim has. assert ops_data.has_columns("fieldRA") assert ops_data.has_columns("dec") @@ -113,6 +119,49 @@ def test_opsim_add_columns(): assert np.all(values <= 5.0) +def test_opsim_filter_rows(): + """Test that the user can filter out OpSim rows.""" + times = np.arange(0.0, 10.0, 1.0) + values = { + "observationStartMJD": times, + "fieldRA": 15.0 * (times + 1.0), + "fieldDec": -1.0 * times, + "zp_nJy": np.ones(10), + "filter": np.tile(["r", "g"], 5), + } + ops_data = OpSim(values) + assert len(ops_data) == 10 + assert len(ops_data.columns) == 5 + + # We can filter the OpSim to specific rows by index. + inds = [0, 1, 2, 3, 4, 5, 7, 8] + ops_data = ops_data.filter_rows(inds) + assert len(ops_data) == 8 + assert len(ops_data.columns) == 5 + + expected_times = np.array(inds) + assert np.allclose(ops_data["time"], expected_times) + assert np.allclose(ops_data["ra"], 15.0 * (expected_times + 1.0)) + assert np.allclose(ops_data["dec"], -1.0 * expected_times) + assert np.array_equal(ops_data["filter"], values["filter"][inds]) + + # Check that the size of the internal KD-tree has changed. + assert ops_data._kd_tree.n == 8 + + # We can filter the OpSim to specific rows by mask. + ops_data = ops_data.filter_rows(ops_data["filter"] == "r") + assert len(ops_data) == 4 + + expected_times = np.array([0.0, 2.0, 4.0, 8.0]) + assert np.allclose(ops_data["time"], expected_times) + assert np.allclose(ops_data["ra"], 15.0 * (expected_times + 1.0)) + assert np.allclose(ops_data["dec"], -1.0 * expected_times) + assert np.all(ops_data["filter"] == "r") + + # Check that the size of the internal KD-tree has changed (again). + assert ops_data._kd_tree.n == 4 + + def test_read_small_opsim(opsim_small): """Read in a small OpSim file from the testing data directory.""" ops_data = OpSim.from_db(opsim_small) @@ -253,6 +302,12 @@ def test_opsim_flux_err_point_source(opsim_shorten): np.testing.assert_allclose(flux_err, expected_flux_err, rtol=0.2) +def test_create_random_opsim(): + """Test that we can create a complete random OpSim.""" + opsim = create_random_opsim(1000) + assert len(opsim) == 1000 + + def test_oversample_opsim(opsim_shorten): """Test that we can oversample an OpSim file.""" opsim = OpSim.from_db(opsim_shorten) diff --git a/tests/tdastro/conftest.py b/tests/tdastro/conftest.py index f37c45cd..e1c96305 100644 --- a/tests/tdastro/conftest.py +++ b/tests/tdastro/conftest.py @@ -24,13 +24,13 @@ def grid_data_bad_file(test_data_dir): @pytest.fixture def opsim_small(test_data_dir): - """Return the file path for the bad grid input file.""" + """Return the file path for the small opsim db.""" return test_data_dir / "opsim_small.db" @pytest.fixture def opsim_shorten(test_data_dir): - """Return the file path for the bad grid input file.""" + """Return the file path for the shortened opsim db.""" return test_data_dir / "opsim_shorten.db"