From 7b571ed5890777b17af5b7f7f9fa3ec7990ef8f9 Mon Sep 17 00:00:00 2001
From: Ian Harry
Date: Fri, 3 Nov 2023 08:48:57 +0000
Subject: [PATCH] Cherry pick things onto v23 release branch (#4551)

* Read DQ flags with finer time resolution (#4547)

* Read DQ flags with finer time resolution

* No default sample rate for dq flags

* No default sample rate for dq

* Add type for frequency command line option

* Optimize memory usage of pycbc_fit_sngls_split_binned (#4543)

* Optimize memory usage of pycbc_fit_sngls_split_binned

* Need add option here

* Comments on PR

* Example doesn't use code being edited!

* Update bin/all_sky_search/pycbc_fit_sngls_split_binned

Co-authored-by: Thomas Dent

---------

Co-authored-by: Thomas Dent

* Non-coinc ifos in minifollowup trig plots (#4548)

* Update get.sh (#4532)

---------

Co-authored-by: maxtrevor <65971534+maxtrevor@users.noreply.github.com>
Co-authored-by: Thomas Dent
Co-authored-by: Shichao Wu
---
 bin/all_sky_search/pycbc_calculate_dqflag     |  11 +-
 .../pycbc_fit_sngls_split_binned              | 178 +++++++++++++-----
 .../pycbc_foreground_minifollowup             |  49 +++--
 .../pycbc_plot_trigger_timeseries             |   5 +-
 examples/inference/lisa_smbhb_ldc/get.sh      |   6 +-
 5 files changed, 178 insertions(+), 71 deletions(-)

diff --git a/bin/all_sky_search/pycbc_calculate_dqflag b/bin/all_sky_search/pycbc_calculate_dqflag
index b639b539929..981d190ad33 100644
--- a/bin/all_sky_search/pycbc_calculate_dqflag
+++ b/bin/all_sky_search/pycbc_calculate_dqflag
@@ -26,6 +26,8 @@ parser.add_argument('--flag', type=str, required=True,
                     help="name of the data quality flag")
 parser.add_argument('--ifo', type=str, required=True,
                     help="interferometer to analyze")
+parser.add_argument('--sample-frequency', required=True, type=int,
+                    help="timeseries sampling frequency in Hz")
 parser.add_argument("--output-file", required=True,
                     help="name of output file")
 parser.add_argument("--output-channel", required=False,
@@ -35,13 +37,14 @@ args = parser.parse_args()
 
 pycbc.init_logging(args.verbose)
 
-def make_dq_ts(dq_segs, start, end, ifo, flag):
+def make_dq_ts(dq_segs, start, end, ifo, flag, freq):
     """ Create a data quality timeseries
     """
     logging.info('Creating data quality timeseries for flag %s', flag)
     # Keep just times which belong to science_segments
-    dq_times = numpy.arange(start, end)
+    dur = end - start
+    dq_times = numpy.linspace(start, end, int(freq*dur), endpoint=False)
     dq = numpy.zeros(len(dq_times))
     # Identify times within segments for the chosen flag
     indexes, _ = veto.indices_within_segments(dq_times, [dq_segs], ifo=ifo,
@@ -51,7 +54,7 @@ def make_dq_ts(dq_segs, start, end, ifo, flag):
     else:
         logging.warning('Veto definer segment list is empty for flag %s-%s',
                         ifo, flag)
-    return TimeSeries(dq, epoch=start, delta_t=1.)
+    return TimeSeries(dq, epoch=start, delta_t=1.0/freq)
 
 ifo = args.ifo
@@ -63,7 +66,7 @@ if len(flag_list)>1:
 
 # Create data quality time series
 dq = make_dq_ts(args.dq_segments, args.gps_start_time, args.gps_end_time,
-                ifo, flag)
+                ifo, flag, args.sample_frequency)
 
 dq.save(args.output_file, group=args.output_channel)
diff --git a/bin/all_sky_search/pycbc_fit_sngls_split_binned b/bin/all_sky_search/pycbc_fit_sngls_split_binned
index 7326715c700..961ec6f5892 100644
--- a/bin/all_sky_search/pycbc_fit_sngls_split_binned
+++ b/bin/all_sky_search/pycbc_fit_sngls_split_binned
@@ -22,6 +22,7 @@ from matplotlib import pyplot as plt
 import numpy as np
 
 from pycbc import events, bin_utils, results
+from pycbc.io import HFile, SingleDetTriggers
 from pycbc.events import triggers as trigs
 from pycbc.events import trigger_fits as trstats
 from pycbc.events import stat as pystat
@@ -82,6 +83,10 @@ parser.add_argument("--gating-veto-windows", nargs='+',
 parser.add_argument("--stat-fit-threshold", type=float, required=True,
                     help="Only fit triggers with statistic value above this "
                          "threshold. Required")
+parser.add_argument("--plot-lower-stat-limit", type=float, required=True,
+                    help="Plot triggers down to this value. Setting this too "
+                         "low will incur huge memory usage in a full search. "
+                         "To avoid this, choose 5.5 or larger.")
 parser.add_argument("--fit-function",
                     choices=["exponential", "rayleigh", "power"],
                     help="Functional form for the maximum likelihood fit")
@@ -96,6 +101,8 @@ pystat.insert_statistic_option_group(parser,
     default_ranking_statistic='single_ranking_only')
 args = parser.parse_args()
 
+assert args.stat_fit_threshold >= args.plot_lower_stat_limit
+
 pycbc.init_logging(args.verbose)
 
 logging.info('Opening trigger file: %s' % args.trigger_file)
@@ -122,8 +129,12 @@ for ex_p in extparams:
         logging.info('Reading duration from trigger file')
         # List comprehension loops over templates; if a template has no triggers, accessing
         # the 0th entry of its region reference will return zero due to a quirk of h5py.
-        params[ex_p] = np.array([trigf[args.ifo + '/template_duration'][ref][0]
-                for ref in trigf[args.ifo + '/template_duration_template'][:]])
+        params[ex_p] = np.array(
+            [
+                trigf[args.ifo + '/template_duration'][ref][0]
+                for ref in trigf[args.ifo + '/template_duration_template'][:]
+            ]
+        )
     else:
         logging.info("Calculating " + ex_p + " from template parameters")
         params[ex_p] = trigs.get_param(ex_p, args, params['mass1'],
@@ -198,10 +209,65 @@ for i, lower_2, upper_2 in zip(range(args.split_two_nbins),
 
 logging.info('Getting template boundaries from trigger file')
 boundaries = trigf[args.ifo + '/template_boundaries'][:]
+
+trigf.close()
+
+# Set up a data mask to remove any triggers with SNR below threshold.
+# This works as a pre-filter, as SNR is always greater than or equal
+# to sngl_ranking, except in the psdvar case, where it could increase.
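[Editor's note: illustrative sketch, not part of the patch. The comment above
describes the new SNR pre-filter. The snippet below shows the same idea using
plain h5py/numpy; the actual code below uses pycbc.io.HFile.select, which
reads the datasets in chunks instead of loading whole columns. The file name
and threshold value here are hypothetical.]

    import h5py
    import numpy as np

    def snr_prefilter_mask(filename, ifo, lower_limit):
        """Boolean mask over all triggers: True where a trigger could still
        exceed the plotting threshold after ranking. Dividing by
        sqrt(psd_var_val), when present, covers the one case where the
        ranking statistic can exceed the raw SNR."""
        with h5py.File(filename, 'r') as f:
            snr = f[f'{ifo}/snr'][:]
            if f'{ifo}/psd_var_val' in f:
                snr = snr / f[f'{ifo}/psd_var_val'][:] ** 0.5
        return snr >= lower_limit

    # Hypothetical usage:
    # mask = snr_prefilter_mask('H1-TRIGGER_MERGE.hdf', 'H1', 5.5)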
+with HFile(args.trigger_file, 'r') as trig_file:
+    n_triggers_orig = trig_file[f'{args.ifo}/snr'].size
+    logging.info("Trigger file has %d triggers", n_triggers_orig)
+    logging.info('Generating trigger mask')
+    if f'{args.ifo}/psd_var_val' in trig_file:
+        idx, _, _ = trig_file.select(
+            lambda snr, psdvar: snr / psdvar ** 0.5 >= args.plot_lower_stat_limit,
+            f'{args.ifo}/snr',
+            f'{args.ifo}/psd_var_val',
+            return_indices=True
+        )
+    else:
+        # psd_var_val may not have been calculated
+        idx, _ = trig_file.select(
+            lambda snr: snr >= args.plot_lower_stat_limit,
+            f'{args.ifo}/snr',
+            return_indices=True
+        )
+    data_mask = np.zeros(n_triggers_orig, dtype=bool)
+    data_mask[idx] = True
+
+
+logging.info('Calculating single stat values from trigger file')
+trigs = SingleDetTriggers(
+    args.trigger_file,
+    None,
+    None,
+    None,
+    None,
+    args.ifo,
+    premask=data_mask
+)
+# This is the direct pointer to the HDF file, used later on
+trigf = trigs.trigs_f
+
+stat = trigs.get_ranking(args.sngl_ranking)
+time = trigs.end_time
+
+
+logging.info('Processing template boundaries')
 max_boundary_id = np.argmax(boundaries)
 sorted_boundary_list = np.sort(boundaries)
 
-logging.info('Processing template boundaries')
+# In the two blocks of code that follow we are trying to figure out the index
+# ranges in the masked trigger lists corresponding to the "boundaries".
+# We will do this by walking over the boundaries in the order they're
+# stored in the file, adding in the number of triggers not removed by the
+# mask every time.
+
+# The first step is to loop over the "boundaries", which give the start
+# position of each block of triggers (corresponding to one template) in the
+# full trigger merge file.
+# Here we identify the end_idx for the triggers corresponding to each template.
 where_idx_end = np.zeros_like(boundaries)
 for idx, idx_start in enumerate(boundaries):
     if idx == max_boundary_id:
@@ -210,22 +276,41 @@ for idx, idx_start in enumerate(boundaries):
         where_idx_end[idx] = sorted_boundary_list[
             np.argmax(sorted_boundary_list == idx_start) + 1]
 
-logging.info('Calculating single stat values from trigger file')
-rank_method = pystat.get_statistic_from_opts(args, [args.ifo])
-stat = rank_method.get_sngl_ranking(trigf[args.ifo])
+# Next we need to map these start/stop indices in the full file to the start/
+# stop indices in the masked list of triggers. We do this by figuring out
+# how many triggers are in the masked list for each template, in the order
+# they are stored in the trigger merge file, and keeping a running sum.
+curr_count = 0
+mask_start_idx = np.zeros_like(boundaries)
+mask_end_idx = np.zeros_like(boundaries)
+for idx_start in sorted_boundary_list:
+    boundary_idx = np.argmax(boundaries == idx_start)
+    idx_end = where_idx_end[boundary_idx]
+    mask_start_idx[boundary_idx] = curr_count
+    curr_count += np.sum(trigs.mask[idx_start:idx_end])
+    mask_end_idx[boundary_idx] = curr_count
+
 
 if args.veto_file:
     logging.info('Applying DQ vetoes')
-    time = trigf[args.ifo + '/end_time'][:]
-    remove, junk = events.veto.indices_within_segments(time, [args.veto_file],
-                       ifo=args.ifo, segment_name=args.veto_segment_name)
-    # Set stat to zero for triggers being vetoed: given that the fit threshold is
-    # >0 these will not be fitted or plotted. Avoids complications from changing
-    # the number of triggers, ie changes of template boundary.
-    stat[remove] = np.zeros_like(remove)
-    time[remove] = np.zeros_like(remove)
-    logging.info('{} out of {} trigs removed after vetoing with {} from {}'.format(
-        remove.size, stat.size, args.veto_segment_name, args.veto_file))
+    remove, junk = events.veto.indices_within_segments(
+        time,
+        [args.veto_file],
+        ifo=args.ifo,
+        segment_name=args.veto_segment_name
+    )
+    # Set stat to zero for triggers being vetoed: given that the fit threshold
+    # is >0 these will not be fitted or plotted. Avoids complications from
+    # changing the number of triggers, ie changes of template boundary.
+    stat[remove] = 0.
+    time[remove] = 0.
+    logging.info(
+        '%d out of %d trigs removed after vetoing with %s from %s',
+        remove.size,
+        stat.size,
+        args.veto_segment_name,
+        args.veto_file
+    )
 
 if args.gating_veto_windows:
     logging.info('Applying veto to triggers near gates')
@@ -236,19 +321,24 @@ if args.gating_veto_windows:
         raise ValueError("Gating veto window values must be negative before "
                          "gates and positive after gates.")
     if not (gveto_before == 0 and gveto_after == 0):
-        time = trigf[args.ifo + '/end_time'][:]
         autogate_times = np.unique(trigf[args.ifo + '/gating/auto/time'][:])
         if args.ifo + '/gating/file' in trigf:
             detgate_times = trigf[args.ifo + '/gating/file/time'][:]
         else:
             detgate_times = []
         gate_times = np.concatenate((autogate_times, detgate_times))
-        gveto_remove = events.veto.indices_within_times(time, gate_times + gveto_before,
-                                                        gate_times + gveto_after)
-        stat[gveto_remove] = np.zeros_like(gveto_remove)
-        time[gveto_remove] = np.zeros_like(gveto_remove)
-        logging.info('{} out of {} trigs removed after vetoing triggers near gates'.format(
-            gveto_remove.size, stat.size))
+        gveto_remove = events.veto.indices_within_times(
+            time,
+            gate_times + gveto_before,
+            gate_times + gveto_after
+        )
+        stat[gveto_remove] = 0.
+        time[gveto_remove] = 0.
+        logging.info(
+            '%d out of %d trigs removed after vetoing triggers near gates',
+            gveto_remove.size,
+            stat.size
+        )
 
 for x in range(args.split_one_nbins):
     if not args.prune_number:
@@ -266,9 +356,8 @@ for x in range(args.split_one_nbins):
         time_inbin = []
         # getting triggers that are in these templates
         for idx in id_in_both:
-            where_idx_start = boundaries[idx]
-            vals_inbin += list(stat[where_idx_start:where_idx_end[idx]])
-            time_inbin += list(time[where_idx_start:where_idx_end[idx]])
+            vals_inbin += list(stat[mask_start_idx[idx]:mask_end_idx[idx]])
+            time_inbin += list(time[mask_start_idx[idx]:mask_end_idx[idx]])
 
         vals_inbin = np.array(vals_inbin)
         time_inbin = np.array(time_inbin)
@@ -276,32 +365,36 @@ for x in range(args.split_one_nbins):
         count_pruned = 0
         logging.info('Pruning in split {}-{} {}-{}'.format(
             args.split_param_one, x, args.split_param_two, y))
+        logging.info('Currently have %d triggers', len(vals_inbin))
         while count_pruned < args.prune_number:
             # Getting loudest statistic value in split
             max_val_arg = vals_inbin.argmax()
             max_statval = vals_inbin[max_val_arg]
 
-            remove = np.nonzero(abs(time_inbin[max_val_arg] - time)
-                                < args.prune_window)[0]
+            remove = np.nonzero(
+                abs(time_inbin[max_val_arg] - time) < args.prune_window
+            )[0]
             # Remove from inbin triggers as well in case there
             # are more pruning iterations
-            remove_inbin = np.nonzero(abs(time_inbin[max_val_arg] - time_inbin)
-                                      < args.prune_window)[0]
-            logging.info('Prune {}: removing {} triggers around time {:.2f},'
-                         ' {} in this split'.format(count_pruned, remove.size,
-                                                    time[max_val_arg],
-                                                    remove_inbin.size))
+            remove_inbin = np.nonzero(
+                abs(time_inbin[max_val_arg] - time_inbin) < args.prune_window
+            )[0]
+            logging.info(
+                'Prune %d: removing %d triggers around %.2f, %d in this split',
+                count_pruned,
+                remove.size,
+                time_inbin[max_val_arg],
+                remove_inbin.size
+            )
             # Set pruned triggers' stat values to zero, as above for vetoes
-            vals_inbin[remove_inbin] = np.zeros_like(remove_inbin)
-            time_inbin[remove_inbin] = np.zeros_like(remove_inbin)
-            stat[remove] = np.zeros_like(remove)
-            time[remove] = np.zeros_like(remove)
+            vals_inbin[remove_inbin] = 0.
+            time_inbin[remove_inbin] = 0.
+            stat[remove] = 0.
+            time[remove] = 0.
             count_pruned += 1
 
-trigf.close()
-
 logging.info('Setting up plotting and fitting limit values')
-minplot = max(stat[np.nonzero(stat)].min(), args.stat_fit_threshold - 1)
+minplot = max(stat[np.nonzero(stat)].min(), args.plot_lower_stat_limit)
 min_fit = max(minplot, args.stat_fit_threshold)
 max_fit = 1.05 * stat.max()
 if args.plot_max_x:
@@ -355,8 +448,7 @@ for x in range(args.split_one_nbins):
         if len(indices_all_conditions) == 0: continue
         vals_inbin = []
         for idx in indices_all_conditions:
-            where_idx_start = boundaries[idx]
-            vals_inbin += list(stat[where_idx_start:where_idx_end[idx]])
+            vals_inbin += list(stat[mask_start_idx[idx]:mask_end_idx[idx]])
 
         vals_inbin = np.array(vals_inbin)
         vals_above_thresh = vals_inbin[vals_inbin >= args.stat_fit_threshold]
diff --git a/bin/minifollowups/pycbc_foreground_minifollowup b/bin/minifollowups/pycbc_foreground_minifollowup
index 41b3f015887..4d9a19d277e 100644
--- a/bin/minifollowups/pycbc_foreground_minifollowup
+++ b/bin/minifollowups/pycbc_foreground_minifollowup
@@ -103,10 +103,10 @@ file_val = args.analysis_category
 stat = f['{}/stat'.format(file_val)][:]
 
 if args.sort_variable not in f[file_val]:
-    all_datasets = [re.sub(file_val, '', ds).strip('/') for ds in get_all_subkeys(f, file_val)]
-    raise KeyError('Sort variable {0} not in {1}: sort choices in '
-                   '{1} are {2}'.format(args.sort_variable, file_val,
-                                        ', '.join(all_datasets)))
+    all_datasets = [re.sub(file_val, '', ds).strip('/')
+                    for ds in get_all_subkeys(f, file_val)]
+    raise KeyError(f'Sort variable {args.sort_variable} not in {file_val}: '
+                   f'sort choices in {file_val} are '
+                   + ', '.join(all_datasets))
 
 events_to_read = num_events * 100
 
@@ -126,8 +126,8 @@ tids = {}
 # Version used for multi-ifo coinc code
 ifo_list = f.attrs['ifos'].split(' ')
 for ifo in ifo_list:
-    times[ifo] = f['{}/{}/time'.format(file_val,ifo)][:][event_idx]
-    tids[ifo] = f['{}/{}/trigger_id'.format(file_val, ifo)][:][event_idx]
+    times[ifo] = f[f'{file_val}/{ifo}/time'][:][event_idx]
+    tids[ifo] = f[f'{file_val}/{ifo}/trigger_id'][:][event_idx]
 
 bank_data = h5py.File(args.bank_file, 'r')
 
@@ -139,21 +139,30 @@ curr_idx = -1
 while event_count < num_events and curr_idx < (events_to_read - 1):
     curr_idx += 1
     files = wf.FileList([])
+    duplicate = False
+
+    # Times and trig ids for trigger timeseries plot
     ifo_times_strings = []
     ifo_tids_strings = []
-    duplicate = False
-    for ifo in times:
-        ifo_times_string = '%s:%s' % (ifo, times[ifo][curr_idx])
-        ifo_tids_string = '%s:%s' % (ifo, tids[ifo][curr_idx])
-        ifo_times_strings += [ifo_times_string]
-        ifo_tids_strings += [ifo_tids_string]
 
-        # For background do not want to follow up 10 background coincs with
-        # the same event in ifo 1 and different events in ifo 2
+    # Mean time to use as reference for zerolag
+    mtime = coinc.mean_if_greater_than_zero(
+        [times[ifo][curr_idx] for ifo in times])[0]
+
+    for ifo in times:
+        plottime = times[ifo][curr_idx]
+        if plottime == -1:  # Ifo is not in coinc
+            # Use mean time instead and don't plot special trigger
+            plottime = mtime
+        else:  # Do plot special trigger
+            ifo_tids_strings += ['%s:%s' % (ifo, tids[ifo][curr_idx])]
+        ifo_times_strings += ['%s:%s' % (ifo, plottime)]
+
+        # For background do not want to follow up 10 background coincs with the
+        # same event in ifo 1 and different events in ifo 2, so record times
         if ifo not in event_times:
             event_times[ifo] = []
-        # Don't skip coincs in zerolag or due to the sentinel time -1
+        # Only do this for background coincident triggers & not sentinel time -1
         if 'background' in args.analysis_category and \
                times[ifo][curr_idx] != -1 and \
                int(times[ifo][curr_idx]) in event_times[ifo]:
@@ -175,10 +184,10 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
                     workflow,
                     skipped_data,
                     args.output_dir,
-                    tags=['SKIP_{}'.format(event_count)]),)
+                    tags=[f'SKIP_{event_count}']),)
             skipped_data = []
 
-    bank_id = f['{}/template_id'.format(file_val)][:][sorting][curr_idx]
+    bank_id = f[f'{file_val}/template_id'][:][sorting][curr_idx]
 
     layouts += (mini.make_coinc_info(workflow, single_triggers,
                                      tmpltbank_file, coinc_file,
                                      args.output_dir, n_loudest=curr_idx,
@@ -193,7 +202,7 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
         params['%s_end_time' % ifo] = times[ifo][curr_idx]
         try:
             # Only present for precessing case
-            params['u_vals_%s'%ifo] = \
+            params['u_vals_%s' % ifo] = \
                 fsdt[ifo][ifo]['u_vals'][tids[ifo][curr_idx]]
         except:
             pass
@@ -236,8 +245,8 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
                 tags=args.tags + [str(event_count)])
             break
         else:
-            logging.info('Trigger time {} is not valid in {}, ' \
-                         'skipping singles plots'.format(time, single.ifo))
+            logging.info(f'Trigger time {time} is not valid in '
+                         f'{single.ifo}, skipping singles plots')
 
 layouts += list(layout.grouper(files, 2))
 
diff --git a/bin/minifollowups/pycbc_plot_trigger_timeseries b/bin/minifollowups/pycbc_plot_trigger_timeseries
index 78f1e562d4d..8f5f5f0a0da 100644
--- a/bin/minifollowups/pycbc_plot_trigger_timeseries
+++ b/bin/minifollowups/pycbc_plot_trigger_timeseries
@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
 """ Plot the single detector trigger timeseries """
 import argparse, logging, pycbc.version, pycbc.results, sys
 from pycbc.types import MultiDetOptionAction
@@ -109,7 +110,9 @@ for ifo in args.single_trigger_files.keys():
 
     if args.special_trigger_ids:
         special_idx = args.special_trigger_ids[ifo]
-        if special_idx not in idx:
+        if special_idx is None:  # No special trigger for this ifo
+            continue
+        elif special_idx not in idx:
             logging.info("IDX %d not in kept list",
                          args.special_trigger_ids[ifo])
             continue
diff --git a/examples/inference/lisa_smbhb_ldc/get.sh b/examples/inference/lisa_smbhb_ldc/get.sh
index 8cb31c037cc..4d7962c408b 100644
--- a/examples/inference/lisa_smbhb_ldc/get.sh
+++ b/examples/inference/lisa_smbhb_ldc/get.sh
@@ -4,13 +4,13 @@ for channel in A E T
 do
     strain_file=${channel}_TDI_v2.gwf
     test -f ${strain_file} && continue
-    curl -O --show-error --silent https://zenodo.org/record/7497853/files/${strain_file}
+    curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${strain_file}
 
     psd_file=${channel}_psd.txt
     test -f ${psd_file} && continue
-    curl -O --show-error --silent https://zenodo.org/record/7497853/files/${psd_file}
+    curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${psd_file}
 done
 
 params_file=MBHB_params_v2_LISA_frame.pkl
 test -f ${params_file} && continue
-curl -O --show-error --silent https://zenodo.org/record/7497853/files/${params_file}
+curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${params_file}
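[Editor's note: illustrative sketch, not part of the patch. The core of the
memory optimisation in pycbc_fit_sngls_split_binned above is the mapping from
each template's trigger block in the full merge file to its index range in
the masked (pre-filtered) arrays. The self-contained example below reproduces
that mapping on toy data; the array values are invented and only the variable
names mirror the patch.]

    import numpy as np

    # Start index of each template's contiguous trigger block in the
    # unmasked file, in template order: template 1 owns rows 0-3,
    # template 0 rows 4-6, template 2 rows 7-9
    boundaries = np.array([4, 0, 7])
    n_triggers = 10
    # Pre-filter mask over all triggers (True = kept), invented values
    mask = np.array([1, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=bool)

    # End index of each block: the next-larger boundary, or the total
    # trigger count for the block that starts last
    max_boundary_id = np.argmax(boundaries)
    sorted_boundary_list = np.sort(boundaries)
    where_idx_end = np.zeros_like(boundaries)
    for idx, idx_start in enumerate(boundaries):
        if idx == max_boundary_id:
            where_idx_end[idx] = n_triggers
        else:
            where_idx_end[idx] = sorted_boundary_list[
                np.argmax(sorted_boundary_list == idx_start) + 1]

    # Walk the blocks in file order, accumulating how many triggers
    # survive the mask, giving each template's [start, end) range in the
    # masked arrays
    curr_count = 0
    mask_start_idx = np.zeros_like(boundaries)
    mask_end_idx = np.zeros_like(boundaries)
    for idx_start in sorted_boundary_list:
        boundary_idx = np.argmax(boundaries == idx_start)
        mask_start_idx[boundary_idx] = curr_count
        curr_count += np.sum(mask[idx_start:where_idx_end[boundary_idx]])
        mask_end_idx[boundary_idx] = curr_count

    print(mask_start_idx, mask_end_idx)   # -> [3 0 5] [5 3 7]

With these ranges, stat[mask_start_idx[i]:mask_end_idx[i]] replaces slicing
the full-file arrays, so only triggers that survive the pre-filter are ever
held in memory.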