From 7b571ed5890777b17af5b7f7f9fa3ec7990ef8f9 Mon Sep 17 00:00:00 2001
From: Ian Harry
Date: Fri, 3 Nov 2023 08:48:57 +0000
Subject: [PATCH] Cherry pick things onto v23 release branch (#4551)

* Read DQ flags with finer time resolution (#4547)

* Read DQ flags with finer time resolution

* No default sample rate for dq flags

* No default sample rate for dq

* Add type for frequency command line option

* Optimize memory usage of pycbc_fit_sngls_split_binned (#4543)

* Optimize memory usage of pycbc_fit_sngls_split_binned

* Need add option here

* Comments on PR

* Example doesn't use code being edited!

* Update bin/all_sky_search/pycbc_fit_sngls_split_binned

Co-authored-by: Thomas Dent

---------

Co-authored-by: Thomas Dent

* Non-coinc ifos in minifollowup trig plots (#4548)

* Update get.sh (#4532)

---------

Co-authored-by: maxtrevor <65971534+maxtrevor@users.noreply.github.com>
Co-authored-by: Thomas Dent
Co-authored-by: Shichao Wu
---
 bin/all_sky_search/pycbc_calculate_dqflag     |  11 +-
 .../pycbc_fit_sngls_split_binned              | 178 +++++++++++++-----
 .../pycbc_foreground_minifollowup             |  49 +++--
 .../pycbc_plot_trigger_timeseries             |   5 +-
 examples/inference/lisa_smbhb_ldc/get.sh      |   6 +-
 5 files changed, 178 insertions(+), 71 deletions(-)

diff --git a/bin/all_sky_search/pycbc_calculate_dqflag b/bin/all_sky_search/pycbc_calculate_dqflag
index b639b539929..981d190ad33 100644
--- a/bin/all_sky_search/pycbc_calculate_dqflag
+++ b/bin/all_sky_search/pycbc_calculate_dqflag
@@ -26,6 +26,8 @@ parser.add_argument('--flag', type=str, required=True,
                     help="name of the data quality flag")
 parser.add_argument('--ifo', type=str, required=True,
                     help="interferometer to analyze")
+parser.add_argument('--sample-frequency', required=True, type=int,
+                    help="timeseries sampling frequency in Hz")
 parser.add_argument("--output-file", required=True,
                     help="name of output file")
 parser.add_argument("--output-channel", required=False,
@@ -35,13 +37,14 @@ args = parser.parse_args()
 
 pycbc.init_logging(args.verbose)
 
-def make_dq_ts(dq_segs, start, end, ifo, flag):
+def make_dq_ts(dq_segs, start, end, ifo, flag, freq):
     """ Create a data quality timeseries
     """
     logging.info('Creating data quality timeseries for flag %s', flag)
     # Keep just times which belong to science_segments
-    dq_times = numpy.arange(start, end)
+    dur = end - start
+    dq_times = numpy.linspace(start, end, int(freq*dur), endpoint=False)
     dq = numpy.zeros(len(dq_times))
     # Identify times within segments for the chosen flag
     indexes, _ = veto.indices_within_segments(dq_times, [dq_segs], ifo=ifo,
@@ -51,7 +54,7 @@ def make_dq_ts(dq_segs, start, end, ifo, flag):
     else:
         logging.warning('Veto definer segment list is empty for flag %s-%s',
                         ifo, flag)
-    return TimeSeries(dq, epoch=start, delta_t=1.)
+    return TimeSeries(dq, epoch=start, delta_t=1.0/freq)
 
 ifo = args.ifo
@@ -63,7 +66,7 @@ if len(flag_list)>1:
 
 # Create data quality time series
 dq = make_dq_ts(args.dq_segments, args.gps_start_time, args.gps_end_time,
-                ifo, flag)
+                ifo, flag, args.sample_frequency)
 
 dq.save(args.output_file, group=args.output_channel)
diff --git a/bin/all_sky_search/pycbc_fit_sngls_split_binned b/bin/all_sky_search/pycbc_fit_sngls_split_binned
index 7326715c700..961ec6f5892 100644
--- a/bin/all_sky_search/pycbc_fit_sngls_split_binned
+++ b/bin/all_sky_search/pycbc_fit_sngls_split_binned
@@ -22,6 +22,7 @@ from matplotlib import pyplot as plt
 import numpy as np
 
 from pycbc import events, bin_utils, results
+from pycbc.io import HFile, SingleDetTriggers
 from pycbc.events import triggers as trigs
 from pycbc.events import trigger_fits as trstats
 from pycbc.events import stat as pystat
@@ -82,6 +83,10 @@ parser.add_argument("--gating-veto-windows", nargs='+',
 parser.add_argument("--stat-fit-threshold", type=float, required=True,
                     help="Only fit triggers with statistic value above this "
                          "threshold. Required")
+parser.add_argument("--plot-lower-stat-limit", type=float, required=True,
+                    help="Plot triggers down to this value. Setting this too "
+                         "low will incur huge memory usage in a full search. "
+                         "To avoid this, choose 5.5 or larger.")
 parser.add_argument("--fit-function",
                     choices=["exponential", "rayleigh", "power"],
                     help="Functional form for the maximum likelihood fit")
@@ -96,6 +101,8 @@ pystat.insert_statistic_option_group(parser,
     default_ranking_statistic='single_ranking_only')
 args = parser.parse_args()
 
+assert args.stat_fit_threshold >= args.plot_lower_stat_limit
+
 pycbc.init_logging(args.verbose)
 
 logging.info('Opening trigger file: %s' % args.trigger_file)
@@ -122,8 +129,12 @@ for ex_p in extparams:
         logging.info('Reading duration from trigger file')
         # List comprehension loops over templates; if a template has no triggers, accessing
         # the 0th entry of its region reference will return zero due to a quirk of h5py.
-        params[ex_p] = np.array([trigf[args.ifo + '/template_duration'][ref][0]
-                for ref in trigf[args.ifo + '/template_duration_template'][:]])
+        params[ex_p] = np.array(
+            [
+                trigf[args.ifo + '/template_duration'][ref][0]
+                for ref in trigf[args.ifo + '/template_duration_template'][:]
+            ]
+        )
     else:
         logging.info("Calculating " + ex_p + " from template parameters")
         params[ex_p] = trigs.get_param(ex_p, args, params['mass1'],
@@ -198,10 +209,65 @@ for i, lower_2, upper_2 in zip(range(args.split_two_nbins),
 
 logging.info('Getting template boundaries from trigger file')
 boundaries = trigf[args.ifo + '/template_boundaries'][:]
+
+trigf.close()
+
+# Set up a data mask to remove any triggers with SNR below threshold.
+# This works as a pre-filter, as SNR is always greater than or equal
+# to sngl_ranking, except in the psdvar case, where it could increase.
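[Editor's note: illustrative sketch, not part of the patch. The comment above
describes the new SNR pre-filter. The snippet below shows the same idea using
plain h5py/numpy; the actual code below uses pycbc.io.HFile.select, which
reads the datasets in chunks instead of loading whole columns. The file name
and threshold value here are hypothetical.]

    import h5py
    import numpy as np

    def snr_prefilter_mask(filename, ifo, lower_limit):
        """Boolean mask over all triggers: True where a trigger could still
        exceed the plotting threshold after ranking. Dividing by
        sqrt(psd_var_val), when present, covers the one case where the
        ranking statistic can exceed the raw SNR."""
        with h5py.File(filename, 'r') as f:
            snr = f[f'{ifo}/snr'][:]
            if f'{ifo}/psd_var_val' in f:
                snr = snr / f[f'{ifo}/psd_var_val'][:] ** 0.5
        return snr >= lower_limit

    # Hypothetical usage:
    # mask = snr_prefilter_mask('H1-TRIGGER_MERGE.hdf', 'H1', 5.5)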
+with HFile(args.trigger_file, 'r') as trig_file:
+    n_triggers_orig = trig_file[f'{args.ifo}/snr'].size
+    logging.info("Trigger file has %d triggers", n_triggers_orig)
+    logging.info('Generating trigger mask')
+    if f'{args.ifo}/psd_var_val' in trig_file:
+        idx, _, _ = trig_file.select(
+            lambda snr, psdvar: snr / psdvar ** 0.5 >= args.plot_lower_stat_limit,
+            f'{args.ifo}/snr',
+            f'{args.ifo}/psd_var_val',
+            return_indices=True
+        )
+    else:
+        # psd_var_val may not have been calculated
+        idx, _ = trig_file.select(
+            lambda snr: snr >= args.plot_lower_stat_limit,
+            f'{args.ifo}/snr',
+            return_indices=True
+        )
+    data_mask = np.zeros(n_triggers_orig, dtype=bool)
+    data_mask[idx] = True
+
+
+logging.info('Calculating single stat values from trigger file')
+trigs = SingleDetTriggers(
+    args.trigger_file,
+    None,
+    None,
+    None,
+    None,
+    args.ifo,
+    premask=data_mask
+)
+# This is the direct pointer to the HDF file, used later on
+trigf = trigs.trigs_f
+
+stat = trigs.get_ranking(args.sngl_ranking)
+time = trigs.end_time
+
+
+logging.info('Processing template boundaries')
 max_boundary_id = np.argmax(boundaries)
 sorted_boundary_list = np.sort(boundaries)
 
-logging.info('Processing template boundaries')
+# In the two blocks of code that follow we are trying to figure out the index
+# ranges in the masked trigger lists corresponding to the "boundaries".
+# We will do this by walking over the boundaries in the order they're
+# stored in the file, adding in the number of triggers not removed by the
+# mask every time.
+
+# The first step is to loop over the "boundaries", which give the start
+# position of each block of triggers (corresponding to one template) in the
+# full trigger merge file.
+# Here we identify the end_idx for the triggers corresponding to each template.
 where_idx_end = np.zeros_like(boundaries)
 for idx, idx_start in enumerate(boundaries):
     if idx == max_boundary_id:
@@ -210,22 +276,41 @@ for idx, idx_start in enumerate(boundaries):
         where_idx_end[idx] = sorted_boundary_list[
             np.argmax(sorted_boundary_list == idx_start) + 1]
 
-logging.info('Calculating single stat values from trigger file')
-rank_method = pystat.get_statistic_from_opts(args, [args.ifo])
-stat = rank_method.get_sngl_ranking(trigf[args.ifo])
+# Next we need to map these start/stop indices in the full file to the start/
+# stop indices in the masked list of triggers. We do this by figuring out
+# how many triggers are in the masked list for each template, in the order
+# they are stored in the trigger merge file, and keeping a running sum.
+curr_count = 0
+mask_start_idx = np.zeros_like(boundaries)
+mask_end_idx = np.zeros_like(boundaries)
+for idx_start in sorted_boundary_list:
+    boundary_idx = np.argmax(boundaries == idx_start)
+    idx_end = where_idx_end[boundary_idx]
+    mask_start_idx[boundary_idx] = curr_count
+    curr_count += np.sum(trigs.mask[idx_start:idx_end])
+    mask_end_idx[boundary_idx] = curr_count
+
 
 if args.veto_file:
     logging.info('Applying DQ vetoes')
-    time = trigf[args.ifo + '/end_time'][:]
-    remove, junk = events.veto.indices_within_segments(time, [args.veto_file],
-                       ifo=args.ifo, segment_name=args.veto_segment_name)
-    # Set stat to zero for triggers being vetoed: given that the fit threshold is
-    # >0 these will not be fitted or plotted. Avoids complications from changing
-    # the number of triggers, ie changes of template boundary.
-    stat[remove] = np.zeros_like(remove)
-    time[remove] = np.zeros_like(remove)
-    logging.info('{} out of {} trigs removed after vetoing with {} from {}'.format(
-        remove.size, stat.size, args.veto_segment_name, args.veto_file))
+    remove, junk = events.veto.indices_within_segments(
+        time,
+        [args.veto_file],
+        ifo=args.ifo,
+        segment_name=args.veto_segment_name
+    )
+    # Set stat to zero for triggers being vetoed: given that the fit threshold
+    # is >0 these will not be fitted or plotted. Avoids complications from
+    # changing the number of triggers, ie changes of template boundary.
+    stat[remove] = 0.
+    time[remove] = 0.
+    logging.info(
+        '%d out of %d trigs removed after vetoing with %s from %s',
+        remove.size,
+        stat.size,
+        args.veto_segment_name,
+        args.veto_file
+    )
 
 if args.gating_veto_windows:
     logging.info('Applying veto to triggers near gates')
@@ -236,19 +321,24 @@ if args.gating_veto_windows:
         raise ValueError("Gating veto window values must be negative before "
                          "gates and positive after gates.")
     if not (gveto_before == 0 and gveto_after == 0):
-        time = trigf[args.ifo + '/end_time'][:]
         autogate_times = np.unique(trigf[args.ifo + '/gating/auto/time'][:])
         if args.ifo + '/gating/file' in trigf:
             detgate_times = trigf[args.ifo + '/gating/file/time'][:]
         else:
             detgate_times = []
         gate_times = np.concatenate((autogate_times, detgate_times))
-        gveto_remove = events.veto.indices_within_times(time, gate_times + gveto_before,
-                                                        gate_times + gveto_after)
-        stat[gveto_remove] = np.zeros_like(gveto_remove)
-        time[gveto_remove] = np.zeros_like(gveto_remove)
-        logging.info('{} out of {} trigs removed after vetoing triggers near gates'.format(
-            gveto_remove.size, stat.size))
+        gveto_remove = events.veto.indices_within_times(
+            time,
+            gate_times + gveto_before,
+            gate_times + gveto_after
+        )
+        stat[gveto_remove] = 0.
+        time[gveto_remove] = 0.
+        logging.info(
+            '%d out of %d trigs removed after vetoing triggers near gates',
+            gveto_remove.size,
+            stat.size
+        )
 
 for x in range(args.split_one_nbins):
     if not args.prune_number:
@@ -266,9 +356,8 @@ for x in range(args.split_one_nbins):
         time_inbin = []
         # getting triggers that are in these templates
         for idx in id_in_both:
-            where_idx_start = boundaries[idx]
-            vals_inbin += list(stat[where_idx_start:where_idx_end[idx]])
-            time_inbin += list(time[where_idx_start:where_idx_end[idx]])
+            vals_inbin += list(stat[mask_start_idx[idx]:mask_end_idx[idx]])
+            time_inbin += list(time[mask_start_idx[idx]:mask_end_idx[idx]])
 
         vals_inbin = np.array(vals_inbin)
         time_inbin = np.array(time_inbin)
@@ -276,32 +365,36 @@ for x in range(args.split_one_nbins):
         count_pruned = 0
         logging.info('Pruning in split {}-{} {}-{}'.format(
             args.split_param_one, x, args.split_param_two, y))
+        logging.info('Currently have %d triggers', len(vals_inbin))
         while count_pruned < args.prune_number:
             # Getting loudest statistic value in split
             max_val_arg = vals_inbin.argmax()
             max_statval = vals_inbin[max_val_arg]
 
-            remove = np.nonzero(abs(time_inbin[max_val_arg] - time)
-                                < args.prune_window)[0]
+            remove = np.nonzero(
+                abs(time_inbin[max_val_arg] - time) < args.prune_window
+            )[0]
             # Remove from inbin triggers as well in case there
             # are more pruning iterations
-            remove_inbin = np.nonzero(abs(time_inbin[max_val_arg] - time_inbin)
-                                      < args.prune_window)[0]
-            logging.info('Prune {}: removing {} triggers around time {:.2f},'
-                         ' {} in this split'.format(count_pruned, remove.size,
-                                                    time[max_val_arg],
-                                                    remove_inbin.size))
+            remove_inbin = np.nonzero(
+                abs(time_inbin[max_val_arg] - time_inbin) < args.prune_window
+            )[0]
+            logging.info(
+                'Prune %d: removing %d triggers around %.2f, %d in this split',
+                count_pruned,
+                remove.size,
+                time_inbin[max_val_arg],
+                remove_inbin.size
+            )
             # Set pruned triggers' stat values to zero, as above for vetoes
-            vals_inbin[remove_inbin] = np.zeros_like(remove_inbin)
-            time_inbin[remove_inbin] = np.zeros_like(remove_inbin)
-            stat[remove] = np.zeros_like(remove)
-            time[remove] = np.zeros_like(remove)
+            vals_inbin[remove_inbin] = 0.
+            time_inbin[remove_inbin] = 0.
+            stat[remove] = 0.
+            time[remove] = 0.
             count_pruned += 1
 
-trigf.close()
-
 logging.info('Setting up plotting and fitting limit values')
-minplot = max(stat[np.nonzero(stat)].min(), args.stat_fit_threshold - 1)
+minplot = max(stat[np.nonzero(stat)].min(), args.plot_lower_stat_limit)
 min_fit = max(minplot, args.stat_fit_threshold)
 max_fit = 1.05 * stat.max()
 if args.plot_max_x:
@@ -355,8 +448,7 @@ for x in range(args.split_one_nbins):
         if len(indices_all_conditions) == 0: continue
         vals_inbin = []
         for idx in indices_all_conditions:
-            where_idx_start = boundaries[idx]
-            vals_inbin += list(stat[where_idx_start:where_idx_end[idx]])
+            vals_inbin += list(stat[mask_start_idx[idx]:mask_end_idx[idx]])
 
         vals_inbin = np.array(vals_inbin)
         vals_above_thresh = vals_inbin[vals_inbin >= args.stat_fit_threshold]
diff --git a/bin/minifollowups/pycbc_foreground_minifollowup b/bin/minifollowups/pycbc_foreground_minifollowup
index 41b3f015887..4d9a19d277e 100644
--- a/bin/minifollowups/pycbc_foreground_minifollowup
+++ b/bin/minifollowups/pycbc_foreground_minifollowup
@@ -103,10 +103,10 @@ file_val = args.analysis_category
 stat = f['{}/stat'.format(file_val)][:]
 
 if args.sort_variable not in f[file_val]:
-    all_datasets = [re.sub(file_val, '', ds).strip('/') for ds in get_all_subkeys(f, file_val)]
-    raise KeyError('Sort variable {0} not in {1}: sort choices in '
-                   '{1} are {2}'.format(args.sort_variable, file_val,
-                                        ', '.join(all_datasets)))
+    all_datasets = [re.sub(file_val, '', ds).strip('/')
+                    for ds in get_all_subkeys(f, file_val)]
+    raise KeyError(f'Sort variable {args.sort_variable} not in {file_val}: '
+                   f'sort choices in {file_val} are '
+                   + ', '.join(all_datasets))
 
 events_to_read = num_events * 100
 
@@ -126,8 +126,8 @@ tids = {}
 # Version used for multi-ifo coinc code
 ifo_list = f.attrs['ifos'].split(' ')
 for ifo in ifo_list:
-    times[ifo] = f['{}/{}/time'.format(file_val,ifo)][:][event_idx]
-    tids[ifo] = f['{}/{}/trigger_id'.format(file_val, ifo)][:][event_idx]
+    times[ifo] = f[f'{file_val}/{ifo}/time'][:][event_idx]
+    tids[ifo] = f[f'{file_val}/{ifo}/trigger_id'][:][event_idx]
 
 bank_data = h5py.File(args.bank_file, 'r')
 
@@ -139,21 +139,30 @@ curr_idx = -1
 while event_count < num_events and curr_idx < (events_to_read - 1):
     curr_idx += 1
     files = wf.FileList([])
+    duplicate = False
+
+    # Times and trig ids for trigger timeseries plot
     ifo_times_strings = []
     ifo_tids_strings = []
-    duplicate = False
-    for ifo in times:
-        ifo_times_string = '%s:%s' % (ifo, times[ifo][curr_idx])
-        ifo_tids_string = '%s:%s' % (ifo, tids[ifo][curr_idx])
-        ifo_times_strings += [ifo_times_string]
-        ifo_tids_strings += [ifo_tids_string]
 
-        # For background do not want to follow up 10 background coincs with
-        # the same event in ifo 1 and different events in ifo 2
+    # Mean time to use as reference for zerolag
+    mtime = coinc.mean_if_greater_than_zero(
+        [times[ifo][curr_idx] for ifo in times])[0]
+
+    for ifo in times:
+        plottime = times[ifo][curr_idx]
+        if plottime == -1:  # Ifo is not in coinc
+            # Use mean time instead and don't plot special trigger
+            plottime = mtime
+        else:  # Do plot special trigger
+            ifo_tids_strings += ['%s:%s' % (ifo, tids[ifo][curr_idx])]
+        ifo_times_strings += ['%s:%s' % (ifo, plottime)]
+
+        # For background do not want to follow up 10 background coincs with the
+        # same event in ifo 1 and different events in ifo 2, so record times
         if ifo not in event_times:
             event_times[ifo] = []
-        # Don't skip coincs in zerolag or due to the sentinel time -1
+        # Only do this for background coincident triggers & not sentinel time -1
         if 'background' in args.analysis_category and \
                times[ifo][curr_idx] != -1 and \
                int(times[ifo][curr_idx]) in event_times[ifo]:
@@ -175,10 +184,10 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
                     workflow,
                     skipped_data,
                     args.output_dir,
-                    tags=['SKIP_{}'.format(event_count)]),)
+                    tags=[f'SKIP_{event_count}']),)
             skipped_data = []
 
-    bank_id = f['{}/template_id'.format(file_val)][:][sorting][curr_idx]
+    bank_id = f[f'{file_val}/template_id'][:][sorting][curr_idx]
 
     layouts += (mini.make_coinc_info(workflow, single_triggers,
                                      tmpltbank_file, coinc_file,
                                      args.output_dir, n_loudest=curr_idx,
@@ -193,7 +202,7 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
         params['%s_end_time' % ifo] = times[ifo][curr_idx]
         try:
             # Only present for precessing case
-            params['u_vals_%s'%ifo] = \
+            params['u_vals_%s' % ifo] = \
                 fsdt[ifo][ifo]['u_vals'][tids[ifo][curr_idx]]
         except:
             pass
@@ -236,8 +245,8 @@ while event_count < num_events and curr_idx < (events_to_read - 1):
                 tags=args.tags + [str(event_count)])
             break
         else:
-            logging.info('Trigger time {} is not valid in {}, ' \
-                         'skipping singles plots'.format(time, single.ifo))
+            logging.info(f'Trigger time {time} is not valid in '
+                         f'{single.ifo}, skipping singles plots')
 
 layouts += list(layout.grouper(files, 2))
 
diff --git a/bin/minifollowups/pycbc_plot_trigger_timeseries b/bin/minifollowups/pycbc_plot_trigger_timeseries
index 78f1e562d4d..8f5f5f0a0da 100644
--- a/bin/minifollowups/pycbc_plot_trigger_timeseries
+++ b/bin/minifollowups/pycbc_plot_trigger_timeseries
@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
 """ Plot the single detector trigger timeseries """
 import argparse, logging, pycbc.version, pycbc.results, sys
 from pycbc.types import MultiDetOptionAction
@@ -109,7 +110,9 @@ for ifo in args.single_trigger_files.keys():
 
     if args.special_trigger_ids:
         special_idx = args.special_trigger_ids[ifo]
-        if special_idx not in idx:
+        if special_idx is None:  # No special trigger for this ifo
+            continue
+        elif special_idx not in idx:
             logging.info("IDX %d not in kept list",
                          args.special_trigger_ids[ifo])
             continue
diff --git a/examples/inference/lisa_smbhb_ldc/get.sh b/examples/inference/lisa_smbhb_ldc/get.sh
index 8cb31c037cc..4d7962c408b 100644
--- a/examples/inference/lisa_smbhb_ldc/get.sh
+++ b/examples/inference/lisa_smbhb_ldc/get.sh
@@ -4,13 +4,13 @@ for channel in A E T
 do
     strain_file=${channel}_TDI_v2.gwf
     test -f ${strain_file} && continue
-    curl -O --show-error --silent https://zenodo.org/record/7497853/files/${strain_file}
+    curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${strain_file}
 
     psd_file=${channel}_psd.txt
     test -f ${psd_file} && continue
-    curl -O --show-error --silent https://zenodo.org/record/7497853/files/${psd_file}
+    curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${psd_file}
 done
 
 params_file=MBHB_params_v2_LISA_frame.pkl
 test -f ${params_file} && continue
-curl -O --show-error --silent https://zenodo.org/record/7497853/files/${params_file}
+curl -LO --show-error --silent https://zenodo.org/record/7497853/files/${params_file}
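[Editor's note: illustrative sketch, not part of the patch. The core of the
memory optimisation in pycbc_fit_sngls_split_binned above is the mapping from
each template's trigger block in the full merge file to its index range in
the masked (pre-filtered) arrays. The self-contained example below reproduces
that mapping on toy data; the array values are invented and only the variable
names mirror the patch.]

    import numpy as np

    # Start index of each template's contiguous trigger block in the
    # unmasked file, in template order: template 1 owns rows 0-3,
    # template 0 rows 4-6, template 2 rows 7-9
    boundaries = np.array([4, 0, 7])
    n_triggers = 10
    # Pre-filter mask over all triggers (True = kept), invented values
    mask = np.array([1, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=bool)

    # End index of each block: the next-larger boundary, or the total
    # trigger count for the block that starts last
    max_boundary_id = np.argmax(boundaries)
    sorted_boundary_list = np.sort(boundaries)
    where_idx_end = np.zeros_like(boundaries)
    for idx, idx_start in enumerate(boundaries):
        if idx == max_boundary_id:
            where_idx_end[idx] = n_triggers
        else:
            where_idx_end[idx] = sorted_boundary_list[
                np.argmax(sorted_boundary_list == idx_start) + 1]

    # Walk the blocks in file order, accumulating how many triggers
    # survive the mask, giving each template's [start, end) range in the
    # masked arrays
    curr_count = 0
    mask_start_idx = np.zeros_like(boundaries)
    mask_end_idx = np.zeros_like(boundaries)
    for idx_start in sorted_boundary_list:
        boundary_idx = np.argmax(boundaries == idx_start)
        mask_start_idx[boundary_idx] = curr_count
        curr_count += np.sum(mask[idx_start:where_idx_end[boundary_idx]])
        mask_end_idx[boundary_idx] = curr_count

    print(mask_start_idx, mask_end_idx)   # -> [3 0 5] [5 3 7]

With these ranges, stat[mask_start_idx[i]:mask_end_idx[i]] replaces slicing
the full-file arrays, so only triggers that survive the pre-filter are ever
held in memory.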