From 4ca6c71ab6a8723a8a712e4a2e37c7b61b266e41 Mon Sep 17 00:00:00 2001 From: David Moreau Date: Tue, 26 Nov 2024 12:15:19 -0800 Subject: [PATCH] Implemented filter by resolution and n_obs in cctbx.xfel.merge --- .../application/filter/experiment_filter.py | 152 ++++++++++++------ xfel/merging/application/phil/phil.py | 8 +- 2 files changed, 111 insertions(+), 49 deletions(-) diff --git a/xfel/merging/application/filter/experiment_filter.py b/xfel/merging/application/filter/experiment_filter.py index 280a956c21..a42bd3f1b3 100644 --- a/xfel/merging/application/filter/experiment_filter.py +++ b/xfel/merging/application/filter/experiment_filter.py @@ -9,19 +9,11 @@ class experiment_filter(worker): def __init__(self, params, mpi_helper=None, mpi_logger=None): super(experiment_filter, self).__init__(params=params, mpi_helper=mpi_helper, mpi_logger=mpi_logger) - def validate(self): - if 'unit_cell' not in self.params.filter.algorithm: # so far only "unit_cell" algorithm is supported - return - assert self.params.filter.unit_cell.value.target_space_group is not None, \ - 'Space group is required for unit cell filtering' - def __repr__(self): return 'Filter experiments' def check_unit_cell(self, experiment): - experiment_unit_cell = experiment.crystal.get_unit_cell() - is_ok = experiment_unit_cell.is_similar_to(self.params.filter.unit_cell.value.target_unit_cell, self.params.filter.unit_cell.value.relative_length_tolerance, self.params.filter.unit_cell.value.absolute_angle_tolerance) @@ -49,7 +41,6 @@ def remove_experiments(experiments, reflections, experiment_ids_to_remove): experiments.select_on_experiment_identifiers([i for i in experiments.identifiers() if i not in experiment_ids_to_remove]) reflections.remove_on_experiment_identifiers(experiment_ids_to_remove) reflections.reset_ids() - return experiments, reflections def check_cluster(self, experiment): @@ -69,18 +60,89 @@ def check_cluster(self, experiment): return m_distance < self.params.filter.unit_cell.cluster.covariance.mahalanobis def run(self, experiments, reflections): - if 'unit_cell' not in self.params.filter.algorithm: # so far only "unit_cell" algorithm is supported + filter_by_unit_cell = 'unit_cell' in self.params.filter.algorithm[0] + filter_by_n_obs = 'n_obs' in self.params.filter.algorithm[0] + filter_by_resolution = 'resolution' in self.params.filter.algorithm[0] + # only "unit_cell" "n_obs" and "resolution" algorithms are supported + if (not filter_by_unit_cell) and (not filter_by_n_obs) and (not filter_by_resolution): return experiments, reflections + if filter_by_unit_cell: + assert self.params.filter.unit_cell.value.target_space_group is not None, \ + 'Space group is required for unit cell filtering' + if filter_by_n_obs: + check0 = self.params.filter.n_obs.min is not None + check1 = self.params.filter.n_obs.max is not None + assert check0 or check1, \ + 'Either min or max is required for n_obs filtering' + if filter_by_resolution: + assert self.params.filter.resolution.d_min is not None, \ + 'd_min is required for resolution filtering' self.logger.log_step_time("FILTER_EXPERIMENTS") + # BEGIN BY-VALUE FILTER + experiment_ids_to_remove = [] + if filter_by_unit_cell: + experiment_ids_to_remove_unit_cell, removed_for_unit_cell, removed_for_space_group = self.run_filter_by_unit_cell(experiments, reflections) + experiment_ids_to_remove += experiment_ids_to_remove_unit_cell + else: + removed_for_unit_cell = 0 + removed_for_space_group = 0 + if filter_by_n_obs: + experiment_ids_to_remove_n_obs, removed_for_n_obs = self.run_filter_by_n_obs(experiments, reflections) + experiment_ids_to_remove += experiment_ids_to_remove_n_obs + else: + removed_for_n_obs = 0 + if filter_by_resolution: + experiment_ids_to_remove_resolution, removed_for_resolution = self.run_filter_by_resolution(experiments, reflections) + experiment_ids_to_remove += experiment_ids_to_remove_resolution + else: + removed_for_resolution = 0 + experiment_ids_to_remove = list(set(experiment_ids_to_remove)) + + input_len_expts = len(experiments) + input_len_refls = len(reflections) + new_experiments, new_reflections = experiment_filter.remove_experiments(experiments, reflections, experiment_ids_to_remove) + + removed_reflections = input_len_refls - len(new_reflections) + #assert removed_for_space_group + removed_for_unit_cell == input_len_expts - len(new_experiments) + + self.logger.log("Experiments rejected because of unit cell dimensions: %d"%removed_for_unit_cell) + self.logger.log("Experiments rejected because of space group %d"%removed_for_space_group) + self.logger.log("Experiments rejected because of n_obs %d"%removed_for_n_obs) + self.logger.log("Experiments rejected because of resolution %d"%removed_for_resolution) + self.logger.log("Reflections rejected because of rejected experiments: %d"%removed_reflections) + + # MPI-reduce total counts + comm = self.mpi_helper.comm + MPI = self.mpi_helper.MPI + total_removed_for_unit_cell = comm.reduce(removed_for_unit_cell, MPI.SUM, 0) + total_removed_for_space_group = comm.reduce(removed_for_space_group, MPI.SUM, 0) + total_removed_for_n_obs = comm.reduce(removed_for_n_obs, MPI.SUM, 0) + total_removed_for_resolution = comm.reduce(removed_for_resolution, MPI.SUM, 0) + total_reflections_removed = comm.reduce(removed_reflections, MPI.SUM, 0) + + # rank 0: log total counts + if self.mpi_helper.rank == 0: + self.logger.main_log("Total experiments rejected because of unit cell dimensions: %d"%total_removed_for_unit_cell) + self.logger.main_log("Total experiments rejected because of space group %d"%total_removed_for_space_group) + self.logger.main_log("Total experiments rejected because of n_obs %d"%total_removed_for_n_obs) + self.logger.main_log("Total experiments rejected because of resolution %d"%total_removed_for_resolution) + self.logger.main_log("Total reflections rejected because of rejected experiments %d"%total_reflections_removed) + + self.logger.log_step_time("FILTER_EXPERIMENTS", True) + + # Do we have any data left? + from xfel.merging.application.utils.data_counter import data_counter + data_counter(self.params).count(new_experiments, new_reflections) + return new_experiments, new_reflections + + def filter_by_unit_cell(self, experiments, reflections): experiment_ids_to_remove = [] removed_for_unit_cell = 0 removed_for_space_group = 0 - -# BEGIN BY-VALUE FILTER if self.params.filter.unit_cell.algorithm == "value": - # If the filter unit cell and/or space group params are Auto, use the corresponding scaling targets. + # If the filter unit cell and/or space group params are Auto, use the corresponding scaling targets. if self.params.filter.unit_cell.value.target_unit_cell in (Auto, None): if self.params.scaling.unit_cell is None: try: @@ -102,12 +164,10 @@ def run(self, experiments, reflections): elif not self.check_unit_cell(experiment): experiment_ids_to_remove.append(experiment.identifier) removed_for_unit_cell += 1 -# END BY-VALUE FILTER + # END BY-VALUE FILTER elif self.params.filter.unit_cell.algorithm == "cluster": - from uc_metrics.clustering.util import get_population_permutation # implicit import import pickle - class Empty: pass if self.mpi_helper.rank == 0: with open(self.params.filter.unit_cell.cluster.covariance.file,'rb') as F: @@ -136,38 +196,36 @@ class Empty: pass if not self.check_cluster(experiment): experiment_ids_to_remove.append(experiment.identifier) removed_for_unit_cell += 1 -# END OF COVARIANCE FILTER - input_len_expts = len(experiments) - input_len_refls = len(reflections) - new_experiments, new_reflections = experiment_filter.remove_experiments(experiments, reflections, experiment_ids_to_remove) - - removed_reflections = input_len_refls - len(new_reflections) - assert removed_for_space_group + removed_for_unit_cell == input_len_expts - len(new_experiments) - - self.logger.log("Experiments rejected because of unit cell dimensions: %d"%removed_for_unit_cell) - self.logger.log("Experiments rejected because of space group %d"%removed_for_space_group) - self.logger.log("Reflections rejected because of rejected experiments: %d"%removed_reflections) - - # MPI-reduce total counts - comm = self.mpi_helper.comm - MPI = self.mpi_helper.MPI - total_removed_for_unit_cell = comm.reduce(removed_for_unit_cell, MPI.SUM, 0) - total_removed_for_space_group = comm.reduce(removed_for_space_group, MPI.SUM, 0) - total_reflections_removed = comm.reduce(removed_reflections, MPI.SUM, 0) - - # rank 0: log total counts - if self.mpi_helper.rank == 0: - self.logger.main_log("Total experiments rejected because of unit cell dimensions: %d"%total_removed_for_unit_cell) - self.logger.main_log("Total experiments rejected because of space group %d"%total_removed_for_space_group) - self.logger.main_log("Total reflections rejected because of rejected experiments %d"%total_reflections_removed) - - self.logger.log_step_time("FILTER_EXPERIMENTS", True) - - # Do we have any data left? - from xfel.merging.application.utils.data_counter import data_counter - data_counter(self.params).count(new_experiments, new_reflections) + # END OF COVARIANCE FILTER + return experiment_ids_to_remove, removed_for_unit_cell, removed_for_space_group - return new_experiments, new_reflections + def run_filter_by_n_obs(self, experiments, reflections): + experiment_ids_to_remove = [] + removed_for_n_obs = 0 + for expt_index, experiment in enumerate(experiments): + refls_expt = reflections.select(reflections['id'] == expt_index) + if self.params.filter.n_obs.max and len(refls_expt) > self.params.filter.n_obs.max: + experiment_ids_to_remove.append(experiment.identifier) + removed_for_n_obs += 1 + if self.params.filter.n_obs.min and len(refls_expt) < self.params.filter.n_obs.min: + experiment_ids_to_remove.append(experiment.identifier) + removed_for_n_obs += 1 + return experiment_ids_to_remove, removed_for_n_obs + + def run_filter_by_resolution(self, experiments, reflections): + experiment_ids_to_remove = [] + removed_for_resolution = 0 + resolution = reflections.compute_d(experiments) + start = 0 + for expt_index, expt in enumerate(experiments): + refls_expt = reflections.select(reflections['id'] == expt_index) + n_refls = refls_expt.size() + max_resolution = min(resolution[start: start + n_refls]) + start += n_refls + if max_resolution > self.params.filter.resolution.d_min: + experiment_ids_to_remove.append(expt.identifier) + removed_for_resolution += 1 + return experiment_ids_to_remove, removed_for_resolution if __name__ == '__main__': from xfel.merging.application.worker import exercise_worker diff --git a/xfel/merging/application/phil/phil.py b/xfel/merging/application/phil/phil.py index 327550717d..9c7c3f5876 100644 --- a/xfel/merging/application/phil/phil.py +++ b/xfel/merging/application/phil/phil.py @@ -129,14 +129,18 @@ .help = The filter section defines criteria to accept or reject whole experiments .help = or to modify the entire experiment by a reindexing operator .help = refer to the select section for filtering of individual reflections + .help = only unit_cell, resolution, and n_obs are implemented { algorithm = n_obs reindex resolution unit_cell report - .type = choice + .type = strings .multiple = True n_obs { - min = 15 + min = None .type = int .help = Minimum number of observations for subsequent processing + max = None + .type = int + .help = Maximum number of observations for subsequent processing } reindex { data_reindex_op = h,k,l