From 54c44d0f73589ac47c42fc4d0601aae53910c499 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 10:55:36 -0800 Subject: [PATCH 1/9] Annots --- src/dartsort/cluster/gaussian_mixture.py | 80 ++++++++++-------------- 1 file changed, 32 insertions(+), 48 deletions(-) diff --git a/src/dartsort/cluster/gaussian_mixture.py b/src/dartsort/cluster/gaussian_mixture.py index f3a46119..4c4870d6 100644 --- a/src/dartsort/cluster/gaussian_mixture.py +++ b/src/dartsort/cluster/gaussian_mixture.py @@ -249,16 +249,8 @@ def m_step(self, likelihoods=None, show_progress=False, prev_means=None): for j, unit in enumerate(zip(unit_ids, results)): assert unit.annotations["unit_id"] == j self.units.append(unit) - # if self.log_proportions is not None: - # # this is the index of the noise unit. it's got to be larger than - # # the largest unit index - # maxix = self.log_proportions.numel() - 1 - # assert (unit_ids < maxix).all() - # ixs = torch.cat((unit_ids, torch.tensor([maxix]))) - # self.log_proportions = self.log_proportions[ixs] if prev_means is not None: nu = len(unit_ids) - # prev_means = prev_means[unit_ids] new_means, *_ = self.stack_units(mean_only=True) dmu = (prev_means - new_means).abs_().view(nu, -1) adif = torch.max(dmu) @@ -387,14 +379,8 @@ def reassign(self, log_liks): self.labels.copy_(assignments) return reassign_count, spike_logliks, log_liks_csc - def cleanup(self, log_liks=None, clean_units=True, min_count=None): - """Remove too-small units - - Also handles bookkeeping to throw those units out of the sparse - log_liks array, and to throw away small units in self.units. - clean_units should be true unless your next move is an m step. - For instance if it's an E step, it's like the cleanup never happened. - """ + def cleanup(self, log_liks=None, min_count=None): + """Remove too-small units, make label space contiguous, tidy all properties""" unit_ids, counts = torch.unique(self.labels, return_counts=True) counts = counts[unit_ids >= 0] unit_ids = unit_ids[unit_ids >= 0] @@ -411,17 +397,33 @@ def cleanup(self, log_liks=None, clean_units=True, min_count=None): if log_liks is not None: has_noise_unit = log_liks.shape[1] > len(self.units) - self.relabel_units(unit_ids[big_enough]) - if clean_units and self.log_proportions is not None: + kept_ids = unit_ids[big_enough] + new_ids = torch.arange(kept_ids.numel()) + old2new = dict(zip(kept_ids, new_ids)) + self._relabel(kept_ids) + + if self.log_proportions is not None: lps = self.log_proportions.numpy(force=True) lps = lps[keep_noise.numpy(force=True)] # logsumexp to 0 (sumexp to 1) again lps -= logsumexp(lps) self.log_proportions = self.log_proportions.new_tensor(lps) - if clean_units and len(self.units): - keep_units = [u for j, u in enumerate(self.units) if keep[j]] + + if len(self.units): + keep_units = [] + for oi, ni in zip(kept_ids, new_ids): + u = self.units[oi] + u.annotations["unit_id"] = ni + keep_units.append(u) del self.units[:] self.units.extend(keep_units) + + if self.next_round_annotations: + next_round_annotations = {} + for j, nra in self.next_round_annotations.items(): + if keep[j]: + next_round_annotations[old2new[j]] = nra + if log_liks is None: return @@ -1091,32 +1093,19 @@ def unit_group_criterion( if fit_type == "refit_all": units = [] - subunit_logliks = spikes_core.features.new_full( - (len(unit_ids), len(in_any)), -torch.inf - ) + subunit_logliks = spikes_core.features.new_full((len(unit_ids), len(in_any)), -torch.inf) full_loglik = 0.0 for i, k in enumerate(unit_ids): - u = self.fit_unit( - unit_id=k, - 
indices=in_any, - likelihoods=likelihoods, - features=spikes_extract, - ) + u = self.fit_unit(unit_id=k, indices=in_any, likelihoods=likelihoods, features=spikes_extract) units.append(u) - _, subunit_logliks[i] = self.unit_log_likelihoods( - unit=u, spikes=spikes_core - ) + _, subunit_logliks[i] = self.unit_log_likelihoods(unit=u, spikes=spikes_core) subunit_log_props = F.softmax(subunit_logliks, dim=0).mean(1).log_() # loglik per spik - full_loglik = torch.logsumexp( - subunit_logliks.T + subunit_log_props, dim=1 - ).mean() + full_loglik = torch.logsumexp(subunit_logliks.T + subunit_log_props, dim=1).mean() unit = self.fit_unit(indices=in_any, features=spikes_extract) likelihoods = None elif fit_type == "avg_preexisting": - unit = self.units[unit_ids[0]].avg_with( - *[self.units[u] for u in unit_ids[1:]] - ) + unit = self.units[unit_ids[0]].avg_with(*[self.units[u] for u in unit_ids[1:]]) if debug: subunit_logliks = likelihoods[:, in_any][unit_ids] full_loglik = marginal_loglik( @@ -1365,6 +1354,7 @@ def from_features( channels_strategy_snr_min=50.0, prior_pseudocount=10, scale_mean: float = 0.1, + **annotations, ): self = cls( rank=features.features.shape[1], @@ -1377,6 +1367,7 @@ def from_features( channels_strategy=channels_strategy, channels_strategy_snr_min=channels_strategy_snr_min, scale_mean=scale_mean, + **annotations, ) self.fit(features, weights, neighborhoods=neighborhoods) self = self.to(features.features.device) @@ -1390,13 +1381,8 @@ def avg_with(self, *others): n_channels=self.n_channels, noise=self.noise, ) - new.register_buffer( - "mean", (self.mean + sum(o.mean for o in others)) / (1 + len(others)) - ) - new.register_buffer( - "channels", - torch.cat([self.channels, *[o.channels for o in others]]).unique(), - ) + new.register_buffer("mean", (self.mean + sum(o.mean for o in others)) / (1 + len(others))) + new.register_buffer("channels", torch.cat([self.channels, *[o.channels for o in others]]).unique()) assert self.cov_kind == "zero" return new @@ -1744,9 +1730,7 @@ def coo_to_scipy(coo_tensor): return coo_array((data, coords), shape=coo_tensor.shape) -def marginal_loglik( - indices, log_proportions, log_likelihoods, unit_ids=None, reduce="mean" -): +def marginal_loglik(indices, log_proportions, log_likelihoods, unit_ids=None, reduce="mean"): if unit_ids is not None: # renormalize log props log_proportions = log_proportions[unit_ids] From 7f5ba1ceccc87d39ec37200d7f3d722ba28afcb8 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 17:55:39 -0800 Subject: [PATCH 2/9] Re-do the labels and unit IDs logic... 
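Units now live in a torch.nn.ModuleDict keyed by str(unit_id) instead of a
positional ModuleList, so cleanup and relabeling no longer have to keep a list
aligned with the label values. Roughly, the access pattern is the minimal
sketch below; ToyUnitStore and the Linear stand-ins are illustrative only and
are not code from this repo.

    import torch

    class ToyUnitStore(torch.nn.Module):
        """Toy illustration of the ModuleDict-backed unit registry."""

        def __init__(self):
            super().__init__()
            # nn.ModuleDict keys must be strings, hence the str()/int() round trip
            self._units = torch.nn.ModuleDict()

        @staticmethod
        def normalize_key(ix):
            if torch.is_tensor(ix):
                ix = ix.item()
            return str(ix)

        def __setitem__(self, ix, unit):
            self._units[self.normalize_key(ix)] = unit

        def __getitem__(self, ix):
            return self._units[self.normalize_key(ix)]

        def unit_ids(self):
            return sorted(int(k) for k in self._units.keys())

    store = ToyUnitStore()
    store[3] = torch.nn.Linear(2, 2)            # stands in for a GaussianUnit
    store[torch.tensor(7)] = torch.nn.Linear(2, 2)
    print(store.unit_ids())                     # [3, 7]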
--- src/dartsort/cluster/gaussian_mixture.py | 384 ++++++++++++++++------- src/dartsort/cluster/stable_features.py | 4 +- 2 files changed, 267 insertions(+), 121 deletions(-) diff --git a/src/dartsort/cluster/gaussian_mixture.py b/src/dartsort/cluster/gaussian_mixture.py index 3fe24791..dfe348eb 100644 --- a/src/dartsort/cluster/gaussian_mixture.py +++ b/src/dartsort/cluster/gaussian_mixture.py @@ -11,7 +11,7 @@ from scipy.special import logsumexp from tqdm.auto import tqdm, trange -from ..util import more_operators, noise_util +from ..util import more_operators, noise_util, spiketorch from .cluster_util import agglomerate from .kmeans import kmeans from .modes import smoothed_dipscore_at @@ -74,6 +74,7 @@ def __init__( merge_bimodality_masked: bool = False, merge_sym_function: callable = np.minimum, em_converged_prop: float = 0.02, + em_converged_churn: float = 0.01, em_converged_atol: float = 1e-2, ): super().__init__() @@ -106,6 +107,7 @@ def __init__( self.merge_linkage = merge_linkage self.em_converged_prop = em_converged_prop self.em_converged_atol = em_converged_atol + self.em_converged_churn = em_converged_churn self.split_em_iter = split_em_iter self.split_whiten = split_whiten self.use_proportions = use_proportions @@ -116,7 +118,7 @@ def __init__( self.labels = torch.from_numpy(self.labels) # this is populated by self.m_step() - self.units = torch.nn.ModuleList() + self._units = torch.nn.ModuleDict() self.log_proportions = None # store arguments to the unit constructor in a dict @@ -150,6 +152,62 @@ def __init__( self.storage = threading.local() self.next_round_annotations = {} + # -- unit management + + def __getitem__(self, ix): + ix = self.normalize_key(ix) + if ix not in self: + raise KeyError(f"Mixture has no unit with ID {ix}") + return self._units[ix] + + def __setitem__(self, ix, value): + ix = self.normalize_key(ix) + self._units[ix] = value + + def __contains__(self, ix): + ix = self.normalize_key(ix) + return ix in self._units + + def update(self, other): + if isinstance(other, dict): + other = other.items() + for k, v in other: + self[k] = v + + def empty(self): + return not self._units + + def clear_units(self): + self._stack = None + self._units.clear() + + def unit_ids(self): + return np.array([int(k) for k in self._units.keys()]) + + def items(self): + for k, v in self._units.items(): + yield int(k), v + + def ids_and_units(self): + return self.unit_ids(), self._units.values() + + def n_units(self): + nu_u = max(self.unit_ids()) + 1 + nu_l = self.label_ids().max() + 1 + return max(nu_u, nu_l) + + def label_ids(self): + uids = torch.unique(self.labels) + return uids[uids >= 0] + + def ids(self): + return torch.arange(self.n_units()) + + def n_labels(self): + unit_ids = self.label_ids() + nu = unit_ids.max() + 1 + return nu + # -- headliners def to_sorting(self): @@ -157,10 +215,6 @@ def to_sorting(self): labels[self.data.kept_indices] = self.labels.numpy(force=True) return replace(self.data.original_sorting, labels=labels) - def unit_ids(self): - uids = torch.unique(self.labels) - return uids[uids >= 0] - def em(self, n_iter=None, show_progress=True, final_e_step=True): n_iter = self.n_em_iters if n_iter is None else n_iter if show_progress: @@ -170,62 +224,74 @@ def em(self, n_iter=None, show_progress=True, final_e_step=True): its = range(n_iter) # if we have no units, we can't E step. 
- if not self.units: + if self.empty(): self.m_step(show_progress=step_progress) + self.cleanup(min_count=1) + convergence_props = {} + log_liks = None for _ in its: # for convergence testing... - self.cleanup(min_count=1) - means, *_ = self.stack_units(mean_only=True) + log_liks, convergence_props = self.cleanup(log_liks, min_count=1, clean_props=convergence_props) + + recompute_mask = None + if 'adif' in convergence_props: + recompute_mask = convergence_props['adif'] > 0 + # recompute_mask = torch.ones(self.n_units(), dtype=bool) - reas_count, log_liks, spike_logliks = self.e_step( - show_progress=step_progress + unit_churn, reas_count, log_liks, spike_logliks = self.e_step( + show_progress=step_progress, recompute_mask=recompute_mask, prev_log_liks=log_liks ) + convergence_props['unit_churn'] = unit_churn + log_liks, convergence_props = self.cleanup(log_liks, clean_props=convergence_props) meanlogpx = spike_logliks.mean() # M step: fit units based on responsibilities - mres = self.m_step(log_liks, show_progress=step_progress, prev_means=means) + to_fit = convergence_props['unit_churn'] >= self.em_converged_churn + mres = self.m_step(log_liks, show_progress=step_progress, to_fit=to_fit) + convergence_props['adif'] = mres['adif'] # extra info for description + max_adif = mres['max_adif'] if show_progress: opct = (self.labels < 0).sum() / self.data.n_spikes opct = f"{100 * opct:.1f}" - nu = len(self.units) + nu = self.n_units().item() reas_prop = reas_count / self.data.n_spikes rpct = f"{100 * reas_prop:.1f}" - adif = f"{mres['adif']:.2f}" - msg = f"EM[{nu=},out={opct}%,reas={rpct}%,dmu={adif},meanlogpx={meanlogpx:g}]" + adif = f"{max_adif:.2f}" + msg = f"EM[{nu=},fu={to_fit.sum()},out={opct}%,reas={rpct}%,dmu={adif},meanlogpx={meanlogpx:g}]" its.set_description(msg) if reas_prop < self.em_converged_prop: break - if mres["adif"] < self.em_converged_atol: + if max_adif is not None and max_adif < self.em_converged_atol: break if not final_e_step: return # final e step for caller - reas_count, log_liks, spike_logliks = self.e_step(clean_units=True) + unit_churn, reas_count, log_liks, spike_logliks = self.e_step(show_progress=step_progress) return log_liks - def e_step(self, show_progress=False): + def e_step(self, show_progress=False, prev_log_liks=None, recompute_mask=None): # E step: get responsibilities and update hard assignments - log_liks = self.log_likelihoods(show_progress=show_progress) + log_liks = self.log_likelihoods( + show_progress=show_progress, previous_logliks=prev_log_liks, recompute_mask=recompute_mask + ) # replace log_liks by csc - reas_count, spike_logliks, log_liks = self.reassign(log_liks) - - # garbage collection -- get rid of low count labels - log_liks = self.cleanup(log_liks) + unit_churn, reas_count, spike_logliks, log_liks = self.reassign(log_liks) + return unit_churn, reas_count, log_liks, spike_logliks - return reas_count, log_liks, spike_logliks - - def m_step(self, likelihoods=None, show_progress=False, prev_means=None): + def m_step(self, likelihoods=None, show_progress=False, to_fit=None): """Beware that this flattens the labels.""" - needs_append = not self.units - unit_ids = self.unit_ids() - if not needs_append: - assert unit_ids.max() < len(self.units) + warm_start = not self.empty() + fit_ids = unit_ids = self.label_ids() + if to_fit is not None: + fit_ids = unit_ids[to_fit[unit_ids]] + if warm_start: + _, prev_means, *_ = self.stack_units(mean_only=True) if self.use_proportions and likelihoods is not None: self.update_proportions(likelihoods) @@ 
-235,34 +301,44 @@ def m_step(self, likelihoods=None, show_progress=False, prev_means=None): pool = Parallel(self.n_threads, backend="threading", return_as="generator") results = pool( delayed(self.fit_unit)( - j, likelihoods=likelihoods, **self.next_round_annotations.get(j, {}) + j, + likelihoods=likelihoods, + warm_start=warm_start, + **self.next_round_annotations.get(j, {}), ) - for j in unit_ids + for j in fit_ids ) if show_progress: results = tqdm( - results, desc="M step", unit="unit", total=len(unit_ids), **tqdm_kw + results, desc="M step", unit="unit", total=len(fit_ids), **tqdm_kw ) - if needs_append: - for j, unit in enumerate(zip(unit_ids, results)): - assert unit.annotations["unit_id"] == j - self.units.append(unit) - adif = None - if prev_means is not None: - nu = len(unit_ids) - new_means, *_ = self.stack_units(mean_only=True) - dmu = (prev_means - new_means).abs_().view(nu, -1) - adif = torch.max(dmu) + for uid, unit in zip(fit_ids, results): + if not warm_start: + self[uid] = unit + max_adif = adif = None self._stack = None + if warm_start: + ids, new_means, *_ = self.stack_units(mean_only=True) + dmu = (prev_means - new_means).abs_().view(len(new_means), -1) + adif_ = torch.max(dmu, dim=1).values + max_adif = adif_.max() + adif = torch.zeros(self.n_units()) + adif[ids] = adif_ self.clear_scheduled_annotations() - return dict(adif=adif) + return dict(max_adif=max_adif, adif=adif) def log_likelihoods( - self, unit_ids=None, with_noise_unit=True, use_storage=True, show_progress=False + self, + unit_ids=None, + with_noise_unit=True, + use_storage=True, + show_progress=False, + previous_logliks=None, + recompute_mask=None, ): """Noise unit last so that rows correspond to unit ids without 1 offset""" if unit_ids is None: - unit_ids = range(len(self.units)) + unit_ids = self.unit_ids() # how many units does each core neighborhood overlap with? n_cores = self.data.core_neighborhoods.n_neighborhoods @@ -276,11 +352,24 @@ def log_likelihoods( neighb_info = [] nnz = 0 for j in unit_ids: - unit = self.units[j] - neighbs, ns_unit = self.data.core_neighborhoods.subset_neighborhoods( - unit.channels, add_to_overlaps=core_overlaps - ) - neighb_info.append((j, neighbs, ns_unit)) + unit = self[j] + if recompute_mask is None or recompute_mask[j]: + covered_neighbs, neighbs, ns_unit = self.data.core_neighborhoods.subset_neighborhoods( + unit.channels, add_to_overlaps=core_overlaps + ) + unit.annotations['covered_neighbs'] = covered_neighbs + neighb_info.append((j, neighbs, ns_unit)) + else: + row = previous_logliks[[j]].tocoo(copy=True) + six = row.coords[1] + ns_unit = row.nnz + if 'covered_neighbs' in unit.annotations: + covered_neighbs = unit.annotations['covered_neighbs'] + else: + covered_neighbs = self.data.core_neighborhoods.neighborhood_ids[six] + covered_neighbs = torch.unique(covered_neighbs) + neighb_info.append((j, six, row.data, ns_unit)) + core_overlaps[covered_neighbs] += 1 nnz += ns_unit # how many units does each spike overlap with? needed to write csc @@ -292,6 +381,17 @@ def log_likelihoods( if with_noise_unit: nnz = nnz + self.data.n_spikes + @delayed + def job(ninfo): + if len(ninfo) == 4: + j, coo, data, ns = ninfo + return coo, data + else: + assert len(ninfo) == 3 + j, neighbs, ns = ninfo + ix, ll = self.unit_log_likelihoods(unit_id=j, neighbs=neighbs, ns=ns) + return ix, ll + # get the big nnz-length csc buffers. these can be huge so we cache them. csc_indices, csc_data = get_csc_storage(nnz, self.storage, use_storage) # csc compressed indptr. spikes are columns. 
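For reference, this function builds the spike-by-unit likelihood matrix directly
in CSC form: indptr is precomputed from the per-spike overlap counts, and each
unit's row is then written by advancing a per-spike "write head" (csc_insert).
A standalone toy version of that pattern, with made-up sizes and values rather
than anything from this file:

    import numpy as np
    from scipy.sparse import csc_array

    n_rows, n_cols = 3, 5
    rows_per_col = np.array([2, 1, 3, 0, 2])    # known before any row is computed
    indptr = np.concatenate(([0], np.cumsum(rows_per_col)))
    nnz = indptr[-1]
    indices = np.empty(nnz, dtype=np.int64)
    data = np.empty(nnz, dtype=np.float64)
    write_heads = indptr[:-1].copy()

    # per-row results as (column ids, values); rows visited in increasing order
    # so each column's indices come out sorted (canonical CSC)
    row_results = [
        (np.array([0, 2, 4]), np.array([1.0, 2.0, 3.0])),
        (np.array([0, 1, 2]), np.array([4.0, 5.0, 6.0])),
        (np.array([2, 4]), np.array([7.0, 8.0])),
    ]
    for j, (cols, vals) in enumerate(row_results):
        slots = write_heads[cols]
        indices[slots] = j
        data[slots] = vals
        write_heads[cols] += 1

    print(csc_array((data, indices, indptr), shape=(n_rows, n_cols)).toarray())

Visiting the rows in increasing unit order keeps every column sorted, which is
why the finished array can be marked canonical without an extra sort.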
@@ -301,10 +401,7 @@ def log_likelihoods( # spike, we increment the spike's "write head". idea is to directly make csc write_offsets = indptr[:-1].copy() pool = Parallel(self.n_threads, backend="threading", return_as="generator") - results = pool( - delayed(self.unit_log_likelihoods)(unit_id=j, neighbs=neighbs, ns=ns) - for j, neighbs, ns in neighb_info - ) + results = pool(job(ninfo) for ninfo in neighb_info) if show_progress: results = tqdm( results, @@ -316,8 +413,9 @@ def log_likelihoods( for j, (inds, liks) in enumerate(results): if inds is None: continue - inds = inds.numpy(force=True) - liks = liks.numpy(force=True) + if torch.is_tensor(inds): + inds = inds.numpy(force=True) + liks = liks.numpy(force=True) csc_insert(j, write_offsets, inds, csc_indices, csc_data, liks) # inds = inds.numpy(force=True) # data_ixs = write_offsets[inds] @@ -332,7 +430,7 @@ def log_likelihoods( csc_indices[data_ixs] = j + 1 csc_data[data_ixs] = liks.numpy(force=True) - shape = (len(unit_ids) + with_noise_unit, self.data.n_spikes) + shape = (self.n_units() + with_noise_unit, self.data.n_spikes) log_liks = csc_array((csc_data, csc_indices, indptr), shape=shape) log_liks.has_canonical_format = True @@ -368,36 +466,63 @@ def update_proportions(self, log_liks): ) def reassign(self, log_liks): - has_noise_unit = log_liks.shape[1] > len(self.units) + n_units = self.n_units() assignments, spike_logliks, log_liks_csc = loglik_reassign( log_liks, - has_noise_unit=has_noise_unit, + has_noise_unit=True, log_proportions=self.log_proportions, ) assignments = torch.from_numpy(assignments).to(self.labels) - reassign_count = (self.labels != assignments).sum() + + # track reassignments, first globally + same = torch.zeros_like(assignments) + torch.eq(self.labels, assignments, out=same) + reassign_count = len(same) - same.sum() + + # and per unit IoU + intersection = torch.zeros(n_units, dtype=int) + kept = assignments >= 0 + spiketorch.add_at_(intersection, assignments[kept], same[kept]) + union = torch.zeros(n_units, dtype=int) + buf1 = torch.zeros(same.shape, dtype=bool) + buf2 = torch.zeros(same.shape, dtype=bool) + buf3 = torch.zeros(same.shape, dtype=bool) + for j in range(n_units): + torch.eq(assignments, j, out=buf1) + torch.eq(self.labels, j, out=buf2) + union[j] = max(1, torch.logical_or(buf1, buf2, out=buf3).sum()) + iou = intersection / union + unit_churn = 1.0 - iou + + # update labels self.labels.copy_(assignments) - return reassign_count, spike_logliks, log_liks_csc - def cleanup(self, log_liks=None, min_count=None): + return unit_churn, reassign_count, spike_logliks, log_liks_csc + + def cleanup(self, log_liks=None, min_count=None, clean_props=None): """Remove too-small units, make label space contiguous, tidy all properties""" - unit_ids, counts = torch.unique(self.labels, return_counts=True) - counts = counts[unit_ids >= 0] - unit_ids = unit_ids[unit_ids >= 0] if min_count is None: min_count = self.min_count + + label_ids, counts = torch.unique(self.labels, return_counts=True) + counts = counts[label_ids >= 0] + label_ids = label_ids[label_ids >= 0] big_enough = counts >= min_count - keep = torch.zeros(len(self.units), dtype=bool) - keep[unit_ids] = big_enough + + keep = torch.zeros(self.n_units(), dtype=bool) + keep[label_ids] = big_enough + self._stack = None + if keep.all(): - return log_liks + return log_liks, clean_props + + if clean_props: + clean_props = {k: v[keep] for k, v in clean_props.items()} keep_noise = torch.concatenate((keep, torch.ones_like(keep[:1]))) keep = keep.numpy(force=True) - if 
log_liks is not None: - has_noise_unit = log_liks.shape[1] > len(self.units) - kept_ids = unit_ids[big_enough] + kept_ids = label_ids[big_enough] new_ids = torch.arange(kept_ids.numel()) old2new = dict(zip(kept_ids, new_ids)) self._relabel(kept_ids) @@ -409,36 +534,37 @@ def cleanup(self, log_liks=None, min_count=None): lps -= logsumexp(lps) self.log_proportions = self.log_proportions.new_tensor(lps) - if len(self.units): - keep_units = [] + if not self.empty(): + keep_units = {} for oi, ni in zip(kept_ids, new_ids): - u = self.units[oi] - u.annotations["unit_id"] = ni - keep_units.append(u) - del self.units[:] - self.units.extend(keep_units) + u = self[oi] + keep_units[ni] = u + self.clear_units() + self.update(keep_units) if self.next_round_annotations: next_round_annotations = {} for j, nra in self.next_round_annotations.items(): if keep[j]: next_round_annotations[old2new[j]] = nra + self.next_round_annotations = next_round_annotations if log_liks is None: - return + return log_liks, clean_props - keep_ll = keep_noise.numpy(force=True) if has_noise_unit else keep + keep_ll = keep_noise.numpy(force=True) assert keep_ll.size == log_liks.shape[0] if isinstance(log_liks, coo_array): log_liks = coo_sparse_mask_rows(log_liks, keep_ll) elif isinstance(log_liks, csc_array): keep_ll = np.flatnonzero(keep_ll) + assert keep_ll.max() == log_liks.shape[0] - 1 log_liks = log_liks[keep_ll] else: assert False - return log_liks + return log_liks, clean_props def merge(self, log_liks=None, show_progress=True): distances = self.distances(show_progress=show_progress) @@ -469,7 +595,7 @@ def merge(self, log_liks=None, show_progress=True): merge_parents = np.flatnonzero(new_ids == new_ids) self.schedule_annotations(new_id, dict(merge_parents=merge_parents)) - del self.units[:] + self.clear_units() if self.log_proportions is not None: log_props = self.log_proportions.numpy(force=True) @@ -483,7 +609,6 @@ def merge(self, log_liks=None, show_progress=True): self.log_proportions = torch.asarray( new_log_props, device=self.log_proportions.device ) - self._stack = None def split(self, show_progress=True): pool = Parallel( @@ -499,8 +624,7 @@ def split(self, show_progress=True): if "new_ids" in res: for nid in res["new_ids"]: self.schedule_annotations(nid, split_parent=res["parent_id"]) - del self.units[:] - self._stack = None + self.clear_units() def distances( self, kind=None, noise_normalized=None, units=None, show_progress=True @@ -512,21 +636,27 @@ def distances( noise_normalized = self.distance_noise_normalized if units is None: - units = self.units - nu = len(units) + nu = self.n_units() + ids, units = self.ids_and_units() + units = self.items() + else: + nu = len(units) + ids = range(nu) # stack unit data into one place mean_only = kind == "noise_metric" - means, covs, logdets = self.stack_units(units, mean_only=mean_only) + ids, means, covs, logdets = self.stack_units(ids=ids, units=units, mean_only=mean_only) # compute denominator of noised normalized distances if noise_normalized: - denom = self.noise_unit.divergence( + denom_ = self.noise_unit.divergence( means, other_covs=covs, other_logdets=logdets, kind=kind ) - denom = np.sqrt(denom.numpy(force=True)) + denom = np.zeros(nu) + denom[ids] = np.sqrt(denom_.numpy(force=True)) - dists = np.zeros((nu, nu), dtype=np.float32) + dists = np.full((nu, nu), np.inf, dtype=np.float32) + np.fill_diagonal(dists, 0.0) @delayed def dist_job(j, unit): @@ -536,12 +666,12 @@ def dist_job(j, unit): d = d.numpy(force=True).astype(dists.dtype) if noise_normalized: d /= 
denom * denom[j] - dists[j] = d + dists[j, ids] = d pool = Parallel( self.n_threads, backend="threading", return_as="generator_unordered" ) - results = pool(dist_job(j, u) for j, u in enumerate(units)) + results = pool(dist_job(j, u) for j, u in zip(ids, units)) if show_progress: results = tqdm(results, desc="Distances", total=nu, unit="unit", **tqdm_kw) for _ in results: @@ -569,7 +699,7 @@ def bimodalities( min_overlap = self.merge_bimodality_overlap if masked is None: masked = self.merge_bimodality_masked - nu = len(self.units) + nu = self.n_units() in_units = [ torch.nonzero(self.labels == j, as_tuple=True)[0] for j in range(nu) ] @@ -682,6 +812,7 @@ def fit_unit( weights=None, features=None, verbose=False, + warm_start=False, **unit_args, ): if features is None: @@ -695,16 +826,14 @@ def fit_unit( if verbose and weights is not None: print(f"{weights.sum()=} {weights.min()=} {weights.max()=}") unit_args = self.unit_args | unit_args - if len(self.units) > unit_id: - unit = self.units[unit_id] - assert unit.annotations.get(unit_id, unit_id) == unit_id + if warm_start: + unit = self[unit_id] unit.fit(features, weights, neighborhoods=self.data.extract_neighborhoods) else: unit = GaussianUnit.from_features( features, weights, neighborhoods=self.data.extract_neighborhoods, - unit_id=unit_id, **unit_args, ) return unit @@ -732,7 +861,7 @@ def unit_log_likelihoods( log_likelihoods """ if unit is None: - unit = self.units[unit_id] + unit = self[unit_id] if ignore_channels: core_channels = torch.arange(self.data.n_channels) else: @@ -748,9 +877,10 @@ def unit_log_likelihoods( core_channels, spike_indices ) else: - neighbs, ns = self.data.core_neighborhoods.subset_neighborhoods( + covered_neighbs, neighbs, ns = self.data.core_neighborhoods.subset_neighborhoods( core_channels ) + unit.annotations['covered_neighbs'] = covered_neighbs if not ns: return None, None @@ -811,8 +941,8 @@ def noise_log_likelihoods(self, show_progress=False): def kmeans_split_unit(self, unit_id, debug=False): # get spike data and use interpolation to fill it out to the # unit's channel set - result = dict(parent_id=parent_id) - unit = self.units[unit_id] + result = dict(parent_id=unit_id, new_ids=[unit_id]) + unit = self[unit_id] if not unit.channels.numel(): return result @@ -860,7 +990,8 @@ def kmeans_split_unit(self, unit_id, debug=False): if not valid.any(): return result split_ids = split_ids[valid] - assert np.array_equal(split_ids, np.arange(len(split_ids))) + if not np.array_equal(split_ids, np.arange(len(split_ids))): + raise ValueError(f"Bad {split_ids=}") if debug: result["merge_labels"] = split_labels @@ -873,7 +1004,7 @@ def kmeans_split_unit(self, unit_id, debug=False): with self.labels_lock: self.labels[indices_full] = -1 self.labels[sp.indices[split_labels >= 0]] = unit_id - return + return result # else, tack new units onto the end # we need to lock up the labels array access here because labels.max() @@ -1109,14 +1240,16 @@ def unit_group_criterion( for i, k in enumerate(unit_ids): u = self.fit_unit(unit_id=k, indices=in_any, likelihoods=likelihoods, features=spikes_extract) units.append(u) - _, subunit_logliks[i] = self.unit_log_likelihoods(unit=u, spikes=spikes_core) + _, sll = self.unit_log_likelihoods(unit=u, spikes=spikes_core) + if sll is not None: + subunit_logliks[i] = sll subunit_log_props = F.softmax(subunit_logliks, dim=0).mean(1).log_() # loglik per spik full_loglik = torch.logsumexp(subunit_logliks.T + subunit_log_props, dim=1).mean() unit = self.fit_unit(indices=in_any, 
features=spikes_extract) likelihoods = None elif fit_type == "avg_preexisting": - unit = self.units[unit_ids[0]].avg_with(*[self.units[u] for u in unit_ids[1:]]) + unit = self[unit_ids[0]].avg_with(*[self[u] for u in unit_ids[1:]]) if debug: subunit_logliks = likelihoods[:, in_any][unit_ids] full_loglik = marginal_loglik( @@ -1139,7 +1272,7 @@ def unit_group_criterion( unique_nids, inverse = torch.unique(nids, return_inverse=True) unique_chans = self.data.core_neighborhoods.neighborhoods[unique_nids] unique_k_merged = unit.n_params(unique_chans) - unique_k_full = [self.units[u].n_params(unique_chans) for u in unit_ids] + unique_k_full = [self[u].n_params(unique_chans) for u in unit_ids] unique_k_full = torch.stack(unique_k_full, dim=1) k_merged = unique_k_merged[inverse] k_full = unique_k_full[inverse] @@ -1208,6 +1341,14 @@ def get_fit_weights(self, unit_id, indices, likelihoods=None): # -- gizmos + @staticmethod + def normalize_key(ix): + if torch.is_tensor(ix): + ix = ix.numpy(force=True).item() + elif isinstance(ix, np.ndarray): + ix = ix.item() + return str(ix) + @property def rg(self): # thread-local rgs since they aren't safe @@ -1219,7 +1360,7 @@ def rg(self): def _relabel(self, old_labels, new_labels=None, flat=False): """Re-label units - !! This could invalidate self.units and props. + !! This could invalidate self._units and props. Suggested to only call .cleanup(), this is its low-level helper. @@ -1250,20 +1391,25 @@ def _relabel(self, old_labels, new_labels=None, flat=False): self.labels[torch.logical_not(kept)] = -1 self._stack = None - def stack_units(self, units=None, mean_only=True, use_cache=False): - if units is None: - units = self.units + def stack_units(self, units=None, ids=None, mean_only=True, use_cache=False): + if ids is not None: + assert units is not None + elif units is not None: + ids = np.arange(len(units)) + else: + ids, units = self.ids_and_units() + if use_cache and self._stack is not None: if mean_only or self._stack[1] is not None: return self._stack - nu, rank, nc = len(units), self.data.rank, self.data.n_channels + rank, nc = self.data.rank, self.data.n_channels - means = torch.zeros((nu, rank, nc), device=self.data.device) + means = torch.full((len(ids), rank, nc), torch.nan, device=self.data.device) covs = logdets = None if not mean_only: - covs = means.new_zeros((nu, rank * nc, rank * nc)) - logdets = means.new_zeros((nu,)) + covs = means.new_full((len(units), rank * nc, rank * nc), torch.nan) + logdets = means.new_full((len(units),), torch.nan) for j, unit in enumerate(units): means[j] = unit.mean @@ -1272,9 +1418,11 @@ def stack_units(self, units=None, mean_only=True, use_cache=False): logdets[j] = unit.logdet if use_cache: - self._stack = means, covs, logdets + self._stack = ids, means, covs, logdets + else: + self._stack = None - return means, covs, logdets + return ids, means, covs, logdets def schedule_annotations(self, unit_id, **annotations): if unit_id not in self.next_round_annotations: diff --git a/src/dartsort/cluster/stable_features.py b/src/dartsort/cluster/stable_features.py index 9bbf7a4a..7d6906a7 100644 --- a/src/dartsort/cluster/stable_features.py +++ b/src/dartsort/cluster/stable_features.py @@ -438,9 +438,7 @@ def subset_neighborhoods(self, channels, min_coverage=1.0, add_to_overlaps=None) for j in covered_ids } n_spikes = self.popcounts[covered_ids].sum() - if add_to_overlaps is not None: - add_to_overlaps[covered_ids] += 1 - return neighborhood_info, n_spikes + return covered_ids, neighborhood_info, n_spikes def 
spike_neighborhoods(self, channels, spike_indices, min_coverage=1.0): """Like subset_neighborhoods, but for an already chosen collection of spikes From 63d5c93c194d7938d326419eb75ed696ddb523d5 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 17:55:50 -0800 Subject: [PATCH 3/9] Nullable... --- src/dartsort/templates/pairwise_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dartsort/templates/pairwise_util.py b/src/dartsort/templates/pairwise_util.py index 7d98e3ce..405f2d41 100644 --- a/src/dartsort/templates/pairwise_util.py +++ b/src/dartsort/templates/pairwise_util.py @@ -386,7 +386,7 @@ def conv_to_resid( svs_a = low_rank_templates_a.singular_values[template_indices_a] svs_b = low_rank_templates_b.singular_values[template_indices_b] active_a = torch.any(spatial_a > 0, dim=1).to(svs_a) - if ignore_empty_channels: + if ignore_empty_channels and low_rank_templates_b.spike_counts_by_channel is not None: active_b = low_rank_templates_b.spike_counts_by_channel[template_indices_b] active_b = active_b > 0 active_b = torch.from_numpy(active_b).to(svs_a) From 16f5c0eac0de754a6aeb274aee6a99d43443764a Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 17:56:06 -0800 Subject: [PATCH 4/9] Greedy spike to spike matching --- src/dartsort/util/hybrid_util.py | 101 +++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/src/dartsort/util/hybrid_util.py b/src/dartsort/util/hybrid_util.py index e0fe0f29..e7e02e30 100644 --- a/src/dartsort/util/hybrid_util.py +++ b/src/dartsort/util/hybrid_util.py @@ -1,19 +1,16 @@ +import dataclasses import numpy as np +from tqdm.auto import tqdm import warnings -from spikeinterface.core import BaseRecording, BaseRecordingSegment, Templates -from spikeinterface.core.core_tools import define_function_from_class from spikeinterface.extractors import NumpySorting from spikeinterface.generation.drift_tools import InjectDriftingTemplatesRecording, DriftingTemplates, move_dense_templates -from spikeinterface.preprocessing.basepreprocessor import ( - BasePreprocessor, BasePreprocessorSegment) from probeinterface import Probe +from scipy.spatial import KDTree +from scipy.sparse import csgraph, coo_array from ..templates import TemplateData -from .analysis import DARTsortAnalysis from .data_util import DARTsortSorting from ..config import unshifted_raw_template_config -from ..templates import TemplateData - def get_drifty_hybrid_recording( @@ -82,7 +79,7 @@ def get_drifty_hybrid_recording( displacement_vectors=[disp], displacement_sampling_frequency=displacement_sampling_frequency, displacement_unit_factor=displacement_unit_factor, - amplitude_factor=amplitude_factor + amplitude_factor=amplitude_factor, ) rec.annotate(peak_channel=peak_channels.tolist()) return rec @@ -205,6 +202,7 @@ def refractory_poisson_spike_train( return spike_samples + def piecewise_refractory_poisson_spike_train(rates, bins, binsize_samples, **kwargs): """ Returns a spike train with variable firing rate using refractory_poisson_spike_train(). 
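The rewritten body below assembles the train bin by bin: skip (near-)silent
bins, draw one refractory Poisson train per remaining bin at that bin's rate,
offset it by the bin's start sample, and concatenate. A standalone toy version
of the same assembly, using simple_poisson_train as a stand-in generator that
is not part of this codebase:

    import numpy as np

    def simple_poisson_train(rate_hz, n_samples, fs=30_000, rg=np.random.default_rng(0)):
        # stand-in for refractory_poisson_spike_train: n ~ Poisson(rate * duration)
        n = rg.poisson(rate_hz * n_samples / fs)
        return np.sort(rg.integers(0, n_samples, size=n))

    fs = 30_000
    binsize_samples = fs                        # 1 s bins
    rates = [0.0, 5.0, 20.0]                    # Hz, one entry per bin
    bins = np.arange(len(rates)) * binsize_samples    # bin start samples

    st = []
    for rate, bin_start in zip(rates, bins):
        if rate < 0.1:                          # skip effectively silent bins
            continue
        st.append(bin_start + simple_poisson_train(rate, binsize_samples, fs=fs))
    spike_samples = np.concatenate(st) if st else np.array([], dtype=int)
    print(spike_samples.size, spike_samples[:5])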
@@ -214,13 +212,14 @@ def piecewise_refractory_poisson_spike_train(rates, bins, binsize_samples, **kwa :param binsize_samples: number of samples per bin :param **kwargs: kwargs to feed to refractory_poisson_spike_train() """ - sp_tr = np.concatenate( - [ - refractory_poisson_spike_train(r, binsize_samples, **kwargs) + bins[i] if r > 0.1 else [] - for i, r in enumerate(rates) - ] - ) - return sp_tr + st = [] + for rate, bin in zip(rates, bins): + if rate < 0.1: + continue + binst = refractory_poisson_spike_train(rate, binsize_samples, **kwargs) + st.append(bin + binst) + st = np.concatenate(st) + return st def precompute_displaced_registered_templates( @@ -255,6 +254,66 @@ def precompute_displaced_registered_templates( return ret +def closest_clustering(gt_st, peel_st, geom=None, match_dt_ms=0.1, match_radius_um=0.0, p=2.0, M=50.0): + frames_per_ms = gt_st.sampling_frequency / 1000 + delta_frames = match_dt_ms * frames_per_ms + rescale = [delta_frames] + gt_pos = gt_st.times_samples[:, None] + peel_pos = peel_st.times_samples[:, None] + if match_radius_um: + rescale = rescale + (geom.shape[1] * [match_radius_um]) + gt_pos = np.c_[gt_pos, geom[gt_st.channels]] + peel_pos = np.c_[peel_pos, geom[peel_st.channels]] + else: + gt_pos = gt_pos.astype(float) + peel_pos = peel_pos.astype(float) + gt_pos /= rescale + peel_pos /= rescale + labels = greedy_match(gt_pos, peel_pos, dx=1.0 / frames_per_ms) + labels[labels >= 0] = gt_st.labels[labels[labels >= 0]] + + return dataclasses.replace(peel_st, labels=labels) + + +def greedy_match(gt_coords, test_coords, max_val=1.0, dx=1./30, workers=-1, p=2.0): + assignments = np.full(len(test_coords), -1) + gt_unmatched = np.ones(len(gt_coords), dtype=bool) + + for j, thresh in enumerate( + tqdm(np.arange(0.0, max_val + dx + 2e-5, dx), desc="match") + ): + test_unmatched = np.flatnonzero(assignments < 0) + if not test_unmatched.size: + break + test_kdtree = KDTree(test_coords[test_unmatched]) + gt_ix = np.flatnonzero(gt_unmatched) + d, i = test_kdtree.query( + gt_coords[gt_ix], + k=1, + distance_upper_bound=min(thresh, max_val), + workers=workers, + p=p, + ) + # handle multiple gt spikes getting matched to the same peel ix + thresh_matched = i < test_kdtree.n + _, ii = np.unique(i, return_index=True) + i = i[ii] + thresh_matched = thresh_matched[ii] + + gt_ix = gt_ix[ii] + gt_ix = gt_ix[thresh_matched] + i = i[thresh_matched] + assignments[test_unmatched[i]] = gt_ix + gt_unmatched[gt_ix] = False + + if not gt_unmatched.any(): + break + if thresh > max_val: + break + + return assignments + + def sorting_from_times_labels(times, labels, recording=None, sampling_frequency=None, determine_channels=True, template_config=unshifted_raw_template_config, n_jobs=0): channels = np.zeros_like(labels) if sampling_frequency is None: @@ -273,6 +332,14 @@ def sorting_from_times_labels(times, labels, recording=None, sampling_frequency= return sorting, td -def sorting_from_spikeinterface(sorting, recording=None, determine_channels=True, template_config=unshifted_raw_template_config, n_jobs=0): +def sorting_from_spikeinterface( + sorting, + recording=None, + determine_channels=True, + template_config=unshifted_raw_template_config, + n_jobs=0, +): sv = sorting.to_spike_vector() - return sorting_from_times_labels(sv['sample_index'], sv['unit_index'], sampling_frequency=sorting.sampling_frequency, recording=recording, determine_channels=determine_channels, template_config=template_config, n_jobs=n_jobs) + return sorting_from_times_labels( + sv['sample_index'], sv['unit_index'], 
sampling_frequency=sorting.sampling_frequency, recording=recording, determine_channels=determine_channels, template_config=template_config, n_jobs=n_jobs + ) From 7e3c6663410f4dd8808e8b5a1943aeb2a0ed1518 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 17:56:22 -0800 Subject: [PATCH 5/9] Sometimes things aren't. --- src/dartsort/util/spiketorch.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/dartsort/util/spiketorch.py b/src/dartsort/util/spiketorch.py index 45011a3d..9b3b0c46 100644 --- a/src/dartsort/util/spiketorch.py +++ b/src/dartsort/util/spiketorch.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from scipy.fftpack import next_fast_len from torch.fft import irfft, rfft +import warnings def fast_nanmedian(x, axis=-1): @@ -315,9 +316,12 @@ def nancov(x, weights=None, correction=1, nan_free=False, return_nobs=False, for cov = xtx / denom if force_posdef: - vals, vecs = torch.linalg.eigh(cov) - good = vals > 0 - cov = (vecs[:, good] * vals[good]) @ vecs[:, good].T + try: + vals, vecs = torch.linalg.eigh(cov) + good = vals > 0 + cov = (vecs[:, good] * vals[good]) @ vecs[:, good].T + except Exception as e: + warnings.warn(f"Error in nancov eigh: {e}") if return_nobs: return cov, nobs From ec629e632287b6c13321bed5f29f648f20e8fbf3 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 17:56:37 -0800 Subject: [PATCH 6/9] Transition vis to new label logic --- src/dartsort/vis/gmm.py | 142 +++++++++++++++++++++----------- src/dartsort/vis/gmm_helpers.py | 7 +- 2 files changed, 96 insertions(+), 53 deletions(-) diff --git a/src/dartsort/vis/gmm.py b/src/dartsort/vis/gmm.py index 9a5c39e5..262f9f07 100644 --- a/src/dartsort/vis/gmm.py +++ b/src/dartsort/vis/gmm.py @@ -1,4 +1,5 @@ from pathlib import Path +import warnings import matplotlib.pyplot as plt import numpy as np @@ -33,7 +34,7 @@ def draw(self, panel, gmm, unit_id): class ISIHistogram(GMMPlot): kind = "small" width = 2 - height = 2 + height = 1.5 def __init__(self, bin_ms=0.1, max_ms=5): self.bin_ms = bin_ms @@ -68,13 +69,13 @@ def draw(self, panel, gmm, unit_id): s = ax.scatter(*xy[unique_ixs].T, c=counts, lw=0, cmap=self.cmap) plt.colorbar(s, ax=ax, shrink=0.3, label="chan count") ax.scatter( - *xy[gmm.units[unit_id].channels.numpy(force=True)].T, + *xy[gmm[unit_id].channels.numpy(force=True)].T, color="r", lw=1, fc="none", ) ax.scatter( - *xy[np.atleast_1d(gmm.units[unit_id].snr.argmax().numpy(force=True))].T, + *xy[np.atleast_1d(gmm[unit_id].snr.argmax().numpy(force=True))].T, color="g", lw=0, ) @@ -92,6 +93,18 @@ def draw(self, panel, gmm, unit_id): nspikes = (gmm.labels == unit_id).sum() msg += f"n spikes: {nspikes}\n" + if gmm[unit_id].annotations: + msg += 'annots:\n' + for k, v in gmm[unit_id].annotations.items(): + if torch.is_tensor(k): + k = k.numpy(force=True) + if k.size == 1: + k = k.item() + if torch.is_tensor(v): + v = v.numpy(force=True) + if v.size == 1: + v = v.item() + msg += f"{k}: {v}" axis.text(0, 0, msg, fontsize=6.5) @@ -103,7 +116,7 @@ class MStep(GMMPlot): def __init__(self, n_waveforms_show=64, with_covs=True): self.with_covs = with_covs - self.height = 5 + 4 * with_covs + self.height = 4 + 4 * with_covs self.n_waveforms_show = n_waveforms_show def draw(self, panel, gmm, unit_id, axes=None): @@ -147,7 +160,7 @@ def draw(self, panel, gmm, unit_id, axes=None): n, r, c = feats.shape emp_mean = torch.nanmean(feats, dim=0) emp_mean = gmm.data.tpca.force_reconstruct(emp_mean.nan_to_num_()) - model_mean = gmm.units[unit_id].mean[:, chans] + 
model_mean = gmm[unit_id].mean[:, chans] model_mean = gmm.data.tpca.force_reconstruct(model_mean) geomplot( @@ -165,24 +178,23 @@ def draw(self, panel, gmm, unit_id, axes=None): return # covariance vis - feats = features_full[:, :, gmm.units[unit_id].channels] - model_mean = gmm.units[unit_id].mean[:, gmm.units[unit_id].channels] + feats = features_full[:, :, gmm[unit_id].channels] + model_mean = gmm[unit_id].mean[:, gmm[unit_id].channels] feats = feats - model_mean n, r, c = feats.shape emp_cov, nobs = spiketorch.nancov(feats.view(n, r * c), return_nobs=True) - denom = nobs + gmm.units[unit_id].prior_pseudocount + denom = nobs + gmm[unit_id].prior_pseudocount emp_cov = (nobs / denom) * emp_cov noise_cov = gmm.noise.marginal_covariance( - channels=gmm.units[unit_id].channels + channels=gmm[unit_id].channels ).to_dense() m = model_mean.reshape(-1) mmt = m[:, None] @ m[None, :] modelcov = ( - gmm.units[unit_id] - .marginal_covariance(channels=gmm.units[unit_id].channels) + gmm[unit_id] + .marginal_covariance(channels=gmm[unit_id].channels) .to_dense() ) - residual = emp_cov - modelcov covs = (emp_cov, noise_cov, mmt.abs(), mmt, modelcov, emp_cov - modelcov) # vmax = max(c.abs().max() for c in covs) names = ("regemp", "noise", "|temptempT|", "temptempT", "model", "resid") @@ -244,7 +256,11 @@ def draw(self, panel, gmm, unit_id): emp_eigs = torch.linalg.eigvalsh(emp_cov) noise_eigs = torch.linalg.eigvalsh(noise_cov) residual_eigs, residual_vecs = torch.linalg.eigh(residual) - model_residual_eigs = torch.linalg.eigvalsh(model_residual) + try: + model_residual_eigs = torch.linalg.eigvalsh(model_residual) + except Exception as e: + warnings.warn(f"Model residual. {e}") + model_residual_eigs = torch.zeros(model_residual.shape[0]) rank1 = (residual_vecs[:, -1:] * residual_eigs[-1:]) @ residual_vecs[:, -1:].T rank1_model = noise_cov + rank1 @@ -256,14 +272,14 @@ def draw(self, panel, gmm, unit_id): # ax_eig, ax_r2 = bot.subplots(ncols=2) ax_eig = bot.subplots() - vm = emp_cov.abs().max() - imk = dict(vmin=-vm, vmax=vm, cmap=plt.cm.seismic, interpolation="none") + # vm = 0.9 * emp_cov.abs().max() + imk = dict(cmap=plt.cm.seismic, interpolation="none") covs = dict( emp=emp_cov, noise=noise_cov, noise_resid=residual, - mmT_scaled=scale * mmT, + mmT=scale * mmT, mmT_model=model, mmT_resid=model_residual, rank1=rank1, @@ -279,19 +295,21 @@ def draw(self, panel, gmm, unit_id): rank1_resid=rank1_residual_eigs, ) for (name, cov), ax, color in zip(covs.items(), axes.flat, colors): - if name == "mmT_scaled": - vm = cov.abs().max() * 0.9 - mimk = dict( - vmin=-vm, vmax=vm, cmap=plt.cm.seismic, interpolation="none" - ) - else: - mimk = imk + vm = cov.abs().max() * 0.9 + mimk = imk | dict(vmax=vm, vmin=-vm) + # if name == "mmT": + # vm = cov.abs().max() * 0.9 + # mimk = dict( + # vmin=-vm, vmax=vm, cmap=plt.cm.seismic, interpolation="none" + # ) + # else: + # mimk = imk im = ax.imshow(cov, **mimk) cb = plt.colorbar(im, ax=ax, shrink=0.2) cb.outline.set_visible(False) title = name - if name.startswith("mmT_sc"): - title = title + f" (scale={scale:0.2f})" + if name == "mmT": + title = title + f" (scale={scale:.2f})" ax.set_title(title, color=color) if name in eigs: ax_eig.plot(eigs[name].flip(0), color=color, lw=1) @@ -459,7 +477,7 @@ def draw(self, panel, gmm, unit_id, split_info=None): ax.set_xticks([]) ax.axhline(0, color="k", lw=0.8) sns.despine(ax=ax, left=False, right=True, bottom=True, top=True) - mainchan = gmm.units[unit_id].snr.argmax() + mainchan = gmm[unit_id].snr.argmax() for subid, subunit in 
zip(split_ids, split_info["units"]): subm = subunit.mean[:, mainchan] subm = gmm.data.tpca._inverse_transform_in_probe(subm[None])[0] @@ -518,11 +536,11 @@ def __init__(self, n_neighbors=5): def draw(self, panel, gmm, unit_id): neighbors = gmm_helpers.get_neighbors(gmm, unit_id) - units = [gmm.units[u] for u in reversed(neighbors)] - labels = neighbors.numpy(force=True)[::-1] + units = [gmm[u] for u in reversed(neighbors)] + labels = neighbors[::-1] # means on core channels - chans = gmm.units[unit_id].snr.argmax() + chans = gmm[unit_id].snr.argmax() chans = torch.cdist(gmm.data.prgeom[chans[None]], gmm.data.prgeom) chans = chans.view(-1) (chans,) = torch.nonzero(chans <= gmm.data.core_radius, as_tuple=True) @@ -544,12 +562,12 @@ def __init__(self, n_neighbors=5, dist_vmax=1.0): def draw(self, panel, gmm, unit_id): neighbors = gmm_helpers.get_neighbors(gmm, unit_id) distances = gmm.distances( - units=[gmm.units[u] for u in neighbors], show_progress=False + units=[gmm[u] for u in neighbors], show_progress=False ) ax = analysis_plots.distance_matrix_dendro( panel, distances, - unit_ids=neighbors.numpy(force=True), + unit_ids=neighbors, dendrogram_linkage=None, show_unit_labels=True, vmax=self.dist_vmax, @@ -561,7 +579,7 @@ def draw(self, panel, gmm, unit_id): class NeighborBimodalities(GMMPlot): - kind = "merge" + kind = "bim" width = 4 height = 8 @@ -583,7 +601,7 @@ def draw(self, panel, gmm, unit_id): ) kept = labels >= 0 labels_ = np.full_like(labels, -1) - labels_[kept] = neighbors[labels[kept]].numpy(force=True) + labels_[kept] = neighbors[labels[kept]] labels = labels_ others = neighbors[1:] @@ -606,7 +624,9 @@ def draw(self, panel, gmm, unit_id): bimod_ax.set_title("bimodality computation", fontsize="small") if "in_pair_kept" not in bimod_info: - scatter_ax.text(0, 0, f"too few spikes") + scatter_ax.text( + 0.5, 0.5, f"too few spikes", transform=scatter_ax.transAxes, ha='center', va='center' + ) else: c = np.atleast_2d(glasbey1024[labels[bimod_info["in_pair_kept"]]]) scatter_ax.scatter(bimod_info["xi"], bimod_info["xj"], s=3, lw=0, c=c) @@ -615,7 +635,7 @@ def draw(self, panel, gmm, unit_id): if "samples" not in bimod_info: bimod_ax.text( - 0, 0, f"too-small kept prop {bimod_info['keep_prop']:.2f}" + 0.5, 0.5, f"too-small\nkept prop {bimod_info['keep_prop']:.2f}", transform=bimod_ax.transAxes, ha='center', va='center' ) bimod_ax.axis("off") continue @@ -655,30 +675,51 @@ def draw(self, panel, gmm, unit_id): class NeighborInfoCriteria(GMMPlot): - kind = "merge" - width = 3 + kind = "bim" + width = 4 height = 8 - def __init__(self, n_neighbors=5): + def __init__(self, n_neighbors=5, fit_by_avg=False): self.n_neighbors = n_neighbors + self.fit_by_avg = fit_by_avg def draw(self, panel, gmm, unit_id): neighbors = gmm_helpers.get_neighbors(gmm, unit_id) assert neighbors[0] == unit_id others = neighbors[1:] axes = panel.subplots(nrows=len(others), ncols=1) - histkw = dict(density=True, histtype="step", bins=128) - cstr = "{aic_full=:0.2f} {aic_merged=:0.2f} {bic_full=:0.2f} {bic_merged=:0.2f}" + histkw = dict(density=True, histtype="step", bins=128, log=True) + astr = "AICfull/merged: {aic_full:0.1f} / {aic_merged:0.1f}" + bstr = "BICfull/merged: {bic_full:0.1f} / {bic_merged:0.1f}" + lstr = "LLfull/merged: {full_loglik:0.1f} / {unit_loglik:0.1f}" + cstr = f"{astr}\n{bstr}\n{lstr}\n" for ax, other_id in zip(axes, others): uids = [unit_id, other_id] - sns.despine(ax=ax, left=True, right=True, top=True) res = gmm.unit_group_criterion(uids, gmm.log_liks, debug=True) - sll = 
res["subunit_logliks"].tocsr() + sll = res["subunit_logliks"] + if not torch.is_tensor(sll): + sll = sll.tocsr() for row, uid in enumerate(uids): - ax.hist(sll[[row]].data, color=glasbey1024[uid], **histkw) - s = f"other={other_id} {cstr.format(res)}" - ax.set_title(s, fontsize="small") + if not torch.is_tensor(sll): + rowsll = sll[[row]].data + else: + rowsll = sll[row] + rowsll = rowsll[torch.isfinite(rowsll)] + if rowsll.numel(): + ax.hist(rowsll, color=glasbey1024[uid], **histkw) + ull = res["unit_logliks"] + ax.hist(ull[torch.isfinite(ull)], color="k", **histkw) + s = f"other={other_id}\n" + cstr.format_map(res) + aic_merge = res['aic_merged'] < res['aic_full'] + bic_merge = res['bic_merged'] < res['bic_full'] + ll_merge = res['unit_loglik'] > res['full_loglik'] + s += f"aic: " + ("merge!" if aic_merge else "nope.") + "\n" + s += f"bic: " + ("merge!" if bic_merge else "nope.") + "\n" + s += f"ll: " + ("merge!" if ll_merge else "nope.") + ax.text(0.05, 0.95, s, transform=ax.transAxes) ax.set_xlabel("log lik") + sns.despine(ax=ax, left=True, right=True, top=True) + # ax.set_yticks([]) # -- main api @@ -686,14 +727,17 @@ def draw(self, panel, gmm, unit_id): default_gmm_plots = ( TextInfo(), ISIHistogram(), + ISIHistogram(bin_ms=1, max_ms=50), ChansHeatmap(), - MStep(), + MStep(with_covs=False), + CovarianceResidual(), Likelihoods(), Amplitudes(), KMeansSplit(), NeighborMeans(), NeighborDistances(), NeighborBimodalities(), + NeighborInfoCriteria(), ) @@ -702,7 +746,7 @@ def make_unit_gmm_summary( unit_id, plots=default_gmm_plots, max_height=9, - figsize=(14, 11), + figsize=(15, 11), hspace=0.1, figure=None, **other_global_params, @@ -731,7 +775,7 @@ def make_all_gmm_summaries( save_folder, plots=default_gmm_plots, max_height=9, - figsize=(14, 11), + figsize=(15, 11), hspace=0.1, dpi=200, image_ext="png", @@ -746,7 +790,7 @@ def make_all_gmm_summaries( ): save_folder = Path(save_folder) if unit_ids is None: - unit_ids = gmm.unit_ids().numpy(force=True) + unit_ids = gmm.unit_ids() if n_units is not None and n_units < len(unit_ids): rg = np.random.default_rng(seed) unit_ids = rg.choice(unit_ids, size=n_units, replace=False) diff --git a/src/dartsort/vis/gmm_helpers.py b/src/dartsort/vis/gmm_helpers.py index adbe25c0..8f668ec2 100644 --- a/src/dartsort/vis/gmm_helpers.py +++ b/src/dartsort/vis/gmm_helpers.py @@ -7,10 +7,10 @@ def get_neighbors(gmm, unit_id, n_neighbors=5): - means, covs, logdets = gmm.stack_units(use_cache=True) - dists = gmm.units[unit_id].divergence(means, covs, logdets, kind=gmm.distance_metric) + ids, means, covs, logdets = gmm.stack_units(use_cache=True) + dists = gmm[unit_id].divergence(means, covs, logdets, kind=gmm.distance_metric) dists = dists.view(-1) - order = torch.argsort(dists) + order = ids[torch.argsort(dists)] assert order[0] == unit_id return order[:n_neighbors + 1] @@ -60,7 +60,6 @@ def plot_means(panel, prgeom, tpca, chans, units, labels, title="nearest neighbo means.append(tpca.force_reconstruct(mean).numpy(force=True)) colors = glasbey1024[labels] - print(f"{len(means)=} {labels.shape=} {colors.shape=}") geomplot( np.stack(means, axis=0), channels=chans[None] From 049cb1d5b1ce6328460756b052fdd3c429dca4bd Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Fri, 15 Nov 2024 18:01:05 -0800 Subject: [PATCH 7/9] Debug merge --- src/dartsort/cluster/gaussian_mixture.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/dartsort/cluster/gaussian_mixture.py b/src/dartsort/cluster/gaussian_mixture.py index dfe348eb..07c206d7 
100644 --- a/src/dartsort/cluster/gaussian_mixture.py +++ b/src/dartsort/cluster/gaussian_mixture.py @@ -638,14 +638,13 @@ def distances( if units is None: nu = self.n_units() ids, units = self.ids_and_units() - units = self.items() else: nu = len(units) ids = range(nu) # stack unit data into one place mean_only = kind == "noise_metric" - ids, means, covs, logdets = self.stack_units(ids=ids, units=units, mean_only=mean_only) + ids, means, covs, logdets = self.stack_units(nu=nu, ids=ids, units=units, mean_only=mean_only) # compute denominator of noised normalized distances if noise_normalized: @@ -1391,13 +1390,15 @@ def _relabel(self, old_labels, new_labels=None, flat=False): self.labels[torch.logical_not(kept)] = -1 self._stack = None - def stack_units(self, units=None, ids=None, mean_only=True, use_cache=False): + def stack_units(self, nu=None, units=None, ids=None, mean_only=True, use_cache=False): if ids is not None: assert units is not None elif units is not None: ids = np.arange(len(units)) else: ids, units = self.ids_and_units() + if nu is None: + nu = len(ids) if use_cache and self._stack is not None: if mean_only or self._stack[1] is not None: @@ -1405,11 +1406,11 @@ def stack_units(self, units=None, ids=None, mean_only=True, use_cache=False): rank, nc = self.data.rank, self.data.n_channels - means = torch.full((len(ids), rank, nc), torch.nan, device=self.data.device) + means = torch.full((nu, rank, nc), torch.nan, device=self.data.device) covs = logdets = None if not mean_only: - covs = means.new_full((len(units), rank * nc, rank * nc), torch.nan) - logdets = means.new_full((len(units),), torch.nan) + covs = means.new_full((nu, rank * nc, rank * nc), torch.nan) + logdets = means.new_full((nu,), torch.nan) for j, unit in enumerate(units): means[j] = unit.mean From 0944c79eae7f7fc439e5523e557716fe2be88271 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Thu, 21 Nov 2024 11:49:20 -0800 Subject: [PATCH 8/9] Debug annotations; refit_all/avg in AIC; handle edge cases in AIC --- src/dartsort/cluster/gaussian_mixture.py | 85 ++++++++++++++++-------- 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/src/dartsort/cluster/gaussian_mixture.py b/src/dartsort/cluster/gaussian_mixture.py index 07c206d7..3e1cb36b 100644 --- a/src/dartsort/cluster/gaussian_mixture.py +++ b/src/dartsort/cluster/gaussian_mixture.py @@ -260,7 +260,10 @@ def em(self, n_iter=None, show_progress=True, final_e_step=True): reas_prop = reas_count / self.data.n_spikes rpct = f"{100 * reas_prop:.1f}" adif = f"{max_adif:.2f}" - msg = f"EM[{nu=},fu={to_fit.sum()},out={opct}%,reas={rpct}%,dmu={adif},meanlogpx={meanlogpx:g}]" + msg = ( + f"EM[K={nu},Ka={to_fit.sum()};{opct}%fp," + f"{rpct}%reas,dmu={adif};logpx/n={meanlogpx:.1f}]" + ) its.set_description(msg) if reas_prop < self.em_converged_prop: @@ -593,7 +596,7 @@ def merge(self, log_liks=None, show_progress=True): for new_id in np.unique(new_ids): merge_parents = np.flatnonzero(new_ids == new_ids) - self.schedule_annotations(new_id, dict(merge_parents=merge_parents)) + self.schedule_annotations(new_id, merge_parents=merge_parents) self.clear_units() if self.log_proportions is not None: @@ -1219,7 +1222,7 @@ def unit_group_criterion( debug=False, ): """See if a single unit explains a group as far as AIC/BIC/MDL go.""" - assert fit_type in ("avg_preexisting", "refit_all") + assert fit_type in ("avg_preexisting", "refit_all", "refit_avg") unit_ids = torch.tensor(unit_ids) # pick spikes for likelihood computation @@ -1232,12 +1235,14 @@ def 
unit_group_criterion( spikes_core = self.data.spike_data(in_any, neighborhood="core") n = in_any.numel() - if fit_type == "refit_all": + if fit_type.startswith("refit"): units = [] subunit_logliks = spikes_core.features.new_full((len(unit_ids), len(in_any)), -torch.inf) full_loglik = 0.0 for i, k in enumerate(unit_ids): - u = self.fit_unit(unit_id=k, indices=in_any, likelihoods=likelihoods, features=spikes_extract) + u = self.fit_unit( + unit_id=k, indices=in_any, likelihoods=likelihoods, features=spikes_extract + ) units.append(u) _, sll = self.unit_log_likelihoods(unit=u, spikes=spikes_core) if sll is not None: @@ -1245,10 +1250,17 @@ def unit_group_criterion( subunit_log_props = F.softmax(subunit_logliks, dim=0).mean(1).log_() # loglik per spik full_loglik = torch.logsumexp(subunit_logliks.T + subunit_log_props, dim=1).mean() - unit = self.fit_unit(indices=in_any, features=spikes_extract) - likelihoods = None + + if fit_type == "refit_all": + unit = self.fit_unit(indices=in_any, features=spikes_extract) + elif fit_type == "refit_avg": + unit = average_units(units, props=subunit_log_props.exp()) + else: + assert False elif fit_type == "avg_preexisting": - unit = self[unit_ids[0]].avg_with(*[self[u] for u in unit_ids[1:]]) + subunit_log_props = self.log_proportions[unit_ids] + units = [self[uid] for uid in unit_ids] + unit = average_units(units, F.softmax(subunit_log_props)) if debug: subunit_logliks = likelihoods[:, in_any][unit_ids] full_loglik = marginal_loglik( @@ -1262,7 +1274,10 @@ def unit_group_criterion( # extract likelihoods... no proportions! _, unit_logliks = self.unit_log_likelihoods(unit=unit, spikes=spikes_core) - unit_loglik = unit_logliks.mean() + if unit_logliks is not None: + unit_loglik = unit_logliks.mean() + else: + unit_loglik = np.nan # parameter counting... since we use marginal likelihoods, I'm restricting # the parameter counts to just the marginal set considered for each spike. 
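For intuition, the merge decision reported later by NeighborInfoCriteria
(aic_merged < aic_full means merge, likewise for BIC, and unit_loglik >
full_loglik for the raw likelihood check) comes down to standard information
criteria. A toy standalone calculation with made-up numbers, using the textbook
definitions AIC = 2k - 2 log L and BIC = k log n - 2 log L; the bookkeeping in
unit_group_criterion itself differs in that it works with spike-marginal
likelihoods and per-neighborhood parameter counts:

    import numpy as np

    n = 1000                        # spikes in the candidate pair
    full_loglik = -3.10 * n         # total log likelihood, two-unit ("full") fit
    unit_loglik = -3.30 * n         # total log likelihood, merged single-unit fit
    k_full, k_merged = 240, 120     # parameter counts of the two fits

    aic_full = 2 * k_full - 2 * full_loglik
    aic_merged = 2 * k_merged - 2 * unit_loglik
    bic_full = k_full * np.log(n) - 2 * full_loglik
    bic_merged = k_merged * np.log(n) - 2 * unit_loglik

    print("aic:", "merge!" if aic_merged < aic_full else "nope.")   # nope.
    print("bic:", "merge!" if bic_merged < bic_full else "nope.")   # merge!

With these numbers AIC keeps the split while BIC prefers the merge: BIC charges
log n (about 6.9 here) per extra parameter instead of 2, so it is quicker to
trade likelihood for a smaller model.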
@@ -1272,7 +1287,7 @@ def unit_group_criterion( unique_chans = self.data.core_neighborhoods.neighborhoods[unique_nids] unique_k_merged = unit.n_params(unique_chans) unique_k_full = [self[u].n_params(unique_chans) for u in unit_ids] - unique_k_full = torch.stack(unique_k_full, dim=1) + unique_k_full = torch.stack(unique_k_full, dim=1).sum(1) k_merged = unique_k_merged[inverse] k_full = unique_k_full[inverse] @@ -1488,7 +1503,6 @@ def __init__( channels_strategy_snr_min=50.0, prior_pseudocount=10, scale_mean: float = 0.1, - mean=None, **annotations, ): super().__init__() @@ -1505,8 +1519,6 @@ def __init__( self.scale_alpha = float(prior_pseudocount) self.scale_beta = float(prior_pseudocount) / scale_mean self.annotations = annotations - if mean is not None: - self.register_buffer("mean", mean) @classmethod def from_features( @@ -1541,19 +1553,6 @@ def from_features( self = self.to(features.features.device) return self - def avg_with(self, *others): - new = self.__class__( - mean_kind=self.mean_kind, - cov_kind=self.cov_kind, - rank=self.rank, - n_channels=self.n_channels, - noise=self.noise, - ) - new.register_buffer("mean", (self.mean + sum(o.mean for o in others)) / (1 + len(others))) - new.register_buffer("channels", torch.cat([self.channels, *[o.channels for o in others]]).unique()) - assert self.cov_kind == "zero" - return new - def n_params(self, channels=None, on_channels=True): p = channels.new_zeros(len(channels)) if self.mean_kind == "full": @@ -1772,6 +1771,40 @@ def kl_divergence(self, other_means, other_covs, other_logdets): tqdm_kw = dict(smoothing=0, mininterval=1.0 / 24.0) +def average_units(units, proportions): + ua = units[0] + assert ua.cov_kind == "zero" + new_unit = GaussianUnit( + rank=ua.rank, + n_channels=ua.n_channels, + noise=ua.noise, + mean_kind=ua.mean_kind, + cov_kind=ua.cov_kind, + prior_type=ua.prior_type, + channels_strategy=ua.channels_strategy, + channels_strategy_snr_min=ua.channels_strategy_snr_min, + prior_pseudocount=ua.prior_pseudocount, + scale_mean=ua.scale_mean, + ) + if ua.mean_kind == "zero": + return ua + + if ua.channels_strategy == "all": + channels = ua.channels + elif ua.channels_strategy == "snr": + avg_snr = sum(pi * u.snr for pi, u in zip(proportions, units)) + new.register_buffer("snr", avg_snr) + (channels,) = torch.nonzero(avg_snr >= new.channels_strategy_snr_min, as_tuple=True) + else: + assert False + new.register_buffer("channels", channels) + + new_mean = sum(pi * u.mean for pi, u in zip(proportions, units)) + new.register_buffer("mean", new_mean) + + return new + + def to_full_probe(features, weights, n_channels, storage): """ Arguments From 2f5358db18d7aeebb1e76d4857cc671f3bb48472 Mon Sep 17 00:00:00 2001 From: Charlie Windolf Date: Thu, 21 Nov 2024 11:49:36 -0800 Subject: [PATCH 9/9] Debug vis edge cases --- src/dartsort/vis/gmm.py | 37 ++++++++++++++++++++++++++++--------- src/dartsort/vis/unit.py | 3 ++- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/dartsort/vis/gmm.py b/src/dartsort/vis/gmm.py index 262f9f07..3c978a51 100644 --- a/src/dartsort/vis/gmm.py +++ b/src/dartsort/vis/gmm.py @@ -1,5 +1,6 @@ from pathlib import Path import warnings +import itertools import matplotlib.pyplot as plt import numpy as np @@ -102,9 +103,21 @@ def draw(self, panel, gmm, unit_id): k = k.item() if torch.is_tensor(v): v = v.numpy(force=True) - if v.size == 1: + if isinstance(v, np.ndarray): + if not v.size: + v = "[]" + elif v.size == 1: v = v.item() - msg += f"{k}: {v}" + elif v.ndim == 1: + vv = [str(v[0])] + for vvv in 
map(str, v[1:]): + if len(vv[-1]) > 16: + vv[-1] += "\n" + vv.append(vvv) + continue + vv[-1] += "," + vvv + v = "\n".join(vv) + msg += f"{k}:\n{v}" axis.text(0, 0, msg, fontsize=6.5) @@ -581,7 +594,7 @@ def draw(self, panel, gmm, unit_id): class NeighborBimodalities(GMMPlot): kind = "bim" width = 4 - height = 8 + height = 9 def __init__(self, n_neighbors=5): self.n_neighbors = n_neighbors @@ -627,6 +640,7 @@ def draw(self, panel, gmm, unit_id): scatter_ax.text( 0.5, 0.5, f"too few spikes", transform=scatter_ax.transAxes, ha='center', va='center' ) + continue else: c = np.atleast_2d(glasbey1024[labels[bimod_info["in_pair_kept"]]]) scatter_ax.scatter(bimod_info["xi"], bimod_info["xj"], s=3, lw=0, c=c) @@ -677,7 +691,7 @@ def draw(self, panel, gmm, unit_id): class NeighborInfoCriteria(GMMPlot): kind = "bim" width = 4 - height = 8 + height = 9 def __init__(self, n_neighbors=5, fit_by_avg=False): self.n_neighbors = n_neighbors @@ -693,6 +707,7 @@ def draw(self, panel, gmm, unit_id): bstr = "BICfull/merged: {bic_full:0.1f} / {bic_merged:0.1f}" lstr = "LLfull/merged: {full_loglik:0.1f} / {unit_loglik:0.1f}" cstr = f"{astr}\n{bstr}\n{lstr}\n" + bbox = dict(facecolor='w', alpha=0.5, edgecolor="none") for ax, other_id in zip(axes, others): uids = [unit_id, other_id] res = gmm.unit_group_criterion(uids, gmm.log_liks, debug=True) @@ -708,15 +723,19 @@ def draw(self, panel, gmm, unit_id): if rowsll.numel(): ax.hist(rowsll, color=glasbey1024[uid], **histkw) ull = res["unit_logliks"] - ax.hist(ull[torch.isfinite(ull)], color="k", **histkw) + if ull is not None: + ax.hist(ull[torch.isfinite(ull)], color="k", **histkw) s = f"other={other_id}\n" + cstr.format_map(res) aic_merge = res['aic_merged'] < res['aic_full'] bic_merge = res['bic_merged'] < res['bic_full'] ll_merge = res['unit_loglik'] > res['full_loglik'] - s += f"aic: " + ("merge!" if aic_merge else "nope.") + "\n" - s += f"bic: " + ("merge!" if bic_merge else "nope.") + "\n" - s += f"ll: " + ("merge!" if ll_merge else "nope.") - ax.text(0.05, 0.95, s, transform=ax.transAxes) + aicdif = res['aic_full'] - res['aic_merged'] + bicdif = res['bic_full'] - res['bic_merged'] + lldif = res['full_loglik'] - res['unit_loglik'] + s += f"aic: {aicdif:0.1f}, " + ("merge!" if aic_merge else "nope.") + "\n" + s += f"bic: {bicdif:0.1f}, " + ("merge!" if bic_merge else "nope.") + "\n" + s += f"ll: {lldif:0.1f}, " + ("merge!" if ll_merge else "nope.") + ax.text(0.05, 0.95, s, transform=ax.transAxes, va="top", bbox=bbox) ax.set_xlabel("log lik") sns.despine(ax=ax, left=True, right=True, top=True) # ax.set_yticks([]) diff --git a/src/dartsort/vis/unit.py b/src/dartsort/vis/unit.py index 310c20ea..5325d941 100644 --- a/src/dartsort/vis/unit.py +++ b/src/dartsort/vis/unit.py @@ -51,7 +51,8 @@ def draw(self, panel, sorting_analysis, unit_id): axis.axis("off") msg = f"unit {unit_id}\n" - msg += f"feature source: {sorting_analysis.hdf5_path.name}\n" + if getattr(sorting_analysis, 'hdf5_path', None): + msg += f"feature source: {sorting_analysis.hdf5_path.name}\n" nspikes = sorting_analysis.spike_counts[ sorting_analysis.unit_ids == unit_id