From 513d1f4e0b4502ef2ea12ee6b7b30852a5e6219a Mon Sep 17 00:00:00 2001
From: rlaplaza
Date: Fri, 6 Oct 2023 14:14:23 +0200
Subject: [PATCH] There was a bug in the gap statistic. Hopefully good now!

---
Reviewer notes (this section is not part of the commit message):
* The indentation of the hunk lines below is inferred from the Python
  syntax; confirm it against navicat_marc/clustering.py before applying.
* gap(): np.argmax(diff > 0.5) is 0 both when diff[0] > 0.5 and when no
  entry exceeds 0.5, so a `best == 0` test cannot tell the two cases apart.
  The threshold match is therefore checked with np.any() before falling
  back to np.argmax(diff).
* gaps_diff(): the verb > 3 message is labelled with the quantity that is
  actually computed, Gap(i) - Gap(i+1) - sk(i+1).

 navicat_marc/clustering.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/navicat_marc/clustering.py b/navicat_marc/clustering.py
index f27ec12..b9eacc4 100755
--- a/navicat_marc/clustering.py
+++ b/navicat_marc/clustering.py
@@ -335,19 +335,28 @@ def gaps_diff(data, refs=None, nrefs=10, ks=range(1, 11), verb=0):
         if verb > 5:
             print(f"Gaps for k-values {ks[i]} : {gaps[i]}")
     for i in range(len(ks) - 1):
-        diff[i] = gaps[i] - gaps[i + 1] + s[i + 1]
+        diff[i] = gaps[i] - gaps[i + 1] - s[i + 1]
         if verb > 4:
             print(
                 f"Gap(i) - Gap(i+1) - sk(i+1) for k-value {ks[i]} : {gaps[i]} - {gaps[i+1]} - {s[i+1]} = {diff[i]}"
             )
     if verb > 3:
-        print(f"Gap(i) - Gap(i+1) = sk(i+1) for k-values {ks} : {diff}")
+        print(f"Gap(i) - Gap(i+1) - sk(i+1) for k-values {ks[:len(ks)-1]} : {diff}")
     return diff
 
 
 def gap(data, refs=None, nrefs=5, ks=range(1, 11), verb=0):
+    """Return the index into ``ks`` chosen from the ``gaps_diff`` values.
+
+    The first index whose difference exceeds 0.5 wins; if none does, the
+    index of the largest difference is returned instead.
+    """
     diff = gaps_diff(data, refs, nrefs, ks, verb)
-    return np.argmax(diff)
+    above = diff > 0.5
+    if np.any(above):
+        # argmax of a boolean array is the position of its first True.
+        return np.argmax(above)
+    return np.argmax(diff)
 
 
 def unique_nr(data, verb=0):