From 23bbed11ba1720c7328098dc9aad404dc75438ef Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 01:10:21 +0900
Subject: [PATCH 01/46] [add function or file]add hdbscan

make PR
---
 src/hdbscan.jl | 442 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 442 insertions(+)
 create mode 100644 src/hdbscan.jl

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
new file mode 100644
index 00000000..9e885e4f
--- /dev/null
+++ b/src/hdbscan.jl
@@ -0,0 +1,442 @@
+struct HDBSCANGraph
+    edges::Vector{Vector{Tuple{Int64, Float64}}}
+    HDBSCANGraph(n) = new([Tuple{Int64, Float64}[] for _ in 1 : n])
+end
+
+function add_edge(G::HDBSCANGraph, v::Tuple{Int64, Int64, Float64})
+    i, j, c = v
+    push!(G.edges[i], (j, c))
+    push!(G.edges[j], (i, c))
+end
+
+Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]
+
+mutable struct HDBSCANCluster
+    parent::Int64
+    children::Vector{Int64}
+    points::Vector{Int64}
+    λp::Vector{Float64}
+    stability::Float64
+    children_stability::Float64
+    function HDBSCANCluster(noise::Bool, points::Vector{Int64})
+        if noise
+            return new(0, [], [], [], -1, -1)
+        else
+            return new(0, [], points, [], 0, 0)
+        end
+    end
+end
+
+Base.length(c::HDBSCANCluster) = size(c.points, 1)
+join(c1::HDBSCANCluster, c2::HDBSCANCluster, id) = HDBSCANCluster(nothing, vcat(c1.points, c2.points), id, 0)
+isnoise(c::HDBSCANCluster) = c.stability == -1
+hasstability(c::HDBSCANCluster) = c.stability != 0
+function compute_stability(c::HDBSCANCluster, λbirth)
+    c.stability += sum(c.λp.-λbirth)
+end
+
+"""
+    HDBSCAN(ε, minpts)
+Density-Based Clustering Based on Hierarchical Density Estimates.
+This algorithm performs clustering as follows.
+1. generate a minimum spanning tree
+2. build a HDBSCAN hierarchy
+3. extract the target cluster
+4. generate the list of cluster assignment for each point
+The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html).
+"""
+mutable struct HDBSCANResult
+    k::Int64
+    min_cluster_size::Int64
+    labels::Union{Vector{Int64}, Nothing}
+    function HDBSCANResult(k::Int64, min_cluster_size::Int64)
+        if min_cluster_size < 1
+            throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
+        end
+        return new(k, min_cluster_size)
+    end
+end
+
+"""
+    hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::INt64; gen_mst::Bool=true, mst=nothing)
+# Parameters
+- `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point
+- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
+- `min_cluster_size`: minimum number of points in the cluster
+- `gen_mst`: whether to generate minimum-spannig-tree or not
+- `mst`: when is specified and `gen_mst` is false, new mst won't be generated
+"""
+function hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::Int64; gen_mst::Bool=true, mst=nothing)
+    model = HDBSCANResult(k, min_cluster_size)
+    n = size(points, 1)
+    if gen_mst
+        #calculate core distances for each point
+        core_dists = core_dist(points, k)
+        #calculate mutual reachability distance between any two points
+        mrd = mutual_reachability_distance(core_dists, points)
+        #compute a minimum spanning tree by prim method
+        mst = prim(mrd, n)
+    elseif mst == nothing
+        throw(ArgumentError("if you set `gen_mst` to false, you must pass a minimum spanning tree as `mst`"))
+    end
+    #build a HDBSCAN hierarchy
+    hierarchy = build_hierarchy(mst, min_cluster_size)
+    #extract the target cluster
+    extract_cluster!(hierarchy)
+    #generate the list of cluster assignment for each point
+    result = fill(-1, n)
+    for (i, j) in enumerate(hierarchy[2n-1].children)
+        c = hierarchy[j]
+        for k in c.points
+            result[k] = i
+        end
+    end
+    model.labels = result
+    return model
+end
+
+function core_dist(points, k)
+    core_dists = Array{Float64}(undef, size(points, 1))
+    for i in 1 : size(points, 1)
+        p = points[i:i, :]
+        dists = vec(sum((@. (points - p)^2), dims=2))
+        sort!(dists)
+        core_dists[i] = dists[k]
+    end
+    return core_dists
+end
+
+function mutual_reachability_distance(core_dists, points)
+    n = size(points, 1)
+    graph = HDBSCANGraph(div(n * (n-1), 2))
+    for i in 1 : n-1
+        for j in i+1 : n
+            c = max(core_dists[i], core_dists[j], sum((points[i, :]-points[j, :]).^2))
+            add_edge(graph, (i, j, c))
+        end
+    end
+    return graph
+end
+
+function prim(graph, n)
+    minimum_spanning_tree = Array{Tuple{Float64, Int64, Int64}}(undef, n-1)
+    
+    marked = falses(n)
+    marked_cnt = 1
+    marked[1] = true
+    
+    h = []
+    
+    for (i, c) in graph[1]
+        heappush!(h, (c, 1, i))
+    end
+    
+    while marked_cnt < n
+        c, i, j = popfirst!(h)
+        
+        marked[j] == true && continue
+        minimum_spanning_tree[marked_cnt] = (c, i, j)
+        marked[j] = true
+        marked_cnt += 1
+        
+        for (k, c) in graph[j]
+            marked[k] == true && continue
+            heappush!(h, (c, j, k))
+        end
+    end
+    return minimum_spanning_tree
+end
+
+function build_hierarchy(mst, min_size)
+    n = length(mst) + 1
+    cost = 0
+    uf = UnionFind(n)
+    Hierarchy = Array{HDBSCANCluster}(undef, 2n-1)
+    if min_size == 1
+        for i in 1 : n
+            Hierarchy[i] = HDBSCANCluster(false, [i])
+        end
+    else
+        for i in 1 : n
+            Hierarchy[i] = HDBSCANCluster(true, Int64[])
+        end
+    end
+    sort!(mst)
+    
+    for i in 1 : n-1
+        c, j, k = mst[i]
+        cost += c
+        λ = 1 / cost
+        #child clusters
+        c1 = group(uf, j)
+        c2 = group(uf, k)
+        #reference to the parent cluster
+        Hierarchy[c1].parent = Hierarchy[c2].parent = n+i
+        nc1, nc2 = isnoise(Hierarchy[c1]), isnoise(Hierarchy[c2])
+        if !(nc1 || nc2)
+            #compute stability
+            compute_stability(Hierarchy[c1], λ)
+            compute_stability(Hierarchy[c2], λ)
+            #unite cluster
+            unite!(uf, j, k)
+            #create parent cluster
+            points = members(uf, group(uf, j))
+            Hierarchy[n+i] = HDBSCANCluster(false, points)
+        elseif !(nc1 && nc2)
+            if nc2 == true
+                (c1, c2) = (c2, c1)
+            end
+            #record the lambda value
+            append!(Hierarchy[c2].λp, fill(λ, length(Hierarchy[c1])))
+            #unite cluster
+            unite!(uf, j, k)
+            #create parent cluster
+            points = members(uf, group(uf, j))
+            Hierarchy[n+i] = HDBSCANCluster(false, points)
+        else
+            #unite the noise cluster
+            unite!(uf, j, k)
+            #create parent cluster
+            points = members(uf, group(uf, j))
+            if length(points) < min_size
+                Hierarchy[n+i] = HDBSCANCluster(true, Int64[])
+            else
+                Hierarchy[n+i] = HDBSCANCluster(false, points)
+            end
+        end
+    end
+    return Hierarchy
+end
+
+function extract_cluster!(hierarchy::Vector{HDBSCANCluster})
+    for i in 1 : length(hierarchy)-1
+        if isnoise(hierarchy[i])
+            c = hierarchy[i]
+            push!(hierarchy[c.parent].children, i)
+            hierarchy[c.parent].children_stability += c.stability
+        else
+            c = hierarchy[i]
+            if c.stability > c.children_stability
+                push!(hierarchy[c.parent].children, i)
+                hierarchy[c.parent].children_stability += c.stability
+            else
+                append!(hierarchy[c.parent].children, c.children)
+                hierarchy[c.parent].children_stability += c.children_stability
+            end
+        end
+    end
+end
+
+# Below are utility functions for building hierarchical trees
+# Please note that these functions are not so sophisticated since I made them by manually converting code of numpy.
+const LOG_2π = log(2π)
+const newaxis = [CartesianIndex()]
+
+heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)
+
+mutable struct UnionFind{T <: Integer}
+    parent:: Vector{T}  # parent[root] is the negative of the size
+    label::Dict{Int, Int}
+    cnt::Int64
+
+    function UnionFind{T}(nodes::T) where T<:Integer
+        if nodes <= 0
+            throw(ArgumentError("invalid argument for nodes: $nodes"))
+        end
+
+        parent = -ones(T, nodes)
+        label = Dict([(i, i) for i in 1 : nodes])
+        new{T}(parent, label, nodes)
+    end
+end
+
+UnionFind(nodes::Integer) = UnionFind{typeof(nodes)}(nodes)
+group(uf::UnionFind, x)::Int = uf.label[root(uf, x)]
+members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label)))
+
+function root(uf::UnionFind{T}, x::T)::T where T<:Integer
+    if uf.parent[x] < 0
+        return x
+    else
+        return uf.parent[x] = root(uf, uf.parent[x])
+    end
+end
+
+function issame(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
+    return root(uf, x) == root(uf, y)
+end
+
+function Base.size(uf::UnionFind{T}, x::T)::T where T<:Integer
+    return -uf.parent[root(uf, x)]
+end
+
+function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
+    x = root(uf, x)
+    y = root(uf, y)
+    if x == y
+        return false
+    end
+    if uf.parent[x] > uf.parent[y]
+        x, y = y, x
+    end
+    # unite smaller tree(y) to bigger one(x)
+    uf.parent[x] += uf.parent[y]
+    uf.parent[y] = x
+    uf.cnt += 1
+    uf.label[y] = uf.cnt
+    for i in members(uf, group(uf, x))
+        uf.label[i] = uf.cnt
+    end
+    return true
+end
+
+#error function
+#if absolute of `x` is smaller than 2.4, we use Taylor expansion.
+#other wise, we use Continued fraction expansion
+function erf(x)
+    absx = abs(x)
+    if absx<2.4
+        c=1
+        a=0
+        for i in 1 : 40
+            a+=(x/(2i-1)*c)
+            c=-c*x^2/i
+        end
+        return a*2/sqrt(π)
+    else
+        if absx>1e50
+            a = 1
+        else
+            y = absx*sqrt(2)
+            a = 0
+            for i in 40:-1:1
+                a=i/(y+a)
+            end
+            a=1-exp(-x^2)/(y+a)*sqrt(2/π)
+        end
+        if x<0
+            return -a
+        else
+            return a
+        end
+    end
+end
+erfc(x) = 1-erf(x)
+norm_cdf(x) = 1/2*erfc(-x/sqrt(2))
+
+function process_parameters(dim, mean, cov)
+    if dim === nothing
+        if mean === nothing
+            if cov === nothing
+                dim = 1
+            else
+                cov = convert(Array{Float64}, cov)
+                if ndims(cov) < 2
+                    dim = 1
+                else
+                    dim = size(cov, 1)
+                end
+            end
+        else
+            mean = convert(Array{Float64}, mean)
+            dim = length(mean)
+        end
+    else
+        !isa(dim, Number) && throw(DimensionMismatch("dimension of random variable must be a scalar"))
+    end
+    
+    if mean === nothing
+        mean = zeros(dim)
+    end
+    mean = convert(Array{Float64}, mean)
+    
+    if cov === nothing
+        cov = [1.0]
+    end
+    cov = convert(Array{Float64}, cov)
+    
+    if dim == 1
+        mean = reshape(mean, 1)
+        cov = reshape(cov, 1, 1)
+    end
+    
+    if ndims(mean) != 1 || size(mean, 1) != dim
+        throw(ArgumentError("array `mean` must be vector of length $dim"))
+    end
+    if ndims(cov) == 0
+        cov = cov * Matrix{Float64}(I, dim, dim)
+    elseif ndims(cov) == 1
+        cov = diag(cov)
+    else
+        size(cov) != (dim, dim) && throw(DimensionMismatch("array `cov` must be at most two-dimensional, but ndims(cov) = $(ndims(cov))"))
+    end
+    return dim, mean, cov
+end
+
+function process_quantiles(x, dim)
+    x = convert(Array{Float64}, x)
+    
+    if ndims(x) == 0
+        x = [x]
+    elseif ndims(x) == 1
+        if dim == 1
+            x = x[:, :]
+        else
+            x = x[newaxis, :]
+        end
+    end
+    return x
+end
+
+function pinv_1d(v; _eps=1e-5)
+    return [(abs(x)<_eps) ? 0 : 1/x for x in v]
+end
+
+function psd_pinv_decomposed_log_pdet(mat; cond=nothing, rcond=nothing)
+    u, s = eigvecs(mat), eigvals(mat)
+    
+    if rcond !== nothing
+        cond = rcond
+    end
+    if cond === nothing || cond == -1
+        cond = 1e6 * Base.eps()
+    end
+    _eps = cond * maximum(abs.(s))
+    
+    if minimum(s) < -_eps
+        throw(ArgumentError("the covariance matrix must be positive semidefinite"))
+    end
+    s_pinv = pinv_1d(s, _eps=_eps)
+    U = u .* sqrt.(s_pinv')
+    log_pdet = sum(log.(s[findall(s.>_eps)]))
+
+    return U, log_pdet
+end
+
+function squeeze_output(out)
+    if length(out) == 1
+        out = Float64(out...)
+    else
+        out = vec(out)
+    end
+    return out
+end
+
+function _logpdf(x, mean, prec_U, log_det_cov)
+    dim = size(x, ndims(x))
+    dev = x - mean'
+    tmp = (dev * prec_U).^2
+    maha = sum(tmp, dims=ndims(tmp))
+    maha = dropdims(maha, dims=ndims(tmp))
+    return -0.5 * ((dim*LOG_2π+log_det_cov).+maha)
+end
+
+function pdf(x, mean, cov)
+    dim, mean, cov = process_parameters(nothing, mean, cov)
+    x = process_quantiles(x, dim)
+    prec_U, log_det_cov = psd_pinv_decomposed_log_pdet(cov)
+    out = exp.(_logpdf(x, mean, prec_U, log_det_cov))
+    return squeeze_output(out)
+end
+
+logpdf(x, mean, cov) = log.(pdf(x, mean, cov))
\ No newline at end of file

From fa4439816c4fe6f0ca892feb05e191d9c38acd6c Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 09:09:34 +0900
Subject: [PATCH 02/46] [test]add test for hdbscan

there was no test for it
---
 test/hdbscan.jl  | 42 ++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl |  1 +
 2 files changed, 43 insertions(+)
 create mode 100644 test/hdbscan.jl

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
new file mode 100644
index 00000000..8469e506
--- /dev/null
+++ b/test/hdbscan.jl
@@ -0,0 +1,42 @@
+using Test
+include("../src/hdbscan.jl")
+
+@testset "HDBSCAN" begin
+    # test for util functions
+    # erf function tests
+    @test erf(0) == 0
+    @test erf(-0) == -0
+    @test isapprox(erf(1.0), 0.8427007929497148)
+    @test isapprox(erf(-1.0), -0.8427007929497148)
+    @test isapprox(erf(2.4), 0.999311486103355)
+    @test isapprox(erf(-2.4), -0.999311486103355)
+
+    # test for HDBSCANGraph
+    graph = HDBSCANGraph(4)
+    add_edge(graph, (1, 3, 0.1))
+    @test graph[1][1] == (3, 0.1) && graph[3][1] == (1, 0.1)
+
+    # test for UnionFind
+    uf = UnionFind(10)
+    @test group(uf, 1) == 1
+    @test sort(members(uf, 1)) == [1]
+    @test root(uf, 3) == 3
+    @test issame(uf, 1, 2) == false
+    @test size(uf, 1) == 1
+    @test unite!(uf, 1, 2) == true
+    @test issame(uf, 1, 2) == true
+    @test size(uf, 1) == 2
+    @test size(uf, 2) == 2
+    @test unite!(uf, 1, 2) == false
+    
+    # make moons for test
+    upper_x = [i for i in 0:π/50:π]
+    lower_x = [i for i in π/2:π/50:3/2*π]
+    upper_y = sin.(upper_x) .+ rand(50)./10
+    lower_y = cos.(lower_x) .+ rand(51)./10
+    data = hcat([lower_x; upper_x], [lower_y; upper_y])
+    #test for main function
+    @test_throws DomainError hdbscan!(data, 5, 0)
+    @test_throws ArgumentError hdbscan!(data, 5, 3; gen_mst=false)
+    @test_nowarn result = hdbscan!(data, 5, 3)
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 42301653..a18f475b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,7 @@ tests = ["seeding",
          "kmedoids",
          "affprop",
          "dbscan",
+         "hdbscan",
          "fuzzycmeans",
          "counts",
          "silhouette",

From 4e64fdf36cd3d5b09409c3b938231643910a1471 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 09:19:03 +0900
Subject: [PATCH 03/46] [fix]change Int64 to Int

test failed on a ubuntu-latest-x86
---
 src/hdbscan.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 9e885e4f..fac44c0b 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -1,9 +1,9 @@
 struct HDBSCANGraph
-    edges::Vector{Vector{Tuple{Int64, Float64}}}
-    HDBSCANGraph(n) = new([Tuple{Int64, Float64}[] for _ in 1 : n])
+    edges::Vector{Vector{Tuple{Int, Float64}}}
+    HDBSCANGraph(n) = new([Tuple{Int, Float64}[] for _ in 1 : n])
 end
 
-function add_edge(G::HDBSCANGraph, v::Tuple{Int64, Int64, Float64})
+function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64})
     i, j, c = v
     push!(G.edges[i], (j, c))
     push!(G.edges[j], (i, c))

From e294565762962242c89d80c9ce6df83b879a0237 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 09:28:31 +0900
Subject: [PATCH 04/46] [fix]change all Int64 into Int

tests still fails on ubuntu-latest-x86
---
 src/hdbscan.jl | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index fac44c0b..a0612fbb 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -12,13 +12,13 @@ end
 Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]
 
 mutable struct HDBSCANCluster
-    parent::Int64
-    children::Vector{Int64}
-    points::Vector{Int64}
+    parent::Int
+    children::Vector{Int}
+    points::Vector{Int}
     λp::Vector{Float64}
     stability::Float64
     children_stability::Float64
-    function HDBSCANCluster(noise::Bool, points::Vector{Int64})
+    function HDBSCANCluster(noise::Bool, points::Vector{Int})
         if noise
             return new(0, [], [], [], -1, -1)
         else
@@ -46,10 +46,10 @@ This algorithm performs clustering as follows.
 The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html).
 """
 mutable struct HDBSCANResult
-    k::Int64
-    min_cluster_size::Int64
-    labels::Union{Vector{Int64}, Nothing}
-    function HDBSCANResult(k::Int64, min_cluster_size::Int64)
+    k::Int
+    min_cluster_size::Int
+    labels::Union{Vector{Int}, Nothing}
+    function HDBSCANResult(k::Int, min_cluster_size::Int)
         if min_cluster_size < 1
             throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
         end
@@ -58,7 +58,7 @@ mutable struct HDBSCANResult
 end
 
 """
-    hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::INt64; gen_mst::Bool=true, mst=nothing)
+    hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
 # Parameters
 - `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point
 - `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
@@ -66,7 +66,7 @@ end
 - `gen_mst`: whether to generate minimum-spannig-tree or not
 - `mst`: when is specified and `gen_mst` is false, new mst won't be generated
 """
-function hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::Int64; gen_mst::Bool=true, mst=nothing)
+function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
     model = HDBSCANResult(k, min_cluster_size)
     n = size(points, 1)
     if gen_mst
@@ -119,7 +119,7 @@ function mutual_reachability_distance(core_dists, points)
 end
 
 function prim(graph, n)
-    minimum_spanning_tree = Array{Tuple{Float64, Int64, Int64}}(undef, n-1)
+    minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1)
     
     marked = falses(n)
     marked_cnt = 1
@@ -158,7 +158,7 @@ function build_hierarchy(mst, min_size)
         end
     else
         for i in 1 : n
-            Hierarchy[i] = HDBSCANCluster(true, Int64[])
+            Hierarchy[i] = HDBSCANCluster(true, Int[])
         end
     end
     sort!(mst)
@@ -199,7 +199,7 @@ function build_hierarchy(mst, min_size)
             #create parent cluster
             points = members(uf, group(uf, j))
             if length(points) < min_size
-                Hierarchy[n+i] = HDBSCANCluster(true, Int64[])
+                Hierarchy[n+i] = HDBSCANCluster(true, Int[])
             else
                 Hierarchy[n+i] = HDBSCANCluster(false, points)
             end
@@ -237,7 +237,7 @@ heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)
 mutable struct UnionFind{T <: Integer}
     parent:: Vector{T}  # parent[root] is the negative of the size
     label::Dict{Int, Int}
-    cnt::Int64
+    cnt::Int
 
     function UnionFind{T}(nodes::T) where T<:Integer
         if nodes <= 0

From b851997c984ac9ffcc80d745d29a5ddb3497d2bc Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 22:22:37 +0900
Subject: [PATCH 05/46] [change]change usage and remove extra code

there were functions for Xmeans
---
 src/Clustering.jl |   4 +
 src/hdbscan.jl    | 238 +++++++++-------------------------------------
 2 files changed, 50 insertions(+), 192 deletions(-)

diff --git a/src/Clustering.jl b/src/Clustering.jl
index fae9ee82..a6540c47 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -40,6 +40,9 @@ module Clustering
     # dbscan
     DbscanResult, DbscanCluster, dbscan,
 
+    # hdbscan
+    HdbscanResult, hdbscan,
+
     # fuzzy_cmeans
     fuzzy_cmeans, FuzzyCMeansResult,
 
@@ -83,6 +86,7 @@ module Clustering
     include("kmedoids.jl")
     include("affprop.jl")
     include("dbscan.jl")
+    include("hdbscan.jl")
     include("mcl.jl")
     include("fuzzycmeans.jl")
 
diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index a0612fbb..f73ba676 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -11,14 +11,21 @@ end
 
 Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]
 
-mutable struct HDBSCANCluster
+"""
+    HdbscanCluster(..., points, stability, ...)
+- `points`: vector of points which belongs to the cluster
+- `stability`: stablity of the cluster(-1 for noise clusters)
+
+You can use `length` to know the number of pooints in the cluster. And `isnoise` funciton is also available to know whether the cluster is noise or not.
+"""
+mutable struct HdbscanCluster
     parent::Int
     children::Vector{Int}
     points::Vector{Int}
     λp::Vector{Float64}
     stability::Float64
     children_stability::Float64
-    function HDBSCANCluster(noise::Bool, points::Vector{Int})
+    function HdbscanCluster(noise::Bool, points::Vector{Int})
         if noise
             return new(0, [], [], [], -1, -1)
         else
@@ -27,16 +34,27 @@ mutable struct HDBSCANCluster
     end
 end
 
-Base.length(c::HDBSCANCluster) = size(c.points, 1)
-join(c1::HDBSCANCluster, c2::HDBSCANCluster, id) = HDBSCANCluster(nothing, vcat(c1.points, c2.points), id, 0)
-isnoise(c::HDBSCANCluster) = c.stability == -1
-hasstability(c::HDBSCANCluster) = c.stability != 0
-function compute_stability(c::HDBSCANCluster, λbirth)
+Base.length(c::HdbscanCluster) = size(c.points, 1)
+isnoise(c::HdbscanCluster) = c.stability == -1
+hasstability(c::HdbscanCluster) = c.stability != 0
+function compute_stability(c::HdbscanCluster, λbirth)
     c.stability += sum(c.λp.-λbirth)
 end
 
 """
-    HDBSCAN(ε, minpts)
+    HdbscanResult(k, minpts, clusters)
+- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
+- `min_cluster_size`: minimum number of points in the cluster
+- `clusters`: result vector of clusters
+"""
+mutable struct HdbscanResult
+    k::Int
+    min_cluster_size::Int
+    clusters::Vector{HdbscanCluster}
+end
+
+"""
+    hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
 Density-Based Clustering Based on Hierarchical Density Estimates.
 This algorithm performs clustering as follows.
 1. generate a minimum spanning tree
@@ -44,21 +62,7 @@ This algorithm performs clustering as follows.
 3. extract the target cluster
 4. generate the list of cluster assignment for each point
 The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html).
-"""
-mutable struct HDBSCANResult
-    k::Int
-    min_cluster_size::Int
-    labels::Union{Vector{Int}, Nothing}
-    function HDBSCANResult(k::Int, min_cluster_size::Int)
-        if min_cluster_size < 1
-            throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
-        end
-        return new(k, min_cluster_size)
-    end
-end
 
-"""
-    hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
 # Parameters
 - `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point
 - `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
@@ -66,8 +70,10 @@ end
 - `gen_mst`: whether to generate minimum-spannig-tree or not
 - `mst`: when is specified and `gen_mst` is false, new mst won't be generated
 """
-function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
-    model = HDBSCANResult(k, min_cluster_size)
+function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
+    if min_cluster_size < 1
+        throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
+    end
     n = size(points, 1)
     if gen_mst
         #calculate core distances for each point
@@ -84,15 +90,18 @@ function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst
     #extract the target cluster
     extract_cluster!(hierarchy)
     #generate the list of cluster assignment for each point
-    result = fill(-1, n)
+    result = HdbscanCluster[]
+    noise_points = fill(-1, n)
     for (i, j) in enumerate(hierarchy[2n-1].children)
         c = hierarchy[j]
+        push!(result, c)
         for k in c.points
-            result[k] = i
+            noise_points[k] = 0
         end
     end
-    model.labels = result
-    return model
+    push!(result, HdbscanCluster(true, Int[]))
+    result[end].points = findall(x->x==-1, noise_points)
+    return HdbscanResult(k, min_cluster_size, result)
 end
 
 function core_dist(points, k)
@@ -151,14 +160,14 @@ function build_hierarchy(mst, min_size)
     n = length(mst) + 1
     cost = 0
     uf = UnionFind(n)
-    Hierarchy = Array{HDBSCANCluster}(undef, 2n-1)
+    Hierarchy = Array{HdbscanCluster}(undef, 2n-1)
     if min_size == 1
         for i in 1 : n
-            Hierarchy[i] = HDBSCANCluster(false, [i])
+            Hierarchy[i] = HdbscanCluster(false, [i])
         end
     else
         for i in 1 : n
-            Hierarchy[i] = HDBSCANCluster(true, Int[])
+            Hierarchy[i] = HdbscanCluster(true, Int[])
         end
     end
     sort!(mst)
@@ -181,7 +190,7 @@ function build_hierarchy(mst, min_size)
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
-            Hierarchy[n+i] = HDBSCANCluster(false, points)
+            Hierarchy[n+i] = HdbscanCluster(false, points)
         elseif !(nc1 && nc2)
             if nc2 == true
                 (c1, c2) = (c2, c1)
@@ -192,23 +201,23 @@ function build_hierarchy(mst, min_size)
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
-            Hierarchy[n+i] = HDBSCANCluster(false, points)
+            Hierarchy[n+i] = HdbscanCluster(false, points)
         else
             #unite the noise cluster
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
             if length(points) < min_size
-                Hierarchy[n+i] = HDBSCANCluster(true, Int[])
+                Hierarchy[n+i] = HdbscanCluster(true, Int[])
             else
-                Hierarchy[n+i] = HDBSCANCluster(false, points)
+                Hierarchy[n+i] = HdbscanCluster(false, points)
             end
         end
     end
     return Hierarchy
 end
 
-function extract_cluster!(hierarchy::Vector{HDBSCANCluster})
+function extract_cluster!(hierarchy::Vector{HdbscanCluster})
     for i in 1 : length(hierarchy)-1
         if isnoise(hierarchy[i])
             c = hierarchy[i]
@@ -228,10 +237,6 @@ function extract_cluster!(hierarchy::Vector{HDBSCANCluster})
 end
 
 # Below are utility functions for building hierarchical trees
-# Please note that these functions are not so sophisticated since I made them by manually converting code of numpy.
-const LOG_2π = log(2π)
-const newaxis = [CartesianIndex()]
-
 heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)
 
 mutable struct UnionFind{T <: Integer}
@@ -288,155 +293,4 @@ function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
         uf.label[i] = uf.cnt
     end
     return true
-end
-
-#error function
-#if absolute of `x` is smaller than 2.4, we use Taylor expansion.
-#other wise, we use Continued fraction expansion
-function erf(x)
-    absx = abs(x)
-    if absx<2.4
-        c=1
-        a=0
-        for i in 1 : 40
-            a+=(x/(2i-1)*c)
-            c=-c*x^2/i
-        end
-        return a*2/sqrt(π)
-    else
-        if absx>1e50
-            a = 1
-        else
-            y = absx*sqrt(2)
-            a = 0
-            for i in 40:-1:1
-                a=i/(y+a)
-            end
-            a=1-exp(-x^2)/(y+a)*sqrt(2/π)
-        end
-        if x<0
-            return -a
-        else
-            return a
-        end
-    end
-end
-erfc(x) = 1-erf(x)
-norm_cdf(x) = 1/2*erfc(-x/sqrt(2))
-
-function process_parameters(dim, mean, cov)
-    if dim === nothing
-        if mean === nothing
-            if cov === nothing
-                dim = 1
-            else
-                cov = convert(Array{Float64}, cov)
-                if ndims(cov) < 2
-                    dim = 1
-                else
-                    dim = size(cov, 1)
-                end
-            end
-        else
-            mean = convert(Array{Float64}, mean)
-            dim = length(mean)
-        end
-    else
-        !isa(dim, Number) && throw(DimensionMismatch("dimension of random variable must be a scalar"))
-    end
-    
-    if mean === nothing
-        mean = zeros(dim)
-    end
-    mean = convert(Array{Float64}, mean)
-    
-    if cov === nothing
-        cov = [1.0]
-    end
-    cov = convert(Array{Float64}, cov)
-    
-    if dim == 1
-        mean = reshape(mean, 1)
-        cov = reshape(cov, 1, 1)
-    end
-    
-    if ndims(mean) != 1 || size(mean, 1) != dim
-        throw(ArgumentError("array `mean` must be vector of length $dim"))
-    end
-    if ndims(cov) == 0
-        cov = cov * Matrix{Float64}(I, dim, dim)
-    elseif ndims(cov) == 1
-        cov = diag(cov)
-    else
-        size(cov) != (dim, dim) && throw(DimensionMismatch("array `cov` must be at most two-dimensional, but ndims(cov) = $(ndims(cov))"))
-    end
-    return dim, mean, cov
-end
-
-function process_quantiles(x, dim)
-    x = convert(Array{Float64}, x)
-    
-    if ndims(x) == 0
-        x = [x]
-    elseif ndims(x) == 1
-        if dim == 1
-            x = x[:, :]
-        else
-            x = x[newaxis, :]
-        end
-    end
-    return x
-end
-
-function pinv_1d(v; _eps=1e-5)
-    return [(abs(x)<_eps) ? 0 : 1/x for x in v]
-end
-
-function psd_pinv_decomposed_log_pdet(mat; cond=nothing, rcond=nothing)
-    u, s = eigvecs(mat), eigvals(mat)
-    
-    if rcond !== nothing
-        cond = rcond
-    end
-    if cond === nothing || cond == -1
-        cond = 1e6 * Base.eps()
-    end
-    _eps = cond * maximum(abs.(s))
-    
-    if minimum(s) < -_eps
-        throw(ArgumentError("the covariance matrix must be positive semidefinite"))
-    end
-    s_pinv = pinv_1d(s, _eps=_eps)
-    U = u .* sqrt.(s_pinv')
-    log_pdet = sum(log.(s[findall(s.>_eps)]))
-
-    return U, log_pdet
-end
-
-function squeeze_output(out)
-    if length(out) == 1
-        out = Float64(out...)
-    else
-        out = vec(out)
-    end
-    return out
-end
-
-function _logpdf(x, mean, prec_U, log_det_cov)
-    dim = size(x, ndims(x))
-    dev = x - mean'
-    tmp = (dev * prec_U).^2
-    maha = sum(tmp, dims=ndims(tmp))
-    maha = dropdims(maha, dims=ndims(tmp))
-    return -0.5 * ((dim*LOG_2π+log_det_cov).+maha)
-end
-
-function pdf(x, mean, cov)
-    dim, mean, cov = process_parameters(nothing, mean, cov)
-    x = process_quantiles(x, dim)
-    prec_U, log_det_cov = psd_pinv_decomposed_log_pdet(cov)
-    out = exp.(_logpdf(x, mean, prec_U, log_det_cov))
-    return squeeze_output(out)
-end
-
-logpdf(x, mean, cov) = log.(pdf(x, mean, cov))
\ No newline at end of file
+end
\ No newline at end of file

From 7822a7c9fd60d9f44a1bec64f7a04e7f2311424f Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 22:23:08 +0900
Subject: [PATCH 06/46] [test]update test

deleted some utility functions
---
 test/hdbscan.jl | 39 ++++++++-------------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index 8469e506..cc1441c0 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -1,34 +1,7 @@
 using Test
-include("../src/hdbscan.jl")
+using Clustering
 
 @testset "HDBSCAN" begin
-    # test for util functions
-    # erf function tests
-    @test erf(0) == 0
-    @test erf(-0) == -0
-    @test isapprox(erf(1.0), 0.8427007929497148)
-    @test isapprox(erf(-1.0), -0.8427007929497148)
-    @test isapprox(erf(2.4), 0.999311486103355)
-    @test isapprox(erf(-2.4), -0.999311486103355)
-
-    # test for HDBSCANGraph
-    graph = HDBSCANGraph(4)
-    add_edge(graph, (1, 3, 0.1))
-    @test graph[1][1] == (3, 0.1) && graph[3][1] == (1, 0.1)
-
-    # test for UnionFind
-    uf = UnionFind(10)
-    @test group(uf, 1) == 1
-    @test sort(members(uf, 1)) == [1]
-    @test root(uf, 3) == 3
-    @test issame(uf, 1, 2) == false
-    @test size(uf, 1) == 1
-    @test unite!(uf, 1, 2) == true
-    @test issame(uf, 1, 2) == true
-    @test size(uf, 1) == 2
-    @test size(uf, 2) == 2
-    @test unite!(uf, 1, 2) == false
-    
     # make moons for test
     upper_x = [i for i in 0:π/50:π]
     lower_x = [i for i in π/2:π/50:3/2*π]
@@ -36,7 +9,11 @@ include("../src/hdbscan.jl")
     lower_y = cos.(lower_x) .+ rand(51)./10
     data = hcat([lower_x; upper_x], [lower_y; upper_y])
     #test for main function
-    @test_throws DomainError hdbscan!(data, 5, 0)
-    @test_throws ArgumentError hdbscan!(data, 5, 3; gen_mst=false)
-    @test_nowarn result = hdbscan!(data, 5, 3)
+    @test_throws DomainError hdbscan(data, 5, 0)
+    @test_throws ArgumentError hdbscan(data, 5, 3; gen_mst=false)
+    @test_nowarn result = hdbscan(data, 5, 3)
+
+    # tests for result
+    result = hdbscan(data, 5, 3)
+    @test sum([length(c) for c in result.clusters]) == 101
 end
\ No newline at end of file

From 8901cfe9d14e49fdbd3a83be3a9e1f9c068f9b45 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 23:14:10 +0900
Subject: [PATCH 07/46] [docs]add docs for HDBSCAN

Documentation workflow failed due to the docs for it
---
 docs/make.jl           |  1 +
 docs/source/hdbscan.md | 11 +++++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 docs/source/hdbscan.md

diff --git a/docs/make.jl b/docs/make.jl
index 556f471b..5455a1b8 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,6 +16,7 @@ makedocs(
             "mcl.md",
             "affprop.md",
             "dbscan.md",
+            "hdbscan.md",
             "fuzzycmeans.md",
         ],
         "validate.md",
diff --git a/docs/source/hdbscan.md b/docs/source/hdbscan.md
new file mode 100644
index 00000000..b61ce055
--- /dev/null
+++ b/docs/source/hdbscan.md
@@ -0,0 +1,11 @@
+# HDBSCAN
+
+Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters.
+
+## Interface
+The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function 
+```@docs
+hdbscan
+HdbscanResult
+HdbscanCluster
+```
\ No newline at end of file

From 6de5d02907cba73c434fedd5c71bb5bf058f7168 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 23 Mar 2024 23:21:58 +0900
Subject: [PATCH 08/46] [fix]export HdbscanCluster

I forgot to export it
---
 src/Clustering.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Clustering.jl b/src/Clustering.jl
index a6540c47..d76d91e9 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -41,7 +41,7 @@ module Clustering
     DbscanResult, DbscanCluster, dbscan,
 
     # hdbscan
-    HdbscanResult, hdbscan,
+    HdbscanResult, HdbscanCluster, hdbscan,
 
     # fuzzy_cmeans
     fuzzy_cmeans, FuzzyCMeansResult,

From 85c5644a823cf158dd2e465e37964d62021190ea Mon Sep 17 00:00:00 2001
From: MommawATASU <ascendwatson@gmail.com>
Date: Mon, 15 Apr 2024 08:54:59 +0900
Subject: [PATCH 09/46] [docs]merge docs of HDBSCAN with DBSCAN.md

there is no need to create new file
---
 docs/source/dbscan.md  | 12 ++++++++++++
 docs/source/hdbscan.md | 11 -----------
 2 files changed, 12 insertions(+), 11 deletions(-)
 delete mode 100644 docs/source/hdbscan.md

diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md
index b2c31b61..61e5ef73 100644
--- a/docs/source/dbscan.md
+++ b/docs/source/dbscan.md
@@ -46,3 +46,15 @@ dbscan
 DbscanResult
 DbscanCluster
 ```
+
+# HDBSCAN
+
+Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters.
+
+## Interface
+The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function 
+```@docs
+hdbscan
+HdbscanResult
+HdbscanCluster
+```
diff --git a/docs/source/hdbscan.md b/docs/source/hdbscan.md
deleted file mode 100644
index b61ce055..00000000
--- a/docs/source/hdbscan.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# HDBSCAN
-
-Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters.
-
-## Interface
-The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function 
-```@docs
-hdbscan
-HdbscanResult
-HdbscanCluster
-```
\ No newline at end of file

From edcc70ab90bf190a7970dab6e06bf2765f218862 Mon Sep 17 00:00:00 2001
From: MommawATASU <ascendwatson@gmail.com>
Date: Mon, 15 Apr 2024 09:10:55 +0900
Subject: [PATCH 10/46] [clean]refactoring HDBSCANGraph

add comment and alias
---
 src/hdbscan.jl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index f73ba676..beb0ba29 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -1,6 +1,10 @@
+# HDBSCAN Graph
+# edge[i] is a list of edges adjacent to the i-th vertex?
+# the second element of HDBSCANEdge is the mutual reachability distance.
+HDBSCANEdge = Tuple{Int, Float64}
 struct HDBSCANGraph
-    edges::Vector{Vector{Tuple{Int, Float64}}}
-    HDBSCANGraph(n) = new([Tuple{Int, Float64}[] for _ in 1 : n])
+    edges::Vector{Vector{HDBSCANEdge}}
+    HDBSCANGraph(n) = new([HDBSCANEdge[] for _ in 1 : n])
 end
 
 function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64})

From 939ce652c56cfb170720d0c88d431bec3812fc3c Mon Sep 17 00:00:00 2001
From: MommawATASU <ascendwatson@gmail.com>
Date: Fri, 26 Apr 2024 15:24:03 +0900
Subject: [PATCH 11/46] [docs]fix docs

hdbscan.md remains
---
 docs/make.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/make.jl b/docs/make.jl
index 5455a1b8..556f471b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,7 +16,6 @@ makedocs(
             "mcl.md",
             "affprop.md",
             "dbscan.md",
-            "hdbscan.md",
             "fuzzycmeans.md",
         ],
         "validate.md",

From 2f67d07375b3f0e682fb793b74ee86d2bd764567 Mon Sep 17 00:00:00 2001
From: MommawATASU <ascendwatson@gmail.com>
Date: Fri, 26 Apr 2024 15:24:54 +0900
Subject: [PATCH 12/46] [clean]refactoring

the progress is temporary
---
 src/hdbscan.jl | 62 ++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index beb0ba29..004675db 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -1,20 +1,26 @@
 # HDBSCAN Graph
-# edge[i] is a list of edges adjacent to the i-th vertex?
+# edge[i] is a list of edges adjacent to the i-th vertex
 # the second element of HDBSCANEdge is the mutual reachability distance.
 HDBSCANEdge = Tuple{Int, Float64}
 struct HDBSCANGraph
     edges::Vector{Vector{HDBSCANEdge}}
-    HDBSCANGraph(n) = new([HDBSCANEdge[] for _ in 1 : n])
+    HDBSCANGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv])
 end
 
-function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64})
-    i, j, c = v
-    push!(G.edges[i], (j, c))
-    push!(G.edges[j], (i, c))
+function add_edge!(G::HDBSCANGraph, v1::Integer, v2::Integer, dist::Number)
+    push!(G.edges[v1], (v2, dist))
+    push!(G.edges[v2], (v1, dist))
 end
 
 Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]
 
+struct MSTEdge
+    v1::Integer
+    v2::Integer
+    dist::Number
+end
+expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist)
+
 """
     HdbscanCluster(..., points, stability, ...)
 - `points`: vector of points which belongs to the cluster
@@ -29,20 +35,17 @@ mutable struct HdbscanCluster
     λp::Vector{Float64}
     stability::Float64
     children_stability::Float64
-    function HdbscanCluster(noise::Bool, points::Vector{Int})
-        if noise
-            return new(0, [], [], [], -1, -1)
-        else
-            return new(0, [], points, [], 0, 0)
-        end
+    function HdbscanCluster(points::Union{Vector{Int}, Nothing})
+        noise = isnothing(points)
+        return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
     end
 end
 
 Base.length(c::HdbscanCluster) = size(c.points, 1)
 isnoise(c::HdbscanCluster) = c.stability == -1
-hasstability(c::HdbscanCluster) = c.stability != 0
-function compute_stability(c::HdbscanCluster, λbirth)
-    c.stability += sum(c.λp.-λbirth)
+isstable(c::HdbscanCluster) = c.stability != 0
+function increment_stability(c::HdbscanCluster, λbirth)
+    c.stability += sum(c.λp) - length(c.λ) * λbirth
 end
 
 """
@@ -51,10 +54,9 @@ end
 - `min_cluster_size`: minimum number of points in the cluster
 - `clusters`: result vector of clusters
 """
-mutable struct HdbscanResult
-    k::Int
-    min_cluster_size::Int
+struct HdbscanResult
     clusters::Vector{HdbscanCluster}
+    assignments::Vector{Int}
 end
 
 """
@@ -105,21 +107,21 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst:
     end
     push!(result, HdbscanCluster(true, Int[]))
     result[end].points = findall(x->x==-1, noise_points)
-    return HdbscanResult(k, min_cluster_size, result)
+    return HdbscanResult(result, )
 end
 
-function core_dist(points, k)
+# calculate the core distances of the points
+function core_distances(points::AbstractMatrix, k::Integer)
     core_dists = Array{Float64}(undef, size(points, 1))
-    for i in 1 : size(points, 1)
-        p = points[i:i, :]
-        dists = vec(sum((@. (points - p)^2), dims=2))
+    for i in 1:size(points, 1)
+        dists = pairwise(Euclidean(), points[i:i, :], points; dims=2)
         sort!(dists)
         core_dists[i] = dists[k]
     end
     return core_dists
 end
 
-function mutual_reachability_distance(core_dists, points)
+function hdbscan_graph(core_dists::AbstractVector, points::AbstractMatrix)
     n = size(points, 1)
     graph = HDBSCANGraph(div(n * (n-1), 2))
     for i in 1 : n-1
@@ -131,30 +133,30 @@ function mutual_reachability_distance(core_dists, points)
     return graph
 end
 
-function prim(graph, n)
+function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer)
     minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1)
     
     marked = falses(n)
     marked_cnt = 1
     marked[1] = true
     
-    h = []
+    h = MSTEdge[]
     
     for (i, c) in graph[1]
-        heappush!(h, (c, 1, i))
+        heappush!(h, MSTEdge(1, i, c))
     end
     
     while marked_cnt < n
-        c, i, j = popfirst!(h)
+        i, j, c = expand(popfirst!(h))
         
         marked[j] == true && continue
-        minimum_spanning_tree[marked_cnt] = (c, i, j)
+        minimum_spanning_tree[marked_cnt] = MSTEdge(i, j, c)
         marked[j] = true
         marked_cnt += 1
         
         for (k, c) in graph[j]
             marked[k] == true && continue
-            heappush!(h, (c, j, k))
+            heappush!(h, MSTEdge(j, k, c))
         end
     end
     return minimum_spanning_tree

From 61463f2d88c6f1af05b8ba66ea60cc4b28bfccf1 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 20:42:18 +0900
Subject: [PATCH 13/46] [clean]refactoring

add comment and improve performance, etc.
---
 src/hdbscan.jl | 161 ++++++++++++++++++++++++++-----------------------
 1 file changed, 85 insertions(+), 76 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 004675db..7bf6686f 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -20,6 +20,7 @@ struct MSTEdge
     dist::Number
 end
 expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist)
+Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist
 
 """
     HdbscanCluster(..., points, stability, ...)
@@ -45,7 +46,7 @@ Base.length(c::HdbscanCluster) = size(c.points, 1)
 isnoise(c::HdbscanCluster) = c.stability == -1
 isstable(c::HdbscanCluster) = c.stability != 0
 function increment_stability(c::HdbscanCluster, λbirth)
-    c.stability += sum(c.λp) - length(c.λ) * λbirth
+    c.stability += sum(c.λp) - length(c.λp) * λbirth
 end
 
 """
@@ -76,25 +77,22 @@ The detail is so complex it is difficult to explain the detail in here. But, if
 - `gen_mst`: whether to generate minimum-spannig-tree or not
 - `mst`: when is specified and `gen_mst` is false, new mst won't be generated
 """
-function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
+function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=SqEuclidean())
     if min_cluster_size < 1
         throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
     end
-    n = size(points, 1)
-    if gen_mst
-        #calculate core distances for each point
-        core_dists = core_dist(points, k)
-        #calculate mutual reachability distance between any two points
-        mrd = mutual_reachability_distance(core_dists, points)
-        #compute a minimum spanning tree by prim method
-        mst = prim(mrd, n)
-    elseif mst == nothing
-        throw(ArgumentError("if you set `gen_mst` to false, you must pass a minimum spanning tree as `mst`"))
-    end
+    n = size(points, 2)
+    dists = pairwise(metric, points; dims=2)
+    #calculate core distances for each point
+    core_dists = core_distances(dists, k)
+    #calculate mutual reachability distance between any two points
+    mrd = hdbscan_graph(core_dists, dists)
+    #compute a minimum spanning tree by prim method
+    mst = hdbscan_minspantree(mrd, n)
     #build a HDBSCAN hierarchy
-    hierarchy = build_hierarchy(mst, min_cluster_size)
+    hierarchy = hdbscan_clusters(mst, min_cluster_size)
     #extract the target cluster
-    extract_cluster!(hierarchy)
+    prune_cluster!(hierarchy)
     #generate the list of cluster assignment for each point
     result = HdbscanCluster[]
     noise_points = fill(-1, n)
@@ -105,125 +103,127 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst:
             noise_points[k] = 0
         end
     end
-    push!(result, HdbscanCluster(true, Int[]))
+    push!(result, HdbscanCluster(Int[]))
     result[end].points = findall(x->x==-1, noise_points)
-    return HdbscanResult(result, )
+    assignments = Array{Int}(undef, length(points))
+    for i in 1 : length(result)-1
+        assignments[result[i].points] = i
+    end
+    assignments[result[end].points] .= -1
+    return HdbscanResult(result, assignments)
 end
 
 # calculate the core distances of the points
-function core_distances(points::AbstractMatrix, k::Integer)
-    core_dists = Array{Float64}(undef, size(points, 1))
-    for i in 1:size(points, 1)
-        dists = pairwise(Euclidean(), points[i:i, :], points; dims=2)
-        sort!(dists)
-        core_dists[i] = dists[k]
+function core_distances(dists::AbstractMatrix, k::Integer)
+    core_dists = Array{Float64}(undef, size(dists, 1))
+    for i in axes(dists, 1)
+        dist = sort(dists[i, :])
+        core_dists[i] = dist[k]
     end
     return core_dists
 end
 
-function hdbscan_graph(core_dists::AbstractVector, points::AbstractMatrix)
-    n = size(points, 1)
+function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
+    n = size(dists, 1)
     graph = HDBSCANGraph(div(n * (n-1), 2))
     for i in 1 : n-1
         for j in i+1 : n
-            c = max(core_dists[i], core_dists[j], sum((points[i, :]-points[j, :]).^2))
-            add_edge(graph, (i, j, c))
+            c = max(core_dists[i], core_dists[j], dists[i, j])
+            add_edge!(graph, i, j, c)
         end
     end
     return graph
 end
 
 function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer)
-    minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1)
+    function heapput!(h, v)
+        idx = searchsortedlast(h, v, rev=true)
+        insert!(h, (idx != 0) ? idx : 1, v)
+    end
+
+    minspantree = Vector{MSTEdge}(undef, n-1)
     
     marked = falses(n)
-    marked_cnt = 1
+    nmarked = 1
     marked[1] = true
     
     h = MSTEdge[]
     
     for (i, c) in graph[1]
-        heappush!(h, MSTEdge(1, i, c))
+        heapput!(h, MSTEdge(1, i, c))
     end
     
-    while marked_cnt < n
-        i, j, c = expand(popfirst!(h))
+    while nmarked < n
+        i, j, c = expand(pop!(h))
         
-        marked[j] == true && continue
-        minimum_spanning_tree[marked_cnt] = MSTEdge(i, j, c)
+        marked[j] && continue
+        minspantree[nmarked] = MSTEdge(i, j, c)
         marked[j] = true
-        marked_cnt += 1
+        nmarked += 1
         
         for (k, c) in graph[j]
-            marked[k] == true && continue
-            heappush!(h, MSTEdge(j, k, c))
+            marked[k] && continue
+            heapput!(h, MSTEdge(j, k, c))
         end
     end
-    return minimum_spanning_tree
+    return minspantree
 end
 
-function build_hierarchy(mst, min_size)
+function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer)
     n = length(mst) + 1
     cost = 0
     uf = UnionFind(n)
-    Hierarchy = Array{HdbscanCluster}(undef, 2n-1)
-    if min_size == 1
-        for i in 1 : n
-            Hierarchy[i] = HdbscanCluster(false, [i])
-        end
-    else
-        for i in 1 : n
-            Hierarchy[i] = HdbscanCluster(true, Int[])
-        end
-    end
+    clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
     sort!(mst)
     
     for i in 1 : n-1
-        c, j, k = mst[i]
+        j, k, c = expand(mst[i])
         cost += c
         λ = 1 / cost
         #child clusters
         c1 = group(uf, j)
         c2 = group(uf, k)
         #reference to the parent cluster
-        Hierarchy[c1].parent = Hierarchy[c2].parent = n+i
-        nc1, nc2 = isnoise(Hierarchy[c1]), isnoise(Hierarchy[c2])
+        println(c1, c2, n+i)
+        clusters[c1].parent = clusters[c2].parent = n+i
+        nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2])
         if !(nc1 || nc2)
             #compute stability
-            compute_stability(Hierarchy[c1], λ)
-            compute_stability(Hierarchy[c2], λ)
+            increment_stability(clusters[c1], λ)
+            increment_stability(clusters[c2], λ)
             #unite cluster
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
-            Hierarchy[n+i] = HdbscanCluster(false, points)
+            push!(clusters, HdbscanCluster(points))
         elseif !(nc1 && nc2)
             if nc2 == true
                 (c1, c2) = (c2, c1)
             end
             #record the lambda value
-            append!(Hierarchy[c2].λp, fill(λ, length(Hierarchy[c1])))
+            append!(clusters[c2].λp, fill(λ, length(clusters[c1])))
             #unite cluster
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
-            Hierarchy[n+i] = HdbscanCluster(false, points)
+            push!(clusters, HdbscanCluster(points))
         else
             #unite the noise cluster
             unite!(uf, j, k)
             #create parent cluster
             points = members(uf, group(uf, j))
             if length(points) < min_size
-                Hierarchy[n+i] = HdbscanCluster(true, Int[])
+                push!(clusters, HdbscanCluster(Int[]))
             else
-                Hierarchy[n+i] = HdbscanCluster(false, points)
+                push!(clusters, HdbscanCluster(points))
             end
         end
     end
-    return Hierarchy
+    @assert length(clusters) == 2n - 1
+    return clusters
 end
 
-function extract_cluster!(hierarchy::Vector{HdbscanCluster})
+function prune_cluster!(hierarchy::Vector{HdbscanCluster})
     for i in 1 : length(hierarchy)-1
         if isnoise(hierarchy[i])
             c = hierarchy[i]
@@ -235,6 +235,7 @@ function extract_cluster!(hierarchy::Vector{HdbscanCluster})
                 push!(hierarchy[c.parent].children, i)
                 hierarchy[c.parent].children_stability += c.stability
             else
+                println(c.parent, i)
                 append!(hierarchy[c.parent].children, c.children)
                 hierarchy[c.parent].children_stability += c.children_stability
             end
@@ -245,27 +246,34 @@ end
 # Below are utility functions for building hierarchical trees
 heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)
 
-mutable struct UnionFind{T <: Integer}
-    parent:: Vector{T}  # parent[root] is the negative of the size
+# Union-Find
+# structure for managing disjoint sets
+# This structure tracks which sets the elements of a set belong to,
+# and allows us to efficiently determine whether two elements belong to the same set.
+mutable struct UnionFind
+    parent:: Vector{Integer}  # parent[root] is the negative of the size
     label::Dict{Int, Int}
-    cnt::Int
+    next_id::Int
 
-    function UnionFind{T}(nodes::T) where T<:Integer
+    function UnionFind(nodes::Integer)
         if nodes <= 0
             throw(ArgumentError("invalid argument for nodes: $nodes"))
         end
 
-        parent = -ones(T, nodes)
+        parent = -ones(nodes)
         label = Dict([(i, i) for i in 1 : nodes])
-        new{T}(parent, label, nodes)
+        new(parent, label, nodes)
     end
 end
 
-UnionFind(nodes::Integer) = UnionFind{typeof(nodes)}(nodes)
-group(uf::UnionFind, x)::Int = uf.label[root(uf, x)]
+# label of the set which element `x` belong to
+group(uf::UnionFind, x) = uf.label[root(uf, x)]
+# all elements that have the specified label
 members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label)))
 
-function root(uf::UnionFind{T}, x::T)::T where T<:Integer
+# root of element `x`
+# The root is the representative element of the set
+function root(uf::UnionFind, x::Integer)
     if uf.parent[x] < 0
         return x
     else
@@ -273,15 +281,16 @@ function root(uf::UnionFind{T}, x::T)::T where T<:Integer
     end
 end
 
-function issame(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
+# whether element `x` and `y` belong to the same set
+function issame(uf::UnionFind, x::Integer, y::Integer)
     return root(uf, x) == root(uf, y)
 end
 
-function Base.size(uf::UnionFind{T}, x::T)::T where T<:Integer
+function Base.size(uf::UnionFind, x::Integer)
     return -uf.parent[root(uf, x)]
 end
 
-function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
+function unite!(uf::UnionFind, x::Integer, y::Integer)
     x = root(uf, x)
     y = root(uf, y)
     if x == y
@@ -293,10 +302,10 @@ function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
     # unite smaller tree(y) to bigger one(x)
     uf.parent[x] += uf.parent[y]
     uf.parent[y] = x
-    uf.cnt += 1
-    uf.label[y] = uf.cnt
+    uf.next_id += 1
+    uf.label[y] = uf.next_id
     for i in members(uf, group(uf, x))
-        uf.label[i] = uf.cnt
+        uf.label[i] = uf.next_id
     end
     return true
 end
\ No newline at end of file

From 09ed174053adf7a7f3a734ae2a44d606c5f213fd Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 20:44:41 +0900
Subject: [PATCH 14/46] [test]update test

changes sugges alyst
---
 test/hdbscan.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index cc1441c0..0702b278 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -7,13 +7,13 @@ using Clustering
     lower_x = [i for i in π/2:π/50:3/2*π]
     upper_y = sin.(upper_x) .+ rand(50)./10
     lower_y = cos.(lower_x) .+ rand(51)./10
-    data = hcat([lower_x; upper_x], [lower_y; upper_y])
+    data = hcat([lower_x; upper_x], [lower_y; upper_y])'
     #test for main function
     @test_throws DomainError hdbscan(data, 5, 0)
-    @test_throws ArgumentError hdbscan(data, 5, 3; gen_mst=false)
     @test_nowarn result = hdbscan(data, 5, 3)
 
     # tests for result
-    result = hdbscan(data, 5, 3)
-    @test sum([length(c) for c in result.clusters]) == 101
+    result = @inferred(hdbscan(data, 5, 3))
+    @test isa(result, HdbscanResult)
+    @test sum(length, result.clusters) == 101
 end
\ No newline at end of file

From 6039c0cb9fe4aabf088fb7d9e120d473185f4952 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 21:01:18 +0900
Subject: [PATCH 15/46] [fix]change isnothing into ===

error occured with Julia1.10
---
 src/hdbscan.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 7bf6686f..8f6afde9 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -37,7 +37,7 @@ mutable struct HdbscanCluster
     stability::Float64
     children_stability::Float64
     function HdbscanCluster(points::Union{Vector{Int}, Nothing})
-        noise = isnothing(points)
+        noise = points === nothing
         return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
     end
 end

From 3cc76897449466485332536b95ff3c7fbfc5238a Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 21:06:39 +0900
Subject: [PATCH 16/46] [fix]remove println

forgot to remove debugging code
---
 src/hdbscan.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 8f6afde9..5ab4c829 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -184,7 +184,6 @@ function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer)
         c1 = group(uf, j)
         c2 = group(uf, k)
         #reference to the parent cluster
-        println(c1, c2, n+i)
         clusters[c1].parent = clusters[c2].parent = n+i
         nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2])
         if !(nc1 || nc2)
@@ -235,7 +234,6 @@ function prune_cluster!(hierarchy::Vector{HdbscanCluster})
                 push!(hierarchy[c.parent].children, i)
                 hierarchy[c.parent].children_stability += c.stability
             else
-                println(c.parent, i)
                 append!(hierarchy[c.parent].children, c.children)
                 hierarchy[c.parent].children_stability += c.children_stability
             end

From 1798148d103df298cc30b933fc9844b83ec76862 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 23:22:02 +0900
Subject: [PATCH 17/46] [docs]update docs

add description for detail
---
 docs/source/dbscan.md | 35 +++++++++++++++++++++++++++++++-
 src/hdbscan.jl        | 47 ++++++++++++++++++++++---------------------
 2 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md
index 61e5ef73..056a5ed7 100644
--- a/docs/source/dbscan.md
+++ b/docs/source/dbscan.md
@@ -49,7 +49,40 @@ DbscanCluster
 
 # HDBSCAN
 
-Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters.
+Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. The algorithm was proposed in:
+
+> Ricardo J. G. B. Campello, Davoud Moulavi & Joerg Sander 
+> *Density-Based Clustering Based on Hierarchical Density Estimates* 2013
+
+## Algorithm
+The main procedure of HDBSCAN:
+1. calculate the *mutual reachability distance*
+2. generate a *minimum spanning tree*
+3. build hierarchy
+4. extract the target cluster
+
+### Calculate the *mutual reachability distance*
+DBSCAN counts the number of points at a certain distance. But, HDBSCAN uses the opposite way, First, calculate the pairwise distances. Next, defined the core distance(``core_{ncore}(p)``) as a distance with the ncore-th closest point. And then, the mutual reachability distance between point `a` and `b` is defined as: ``d_{mreach-ncore}(a, b) = max\{ core_{ncore}(a), core_{ncore}(b), d(a, b) \}`` where ``d(a, b)`` is a euclidean distance between `a` and `b`(or specified metric).
+
+### Generate a *minimum-spanning tree(MST)*
+Conceptually what we will do is the following: consider the data as a weighted graph with the data points as vertices and an edge between any two points with weight equal to the mutual reachability distance of those points.
+
+Then, check the list of edges in descending order of distance whether the points which is connected by it is marked or not. If not, add the edge into the MST. After this processing, all data points are connected to MST with minimum weight.
+
+In practice, this is very expensive since there are ``n^{2}`` edges. 
+
+### Build hierarchy
+The next step is to build hierarchy based on MST. This step is very simple: repeatedly unite the clusters which has the minimum edge until all cluster are united into one cluster.
+
+### Extract the target cluster
+First we need a different measure than distance to consider the persistence of clusters; instead we will use ``\lambda = \frac{1}{\mathrm{distance}}``.
+For a given cluster we can then define values ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` to be the lambda value when the cluster split off and became it’s own cluster, and the lambda value (if any) when the cluster split into smaller clusters respectively.
+In turn, for a given cluster, for each point ``p`` in that cluster we can define the value ``\lambda_{p}`` as the lambda value at which that point ‘fell out of the cluster’ which is a value somewhere between ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` since the point either falls out of the cluster at some point in the cluster’s lifetime, or leaves the cluster when the cluster splits into two smaller clusters. 
+Now, for each cluster compute the stability as
+
+``\sum_{p \in \mathrm{cluster}} (\lambda_{p} - \lambda_{\mathrm{birth}})``.
+
+Declare all leaf nodes to be selected clusters. Now work up through the tree (the reverse topological sort order). If the sum of the stabilities of the child clusters is greater than the stability of the cluster, then we set the cluster stability to be the sum of the child stabilities. If, on the other hand, the cluster’s stability is greater than the sum of its children then we declare the cluster to be a selected cluster and unselect all its descendants. Once we reach the root node we call the current set of selected clusters our flat clustering and return it as the result of clustering.
 
 ## Interface
 The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function 
diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 5ab4c829..8cef620c 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -23,11 +23,14 @@ expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist)
 Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist
 
 """
-    HdbscanCluster(..., points, stability, ...)
-- `points`: vector of points which belongs to the cluster
-- `stability`: stablity of the cluster(-1 for noise clusters)
+    HdbscanCluster
+A cluster generated by the [hdbscan](@ref) method, part of [HdbscanResult](@ref).
+  - `points`: vector of points which belongs to the cluster
+  - `stability`: stability of the cluster (-1 for noise clusters)
 
-You can use `length` to know the number of pooints in the cluster. And `isnoise` funciton is also available to know whether the cluster is noise or not.
+The stability represents how much the cluster is "reasonable". So, a cluster which has a bigger stability is better.
+The noise cluster is determined not to belong to any cluster. So, you can ignore them.
+See also: [isnoise](@ref)
 """
 mutable struct HdbscanCluster
     parent::Int
@@ -43,6 +46,10 @@ mutable struct HdbscanCluster
 end
 
 Base.length(c::HdbscanCluster) = size(c.points, 1)
+"""
+    isnoise(c::HdbscanCluster)
+This function returns whether the cluster is the noise or not.
+"""
 isnoise(c::HdbscanCluster) = c.stability == -1
 isstable(c::HdbscanCluster) = c.stability != 0
 function increment_stability(c::HdbscanCluster, λbirth)
@@ -50,10 +57,10 @@ function increment_stability(c::HdbscanCluster, λbirth)
 end
 
 """
-    HdbscanResult(k, minpts, clusters)
-- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
-- `min_cluster_size`: minimum number of points in the cluster
-- `clusters`: result vector of clusters
+    HdbscanResult 
+Result of the [hdbscan](@ref) clustering.
+- `clusters`: vector of clusters
+- `assignments`: vectors of assignments for each points
 """
 struct HdbscanResult
     clusters::Vector{HdbscanCluster}
@@ -61,23 +68,17 @@ struct HdbscanResult
 end
 
 """
-    hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
-Density-Based Clustering Based on Hierarchical Density Estimates.
-This algorithm performs clustering as follows.
-1. generate a minimum spanning tree
-2. build a HDBSCAN hierarchy
-3. extract the target cluster
-4. generate the list of cluster assignment for each point
-The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html).
-
+    hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Integer;
+            metric=SqEuclidean())
+Cluster `points` using Density-Based Clustering Based on Hierarchical Density Estimates (HDBSCAN) algorithm.
+Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details on how the algorithm works.
 # Parameters
-- `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point
-- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A.
-- `min_cluster_size`: minimum number of points in the cluster
-- `gen_mst`: whether to generate minimum-spannig-tree or not
-- `mst`: when is specified and `gen_mst` is false, new mst won't be generated
+- `points`: the *d*×*n* matrix, where each column is a *d*-dimensional point
+- `ncore::Integer`: number of *core* neighbors of point, see [HDBSCAN algorithm](@ref hdbscan_algorithm) for a description
+- `min_cluster_size::Integer`: minimum number of points in the cluster
+- `metric`(defaults to Euclidean): the points distance metric to use.
 """
-function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=SqEuclidean())
+function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=Euclidean())
     if min_cluster_size < 1
         throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
     end

From a0a819ea89914f1fd71de027d703c9e482db3381 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 23:29:05 +0900
Subject: [PATCH 18/46] [docs]fix docs

fix links
---
 docs/source/dbscan.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md
index 056a5ed7..147872c1 100644
--- a/docs/source/dbscan.md
+++ b/docs/source/dbscan.md
@@ -54,7 +54,7 @@ Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN
 > Ricardo J. G. B. Campello, Davoud Moulavi & Joerg Sander 
 > *Density-Based Clustering Based on Hierarchical Density Estimates* 2013
 
-## Algorithm
+## [Algorithm](@id hdbscan_algorithm)
 The main procedure of HDBSCAN:
 1. calculate the *mutual reachability distance*
 2. generate a *minimum spanning tree*
@@ -90,4 +90,5 @@ The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) funct
 hdbscan
 HdbscanResult
 HdbscanCluster
+isnoise
 ```

From bf38eb6f475b164edf49039ea01782955e819240 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 23:43:52 +0900
Subject: [PATCH 19/46] [fix]fix docstring

add space
---
 src/hdbscan.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 8cef620c..0e46bcc8 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -46,6 +46,7 @@ mutable struct HdbscanCluster
 end
 
 Base.length(c::HdbscanCluster) = size(c.points, 1)
+
 """
     isnoise(c::HdbscanCluster)
 This function returns whether the cluster is the noise or not.

From 380acf17d2e31146cb1918d428537e43fb8be930 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sat, 27 Apr 2024 23:48:13 +0900
Subject: [PATCH 20/46] [fix]add isnoise to the list of exprted function

make isnoise available for user
---
 src/Clustering.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Clustering.jl b/src/Clustering.jl
index d76d91e9..a97cba89 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -41,7 +41,7 @@ module Clustering
     DbscanResult, DbscanCluster, dbscan,
 
     # hdbscan
-    HdbscanResult, HdbscanCluster, hdbscan,
+    HdbscanResult, HdbscanCluster, hdbscan, isnoise,
 
     # fuzzy_cmeans
     fuzzy_cmeans, FuzzyCMeansResult,

From 6fdebe603198290ae45ced877e540f1ebac9640b Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 11:19:41 -0700
Subject: [PATCH 21/46] core_distances() tweaks

---
 src/hdbscan.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 0e46bcc8..2bd8971c 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -115,12 +115,11 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E
     return HdbscanResult(result, assignments)
 end
 
-# calculate the core distances of the points
+# calculate the core (k-th nearest) distances of the points
 function core_distances(dists::AbstractMatrix, k::Integer)
     core_dists = Array{Float64}(undef, size(dists, 1))
-    for i in axes(dists, 1)
-        dist = sort(dists[i, :])
-        core_dists[i] = dist[k]
+    for i in axes(dists, 2)
+        core_dists[i] = partialsort!(dists[:, i], k)
     end
     return core_dists
 end

From ff98806679f2625ddfa050595b78cc4bbfe54b6a Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 11:36:56 -0700
Subject: [PATCH 22/46] Hdbscan: simplify results generation

---
 src/hdbscan.jl | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 2bd8971c..b4db9766 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -96,23 +96,16 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E
     #extract the target cluster
     prune_cluster!(hierarchy)
     #generate the list of cluster assignment for each point
-    result = HdbscanCluster[]
-    noise_points = fill(-1, n)
+    clusters = HdbscanCluster[]
+    assignments = fill(0, length(points)) # cluster index of each point
     for (i, j) in enumerate(hierarchy[2n-1].children)
-        c = hierarchy[j]
-        push!(result, c)
-        for k in c.points
-            noise_points[k] = 0
-        end
-    end
-    push!(result, HdbscanCluster(Int[]))
-    result[end].points = findall(x->x==-1, noise_points)
-    assignments = Array{Int}(undef, length(points))
-    for i in 1 : length(result)-1
-        assignments[result[i].points] = i
+        clu = hierarchy[j]
+        push!(clusters, clu)
+        assignments[clu.points] .= i
     end
-    assignments[result[end].points] .= -1
-    return HdbscanResult(result, assignments)
+    # add the cluster of all unassigned (noise) points
+    push!(clusters, HdbscanCluster(findall(==(0), assignments)))
+    return HdbscanResult(clusters, assignments)
 end
 
 # calculate the core (k-th nearest) distances of the points

From 64a202a7659c839f12016ab495f95e8ae1a1826e Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:41:24 -0700
Subject: [PATCH 23/46] remove heappush!: unused

---
 src/hdbscan.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index b4db9766..314ceb07 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -235,9 +235,6 @@ function prune_cluster!(hierarchy::Vector{HdbscanCluster})
     end
 end
 
-# Below are utility functions for building hierarchical trees
-heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)
-
 # Union-Find
 # structure for managing disjoint sets
 # This structure tracks which sets the elements of a set belong to,

From 661edbc5b26409b5d4a684a1714c04d90f6ab548 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:54:18 -0700
Subject: [PATCH 24/46] hdbscan test: small tweaks

---
 test/hdbscan.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index 0702b278..d8f8fc01 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -7,13 +7,14 @@ using Clustering
     lower_x = [i for i in π/2:π/50:3/2*π]
     upper_y = sin.(upper_x) .+ rand(50)./10
     lower_y = cos.(lower_x) .+ rand(51)./10
-    data = hcat([lower_x; upper_x], [lower_y; upper_y])'
+    data = hcat([lower_x; upper_x],
+                [lower_y; upper_y])'
     #test for main function
     @test_throws DomainError hdbscan(data, 5, 0)
-    @test_nowarn result = hdbscan(data, 5, 3)
+    @test_nowarn @inferred(hdbscan(data, 5, 3))
 
     # tests for result
     result = @inferred(hdbscan(data, 5, 3))
     @test isa(result, HdbscanResult)
-    @test sum(length, result.clusters) == 101
+    @test sum(length, result.clusters) == size(data, 2)
 end
\ No newline at end of file

From 082c5e6f77d094576625d5160ae4a9f373c63c36 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:55:15 -0700
Subject: [PATCH 25/46] fixup hdbscan assignments

---
 src/hdbscan.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 314ceb07..9251c7dc 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -97,14 +97,15 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E
     prune_cluster!(hierarchy)
     #generate the list of cluster assignment for each point
     clusters = HdbscanCluster[]
-    assignments = fill(0, length(points)) # cluster index of each point
+    assignments = fill(0, n) # cluster index of each point
     for (i, j) in enumerate(hierarchy[2n-1].children)
         clu = hierarchy[j]
         push!(clusters, clu)
         assignments[clu.points] .= i
     end
     # add the cluster of all unassigned (noise) points
-    push!(clusters, HdbscanCluster(findall(==(0), assignments)))
+    noise_points = findall(==(0), assignments)
+    isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points))
     return HdbscanResult(clusters, assignments)
 end
 

From c9db368c9cc08d61935c8c9f1644b42eff1de7d3 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:55:57 -0700
Subject: [PATCH 26/46] hdbscan: further opt core_dists

we don't really need a function, this is one line operation
---
 src/hdbscan.jl | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 9251c7dc..e5652ca8 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -79,14 +79,15 @@ Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details
 - `min_cluster_size::Integer`: minimum number of points in the cluster
 - `metric`(defaults to Euclidean): the points distance metric to use.
 """
-function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=Euclidean())
+function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; metric=Euclidean())
     if min_cluster_size < 1
         throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
     end
     n = size(points, 2)
     dists = pairwise(metric, points; dims=2)
-    #calculate core distances for each point
-    core_dists = core_distances(dists, k)
+    # calculate core (ncore-th nearest) distance for each point
+    core_dists = [partialsort(i_dists, ncore) for i_dists in eachcol(dists)]
+
     #calculate mutual reachability distance between any two points
     mrd = hdbscan_graph(core_dists, dists)
     #compute a minimum spanning tree by prim method
@@ -109,15 +110,6 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E
     return HdbscanResult(clusters, assignments)
 end
 
-# calculate the core (k-th nearest) distances of the points
-function core_distances(dists::AbstractMatrix, k::Integer)
-    core_dists = Array{Float64}(undef, size(dists, 1))
-    for i in axes(dists, 2)
-        core_dists[i] = partialsort!(dists[:, i], k)
-    end
-    return core_dists
-end
-
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     n = size(dists, 1)
     graph = HDBSCANGraph(div(n * (n-1), 2))

From c205575d66af1d762b693f4fc3acc907ecee8b94 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:56:59 -0700
Subject: [PATCH 27/46] hdbscan: optimize edges generation

---
 src/hdbscan.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index e5652ca8..23881e43 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -113,9 +113,10 @@ end
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     n = size(dists, 1)
     graph = HDBSCANGraph(div(n * (n-1), 2))
-    for i in 1 : n-1
-        for j in i+1 : n
-            c = max(core_dists[i], core_dists[j], dists[i, j])
+    for (i, i_dists) in enumerate(eachcol(dists))
+        i_core = core_dists[i]
+        for j in i+1:n
+            c = max(i_core, core_dists[j], i_dists[j])
             add_edge!(graph, i, j, c)
         end
     end

From bdf1aec3c29234291e5e7a8afa5897e7de45033f Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 13:59:02 -0700
Subject: [PATCH 28/46] HDBSCANGraph -> HdbscanGraph

for consistency
---
 src/hdbscan.jl | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 23881e43..fa0dbf4b 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -2,17 +2,18 @@
 # edge[i] is a list of edges adjacent to the i-th vertex
 # the second element of HDBSCANEdge is the mutual reachability distance.
 HDBSCANEdge = Tuple{Int, Float64}
-struct HDBSCANGraph
+struct HdbscanGraph
     edges::Vector{Vector{HDBSCANEdge}}
-    HDBSCANGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv])
+
+    HdbscanGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv])
 end
 
-function add_edge!(G::HDBSCANGraph, v1::Integer, v2::Integer, dist::Number)
+function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number)
     push!(G.edges[v1], (v2, dist))
     push!(G.edges[v2], (v1, dist))
 end
 
-Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]
+Base.getindex(G::HdbscanGraph, i::Int) = G.edges[i]
 
 struct MSTEdge
     v1::Integer
@@ -112,7 +113,7 @@ end
 
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     n = size(dists, 1)
-    graph = HDBSCANGraph(div(n * (n-1), 2))
+    graph = HdbscanGraph(div(n * (n-1), 2))
     for (i, i_dists) in enumerate(eachcol(dists))
         i_core = core_dists[i]
         for j in i+1:n
@@ -123,7 +124,7 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     return graph
 end
 
-function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer)
+function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
     function heapput!(h, v)
         idx = searchsortedlast(h, v, rev=true)
         insert!(h, (idx != 0) ? idx : 1, v)

From 9aa984122bc5186d4ddb4486f0dddb9053104231 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:01:10 -0700
Subject: [PATCH 29/46] HdbscanEdge

---
 src/hdbscan.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index fa0dbf4b..1dfda402 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -1,11 +1,11 @@
+# HDBSCAN Graph edge: target vertex and mutual reachability distance.
+HdbscanEdge = Tuple{Int, Float64}
+
 # HDBSCAN Graph
-# edge[i] is a list of edges adjacent to the i-th vertex
-# the second element of HDBSCANEdge is the mutual reachability distance.
-HDBSCANEdge = Tuple{Int, Float64}
 struct HdbscanGraph
-    edges::Vector{Vector{HDBSCANEdge}}
+    edges::Vector{Vector{HdbscanEdge}}
 
-    HdbscanGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv])
+    HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv])
 end
 
 function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number)

From c9b9374c0c6996399c4e4e4838af2a85e474d16d Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:05:27 -0700
Subject: [PATCH 30/46] HdbscanGraph: rename edges to adj_edges

and remove Base.getindex()
---
 src/hdbscan.jl | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 1dfda402..780414a8 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -3,18 +3,16 @@ HdbscanEdge = Tuple{Int, Float64}
 
 # HDBSCAN Graph
 struct HdbscanGraph
-    edges::Vector{Vector{HdbscanEdge}}
+    adj_edges::Vector{Vector{HdbscanEdge}}
 
     HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv])
 end
 
 function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number)
-    push!(G.edges[v1], (v2, dist))
-    push!(G.edges[v2], (v1, dist))
+    push!(G.adj_edges[v1], (v2, dist))
+    push!(G.adj_edges[v2], (v1, dist))
 end
 
-Base.getindex(G::HdbscanGraph, i::Int) = G.edges[i]
-
 struct MSTEdge
     v1::Integer
     v2::Integer
@@ -137,8 +135,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
     marked[1] = true
     
     h = MSTEdge[]
-    
-    for (i, c) in graph[1]
+
+    for (i, c) in graph.adj_edges[1]
         heapput!(h, MSTEdge(1, i, c))
     end
     
@@ -149,8 +147,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
         minspantree[nmarked] = MSTEdge(i, j, c)
         marked[j] = true
         nmarked += 1
-        
-        for (k, c) in graph[j]
+
+        for (k, c) in graph.adj_edges[j]
             marked[k] && continue
             heapput!(h, MSTEdge(j, k, c))
         end

From 092ac4017b23a17abb74d71bd16c1b1c98724aa3 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:06:06 -0700
Subject: [PATCH 31/46] MSTEdge remove no-op expand() method

---
 src/hdbscan.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 780414a8..f3dd6745 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -18,7 +18,7 @@ struct MSTEdge
     v2::Integer
     dist::Number
 end
-expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist)
+
 Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist
 
 """
@@ -141,8 +141,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
     end
     
     while nmarked < n
-        i, j, c = expand(pop!(h))
-        
+        i, j, c = pop!(h)
+
         marked[j] && continue
         minspantree[nmarked] = MSTEdge(i, j, c)
         marked[j] = true
@@ -164,7 +164,7 @@ function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer)
     sort!(mst)
     
     for i in 1 : n-1
-        j, k, c = expand(mst[i])
+        j, k, c = mst[i]
         cost += c
         λ = 1 / cost
         #child clusters

From c972caad96a6f2bdaa4cc1297086c3df543a41c1 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:12:49 -0700
Subject: [PATCH 32/46] refactor HdbscanMSTEdge

---
 src/hdbscan.jl | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index f3dd6745..f2ca31cc 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -13,13 +13,8 @@ function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number)
     push!(G.adj_edges[v2], (v1, dist))
 end
 
-struct MSTEdge
-    v1::Integer
-    v2::Integer
-    dist::Number
-end
-
-Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist
+# Edge of the minimum spanning tree for HDBScan algorithm
+HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}}
 
 """
     HdbscanCluster
@@ -124,39 +119,39 @@ end
 
 function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
     function heapput!(h, v)
-        idx = searchsortedlast(h, v, rev=true)
+        idx = searchsortedlast(h, v, by=e -> e.dist, rev=true)
         insert!(h, (idx != 0) ? idx : 1, v)
     end
 
-    minspantree = Vector{MSTEdge}(undef, n-1)
-    
+    minspantree = Vector{HdbscanMSTEdge}(undef, n-1)
+
     marked = falses(n)
     nmarked = 1
     marked[1] = true
-    
-    h = MSTEdge[]
+
+    h = HdbscanMSTEdge[]
 
     for (i, c) in graph.adj_edges[1]
-        heapput!(h, MSTEdge(1, i, c))
+        heapput!(h, (v1=1, v2=i, dist=c))
     end
     
     while nmarked < n
-        i, j, c = pop!(h)
+        (i, j, c) = pop!(h)
 
         marked[j] && continue
-        minspantree[nmarked] = MSTEdge(i, j, c)
+        minspantree[nmarked] = (v1=i, v2=j, dist=c)
         marked[j] = true
         nmarked += 1
 
         for (k, c) in graph.adj_edges[j]
             marked[k] && continue
-            heapput!(h, MSTEdge(j, k, c))
+            heapput!(h, (v1=j, v2=k, dist=c))
         end
     end
     return minspantree
 end
 
-function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer)
+function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
     n = length(mst) + 1
     cost = 0
     uf = UnionFind(n)

From 57b05e6c92408081132fc55fd537a5651cadd909 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:27:02 -0700
Subject: [PATCH 33/46] hdbscan: fix graph vertices, remove add_edges!

add_edges!() is used only once
---
 src/hdbscan.jl | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index f2ca31cc..f30ab247 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -8,11 +8,6 @@ struct HdbscanGraph
     HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv])
 end
 
-function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number)
-    push!(G.adj_edges[v1], (v2, dist))
-    push!(G.adj_edges[v2], (v1, dist))
-end
-
 # Edge of the minimum spanning tree for HDBScan algorithm
 HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}}
 
@@ -106,12 +101,14 @@ end
 
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     n = size(dists, 1)
-    graph = HdbscanGraph(div(n * (n-1), 2))
+    graph = HdbscanGraph(n)
     for (i, i_dists) in enumerate(eachcol(dists))
         i_core = core_dists[i]
         for j in i+1:n
             c = max(i_core, core_dists[j], i_dists[j])
-            add_edge!(graph, i, j, c)
+            # add reciprocal edges
+            push!(graph.adj_edges[i], (j, c))
+            push!(graph.adj_edges[j], (i, c))
         end
     end
     return graph

From f6167951857a56684d353dcba8e4ebab72fc6671 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:28:18 -0700
Subject: [PATCH 34/46] hdbscan_minspantree(): refactor

---
 src/hdbscan.jl | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index f30ab247..b1667dda 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -80,7 +80,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     #calculate mutual reachability distance between any two points
     mrd = hdbscan_graph(core_dists, dists)
     #compute a minimum spanning tree by prim method
-    mst = hdbscan_minspantree(mrd, n)
+    mst = hdbscan_minspantree(mrd)
     #build a HDBSCAN hierarchy
     hierarchy = hdbscan_clusters(mst, min_cluster_size)
     #extract the target cluster
@@ -114,35 +114,36 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     return graph
 end
 
-function hdbscan_minspantree(graph::HdbscanGraph, n::Integer)
-    function heapput!(h, v)
+function hdbscan_minspantree(graph::HdbscanGraph)
+    function heapput!(h, v::HdbscanMSTEdge)
         idx = searchsortedlast(h, v, by=e -> e.dist, rev=true)
         insert!(h, (idx != 0) ? idx : 1, v)
     end
 
-    minspantree = Vector{HdbscanMSTEdge}(undef, n-1)
-
-    marked = falses(n)
-    nmarked = 1
-    marked[1] = true
-
-    h = HdbscanMSTEdge[]
-
+    # initialize the edges heap by putting all edges of the first node (the root)
+    heap = HdbscanMSTEdge[]
     for (i, c) in graph.adj_edges[1]
-        heapput!(h, (v1=1, v2=i, dist=c))
+        heapput!(heap, (v1=1, v2=i, dist=c))
     end
-    
-    while nmarked < n
-        (i, j, c) = pop!(h)
 
-        marked[j] && continue
-        minspantree[nmarked] = (v1=i, v2=j, dist=c)
-        marked[j] = true
-        nmarked += 1
+    # build the tree
+    n = length(graph.adj_edges)
+    minspantree = Vector{HdbscanMSTEdge}()
+    inmst = falses(n) # if the graph node is in MST
+    inmst[1] = true # root
+
+    while length(minspantree) < n-1
+        # get the next edge with maximal? distance
+        (i, j, c) = pop!(heap)
+
+        # add j-th node to MST if not there
+        inmst[j] && continue
+        push!(minspantree, (v1=i, v2=j, dist=c))
+        inmst[j] = true
 
+        # add adjacent edges of j to the heap
         for (k, c) in graph.adj_edges[j]
-            marked[k] && continue
-            heapput!(h, (v1=j, v2=k, dist=c))
+            inmst[k] || heapput!(heap, (v1=j, v2=k, dist=c))
         end
     end
     return minspantree

From 0f6e9922e6b7da32b4167f277da7c7b16f75dc4c Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 14:32:41 -0700
Subject: [PATCH 35/46] prune_clusters!(): cleanup

---
 src/hdbscan.jl | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index b1667dda..e4dbebfc 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -84,7 +84,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     #build a HDBSCAN hierarchy
     hierarchy = hdbscan_clusters(mst, min_cluster_size)
     #extract the target cluster
-    prune_cluster!(hierarchy)
+    prune_clusters!(hierarchy)
     #generate the list of cluster assignment for each point
     clusters = HdbscanCluster[]
     assignments = fill(0, n) # cluster index of each point
@@ -202,21 +202,16 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer
     return clusters
 end
 
-function prune_cluster!(hierarchy::Vector{HdbscanCluster})
-    for i in 1 : length(hierarchy)-1
-        if isnoise(hierarchy[i])
-            c = hierarchy[i]
-            push!(hierarchy[c.parent].children, i)
-            hierarchy[c.parent].children_stability += c.stability
+function prune_clusters!(hierarchy::Vector{HdbscanCluster})
+    for i in 1:length(hierarchy)-1
+        c = hierarchy[i]
+        parent = hierarchy[c.parent]
+        if isnoise(c) || c.stability > c.children_stability
+            push!(parent.children, i)
+            parent.children_stability += c.stability
         else
-            c = hierarchy[i]
-            if c.stability > c.children_stability
-                push!(hierarchy[c.parent].children, i)
-                hierarchy[c.parent].children_stability += c.stability
-            else
-                append!(hierarchy[c.parent].children, c.children)
-                hierarchy[c.parent].children_stability += c.children_stability
-            end
+            append!(parent.children, c.children)
+            parent.children_stability += c.children_stability
         end
     end
 end

From e01609bcd35b670a1d615d48c45c3732992e8a86 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 15:08:08 -0700
Subject: [PATCH 36/46] hdbscan: fix 1.0 compat

replace eachcol with alternatives
---
 src/hdbscan.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index e4dbebfc..fb8cf812 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -75,7 +75,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     n = size(points, 2)
     dists = pairwise(metric, points; dims=2)
     # calculate core (ncore-th nearest) distance for each point
-    core_dists = [partialsort(i_dists, ncore) for i_dists in eachcol(dists)]
+    core_dists = [partialsort!(dists[:, i], ncore) for i in axes(dists, 2)]
 
     #calculate mutual reachability distance between any two points
     mrd = hdbscan_graph(core_dists, dists)
@@ -102,7 +102,8 @@ end
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
     n = size(dists, 1)
     graph = HdbscanGraph(n)
-    for (i, i_dists) in enumerate(eachcol(dists))
+    for i in axes(dists, 2)
+        i_dists = view(dists, :, i)
         i_core = core_dists[i]
         for j in i+1:n
             c = max(i_core, core_dists[j], i_dists[j])

From de6b83a47e1067d080f409ba95713cc166b4fd35 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sun, 28 Apr 2024 07:36:02 +0900
Subject: [PATCH 37/46] [docs]fix docstring

add newline
---
 src/hdbscan.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index fb8cf812..39bbfb1d 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -13,6 +13,7 @@ HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}}
 
 """
     HdbscanCluster
+
 A cluster generated by the [hdbscan](@ref) method, part of [HdbscanResult](@ref).
   - `points`: vector of points which belongs to the cluster
   - `stability`: stability of the cluster (-1 for noise clusters)
@@ -38,6 +39,7 @@ Base.length(c::HdbscanCluster) = size(c.points, 1)
 
 """
     isnoise(c::HdbscanCluster)
+
 This function returns whether the cluster is the noise or not.
 """
 isnoise(c::HdbscanCluster) = c.stability == -1
@@ -48,6 +50,7 @@ end
 
 """
     HdbscanResult 
+
 Result of the [hdbscan](@ref) clustering.
 - `clusters`: vector of clusters
 - `assignments`: vectors of assignments for each points
@@ -60,6 +63,7 @@ end
 """
     hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Integer;
             metric=SqEuclidean())
+
 Cluster `points` using Density-Based Clustering Based on Hierarchical Density Estimates (HDBSCAN) algorithm.
 Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details on how the algorithm works.
 # Parameters

From 38212cac3633b2ad50947fb04310f67a2daff873 Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sun, 28 Apr 2024 07:53:44 +0900
Subject: [PATCH 38/46] [clean]rename and refactoring

cleanup UnionFind terminology and move it to the other file
---
 src/Clustering.jl |  1 +
 src/hdbscan.jl    | 75 ++++-------------------------------------------
 src/unionfind.jl  | 58 ++++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 69 deletions(-)
 create mode 100644 src/unionfind.jl

diff --git a/src/Clustering.jl b/src/Clustering.jl
index a97cba89..fd522a78 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -81,6 +81,7 @@ module Clustering
 
     include("utils.jl")
     include("seeding.jl")
+    include("unionfind.jl")
 
     include("kmeans.jl")
     include("kmedoids.jl")
diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 39bbfb1d..0c4477ef 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -29,6 +29,7 @@ mutable struct HdbscanCluster
     λp::Vector{Float64}
     stability::Float64
     children_stability::Float64
+
     function HdbscanCluster(points::Union{Vector{Int}, Nothing})
         noise = points === nothing
         return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
@@ -166,8 +167,8 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer
         cost += c
         λ = 1 / cost
         #child clusters
-        c1 = group(uf, j)
-        c2 = group(uf, k)
+        c1 = set_id(uf, j)
+        c2 = set_id(uf, k)
         #reference to the parent cluster
         clusters[c1].parent = clusters[c2].parent = n+i
         nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2])
@@ -178,7 +179,7 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer
             #unite cluster
             unite!(uf, j, k)
             #create parent cluster
-            points = members(uf, group(uf, j))
+            points = items(uf, set_id(uf, j))
             push!(clusters, HdbscanCluster(points))
         elseif !(nc1 && nc2)
             if nc2 == true
@@ -189,13 +190,13 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer
             #unite cluster
             unite!(uf, j, k)
             #create parent cluster
-            points = members(uf, group(uf, j))
+            points = items(uf, set_id(uf, j))
             push!(clusters, HdbscanCluster(points))
         else
             #unite the noise cluster
             unite!(uf, j, k)
             #create parent cluster
-            points = members(uf, group(uf, j))
+            points = items(uf, set_id(uf, j))
             if length(points) < min_size
                 push!(clusters, HdbscanCluster(Int[]))
             else
@@ -220,67 +221,3 @@ function prune_clusters!(hierarchy::Vector{HdbscanCluster})
         end
     end
 end
-
-# Union-Find
-# structure for managing disjoint sets
-# This structure tracks which sets the elements of a set belong to,
-# and allows us to efficiently determine whether two elements belong to the same set.
-mutable struct UnionFind
-    parent:: Vector{Integer}  # parent[root] is the negative of the size
-    label::Dict{Int, Int}
-    next_id::Int
-
-    function UnionFind(nodes::Integer)
-        if nodes <= 0
-            throw(ArgumentError("invalid argument for nodes: $nodes"))
-        end
-
-        parent = -ones(nodes)
-        label = Dict([(i, i) for i in 1 : nodes])
-        new(parent, label, nodes)
-    end
-end
-
-# label of the set which element `x` belong to
-group(uf::UnionFind, x) = uf.label[root(uf, x)]
-# all elements that have the specified label
-members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label)))
-
-# root of element `x`
-# The root is the representative element of the set
-function root(uf::UnionFind, x::Integer)
-    if uf.parent[x] < 0
-        return x
-    else
-        return uf.parent[x] = root(uf, uf.parent[x])
-    end
-end
-
-# whether element `x` and `y` belong to the same set
-function issame(uf::UnionFind, x::Integer, y::Integer)
-    return root(uf, x) == root(uf, y)
-end
-
-function Base.size(uf::UnionFind, x::Integer)
-    return -uf.parent[root(uf, x)]
-end
-
-function unite!(uf::UnionFind, x::Integer, y::Integer)
-    x = root(uf, x)
-    y = root(uf, y)
-    if x == y
-        return false
-    end
-    if uf.parent[x] > uf.parent[y]
-        x, y = y, x
-    end
-    # unite smaller tree(y) to bigger one(x)
-    uf.parent[x] += uf.parent[y]
-    uf.parent[y] = x
-    uf.next_id += 1
-    uf.label[y] = uf.next_id
-    for i in members(uf, group(uf, x))
-        uf.label[i] = uf.next_id
-    end
-    return true
-end
\ No newline at end of file
diff --git a/src/unionfind.jl b/src/unionfind.jl
new file mode 100644
index 00000000..30018c1e
--- /dev/null
+++ b/src/unionfind.jl
@@ -0,0 +1,58 @@
+# Union-Find
+# structure for managing disjoint sets
+# This structure tracks which sets the elements of a set belong to,
+# and allows us to efficiently determine whether two elements belong to the same set.
+mutable struct UnionFind
+    parent::Vector{Int}  # parent[root] is the negative of the size
+    label::Dict{Int, Int}
+    next_id::Int
+
+    function UnionFind(nodes::Int)
+        if nodes <= 0
+            throw(ArgumentError("invalid argument for nodes: $nodes"))
+        end
+
+        parent = -ones(nodes)
+        label = Dict([i=>i for i in 1:nodes])
+        new(parent, label, nodes)
+    end
+end
+
+# label of the set which element `x` belong to
+set_id(uf::UnionFind, x) = uf.label[root(uf, x)]
+# all elements that have the specified label
+items(uf::UnionFind, x::Int) = [k for (k, v) in pairs(uf.label) if v == x]
+
+# root of element `x`
+# The root is the representative element of the set
+function root(uf::UnionFind, x::Int)
+    if uf.parent[x] < 0
+        return x
+    else
+        return uf.parent[x] = root(uf, uf.parent[x])
+    end
+end
+
+function setsize(uf::UnionFind, x::Int)
+    return -uf.parent[root(uf, x)]
+end
+
+function unite!(uf::UnionFind, x::Int, y::Int)
+    x = root(uf, x)
+    y = root(uf, y)
+    if x == y
+        return false
+    end
+    if uf.parent[x] > uf.parent[y]
+        x, y = y, x
+    end
+    # unite smaller tree(y) to bigger one(x)
+    uf.parent[x] += uf.parent[y]
+    uf.parent[y] = x
+    uf.next_id += 1
+    uf.label[y] = uf.next_id
+    for i in items(uf, set_id(uf, x))
+        uf.label[i] = uf.next_id
+    end
+    return true
+end
\ No newline at end of file

From 159858d14029b5a787d286919c4cebdf1ed3502b Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Sun, 28 Apr 2024 08:13:25 +0900
Subject: [PATCH 39/46] [test]add test for unionfind

there wes no test for it
---
 test/hdbscan.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index d8f8fc01..af111188 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -1,5 +1,16 @@
 using Test
 using Clustering
+using Clustering: UnionFind, set_id, items, root, setsize, unite!
+
+@testset "UnionFind" begin
+    uf = UnionFind(10)
+    @test set_id(uf, 5) == 5
+    @test items(uf, 5) == [5]
+    @test !(unite!(uf, 5, 5))
+    @test unite!(uf, 4, 5)
+    @test setsize(uf, 4) == 2
+    @test root(uf, 5) == 4
+end
 
 @testset "HDBSCAN" begin
     # make moons for test

From a03c22441deb10c1a28762756b9066048b4e7b98 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 20:38:48 -0700
Subject: [PATCH 40/46] hdbscan_minspantree: fix edges sorting

---
 src/hdbscan.jl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index fb8cf812..8a993123 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -116,9 +116,10 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
 end
 
 function hdbscan_minspantree(graph::HdbscanGraph)
+    # put the edge to the heap (sorted from largest to smallest distances)
     function heapput!(h, v::HdbscanMSTEdge)
         idx = searchsortedlast(h, v, by=e -> e.dist, rev=true)
-        insert!(h, (idx != 0) ? idx : 1, v)
+        insert!(h, idx + 1, v)
     end
 
     # initialize the edges heap by putting all edges of the first node (the root)
@@ -126,6 +127,7 @@ function hdbscan_minspantree(graph::HdbscanGraph)
     for (i, c) in graph.adj_edges[1]
         heapput!(heap, (v1=1, v2=i, dist=c))
     end
+    @assert issorted(heap, by=e -> e.dist, rev=true)
 
     # build the tree
     n = length(graph.adj_edges)
@@ -134,7 +136,7 @@ function hdbscan_minspantree(graph::HdbscanGraph)
     inmst[1] = true # root
 
     while length(minspantree) < n-1
-        # get the next edge with maximal? distance
+        # get the edge with the smallest distance from the heap
         (i, j, c) = pop!(heap)
 
         # add j-th node to MST if not there
@@ -147,7 +149,7 @@ function hdbscan_minspantree(graph::HdbscanGraph)
             inmst[k] || heapput!(heap, (v1=j, v2=k, dist=c))
         end
     end
-    return minspantree
+    return sort!(minspantree, by=e -> e.dist)
 end
 
 function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
@@ -155,7 +157,6 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer
     cost = 0
     uf = UnionFind(n)
     clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
-    sort!(mst)
     
     for i in 1 : n-1
         j, k, c = mst[i]

From 3a577eaa8b71b7dd47af5d66bf18961fba47f414 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 20:39:23 -0700
Subject: [PATCH 41/46] hdbscan_clusters(): fix cost type

---
 src/hdbscan.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 8a993123..65d6296b 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -154,7 +154,7 @@ end
 
 function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
     n = length(mst) + 1
-    cost = 0
+    cost = 0.0
     uf = UnionFind(n)
     clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
     

From 744af227cd773e35ebb58779dda31e536d2d6d84 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Sat, 27 Apr 2024 20:40:03 -0700
Subject: [PATCH 42/46] hdbscan_clusters: improve MST iteration

---
 src/hdbscan.jl | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 65d6296b..9e3a9ef6 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -152,14 +152,13 @@ function hdbscan_minspantree(graph::HdbscanGraph)
     return sort!(minspantree, by=e -> e.dist)
 end
 
-function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
-    n = length(mst) + 1
+function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
+    n = length(minspantree) + 1
     cost = 0.0
     uf = UnionFind(n)
     clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
-    
-    for i in 1 : n-1
-        j, k, c = mst[i]
+
+    for (i, (j, k, c)) in enumerate(minspantree)
         cost += c
         λ = 1 / cost
         #child clusters

From b94de2528be373287b6ebe665015255d5659873c Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Wed, 1 May 2024 15:07:30 +0900
Subject: [PATCH 43/46] [clean]rename the result structure

rename HdbscanCluster into HdbscanNode and create new one to expose to the user
---
 src/hdbscan.jl   | 51 ++++++++++++++++++++++++++++--------------------
 src/unionfind.jl |  2 +-
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 9f1d11ee..aab68b1f 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -11,6 +11,21 @@ end
 # Edge of the minimum spanning tree for HDBScan algorithm
 HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}}
 
+mutable struct HdbscanNode
+    parent::Int
+    children::Vector{Int}
+    points::Vector{Int}
+    λp::Vector{Float64}
+    stability::Float64
+    children_stability::Float64
+
+    function HdbscanNode(points::Union{Vector{Int}, Nothing})
+        noise = points === nothing
+        return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
+    end
+end
+
+
 """
     HdbscanCluster
 
@@ -22,21 +37,12 @@ The stability represents how much the cluster is "reasonable". So, a cluster whi
 The noise cluster is determined not to belong to any cluster. So, you can ignore them.
 See also: [isnoise](@ref)
 """
-mutable struct HdbscanCluster
-    parent::Int
-    children::Vector{Int}
+struct HdbscanCluster
     points::Vector{Int}
-    λp::Vector{Float64}
     stability::Float64
-    children_stability::Float64
-
-    function HdbscanCluster(points::Union{Vector{Int}, Nothing})
-        noise = points === nothing
-        return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
-    end
 end
 
-Base.length(c::HdbscanCluster) = size(c.points, 1)
+Base.length(c::HdbscanCluster) = length(c.points)
 
 """
     isnoise(c::HdbscanCluster)
@@ -44,8 +50,9 @@ Base.length(c::HdbscanCluster) = size(c.points, 1)
 This function returns whether the cluster is the noise or not.
 """
 isnoise(c::HdbscanCluster) = c.stability == -1
-isstable(c::HdbscanCluster) = c.stability != 0
-function increment_stability(c::HdbscanCluster, λbirth)
+isnoise(c::HdbscanNode) = c.stability == -1
+isstable(c::HdbscanNode) = c.stability != 0
+function increment_stability(c::HdbscanNode, λbirth)
     c.stability += sum(c.λp) - length(c.λp) * λbirth
 end
 
@@ -55,6 +62,8 @@ end
 Result of the [hdbscan](@ref) clustering.
 - `clusters`: vector of clusters
 - `assignments`: vectors of assignments for each points
+
+See also: [HdbscanCluster](@ref)
 """
 struct HdbscanResult
     clusters::Vector{HdbscanCluster}
@@ -95,12 +104,12 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     assignments = fill(0, n) # cluster index of each point
     for (i, j) in enumerate(hierarchy[2n-1].children)
         clu = hierarchy[j]
-        push!(clusters, clu)
+        push!(clusters, HdbscanCluster(clu.points, clu.stability))
         assignments[clu.points] .= i
     end
     # add the cluster of all unassigned (noise) points
     noise_points = findall(==(0), assignments)
-    isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points))
+    isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points, -1))
     return HdbscanResult(clusters, assignments)
 end
 
@@ -161,7 +170,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size:
     n = length(minspantree) + 1
     cost = 0.0
     uf = UnionFind(n)
-    clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
+    clusters = [HdbscanNode(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
 
     for (i, (j, k, c)) in enumerate(minspantree)
         cost += c
@@ -180,7 +189,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size:
             unite!(uf, j, k)
             #create parent cluster
             points = items(uf, set_id(uf, j))
-            push!(clusters, HdbscanCluster(points))
+            push!(clusters, HdbscanNode(points))
         elseif !(nc1 && nc2)
             if nc2 == true
                 (c1, c2) = (c2, c1)
@@ -191,16 +200,16 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size:
             unite!(uf, j, k)
             #create parent cluster
             points = items(uf, set_id(uf, j))
-            push!(clusters, HdbscanCluster(points))
+            push!(clusters, HdbscanNode(points))
         else
             #unite the noise cluster
             unite!(uf, j, k)
             #create parent cluster
             points = items(uf, set_id(uf, j))
             if length(points) < min_size
-                push!(clusters, HdbscanCluster(Int[]))
+                push!(clusters, HdbscanNode(Int[]))
             else
-                push!(clusters, HdbscanCluster(points))
+                push!(clusters, HdbscanNode(points))
             end
         end
     end
@@ -208,7 +217,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size:
     return clusters
 end
 
-function prune_clusters!(hierarchy::Vector{HdbscanCluster})
+function prune_clusters!(hierarchy::Vector{HdbscanNode})
     for i in 1:length(hierarchy)-1
         c = hierarchy[i]
         parent = hierarchy[c.parent]
diff --git a/src/unionfind.jl b/src/unionfind.jl
index 30018c1e..6b846bd6 100644
--- a/src/unionfind.jl
+++ b/src/unionfind.jl
@@ -19,7 +19,7 @@ mutable struct UnionFind
 end
 
 # label of the set which element `x` belong to
-set_id(uf::UnionFind, x) = uf.label[root(uf, x)]
+set_id(uf::UnionFind, x::Int) = uf.label[root(uf, x)]
 # all elements that have the specified label
 items(uf::UnionFind, x::Int) = [k for (k, v) in pairs(uf.label) if v == x]
 

From 7078223097e794f2c72d42dfff5d32b92a50c73d Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Thu, 2 May 2024 10:46:07 +0900
Subject: [PATCH 44/46] [hotfix]apply hot fix

the algorithm went wrong
---
 src/hdbscan.jl | 93 +++++++++++++++++++++++++-------------------------
 1 file changed, 46 insertions(+), 47 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index aab68b1f..43996441 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -19,12 +19,13 @@ mutable struct HdbscanNode
     stability::Float64
     children_stability::Float64
 
-    function HdbscanNode(points::Union{Vector{Int}, Nothing})
-        noise = points === nothing
-        return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0)
+    function HdbscanNode(points::Vector{Int}; λp::Vector{Float64}=Float64[], children::Vector{Int}=Int[], children_stability::Float64=0.0)
+        noise = isempty(λp)
+        return new(0, children, points, λp, noise ? -1 : 0, children_stability)
     end
 end
 
+cluster_len(c::HdbscanNode) = length(c.points)
 
 """
     HdbscanCluster
@@ -52,8 +53,9 @@ This function returns whether the cluster is the noise or not.
 isnoise(c::HdbscanCluster) = c.stability == -1
 isnoise(c::HdbscanNode) = c.stability == -1
 isstable(c::HdbscanNode) = c.stability != 0
-function increment_stability(c::HdbscanNode, λbirth)
-    c.stability += sum(c.λp) - length(c.λp) * λbirth
+function compute_stability!(c::HdbscanNode, λbirth)
+    c.stability = sum(c.λp) - length(c.λp) * λbirth
+    return c.stability
 end
 
 """
@@ -96,14 +98,14 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     #compute a minimum spanning tree by prim method
     mst = hdbscan_minspantree(mrd)
     #build a HDBSCAN hierarchy
-    hierarchy = hdbscan_clusters(mst, min_cluster_size)
+    tree = hdbscan_build_tree(mst, min_cluster_size)
     #extract the target cluster
-    prune_clusters!(hierarchy)
+    extract_clusters!(tree)
     #generate the list of cluster assignment for each point
     clusters = HdbscanCluster[]
     assignments = fill(0, n) # cluster index of each point
-    for (i, j) in enumerate(hierarchy[2n-1].children)
-        clu = hierarchy[j]
+    for (i, j) in enumerate(tree[end].children)
+        clu = tree[j]
         push!(clusters, HdbscanCluster(clu.points, clu.stability))
         assignments[clu.points] .= i
     end
@@ -166,11 +168,11 @@ function hdbscan_minspantree(graph::HdbscanGraph)
     return sort!(minspantree, by=e -> e.dist)
 end
 
-function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
+function hdbscan_build_tree(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer)
     n = length(minspantree) + 1
     cost = 0.0
     uf = UnionFind(n)
-    clusters = [HdbscanNode(min_size > 1 ? Int[i] : Int[]) for i in 1:n]
+    clusters = [HdbscanNode(Int[i], λp=(min_size==1) ? Float64[Inf] : Float64[]) for i in 1:n]
 
     for (i, (j, k, c)) in enumerate(minspantree)
         cost += c
@@ -180,53 +182,50 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size:
         c2 = set_id(uf, k)
         #reference to the parent cluster
         clusters[c1].parent = clusters[c2].parent = n+i
-        nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2])
-        if !(nc1 || nc2)
-            #compute stability
-            increment_stability(clusters[c1], λ)
-            increment_stability(clusters[c2], λ)
-            #unite cluster
-            unite!(uf, j, k)
-            #create parent cluster
-            points = items(uf, set_id(uf, j))
-            push!(clusters, HdbscanNode(points))
-        elseif !(nc1 && nc2)
-            if nc2 == true
-                (c1, c2) = (c2, c1)
-            end
-            #record the lambda value
-            append!(clusters[c2].λp, fill(λ, length(clusters[c1])))
-            #unite cluster
-            unite!(uf, j, k)
-            #create parent cluster
-            points = items(uf, set_id(uf, j))
+        #unite the clusters
+        unite!(uf, j, k)
+        points = items(uf, set_id(uf, j))
+        if length(points) < min_size
             push!(clusters, HdbscanNode(points))
         else
-            #unite the noise cluster
-            unite!(uf, j, k)
-            #create parent cluster
-            points = items(uf, set_id(uf, j))
-            if length(points) < min_size
-                push!(clusters, HdbscanNode(Int[]))
+            nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2])
+            if nc1 && nc2
+                push!(clusters, HdbscanNode(points, λp=fill(λ, length(points))))
+            elseif nc1 || nc2
+                #ensure that c1 is the cluster
+                if nc1 == true
+                    c1, c2 = c2, c1
+                end
+                push!(clusters, HdbscanNode(points, λp=[clusters[c1].λp..., fill(λ, cluster_len(clusters[c2]))...]))
             else
-                push!(clusters, HdbscanNode(points))
+                #compute stabilities for children
+                c1_stability = compute_stability!(clusters[c1], λ)
+                c2_stability = compute_stability!(clusters[c2], λ)
+                #unite the two clusters
+                push!(clusters, HdbscanNode(points, λp=fill(λ, length(points)), children=[c1, c2], children_stability=c1_stability+c2_stability))
             end
         end
     end
+    compute_stability!(clusters[end], 1/cost)
     @assert length(clusters) == 2n - 1
     return clusters
 end
 
-function prune_clusters!(hierarchy::Vector{HdbscanNode})
-    for i in 1:length(hierarchy)-1
-        c = hierarchy[i]
-        parent = hierarchy[c.parent]
-        if isnoise(c) || c.stability > c.children_stability
-            push!(parent.children, i)
-            parent.children_stability += c.stability
-        else
+function extract_clusters!(tree::Vector{HdbscanNode})
+    for i in eachindex(tree)
+        i == length(tree) && continue
+        c = tree[i]
+        isempty(c.children) && continue
+        parent = tree[c.parent]
+        children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)])
+        if children_stability > c.stability
+            filter!(x->x!=i, parent.children)
             append!(parent.children, c.children)
-            parent.children_stability += c.children_stability
         end
     end
+    c = tree[end]
+    children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)])
+    if children_stability <= c.stability
+        c.children = Int[2 * length(tree) - 1]
+    end
 end

From b0791d937ef05cbb1f018a69f942e2176968014d Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Thu, 2 May 2024 10:46:48 +0900
Subject: [PATCH 45/46] [test]add test about min_size

ensure that `min_size` effects properly
---
 test/hdbscan.jl | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index af111188..25038d31 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -20,12 +20,30 @@ end
     lower_y = cos.(lower_x) .+ rand(51)./10
     data = hcat([lower_x; upper_x],
                 [lower_y; upper_y])'
-    #test for main function
+    # test for main function
     @test_throws DomainError hdbscan(data, 5, 0)
-    @test_nowarn @inferred(hdbscan(data, 5, 3))
+    @test_nowarn @inferred(hdbscan(data, 5, 10))
 
     # tests for result
-    result = @inferred(hdbscan(data, 5, 3))
+    result = @inferred(hdbscan(data, 5, 15))
     @test isa(result, HdbscanResult)
     @test sum(length, result.clusters) == size(data, 2)
+    # test whether the model recognizes the moons
+    @test length(result.clusters) == 3
+
+    # generate more complicated data
+    n_points = 100  # number of points
+    clusters = 4    # number of clusters
+
+    # generate the centers of clusters
+    centers = [(rand(1:10), rand(1:10)) for _ in 1:clusters]
+
+    # generate all the points
+    data = hcat([0.5 * randn(2, n_points) .+ centers[i] for i in 1:clusters]...)
+    # check the effect of min_size
+    nclusters1 = length(@inferred(hdbscan(data, 5, 5)).clusters)
+    nclusters2 = length(@inferred(hdbscan(data, 5, 15)).clusters)
+    nclusters3 = length(@inferred(hdbscan(data, 5, 25)).clusters)
+    @test nclusters1 >= nclusters2
+    @test nclusters2 >= nclusters3
 end
\ No newline at end of file

From dc5dd409882d390d333b808c466f1b5604ef0d6c Mon Sep 17 00:00:00 2001
From: MommaWatasu <ascendwatson@gmail.com>
Date: Thu, 2 May 2024 11:03:26 +0900
Subject: [PATCH 46/46] [add function or file]support ClusteringResult

add counts field into HdbscanResult
---
 src/hdbscan.jl  |  9 +++++----
 test/hdbscan.jl | 10 +++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/hdbscan.jl b/src/hdbscan.jl
index 43996441..3a769f4a 100644
--- a/src/hdbscan.jl
+++ b/src/hdbscan.jl
@@ -43,8 +43,6 @@ struct HdbscanCluster
     stability::Float64
 end
 
-Base.length(c::HdbscanCluster) = length(c.points)
-
 """
     isnoise(c::HdbscanCluster)
 
@@ -67,9 +65,10 @@ Result of the [hdbscan](@ref) clustering.
 
 See also: [HdbscanCluster](@ref)
 """
-struct HdbscanResult
+struct HdbscanResult <: ClusteringResult
     clusters::Vector{HdbscanCluster}
     assignments::Vector{Int}
+    counts::Vector{Int}
 end
 
 """
@@ -103,16 +102,18 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int;
     extract_clusters!(tree)
     #generate the list of cluster assignment for each point
     clusters = HdbscanCluster[]
+    counts = Int[]
     assignments = fill(0, n) # cluster index of each point
     for (i, j) in enumerate(tree[end].children)
         clu = tree[j]
         push!(clusters, HdbscanCluster(clu.points, clu.stability))
+        push!(counts, length(clu.points))
         assignments[clu.points] .= i
     end
     # add the cluster of all unassigned (noise) points
     noise_points = findall(==(0), assignments)
     isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points, -1))
-    return HdbscanResult(clusters, assignments)
+    return HdbscanResult(clusters, assignments, counts)
 end
 
 function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix)
diff --git a/test/hdbscan.jl b/test/hdbscan.jl
index 25038d31..c3d0894b 100644
--- a/test/hdbscan.jl
+++ b/test/hdbscan.jl
@@ -27,9 +27,9 @@ end
     # tests for result
     result = @inferred(hdbscan(data, 5, 15))
     @test isa(result, HdbscanResult)
-    @test sum(length, result.clusters) == size(data, 2)
+    @test sum(x->length(x.points), result.clusters) == size(data, 2)
     # test whether the model recognizes the moons
-    @test length(result.clusters) == 3
+    @test nclusters(result) == 2
 
     # generate more complicated data
     n_points = 100  # number of points
@@ -41,9 +41,9 @@ end
     # generate all the points
     data = hcat([0.5 * randn(2, n_points) .+ centers[i] for i in 1:clusters]...)
     # check the effect of min_size
-    nclusters1 = length(@inferred(hdbscan(data, 5, 5)).clusters)
-    nclusters2 = length(@inferred(hdbscan(data, 5, 15)).clusters)
-    nclusters3 = length(@inferred(hdbscan(data, 5, 25)).clusters)
+    nclusters1 = nclusters(@inferred(hdbscan(data, 5, 5)))
+    nclusters2 = nclusters(@inferred(hdbscan(data, 5, 15)))
+    nclusters3 = nclusters(@inferred(hdbscan(data, 5, 25)))
     @test nclusters1 >= nclusters2
     @test nclusters2 >= nclusters3
 end
\ No newline at end of file