From 23bbed11ba1720c7328098dc9aad404dc75438ef Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 01:10:21 +0900 Subject: [PATCH 01/46] [add function or file]add hdbscan make PR --- src/hdbscan.jl | 442 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 442 insertions(+) create mode 100644 src/hdbscan.jl diff --git a/src/hdbscan.jl b/src/hdbscan.jl new file mode 100644 index 00000000..9e885e4f --- /dev/null +++ b/src/hdbscan.jl @@ -0,0 +1,442 @@ +struct HDBSCANGraph + edges::Vector{Vector{Tuple{Int64, Float64}}} + HDBSCANGraph(n) = new([Tuple{Int64, Float64}[] for _ in 1 : n]) +end + +function add_edge(G::HDBSCANGraph, v::Tuple{Int64, Int64, Float64}) + i, j, c = v + push!(G.edges[i], (j, c)) + push!(G.edges[j], (i, c)) +end + +Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i] + +mutable struct HDBSCANCluster + parent::Int64 + children::Vector{Int64} + points::Vector{Int64} + λp::Vector{Float64} + stability::Float64 + children_stability::Float64 + function HDBSCANCluster(noise::Bool, points::Vector{Int64}) + if noise + return new(0, [], [], [], -1, -1) + else + return new(0, [], points, [], 0, 0) + end + end +end + +Base.length(c::HDBSCANCluster) = size(c.points, 1) +join(c1::HDBSCANCluster, c2::HDBSCANCluster, id) = HDBSCANCluster(nothing, vcat(c1.points, c2.points), id, 0) +isnoise(c::HDBSCANCluster) = c.stability == -1 +hasstability(c::HDBSCANCluster) = c.stability != 0 +function compute_stability(c::HDBSCANCluster, λbirth) + c.stability += sum(c.λp.-λbirth) +end + +""" + HDBSCAN(ε, minpts) +Density-Based Clustering Based on Hierarchical Density Estimates. +This algorithm performs clustering as follows. +1. generate a minimum spanning tree +2. build a HDBSCAN hierarchy +3. extract the target cluster +4. generate the list of cluster assignment for each point +The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). +""" +mutable struct HDBSCANResult + k::Int64 + min_cluster_size::Int64 + labels::Union{Vector{Int64}, Nothing} + function HDBSCANResult(k::Int64, min_cluster_size::Int64) + if min_cluster_size < 1 + throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) + end + return new(k, min_cluster_size) + end +end + +""" + hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::INt64; gen_mst::Bool=true, mst=nothing) +# Parameters +- `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point +- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. +- `min_cluster_size`: minimum number of points in the cluster +- `gen_mst`: whether to generate minimum-spannig-tree or not +- `mst`: when is specified and `gen_mst` is false, new mst won't be generated +""" +function hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::Int64; gen_mst::Bool=true, mst=nothing) + model = HDBSCANResult(k, min_cluster_size) + n = size(points, 1) + if gen_mst + #calculate core distances for each point + core_dists = core_dist(points, k) + #calculate mutual reachability distance between any two points + mrd = mutual_reachability_distance(core_dists, points) + #compute a minimum spanning tree by prim method + mst = prim(mrd, n) + elseif mst == nothing + throw(ArgumentError("if you set `gen_mst` to false, you must pass a minimum spanning tree as `mst`")) + end + #build a HDBSCAN hierarchy + hierarchy = build_hierarchy(mst, min_cluster_size) + #extract the target cluster + extract_cluster!(hierarchy) + #generate the list of cluster assignment for each point + result = fill(-1, n) + for (i, j) in enumerate(hierarchy[2n-1].children) + c = hierarchy[j] + for k in c.points + result[k] = i + end + end + model.labels = result + return model +end + +function core_dist(points, k) + core_dists = Array{Float64}(undef, size(points, 1)) + for i in 1 : size(points, 1) + p = points[i:i, :] + dists = vec(sum((@. (points - p)^2), dims=2)) + sort!(dists) + core_dists[i] = dists[k] + end + return core_dists +end + +function mutual_reachability_distance(core_dists, points) + n = size(points, 1) + graph = HDBSCANGraph(div(n * (n-1), 2)) + for i in 1 : n-1 + for j in i+1 : n + c = max(core_dists[i], core_dists[j], sum((points[i, :]-points[j, :]).^2)) + add_edge(graph, (i, j, c)) + end + end + return graph +end + +function prim(graph, n) + minimum_spanning_tree = Array{Tuple{Float64, Int64, Int64}}(undef, n-1) + + marked = falses(n) + marked_cnt = 1 + marked[1] = true + + h = [] + + for (i, c) in graph[1] + heappush!(h, (c, 1, i)) + end + + while marked_cnt < n + c, i, j = popfirst!(h) + + marked[j] == true && continue + minimum_spanning_tree[marked_cnt] = (c, i, j) + marked[j] = true + marked_cnt += 1 + + for (k, c) in graph[j] + marked[k] == true && continue + heappush!(h, (c, j, k)) + end + end + return minimum_spanning_tree +end + +function build_hierarchy(mst, min_size) + n = length(mst) + 1 + cost = 0 + uf = UnionFind(n) + Hierarchy = Array{HDBSCANCluster}(undef, 2n-1) + if min_size == 1 + for i in 1 : n + Hierarchy[i] = HDBSCANCluster(false, [i]) + end + else + for i in 1 : n + Hierarchy[i] = HDBSCANCluster(true, Int64[]) + end + end + sort!(mst) + + for i in 1 : n-1 + c, j, k = mst[i] + cost += c + λ = 1 / cost + #child clusters + c1 = group(uf, j) + c2 = group(uf, k) + #reference to the parent cluster + Hierarchy[c1].parent = Hierarchy[c2].parent = n+i + nc1, nc2 = isnoise(Hierarchy[c1]), isnoise(Hierarchy[c2]) + if !(nc1 || nc2) + #compute stability + compute_stability(Hierarchy[c1], λ) + compute_stability(Hierarchy[c2], λ) + #unite cluster + unite!(uf, j, k) + #create parent cluster + points = members(uf, group(uf, j)) + Hierarchy[n+i] = HDBSCANCluster(false, points) + elseif !(nc1 && nc2) + if nc2 == true + (c1, c2) = (c2, c1) + end + #record the lambda value + append!(Hierarchy[c2].λp, fill(λ, length(Hierarchy[c1]))) + #unite cluster + unite!(uf, j, k) + #create parent cluster + points = members(uf, group(uf, j)) + Hierarchy[n+i] = HDBSCANCluster(false, points) + else + #unite the noise cluster + unite!(uf, j, k) + #create parent cluster + points = members(uf, group(uf, j)) + if length(points) < min_size + Hierarchy[n+i] = HDBSCANCluster(true, Int64[]) + else + Hierarchy[n+i] = HDBSCANCluster(false, points) + end + end + end + return Hierarchy +end + +function extract_cluster!(hierarchy::Vector{HDBSCANCluster}) + for i in 1 : length(hierarchy)-1 + if isnoise(hierarchy[i]) + c = hierarchy[i] + push!(hierarchy[c.parent].children, i) + hierarchy[c.parent].children_stability += c.stability + else + c = hierarchy[i] + if c.stability > c.children_stability + push!(hierarchy[c.parent].children, i) + hierarchy[c.parent].children_stability += c.stability + else + append!(hierarchy[c.parent].children, c.children) + hierarchy[c.parent].children_stability += c.children_stability + end + end + end +end + +# Below are utility functions for building hierarchical trees +# Please note that these functions are not so sophisticated since I made them by manually converting code of numpy. +const LOG_2π = log(2π) +const newaxis = [CartesianIndex()] + +heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v) + +mutable struct UnionFind{T <: Integer} + parent:: Vector{T} # parent[root] is the negative of the size + label::Dict{Int, Int} + cnt::Int64 + + function UnionFind{T}(nodes::T) where T<:Integer + if nodes <= 0 + throw(ArgumentError("invalid argument for nodes: $nodes")) + end + + parent = -ones(T, nodes) + label = Dict([(i, i) for i in 1 : nodes]) + new{T}(parent, label, nodes) + end +end + +UnionFind(nodes::Integer) = UnionFind{typeof(nodes)}(nodes) +group(uf::UnionFind, x)::Int = uf.label[root(uf, x)] +members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label))) + +function root(uf::UnionFind{T}, x::T)::T where T<:Integer + if uf.parent[x] < 0 + return x + else + return uf.parent[x] = root(uf, uf.parent[x]) + end +end + +function issame(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer + return root(uf, x) == root(uf, y) +end + +function Base.size(uf::UnionFind{T}, x::T)::T where T<:Integer + return -uf.parent[root(uf, x)] +end + +function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer + x = root(uf, x) + y = root(uf, y) + if x == y + return false + end + if uf.parent[x] > uf.parent[y] + x, y = y, x + end + # unite smaller tree(y) to bigger one(x) + uf.parent[x] += uf.parent[y] + uf.parent[y] = x + uf.cnt += 1 + uf.label[y] = uf.cnt + for i in members(uf, group(uf, x)) + uf.label[i] = uf.cnt + end + return true +end + +#error function +#if absolute of `x` is smaller than 2.4, we use Taylor expansion. +#other wise, we use Continued fraction expansion +function erf(x) + absx = abs(x) + if absx<2.4 + c=1 + a=0 + for i in 1 : 40 + a+=(x/(2i-1)*c) + c=-c*x^2/i + end + return a*2/sqrt(π) + else + if absx>1e50 + a = 1 + else + y = absx*sqrt(2) + a = 0 + for i in 40:-1:1 + a=i/(y+a) + end + a=1-exp(-x^2)/(y+a)*sqrt(2/π) + end + if x<0 + return -a + else + return a + end + end +end +erfc(x) = 1-erf(x) +norm_cdf(x) = 1/2*erfc(-x/sqrt(2)) + +function process_parameters(dim, mean, cov) + if dim === nothing + if mean === nothing + if cov === nothing + dim = 1 + else + cov = convert(Array{Float64}, cov) + if ndims(cov) < 2 + dim = 1 + else + dim = size(cov, 1) + end + end + else + mean = convert(Array{Float64}, mean) + dim = length(mean) + end + else + !isa(dim, Number) && throw(DimensionMismatch("dimension of random variable must be a scalar")) + end + + if mean === nothing + mean = zeros(dim) + end + mean = convert(Array{Float64}, mean) + + if cov === nothing + cov = [1.0] + end + cov = convert(Array{Float64}, cov) + + if dim == 1 + mean = reshape(mean, 1) + cov = reshape(cov, 1, 1) + end + + if ndims(mean) != 1 || size(mean, 1) != dim + throw(ArgumentError("array `mean` must be vector of length $dim")) + end + if ndims(cov) == 0 + cov = cov * Matrix{Float64}(I, dim, dim) + elseif ndims(cov) == 1 + cov = diag(cov) + else + size(cov) != (dim, dim) && throw(DimensionMismatch("array `cov` must be at most two-dimensional, but ndims(cov) = $(ndims(cov))")) + end + return dim, mean, cov +end + +function process_quantiles(x, dim) + x = convert(Array{Float64}, x) + + if ndims(x) == 0 + x = [x] + elseif ndims(x) == 1 + if dim == 1 + x = x[:, :] + else + x = x[newaxis, :] + end + end + return x +end + +function pinv_1d(v; _eps=1e-5) + return [(abs(x)<_eps) ? 0 : 1/x for x in v] +end + +function psd_pinv_decomposed_log_pdet(mat; cond=nothing, rcond=nothing) + u, s = eigvecs(mat), eigvals(mat) + + if rcond !== nothing + cond = rcond + end + if cond === nothing || cond == -1 + cond = 1e6 * Base.eps() + end + _eps = cond * maximum(abs.(s)) + + if minimum(s) < -_eps + throw(ArgumentError("the covariance matrix must be positive semidefinite")) + end + s_pinv = pinv_1d(s, _eps=_eps) + U = u .* sqrt.(s_pinv') + log_pdet = sum(log.(s[findall(s.>_eps)])) + + return U, log_pdet +end + +function squeeze_output(out) + if length(out) == 1 + out = Float64(out...) + else + out = vec(out) + end + return out +end + +function _logpdf(x, mean, prec_U, log_det_cov) + dim = size(x, ndims(x)) + dev = x - mean' + tmp = (dev * prec_U).^2 + maha = sum(tmp, dims=ndims(tmp)) + maha = dropdims(maha, dims=ndims(tmp)) + return -0.5 * ((dim*LOG_2π+log_det_cov).+maha) +end + +function pdf(x, mean, cov) + dim, mean, cov = process_parameters(nothing, mean, cov) + x = process_quantiles(x, dim) + prec_U, log_det_cov = psd_pinv_decomposed_log_pdet(cov) + out = exp.(_logpdf(x, mean, prec_U, log_det_cov)) + return squeeze_output(out) +end + +logpdf(x, mean, cov) = log.(pdf(x, mean, cov)) \ No newline at end of file From fa4439816c4fe6f0ca892feb05e191d9c38acd6c Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 09:09:34 +0900 Subject: [PATCH 02/46] [test]add test for hdbscan there was no test for it --- test/hdbscan.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 2 files changed, 43 insertions(+) create mode 100644 test/hdbscan.jl diff --git a/test/hdbscan.jl b/test/hdbscan.jl new file mode 100644 index 00000000..8469e506 --- /dev/null +++ b/test/hdbscan.jl @@ -0,0 +1,42 @@ +using Test +include("../src/hdbscan.jl") + +@testset "HDBSCAN" begin + # test for util functions + # erf function tests + @test erf(0) == 0 + @test erf(-0) == -0 + @test isapprox(erf(1.0), 0.8427007929497148) + @test isapprox(erf(-1.0), -0.8427007929497148) + @test isapprox(erf(2.4), 0.999311486103355) + @test isapprox(erf(-2.4), -0.999311486103355) + + # test for HDBSCANGraph + graph = HDBSCANGraph(4) + add_edge(graph, (1, 3, 0.1)) + @test graph[1][1] == (3, 0.1) && graph[3][1] == (1, 0.1) + + # test for UnionFind + uf = UnionFind(10) + @test group(uf, 1) == 1 + @test sort(members(uf, 1)) == [1] + @test root(uf, 3) == 3 + @test issame(uf, 1, 2) == false + @test size(uf, 1) == 1 + @test unite!(uf, 1, 2) == true + @test issame(uf, 1, 2) == true + @test size(uf, 1) == 2 + @test size(uf, 2) == 2 + @test unite!(uf, 1, 2) == false + + # make moons for test + upper_x = [i for i in 0:π/50:π] + lower_x = [i for i in π/2:π/50:3/2*π] + upper_y = sin.(upper_x) .+ rand(50)./10 + lower_y = cos.(lower_x) .+ rand(51)./10 + data = hcat([lower_x; upper_x], [lower_y; upper_y]) + #test for main function + @test_throws DomainError hdbscan!(data, 5, 0) + @test_throws ArgumentError hdbscan!(data, 5, 3; gen_mst=false) + @test_nowarn result = hdbscan!(data, 5, 3) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 42301653..a18f475b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,7 @@ tests = ["seeding", "kmedoids", "affprop", "dbscan", + "hdbscan", "fuzzycmeans", "counts", "silhouette", From 4e64fdf36cd3d5b09409c3b938231643910a1471 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 09:19:03 +0900 Subject: [PATCH 03/46] [fix]change Int64 to Int test failed on a ubuntu-latest-x86 --- src/hdbscan.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 9e885e4f..fac44c0b 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -1,9 +1,9 @@ struct HDBSCANGraph - edges::Vector{Vector{Tuple{Int64, Float64}}} - HDBSCANGraph(n) = new([Tuple{Int64, Float64}[] for _ in 1 : n]) + edges::Vector{Vector{Tuple{Int, Float64}}} + HDBSCANGraph(n) = new([Tuple{Int, Float64}[] for _ in 1 : n]) end -function add_edge(G::HDBSCANGraph, v::Tuple{Int64, Int64, Float64}) +function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64}) i, j, c = v push!(G.edges[i], (j, c)) push!(G.edges[j], (i, c)) From e294565762962242c89d80c9ce6df83b879a0237 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 09:28:31 +0900 Subject: [PATCH 04/46] [fix]change all Int64 into Int tests still fails on ubuntu-latest-x86 --- src/hdbscan.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index fac44c0b..a0612fbb 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -12,13 +12,13 @@ end Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i] mutable struct HDBSCANCluster - parent::Int64 - children::Vector{Int64} - points::Vector{Int64} + parent::Int + children::Vector{Int} + points::Vector{Int} λp::Vector{Float64} stability::Float64 children_stability::Float64 - function HDBSCANCluster(noise::Bool, points::Vector{Int64}) + function HDBSCANCluster(noise::Bool, points::Vector{Int}) if noise return new(0, [], [], [], -1, -1) else @@ -46,10 +46,10 @@ This algorithm performs clustering as follows. The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). """ mutable struct HDBSCANResult - k::Int64 - min_cluster_size::Int64 - labels::Union{Vector{Int64}, Nothing} - function HDBSCANResult(k::Int64, min_cluster_size::Int64) + k::Int + min_cluster_size::Int + labels::Union{Vector{Int}, Nothing} + function HDBSCANResult(k::Int, min_cluster_size::Int) if min_cluster_size < 1 throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) end @@ -58,7 +58,7 @@ mutable struct HDBSCANResult end """ - hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::INt64; gen_mst::Bool=true, mst=nothing) + hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) # Parameters - `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point - `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. @@ -66,7 +66,7 @@ end - `gen_mst`: whether to generate minimum-spannig-tree or not - `mst`: when is specified and `gen_mst` is false, new mst won't be generated """ -function hdbscan!(points::AbstractMatrix, k::Int64, min_cluster_size::Int64; gen_mst::Bool=true, mst=nothing) +function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) model = HDBSCANResult(k, min_cluster_size) n = size(points, 1) if gen_mst @@ -119,7 +119,7 @@ function mutual_reachability_distance(core_dists, points) end function prim(graph, n) - minimum_spanning_tree = Array{Tuple{Float64, Int64, Int64}}(undef, n-1) + minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1) marked = falses(n) marked_cnt = 1 @@ -158,7 +158,7 @@ function build_hierarchy(mst, min_size) end else for i in 1 : n - Hierarchy[i] = HDBSCANCluster(true, Int64[]) + Hierarchy[i] = HDBSCANCluster(true, Int[]) end end sort!(mst) @@ -199,7 +199,7 @@ function build_hierarchy(mst, min_size) #create parent cluster points = members(uf, group(uf, j)) if length(points) < min_size - Hierarchy[n+i] = HDBSCANCluster(true, Int64[]) + Hierarchy[n+i] = HDBSCANCluster(true, Int[]) else Hierarchy[n+i] = HDBSCANCluster(false, points) end @@ -237,7 +237,7 @@ heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v) mutable struct UnionFind{T <: Integer} parent:: Vector{T} # parent[root] is the negative of the size label::Dict{Int, Int} - cnt::Int64 + cnt::Int function UnionFind{T}(nodes::T) where T<:Integer if nodes <= 0 From b851997c984ac9ffcc80d745d29a5ddb3497d2bc Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 22:22:37 +0900 Subject: [PATCH 05/46] [change]change usage and remove extra code there were functions for Xmeans --- src/Clustering.jl | 4 + src/hdbscan.jl | 238 +++++++++------------------------------------- 2 files changed, 50 insertions(+), 192 deletions(-) diff --git a/src/Clustering.jl b/src/Clustering.jl index fae9ee82..a6540c47 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -40,6 +40,9 @@ module Clustering # dbscan DbscanResult, DbscanCluster, dbscan, + # hdbscan + HdbscanResult, hdbscan, + # fuzzy_cmeans fuzzy_cmeans, FuzzyCMeansResult, @@ -83,6 +86,7 @@ module Clustering include("kmedoids.jl") include("affprop.jl") include("dbscan.jl") + include("hdbscan.jl") include("mcl.jl") include("fuzzycmeans.jl") diff --git a/src/hdbscan.jl b/src/hdbscan.jl index a0612fbb..f73ba676 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -11,14 +11,21 @@ end Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i] -mutable struct HDBSCANCluster +""" + HdbscanCluster(..., points, stability, ...) +- `points`: vector of points which belongs to the cluster +- `stability`: stablity of the cluster(-1 for noise clusters) + +You can use `length` to know the number of pooints in the cluster. And `isnoise` funciton is also available to know whether the cluster is noise or not. +""" +mutable struct HdbscanCluster parent::Int children::Vector{Int} points::Vector{Int} λp::Vector{Float64} stability::Float64 children_stability::Float64 - function HDBSCANCluster(noise::Bool, points::Vector{Int}) + function HdbscanCluster(noise::Bool, points::Vector{Int}) if noise return new(0, [], [], [], -1, -1) else @@ -27,16 +34,27 @@ mutable struct HDBSCANCluster end end -Base.length(c::HDBSCANCluster) = size(c.points, 1) -join(c1::HDBSCANCluster, c2::HDBSCANCluster, id) = HDBSCANCluster(nothing, vcat(c1.points, c2.points), id, 0) -isnoise(c::HDBSCANCluster) = c.stability == -1 -hasstability(c::HDBSCANCluster) = c.stability != 0 -function compute_stability(c::HDBSCANCluster, λbirth) +Base.length(c::HdbscanCluster) = size(c.points, 1) +isnoise(c::HdbscanCluster) = c.stability == -1 +hasstability(c::HdbscanCluster) = c.stability != 0 +function compute_stability(c::HdbscanCluster, λbirth) c.stability += sum(c.λp.-λbirth) end """ - HDBSCAN(ε, minpts) + HdbscanResult(k, minpts, clusters) +- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. +- `min_cluster_size`: minimum number of points in the cluster +- `clusters`: result vector of clusters +""" +mutable struct HdbscanResult + k::Int + min_cluster_size::Int + clusters::Vector{HdbscanCluster} +end + +""" + hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) Density-Based Clustering Based on Hierarchical Density Estimates. This algorithm performs clustering as follows. 1. generate a minimum spanning tree @@ -44,21 +62,7 @@ This algorithm performs clustering as follows. 3. extract the target cluster 4. generate the list of cluster assignment for each point The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). -""" -mutable struct HDBSCANResult - k::Int - min_cluster_size::Int - labels::Union{Vector{Int}, Nothing} - function HDBSCANResult(k::Int, min_cluster_size::Int) - if min_cluster_size < 1 - throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) - end - return new(k, min_cluster_size) - end -end -""" - hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) # Parameters - `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point - `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. @@ -66,8 +70,10 @@ end - `gen_mst`: whether to generate minimum-spannig-tree or not - `mst`: when is specified and `gen_mst` is false, new mst won't be generated """ -function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) - model = HDBSCANResult(k, min_cluster_size) +function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) + if min_cluster_size < 1 + throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) + end n = size(points, 1) if gen_mst #calculate core distances for each point @@ -84,15 +90,18 @@ function hdbscan!(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst #extract the target cluster extract_cluster!(hierarchy) #generate the list of cluster assignment for each point - result = fill(-1, n) + result = HdbscanCluster[] + noise_points = fill(-1, n) for (i, j) in enumerate(hierarchy[2n-1].children) c = hierarchy[j] + push!(result, c) for k in c.points - result[k] = i + noise_points[k] = 0 end end - model.labels = result - return model + push!(result, HdbscanCluster(true, Int[])) + result[end].points = findall(x->x==-1, noise_points) + return HdbscanResult(k, min_cluster_size, result) end function core_dist(points, k) @@ -151,14 +160,14 @@ function build_hierarchy(mst, min_size) n = length(mst) + 1 cost = 0 uf = UnionFind(n) - Hierarchy = Array{HDBSCANCluster}(undef, 2n-1) + Hierarchy = Array{HdbscanCluster}(undef, 2n-1) if min_size == 1 for i in 1 : n - Hierarchy[i] = HDBSCANCluster(false, [i]) + Hierarchy[i] = HdbscanCluster(false, [i]) end else for i in 1 : n - Hierarchy[i] = HDBSCANCluster(true, Int[]) + Hierarchy[i] = HdbscanCluster(true, Int[]) end end sort!(mst) @@ -181,7 +190,7 @@ function build_hierarchy(mst, min_size) unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) - Hierarchy[n+i] = HDBSCANCluster(false, points) + Hierarchy[n+i] = HdbscanCluster(false, points) elseif !(nc1 && nc2) if nc2 == true (c1, c2) = (c2, c1) @@ -192,23 +201,23 @@ function build_hierarchy(mst, min_size) unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) - Hierarchy[n+i] = HDBSCANCluster(false, points) + Hierarchy[n+i] = HdbscanCluster(false, points) else #unite the noise cluster unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) if length(points) < min_size - Hierarchy[n+i] = HDBSCANCluster(true, Int[]) + Hierarchy[n+i] = HdbscanCluster(true, Int[]) else - Hierarchy[n+i] = HDBSCANCluster(false, points) + Hierarchy[n+i] = HdbscanCluster(false, points) end end end return Hierarchy end -function extract_cluster!(hierarchy::Vector{HDBSCANCluster}) +function extract_cluster!(hierarchy::Vector{HdbscanCluster}) for i in 1 : length(hierarchy)-1 if isnoise(hierarchy[i]) c = hierarchy[i] @@ -228,10 +237,6 @@ function extract_cluster!(hierarchy::Vector{HDBSCANCluster}) end # Below are utility functions for building hierarchical trees -# Please note that these functions are not so sophisticated since I made them by manually converting code of numpy. -const LOG_2π = log(2π) -const newaxis = [CartesianIndex()] - heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v) mutable struct UnionFind{T <: Integer} @@ -288,155 +293,4 @@ function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer uf.label[i] = uf.cnt end return true -end - -#error function -#if absolute of `x` is smaller than 2.4, we use Taylor expansion. -#other wise, we use Continued fraction expansion -function erf(x) - absx = abs(x) - if absx<2.4 - c=1 - a=0 - for i in 1 : 40 - a+=(x/(2i-1)*c) - c=-c*x^2/i - end - return a*2/sqrt(π) - else - if absx>1e50 - a = 1 - else - y = absx*sqrt(2) - a = 0 - for i in 40:-1:1 - a=i/(y+a) - end - a=1-exp(-x^2)/(y+a)*sqrt(2/π) - end - if x<0 - return -a - else - return a - end - end -end -erfc(x) = 1-erf(x) -norm_cdf(x) = 1/2*erfc(-x/sqrt(2)) - -function process_parameters(dim, mean, cov) - if dim === nothing - if mean === nothing - if cov === nothing - dim = 1 - else - cov = convert(Array{Float64}, cov) - if ndims(cov) < 2 - dim = 1 - else - dim = size(cov, 1) - end - end - else - mean = convert(Array{Float64}, mean) - dim = length(mean) - end - else - !isa(dim, Number) && throw(DimensionMismatch("dimension of random variable must be a scalar")) - end - - if mean === nothing - mean = zeros(dim) - end - mean = convert(Array{Float64}, mean) - - if cov === nothing - cov = [1.0] - end - cov = convert(Array{Float64}, cov) - - if dim == 1 - mean = reshape(mean, 1) - cov = reshape(cov, 1, 1) - end - - if ndims(mean) != 1 || size(mean, 1) != dim - throw(ArgumentError("array `mean` must be vector of length $dim")) - end - if ndims(cov) == 0 - cov = cov * Matrix{Float64}(I, dim, dim) - elseif ndims(cov) == 1 - cov = diag(cov) - else - size(cov) != (dim, dim) && throw(DimensionMismatch("array `cov` must be at most two-dimensional, but ndims(cov) = $(ndims(cov))")) - end - return dim, mean, cov -end - -function process_quantiles(x, dim) - x = convert(Array{Float64}, x) - - if ndims(x) == 0 - x = [x] - elseif ndims(x) == 1 - if dim == 1 - x = x[:, :] - else - x = x[newaxis, :] - end - end - return x -end - -function pinv_1d(v; _eps=1e-5) - return [(abs(x)<_eps) ? 0 : 1/x for x in v] -end - -function psd_pinv_decomposed_log_pdet(mat; cond=nothing, rcond=nothing) - u, s = eigvecs(mat), eigvals(mat) - - if rcond !== nothing - cond = rcond - end - if cond === nothing || cond == -1 - cond = 1e6 * Base.eps() - end - _eps = cond * maximum(abs.(s)) - - if minimum(s) < -_eps - throw(ArgumentError("the covariance matrix must be positive semidefinite")) - end - s_pinv = pinv_1d(s, _eps=_eps) - U = u .* sqrt.(s_pinv') - log_pdet = sum(log.(s[findall(s.>_eps)])) - - return U, log_pdet -end - -function squeeze_output(out) - if length(out) == 1 - out = Float64(out...) - else - out = vec(out) - end - return out -end - -function _logpdf(x, mean, prec_U, log_det_cov) - dim = size(x, ndims(x)) - dev = x - mean' - tmp = (dev * prec_U).^2 - maha = sum(tmp, dims=ndims(tmp)) - maha = dropdims(maha, dims=ndims(tmp)) - return -0.5 * ((dim*LOG_2π+log_det_cov).+maha) -end - -function pdf(x, mean, cov) - dim, mean, cov = process_parameters(nothing, mean, cov) - x = process_quantiles(x, dim) - prec_U, log_det_cov = psd_pinv_decomposed_log_pdet(cov) - out = exp.(_logpdf(x, mean, prec_U, log_det_cov)) - return squeeze_output(out) -end - -logpdf(x, mean, cov) = log.(pdf(x, mean, cov)) \ No newline at end of file +end \ No newline at end of file From 7822a7c9fd60d9f44a1bec64f7a04e7f2311424f Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 22:23:08 +0900 Subject: [PATCH 06/46] [test]update test deleted some utility functions --- test/hdbscan.jl | 39 ++++++++------------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index 8469e506..cc1441c0 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -1,34 +1,7 @@ using Test -include("../src/hdbscan.jl") +using Clustering @testset "HDBSCAN" begin - # test for util functions - # erf function tests - @test erf(0) == 0 - @test erf(-0) == -0 - @test isapprox(erf(1.0), 0.8427007929497148) - @test isapprox(erf(-1.0), -0.8427007929497148) - @test isapprox(erf(2.4), 0.999311486103355) - @test isapprox(erf(-2.4), -0.999311486103355) - - # test for HDBSCANGraph - graph = HDBSCANGraph(4) - add_edge(graph, (1, 3, 0.1)) - @test graph[1][1] == (3, 0.1) && graph[3][1] == (1, 0.1) - - # test for UnionFind - uf = UnionFind(10) - @test group(uf, 1) == 1 - @test sort(members(uf, 1)) == [1] - @test root(uf, 3) == 3 - @test issame(uf, 1, 2) == false - @test size(uf, 1) == 1 - @test unite!(uf, 1, 2) == true - @test issame(uf, 1, 2) == true - @test size(uf, 1) == 2 - @test size(uf, 2) == 2 - @test unite!(uf, 1, 2) == false - # make moons for test upper_x = [i for i in 0:π/50:π] lower_x = [i for i in π/2:π/50:3/2*π] @@ -36,7 +9,11 @@ include("../src/hdbscan.jl") lower_y = cos.(lower_x) .+ rand(51)./10 data = hcat([lower_x; upper_x], [lower_y; upper_y]) #test for main function - @test_throws DomainError hdbscan!(data, 5, 0) - @test_throws ArgumentError hdbscan!(data, 5, 3; gen_mst=false) - @test_nowarn result = hdbscan!(data, 5, 3) + @test_throws DomainError hdbscan(data, 5, 0) + @test_throws ArgumentError hdbscan(data, 5, 3; gen_mst=false) + @test_nowarn result = hdbscan(data, 5, 3) + + # tests for result + result = hdbscan(data, 5, 3) + @test sum([length(c) for c in result.clusters]) == 101 end \ No newline at end of file From 8901cfe9d14e49fdbd3a83be3a9e1f9c068f9b45 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 23:14:10 +0900 Subject: [PATCH 07/46] [docs]add docs for HDBSCAN Documentation workflow failed due to the docs for it --- docs/make.jl | 1 + docs/source/hdbscan.md | 11 +++++++++++ 2 files changed, 12 insertions(+) create mode 100644 docs/source/hdbscan.md diff --git a/docs/make.jl b/docs/make.jl index 556f471b..5455a1b8 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,6 +16,7 @@ makedocs( "mcl.md", "affprop.md", "dbscan.md", + "hdbscan.md", "fuzzycmeans.md", ], "validate.md", diff --git a/docs/source/hdbscan.md b/docs/source/hdbscan.md new file mode 100644 index 00000000..b61ce055 --- /dev/null +++ b/docs/source/hdbscan.md @@ -0,0 +1,11 @@ +# HDBSCAN + +Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. + +## Interface +The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function +```@docs +hdbscan +HdbscanResult +HdbscanCluster +``` \ No newline at end of file From 6de5d02907cba73c434fedd5c71bb5bf058f7168 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 23 Mar 2024 23:21:58 +0900 Subject: [PATCH 08/46] [fix]export HdbscanCluster I forgot to export it --- src/Clustering.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Clustering.jl b/src/Clustering.jl index a6540c47..d76d91e9 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -41,7 +41,7 @@ module Clustering DbscanResult, DbscanCluster, dbscan, # hdbscan - HdbscanResult, hdbscan, + HdbscanResult, HdbscanCluster, hdbscan, # fuzzy_cmeans fuzzy_cmeans, FuzzyCMeansResult, From 85c5644a823cf158dd2e465e37964d62021190ea Mon Sep 17 00:00:00 2001 From: MommawATASU Date: Mon, 15 Apr 2024 08:54:59 +0900 Subject: [PATCH 09/46] [docs]merge docs of HDBSCAN with DBSCAN.md there is no need to create new file --- docs/source/dbscan.md | 12 ++++++++++++ docs/source/hdbscan.md | 11 ----------- 2 files changed, 12 insertions(+), 11 deletions(-) delete mode 100644 docs/source/hdbscan.md diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md index b2c31b61..61e5ef73 100644 --- a/docs/source/dbscan.md +++ b/docs/source/dbscan.md @@ -46,3 +46,15 @@ dbscan DbscanResult DbscanCluster ``` + +# HDBSCAN + +Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. + +## Interface +The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function +```@docs +hdbscan +HdbscanResult +HdbscanCluster +``` diff --git a/docs/source/hdbscan.md b/docs/source/hdbscan.md deleted file mode 100644 index b61ce055..00000000 --- a/docs/source/hdbscan.md +++ /dev/null @@ -1,11 +0,0 @@ -# HDBSCAN - -Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. - -## Interface -The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function -```@docs -hdbscan -HdbscanResult -HdbscanCluster -``` \ No newline at end of file From edcc70ab90bf190a7970dab6e06bf2765f218862 Mon Sep 17 00:00:00 2001 From: MommawATASU Date: Mon, 15 Apr 2024 09:10:55 +0900 Subject: [PATCH 10/46] [clean]refactoring HDBSCANGraph add comment and alias --- src/hdbscan.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index f73ba676..beb0ba29 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -1,6 +1,10 @@ +# HDBSCAN Graph +# edge[i] is a list of edges adjacent to the i-th vertex? +# the second element of HDBSCANEdge is the mutual reachability distance. +HDBSCANEdge = Tuple{Int, Float64} struct HDBSCANGraph - edges::Vector{Vector{Tuple{Int, Float64}}} - HDBSCANGraph(n) = new([Tuple{Int, Float64}[] for _ in 1 : n]) + edges::Vector{Vector{HDBSCANEdge}} + HDBSCANGraph(n) = new([HDBSCANEdge[] for _ in 1 : n]) end function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64}) From 939ce652c56cfb170720d0c88d431bec3812fc3c Mon Sep 17 00:00:00 2001 From: MommawATASU Date: Fri, 26 Apr 2024 15:24:03 +0900 Subject: [PATCH 11/46] [docs]fix docs hdbscan.md remains --- docs/make.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 5455a1b8..556f471b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,7 +16,6 @@ makedocs( "mcl.md", "affprop.md", "dbscan.md", - "hdbscan.md", "fuzzycmeans.md", ], "validate.md", From 2f67d07375b3f0e682fb793b74ee86d2bd764567 Mon Sep 17 00:00:00 2001 From: MommawATASU Date: Fri, 26 Apr 2024 15:24:54 +0900 Subject: [PATCH 12/46] [clean]refactoring the progress is temporary --- src/hdbscan.jl | 62 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index beb0ba29..004675db 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -1,20 +1,26 @@ # HDBSCAN Graph -# edge[i] is a list of edges adjacent to the i-th vertex? +# edge[i] is a list of edges adjacent to the i-th vertex # the second element of HDBSCANEdge is the mutual reachability distance. HDBSCANEdge = Tuple{Int, Float64} struct HDBSCANGraph edges::Vector{Vector{HDBSCANEdge}} - HDBSCANGraph(n) = new([HDBSCANEdge[] for _ in 1 : n]) + HDBSCANGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv]) end -function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64}) - i, j, c = v - push!(G.edges[i], (j, c)) - push!(G.edges[j], (i, c)) +function add_edge!(G::HDBSCANGraph, v1::Integer, v2::Integer, dist::Number) + push!(G.edges[v1], (v2, dist)) + push!(G.edges[v2], (v1, dist)) end Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i] +struct MSTEdge + v1::Integer + v2::Integer + dist::Number +end +expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist) + """ HdbscanCluster(..., points, stability, ...) - `points`: vector of points which belongs to the cluster @@ -29,20 +35,17 @@ mutable struct HdbscanCluster λp::Vector{Float64} stability::Float64 children_stability::Float64 - function HdbscanCluster(noise::Bool, points::Vector{Int}) - if noise - return new(0, [], [], [], -1, -1) - else - return new(0, [], points, [], 0, 0) - end + function HdbscanCluster(points::Union{Vector{Int}, Nothing}) + noise = isnothing(points) + return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) end end Base.length(c::HdbscanCluster) = size(c.points, 1) isnoise(c::HdbscanCluster) = c.stability == -1 -hasstability(c::HdbscanCluster) = c.stability != 0 -function compute_stability(c::HdbscanCluster, λbirth) - c.stability += sum(c.λp.-λbirth) +isstable(c::HdbscanCluster) = c.stability != 0 +function increment_stability(c::HdbscanCluster, λbirth) + c.stability += sum(c.λp) - length(c.λ) * λbirth end """ @@ -51,10 +54,9 @@ end - `min_cluster_size`: minimum number of points in the cluster - `clusters`: result vector of clusters """ -mutable struct HdbscanResult - k::Int - min_cluster_size::Int +struct HdbscanResult clusters::Vector{HdbscanCluster} + assignments::Vector{Int} end """ @@ -105,21 +107,21 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst: end push!(result, HdbscanCluster(true, Int[])) result[end].points = findall(x->x==-1, noise_points) - return HdbscanResult(k, min_cluster_size, result) + return HdbscanResult(result, ) end -function core_dist(points, k) +# calculate the core distances of the points +function core_distances(points::AbstractMatrix, k::Integer) core_dists = Array{Float64}(undef, size(points, 1)) - for i in 1 : size(points, 1) - p = points[i:i, :] - dists = vec(sum((@. (points - p)^2), dims=2)) + for i in 1:size(points, 1) + dists = pairwise(Euclidean(), points[i:i, :], points; dims=2) sort!(dists) core_dists[i] = dists[k] end return core_dists end -function mutual_reachability_distance(core_dists, points) +function hdbscan_graph(core_dists::AbstractVector, points::AbstractMatrix) n = size(points, 1) graph = HDBSCANGraph(div(n * (n-1), 2)) for i in 1 : n-1 @@ -131,30 +133,30 @@ function mutual_reachability_distance(core_dists, points) return graph end -function prim(graph, n) +function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer) minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1) marked = falses(n) marked_cnt = 1 marked[1] = true - h = [] + h = MSTEdge[] for (i, c) in graph[1] - heappush!(h, (c, 1, i)) + heappush!(h, MSTEdge(1, i, c)) end while marked_cnt < n - c, i, j = popfirst!(h) + i, j, c = expand(popfirst!(h)) marked[j] == true && continue - minimum_spanning_tree[marked_cnt] = (c, i, j) + minimum_spanning_tree[marked_cnt] = MSTEdge(i, j, c) marked[j] = true marked_cnt += 1 for (k, c) in graph[j] marked[k] == true && continue - heappush!(h, (c, j, k)) + heappush!(h, MSTEdge(j, k, c)) end end return minimum_spanning_tree From 61463f2d88c6f1af05b8ba66ea60cc4b28bfccf1 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 20:42:18 +0900 Subject: [PATCH 13/46] [clean]refactoring add comment and improve performance, etc. --- src/hdbscan.jl | 161 ++++++++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 76 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 004675db..7bf6686f 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -20,6 +20,7 @@ struct MSTEdge dist::Number end expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist) +Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist """ HdbscanCluster(..., points, stability, ...) @@ -45,7 +46,7 @@ Base.length(c::HdbscanCluster) = size(c.points, 1) isnoise(c::HdbscanCluster) = c.stability == -1 isstable(c::HdbscanCluster) = c.stability != 0 function increment_stability(c::HdbscanCluster, λbirth) - c.stability += sum(c.λp) - length(c.λ) * λbirth + c.stability += sum(c.λp) - length(c.λp) * λbirth end """ @@ -76,25 +77,22 @@ The detail is so complex it is difficult to explain the detail in here. But, if - `gen_mst`: whether to generate minimum-spannig-tree or not - `mst`: when is specified and `gen_mst` is false, new mst won't be generated """ -function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) +function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=SqEuclidean()) if min_cluster_size < 1 throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) end - n = size(points, 1) - if gen_mst - #calculate core distances for each point - core_dists = core_dist(points, k) - #calculate mutual reachability distance between any two points - mrd = mutual_reachability_distance(core_dists, points) - #compute a minimum spanning tree by prim method - mst = prim(mrd, n) - elseif mst == nothing - throw(ArgumentError("if you set `gen_mst` to false, you must pass a minimum spanning tree as `mst`")) - end + n = size(points, 2) + dists = pairwise(metric, points; dims=2) + #calculate core distances for each point + core_dists = core_distances(dists, k) + #calculate mutual reachability distance between any two points + mrd = hdbscan_graph(core_dists, dists) + #compute a minimum spanning tree by prim method + mst = hdbscan_minspantree(mrd, n) #build a HDBSCAN hierarchy - hierarchy = build_hierarchy(mst, min_cluster_size) + hierarchy = hdbscan_clusters(mst, min_cluster_size) #extract the target cluster - extract_cluster!(hierarchy) + prune_cluster!(hierarchy) #generate the list of cluster assignment for each point result = HdbscanCluster[] noise_points = fill(-1, n) @@ -105,125 +103,127 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst: noise_points[k] = 0 end end - push!(result, HdbscanCluster(true, Int[])) + push!(result, HdbscanCluster(Int[])) result[end].points = findall(x->x==-1, noise_points) - return HdbscanResult(result, ) + assignments = Array{Int}(undef, length(points)) + for i in 1 : length(result)-1 + assignments[result[i].points] = i + end + assignments[result[end].points] .= -1 + return HdbscanResult(result, assignments) end # calculate the core distances of the points -function core_distances(points::AbstractMatrix, k::Integer) - core_dists = Array{Float64}(undef, size(points, 1)) - for i in 1:size(points, 1) - dists = pairwise(Euclidean(), points[i:i, :], points; dims=2) - sort!(dists) - core_dists[i] = dists[k] +function core_distances(dists::AbstractMatrix, k::Integer) + core_dists = Array{Float64}(undef, size(dists, 1)) + for i in axes(dists, 1) + dist = sort(dists[i, :]) + core_dists[i] = dist[k] end return core_dists end -function hdbscan_graph(core_dists::AbstractVector, points::AbstractMatrix) - n = size(points, 1) +function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) + n = size(dists, 1) graph = HDBSCANGraph(div(n * (n-1), 2)) for i in 1 : n-1 for j in i+1 : n - c = max(core_dists[i], core_dists[j], sum((points[i, :]-points[j, :]).^2)) - add_edge(graph, (i, j, c)) + c = max(core_dists[i], core_dists[j], dists[i, j]) + add_edge!(graph, i, j, c) end end return graph end function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer) - minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1) + function heapput!(h, v) + idx = searchsortedlast(h, v, rev=true) + insert!(h, (idx != 0) ? idx : 1, v) + end + + minspantree = Vector{MSTEdge}(undef, n-1) marked = falses(n) - marked_cnt = 1 + nmarked = 1 marked[1] = true h = MSTEdge[] for (i, c) in graph[1] - heappush!(h, MSTEdge(1, i, c)) + heapput!(h, MSTEdge(1, i, c)) end - while marked_cnt < n - i, j, c = expand(popfirst!(h)) + while nmarked < n + i, j, c = expand(pop!(h)) - marked[j] == true && continue - minimum_spanning_tree[marked_cnt] = MSTEdge(i, j, c) + marked[j] && continue + minspantree[nmarked] = MSTEdge(i, j, c) marked[j] = true - marked_cnt += 1 + nmarked += 1 for (k, c) in graph[j] - marked[k] == true && continue - heappush!(h, MSTEdge(j, k, c)) + marked[k] && continue + heapput!(h, MSTEdge(j, k, c)) end end - return minimum_spanning_tree + return minspantree end -function build_hierarchy(mst, min_size) +function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer) n = length(mst) + 1 cost = 0 uf = UnionFind(n) - Hierarchy = Array{HdbscanCluster}(undef, 2n-1) - if min_size == 1 - for i in 1 : n - Hierarchy[i] = HdbscanCluster(false, [i]) - end - else - for i in 1 : n - Hierarchy[i] = HdbscanCluster(true, Int[]) - end - end + clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n] sort!(mst) for i in 1 : n-1 - c, j, k = mst[i] + j, k, c = expand(mst[i]) cost += c λ = 1 / cost #child clusters c1 = group(uf, j) c2 = group(uf, k) #reference to the parent cluster - Hierarchy[c1].parent = Hierarchy[c2].parent = n+i - nc1, nc2 = isnoise(Hierarchy[c1]), isnoise(Hierarchy[c2]) + println(c1, c2, n+i) + clusters[c1].parent = clusters[c2].parent = n+i + nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) if !(nc1 || nc2) #compute stability - compute_stability(Hierarchy[c1], λ) - compute_stability(Hierarchy[c2], λ) + increment_stability(clusters[c1], λ) + increment_stability(clusters[c2], λ) #unite cluster unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) - Hierarchy[n+i] = HdbscanCluster(false, points) + push!(clusters, HdbscanCluster(points)) elseif !(nc1 && nc2) if nc2 == true (c1, c2) = (c2, c1) end #record the lambda value - append!(Hierarchy[c2].λp, fill(λ, length(Hierarchy[c1]))) + append!(clusters[c2].λp, fill(λ, length(clusters[c1]))) #unite cluster unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) - Hierarchy[n+i] = HdbscanCluster(false, points) + push!(clusters, HdbscanCluster(points)) else #unite the noise cluster unite!(uf, j, k) #create parent cluster points = members(uf, group(uf, j)) if length(points) < min_size - Hierarchy[n+i] = HdbscanCluster(true, Int[]) + push!(clusters, HdbscanCluster(Int[])) else - Hierarchy[n+i] = HdbscanCluster(false, points) + push!(clusters, HdbscanCluster(points)) end end end - return Hierarchy + @assert length(clusters) == 2n - 1 + return clusters end -function extract_cluster!(hierarchy::Vector{HdbscanCluster}) +function prune_cluster!(hierarchy::Vector{HdbscanCluster}) for i in 1 : length(hierarchy)-1 if isnoise(hierarchy[i]) c = hierarchy[i] @@ -235,6 +235,7 @@ function extract_cluster!(hierarchy::Vector{HdbscanCluster}) push!(hierarchy[c.parent].children, i) hierarchy[c.parent].children_stability += c.stability else + println(c.parent, i) append!(hierarchy[c.parent].children, c.children) hierarchy[c.parent].children_stability += c.children_stability end @@ -245,27 +246,34 @@ end # Below are utility functions for building hierarchical trees heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v) -mutable struct UnionFind{T <: Integer} - parent:: Vector{T} # parent[root] is the negative of the size +# Union-Find +# structure for managing disjoint sets +# This structure tracks which sets the elements of a set belong to, +# and allows us to efficiently determine whether two elements belong to the same set. +mutable struct UnionFind + parent:: Vector{Integer} # parent[root] is the negative of the size label::Dict{Int, Int} - cnt::Int + next_id::Int - function UnionFind{T}(nodes::T) where T<:Integer + function UnionFind(nodes::Integer) if nodes <= 0 throw(ArgumentError("invalid argument for nodes: $nodes")) end - parent = -ones(T, nodes) + parent = -ones(nodes) label = Dict([(i, i) for i in 1 : nodes]) - new{T}(parent, label, nodes) + new(parent, label, nodes) end end -UnionFind(nodes::Integer) = UnionFind{typeof(nodes)}(nodes) -group(uf::UnionFind, x)::Int = uf.label[root(uf, x)] +# label of the set which element `x` belong to +group(uf::UnionFind, x) = uf.label[root(uf, x)] +# all elements that have the specified label members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label))) -function root(uf::UnionFind{T}, x::T)::T where T<:Integer +# root of element `x` +# The root is the representative element of the set +function root(uf::UnionFind, x::Integer) if uf.parent[x] < 0 return x else @@ -273,15 +281,16 @@ function root(uf::UnionFind{T}, x::T)::T where T<:Integer end end -function issame(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer +# whether element `x` and `y` belong to the same set +function issame(uf::UnionFind, x::Integer, y::Integer) return root(uf, x) == root(uf, y) end -function Base.size(uf::UnionFind{T}, x::T)::T where T<:Integer +function Base.size(uf::UnionFind, x::Integer) return -uf.parent[root(uf, x)] end -function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer +function unite!(uf::UnionFind, x::Integer, y::Integer) x = root(uf, x) y = root(uf, y) if x == y @@ -293,10 +302,10 @@ function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer # unite smaller tree(y) to bigger one(x) uf.parent[x] += uf.parent[y] uf.parent[y] = x - uf.cnt += 1 - uf.label[y] = uf.cnt + uf.next_id += 1 + uf.label[y] = uf.next_id for i in members(uf, group(uf, x)) - uf.label[i] = uf.cnt + uf.label[i] = uf.next_id end return true end \ No newline at end of file From 09ed174053adf7a7f3a734ae2a44d606c5f213fd Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 20:44:41 +0900 Subject: [PATCH 14/46] [test]update test changes sugges alyst --- test/hdbscan.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index cc1441c0..0702b278 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -7,13 +7,13 @@ using Clustering lower_x = [i for i in π/2:π/50:3/2*π] upper_y = sin.(upper_x) .+ rand(50)./10 lower_y = cos.(lower_x) .+ rand(51)./10 - data = hcat([lower_x; upper_x], [lower_y; upper_y]) + data = hcat([lower_x; upper_x], [lower_y; upper_y])' #test for main function @test_throws DomainError hdbscan(data, 5, 0) - @test_throws ArgumentError hdbscan(data, 5, 3; gen_mst=false) @test_nowarn result = hdbscan(data, 5, 3) # tests for result - result = hdbscan(data, 5, 3) - @test sum([length(c) for c in result.clusters]) == 101 + result = @inferred(hdbscan(data, 5, 3)) + @test isa(result, HdbscanResult) + @test sum(length, result.clusters) == 101 end \ No newline at end of file From 6039c0cb9fe4aabf088fb7d9e120d473185f4952 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 21:01:18 +0900 Subject: [PATCH 15/46] [fix]change isnothing into === error occured with Julia1.10 --- src/hdbscan.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 7bf6686f..8f6afde9 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -37,7 +37,7 @@ mutable struct HdbscanCluster stability::Float64 children_stability::Float64 function HdbscanCluster(points::Union{Vector{Int}, Nothing}) - noise = isnothing(points) + noise = points === nothing return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) end end From 3cc76897449466485332536b95ff3c7fbfc5238a Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 21:06:39 +0900 Subject: [PATCH 16/46] [fix]remove println forgot to remove debugging code --- src/hdbscan.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 8f6afde9..5ab4c829 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -184,7 +184,6 @@ function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer) c1 = group(uf, j) c2 = group(uf, k) #reference to the parent cluster - println(c1, c2, n+i) clusters[c1].parent = clusters[c2].parent = n+i nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) if !(nc1 || nc2) @@ -235,7 +234,6 @@ function prune_cluster!(hierarchy::Vector{HdbscanCluster}) push!(hierarchy[c.parent].children, i) hierarchy[c.parent].children_stability += c.stability else - println(c.parent, i) append!(hierarchy[c.parent].children, c.children) hierarchy[c.parent].children_stability += c.children_stability end From 1798148d103df298cc30b933fc9844b83ec76862 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 23:22:02 +0900 Subject: [PATCH 17/46] [docs]update docs add description for detail --- docs/source/dbscan.md | 35 +++++++++++++++++++++++++++++++- src/hdbscan.jl | 47 ++++++++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md index 61e5ef73..056a5ed7 100644 --- a/docs/source/dbscan.md +++ b/docs/source/dbscan.md @@ -49,7 +49,40 @@ DbscanCluster # HDBSCAN -Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. +Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. The algorithm was proposed in: + +> Ricardo J. G. B. Campello, Davoud Moulavi & Joerg Sander +> *Density-Based Clustering Based on Hierarchical Density Estimates* 2013 + +## Algorithm +The main procedure of HDBSCAN: +1. calculate the *mutual reachability distance* +2. generate a *minimum spanning tree* +3. build hierarchy +4. extract the target cluster + +### Calculate the *mutual reachability distance* +DBSCAN counts the number of points at a certain distance. But, HDBSCAN uses the opposite way, First, calculate the pairwise distances. Next, defined the core distance(``core_{ncore}(p)``) as a distance with the ncore-th closest point. And then, the mutual reachability distance between point `a` and `b` is defined as: ``d_{mreach-ncore}(a, b) = max\{ core_{ncore}(a), core_{ncore}(b), d(a, b) \}`` where ``d(a, b)`` is a euclidean distance between `a` and `b`(or specified metric). + +### Generate a *minimum-spanning tree(MST)* +Conceptually what we will do is the following: consider the data as a weighted graph with the data points as vertices and an edge between any two points with weight equal to the mutual reachability distance of those points. + +Then, check the list of edges in descending order of distance whether the points which is connected by it is marked or not. If not, add the edge into the MST. After this processing, all data points are connected to MST with minimum weight. + +In practice, this is very expensive since there are ``n^{2}`` edges. + +### Build hierarchy +The next step is to build hierarchy based on MST. This step is very simple: repeatedly unite the clusters which has the minimum edge until all cluster are united into one cluster. + +### Extract the target cluster +First we need a different measure than distance to consider the persistence of clusters; instead we will use ``\lambda = \frac{1}{\mathrm{distance}}``. +For a given cluster we can then define values ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` to be the lambda value when the cluster split off and became it’s own cluster, and the lambda value (if any) when the cluster split into smaller clusters respectively. +In turn, for a given cluster, for each point ``p`` in that cluster we can define the value ``\lambda_{p}`` as the lambda value at which that point ‘fell out of the cluster’ which is a value somewhere between ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` since the point either falls out of the cluster at some point in the cluster’s lifetime, or leaves the cluster when the cluster splits into two smaller clusters. +Now, for each cluster compute the stability as + +``\sum_{p \in \mathrm{cluster}} (\lambda_{p} - \lambda_{\mathrm{birth}})``. + +Declare all leaf nodes to be selected clusters. Now work up through the tree (the reverse topological sort order). If the sum of the stabilities of the child clusters is greater than the stability of the cluster, then we set the cluster stability to be the sum of the child stabilities. If, on the other hand, the cluster’s stability is greater than the sum of its children then we declare the cluster to be a selected cluster and unselect all its descendants. Once we reach the root node we call the current set of selected clusters our flat clustering and return it as the result of clustering. ## Interface The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 5ab4c829..8cef620c 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -23,11 +23,14 @@ expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist) Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist """ - HdbscanCluster(..., points, stability, ...) -- `points`: vector of points which belongs to the cluster -- `stability`: stablity of the cluster(-1 for noise clusters) + HdbscanCluster +A cluster generated by the [hdbscan](@ref) method, part of [HdbscanResult](@ref). + - `points`: vector of points which belongs to the cluster + - `stability`: stability of the cluster (-1 for noise clusters) -You can use `length` to know the number of pooints in the cluster. And `isnoise` funciton is also available to know whether the cluster is noise or not. +The stability represents how much the cluster is "reasonable". So, a cluster which has a bigger stability is better. +The noise cluster is determined not to belong to any cluster. So, you can ignore them. +See also: [isnoise](@ref) """ mutable struct HdbscanCluster parent::Int @@ -43,6 +46,10 @@ mutable struct HdbscanCluster end Base.length(c::HdbscanCluster) = size(c.points, 1) +""" + isnoise(c::HdbscanCluster) +This function returns whether the cluster is the noise or not. +""" isnoise(c::HdbscanCluster) = c.stability == -1 isstable(c::HdbscanCluster) = c.stability != 0 function increment_stability(c::HdbscanCluster, λbirth) @@ -50,10 +57,10 @@ function increment_stability(c::HdbscanCluster, λbirth) end """ - HdbscanResult(k, minpts, clusters) -- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. -- `min_cluster_size`: minimum number of points in the cluster -- `clusters`: result vector of clusters + HdbscanResult +Result of the [hdbscan](@ref) clustering. +- `clusters`: vector of clusters +- `assignments`: vectors of assignments for each points """ struct HdbscanResult clusters::Vector{HdbscanCluster} @@ -61,23 +68,17 @@ struct HdbscanResult end """ - hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing) -Density-Based Clustering Based on Hierarchical Density Estimates. -This algorithm performs clustering as follows. -1. generate a minimum spanning tree -2. build a HDBSCAN hierarchy -3. extract the target cluster -4. generate the list of cluster assignment for each point -The detail is so complex it is difficult to explain the detail in here. But, if you want to know more about this algorithm, you should read [this docs](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). - + hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Integer; + metric=SqEuclidean()) +Cluster `points` using Density-Based Clustering Based on Hierarchical Density Estimates (HDBSCAN) algorithm. +Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details on how the algorithm works. # Parameters -- `points`: the d×n matrix, where each column is a d-dimensional coordinate of a point -- `k`: we will define "core distance of point A" as the distance between point A and the `k` th neighbor point of point A. -- `min_cluster_size`: minimum number of points in the cluster -- `gen_mst`: whether to generate minimum-spannig-tree or not -- `mst`: when is specified and `gen_mst` is false, new mst won't be generated +- `points`: the *d*×*n* matrix, where each column is a *d*-dimensional point +- `ncore::Integer`: number of *core* neighbors of point, see [HDBSCAN algorithm](@ref hdbscan_algorithm) for a description +- `min_cluster_size::Integer`: minimum number of points in the cluster +- `metric`(defaults to Euclidean): the points distance metric to use. """ -function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=SqEuclidean()) +function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=Euclidean()) if min_cluster_size < 1 throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) end From a0a819ea89914f1fd71de027d703c9e482db3381 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 23:29:05 +0900 Subject: [PATCH 18/46] [docs]fix docs fix links --- docs/source/dbscan.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md index 056a5ed7..147872c1 100644 --- a/docs/source/dbscan.md +++ b/docs/source/dbscan.md @@ -54,7 +54,7 @@ Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN > Ricardo J. G. B. Campello, Davoud Moulavi & Joerg Sander > *Density-Based Clustering Based on Hierarchical Density Estimates* 2013 -## Algorithm +## [Algorithm](@id hdbscan_algorithm) The main procedure of HDBSCAN: 1. calculate the *mutual reachability distance* 2. generate a *minimum spanning tree* @@ -90,4 +90,5 @@ The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) funct hdbscan HdbscanResult HdbscanCluster +isnoise ``` From bf38eb6f475b164edf49039ea01782955e819240 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 23:43:52 +0900 Subject: [PATCH 19/46] [fix]fix docstring add space --- src/hdbscan.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 8cef620c..0e46bcc8 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -46,6 +46,7 @@ mutable struct HdbscanCluster end Base.length(c::HdbscanCluster) = size(c.points, 1) + """ isnoise(c::HdbscanCluster) This function returns whether the cluster is the noise or not. From 380acf17d2e31146cb1918d428537e43fb8be930 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sat, 27 Apr 2024 23:48:13 +0900 Subject: [PATCH 20/46] [fix]add isnoise to the list of exprted function make isnoise available for user --- src/Clustering.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Clustering.jl b/src/Clustering.jl index d76d91e9..a97cba89 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -41,7 +41,7 @@ module Clustering DbscanResult, DbscanCluster, dbscan, # hdbscan - HdbscanResult, HdbscanCluster, hdbscan, + HdbscanResult, HdbscanCluster, hdbscan, isnoise, # fuzzy_cmeans fuzzy_cmeans, FuzzyCMeansResult, From 6fdebe603198290ae45ced877e540f1ebac9640b Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 11:19:41 -0700 Subject: [PATCH 21/46] core_distances() tweaks --- src/hdbscan.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 0e46bcc8..2bd8971c 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -115,12 +115,11 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E return HdbscanResult(result, assignments) end -# calculate the core distances of the points +# calculate the core (k-th nearest) distances of the points function core_distances(dists::AbstractMatrix, k::Integer) core_dists = Array{Float64}(undef, size(dists, 1)) - for i in axes(dists, 1) - dist = sort(dists[i, :]) - core_dists[i] = dist[k] + for i in axes(dists, 2) + core_dists[i] = partialsort!(dists[:, i], k) end return core_dists end From ff98806679f2625ddfa050595b78cc4bbfe54b6a Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 11:36:56 -0700 Subject: [PATCH 22/46] Hdbscan: simplify results generation --- src/hdbscan.jl | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 2bd8971c..b4db9766 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -96,23 +96,16 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E #extract the target cluster prune_cluster!(hierarchy) #generate the list of cluster assignment for each point - result = HdbscanCluster[] - noise_points = fill(-1, n) + clusters = HdbscanCluster[] + assignments = fill(0, length(points)) # cluster index of each point for (i, j) in enumerate(hierarchy[2n-1].children) - c = hierarchy[j] - push!(result, c) - for k in c.points - noise_points[k] = 0 - end - end - push!(result, HdbscanCluster(Int[])) - result[end].points = findall(x->x==-1, noise_points) - assignments = Array{Int}(undef, length(points)) - for i in 1 : length(result)-1 - assignments[result[i].points] = i + clu = hierarchy[j] + push!(clusters, clu) + assignments[clu.points] .= i end - assignments[result[end].points] .= -1 - return HdbscanResult(result, assignments) + # add the cluster of all unassigned (noise) points + push!(clusters, HdbscanCluster(findall(==(0), assignments))) + return HdbscanResult(clusters, assignments) end # calculate the core (k-th nearest) distances of the points From 64a202a7659c839f12016ab495f95e8ae1a1826e Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:41:24 -0700 Subject: [PATCH 23/46] remove heappush!: unused --- src/hdbscan.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index b4db9766..314ceb07 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -235,9 +235,6 @@ function prune_cluster!(hierarchy::Vector{HdbscanCluster}) end end -# Below are utility functions for building hierarchical trees -heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v) - # Union-Find # structure for managing disjoint sets # This structure tracks which sets the elements of a set belong to, From 661edbc5b26409b5d4a684a1714c04d90f6ab548 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:54:18 -0700 Subject: [PATCH 24/46] hdbscan test: small tweaks --- test/hdbscan.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index 0702b278..d8f8fc01 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -7,13 +7,14 @@ using Clustering lower_x = [i for i in π/2:π/50:3/2*π] upper_y = sin.(upper_x) .+ rand(50)./10 lower_y = cos.(lower_x) .+ rand(51)./10 - data = hcat([lower_x; upper_x], [lower_y; upper_y])' + data = hcat([lower_x; upper_x], + [lower_y; upper_y])' #test for main function @test_throws DomainError hdbscan(data, 5, 0) - @test_nowarn result = hdbscan(data, 5, 3) + @test_nowarn @inferred(hdbscan(data, 5, 3)) # tests for result result = @inferred(hdbscan(data, 5, 3)) @test isa(result, HdbscanResult) - @test sum(length, result.clusters) == 101 + @test sum(length, result.clusters) == size(data, 2) end \ No newline at end of file From 082c5e6f77d094576625d5160ae4a9f373c63c36 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:55:15 -0700 Subject: [PATCH 25/46] fixup hdbscan assignments --- src/hdbscan.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 314ceb07..9251c7dc 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -97,14 +97,15 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E prune_cluster!(hierarchy) #generate the list of cluster assignment for each point clusters = HdbscanCluster[] - assignments = fill(0, length(points)) # cluster index of each point + assignments = fill(0, n) # cluster index of each point for (i, j) in enumerate(hierarchy[2n-1].children) clu = hierarchy[j] push!(clusters, clu) assignments[clu.points] .= i end # add the cluster of all unassigned (noise) points - push!(clusters, HdbscanCluster(findall(==(0), assignments))) + noise_points = findall(==(0), assignments) + isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points)) return HdbscanResult(clusters, assignments) end From c9db368c9cc08d61935c8c9f1644b42eff1de7d3 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:55:57 -0700 Subject: [PATCH 26/46] hdbscan: further opt core_dists we don't really need a function, this is one line operation --- src/hdbscan.jl | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 9251c7dc..e5652ca8 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -79,14 +79,15 @@ Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details - `min_cluster_size::Integer`: minimum number of points in the cluster - `metric`(defaults to Euclidean): the points distance metric to use. """ -function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=Euclidean()) +function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; metric=Euclidean()) if min_cluster_size < 1 throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) end n = size(points, 2) dists = pairwise(metric, points; dims=2) - #calculate core distances for each point - core_dists = core_distances(dists, k) + # calculate core (ncore-th nearest) distance for each point + core_dists = [partialsort(i_dists, ncore) for i_dists in eachcol(dists)] + #calculate mutual reachability distance between any two points mrd = hdbscan_graph(core_dists, dists) #compute a minimum spanning tree by prim method @@ -109,15 +110,6 @@ function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; metric=E return HdbscanResult(clusters, assignments) end -# calculate the core (k-th nearest) distances of the points -function core_distances(dists::AbstractMatrix, k::Integer) - core_dists = Array{Float64}(undef, size(dists, 1)) - for i in axes(dists, 2) - core_dists[i] = partialsort!(dists[:, i], k) - end - return core_dists -end - function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) n = size(dists, 1) graph = HDBSCANGraph(div(n * (n-1), 2)) From c205575d66af1d762b693f4fc3acc907ecee8b94 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:56:59 -0700 Subject: [PATCH 27/46] hdbscan: optimize edges generation --- src/hdbscan.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index e5652ca8..23881e43 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -113,9 +113,10 @@ end function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) n = size(dists, 1) graph = HDBSCANGraph(div(n * (n-1), 2)) - for i in 1 : n-1 - for j in i+1 : n - c = max(core_dists[i], core_dists[j], dists[i, j]) + for (i, i_dists) in enumerate(eachcol(dists)) + i_core = core_dists[i] + for j in i+1:n + c = max(i_core, core_dists[j], i_dists[j]) add_edge!(graph, i, j, c) end end From bdf1aec3c29234291e5e7a8afa5897e7de45033f Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 13:59:02 -0700 Subject: [PATCH 28/46] HDBSCANGraph -> HdbscanGraph for consistency --- src/hdbscan.jl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 23881e43..fa0dbf4b 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -2,17 +2,18 @@ # edge[i] is a list of edges adjacent to the i-th vertex # the second element of HDBSCANEdge is the mutual reachability distance. HDBSCANEdge = Tuple{Int, Float64} -struct HDBSCANGraph +struct HdbscanGraph edges::Vector{Vector{HDBSCANEdge}} - HDBSCANGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv]) + + HdbscanGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv]) end -function add_edge!(G::HDBSCANGraph, v1::Integer, v2::Integer, dist::Number) +function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number) push!(G.edges[v1], (v2, dist)) push!(G.edges[v2], (v1, dist)) end -Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i] +Base.getindex(G::HdbscanGraph, i::Int) = G.edges[i] struct MSTEdge v1::Integer @@ -112,7 +113,7 @@ end function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) n = size(dists, 1) - graph = HDBSCANGraph(div(n * (n-1), 2)) + graph = HdbscanGraph(div(n * (n-1), 2)) for (i, i_dists) in enumerate(eachcol(dists)) i_core = core_dists[i] for j in i+1:n @@ -123,7 +124,7 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) return graph end -function hdbscan_minspantree(graph::HDBSCANGraph, n::Integer) +function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) function heapput!(h, v) idx = searchsortedlast(h, v, rev=true) insert!(h, (idx != 0) ? idx : 1, v) From 9aa984122bc5186d4ddb4486f0dddb9053104231 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:01:10 -0700 Subject: [PATCH 29/46] HdbscanEdge --- src/hdbscan.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index fa0dbf4b..1dfda402 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -1,11 +1,11 @@ +# HDBSCAN Graph edge: target vertex and mutual reachability distance. +HdbscanEdge = Tuple{Int, Float64} + # HDBSCAN Graph -# edge[i] is a list of edges adjacent to the i-th vertex -# the second element of HDBSCANEdge is the mutual reachability distance. -HDBSCANEdge = Tuple{Int, Float64} struct HdbscanGraph - edges::Vector{Vector{HDBSCANEdge}} + edges::Vector{Vector{HdbscanEdge}} - HdbscanGraph(nv::Integer) = new([HDBSCANEdge[] for _ in 1 : nv]) + HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv]) end function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number) From c9b9374c0c6996399c4e4e4838af2a85e474d16d Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:05:27 -0700 Subject: [PATCH 30/46] HdbscanGraph: rename edges to adj_edges and remove Base.getindex() --- src/hdbscan.jl | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 1dfda402..780414a8 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -3,18 +3,16 @@ HdbscanEdge = Tuple{Int, Float64} # HDBSCAN Graph struct HdbscanGraph - edges::Vector{Vector{HdbscanEdge}} + adj_edges::Vector{Vector{HdbscanEdge}} HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv]) end function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number) - push!(G.edges[v1], (v2, dist)) - push!(G.edges[v2], (v1, dist)) + push!(G.adj_edges[v1], (v2, dist)) + push!(G.adj_edges[v2], (v1, dist)) end -Base.getindex(G::HdbscanGraph, i::Int) = G.edges[i] - struct MSTEdge v1::Integer v2::Integer @@ -137,8 +135,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) marked[1] = true h = MSTEdge[] - - for (i, c) in graph[1] + + for (i, c) in graph.adj_edges[1] heapput!(h, MSTEdge(1, i, c)) end @@ -149,8 +147,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) minspantree[nmarked] = MSTEdge(i, j, c) marked[j] = true nmarked += 1 - - for (k, c) in graph[j] + + for (k, c) in graph.adj_edges[j] marked[k] && continue heapput!(h, MSTEdge(j, k, c)) end From 092ac4017b23a17abb74d71bd16c1b1c98724aa3 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:06:06 -0700 Subject: [PATCH 31/46] MSTEdge remove no-op expand() method --- src/hdbscan.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 780414a8..f3dd6745 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -18,7 +18,7 @@ struct MSTEdge v2::Integer dist::Number end -expand(edge::MSTEdge) = (edge.v1, edge.v2, edge.dist) + Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist """ @@ -141,8 +141,8 @@ function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) end while nmarked < n - i, j, c = expand(pop!(h)) - + i, j, c = pop!(h) + marked[j] && continue minspantree[nmarked] = MSTEdge(i, j, c) marked[j] = true @@ -164,7 +164,7 @@ function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer) sort!(mst) for i in 1 : n-1 - j, k, c = expand(mst[i]) + j, k, c = mst[i] cost += c λ = 1 / cost #child clusters From c972caad96a6f2bdaa4cc1297086c3df543a41c1 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:12:49 -0700 Subject: [PATCH 32/46] refactor HdbscanMSTEdge --- src/hdbscan.jl | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index f3dd6745..f2ca31cc 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -13,13 +13,8 @@ function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number) push!(G.adj_edges[v2], (v1, dist)) end -struct MSTEdge - v1::Integer - v2::Integer - dist::Number -end - -Base.isless(edge1::MSTEdge, edge2::MSTEdge) = edge1.dist < edge2.dist +# Edge of the minimum spanning tree for HDBScan algorithm +HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}} """ HdbscanCluster @@ -124,39 +119,39 @@ end function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) function heapput!(h, v) - idx = searchsortedlast(h, v, rev=true) + idx = searchsortedlast(h, v, by=e -> e.dist, rev=true) insert!(h, (idx != 0) ? idx : 1, v) end - minspantree = Vector{MSTEdge}(undef, n-1) - + minspantree = Vector{HdbscanMSTEdge}(undef, n-1) + marked = falses(n) nmarked = 1 marked[1] = true - - h = MSTEdge[] + + h = HdbscanMSTEdge[] for (i, c) in graph.adj_edges[1] - heapput!(h, MSTEdge(1, i, c)) + heapput!(h, (v1=1, v2=i, dist=c)) end while nmarked < n - i, j, c = pop!(h) + (i, j, c) = pop!(h) marked[j] && continue - minspantree[nmarked] = MSTEdge(i, j, c) + minspantree[nmarked] = (v1=i, v2=j, dist=c) marked[j] = true nmarked += 1 for (k, c) in graph.adj_edges[j] marked[k] && continue - heapput!(h, MSTEdge(j, k, c)) + heapput!(h, (v1=j, v2=k, dist=c)) end end return minspantree end -function hdbscan_clusters(mst::AbstractVector{MSTEdge}, min_size::Integer) +function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer) n = length(mst) + 1 cost = 0 uf = UnionFind(n) From 57b05e6c92408081132fc55fd537a5651cadd909 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:27:02 -0700 Subject: [PATCH 33/46] hdbscan: fix graph vertices, remove add_edges! add_edges!() is used only once --- src/hdbscan.jl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index f2ca31cc..f30ab247 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -8,11 +8,6 @@ struct HdbscanGraph HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv]) end -function add_edge!(G::HdbscanGraph, v1::Integer, v2::Integer, dist::Number) - push!(G.adj_edges[v1], (v2, dist)) - push!(G.adj_edges[v2], (v1, dist)) -end - # Edge of the minimum spanning tree for HDBScan algorithm HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}} @@ -106,12 +101,14 @@ end function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) n = size(dists, 1) - graph = HdbscanGraph(div(n * (n-1), 2)) + graph = HdbscanGraph(n) for (i, i_dists) in enumerate(eachcol(dists)) i_core = core_dists[i] for j in i+1:n c = max(i_core, core_dists[j], i_dists[j]) - add_edge!(graph, i, j, c) + # add reciprocal edges + push!(graph.adj_edges[i], (j, c)) + push!(graph.adj_edges[j], (i, c)) end end return graph From f6167951857a56684d353dcba8e4ebab72fc6671 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:28:18 -0700 Subject: [PATCH 34/46] hdbscan_minspantree(): refactor --- src/hdbscan.jl | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index f30ab247..b1667dda 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -80,7 +80,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; #calculate mutual reachability distance between any two points mrd = hdbscan_graph(core_dists, dists) #compute a minimum spanning tree by prim method - mst = hdbscan_minspantree(mrd, n) + mst = hdbscan_minspantree(mrd) #build a HDBSCAN hierarchy hierarchy = hdbscan_clusters(mst, min_cluster_size) #extract the target cluster @@ -114,35 +114,36 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) return graph end -function hdbscan_minspantree(graph::HdbscanGraph, n::Integer) - function heapput!(h, v) +function hdbscan_minspantree(graph::HdbscanGraph) + function heapput!(h, v::HdbscanMSTEdge) idx = searchsortedlast(h, v, by=e -> e.dist, rev=true) insert!(h, (idx != 0) ? idx : 1, v) end - minspantree = Vector{HdbscanMSTEdge}(undef, n-1) - - marked = falses(n) - nmarked = 1 - marked[1] = true - - h = HdbscanMSTEdge[] - + # initialize the edges heap by putting all edges of the first node (the root) + heap = HdbscanMSTEdge[] for (i, c) in graph.adj_edges[1] - heapput!(h, (v1=1, v2=i, dist=c)) + heapput!(heap, (v1=1, v2=i, dist=c)) end - - while nmarked < n - (i, j, c) = pop!(h) - marked[j] && continue - minspantree[nmarked] = (v1=i, v2=j, dist=c) - marked[j] = true - nmarked += 1 + # build the tree + n = length(graph.adj_edges) + minspantree = Vector{HdbscanMSTEdge}() + inmst = falses(n) # if the graph node is in MST + inmst[1] = true # root + + while length(minspantree) < n-1 + # get the next edge with maximal? distance + (i, j, c) = pop!(heap) + + # add j-th node to MST if not there + inmst[j] && continue + push!(minspantree, (v1=i, v2=j, dist=c)) + inmst[j] = true + # add adjacent edges of j to the heap for (k, c) in graph.adj_edges[j] - marked[k] && continue - heapput!(h, (v1=j, v2=k, dist=c)) + inmst[k] || heapput!(heap, (v1=j, v2=k, dist=c)) end end return minspantree From 0f6e9922e6b7da32b4167f277da7c7b16f75dc4c Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 14:32:41 -0700 Subject: [PATCH 35/46] prune_clusters!(): cleanup --- src/hdbscan.jl | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index b1667dda..e4dbebfc 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -84,7 +84,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; #build a HDBSCAN hierarchy hierarchy = hdbscan_clusters(mst, min_cluster_size) #extract the target cluster - prune_cluster!(hierarchy) + prune_clusters!(hierarchy) #generate the list of cluster assignment for each point clusters = HdbscanCluster[] assignments = fill(0, n) # cluster index of each point @@ -202,21 +202,16 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer return clusters end -function prune_cluster!(hierarchy::Vector{HdbscanCluster}) - for i in 1 : length(hierarchy)-1 - if isnoise(hierarchy[i]) - c = hierarchy[i] - push!(hierarchy[c.parent].children, i) - hierarchy[c.parent].children_stability += c.stability +function prune_clusters!(hierarchy::Vector{HdbscanCluster}) + for i in 1:length(hierarchy)-1 + c = hierarchy[i] + parent = hierarchy[c.parent] + if isnoise(c) || c.stability > c.children_stability + push!(parent.children, i) + parent.children_stability += c.stability else - c = hierarchy[i] - if c.stability > c.children_stability - push!(hierarchy[c.parent].children, i) - hierarchy[c.parent].children_stability += c.stability - else - append!(hierarchy[c.parent].children, c.children) - hierarchy[c.parent].children_stability += c.children_stability - end + append!(parent.children, c.children) + parent.children_stability += c.children_stability end end end From e01609bcd35b670a1d615d48c45c3732992e8a86 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 15:08:08 -0700 Subject: [PATCH 36/46] hdbscan: fix 1.0 compat replace eachcol with alternatives --- src/hdbscan.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index e4dbebfc..fb8cf812 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -75,7 +75,7 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; n = size(points, 2) dists = pairwise(metric, points; dims=2) # calculate core (ncore-th nearest) distance for each point - core_dists = [partialsort(i_dists, ncore) for i_dists in eachcol(dists)] + core_dists = [partialsort!(dists[:, i], ncore) for i in axes(dists, 2)] #calculate mutual reachability distance between any two points mrd = hdbscan_graph(core_dists, dists) @@ -102,7 +102,8 @@ end function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) n = size(dists, 1) graph = HdbscanGraph(n) - for (i, i_dists) in enumerate(eachcol(dists)) + for i in axes(dists, 2) + i_dists = view(dists, :, i) i_core = core_dists[i] for j in i+1:n c = max(i_core, core_dists[j], i_dists[j]) From de6b83a47e1067d080f409ba95713cc166b4fd35 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sun, 28 Apr 2024 07:36:02 +0900 Subject: [PATCH 37/46] [docs]fix docstring add newline --- src/hdbscan.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index fb8cf812..39bbfb1d 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -13,6 +13,7 @@ HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}} """ HdbscanCluster + A cluster generated by the [hdbscan](@ref) method, part of [HdbscanResult](@ref). - `points`: vector of points which belongs to the cluster - `stability`: stability of the cluster (-1 for noise clusters) @@ -38,6 +39,7 @@ Base.length(c::HdbscanCluster) = size(c.points, 1) """ isnoise(c::HdbscanCluster) + This function returns whether the cluster is the noise or not. """ isnoise(c::HdbscanCluster) = c.stability == -1 @@ -48,6 +50,7 @@ end """ HdbscanResult + Result of the [hdbscan](@ref) clustering. - `clusters`: vector of clusters - `assignments`: vectors of assignments for each points @@ -60,6 +63,7 @@ end """ hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Integer; metric=SqEuclidean()) + Cluster `points` using Density-Based Clustering Based on Hierarchical Density Estimates (HDBSCAN) algorithm. Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details on how the algorithm works. # Parameters From 38212cac3633b2ad50947fb04310f67a2daff873 Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sun, 28 Apr 2024 07:53:44 +0900 Subject: [PATCH 38/46] [clean]rename and refactoring cleanup UnionFind terminology and move it to the other file --- src/Clustering.jl | 1 + src/hdbscan.jl | 75 ++++------------------------------------------- src/unionfind.jl | 58 ++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 69 deletions(-) create mode 100644 src/unionfind.jl diff --git a/src/Clustering.jl b/src/Clustering.jl index a97cba89..fd522a78 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -81,6 +81,7 @@ module Clustering include("utils.jl") include("seeding.jl") + include("unionfind.jl") include("kmeans.jl") include("kmedoids.jl") diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 39bbfb1d..0c4477ef 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -29,6 +29,7 @@ mutable struct HdbscanCluster λp::Vector{Float64} stability::Float64 children_stability::Float64 + function HdbscanCluster(points::Union{Vector{Int}, Nothing}) noise = points === nothing return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) @@ -166,8 +167,8 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer cost += c λ = 1 / cost #child clusters - c1 = group(uf, j) - c2 = group(uf, k) + c1 = set_id(uf, j) + c2 = set_id(uf, k) #reference to the parent cluster clusters[c1].parent = clusters[c2].parent = n+i nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) @@ -178,7 +179,7 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer #unite cluster unite!(uf, j, k) #create parent cluster - points = members(uf, group(uf, j)) + points = items(uf, set_id(uf, j)) push!(clusters, HdbscanCluster(points)) elseif !(nc1 && nc2) if nc2 == true @@ -189,13 +190,13 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer #unite cluster unite!(uf, j, k) #create parent cluster - points = members(uf, group(uf, j)) + points = items(uf, set_id(uf, j)) push!(clusters, HdbscanCluster(points)) else #unite the noise cluster unite!(uf, j, k) #create parent cluster - points = members(uf, group(uf, j)) + points = items(uf, set_id(uf, j)) if length(points) < min_size push!(clusters, HdbscanCluster(Int[])) else @@ -220,67 +221,3 @@ function prune_clusters!(hierarchy::Vector{HdbscanCluster}) end end end - -# Union-Find -# structure for managing disjoint sets -# This structure tracks which sets the elements of a set belong to, -# and allows us to efficiently determine whether two elements belong to the same set. -mutable struct UnionFind - parent:: Vector{Integer} # parent[root] is the negative of the size - label::Dict{Int, Int} - next_id::Int - - function UnionFind(nodes::Integer) - if nodes <= 0 - throw(ArgumentError("invalid argument for nodes: $nodes")) - end - - parent = -ones(nodes) - label = Dict([(i, i) for i in 1 : nodes]) - new(parent, label, nodes) - end -end - -# label of the set which element `x` belong to -group(uf::UnionFind, x) = uf.label[root(uf, x)] -# all elements that have the specified label -members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label))) - -# root of element `x` -# The root is the representative element of the set -function root(uf::UnionFind, x::Integer) - if uf.parent[x] < 0 - return x - else - return uf.parent[x] = root(uf, uf.parent[x]) - end -end - -# whether element `x` and `y` belong to the same set -function issame(uf::UnionFind, x::Integer, y::Integer) - return root(uf, x) == root(uf, y) -end - -function Base.size(uf::UnionFind, x::Integer) - return -uf.parent[root(uf, x)] -end - -function unite!(uf::UnionFind, x::Integer, y::Integer) - x = root(uf, x) - y = root(uf, y) - if x == y - return false - end - if uf.parent[x] > uf.parent[y] - x, y = y, x - end - # unite smaller tree(y) to bigger one(x) - uf.parent[x] += uf.parent[y] - uf.parent[y] = x - uf.next_id += 1 - uf.label[y] = uf.next_id - for i in members(uf, group(uf, x)) - uf.label[i] = uf.next_id - end - return true -end \ No newline at end of file diff --git a/src/unionfind.jl b/src/unionfind.jl new file mode 100644 index 00000000..30018c1e --- /dev/null +++ b/src/unionfind.jl @@ -0,0 +1,58 @@ +# Union-Find +# structure for managing disjoint sets +# This structure tracks which sets the elements of a set belong to, +# and allows us to efficiently determine whether two elements belong to the same set. +mutable struct UnionFind + parent::Vector{Int} # parent[root] is the negative of the size + label::Dict{Int, Int} + next_id::Int + + function UnionFind(nodes::Int) + if nodes <= 0 + throw(ArgumentError("invalid argument for nodes: $nodes")) + end + + parent = -ones(nodes) + label = Dict([i=>i for i in 1:nodes]) + new(parent, label, nodes) + end +end + +# label of the set which element `x` belong to +set_id(uf::UnionFind, x) = uf.label[root(uf, x)] +# all elements that have the specified label +items(uf::UnionFind, x::Int) = [k for (k, v) in pairs(uf.label) if v == x] + +# root of element `x` +# The root is the representative element of the set +function root(uf::UnionFind, x::Int) + if uf.parent[x] < 0 + return x + else + return uf.parent[x] = root(uf, uf.parent[x]) + end +end + +function setsize(uf::UnionFind, x::Int) + return -uf.parent[root(uf, x)] +end + +function unite!(uf::UnionFind, x::Int, y::Int) + x = root(uf, x) + y = root(uf, y) + if x == y + return false + end + if uf.parent[x] > uf.parent[y] + x, y = y, x + end + # unite smaller tree(y) to bigger one(x) + uf.parent[x] += uf.parent[y] + uf.parent[y] = x + uf.next_id += 1 + uf.label[y] = uf.next_id + for i in items(uf, set_id(uf, x)) + uf.label[i] = uf.next_id + end + return true +end \ No newline at end of file From 159858d14029b5a787d286919c4cebdf1ed3502b Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Sun, 28 Apr 2024 08:13:25 +0900 Subject: [PATCH 39/46] [test]add test for unionfind there wes no test for it --- test/hdbscan.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index d8f8fc01..af111188 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -1,5 +1,16 @@ using Test using Clustering +using Clustering: UnionFind, set_id, items, root, setsize, unite! + +@testset "UnionFind" begin + uf = UnionFind(10) + @test set_id(uf, 5) == 5 + @test items(uf, 5) == [5] + @test !(unite!(uf, 5, 5)) + @test unite!(uf, 4, 5) + @test setsize(uf, 4) == 2 + @test root(uf, 5) == 4 +end @testset "HDBSCAN" begin # make moons for test From a03c22441deb10c1a28762756b9066048b4e7b98 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 20:38:48 -0700 Subject: [PATCH 40/46] hdbscan_minspantree: fix edges sorting --- src/hdbscan.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index fb8cf812..8a993123 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -116,9 +116,10 @@ function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) end function hdbscan_minspantree(graph::HdbscanGraph) + # put the edge to the heap (sorted from largest to smallest distances) function heapput!(h, v::HdbscanMSTEdge) idx = searchsortedlast(h, v, by=e -> e.dist, rev=true) - insert!(h, (idx != 0) ? idx : 1, v) + insert!(h, idx + 1, v) end # initialize the edges heap by putting all edges of the first node (the root) @@ -126,6 +127,7 @@ function hdbscan_minspantree(graph::HdbscanGraph) for (i, c) in graph.adj_edges[1] heapput!(heap, (v1=1, v2=i, dist=c)) end + @assert issorted(heap, by=e -> e.dist, rev=true) # build the tree n = length(graph.adj_edges) @@ -134,7 +136,7 @@ function hdbscan_minspantree(graph::HdbscanGraph) inmst[1] = true # root while length(minspantree) < n-1 - # get the next edge with maximal? distance + # get the edge with the smallest distance from the heap (i, j, c) = pop!(heap) # add j-th node to MST if not there @@ -147,7 +149,7 @@ function hdbscan_minspantree(graph::HdbscanGraph) inmst[k] || heapput!(heap, (v1=j, v2=k, dist=c)) end end - return minspantree + return sort!(minspantree, by=e -> e.dist) end function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer) @@ -155,7 +157,6 @@ function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer cost = 0 uf = UnionFind(n) clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n] - sort!(mst) for i in 1 : n-1 j, k, c = mst[i] From 3a577eaa8b71b7dd47af5d66bf18961fba47f414 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 20:39:23 -0700 Subject: [PATCH 41/46] hdbscan_clusters(): fix cost type --- src/hdbscan.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 8a993123..65d6296b 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -154,7 +154,7 @@ end function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer) n = length(mst) + 1 - cost = 0 + cost = 0.0 uf = UnionFind(n) clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n] From 744af227cd773e35ebb58779dda31e536d2d6d84 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 27 Apr 2024 20:40:03 -0700 Subject: [PATCH 42/46] hdbscan_clusters: improve MST iteration --- src/hdbscan.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 65d6296b..9e3a9ef6 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -152,14 +152,13 @@ function hdbscan_minspantree(graph::HdbscanGraph) return sort!(minspantree, by=e -> e.dist) end -function hdbscan_clusters(mst::AbstractVector{HdbscanMSTEdge}, min_size::Integer) - n = length(mst) + 1 +function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer) + n = length(minspantree) + 1 cost = 0.0 uf = UnionFind(n) clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n] - - for i in 1 : n-1 - j, k, c = mst[i] + + for (i, (j, k, c)) in enumerate(minspantree) cost += c λ = 1 / cost #child clusters From b94de2528be373287b6ebe665015255d5659873c Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Wed, 1 May 2024 15:07:30 +0900 Subject: [PATCH 43/46] [clean]rename the result structure rename HdbscanCluster into HdbscanNode and create new one to expose to the user --- src/hdbscan.jl | 51 ++++++++++++++++++++++++++++-------------------- src/unionfind.jl | 2 +- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 9f1d11ee..aab68b1f 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -11,6 +11,21 @@ end # Edge of the minimum spanning tree for HDBScan algorithm HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}} +mutable struct HdbscanNode + parent::Int + children::Vector{Int} + points::Vector{Int} + λp::Vector{Float64} + stability::Float64 + children_stability::Float64 + + function HdbscanNode(points::Union{Vector{Int}, Nothing}) + noise = points === nothing + return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) + end +end + + """ HdbscanCluster @@ -22,21 +37,12 @@ The stability represents how much the cluster is "reasonable". So, a cluster whi The noise cluster is determined not to belong to any cluster. So, you can ignore them. See also: [isnoise](@ref) """ -mutable struct HdbscanCluster - parent::Int - children::Vector{Int} +struct HdbscanCluster points::Vector{Int} - λp::Vector{Float64} stability::Float64 - children_stability::Float64 - - function HdbscanCluster(points::Union{Vector{Int}, Nothing}) - noise = points === nothing - return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) - end end -Base.length(c::HdbscanCluster) = size(c.points, 1) +Base.length(c::HdbscanCluster) = length(c.points) """ isnoise(c::HdbscanCluster) @@ -44,8 +50,9 @@ Base.length(c::HdbscanCluster) = size(c.points, 1) This function returns whether the cluster is the noise or not. """ isnoise(c::HdbscanCluster) = c.stability == -1 -isstable(c::HdbscanCluster) = c.stability != 0 -function increment_stability(c::HdbscanCluster, λbirth) +isnoise(c::HdbscanNode) = c.stability == -1 +isstable(c::HdbscanNode) = c.stability != 0 +function increment_stability(c::HdbscanNode, λbirth) c.stability += sum(c.λp) - length(c.λp) * λbirth end @@ -55,6 +62,8 @@ end Result of the [hdbscan](@ref) clustering. - `clusters`: vector of clusters - `assignments`: vectors of assignments for each points + +See also: [HdbscanCluster](@ref) """ struct HdbscanResult clusters::Vector{HdbscanCluster} @@ -95,12 +104,12 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; assignments = fill(0, n) # cluster index of each point for (i, j) in enumerate(hierarchy[2n-1].children) clu = hierarchy[j] - push!(clusters, clu) + push!(clusters, HdbscanCluster(clu.points, clu.stability)) assignments[clu.points] .= i end # add the cluster of all unassigned (noise) points noise_points = findall(==(0), assignments) - isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points)) + isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points, -1)) return HdbscanResult(clusters, assignments) end @@ -161,7 +170,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size: n = length(minspantree) + 1 cost = 0.0 uf = UnionFind(n) - clusters = [HdbscanCluster(min_size > 1 ? Int[i] : Int[]) for i in 1:n] + clusters = [HdbscanNode(min_size > 1 ? Int[i] : Int[]) for i in 1:n] for (i, (j, k, c)) in enumerate(minspantree) cost += c @@ -180,7 +189,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size: unite!(uf, j, k) #create parent cluster points = items(uf, set_id(uf, j)) - push!(clusters, HdbscanCluster(points)) + push!(clusters, HdbscanNode(points)) elseif !(nc1 && nc2) if nc2 == true (c1, c2) = (c2, c1) @@ -191,16 +200,16 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size: unite!(uf, j, k) #create parent cluster points = items(uf, set_id(uf, j)) - push!(clusters, HdbscanCluster(points)) + push!(clusters, HdbscanNode(points)) else #unite the noise cluster unite!(uf, j, k) #create parent cluster points = items(uf, set_id(uf, j)) if length(points) < min_size - push!(clusters, HdbscanCluster(Int[])) + push!(clusters, HdbscanNode(Int[])) else - push!(clusters, HdbscanCluster(points)) + push!(clusters, HdbscanNode(points)) end end end @@ -208,7 +217,7 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size: return clusters end -function prune_clusters!(hierarchy::Vector{HdbscanCluster}) +function prune_clusters!(hierarchy::Vector{HdbscanNode}) for i in 1:length(hierarchy)-1 c = hierarchy[i] parent = hierarchy[c.parent] diff --git a/src/unionfind.jl b/src/unionfind.jl index 30018c1e..6b846bd6 100644 --- a/src/unionfind.jl +++ b/src/unionfind.jl @@ -19,7 +19,7 @@ mutable struct UnionFind end # label of the set which element `x` belong to -set_id(uf::UnionFind, x) = uf.label[root(uf, x)] +set_id(uf::UnionFind, x::Int) = uf.label[root(uf, x)] # all elements that have the specified label items(uf::UnionFind, x::Int) = [k for (k, v) in pairs(uf.label) if v == x] From 7078223097e794f2c72d42dfff5d32b92a50c73d Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Thu, 2 May 2024 10:46:07 +0900 Subject: [PATCH 44/46] [hotfix]apply hot fix the algorithm went wrong --- src/hdbscan.jl | 93 +++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index aab68b1f..43996441 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -19,12 +19,13 @@ mutable struct HdbscanNode stability::Float64 children_stability::Float64 - function HdbscanNode(points::Union{Vector{Int}, Nothing}) - noise = points === nothing - return new(0, Int[], noise ? Int[] : points, Float64[], noise ? -1 : 0, noise ? -1 : 0) + function HdbscanNode(points::Vector{Int}; λp::Vector{Float64}=Float64[], children::Vector{Int}=Int[], children_stability::Float64=0.0) + noise = isempty(λp) + return new(0, children, points, λp, noise ? -1 : 0, children_stability) end end +cluster_len(c::HdbscanNode) = length(c.points) """ HdbscanCluster @@ -52,8 +53,9 @@ This function returns whether the cluster is the noise or not. isnoise(c::HdbscanCluster) = c.stability == -1 isnoise(c::HdbscanNode) = c.stability == -1 isstable(c::HdbscanNode) = c.stability != 0 -function increment_stability(c::HdbscanNode, λbirth) - c.stability += sum(c.λp) - length(c.λp) * λbirth +function compute_stability!(c::HdbscanNode, λbirth) + c.stability = sum(c.λp) - length(c.λp) * λbirth + return c.stability end """ @@ -96,14 +98,14 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; #compute a minimum spanning tree by prim method mst = hdbscan_minspantree(mrd) #build a HDBSCAN hierarchy - hierarchy = hdbscan_clusters(mst, min_cluster_size) + tree = hdbscan_build_tree(mst, min_cluster_size) #extract the target cluster - prune_clusters!(hierarchy) + extract_clusters!(tree) #generate the list of cluster assignment for each point clusters = HdbscanCluster[] assignments = fill(0, n) # cluster index of each point - for (i, j) in enumerate(hierarchy[2n-1].children) - clu = hierarchy[j] + for (i, j) in enumerate(tree[end].children) + clu = tree[j] push!(clusters, HdbscanCluster(clu.points, clu.stability)) assignments[clu.points] .= i end @@ -166,11 +168,11 @@ function hdbscan_minspantree(graph::HdbscanGraph) return sort!(minspantree, by=e -> e.dist) end -function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer) +function hdbscan_build_tree(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer) n = length(minspantree) + 1 cost = 0.0 uf = UnionFind(n) - clusters = [HdbscanNode(min_size > 1 ? Int[i] : Int[]) for i in 1:n] + clusters = [HdbscanNode(Int[i], λp=(min_size==1) ? Float64[Inf] : Float64[]) for i in 1:n] for (i, (j, k, c)) in enumerate(minspantree) cost += c @@ -180,53 +182,50 @@ function hdbscan_clusters(minspantree::AbstractVector{HdbscanMSTEdge}, min_size: c2 = set_id(uf, k) #reference to the parent cluster clusters[c1].parent = clusters[c2].parent = n+i - nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) - if !(nc1 || nc2) - #compute stability - increment_stability(clusters[c1], λ) - increment_stability(clusters[c2], λ) - #unite cluster - unite!(uf, j, k) - #create parent cluster - points = items(uf, set_id(uf, j)) - push!(clusters, HdbscanNode(points)) - elseif !(nc1 && nc2) - if nc2 == true - (c1, c2) = (c2, c1) - end - #record the lambda value - append!(clusters[c2].λp, fill(λ, length(clusters[c1]))) - #unite cluster - unite!(uf, j, k) - #create parent cluster - points = items(uf, set_id(uf, j)) + #unite the clusters + unite!(uf, j, k) + points = items(uf, set_id(uf, j)) + if length(points) < min_size push!(clusters, HdbscanNode(points)) else - #unite the noise cluster - unite!(uf, j, k) - #create parent cluster - points = items(uf, set_id(uf, j)) - if length(points) < min_size - push!(clusters, HdbscanNode(Int[])) + nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) + if nc1 && nc2 + push!(clusters, HdbscanNode(points, λp=fill(λ, length(points)))) + elseif nc1 || nc2 + #ensure that c1 is the cluster + if nc1 == true + c1, c2 = c2, c1 + end + push!(clusters, HdbscanNode(points, λp=[clusters[c1].λp..., fill(λ, cluster_len(clusters[c2]))...])) else - push!(clusters, HdbscanNode(points)) + #compute stabilities for children + c1_stability = compute_stability!(clusters[c1], λ) + c2_stability = compute_stability!(clusters[c2], λ) + #unite the two clusters + push!(clusters, HdbscanNode(points, λp=fill(λ, length(points)), children=[c1, c2], children_stability=c1_stability+c2_stability)) end end end + compute_stability!(clusters[end], 1/cost) @assert length(clusters) == 2n - 1 return clusters end -function prune_clusters!(hierarchy::Vector{HdbscanNode}) - for i in 1:length(hierarchy)-1 - c = hierarchy[i] - parent = hierarchy[c.parent] - if isnoise(c) || c.stability > c.children_stability - push!(parent.children, i) - parent.children_stability += c.stability - else +function extract_clusters!(tree::Vector{HdbscanNode}) + for i in eachindex(tree) + i == length(tree) && continue + c = tree[i] + isempty(c.children) && continue + parent = tree[c.parent] + children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)]) + if children_stability > c.stability + filter!(x->x!=i, parent.children) append!(parent.children, c.children) - parent.children_stability += c.children_stability end end + c = tree[end] + children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)]) + if children_stability <= c.stability + c.children = Int[2 * length(tree) - 1] + end end From b0791d937ef05cbb1f018a69f942e2176968014d Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Thu, 2 May 2024 10:46:48 +0900 Subject: [PATCH 45/46] [test]add test about min_size ensure that `min_size` effects properly --- test/hdbscan.jl | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index af111188..25038d31 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -20,12 +20,30 @@ end lower_y = cos.(lower_x) .+ rand(51)./10 data = hcat([lower_x; upper_x], [lower_y; upper_y])' - #test for main function + # test for main function @test_throws DomainError hdbscan(data, 5, 0) - @test_nowarn @inferred(hdbscan(data, 5, 3)) + @test_nowarn @inferred(hdbscan(data, 5, 10)) # tests for result - result = @inferred(hdbscan(data, 5, 3)) + result = @inferred(hdbscan(data, 5, 15)) @test isa(result, HdbscanResult) @test sum(length, result.clusters) == size(data, 2) + # test whether the model recognizes the moons + @test length(result.clusters) == 3 + + # generate more complicated data + n_points = 100 # number of points + clusters = 4 # number of clusters + + # generate the centers of clusters + centers = [(rand(1:10), rand(1:10)) for _ in 1:clusters] + + # generate all the points + data = hcat([0.5 * randn(2, n_points) .+ centers[i] for i in 1:clusters]...) + # check the effect of min_size + nclusters1 = length(@inferred(hdbscan(data, 5, 5)).clusters) + nclusters2 = length(@inferred(hdbscan(data, 5, 15)).clusters) + nclusters3 = length(@inferred(hdbscan(data, 5, 25)).clusters) + @test nclusters1 >= nclusters2 + @test nclusters2 >= nclusters3 end \ No newline at end of file From dc5dd409882d390d333b808c466f1b5604ef0d6c Mon Sep 17 00:00:00 2001 From: MommaWatasu Date: Thu, 2 May 2024 11:03:26 +0900 Subject: [PATCH 46/46] [add function or file]support ClusteringResult add counts field into HdbscanResult --- src/hdbscan.jl | 9 +++++---- test/hdbscan.jl | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/hdbscan.jl b/src/hdbscan.jl index 43996441..3a769f4a 100644 --- a/src/hdbscan.jl +++ b/src/hdbscan.jl @@ -43,8 +43,6 @@ struct HdbscanCluster stability::Float64 end -Base.length(c::HdbscanCluster) = length(c.points) - """ isnoise(c::HdbscanCluster) @@ -67,9 +65,10 @@ Result of the [hdbscan](@ref) clustering. See also: [HdbscanCluster](@ref) """ -struct HdbscanResult +struct HdbscanResult <: ClusteringResult clusters::Vector{HdbscanCluster} assignments::Vector{Int} + counts::Vector{Int} end """ @@ -103,16 +102,18 @@ function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; extract_clusters!(tree) #generate the list of cluster assignment for each point clusters = HdbscanCluster[] + counts = Int[] assignments = fill(0, n) # cluster index of each point for (i, j) in enumerate(tree[end].children) clu = tree[j] push!(clusters, HdbscanCluster(clu.points, clu.stability)) + push!(counts, length(clu.points)) assignments[clu.points] .= i end # add the cluster of all unassigned (noise) points noise_points = findall(==(0), assignments) isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points, -1)) - return HdbscanResult(clusters, assignments) + return HdbscanResult(clusters, assignments, counts) end function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) diff --git a/test/hdbscan.jl b/test/hdbscan.jl index 25038d31..c3d0894b 100644 --- a/test/hdbscan.jl +++ b/test/hdbscan.jl @@ -27,9 +27,9 @@ end # tests for result result = @inferred(hdbscan(data, 5, 15)) @test isa(result, HdbscanResult) - @test sum(length, result.clusters) == size(data, 2) + @test sum(x->length(x.points), result.clusters) == size(data, 2) # test whether the model recognizes the moons - @test length(result.clusters) == 3 + @test nclusters(result) == 2 # generate more complicated data n_points = 100 # number of points @@ -41,9 +41,9 @@ end # generate all the points data = hcat([0.5 * randn(2, n_points) .+ centers[i] for i in 1:clusters]...) # check the effect of min_size - nclusters1 = length(@inferred(hdbscan(data, 5, 5)).clusters) - nclusters2 = length(@inferred(hdbscan(data, 5, 15)).clusters) - nclusters3 = length(@inferred(hdbscan(data, 5, 25)).clusters) + nclusters1 = nclusters(@inferred(hdbscan(data, 5, 5))) + nclusters2 = nclusters(@inferred(hdbscan(data, 5, 15))) + nclusters3 = nclusters(@inferred(hdbscan(data, 5, 25))) @test nclusters1 >= nclusters2 @test nclusters2 >= nclusters3 end \ No newline at end of file