diff --git a/docs/source/dbscan.md b/docs/source/dbscan.md index b2c31b61..147872c1 100644 --- a/docs/source/dbscan.md +++ b/docs/source/dbscan.md @@ -46,3 +46,49 @@ dbscan DbscanResult DbscanCluster ``` + +# HDBSCAN + +Hierarchical Density-based Spatial Clustering of Applications with Noise(HDBSCAN) is similar to DBSCAN but uses hierarchical tree to find clusters. The algorithm was proposed in: + +> Ricardo J. G. B. Campello, Davoud Moulavi & Joerg Sander +> *Density-Based Clustering Based on Hierarchical Density Estimates* 2013 + +## [Algorithm](@id hdbscan_algorithm) +The main procedure of HDBSCAN: +1. calculate the *mutual reachability distance* +2. generate a *minimum spanning tree* +3. build hierarchy +4. extract the target cluster + +### Calculate the *mutual reachability distance* +DBSCAN counts the number of points at a certain distance. But, HDBSCAN uses the opposite way, First, calculate the pairwise distances. Next, defined the core distance(``core_{ncore}(p)``) as a distance with the ncore-th closest point. And then, the mutual reachability distance between point `a` and `b` is defined as: ``d_{mreach-ncore}(a, b) = max\{ core_{ncore}(a), core_{ncore}(b), d(a, b) \}`` where ``d(a, b)`` is a euclidean distance between `a` and `b`(or specified metric). + +### Generate a *minimum-spanning tree(MST)* +Conceptually what we will do is the following: consider the data as a weighted graph with the data points as vertices and an edge between any two points with weight equal to the mutual reachability distance of those points. + +Then, check the list of edges in descending order of distance whether the points which is connected by it is marked or not. If not, add the edge into the MST. After this processing, all data points are connected to MST with minimum weight. + +In practice, this is very expensive since there are ``n^{2}`` edges. + +### Build hierarchy +The next step is to build hierarchy based on MST. This step is very simple: repeatedly unite the clusters which has the minimum edge until all cluster are united into one cluster. + +### Extract the target cluster +First we need a different measure than distance to consider the persistence of clusters; instead we will use ``\lambda = \frac{1}{\mathrm{distance}}``. +For a given cluster we can then define values ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` to be the lambda value when the cluster split off and became it’s own cluster, and the lambda value (if any) when the cluster split into smaller clusters respectively. +In turn, for a given cluster, for each point ``p`` in that cluster we can define the value ``\lambda_{p}`` as the lambda value at which that point ‘fell out of the cluster’ which is a value somewhere between ``\lambda_{\mathrm{birth}}`` and ``\lambda_{\mathrm{death}}`` since the point either falls out of the cluster at some point in the cluster’s lifetime, or leaves the cluster when the cluster splits into two smaller clusters. +Now, for each cluster compute the stability as + +``\sum_{p \in \mathrm{cluster}} (\lambda_{p} - \lambda_{\mathrm{birth}})``. + +Declare all leaf nodes to be selected clusters. Now work up through the tree (the reverse topological sort order). If the sum of the stabilities of the child clusters is greater than the stability of the cluster, then we set the cluster stability to be the sum of the child stabilities. If, on the other hand, the cluster’s stability is greater than the sum of its children then we declare the cluster to be a selected cluster and unselect all its descendants. Once we reach the root node we call the current set of selected clusters our flat clustering and return it as the result of clustering. + +## Interface +The implementation of *HDBSCAN* algorithm is provided by [`hdbscan`](@ref) function +```@docs +hdbscan +HdbscanResult +HdbscanCluster +isnoise +``` diff --git a/src/Clustering.jl b/src/Clustering.jl index fae9ee82..fd522a78 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -40,6 +40,9 @@ module Clustering # dbscan DbscanResult, DbscanCluster, dbscan, + # hdbscan + HdbscanResult, HdbscanCluster, hdbscan, isnoise, + # fuzzy_cmeans fuzzy_cmeans, FuzzyCMeansResult, @@ -78,11 +81,13 @@ module Clustering include("utils.jl") include("seeding.jl") + include("unionfind.jl") include("kmeans.jl") include("kmedoids.jl") include("affprop.jl") include("dbscan.jl") + include("hdbscan.jl") include("mcl.jl") include("fuzzycmeans.jl") diff --git a/src/hdbscan.jl b/src/hdbscan.jl new file mode 100644 index 00000000..3a769f4a --- /dev/null +++ b/src/hdbscan.jl @@ -0,0 +1,232 @@ +# HDBSCAN Graph edge: target vertex and mutual reachability distance. +HdbscanEdge = Tuple{Int, Float64} + +# HDBSCAN Graph +struct HdbscanGraph + adj_edges::Vector{Vector{HdbscanEdge}} + + HdbscanGraph(nv::Integer) = new([HdbscanEdge[] for _ in 1 : nv]) +end + +# Edge of the minimum spanning tree for HDBScan algorithm +HdbscanMSTEdge = NamedTuple{(:v1, :v2, :dist), Tuple{Int, Int, Float64}} + +mutable struct HdbscanNode + parent::Int + children::Vector{Int} + points::Vector{Int} + λp::Vector{Float64} + stability::Float64 + children_stability::Float64 + + function HdbscanNode(points::Vector{Int}; λp::Vector{Float64}=Float64[], children::Vector{Int}=Int[], children_stability::Float64=0.0) + noise = isempty(λp) + return new(0, children, points, λp, noise ? -1 : 0, children_stability) + end +end + +cluster_len(c::HdbscanNode) = length(c.points) + +""" + HdbscanCluster + +A cluster generated by the [hdbscan](@ref) method, part of [HdbscanResult](@ref). + - `points`: vector of points which belongs to the cluster + - `stability`: stability of the cluster (-1 for noise clusters) + +The stability represents how much the cluster is "reasonable". So, a cluster which has a bigger stability is better. +The noise cluster is determined not to belong to any cluster. So, you can ignore them. +See also: [isnoise](@ref) +""" +struct HdbscanCluster + points::Vector{Int} + stability::Float64 +end + +""" + isnoise(c::HdbscanCluster) + +This function returns whether the cluster is the noise or not. +""" +isnoise(c::HdbscanCluster) = c.stability == -1 +isnoise(c::HdbscanNode) = c.stability == -1 +isstable(c::HdbscanNode) = c.stability != 0 +function compute_stability!(c::HdbscanNode, λbirth) + c.stability = sum(c.λp) - length(c.λp) * λbirth + return c.stability +end + +""" + HdbscanResult + +Result of the [hdbscan](@ref) clustering. +- `clusters`: vector of clusters +- `assignments`: vectors of assignments for each points + +See also: [HdbscanCluster](@ref) +""" +struct HdbscanResult <: ClusteringResult + clusters::Vector{HdbscanCluster} + assignments::Vector{Int} + counts::Vector{Int} +end + +""" + hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Integer; + metric=SqEuclidean()) + +Cluster `points` using Density-Based Clustering Based on Hierarchical Density Estimates (HDBSCAN) algorithm. +Refer to [HDBSCAN algorithm](@ref hdbscan_algorithm) description for the details on how the algorithm works. +# Parameters +- `points`: the *d*×*n* matrix, where each column is a *d*-dimensional point +- `ncore::Integer`: number of *core* neighbors of point, see [HDBSCAN algorithm](@ref hdbscan_algorithm) for a description +- `min_cluster_size::Integer`: minimum number of points in the cluster +- `metric`(defaults to Euclidean): the points distance metric to use. +""" +function hdbscan(points::AbstractMatrix, ncore::Integer, min_cluster_size::Int; metric=Euclidean()) + if min_cluster_size < 1 + throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1")) + end + n = size(points, 2) + dists = pairwise(metric, points; dims=2) + # calculate core (ncore-th nearest) distance for each point + core_dists = [partialsort!(dists[:, i], ncore) for i in axes(dists, 2)] + + #calculate mutual reachability distance between any two points + mrd = hdbscan_graph(core_dists, dists) + #compute a minimum spanning tree by prim method + mst = hdbscan_minspantree(mrd) + #build a HDBSCAN hierarchy + tree = hdbscan_build_tree(mst, min_cluster_size) + #extract the target cluster + extract_clusters!(tree) + #generate the list of cluster assignment for each point + clusters = HdbscanCluster[] + counts = Int[] + assignments = fill(0, n) # cluster index of each point + for (i, j) in enumerate(tree[end].children) + clu = tree[j] + push!(clusters, HdbscanCluster(clu.points, clu.stability)) + push!(counts, length(clu.points)) + assignments[clu.points] .= i + end + # add the cluster of all unassigned (noise) points + noise_points = findall(==(0), assignments) + isempty(noise_points) || push!(clusters, HdbscanCluster(noise_points, -1)) + return HdbscanResult(clusters, assignments, counts) +end + +function hdbscan_graph(core_dists::AbstractVector, dists::AbstractMatrix) + n = size(dists, 1) + graph = HdbscanGraph(n) + for i in axes(dists, 2) + i_dists = view(dists, :, i) + i_core = core_dists[i] + for j in i+1:n + c = max(i_core, core_dists[j], i_dists[j]) + # add reciprocal edges + push!(graph.adj_edges[i], (j, c)) + push!(graph.adj_edges[j], (i, c)) + end + end + return graph +end + +function hdbscan_minspantree(graph::HdbscanGraph) + # put the edge to the heap (sorted from largest to smallest distances) + function heapput!(h, v::HdbscanMSTEdge) + idx = searchsortedlast(h, v, by=e -> e.dist, rev=true) + insert!(h, idx + 1, v) + end + + # initialize the edges heap by putting all edges of the first node (the root) + heap = HdbscanMSTEdge[] + for (i, c) in graph.adj_edges[1] + heapput!(heap, (v1=1, v2=i, dist=c)) + end + @assert issorted(heap, by=e -> e.dist, rev=true) + + # build the tree + n = length(graph.adj_edges) + minspantree = Vector{HdbscanMSTEdge}() + inmst = falses(n) # if the graph node is in MST + inmst[1] = true # root + + while length(minspantree) < n-1 + # get the edge with the smallest distance from the heap + (i, j, c) = pop!(heap) + + # add j-th node to MST if not there + inmst[j] && continue + push!(minspantree, (v1=i, v2=j, dist=c)) + inmst[j] = true + + # add adjacent edges of j to the heap + for (k, c) in graph.adj_edges[j] + inmst[k] || heapput!(heap, (v1=j, v2=k, dist=c)) + end + end + return sort!(minspantree, by=e -> e.dist) +end + +function hdbscan_build_tree(minspantree::AbstractVector{HdbscanMSTEdge}, min_size::Integer) + n = length(minspantree) + 1 + cost = 0.0 + uf = UnionFind(n) + clusters = [HdbscanNode(Int[i], λp=(min_size==1) ? Float64[Inf] : Float64[]) for i in 1:n] + + for (i, (j, k, c)) in enumerate(minspantree) + cost += c + λ = 1 / cost + #child clusters + c1 = set_id(uf, j) + c2 = set_id(uf, k) + #reference to the parent cluster + clusters[c1].parent = clusters[c2].parent = n+i + #unite the clusters + unite!(uf, j, k) + points = items(uf, set_id(uf, j)) + if length(points) < min_size + push!(clusters, HdbscanNode(points)) + else + nc1, nc2 = isnoise(clusters[c1]), isnoise(clusters[c2]) + if nc1 && nc2 + push!(clusters, HdbscanNode(points, λp=fill(λ, length(points)))) + elseif nc1 || nc2 + #ensure that c1 is the cluster + if nc1 == true + c1, c2 = c2, c1 + end + push!(clusters, HdbscanNode(points, λp=[clusters[c1].λp..., fill(λ, cluster_len(clusters[c2]))...])) + else + #compute stabilities for children + c1_stability = compute_stability!(clusters[c1], λ) + c2_stability = compute_stability!(clusters[c2], λ) + #unite the two clusters + push!(clusters, HdbscanNode(points, λp=fill(λ, length(points)), children=[c1, c2], children_stability=c1_stability+c2_stability)) + end + end + end + compute_stability!(clusters[end], 1/cost) + @assert length(clusters) == 2n - 1 + return clusters +end + +function extract_clusters!(tree::Vector{HdbscanNode}) + for i in eachindex(tree) + i == length(tree) && continue + c = tree[i] + isempty(c.children) && continue + parent = tree[c.parent] + children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)]) + if children_stability > c.stability + filter!(x->x!=i, parent.children) + append!(parent.children, c.children) + end + end + c = tree[end] + children_stability = sum([tree[c.children[i]].stability for i in eachindex(c.children)]) + if children_stability <= c.stability + c.children = Int[2 * length(tree) - 1] + end +end diff --git a/src/unionfind.jl b/src/unionfind.jl new file mode 100644 index 00000000..6b846bd6 --- /dev/null +++ b/src/unionfind.jl @@ -0,0 +1,58 @@ +# Union-Find +# structure for managing disjoint sets +# This structure tracks which sets the elements of a set belong to, +# and allows us to efficiently determine whether two elements belong to the same set. +mutable struct UnionFind + parent::Vector{Int} # parent[root] is the negative of the size + label::Dict{Int, Int} + next_id::Int + + function UnionFind(nodes::Int) + if nodes <= 0 + throw(ArgumentError("invalid argument for nodes: $nodes")) + end + + parent = -ones(nodes) + label = Dict([i=>i for i in 1:nodes]) + new(parent, label, nodes) + end +end + +# label of the set which element `x` belong to +set_id(uf::UnionFind, x::Int) = uf.label[root(uf, x)] +# all elements that have the specified label +items(uf::UnionFind, x::Int) = [k for (k, v) in pairs(uf.label) if v == x] + +# root of element `x` +# The root is the representative element of the set +function root(uf::UnionFind, x::Int) + if uf.parent[x] < 0 + return x + else + return uf.parent[x] = root(uf, uf.parent[x]) + end +end + +function setsize(uf::UnionFind, x::Int) + return -uf.parent[root(uf, x)] +end + +function unite!(uf::UnionFind, x::Int, y::Int) + x = root(uf, x) + y = root(uf, y) + if x == y + return false + end + if uf.parent[x] > uf.parent[y] + x, y = y, x + end + # unite smaller tree(y) to bigger one(x) + uf.parent[x] += uf.parent[y] + uf.parent[y] = x + uf.next_id += 1 + uf.label[y] = uf.next_id + for i in items(uf, set_id(uf, x)) + uf.label[i] = uf.next_id + end + return true +end \ No newline at end of file diff --git a/test/hdbscan.jl b/test/hdbscan.jl new file mode 100644 index 00000000..c3d0894b --- /dev/null +++ b/test/hdbscan.jl @@ -0,0 +1,49 @@ +using Test +using Clustering +using Clustering: UnionFind, set_id, items, root, setsize, unite! + +@testset "UnionFind" begin + uf = UnionFind(10) + @test set_id(uf, 5) == 5 + @test items(uf, 5) == [5] + @test !(unite!(uf, 5, 5)) + @test unite!(uf, 4, 5) + @test setsize(uf, 4) == 2 + @test root(uf, 5) == 4 +end + +@testset "HDBSCAN" begin + # make moons for test + upper_x = [i for i in 0:π/50:π] + lower_x = [i for i in π/2:π/50:3/2*π] + upper_y = sin.(upper_x) .+ rand(50)./10 + lower_y = cos.(lower_x) .+ rand(51)./10 + data = hcat([lower_x; upper_x], + [lower_y; upper_y])' + # test for main function + @test_throws DomainError hdbscan(data, 5, 0) + @test_nowarn @inferred(hdbscan(data, 5, 10)) + + # tests for result + result = @inferred(hdbscan(data, 5, 15)) + @test isa(result, HdbscanResult) + @test sum(x->length(x.points), result.clusters) == size(data, 2) + # test whether the model recognizes the moons + @test nclusters(result) == 2 + + # generate more complicated data + n_points = 100 # number of points + clusters = 4 # number of clusters + + # generate the centers of clusters + centers = [(rand(1:10), rand(1:10)) for _ in 1:clusters] + + # generate all the points + data = hcat([0.5 * randn(2, n_points) .+ centers[i] for i in 1:clusters]...) + # check the effect of min_size + nclusters1 = nclusters(@inferred(hdbscan(data, 5, 5))) + nclusters2 = nclusters(@inferred(hdbscan(data, 5, 15))) + nclusters3 = nclusters(@inferred(hdbscan(data, 5, 25))) + @test nclusters1 >= nclusters2 + @test nclusters2 >= nclusters3 +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 42301653..a18f475b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,7 @@ tests = ["seeding", "kmedoids", "affprop", "dbscan", + "hdbscan", "fuzzycmeans", "counts", "silhouette",