diff --git a/src/console.cpp b/src/console.cpp index eb43292..7c9131d 100644 --- a/src/console.cpp +++ b/src/console.cpp @@ -102,6 +102,8 @@ bool Console::parse(int argc, char** argv) { findOption(args, PARAM_LEIDEN_BETA, leidenParams.beta); findOption(args, PARAM_LEIDEN_ITERATIONS, leidenParams.numIterations); + verbose = findSwitch(args, FLAG_VERBOSE); + if (args.size() == 2) { distancesFile = args[0]; output = args[1]; diff --git a/src/console.h b/src/console.h index 8e1171d..747bdb8 100644 --- a/src/console.h +++ b/src/console.h @@ -52,9 +52,11 @@ class Console { const std::string PARAM_LEIDEN_BETA{"--leiden-beta"}; const std::string PARAM_LEIDEN_ITERATIONS{"--leiden-iterations"}; - - Algo str2algo(const std::string& str) - { + const std::string FLAG_VERBOSE{ "-v" }; + +public: + static Algo str2algo(const std::string& str) + { if (str == "single") { return Algo::SingleLinkage; } else if (str == "complete") { return Algo::CompleteLinkage; } else if (str == "uclust") { return Algo::UClust; } @@ -65,6 +67,18 @@ class Console { else { throw std::runtime_error("Unkown clustering algorithm"); } } + static std::string algo2str(Algo algo) { + switch (algo) { + case Algo::SingleLinkage: return "single"; + case Algo::CompleteLinkage: return "complete"; + case Algo::UClust: return "uclust"; + case Algo::SetCover: return "set-cover"; + case Algo::Leiden: return "leiden"; + case Algo::CdHit: return "cd-hit"; + default: throw std::runtime_error("Unkown clustering algorithm"); + } + } + public: @@ -85,7 +99,9 @@ class Console { bool outputCSV{ false }; LeidenParams leidenParams; - + + bool verbose{ false }; + void printUsage() const; bool parse(int argc, char** argv); diff --git a/src/distances.cpp b/src/distances.cpp index bd9e53b..e0ea0c6 100644 --- a/src/distances.cpp +++ b/src/distances.cpp @@ -108,9 +108,8 @@ size_t SparseMatrixNamed::load( namesBuffer = new char[1LL << 30]; // 1 GB buffer for names char* raw_ptr = namesBuffer; - std::vector counts; - std::vector tmp_dists; - tmp_dists.reserve(128LL << 20); // for 128M distances + // assume space for 8M objects + distances.reserve(8LL << 20); bool continueReading = true; char* place = buf; @@ -207,7 +206,6 @@ size_t SparseMatrixNamed::load( // if name not mapped to numerical ids if (it->second.first == -1) { ids2names.push_back(it->first); - counts.push_back(0); it->second.first = ids2names.size() - 1; } } @@ -220,10 +218,24 @@ size_t SparseMatrixNamed::load( continue; } - ++counts[i]; - ++counts[j]; + if (distances.size() <= j) { + distances.resize(j + 1); + } + + auto& Di = distances[i]; + auto& Dj = distances[j]; + + // extend capacity by factor 1.5 with 16 as an initial state + if (Di.capacity() == Di.size()) { + Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); + } + + if (Dj.capacity() == Dj.size()) { + Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); + } - tmp_dists.emplace_back(i, j, d); + Di.emplace_back(i, j, d); + Dj.emplace_back(j, i, d); } // copy remaining part after consuming all the lines @@ -236,65 +248,20 @@ size_t SparseMatrixNamed::load( } } + // if neccessary, sort distances in rows according to the second id + n_elements = 0; - distances.resize(tmp_dists.size() * 2); - rows.resize(counts.size()); - int cumulated = 0; - for (size_t i = 0; i < rows.size(); ++i) { - rows[i] = distances.data() + cumulated; - cumulated += counts[i]; - } - - struct row_info { - int n_filled{ 0 }; - int last_id{ -1 }; - }; - - std::vector rows_info(rows.size()); - - // second pass - put distances in the final structure - for (const dist_t& dist : tmp_dists) { - - uint32_t i = dist.u.s.lo; - uint32_t j = dist.u.s.hi; - double d = dist.d; - - rows[i][rows_info[i].n_filled] = dist; - ++rows_info[i].n_filled; - rows_info[i].last_id = j; - - rows[j][rows_info[j].n_filled] = dist_t{ j,i,d }; - ++rows_info[j].n_filled; - rows_info[j].last_id = i; - } - - auto end = distances.data() + distances.size(); - rows.push_back(end); - - // if neccessary, sort distances in rows according to the second id - dist_t* curBegin = rows[0]; - - for (size_t i = 0; i < rows.size() - 1; ++i) { - std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids < b.u.ids; }); - auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); + for (auto& row : distances) { + std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); }); + auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); - if (rows[i] != curBegin) { - newEnd = std::copy(curBegin, newEnd, rows[i]); - } - - curBegin = rows[i + 1]; - rows[i + 1] = newEnd; - } + row.erase(newEnd, row.end()); - size_t newSize = rows.back() - rows.front(); - distances.erase(distances.begin() + newSize, distances.end()); + n_elements += row.size(); + } delete[] buf; - // debug stuff - //std::ofstream dbg("debug.log"); - //print(dbg); - return n_total_distances; } @@ -438,15 +405,11 @@ size_t SparseMatrixNumbered::load( auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; auto is_newline = [](char c) {return c == '\r' || c == '\n'; }; - std::vector counts; - - counts.reserve(8LL << 20); // assume space for 8M objects + // assume space for 8M objects + distances.reserve(8LL << 20); global2local.reserve(8LL << 20); local2global.reserve(8LL << 20); - std::vector tmp_dists; - tmp_dists.reserve(128LL << 20); // for 128M distances - bool continueReading = true; char* place = buf; @@ -553,15 +516,24 @@ size_t SparseMatrixNumbered::load( continue; } - // resize counts vector - if (j + 1 > counts.size()) { - counts.resize(j + 1); + if (distances.size() <= j) { + distances.resize(j + 1); + } + + auto& Di = distances[i]; + auto& Dj = distances[j]; + + // extend capacity by factor 1.5 with 16 as an initial state + if (Di.capacity() == Di.size()) { + Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); } - ++counts[i]; - ++counts[j]; + if (Dj.capacity() == Dj.size()) { + Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); + } - tmp_dists.emplace_back(i, j, d); + Di.emplace_back(i, j, d); + Dj.emplace_back(j, i, d); } // copy remaining part after consuming all the lines @@ -574,61 +546,50 @@ size_t SparseMatrixNumbered::load( } } + // if neccessary, sort distances in rows according to the second id + n_elements = 0; - distances.resize(tmp_dists.size() * 2); - rows.resize(counts.size()); - int cumulated = 0; - for (size_t i = 0; i < rows.size(); ++i) { - rows[i] = distances.data() + cumulated; - cumulated += counts[i]; - } - - struct row_info { - int n_filled{ 0 }; - int last_id{ -1 }; - }; - - std::vector rows_info(rows.size()); - - // second pass - put distances in the final structure - for (const dist_t& dist : tmp_dists) { - - uint32_t i = dist.u.s.lo; - uint32_t j = dist.u.s.hi; - double d = dist.d; - - rows[i][rows_info[i].n_filled] = dist; - ++rows_info[i].n_filled; - rows_info[i].last_id = j; + for (auto& row : distances) { + std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); }); + auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); - rows[j][rows_info[j].n_filled] = dist_t{ j,i,d }; - ++rows_info[j].n_filled; - rows_info[j].last_id = i; + row.erase(newEnd, row.end()); + n_elements += row.size(); } - auto end = distances.data() + distances.size(); - rows.push_back(end); + delete[] buf; - // if neccessary, sort distances in rows according to the second id - dist_t* curBegin = rows[0]; + // Print distance histogram in the verbose mode + if (Log::getInstance(Log::LEVEL_VERBOSE).isEnabled()) { - for (size_t i = 0; i < rows.size() - 1; ++i) { - std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); }); - auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); + std::vector histo_bounds{ 0 }; + double width = 0.001; - if (rows[i] != curBegin) { - newEnd = std::copy(curBegin, newEnd, rows[i]); + while (histo_bounds.back() < 0.05) + { + histo_bounds.push_back(histo_bounds.back() + width); + } + histo_bounds.push_back(std::numeric_limits::max()); + std::vector histo(histo_bounds.size()); + + for (auto& row : distances) { + for (const auto& e : row) { + for (size_t i = 0; i < histo_bounds.size(); ++i) { + if (e.d < histo_bounds[i]) { + ++histo[i]; + break; + } + } + } } - curBegin = rows[i + 1]; - rows[i + 1] = newEnd; + LOG_VERBOSE << endl << "Distance histogram" << endl; + for (size_t i = 0; i < histo_bounds.size(); ++i) { + LOG_VERBOSE << " d < " << histo_bounds[i] << ": " << histo[i] << endl; + } + LOG_VERBOSE << endl; } - size_t newSize = rows.back() - rows.front(); - distances.erase(distances.begin() + newSize, distances.end()); - - delete[] buf; - return n_total_distances; } @@ -760,7 +721,7 @@ void SparseMatrixNamed::print(std::ostream& out) { for (auto name : names) { int i = names2ids[name].first; - std::vector row(rows[i], rows[i + 1]); + std::vector& row = distances[i]; std::sort(row.begin(), row.end(), [this](const dist_t& a, const dist_t& b) { return strcmp(ids2names[a.u.s.hi], ids2names[b.u.s.hi]) < 0; diff --git a/src/distances.h b/src/distances.h index 9674759..84e93c6 100644 --- a/src/distances.h +++ b/src/distances.h @@ -109,12 +109,18 @@ class StringEqual { class SparseMatrix { protected: + + size_t n_elements{ 0 }; + + std::vector> distances; + /** a vector of distances */ - std::vector distances; + //std::vector distances; - /** Each row holds a pointer to the first distance in - * the row. */ - std::vector rows; /// wskaźniki na kolejne wiersze, ostatni pokazuje adres za kolekcją + /** Each row holds a pointer to the first distance in the row. + The last points address right after the collection. + */ + //std::vector rows; public: @@ -123,48 +129,31 @@ class SparseMatrix virtual ~SparseMatrix() {} - size_t num_objects() const { return rows.size() - 1; } + size_t num_objects() const { return distances.size(); } - size_t num_elements() const { return distances.size(); } - void reserve(size_t count) { - distances.reserve(count); - } + size_t num_elements() const { return n_elements; } virtual size_t num_input_objects() const = 0; - const dist_t* begin(int row_id) const { return rows[row_id]; } - const dist_t* end(int row_id) const { return rows[row_id + 1]; } - - std::vector> get_distances_of_objects() const - { - std::vector> objects; - objects.reserve (distances.size()); - for (const auto & item : distances) - { - objects.emplace_back(item.u.s.lo, item.u.s.hi, item.d); - } - - return objects; - } - const std::vector get_distances() const { return distances; } - - std::vector get_distances() { return distances; } + const dist_t* begin(int row_id) const { return distances[row_id].data(); } + const dist_t* end(int row_id) const { return distances[row_id].data() + distances[row_id].size(); } - size_t get_num_neighbours(int i) { return rows[i + 1] - rows[i]; } + void clear(int row_id) { std::vector().swap(distances[row_id]); } + size_t get_num_neighbours(int i) { return distances[i].size(); } dist_t get(uint64_t i, uint64_t j) const { dist_t v {i, j, dist_t::MAX }; - auto it = std::lower_bound(rows[i], rows[i + 1], v, [](const dist_t& a, const dist_t& b) { return a.u.s.hi < b.u.s.hi; }); + auto it = std::lower_bound(begin(i), end(i), v, [](const dist_t& a, const dist_t& b) { return a.u.s.hi < b.u.s.hi; }); - return (it == rows[i + 1] || it->u.s.hi != j ) ? v : *it; + return (it == end(i) || it->u.s.hi != j) ? v : *it; } void extract_row(uint32_t row, uint32_t count, dist_t* out) const { - dist_t* cur = rows[row]; - dist_t* end = rows[row + 1]; + const dist_t* cur = this->begin(row); + const dist_t* end = this->end(row); for (uint32_t i = 0; i < count; ++i) { diff --git a/src/leiden.h b/src/leiden.h index f53a354..a3caf90 100644 --- a/src/leiden.h +++ b/src/leiden.h @@ -87,12 +87,24 @@ class Leiden : public IClustering { igraph_vector_init(&edge_weights, 0); + for (int i = 0; i < matrix.num_objects(); ++i) { + for (const dist_t* edge = matrix.begin(i); edge < matrix.end(i); ++edge) { + igraph_vector_int_push_back(&edges, edge->u.s.lo); + igraph_vector_int_push_back(&edges, edge->u.s.hi); + igraph_vector_push_back(&edge_weights, 1.0 - edge->d); + } + + matrix.clear(i); + } + + /* for (const dist_t& edge : matrix.get_distances()) { igraph_vector_int_push_back(&edges, edge.u.s.lo); igraph_vector_int_push_back(&edges, edge.u.s.hi); igraph_vector_push_back(&edge_weights, 1.0 - edge.d); //igraph_vector_push_back(&edge_weights, 1); } + */ igraph_empty(&g, matrix.num_objects(), IGRAPH_UNDIRECTED); igraph_add_edges(&g, &edges, nullptr); diff --git a/src/linkage_heaptrix.h b/src/linkage_heaptrix.h index 465cc8d..a24f71f 100644 --- a/src/linkage_heaptrix.h +++ b/src/linkage_heaptrix.h @@ -24,6 +24,7 @@ #include "clock.h" #include "utils.h" #include "memory_monotonic.h" +#include "log.h" namespace linkage_algorithm_heaptrix { @@ -149,7 +150,7 @@ namespace linkage_algorithm_heaptrix ht.resize(ht_size, std::make_pair(empty, nullptr)); - for (const auto [idx, ptr] : ht_old) + for (const auto& [idx, ptr] : ht_old) if (ptr) insert(idx, ptr); } @@ -765,15 +766,22 @@ namespace linkage_algorithm_heaptrix } public: - void read_matrix (const std::vector> & edges) + void read_matrix (DistanceMatrix & m) { _matrix._rows.clear(); - _heap.reserve((std::size_t)(1.1 * edges.size())); + _heap.reserve((std::size_t)(1.1 * m.num_elements())); + + for (size_t i = 0; i < m.num_objects(); ++i) { + + for (const dist_t* p = m.begin(i); p < m.end(i); ++p) { + const dist_t& edge = *p; + if (edge.d != MAX_DOUBLE and edge.d != INF_DOUBLE and edge.u.s.lo != edge.u.s.hi) + add_value(edge.u.s.lo, edge.u.s.hi, edge.d); + } - for (const auto [row_id, col_id, _value] : edges) - if (_value != MAX_DOUBLE and _value != INF_DOUBLE and row_id != col_id) - add_value(row_id, col_id, _value); + m.clear(i); + } _heap.make_heap(); }; @@ -787,15 +795,15 @@ namespace linkage_algorithm_heaptrix std::size_t id_of_the_next_group = _matrix.get_max_row() + 1; // id of aggregated group std::size_t number_of_objects = id_of_the_next_group; - std::cerr << std::endl; + //std::cerr << std::endl; std::vector merged_column; std::vector heap_insert_buffer; while (number_of_objects > 1 and not _heap.empty()) { - if(number_of_objects % 1000 == 0) - std::cerr << std::to_string(number_of_objects) + " \r"; + // if(number_of_objects % 1000 == 0) + // std::cerr << std::to_string(number_of_objects) + " \r"; // find the minimal distance auto pMinimal = _heap.top(); // I am not removing it from the heap. I am only reading the value. It will be removed in row merger. @@ -955,10 +963,11 @@ namespace linkage_algorithm_heaptrix mma_buf.clear(); mma_buf.shrink_to_fit(); - std::cerr << std::to_string(number_of_objects) + " \n"; + //std::cerr << std::to_string(number_of_objects) + " \n"; } public: + /* dendrogram do_clustering (const std::vector> & m) { ksi::clock stopwatch; @@ -976,23 +985,24 @@ namespace linkage_algorithm_heaptrix std::cout << "done in " << elapsed_time << " s. "; return _dendrogram; } + */ public: - dendrogram do_clustering (const std::vector> & m) + dendrogram do_clustering (DistanceMatrix & m) { ksi::clock stopwatch; - std::cout << "Loading data into data structure "; + LOG_VERBOSE << "Loading data into heap "; stopwatch.start(); read_matrix(m); stopwatch.stop(); double elapsed_time = (double) stopwatch.elapsed_milliseconds() / 1000; - std::cout << "done in " << elapsed_time << " s. "; - std::cout << "Clustering "; + LOG_VERBOSE << "done in " << elapsed_time << " s. "; + LOG_VERBOSE << "Performing linkage "; stopwatch.start(); do_clustering(); stopwatch.stop(); elapsed_time = (double) stopwatch.elapsed_milliseconds() / 1000; - std::cout << "done in " << elapsed_time << " s. "; + LOG_VERBOSE << "done in " << elapsed_time << " s. "; return _dendrogram; } @@ -1011,13 +1021,13 @@ namespace linkage_algorithm_heaptrix public: int run ( - const std::vector> & matrix_of_distances, + DistanceMatrix& matrix, const std::vector& objects, double threshold, std::vector& assignments ) { - auto dendrogram = do_clustering(matrix_of_distances); + auto dendrogram = do_clustering(matrix); std::vector node_t_dendrogram = makeDendrogram(dendrogram, objects.size()); assignments.resize(objects.size()); int n_clusters = this->dendrogramToAssignments(node_t_dendrogram, threshold, assignments); @@ -1067,7 +1077,7 @@ namespace linkage_algorithm_heaptrix std::ostream & operator << (std::ostream & sos, const std::vector> & dist) { - for (const auto [w, k, d] : dist) + for (const auto& [w, k, d] : dist) sos << "[" << w << ", " << k << ", " << d << "] "; sos << std::endl; return sos; @@ -1085,7 +1095,7 @@ class SingleLinkage : public linkage_algorithm_heaptrix::single_linkagerun (distances.get_distances_of_objects(), objects, threshold, assignments); + return this->run (distances, objects, threshold, assignments); } }; @@ -1101,6 +1111,6 @@ class CompleteLinkage : public linkage_algorithm_heaptrix::complete_linkagerun (distances.get_distances_of_objects(), objects, threshold, assignments); + return this->run (distances, objects, threshold, assignments); } }; diff --git a/src/log.h b/src/log.h index 82af0eb..7c79da3 100644 --- a/src/log.h +++ b/src/log.h @@ -55,8 +55,9 @@ class Log static const int LEVEL_VERBOSE; static const int LEVEL_DEBUG; - void enable() { enabled = true; } - void disable() { enabled = false; } + void enable() { enabled = true; } + void disable() { enabled = false; } + bool isEnabled() const { return enabled; } // ***************************************************************************************** static Log& getInstance(int level) { diff --git a/src/main.cpp b/src/main.cpp index 67789de..cdfd931 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -53,6 +53,10 @@ int main(int argc, char** argv) return 0; } + if (console.verbose) { + Log::getInstance(Log::LEVEL_VERBOSE).enable(); + } + shared_ptr dists; std::shared_ptr> clustering; @@ -166,7 +170,7 @@ int main(int argc, char** argv) } } - LOG_NORMAL << "Clustering... "; + LOG_NORMAL << "Clustering (algorithm: " << Console::algo2str(console.algo) << ")... "; vector assignments; t = std::chrono::high_resolution_clock::now(); diff --git a/src/version.h b/src/version.h index 79ea6f3..1cedc35 100644 --- a/src/version.h +++ b/src/version.h @@ -8,12 +8,16 @@ // // ******************************************************************************************* -#define VERSION "1.0.1" -#define DATE "2024-08-19" +#define VERSION "1.0.2" +#define DATE "2024-08-29" /********* Version history ********* + 1.0.2 (2024-08-29) + Fixed crash in reading very large input matrices. Reduced memory requirements. + + 1.0.1 (2024-08-19) Fixed bug in parsing floating point numbers with more than 15 decimal places.