Skip to content

Commit

Permalink
Fixed crash on large input datasets.
Browse files Browse the repository at this point in the history
  • Loading branch information
agudys authored Aug 29, 2024
1 parent 692aac2 commit 5ed7787
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 177 deletions.
2 changes: 2 additions & 0 deletions src/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ bool Console::parse(int argc, char** argv) {
findOption(args, PARAM_LEIDEN_BETA, leidenParams.beta);
findOption(args, PARAM_LEIDEN_ITERATIONS, leidenParams.numIterations);

verbose = findSwitch(args, FLAG_VERBOSE);

if (args.size() == 2) {
distancesFile = args[0];
output = args[1];
Expand Down
24 changes: 20 additions & 4 deletions src/console.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ class Console {
const std::string PARAM_LEIDEN_BETA{"--leiden-beta"};
const std::string PARAM_LEIDEN_ITERATIONS{"--leiden-iterations"};


Algo str2algo(const std::string& str)
{
const std::string FLAG_VERBOSE{ "-v" };

public:
static Algo str2algo(const std::string& str)
{
if (str == "single") { return Algo::SingleLinkage; }
else if (str == "complete") { return Algo::CompleteLinkage; }
else if (str == "uclust") { return Algo::UClust; }
Expand All @@ -65,6 +67,18 @@ class Console {
else { throw std::runtime_error("Unkown clustering algorithm"); }
}

/// Returns the textual name of a clustering algorithm identifier.
///
/// @param algo algorithm identifier to translate
/// @return one of "single", "complete", "uclust", "set-cover", "leiden", "cd-hit"
/// @throws std::runtime_error when the identifier matches none of the handled values
static std::string algo2str(Algo algo) {
    // Guard-style chain: each handled identifier returns its name immediately.
    if (algo == Algo::SingleLinkage) { return "single"; }
    if (algo == Algo::CompleteLinkage) { return "complete"; }
    if (algo == Algo::UClust) { return "uclust"; }
    if (algo == Algo::SetCover) { return "set-cover"; }
    if (algo == Algo::Leiden) { return "leiden"; }
    if (algo == Algo::CdHit) { return "cd-hit"; }

    // Anything else is not a handled algorithm.
    throw std::runtime_error("Unkown clustering algorithm");
}


public:

Expand All @@ -85,7 +99,9 @@ class Console {
bool outputCSV{ false };

LeidenParams leidenParams;


bool verbose{ false };

void printUsage() const;
bool parse(int argc, char** argv);

Expand Down
195 changes: 78 additions & 117 deletions src/distances.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,8 @@ size_t SparseMatrixNamed::load(
namesBuffer = new char[1LL << 30]; // 1 GB buffer for names
char* raw_ptr = namesBuffer;

std::vector<int> counts;
std::vector<dist_t> tmp_dists;
tmp_dists.reserve(128LL << 20); // for 128M distances
// assume space for 8M objects
distances.reserve(8LL << 20);

bool continueReading = true;
char* place = buf;
Expand Down Expand Up @@ -207,7 +206,6 @@ size_t SparseMatrixNamed::load(
// if name not mapped to numerical ids
if (it->second.first == -1) {
ids2names.push_back(it->first);
counts.push_back(0);
it->second.first = ids2names.size() - 1;
}
}
Expand All @@ -220,10 +218,24 @@ size_t SparseMatrixNamed::load(
continue;
}

++counts[i];
++counts[j];
if (distances.size() <= j) {
distances.resize(j + 1);
}

auto& Di = distances[i];
auto& Dj = distances[j];

// extend capacity by factor 1.5 with 16 as an initial state
if (Di.capacity() == Di.size()) {
Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5));
}

if (Dj.capacity() == Dj.size()) {
Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5));
}

tmp_dists.emplace_back(i, j, d);
Di.emplace_back(i, j, d);
Dj.emplace_back(j, i, d);
}

// copy remaining part after consuming all the lines
Expand All @@ -236,65 +248,20 @@ size_t SparseMatrixNamed::load(
}
}

// if necessary, sort distances in rows according to the second id
n_elements = 0;

distances.resize(tmp_dists.size() * 2);
rows.resize(counts.size());
int cumulated = 0;
for (size_t i = 0; i < rows.size(); ++i) {
rows[i] = distances.data() + cumulated;
cumulated += counts[i];
}

struct row_info {
int n_filled{ 0 };
int last_id{ -1 };
};

std::vector <row_info> rows_info(rows.size());

// second pass - put distances in the final structure
for (const dist_t& dist : tmp_dists) {

uint32_t i = dist.u.s.lo;
uint32_t j = dist.u.s.hi;
double d = dist.d;

rows[i][rows_info[i].n_filled] = dist;
++rows_info[i].n_filled;
rows_info[i].last_id = j;

rows[j][rows_info[j].n_filled] = dist_t{ j,i,d };
++rows_info[j].n_filled;
rows_info[j].last_id = i;
}

auto end = distances.data() + distances.size();
rows.push_back(end);

// if necessary, sort distances in rows according to the second id
dist_t* curBegin = rows[0];

for (size_t i = 0; i < rows.size() - 1; ++i) {
std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids < b.u.ids; });
auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
for (auto& row : distances) {
std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });

if (rows[i] != curBegin) {
newEnd = std::copy(curBegin, newEnd, rows[i]);
}

curBegin = rows[i + 1];
rows[i + 1] = newEnd;
}
row.erase(newEnd, row.end());

size_t newSize = rows.back() - rows.front();
distances.erase(distances.begin() + newSize, distances.end());
n_elements += row.size();
}

delete[] buf;

// debug stuff
//std::ofstream dbg("debug.log");
//print(dbg);

return n_total_distances;
}

Expand Down Expand Up @@ -438,15 +405,11 @@ size_t SparseMatrixNumbered::load(
auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; };
auto is_newline = [](char c) {return c == '\r' || c == '\n'; };

std::vector<int> counts;

counts.reserve(8LL << 20); // assume space for 8M objects
// assume space for 8M objects
distances.reserve(8LL << 20);
global2local.reserve(8LL << 20);
local2global.reserve(8LL << 20);

std::vector<dist_t> tmp_dists;
tmp_dists.reserve(128LL << 20); // for 128M distances

bool continueReading = true;
char* place = buf;

Expand Down Expand Up @@ -553,15 +516,24 @@ size_t SparseMatrixNumbered::load(
continue;
}

// resize counts vector
if (j + 1 > counts.size()) {
counts.resize(j + 1);
if (distances.size() <= j) {
distances.resize(j + 1);
}

auto& Di = distances[i];
auto& Dj = distances[j];

// extend capacity by factor 1.5 with 16 as an initial state
if (Di.capacity() == Di.size()) {
Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5));
}

++counts[i];
++counts[j];
if (Dj.capacity() == Dj.size()) {
Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5));
}

tmp_dists.emplace_back(i, j, d);
Di.emplace_back(i, j, d);
Dj.emplace_back(j, i, d);
}

// copy remaining part after consuming all the lines
Expand All @@ -574,61 +546,50 @@ size_t SparseMatrixNumbered::load(
}
}

// if necessary, sort distances in rows according to the second id
n_elements = 0;

distances.resize(tmp_dists.size() * 2);
rows.resize(counts.size());
int cumulated = 0;
for (size_t i = 0; i < rows.size(); ++i) {
rows[i] = distances.data() + cumulated;
cumulated += counts[i];
}

struct row_info {
int n_filled{ 0 };
int last_id{ -1 };
};

std::vector <row_info> rows_info(rows.size());

// second pass - put distances in the final structure
for (const dist_t& dist : tmp_dists) {

uint32_t i = dist.u.s.lo;
uint32_t j = dist.u.s.hi;
double d = dist.d;

rows[i][rows_info[i].n_filled] = dist;
++rows_info[i].n_filled;
rows_info[i].last_id = j;
for (auto& row : distances) {
std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });

rows[j][rows_info[j].n_filled] = dist_t{ j,i,d };
++rows_info[j].n_filled;
rows_info[j].last_id = i;
row.erase(newEnd, row.end());
n_elements += row.size();
}

auto end = distances.data() + distances.size();
rows.push_back(end);
delete[] buf;

// if necessary, sort distances in rows according to the second id
dist_t* curBegin = rows[0];
// Print distance histogram in the verbose mode
if (Log::getInstance(Log::LEVEL_VERBOSE).isEnabled()) {

for (size_t i = 0; i < rows.size() - 1; ++i) {
std::sort(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); });
auto newEnd = std::unique(curBegin, rows[i + 1], [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; });
std::vector<double> histo_bounds{ 0 };
double width = 0.001;

if (rows[i] != curBegin) {
newEnd = std::copy(curBegin, newEnd, rows[i]);
while (histo_bounds.back() < 0.05)
{
histo_bounds.push_back(histo_bounds.back() + width);
}
histo_bounds.push_back(std::numeric_limits<double>::max());
std::vector<int> histo(histo_bounds.size());

for (auto& row : distances) {
for (const auto& e : row) {
for (size_t i = 0; i < histo_bounds.size(); ++i) {
if (e.d < histo_bounds[i]) {
++histo[i];
break;
}
}
}
}

curBegin = rows[i + 1];
rows[i + 1] = newEnd;
LOG_VERBOSE << endl << "Distance histogram" << endl;
for (size_t i = 0; i < histo_bounds.size(); ++i) {
LOG_VERBOSE << " d < " << histo_bounds[i] << ": " << histo[i] << endl;
}
LOG_VERBOSE << endl;
}

size_t newSize = rows.back() - rows.front();
distances.erase(distances.begin() + newSize, distances.end());

delete[] buf;

return n_total_distances;
}

Expand Down Expand Up @@ -760,7 +721,7 @@ void SparseMatrixNamed::print(std::ostream& out) {
for (auto name : names) {

int i = names2ids[name].first;
std::vector<dist_t> row(rows[i], rows[i + 1]);
std::vector<dist_t>& row = distances[i];

std::sort(row.begin(), row.end(), [this](const dist_t& a, const dist_t& b) {
return strcmp(ids2names[a.u.s.hi], ids2names[b.u.s.hi]) < 0;
Expand Down
Loading

0 comments on commit 5ed7787

Please sign in to comment.