Skip to content

Commit

Permalink
Renamed SubsetRemapper class and moved it to its own file.
Browse files Browse the repository at this point in the history
- Improved its internal documentation for future me.
- Moved the associated tests to their own file.
- Cleaned up documentation for SubsetSanitizer, for consistency.
  • Loading branch information
LTLA committed Oct 10, 2024
1 parent 052356f commit fb2b357
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 168 deletions.
106 changes: 106 additions & 0 deletions include/singlepp/SubsetRemapper.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#ifndef SINGLEPP_SUBSET_REMAPPER_HPP
#define SINGLEPP_SUBSET_REMAPPER_HPP

#include "scaled_ranks.hpp"

#include <vector>
#include <limits>

namespace singlepp {

namespace internal {

/*
* This class remaps the indices in the RankVector to the subset of interest.
* For example, if our subset of features of interest is:
*
* [a, c, g, e]
*
* ... the user should call .add(a), .add(c), .add(g), .add(e), in that order.
* Then, when we receive a rank vector like:
*
* [(A, a), (B, b), (C, c), (D, d), (E, e), (F, f), (G, g)],
*
* ... the .remap() method filters out the entries that weren't added by
* .add(), and then remaps the remaining indices to their position on the
* subset vector, yielding:
*
* [(A, 0), (C, 1), (E, 3), (G, 2)]
*
* The idea is to adjust the indices so it appears as if we had been working
* with the subset of features all along, allowing us to call scaled_ranks() to
* perform the rest of the analysis on the subsets only. This is primarily
* intended for use on the reference rank vectors during fine-tuning, given that
* data extracted from the reference/test matrices is already subsetted when
* returned by SubsetSanitizer::fill_ranks().
*/
template<typename Index_>
class SubsetRemapper {
private:
    // Dense lookup table addressed by the original index. A vector is used
    // rather than an unordered_map so that remap() stays fast inside the
    // inner loop of the fine-tuning iterations. For each entry, 'first'
    // flags whether that index was registered via add(), and 'second' holds
    // the position that was assigned to it at that time.
    std::vector<std::pair<bool, Index_> > my_mapping;

    // Indices that are currently flagged in 'my_mapping', so that clear()
    // only has to visit those entries.
    std::vector<size_t> my_used;

    // Position to hand out to the next newly registered index.
    Index_ my_counter = 0;

public:
    // Registers index 'i', growing the lookup table if necessary. An index
    // that is already registered is left untouched, so duplicates do not
    // consume a new position.
    void add(size_t i) {
        if (my_mapping.size() <= i) {
            my_mapping.resize(i + 1);
        }

        auto& slot = my_mapping[i];
        if (slot.first) {
            return;
        }

        slot.first = true;
        slot.second = my_counter;
        my_used.push_back(i);
        ++my_counter;
    }

    // Unregisters every index and restarts the position counter at zero.
    // The lookup table keeps its current length; only the flags are reset.
    void clear() {
        for (auto idx : my_used) {
            my_mapping[idx].first = false;
        }
        my_used.clear();
        my_counter = 0;
    }

    // Reserves capacity in the lookup table for 'n' entries.
    void reserve(size_t n) {
        my_mapping.reserve(n);
    }

public:
    // Empties 'output' and then walks 'input' in order. Each pair whose index
    // ('second') was registered is appended to 'output' with the index
    // replaced by its assigned position; all other pairs are dropped.
    template<typename Stat_>
    void remap(const RankedVector<Stat_, Index_>& input, RankedVector<Stat_, Index_>& output) const {
        output.clear();

        // Shared by both loops below: look up the entry's index and copy the
        // entry across if that index is registered.
        const auto transfer = [&](const auto& entry) {
            const auto& slot = my_mapping[entry.second];
            if (slot.first) {
                output.emplace_back(entry.first, slot.second);
            }
        };

        const size_t extent = my_mapping.size();
        if (extent > static_cast<size_t>(std::numeric_limits<Index_>::max())) {
            // The table is longer than the largest value that Index_ can
            // hold, so no bounds check is performed here. This also avoids
            // casting each index to size_t for comparison against the size.
            for (const auto& entry : input) {
                transfer(entry);
            }
            return;
        }

        // Otherwise the table length fits in Index_, so it is converted once
        // outside the loop instead of casting every index to size_t inside it.
        const Index_ limit = extent;
        for (const auto& entry : input) {
            if (entry.second < limit) {
                transfer(entry);
            }
        }
    }
};

}

}

#endif
16 changes: 10 additions & 6 deletions include/singlepp/SubsetSanitizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ namespace singlepp {

namespace internal {

// This class sanitizes any user-provided subsets so that we can provide a
// sorted and unique subset to the tatami extractor. We then undo the sorting
// to use the original indices in the rank filler. This entire thing is
// necessary as the behavior of the subsets isn't something that the user can
// easily control (e.g., if the reference/test datasets do not use the same
// feature ordering, in which case the subset is necessarily unsorted).
/*
* This class sanitizes any user-provided subsets so that we can provide a
* sorted and unique subset to the tatami extractor. We then undo the sorting
* to use the original indices in the rank filler. This entire thing is
* necessary as the behavior of the subsets isn't something that the user can
* easily control (e.g., if the reference/test datasets do not use the same
* feature ordering, in which case the subset is necessarily unsorted).
*/
template<typename Index_>
class SubsetSanitizer {
private:
Expand Down Expand Up @@ -64,6 +66,8 @@ class SubsetSanitizer {

template<typename Stat_>
void fill_ranks(const Stat_* ptr, RankedVector<Stat_, Index_>& vec) const {
// The indices in the output 'vec' refer to positions on the subset
// vector, as if the input data was already subsetted.
vec.clear();
if (my_use_sorted_subset) {
size_t num = my_original_indices.size();
Expand Down
7 changes: 4 additions & 3 deletions include/singlepp/annotate_cells_integrated.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "tatami/tatami.hpp"

#include "scaled_ranks.hpp"
#include "SubsetRemapper.hpp"
#include "train_integrated.hpp"
#include "find_best_and_delta.hpp"
#include "fill_labels_in_use.hpp"
Expand All @@ -26,9 +27,9 @@ namespace internal {
// across cells and references.
template<typename Index_, typename Value_, typename Float_>
struct PerReferenceIntegratedWorkspace {
RankRemapper<Index_> intersect_mapping;
SubsetRemapper<Index_> intersect_mapping;
bool direct_mapping_filled;
RankRemapper<Index_> direct_mapping;
SubsetRemapper<Index_> direct_mapping;

RankedVector<Value_, Index_> test_ranked;
RankedVector<Index_, Index_> ref_ranked;
Expand All @@ -50,7 +51,7 @@ Float_ compute_single_reference_score_integrated(
{
// Further subsetting to the intersection of markers that are
// actually present in this particular reference.
const RankRemapper<Index_>* mapping;
const SubsetRemapper<Index_>* mapping;
if (trained.check_availability[ref_i]) {
const auto& cur_available = trained.available[ref_i];
workspace.intersect_mapping.clear();
Expand Down
3 changes: 2 additions & 1 deletion include/singlepp/annotate_cells_single.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "Markers.hpp"
#include "build_indices.hpp"
#include "SubsetSanitizer.hpp"
#include "SubsetRemapper.hpp"
#include "find_best_and_delta.hpp"
#include "scaled_ranks.hpp"
#include "correlations_to_score.hpp"
Expand All @@ -26,7 +27,7 @@ class FineTuneSingle {
private:
std::vector<Label_> my_labels_in_use;

RankRemapper<Index_> my_gene_subset;
SubsetRemapper<Index_> my_gene_subset;

std::vector<Float_> my_scaled_left, my_scaled_right;

Expand Down
67 changes: 1 addition & 66 deletions include/singlepp/scaled_ranks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <algorithm>
#include <vector>
#include <cmath>
#include <limits>
#include <type_traits>

namespace singlepp {

Expand Down Expand Up @@ -59,71 +59,6 @@ void scaled_ranks(const RankedVector<Stat_, Index_>& collected, Output_* outgoin
}
}

template<typename Index_>
class RankRemapper {
private:
    // Dense lookup table addressed by the original index. A vector is used
    // rather than an unordered_map so that remap() stays fast inside the
    // inner loop of the fine-tuning iterations. For each entry, 'first'
    // flags whether that index was registered via add(), and 'second' holds
    // the position that was assigned to it at that time.
    std::vector<std::pair<bool, Index_> > my_mapping;

    // Indices that are currently flagged in 'my_mapping', so that clear()
    // only has to visit those entries.
    std::vector<size_t> my_used;

    // Position to hand out to the next newly registered index.
    Index_ my_counter = 0;

public:
    // Registers index 'i', growing the lookup table if necessary. An index
    // that is already registered is left untouched, so duplicates do not
    // consume a new position.
    void add(size_t i) {
        if (my_mapping.size() <= i) {
            my_mapping.resize(i + 1);
        }

        auto& slot = my_mapping[i];
        if (slot.first) {
            return;
        }

        slot.first = true;
        slot.second = my_counter;
        my_used.push_back(i);
        ++my_counter;
    }

    // Unregisters every index and restarts the position counter at zero.
    // The lookup table keeps its current length; only the flags are reset.
    void clear() {
        for (auto idx : my_used) {
            my_mapping[idx].first = false;
        }
        my_used.clear();
        my_counter = 0;
    }

    // Reserves capacity in the lookup table for 'n' entries.
    void reserve(size_t n) {
        my_mapping.reserve(n);
    }

public:
    // Empties 'output' and then walks 'input' in order. Each pair whose index
    // ('second') was registered is appended to 'output' with the index
    // replaced by its assigned position; all other pairs are dropped.
    template<typename Stat_>
    void remap(const RankedVector<Stat_, Index_>& input, RankedVector<Stat_, Index_>& output) const {
        output.clear();

        // Shared by both loops below: look up the entry's index and copy the
        // entry across if that index is registered.
        const auto transfer = [&](const auto& entry) {
            const auto& slot = my_mapping[entry.second];
            if (slot.first) {
                output.emplace_back(entry.first, slot.second);
            }
        };

        const size_t extent = my_mapping.size();
        if (extent > static_cast<size_t>(std::numeric_limits<Index_>::max())) {
            // The table is longer than the largest value that Index_ can
            // hold, so no bounds check is performed here. This also avoids
            // casting each index to size_t for comparison against the size.
            for (const auto& entry : input) {
                transfer(entry);
            }
            return;
        }

        // Otherwise the table length fits in Index_, so it is converted once
        // outside the loop instead of casting every index to size_t inside it.
        const Index_ limit = extent;
        for (const auto& entry : input) {
            if (entry.second < limit) {
                transfer(entry);
            }
        }
    }
};

template<typename Stat_, typename Index_, typename Simple_>
void simplify_ranks(const RankedVector<Stat_, Index_>& x, RankedVector<Simple_, Index_>& output) {
if (x.size()) {
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_executable(
libtest
src/classify_single.cpp
src/scaled_ranks.cpp
src/SubsetRemapper.cpp
src/correlations_to_score.cpp
src/Intersection.cpp
src/subset_to_markers.cpp
Expand Down
96 changes: 96 additions & 0 deletions tests/src/SubsetRemapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include <gtest/gtest.h>

#include "singlepp/scaled_ranks.hpp"
#include "singlepp/SubsetRemapper.hpp"

TEST(SubsetRemapper, Subsets) {
    using Ranked = singlepp::internal::RankedVector<double, int>;

    // Indices 1, 6 and 8 are registered, receiving positions 0, 1 and 2.
    singlepp::internal::SubsetRemapper<int> remapper;
    remapper.reserve(10);
    remapper.add(1);
    remapper.add(6);
    remapper.add(1); // repeated index, should not take up a new position.
    remapper.add(8);

    // Builds (i / 10, i) for i = 0, step, 2 * step, ... below 10.
    const auto make_input = [](size_t step) {
        Ranked values;
        for (size_t i = 0; i < 10; i += step) {
            values.emplace_back(static_cast<double>(i) / 10, i);
        }
        return values;
    };

    // Checks the statistic and the remapped index of one output entry.
    const auto expect_entry = [](const Ranked& vec, size_t pos, double stat, int index) {
        EXPECT_EQ(vec[pos].first, stat);
        EXPECT_EQ(vec[pos].second, index);
    };

    // Input covers every index from 0 to 9.
    {
        const Ranked input = make_input(1);

        Ranked output;
        remapper.remap(input, output);

        EXPECT_EQ(output.size(), 3);
        expect_entry(output, 0, 0.1, 0);
        expect_entry(output, 1, 0.6, 1);
        expect_entry(output, 2, 0.8, 2);

        // A cleared copy should no longer let anything through.
        auto wiped = remapper;
        wiped.clear();
        wiped.remap(input, output);
        EXPECT_TRUE(output.empty());
    }

    // Input covers the even indices only, so index 1 never shows up.
    {
        const Ranked input = make_input(2);

        Ranked output;
        remapper.remap(input, output);

        EXPECT_EQ(output.size(), 2);
        expect_entry(output, 0, 0.6, 1);
        expect_entry(output, 1, 0.8, 2);

        // A cleared copy should no longer let anything through.
        auto wiped = remapper;
        wiped.clear();
        wiped.remap(input, output);
        EXPECT_TRUE(output.empty());

        // Positions restart from zero after clear(); index 1 is registered
        // but absent from the input, so only index 4 is reported.
        wiped.add(4);
        wiped.add(1);
        wiped.remap(input, output);
        EXPECT_EQ(output.size(), 1);
        expect_entry(output, 0, 0.4, 0);
    }
}

TEST(SubsetRemapper, SubsetSmallType) {
    // Exercises the remapper when the mapping holds more entries than the
    // largest value of the index type.
    using Ranked = singlepp::internal::RankedVector<double, uint8_t>;

    singlepp::internal::SubsetRemapper<uint8_t> remapper;
    remapper.reserve(300);
    remapper.add(200);
    remapper.add(100);
    remapper.add(10);
    remapper.add(100); // repeated index, should not take up a new position.
    remapper.add(255); // grows the mapping to 256 entries, beyond the max uint8_t value.

    // Input holds (i / 100, i) for i = 0, 10, ..., 240.
    Ranked input;
    for (size_t i = 0; i < 250; i += 10) {
        input.emplace_back(static_cast<double>(i) / 100, i);
    }

    Ranked output;
    remapper.remap(input, output);

    // Checks the statistic and the remapped index of one output entry.
    const auto expect_entry = [&output](size_t pos, double stat, int index) {
        EXPECT_EQ(output[pos].first, stat);
        EXPECT_EQ(output[pos].second, index);
    };

    // Entries come out in input order (10, 100, 200), carrying the positions
    // assigned in registration order (200 -> 0, 100 -> 1, 10 -> 2).
    EXPECT_EQ(output.size(), 3);
    expect_entry(0, 0.1, 2);
    expect_entry(1, 1.0, 1);
    expect_entry(2, 2.0, 0);
}
Loading

0 comments on commit fb2b357

Please sign in to comment.