Renamed SubsetRemapper class and moved it to its own file.

- Improved its internal documentation for future me.
- Moved the associated tests to their own file.
- Cleaned up documentation for SubsetSanitizer, for consistency.
SingleR-inc · Oct 10, 2024 · fb2b357 · fb2b357
1 parent 052356f
commit fb2b357
Show file tree

Hide file tree

Showing 8 changed files with 220 additions and 168 deletions.
diff --git a/include/singlepp/SubsetRemapper.hpp b/include/singlepp/SubsetRemapper.hpp
@@ -0,0 +1,106 @@
+#ifndef SINGLEPP_SUBSET_REMAPPER_HPP
+#define SINGLEPP_SUBSET_REMAPPER_HPP
+
+#include "scaled_ranks.hpp"
+
+#include <vector>
+#include <limits>
+
+namespace singlepp {
+
+namespace internal {
+
+/*
+ * This class remaps the indices in the RankVector to the subset of interest.
+ * For example, if our subset of features of interest is:
+ *
+ *    [a, c, g, e]
+ *
+ * ... the user should call .add(a), .add(c), .add(g), .add(e). Then, when we
+ * receive a rank vector like:
+ *
+ * [(A, a), (B, b), (C, c), (D, d), (E, e), (F, f), (G, g)],
+ *
+ * ... the .remap() method filters out the entries that weren't added by
+ * .add(), and then remaps the remaining indices to their position on the
+ * subset vector, yielding:
+ *
+ * [(A, 0), (C, 1), (E, 3), (G, 2)]
+ *
+ * The idea is to adjust the indices so it appears as if we had been working
+ * with the subset of features all along, allowing us to call scaled_ranks() to
+ * perform the rest of the analysis on the subsets only. This is primarily
+ * intended for use on the reference rank vectors during fine-tuning, given that
+ * data extracted from the reference/test matrices is already subsetted when
+ * returned by SubsetSanitizer::fill_ranks().
+ */
+template<typename Index_>
+class SubsetRemapper {
+private:
+    // my_mapping[i] holds (was i added?, position of i among the added indices). A vector is used
+    // instead of an unordered_map for fast remap() inside the inner loop of the fine-tuning iterations.
+    std::vector<std::pair<bool, Index_> > my_mapping;
+    std::vector<size_t> my_used;
+    Index_ my_counter = 0;
+
+public:
+    void add(size_t i) {
+        if (i >= my_mapping.size()) {
+            my_mapping.resize(i + 1);
+        }
+        if (!my_mapping[i].first) {
+            my_mapping[i].first = true;
+            my_mapping[i].second = my_counter;
+            my_used.push_back(i);
+            ++my_counter;
+        }
+    }
+
+    void clear() {
+        my_counter = 0;
+        for (auto u : my_used) {
+            my_mapping[u].first = false;
+        }
+        my_used.clear();
+    }
+
+    void reserve(size_t n) {
+        my_mapping.reserve(n);
+    }
+
+public:
+    template<typename Stat_>
+    void remap(const RankedVector<Stat_, Index_>& input, RankedVector<Stat_, Index_>& output) const {
+        output.clear();
+
+        if (static_cast<size_t>(std::numeric_limits<Index_>::max()) < my_mapping.size()) {
+            // Skip the bounds check as the mapping is already larger than the largest possible index.
+            // This also avoids the need to cast indices to size_t for comparison to my_mapping.size().
+            for (const auto& x : input) {
+                const auto& target = my_mapping[x.second];
+                if (target.first) {
+                    output.emplace_back(x.first, target.second);
+                }
+            }
+
+        } else {
+            // Otherwise, the size fits in Index_, so cast it once outside the loop instead of casting
+            // x.second to size_t inside the loop. Indices at or beyond the mapping size are skipped.
+            Index_ maxed = my_mapping.size();
+            for (const auto& x : input) {
+                if (maxed > x.second) {
+                    const auto& target = my_mapping[x.second];
+                    if (target.first) {
+                        output.emplace_back(x.first, target.second);
+                    }
+                }
+            }
+        }
+    }
+};
+
+}
+
+}
+
+#endif
diff --git a/include/singlepp/SubsetSanitizer.hpp b/include/singlepp/SubsetSanitizer.hpp
@@ -10,12 +10,14 @@ namespace singlepp {
 
 namespace internal {
 
-// This class sanitizes any user-provided subsets so that we can provide a
-// sorted and unique subset to the tatami extractor. We then undo the sorting
-// to use the original indices in the rank filler. This entire thing is
-// necessary as the behavior of the subsets isn't something that the user can
-// easily control (e.g., if the reference/test datasets do not use the same
-// feature ordering, in which case the subset is necessarily unsorted).
+/*
+ * This class sanitizes any user-provided subsets so that we can provide a
+ * sorted and unique subset to the tatami extractor. We then undo the sorting
+ * to use the original indices in the rank filler. This entire thing is
+ * necessary as the behavior of the subsets isn't something that the user can
+ * easily control (e.g., if the reference/test datasets do not use the same
+ * feature ordering, in which case the subset is necessarily unsorted).
+ */
 template<typename Index_>
 class SubsetSanitizer {
 private:
@@ -64,6 +66,8 @@ class SubsetSanitizer {
 
     template<typename Stat_>
     void fill_ranks(const Stat_* ptr, RankedVector<Stat_, Index_>& vec) const {
+        // The indices in the output 'vec' refer to positions on the subset
+        // vector, as if the input data was already subsetted. 
         vec.clear();
         if (my_use_sorted_subset) {
             size_t num = my_original_indices.size();

diff --git a/include/singlepp/annotate_cells_integrated.hpp b/include/singlepp/annotate_cells_integrated.hpp
@@ -6,6 +6,7 @@
 #include "tatami/tatami.hpp"
 
 #include "scaled_ranks.hpp"
+#include "SubsetRemapper.hpp"
 #include "train_integrated.hpp"
 #include "find_best_and_delta.hpp"
 #include "fill_labels_in_use.hpp"
@@ -26,9 +27,9 @@ namespace internal {
 // across cells and references.
 template<typename Index_, typename Value_, typename Float_>
 struct PerReferenceIntegratedWorkspace {
-    RankRemapper<Index_> intersect_mapping;
+    SubsetRemapper<Index_> intersect_mapping;
     bool direct_mapping_filled;
-    RankRemapper<Index_> direct_mapping;
+    SubsetRemapper<Index_> direct_mapping;
 
     RankedVector<Value_, Index_> test_ranked;
     RankedVector<Index_, Index_> ref_ranked;
@@ -50,7 +51,7 @@ Float_ compute_single_reference_score_integrated(
 {
     // Further subsetting to the intersection of markers that are
     // actual present in this particular reference.
-    const RankRemapper<Index_>* mapping;
+    const SubsetRemapper<Index_>* mapping;
     if (trained.check_availability[ref_i]) {
         const auto& cur_available = trained.available[ref_i];
         workspace.intersect_mapping.clear();

diff --git a/include/singlepp/annotate_cells_single.hpp b/include/singlepp/annotate_cells_single.hpp
@@ -8,6 +8,7 @@
 #include "Markers.hpp"
 #include "build_indices.hpp"
 #include "SubsetSanitizer.hpp"
+#include "SubsetRemapper.hpp"
 #include "find_best_and_delta.hpp"
 #include "scaled_ranks.hpp"
 #include "correlations_to_score.hpp"
@@ -26,7 +27,7 @@ class FineTuneSingle {
 private:
     std::vector<Label_> my_labels_in_use;
 
-    RankRemapper<Index_> my_gene_subset;
+    SubsetRemapper<Index_> my_gene_subset;
 
     std::vector<Float_> my_scaled_left, my_scaled_right;
 

diff --git a/include/singlepp/scaled_ranks.hpp b/include/singlepp/scaled_ranks.hpp
@@ -4,7 +4,7 @@
 #include <algorithm>
 #include <vector>
 #include <cmath>
-#include <limits>
+#include <type_traits>
 
 namespace singlepp {
 
@@ -59,71 +59,6 @@ void scaled_ranks(const RankedVector<Stat_, Index_>& collected, Output_* outgoin
     }
 }
 
-template<typename Index_>
-class RankRemapper {
-private:
-    // This uses a vector instead of an unordered_map for fast remap()
-    // inside the inner loop of the fine-tuning iterations.
-    std::vector<std::pair<bool, Index_> > my_mapping;
-    std::vector<size_t> my_used;
-    Index_ my_counter = 0;
-
-public:
-    void add(size_t i) {
-        if (i >= my_mapping.size()) {
-            my_mapping.resize(i + 1);
-        }
-        if (!my_mapping[i].first) {
-            my_mapping[i].first = true;
-            my_mapping[i].second = my_counter;
-            my_used.push_back(i);
-            ++my_counter;
-        }
-    }
-
-    void clear() {
-        my_counter = 0;
-        for (auto u : my_used) {
-            my_mapping[u].first = false;
-        }
-        my_used.clear();
-    }
-
-    void reserve(size_t n) {
-        my_mapping.reserve(n);
-    }
-
-public:
-    template<typename Stat_>
-    void remap(const RankedVector<Stat_, Index_>& input, RankedVector<Stat_, Index_>& output) const {
-        output.clear();
-
-        if (static_cast<size_t>(std::numeric_limits<Index_>::max()) < my_mapping.size()) {
-            // Avoid unnecessary check if the size is already greater than the largest possible index.
-            // This also avoids the need to cast to indices size_t for comparison to my_mapping.size().
-            for (const auto& x : input) {
-                const auto& target = my_mapping[x.second];
-                if (target.first) {
-                    output.emplace_back(x.first, target.second);
-                }
-            }
-
-        } else {
-            // Otherwise, it is safe to cast the size to Index_ outside the
-            // loop so that we don't need to cast x.second to size_t inside the loop.
-            Index_ maxed = my_mapping.size();
-            for (const auto& x : input) {
-                if (maxed > x.second) {
-                    const auto& target = my_mapping[x.second];
-                    if (target.first) {
-                        output.emplace_back(x.first, target.second);
-                    }
-                }
-            }
-        }
-    }
-};
-
 template<typename Stat_, typename Index_, typename Simple_>
 void simplify_ranks(const RankedVector<Stat_, Index_>& x, RankedVector<Simple_, Index_>& output) {
     if (x.size()) {

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_executable(
     libtest 
     src/classify_single.cpp
     src/scaled_ranks.cpp
+    src/SubsetRemapper.cpp
     src/correlations_to_score.cpp
     src/Intersection.cpp
     src/subset_to_markers.cpp

diff --git a/tests/src/SubsetRemapper.cpp b/tests/src/SubsetRemapper.cpp
@@ -0,0 +1,96 @@
+#include <gtest/gtest.h>
+
+#include "singlepp/scaled_ranks.hpp"
+#include "singlepp/SubsetRemapper.hpp"
+
+TEST(SubsetRemapper, Subsets) {
+    singlepp::internal::SubsetRemapper<int> remapper;
+    remapper.reserve(10);
+    remapper.add(1);
+    remapper.add(6); 
+    remapper.add(1); // duplicates are ignored.
+    remapper.add(8);
+
+    // Input contains every index from 0 to 9, so all three added indices appear in the output.
+    {
+        singlepp::internal::RankedVector<double, int> input;
+        for (size_t i = 0; i < 10; ++i) {
+            input.emplace_back(static_cast<double>(i) / 10, i);
+        }
+
+        singlepp::internal::RankedVector<double, int> output;
+        remapper.remap(input, output);
+
+        EXPECT_EQ(output.size(), 3);
+        EXPECT_EQ(output[0].first, 0.1);
+        EXPECT_EQ(output[0].second, 0);
+        EXPECT_EQ(output[1].first, 0.6);
+        EXPECT_EQ(output[1].second, 1);
+        EXPECT_EQ(output[2].first, 0.8);
+        EXPECT_EQ(output[2].second, 2);
+
+        // Checking that the clear() method works as expected.
+        auto copy = remapper;
+        copy.clear();
+        copy.remap(input, output);
+        EXPECT_TRUE(output.empty());
+    }
+
+    // Input contains only even indices, so the added index 1 is absent from the output.
+    {
+        singlepp::internal::RankedVector<double, int> input;
+        for (size_t i = 0; i < 10; i += 2) {
+            input.emplace_back(static_cast<double>(i) / 10, i);
+        }
+
+        singlepp::internal::RankedVector<double, int> output;
+        remapper.remap(input, output);
+
+        EXPECT_EQ(output.size(), 2);
+        EXPECT_EQ(output[0].first, 0.6);
+        EXPECT_EQ(output[0].second, 1);
+        EXPECT_EQ(output[1].first, 0.8);
+        EXPECT_EQ(output[1].second, 2);
+
+        // Checking that the clear() method works as expected.
+        auto copy = remapper;
+        copy.clear();
+        copy.remap(input, output);
+        EXPECT_TRUE(output.empty());
+
+        copy.add(4);
+        copy.add(1);
+        copy.remap(input, output);
+        EXPECT_EQ(output.size(), 1);
+        EXPECT_EQ(output[0].first, 0.4);
+        EXPECT_EQ(output[0].second, 0);
+    }
+}
+
+TEST(SubsetRemapper, SubsetSmallType) {
+    // Check that the remapper behaves correctly when the mapping size exceeds
+    // the largest value representable by the index type.
+    singlepp::internal::SubsetRemapper<uint8_t> remapper;
+    remapper.reserve(300);
+    remapper.add(200);
+    remapper.add(100); 
+    remapper.add(10); 
+    remapper.add(100); // ignoring duplicates again!
+    remapper.add(255); // need this to force the mapping to exceed the max index size.
+
+    singlepp::internal::RankedVector<double, uint8_t> input;
+    for (size_t i = 0; i < 250; i += 10) {
+        input.emplace_back(static_cast<double>(i) / 100, i);
+    }
+
+    singlepp::internal::RankedVector<double, uint8_t> output;
+    remapper.remap(input, output);
+
+    EXPECT_EQ(output.size(), 3);
+    EXPECT_EQ(output[0].first, 0.1);
+    EXPECT_EQ(output[0].second, 2);
+    EXPECT_EQ(output[1].first, 1.0);
+    EXPECT_EQ(output[1].second, 1);
+    EXPECT_EQ(output[2].first, 2.0);
+    EXPECT_EQ(output[2].second, 0);
+}