-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Renamed SubsetRemapper class and moved it to its own file.
- Improved its internal documentation for future me. - Moved the associated tests to their own file. - Cleaned up documentation for SubsetSanitizer, for consistency.
- Loading branch information
Showing
8 changed files
with
220 additions
and
168 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#ifndef SINGLEPP_SUBSET_REMAPPER_HPP | ||
#define SINGLEPP_SUBSET_REMAPPER_HPP | ||
|
||
#include "scaled_ranks.hpp" | ||
|
||
#include <vector> | ||
#include <limits> | ||
|
||
namespace singlepp { | ||
|
||
namespace internal { | ||
|
||
/* | ||
* This class remaps the indices in the RankVector to the subset of interest. | ||
* For example, if our subset of features of interest is: | ||
* | ||
* [a, c, g, e] | ||
* | ||
* ... the user should call .add(a), .add(c), .add(g) and .add(e), in that order. Then, when we | ||
* receive a rank vector like: | ||
* | ||
* [(A, a), (B, b), (C, c), (D, d), (E, e), (F, f), (G, g)], | ||
* | ||
* ... the .remap() method filters out the entries that weren't added by | ||
* .add(), and then remaps the remaining indices to their position on the | ||
* subset vector, yielding: | ||
* | ||
* [(A, 0), (C, 1), (E, 3), (G, 2)] | ||
* | ||
* The idea is to adjust the indices so it appears as if we had been working | ||
* with the subset of features all along, allowing us to call scaled_ranks() to | ||
* perform the rest of the analysis on the subsets only. This is primarily | ||
* intended for use on the reference rank vectors during fine-tuning, given that | ||
* data extracted from the reference/test matrices is already subsetted when | ||
* returned by SubsetSanitizer::fill_ranks(). | ||
*/ | ||
template<typename Index_> | ||
class SubsetRemapper { | ||
private: | ||
// This uses a vector instead of an unordered_map for fast remap() | ||
// inside the inner loop of the fine-tuning iterations. | ||
std::vector<std::pair<bool, Index_> > my_mapping; | ||
std::vector<size_t> my_used; | ||
Index_ my_counter = 0; | ||
|
||
public: | ||
void add(size_t i) { | ||
if (i >= my_mapping.size()) { | ||
my_mapping.resize(i + 1); | ||
} | ||
if (!my_mapping[i].first) { | ||
my_mapping[i].first = true; | ||
my_mapping[i].second = my_counter; | ||
my_used.push_back(i); | ||
++my_counter; | ||
} | ||
} | ||
|
||
void clear() { | ||
my_counter = 0; | ||
for (auto u : my_used) { | ||
my_mapping[u].first = false; | ||
} | ||
my_used.clear(); | ||
} | ||
|
||
void reserve(size_t n) { | ||
my_mapping.reserve(n); | ||
} | ||
|
||
public: | ||
template<typename Stat_> | ||
void remap(const RankedVector<Stat_, Index_>& input, RankedVector<Stat_, Index_>& output) const { | ||
output.clear(); | ||
|
||
if (static_cast<size_t>(std::numeric_limits<Index_>::max()) < my_mapping.size()) { | ||
// Avoid unnecessary check if the size is already greater than the largest possible index. | ||
// This also avoids the need to cast to indices size_t for comparison to my_mapping.size(). | ||
for (const auto& x : input) { | ||
const auto& target = my_mapping[x.second]; | ||
if (target.first) { | ||
output.emplace_back(x.first, target.second); | ||
} | ||
} | ||
|
||
} else { | ||
// Otherwise, it is safe to cast the size to Index_ outside the | ||
// loop so that we don't need to cast x.second to size_t inside the loop. | ||
Index_ maxed = my_mapping.size(); | ||
for (const auto& x : input) { | ||
if (maxed > x.second) { | ||
const auto& target = my_mapping[x.second]; | ||
if (target.first) { | ||
output.emplace_back(x.first, target.second); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
}; | ||
|
||
} | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#include <gtest/gtest.h> | ||
|
||
#include "singlepp/scaled_ranks.hpp" | ||
#include "singlepp/SubsetRemapper.hpp" | ||
|
||
TEST(SubsetRemapper, Subsets) {
    typedef singlepp::internal::RankedVector<double, int> Ranked;

    // Register 1, 6 and 8; the second request for 1 must be a no-op, so the
    // assigned positions are 1 -> 0, 6 -> 1 and 8 -> 2.
    singlepp::internal::SubsetRemapper<int> remapper;
    remapper.reserve(10);
    remapper.add(1);
    remapper.add(6);
    remapper.add(1);
    remapper.add(8);

    // Case 1: the input holds every index from 0 to 9.
    {
        Ranked everything;
        for (int idx = 0; idx < 10; ++idx) {
            everything.emplace_back(static_cast<double>(idx) / 10, idx);
        }

        Ranked remapped;
        remapper.remap(everything, remapped);

        EXPECT_EQ(remapped.size(), 3);
        EXPECT_EQ(remapped[0].first, 0.1);
        EXPECT_EQ(remapped[0].second, 0);
        EXPECT_EQ(remapped[1].first, 0.6);
        EXPECT_EQ(remapped[1].second, 1);
        EXPECT_EQ(remapped[2].first, 0.8);
        EXPECT_EQ(remapped[2].second, 2);

        // A cleared copy should no longer let anything through.
        auto wiped = remapper;
        wiped.clear();
        wiped.remap(everything, remapped);
        EXPECT_TRUE(remapped.empty());
    }

    // Case 2: the input only holds the even indices 0, 2, 4, 6 and 8.
    {
        Ranked evens;
        for (int idx = 0; idx < 10; idx += 2) {
            evens.emplace_back(static_cast<double>(idx) / 10, idx);
        }

        Ranked remapped;
        remapper.remap(evens, remapped);

        EXPECT_EQ(remapped.size(), 2);
        EXPECT_EQ(remapped[0].first, 0.6);
        EXPECT_EQ(remapped[0].second, 1);
        EXPECT_EQ(remapped[1].first, 0.8);
        EXPECT_EQ(remapped[1].second, 2);

        // A cleared copy should no longer let anything through.
        auto wiped = remapper;
        wiped.clear();
        wiped.remap(evens, remapped);
        EXPECT_TRUE(remapped.empty());

        // After clearing, positions are handed out from zero again: 4 gets
        // position 0 and 1 gets position 1, but only 4 occurs in the input.
        wiped.add(4);
        wiped.add(1);
        wiped.remap(evens, remapped);
        EXPECT_EQ(remapped.size(), 1);
        EXPECT_EQ(remapped[0].first, 0.4);
        EXPECT_EQ(remapped[0].second, 0);
    }
}
|
||
TEST(SubsetRemapper, SubsetSmallType) {
    typedef singlepp::internal::RankedVector<double, uint8_t> Ranked;

    // Exercise the remapper when the lookup table has more entries than the
    // largest value the index type can hold.
    singlepp::internal::SubsetRemapper<uint8_t> remapper;
    remapper.reserve(300);
    remapper.add(200);
    remapper.add(100);
    remapper.add(10);
    remapper.add(100); // already registered, so this is a no-op.
    remapper.add(255); // grows the table to 256 entries, one past the uint8_t maximum.

    // Input covers every multiple of 10 from 0 up to 240.
    Ranked tens;
    for (size_t idx = 0; idx < 250; idx += 10) {
        tens.emplace_back(static_cast<double>(idx) / 100, idx);
    }

    Ranked remapped;
    remapper.remap(tens, remapped);

    // Only 10, 100 and 200 are both registered and present; they come out
    // in input order carrying the positions assigned by add().
    EXPECT_EQ(remapped.size(), 3);
    EXPECT_EQ(remapped[0].first, 0.1);
    EXPECT_EQ(remapped[0].second, 2);
    EXPECT_EQ(remapped[1].first, 1.0);
    EXPECT_EQ(remapped[1].second, 1);
    EXPECT_EQ(remapped[2].first, 2.0);
    EXPECT_EQ(remapped[2].second, 0);
}
Oops, something went wrong.