diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 9f7095993..c152999e1 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -59,6 +59,9 @@ struct CompressedBlockMetadataNoBlockIndex { size_t compressedSize_; bool operator==(const OffsetAndCompressedSize&) const = default; }; + + using GraphInfo = std::optional>; + std::vector offsetsAndCompressedSize_; size_t numRows_; @@ -115,7 +118,14 @@ struct CompressedBlockMetadataNoBlockIndex { const CompressedBlockMetadataNoBlockIndex& blockMetadata) { str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) " << blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_ - << "." << std::endl; + << ".\n"; + if (blockMetadata.graphInfo_.has_value()) { + str << "Graphs: "; + ad_utility::lazyStrJoin(&str, blockMetadata.graphInfo_.value(), ", "); + str << '\n'; + } + str << "[possibly] contains duplicates: " + << blockMetadata.containsDuplicatesWithDifferentGraphs_ << '\n'; return str; } }; @@ -133,9 +143,9 @@ struct CompressedBlockMetadata : CompressedBlockMetadataNoBlockIndex { // Format BlockMetadata contents for debugging. friend std::ostream& operator<<( std::ostream& str, const CompressedBlockMetadata& blockMetadata) { - str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) " - << blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_ - << "." 
<< std::endl; + str << static_cast( + blockMetadata); + str << "block index: " << blockMetadata.blockIndex_ << "\n"; return str; } }; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index b2244f896..05353324a 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -10,6 +10,7 @@ #include "absl/strings/str_join.h" #include "index/CompressedRelation.h" +#include "index/ConstantsIndexBuilding.h" #include "util/ChunkedForLoop.h" // ____________________________________________________________________________ @@ -250,6 +251,48 @@ void LocatedTriplesPerBlock::setOriginalMetadata( updateAugmentedMetadata(); } +// Update the `blockMetadata`, such that its graph info is consistent with the +// `locatedTriples` which are added to that block. In particular, all graphs to +// which at least one triple is inserted become part of the graph info, and if +// the number of total graphs becomes larger than the configured threshold, then +// the graph info is set to `nullopt`, which means that there is no info. +static auto updateGraphMetadata(CompressedBlockMetadata& blockMetadata, + const LocatedTriples& locatedTriples) { + // We do not know anything about the triples contained in the block, so we + // also cannot know if the `locatedTriples` introduces duplicates. We thus + // have to be conservative and assume that there are duplicates. + blockMetadata.containsDuplicatesWithDifferentGraphs_ = true; + auto& graphs = blockMetadata.graphInfo_; + if (!graphs.has_value()) { + // The original block already contains too many graphs, don't store any + // graph info. + return; + } + + // Compute a hash set of all graphs that are originally contained in the block + // and all the graphs that are added via the `locatedTriples`. + ad_utility::HashSet newGraphs(graphs.value().begin(), + graphs.value().end()); + for (auto& lt : locatedTriples) { + if (!lt.shouldTripleExist_) { + // Don't update the graph info for triples that are deleted. 
+ continue; + } + newGraphs.insert(lt.triple_.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID)); + // Handle the case that with the newly added triples we have too many + // distinct graphs to store them in the graph info. + if (newGraphs.size() > MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA) { + graphs.reset(); + return; + } + } + graphs.emplace(newGraphs.begin(), newGraphs.end()); + + // Sort the stored graphs. Note: this is currently not expected by the code + // that uses the graph info, but makes testing much easier. + std::ranges::sort(graphs.value()); +} + // ____________________________________________________________________________ void LocatedTriplesPerBlock::updateAugmentedMetadata() { // TODO use view::enumerate @@ -265,6 +308,7 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { blockMetadata.lastTriple_ = std::max(blockMetadata.lastTriple_, blockUpdates.rbegin()->triple_.toPermutedTriple()); + updateGraphMetadata(blockMetadata, blockUpdates); } blockIndex++; } @@ -287,7 +331,10 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { lastTriple, std::nullopt, true}; - augmentedMetadata_->emplace_back(lastBlockN, blockIndex); + lastBlockN.graphInfo_.emplace(); + CompressedBlockMetadata lastBlock{lastBlockN, blockIndex}; + updateGraphMetadata(lastBlock, blockUpdates); + augmentedMetadata_->push_back(lastBlock); } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3846361dc..6f8aa447e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -307,7 +307,7 @@ addLinkAndDiscoverTest(AlgorithmTest) addLinkAndDiscoverTestSerial(CompressedRelationsTest index) -addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions index) +addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions sparqlExpressions parser index) addLinkAndDiscoverTestSerial(GetPrefilterExpressionFromSparqlExpressionTest sparqlExpressions index) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index c4364023f..4a46644af 
100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -29,10 +29,12 @@ auto IT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) { auto PT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) { return CompressedBlockMetadata::PermutedTriple{V(c1), V(c2), V(c3), V(graph)}; }; -auto CBM = [](const auto firstTriple, const auto lastTriple) { +auto CBM = [](const auto firstTriple, const auto lastTriple, + CompressedBlockMetadata::GraphInfo graphs = std::nullopt) { size_t dummyBlockIndex = 0; - return CompressedBlockMetadata{{{}, 0, firstTriple, lastTriple, {}, false}, - dummyBlockIndex}; + return CompressedBlockMetadata{ + {{}, 0, firstTriple, lastTriple, std::move(graphs), false}, + dummyBlockIndex}; }; auto numBlocks = @@ -750,10 +752,12 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { Span{T1}, metadata, {0, 1, 2}, false, handle)); expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); // T2 is inside block 1. Borders don't change. + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( Span{T2}, metadata, {0, 1, 2}, true, handle)); @@ -762,6 +766,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { // T3 is equal to PT4, the beginning of block 2. All update (update and // delete) add to the block borders. Borders don't change. 
+ expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true; locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( Span{T3}, metadata, {0, 1, 2}, false, handle)); @@ -774,6 +779,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { Span{T4}, metadata, {0, 1, 2}, true, handle)); expectedAugmentedMetadata[4] = CBM(T4.toPermutedTriple(), PT8); + expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = true; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); @@ -781,6 +787,9 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { locatedTriplesPerBlock.erase(4, handles[0]); expectedAugmentedMetadata[4] = CBM(PT8, PT8); + // The block 4 has no more updates, so we restore the info about the block + // having no duplicates from the original metadata. + expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = false; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); @@ -797,6 +806,124 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { } } +// _____________________________________________________________________________ +TEST_F(LocatedTriplesTest, augmentedMetadataGraphInfo) { + // Create a vector that is automatically converted to a span. + using Span = std::vector>; + + auto PT1 = PT(1, 10, 10); + auto PT2 = PT(2, 10, 10); + auto PT3 = PT(2, 15, 20); + // Two blocks, one without graph info, and one with graph info. + const std::vector metadata = { + CBM(PT1, PT1), CBM(PT2, PT3, std::vector{V(13)})}; + std::vector expectedAugmentedMetadata{metadata}; + + auto T1 = IT( + 1, 10, 10, + 12); // Before block 0 (because `12` is smaller than the default graph) + auto T2 = IT(1, 10, 10, + 99999999); // Becomes the lower bound of block 1, although it + // only differs in the graph info. + auto T3 = IT(2, 12, 10, 17); // Inside block 1, add graph 17. 
+ auto T4 = IT(2, 12, 10, 18); // Inside block 1, add graph 18. + + auto T5 = IT(20, 30, 40, 19); // After the last block. + + ad_utility::SharedCancellationHandle handle = + std::make_shared>(); + + { + LocatedTriplesPerBlock locatedTriplesPerBlock; + locatedTriplesPerBlock.setOriginalMetadata(metadata); + + // Delete the located triples {T1 ... T4} + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + Span{T1, T2, T3, T4}, metadata, {0, 1, 2}, false, handle)); + + // All the blocks have updates, so their value of `containsDuplicates..` is + // set to `true`. + expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple(); + expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; + + // Note: the GraphInfo hasn't changed, because the new triples all were + // deleted. + EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), + testing::ElementsAreArray(expectedAugmentedMetadata)); + } + { + expectedAugmentedMetadata = metadata; + LocatedTriplesPerBlock locatedTriplesPerBlock; + locatedTriplesPerBlock.setOriginalMetadata(metadata); + + // Add the located triples {T1 ... T5} + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + Span{T1, T2, T3, T4, T5}, metadata, {0, 1, 2}, true, handle)); + + expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple(); + expectedAugmentedMetadata[1].graphInfo_.value() = + std::vector{V(13), V(17), V(18), V(99999999)}; + + // We have added a triple `T5` after the last block, so there now is an + // additional block, which also stores the correct graph info. 
+ expectedAugmentedMetadata.push_back( + CBM(T5.toPermutedTriple(), T5.toPermutedTriple(), std::vector{V(19)})); + + // The automatically added metadata for the last block also has the correct + // block index and number of columns, so we have to properly initialize it. + expectedAugmentedMetadata.back().blockIndex_ = 2; + expectedAugmentedMetadata.back().offsetsAndCompressedSize_.resize(4, + {0, 0}); + + // All the blocks have updates, so their value of `containsDuplicates..` is + // set to `true`. + expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true; + + // Note: the GraphInfo of block 1 now also contains the graphs of the + // triples that were inserted into it. + auto actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + EXPECT_THAT(actualMetadata, + testing::ElementsAreArray(expectedAugmentedMetadata)); + + // Test the case that a block loses its graph info if the added located + // triples have too many distinct graphs. + ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value()); + std::vector> triples; + // Note: The `30` is an offset to guarantee that the added graphs are not + // contained in the located triples before. + for (size_t i = 30; i < 30 + 2 * MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA; + ++i) { + auto tr = T3; + tr.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID) = V(i); + triples.push_back(tr); + } + + size_t numGraphsBefore = actualMetadata[1].graphInfo_.value().size(); + size_t numGraphsToMax = + MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA - numGraphsBefore; + + // Add the exact amount of graphs such that we are at the maximum number of + // stored graphs.
+ locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + std::span{triples}.subspan(0, numGraphsToMax), metadata, {0, 1, 2}, + true, handle)); + actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value()); + + // Adding one more graph will exceed the maximum. + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + std::span{triples}.subspan(numGraphsToMax, numGraphsToMax + 1), + metadata, {0, 1, 2}, true, handle)); + actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + ASSERT_FALSE(actualMetadata[1].graphInfo_.has_value()); + } +} + TEST_F(LocatedTriplesTest, debugPrints) { using LT = LocatedTriple; diff --git a/test/PrefilterExpressionIndexTest.cpp b/test/PrefilterExpressionIndexTest.cpp index a941afab6..3ef094957 100644 --- a/test/PrefilterExpressionIndexTest.cpp +++ b/test/PrefilterExpressionIndexTest.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures // Author: Hannes Baumann -#include +#include #include @@ -157,18 +157,32 @@ class PrefilterExpressionOnMetadataTest : public ::testing::Test { //______________________________________________________________________________ TEST_F(PrefilterExpressionOnMetadataTest, testBlockFormatForDebugging) { - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) " - "Triple: I:0 V:10 D:33.000000 V:0\nnum. rows: 0.\n", - (std::stringstream() << b5).str()); - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) " - "Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n", - (std::stringstream() << b11).str()); - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) " - "Triple: V:17 V:10 D:33.000000 V:0\nnum. 
rows: 0.\n", - (std::stringstream() << b21).str()); + auto toString = [](const CompressedBlockMetadata& b) { + return (std::stringstream{} << b).str(); + }; + + auto matcher = [&toString](const std::string& substring) { + return ::testing::ResultOf(toString, ::testing::HasSubstr(substring)); + }; + EXPECT_THAT( + b5, + matcher( + "#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) " + "Triple: I:0 V:10 D:33.000000 V:0\nnum. rows: 0.\n")); + EXPECT_THAT( + b11, + matcher( + "#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) " + "Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n")); + EXPECT_THAT( + b21, + matcher( + "#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) " + "Triple: V:17 V:10 D:33.000000 V:0\nnum. rows: 0.\n")); + + auto blockWithGraphInfo = b21; + blockWithGraphInfo.graphInfo_.emplace({IntId(12), IntId(13)}); + EXPECT_THAT(blockWithGraphInfo, matcher("Graphs: I:12, I:13\n")); } // Test Relational Expressions