From 82ccc5157f68c83532005c309802e54fcc5e1244 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 29 Nov 2024 20:50:13 +0100 Subject: [PATCH] Update graph info in the augmented block metadata (#1642) An update can invalidate a cached query result in the sense that if one would run the query again after the update, the result may be different. This was ignored so far, and is now considered as follows: Each `LocatedTriplesSnapshot` gets its own "index" (starting from zero and then incremented for each new snapshot). That index becomes part of the cache key. That way, a query will make use of a cached result if and only if there was no update between the time of the query and the time when the cached result was computed. --- src/index/CompressedRelation.h | 18 +++- src/index/LocatedTriples.cpp | 49 +++++++++- test/CMakeLists.txt | 2 +- test/LocatedTriplesTest.cpp | 133 +++++++++++++++++++++++++- test/PrefilterExpressionIndexTest.cpp | 40 +++++--- 5 files changed, 220 insertions(+), 22 deletions(-) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 9f7095993e..c152999e10 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -59,6 +59,9 @@ struct CompressedBlockMetadataNoBlockIndex { size_t compressedSize_; bool operator==(const OffsetAndCompressedSize&) const = default; }; + + using GraphInfo = std::optional>; + std::vector offsetsAndCompressedSize_; size_t numRows_; @@ -115,7 +118,14 @@ struct CompressedBlockMetadataNoBlockIndex { const CompressedBlockMetadataNoBlockIndex& blockMetadata) { str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) " << blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_ - << "." 
<< std::endl; + << ".\n"; + if (blockMetadata.graphInfo_.has_value()) { + str << "Graphs: "; + ad_utility::lazyStrJoin(&str, blockMetadata.graphInfo_.value(), ", "); + str << '\n'; + } + str << "[possibly] contains duplicates: " + << blockMetadata.containsDuplicatesWithDifferentGraphs_ << '\n'; return str; } }; @@ -133,9 +143,9 @@ struct CompressedBlockMetadata : CompressedBlockMetadataNoBlockIndex { // Format BlockMetadata contents for debugging. friend std::ostream& operator<<( std::ostream& str, const CompressedBlockMetadata& blockMetadata) { - str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) " - << blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_ - << "." << std::endl; + str << static_cast( + blockMetadata); + str << "block index: " << blockMetadata.blockIndex_ << "\n"; return str; } }; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index b2244f8960..05353324a5 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -10,6 +10,7 @@ #include "absl/strings/str_join.h" #include "index/CompressedRelation.h" +#include "index/ConstantsIndexBuilding.h" #include "util/ChunkedForLoop.h" // ____________________________________________________________________________ @@ -250,6 +251,48 @@ void LocatedTriplesPerBlock::setOriginalMetadata( updateAugmentedMetadata(); } +// Update the `blockMetadata`, such that its graph info is consistent with the +// `locatedTriples` which are added to that block. In particular, all graphs to +// which at least one triple is inserted become part of the graph info, and if +// the number of total graphs becomes larger than the configured threshold, then +// the graph info is set to `nullopt`, which means that there is no info. 
+static auto updateGraphMetadata(CompressedBlockMetadata& blockMetadata, + const LocatedTriples& locatedTriples) { + // We do not know anything about the triples contained in the block, so we + // also cannot know if the `locatedTriples` introduces duplicates. We thus + // have to be conservative and assume that there are duplicates. + blockMetadata.containsDuplicatesWithDifferentGraphs_ = true; + auto& graphs = blockMetadata.graphInfo_; + if (!graphs.has_value()) { + // The original block already contains too many graphs, don't store any + // graph info. + return; + } + + // Compute a hash set of all graphs that are originally contained in the block + // and all the graphs that are added via the `locatedTriples`. + ad_utility::HashSet newGraphs(graphs.value().begin(), + graphs.value().end()); + for (auto& lt : locatedTriples) { + if (!lt.shouldTripleExist_) { + // Don't update the graph info for triples that are deleted. + continue; + } + newGraphs.insert(lt.triple_.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID)); + // Handle the case that with the newly added triples we have too many + // distinct graphs to store them in the graph info. + if (newGraphs.size() > MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA) { + graphs.reset(); + return; + } + } + graphs.emplace(newGraphs.begin(), newGraphs.end()); + + // Sort the stored graphs. Note: this is currently not expected by the code + // that uses the graph info, but makes testing much easier. 
+ std::ranges::sort(graphs.value()); +} + // ____________________________________________________________________________ void LocatedTriplesPerBlock::updateAugmentedMetadata() { // TODO use view::enumerate @@ -265,6 +308,7 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { blockMetadata.lastTriple_ = std::max(blockMetadata.lastTriple_, blockUpdates.rbegin()->triple_.toPermutedTriple()); + updateGraphMetadata(blockMetadata, blockUpdates); } blockIndex++; } @@ -287,7 +331,10 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { lastTriple, std::nullopt, true}; - augmentedMetadata_->emplace_back(lastBlockN, blockIndex); + lastBlockN.graphInfo_.emplace(); + CompressedBlockMetadata lastBlock{lastBlockN, blockIndex}; + updateGraphMetadata(lastBlock, blockUpdates); + augmentedMetadata_->push_back(lastBlock); } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3846361dc5..6f8aa447e7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -307,7 +307,7 @@ addLinkAndDiscoverTest(AlgorithmTest) addLinkAndDiscoverTestSerial(CompressedRelationsTest index) -addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions index) +addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions sparqlExpressions parser index) addLinkAndDiscoverTestSerial(GetPrefilterExpressionFromSparqlExpressionTest sparqlExpressions index) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index c4364023fa..4a46644aff 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -29,10 +29,12 @@ auto IT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) { auto PT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) { return CompressedBlockMetadata::PermutedTriple{V(c1), V(c2), V(c3), V(graph)}; }; -auto CBM = [](const auto firstTriple, const auto lastTriple) { +auto CBM = [](const auto firstTriple, const auto lastTriple, + CompressedBlockMetadata::GraphInfo graphs = 
std::nullopt) { size_t dummyBlockIndex = 0; - return CompressedBlockMetadata{{{}, 0, firstTriple, lastTriple, {}, false}, - dummyBlockIndex}; + return CompressedBlockMetadata{ + {{}, 0, firstTriple, lastTriple, std::move(graphs), false}, + dummyBlockIndex}; }; auto numBlocks = @@ -750,10 +752,12 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { Span{T1}, metadata, {0, 1, 2}, false, handle)); expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); // T2 is inside block 1. Borders don't change. + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( Span{T2}, metadata, {0, 1, 2}, true, handle)); @@ -762,6 +766,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { // T3 is equal to PT4, the beginning of block 2. All update (update and // delete) add to the block borders. Borders don't change. + expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true; locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( Span{T3}, metadata, {0, 1, 2}, false, handle)); @@ -774,6 +779,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { Span{T4}, metadata, {0, 1, 2}, true, handle)); expectedAugmentedMetadata[4] = CBM(T4.toPermutedTriple(), PT8); + expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = true; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); @@ -781,6 +787,9 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { locatedTriplesPerBlock.erase(4, handles[0]); expectedAugmentedMetadata[4] = CBM(PT8, PT8); + // The block 4 has no more updates, so we restore the info about the block + // having no duplicates from the original metadata. 
+ expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = false; EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), testing::ElementsAreArray(expectedAugmentedMetadata)); @@ -797,6 +806,124 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) { } } +// _____________________________________________________________________________ +TEST_F(LocatedTriplesTest, augmentedMetadataGraphInfo) { + // Create a vector that is automatically converted to a span. + using Span = std::vector>; + + auto PT1 = PT(1, 10, 10); + auto PT2 = PT(2, 10, 10); + auto PT3 = PT(2, 15, 20); + // Two blocks, one without graph info, and one with graph info. + const std::vector metadata = { + CBM(PT1, PT1), CBM(PT2, PT3, std::vector{V(13)})}; + std::vector expectedAugmentedMetadata{metadata}; + + auto T1 = IT( + 1, 10, 10, + 12); // Before block 0 (because `12` is smaller than the default graph) + auto T2 = IT(1, 10, 10, + 99999999); // Becomes the lower bound of block 1, although it + // only differs in the graph info. + auto T3 = IT(2, 12, 10, 17); // Inside block 1, add graph 17. + auto T4 = IT(2, 12, 10, 18); // Inside block 1, add graph 18. + + auto T5 = IT(20, 30, 40, 19); // After the last block. + + ad_utility::SharedCancellationHandle handle = + std::make_shared>(); + + { + LocatedTriplesPerBlock locatedTriplesPerBlock; + locatedTriplesPerBlock.setOriginalMetadata(metadata); + + // Delete the located triples {T1 ... T4} + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + Span{T1, T2, T3, T4}, metadata, {0, 1, 2}, false, handle)); + + // All the blocks have updates, so their value of `containsDuplicates..` is + // set to `true`. 
+ expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple(); + expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; + + // Note: the GraphInfo hasn't changed, because the new triples all were + // deleted. + EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(), + testing::ElementsAreArray(expectedAugmentedMetadata)); + } + { + expectedAugmentedMetadata = metadata; + LocatedTriplesPerBlock locatedTriplesPerBlock; + locatedTriplesPerBlock.setOriginalMetadata(metadata); + + // Add the located triples {T1 ... T5} + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + Span{T1, T2, T3, T4, T5}, metadata, {0, 1, 2}, true, handle)); + + expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1); + expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple(); + expectedAugmentedMetadata[1].graphInfo_.value() = + std::vector{V(13), V(17), V(18), V(99999999)}; + + // We have added a triple `T5` after the last block, so there now is an + // additional block, which also stores the correct graph info. + expectedAugmentedMetadata.push_back( + CBM(T5.toPermutedTriple(), T5.toPermutedTriple(), std::vector{V(19)})); + + // The automatically added metadata for the last block also has the correct + // block index and number of columns, so we have to properly initialize it. + expectedAugmentedMetadata.back().blockIndex_ = 2; + expectedAugmentedMetadata.back().offsetsAndCompressedSize_.resize(4, + {0, 0}); + + // All the blocks have updates, so their value of `containsDuplicates..` is + // set to `true`. 
+ expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true; + expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true; + + // Note: the GraphInfo of block 1 now also contains the graphs of the + // inserted triples `T2`, `T3`, and `T4`. + auto actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + EXPECT_THAT(actualMetadata, + testing::ElementsAreArray(expectedAugmentedMetadata)); + + // Test the case that a block loses its graph info if the added located + // triples have too many distinct graphs. + ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value()); + std::vector> triples; + // Note: The `30` is an offset to guarantee that the added graphs are not + // contained in the located triples before. + for (size_t i = 30; i < 30 + 2 * MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA; + ++i) { + auto tr = T3; + tr.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID) = V(i); + triples.push_back(tr); + } + + size_t numGraphsBefore = actualMetadata[1].graphInfo_.value().size(); + size_t numGraphsToMax = + MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA - numGraphsBefore; + + // Add the exact amount of graphs such that we are at the maximum number of + // stored graphs. + locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + std::span{triples}.subspan(0, numGraphsToMax), metadata, {0, 1, 2}, + true, handle)); + actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value()); + + // Adding one more graph will exceed the maximum.
+ locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation( + std::span{triples}.subspan(numGraphsToMax, numGraphsToMax + 1), + metadata, {0, 1, 2}, true, handle)); + actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata(); + ASSERT_FALSE(actualMetadata[1].graphInfo_.has_value()); + } +} + TEST_F(LocatedTriplesTest, debugPrints) { using LT = LocatedTriple; diff --git a/test/PrefilterExpressionIndexTest.cpp b/test/PrefilterExpressionIndexTest.cpp index a941afab6a..3ef0949575 100644 --- a/test/PrefilterExpressionIndexTest.cpp +++ b/test/PrefilterExpressionIndexTest.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures // Author: Hannes Baumann -#include +#include #include @@ -157,18 +157,32 @@ class PrefilterExpressionOnMetadataTest : public ::testing::Test { //______________________________________________________________________________ TEST_F(PrefilterExpressionOnMetadataTest, testBlockFormatForDebugging) { - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) " - "Triple: I:0 V:10 D:33.000000 V:0\nnum. rows: 0.\n", - (std::stringstream() << b5).str()); - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) " - "Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n", - (std::stringstream() << b11).str()); - EXPECT_EQ( - "#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) " - "Triple: V:17 V:10 D:33.000000 V:0\nnum. rows: 0.\n", - (std::stringstream() << b21).str()); + auto toString = [](const CompressedBlockMetadata& b) { + return (std::stringstream{} << b).str(); + }; + + auto matcher = [&toString](const std::string& substring) { + return ::testing::ResultOf(toString, ::testing::HasSubstr(substring)); + }; + EXPECT_THAT( + b5, + matcher( + "#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) " + "Triple: I:0 V:10 D:33.000000 V:0\nnum. 
rows: 0.\n")); + EXPECT_THAT( + b11, + matcher( + "#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) " + "Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n")); + EXPECT_THAT( + b21, + matcher( + "#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) " + "Triple: V:17 V:10 D:33.000000 V:0\nnum. rows: 0.\n")); + + auto blockWithGraphInfo = b21; + blockWithGraphInfo.graphInfo_.emplace({IntId(12), IntId(13)}); + EXPECT_THAT(blockWithGraphInfo, matcher("Graphs: I:12, I:13\n")); } // Test Relational Expressions