Skip to content

Commit

Permalink
Update graph info in the augmented block metadata (#1642)
Browse files Browse the repository at this point in the history
An update can invalidate a cached query result in the sense that if the query were run again after the update, the result might be different. This was ignored so far, and is now handled as follows: Each `LocatedTriplesSnapshot` gets its own "index" (starting from zero and then incremented for each new snapshot). That index becomes part of the cache key. That way, a query will make use of a cached result if and only if there was no update between the time of the query and the time when the cached result was computed.
  • Loading branch information
joka921 authored Nov 29, 2024
1 parent f86080c commit 82ccc51
Show file tree
Hide file tree
Showing 5 changed files with 220 additions and 22 deletions.
18 changes: 14 additions & 4 deletions src/index/CompressedRelation.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ struct CompressedBlockMetadataNoBlockIndex {
size_t compressedSize_;
bool operator==(const OffsetAndCompressedSize&) const = default;
};

using GraphInfo = std::optional<std::vector<Id>>;

std::vector<OffsetAndCompressedSize> offsetsAndCompressedSize_;
size_t numRows_;

Expand Down Expand Up @@ -115,7 +118,14 @@ struct CompressedBlockMetadataNoBlockIndex {
const CompressedBlockMetadataNoBlockIndex& blockMetadata) {
str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) "
<< blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_
<< "." << std::endl;
<< ".\n";
if (blockMetadata.graphInfo_.has_value()) {
str << "Graphs: ";
ad_utility::lazyStrJoin(&str, blockMetadata.graphInfo_.value(), ", ");
str << '\n';
}
str << "[possibly] contains duplicates: "
<< blockMetadata.containsDuplicatesWithDifferentGraphs_ << '\n';
return str;
}
};
Expand All @@ -133,9 +143,9 @@ struct CompressedBlockMetadata : CompressedBlockMetadataNoBlockIndex {
// Format BlockMetadata contents for debugging.
friend std::ostream& operator<<(
std::ostream& str, const CompressedBlockMetadata& blockMetadata) {
str << "#BlockMetadata\n(first) " << blockMetadata.firstTriple_ << "(last) "
<< blockMetadata.lastTriple_ << "num. rows: " << blockMetadata.numRows_
<< "." << std::endl;
str << static_cast<const CompressedBlockMetadataNoBlockIndex&>(
blockMetadata);
str << "block index: " << blockMetadata.blockIndex_ << "\n";
return str;
}
};
Expand Down
49 changes: 48 additions & 1 deletion src/index/LocatedTriples.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include "absl/strings/str_join.h"
#include "index/CompressedRelation.h"
#include "index/ConstantsIndexBuilding.h"
#include "util/ChunkedForLoop.h"

// ____________________________________________________________________________
Expand Down Expand Up @@ -250,6 +251,48 @@ void LocatedTriplesPerBlock::setOriginalMetadata(
updateAugmentedMetadata();
}

// Update the `blockMetadata`, such that its graph info is consistent with the
// `locatedTriples` which are added to that block. In particular, all graphs to
// which at least one triple is inserted become part of the graph info, and if
// the number of total graphs becomes larger than the configured threshold, then
// the graph info is set to `nullopt`, which means that there is no info.
static auto updateGraphMetadata(CompressedBlockMetadata& blockMetadata,
const LocatedTriples& locatedTriples) {
// We do not know anything about the triples contained in the block, so we
// also cannot know if the `locatedTriples` introduces duplicates. We thus
// have to be conservative and assume that there are duplicates.
blockMetadata.containsDuplicatesWithDifferentGraphs_ = true;
auto& graphs = blockMetadata.graphInfo_;
if (!graphs.has_value()) {
// The original block already contains too many graphs, don't store any
// graph info.
return;
}

// Compute a hash set of all graphs that are originally contained in the block
// and all the graphs that are added via the `locatedTriples`.
ad_utility::HashSet<Id> newGraphs(graphs.value().begin(),
graphs.value().end());
for (auto& lt : locatedTriples) {
if (!lt.shouldTripleExist_) {
// Don't update the graph info for triples that are deleted.
continue;
}
newGraphs.insert(lt.triple_.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID));
// Handle the case that with the newly added triples we have too many
// distinct graphs to store them in the graph info.
if (newGraphs.size() > MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA) {
graphs.reset();
return;
}
}
graphs.emplace(newGraphs.begin(), newGraphs.end());

// Sort the stored graphs. Note: this is currently not expected by the code
// that uses the graph info, but makes testing much easier.
std::ranges::sort(graphs.value());
}

// ____________________________________________________________________________
void LocatedTriplesPerBlock::updateAugmentedMetadata() {
// TODO<C++23> use view::enumerate
Expand All @@ -265,6 +308,7 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() {
blockMetadata.lastTriple_ =
std::max(blockMetadata.lastTriple_,
blockUpdates.rbegin()->triple_.toPermutedTriple());
updateGraphMetadata(blockMetadata, blockUpdates);
}
blockIndex++;
}
Expand All @@ -287,7 +331,10 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() {
lastTriple,
std::nullopt,
true};
augmentedMetadata_->emplace_back(lastBlockN, blockIndex);
lastBlockN.graphInfo_.emplace();
CompressedBlockMetadata lastBlock{lastBlockN, blockIndex};
updateGraphMetadata(lastBlock, blockUpdates);
augmentedMetadata_->push_back(lastBlock);
}
}

Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ addLinkAndDiscoverTest(AlgorithmTest)

addLinkAndDiscoverTestSerial(CompressedRelationsTest index)

addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions index)
addLinkAndDiscoverTestSerial(PrefilterExpressionIndexTest sparqlExpressions sparqlExpressions parser index)

addLinkAndDiscoverTestSerial(GetPrefilterExpressionFromSparqlExpressionTest sparqlExpressions index)

Expand Down
133 changes: 130 additions & 3 deletions test/LocatedTriplesTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@ auto IT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) {
// Test helper: build a `CompressedBlockMetadata::PermutedTriple` from three
// column values and a graph, each of which is passed through `V`. If no graph
// is given, `g` is used. (`V` and `g` are defined elsewhere.)
auto PT = [](const auto& c1, const auto& c2, const auto& c3, int graph = g) {
return CompressedBlockMetadata::PermutedTriple{V(c1), V(c2), V(c3), V(graph)};
};
auto CBM = [](const auto firstTriple, const auto lastTriple) {
auto CBM = [](const auto firstTriple, const auto lastTriple,
CompressedBlockMetadata::GraphInfo graphs = std::nullopt) {
size_t dummyBlockIndex = 0;
return CompressedBlockMetadata{{{}, 0, firstTriple, lastTriple, {}, false},
dummyBlockIndex};
return CompressedBlockMetadata{
{{}, 0, firstTriple, lastTriple, std::move(graphs), false},
dummyBlockIndex};
};

auto numBlocks =
Expand Down Expand Up @@ -750,10 +752,12 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) {
Span{T1}, metadata, {0, 1, 2}, false, handle));

expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1);
expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true;
EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(),
testing::ElementsAreArray(expectedAugmentedMetadata));

// T2 is inside block 1. Borders don't change.
expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true;
locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
Span{T2}, metadata, {0, 1, 2}, true, handle));

Expand All @@ -762,6 +766,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) {

// T3 is equal to PT4, the beginning of block 2. All update (update and
// delete) add to the block borders. Borders don't change.
expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true;
locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
Span{T3}, metadata, {0, 1, 2}, false, handle));

Expand All @@ -774,13 +779,17 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) {
Span{T4}, metadata, {0, 1, 2}, true, handle));

expectedAugmentedMetadata[4] = CBM(T4.toPermutedTriple(), PT8);
expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = true;
EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(),
testing::ElementsAreArray(expectedAugmentedMetadata));

// Erasing the update of T4 restores the beginning of block 4.
locatedTriplesPerBlock.erase(4, handles[0]);

expectedAugmentedMetadata[4] = CBM(PT8, PT8);
// The block 4 has no more updates, so we restore the info about the block
// having no duplicates from the original metadata.
expectedAugmentedMetadata[4].containsDuplicatesWithDifferentGraphs_ = false;
EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(),
testing::ElementsAreArray(expectedAugmentedMetadata));

Expand All @@ -797,6 +806,124 @@ TEST_F(LocatedTriplesTest, augmentedMetadata) {
}
}

// _____________________________________________________________________________
// Test that the graph info and the `containsDuplicatesWithDifferentGraphs_`
// flag of the augmented block metadata are updated consistently with the
// located triples. Covered scenarios: deletions only (graph info unchanged),
// insertions (graphs are added, including a new trailing block), and the
// exact threshold at which a block loses its graph info.
TEST_F(LocatedTriplesTest, augmentedMetadataGraphInfo) {
  // Create a vector that is automatically converted to a span.
  using Span = std::vector<IdTriple<0>>;

  auto PT1 = PT(1, 10, 10);
  auto PT2 = PT(2, 10, 10);
  auto PT3 = PT(2, 15, 20);
  // Two blocks, one without graph info, and one with graph info.
  const std::vector<CompressedBlockMetadata> metadata = {
      CBM(PT1, PT1), CBM(PT2, PT3, std::vector<Id>{V(13)})};
  std::vector<CompressedBlockMetadata> expectedAugmentedMetadata{metadata};

  auto T1 = IT(
      1, 10, 10,
      12);  // Before block 0 (because `12` is smaller than the default graph)
  auto T2 = IT(1, 10, 10,
               99999999);  // Becomes the lower bound of block 1, although it
                           // only differs in the graph info.
  auto T3 = IT(2, 12, 10, 17);  // Inside block 1, add graph 17.
  auto T4 = IT(2, 12, 10, 18);  // Inside block 1, add graph 18.

  auto T5 = IT(20, 30, 40, 19);  // After the last block.

  ad_utility::SharedCancellationHandle handle =
      std::make_shared<ad_utility::CancellationHandle<>>();

  {
    LocatedTriplesPerBlock locatedTriplesPerBlock;
    locatedTriplesPerBlock.setOriginalMetadata(metadata);

    // Delete the located triples {T1 ... T4}
    locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
        Span{T1, T2, T3, T4}, metadata, {0, 1, 2}, false, handle));

    // All the blocks have updates, so their value of `containsDuplicates..` is
    // set to `true`.
    expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1);
    expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple();
    expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true;
    expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true;

    // Note: the GraphInfo hasn't changed, because the new triples all were
    // deleted.
    EXPECT_THAT(locatedTriplesPerBlock.getAugmentedMetadata(),
                testing::ElementsAreArray(expectedAugmentedMetadata));
  }
  {
    expectedAugmentedMetadata = metadata;
    LocatedTriplesPerBlock locatedTriplesPerBlock;
    locatedTriplesPerBlock.setOriginalMetadata(metadata);

    // Add the located triples {T1 ... T5}
    locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
        Span{T1, T2, T3, T4, T5}, metadata, {0, 1, 2}, true, handle));

    // Block 0 has no graph info in the original metadata, so it still has
    // none. Block 1 additionally lists the graphs of the inserted triples
    // `T2`, `T3`, and `T4`.
    expectedAugmentedMetadata[0] = CBM(T1.toPermutedTriple(), PT1);
    expectedAugmentedMetadata[1].firstTriple_ = T2.toPermutedTriple();
    expectedAugmentedMetadata[1].graphInfo_.value() =
        std::vector{V(13), V(17), V(18), V(99999999)};

    // We have added a triple `T5` after the last block, so there now is an
    // additional block, which also stores the correct graph info.
    expectedAugmentedMetadata.push_back(
        CBM(T5.toPermutedTriple(), T5.toPermutedTriple(), std::vector{V(19)}));

    // The automatically added metadata for the last block also has the correct
    // block index and number of columns, so we have to properly initialize it.
    expectedAugmentedMetadata.back().blockIndex_ = 2;
    expectedAugmentedMetadata.back().offsetsAndCompressedSize_.resize(4,
                                                                      {0, 0});

    // All the blocks have updates, so their value of `containsDuplicates..` is
    // set to `true`.
    expectedAugmentedMetadata[0].containsDuplicatesWithDifferentGraphs_ = true;
    expectedAugmentedMetadata[1].containsDuplicatesWithDifferentGraphs_ = true;
    expectedAugmentedMetadata[2].containsDuplicatesWithDifferentGraphs_ = true;

    auto actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata();
    EXPECT_THAT(actualMetadata,
                testing::ElementsAreArray(expectedAugmentedMetadata));

    // Test the case that a block loses its graph info if the added located
    // triples have too many distinct graphs.
    ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value());
    std::vector<IdTriple<0>> triples;
    // Note: The `30` is an offset to guarantee that the added graphs are not
    // contained in the located triples before.
    for (size_t i = 30; i < 30 + 2 * MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA;
         ++i) {
      auto tr = T3;
      tr.ids_.at(ADDITIONAL_COLUMN_GRAPH_ID) = V(i);
      triples.push_back(tr);
    }

    size_t numGraphsBefore = actualMetadata[1].graphInfo_.value().size();
    size_t numGraphsToMax =
        MAX_NUM_GRAPHS_STORED_IN_BLOCK_METADATA - numGraphsBefore;

    // Add the exact amount of graphs such that we are at the maximum number of
    // stored graphs.
    locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
        std::span{triples}.subspan(0, numGraphsToMax), metadata, {0, 1, 2},
        true, handle));
    actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata();
    ASSERT_TRUE(actualMetadata[1].graphInfo_.has_value());

    // Adding one more graph will exceed the maximum. Note: The second argument
    // of `subspan` is a *count* (not an end index), so we pass `1` to add
    // exactly one additional graph and thereby test the exact threshold.
    locatedTriplesPerBlock.add(LocatedTriple::locateTriplesInPermutation(
        std::span{triples}.subspan(numGraphsToMax, 1), metadata, {0, 1, 2},
        true, handle));
    actualMetadata = locatedTriplesPerBlock.getAugmentedMetadata();
    ASSERT_FALSE(actualMetadata[1].graphInfo_.has_value());
  }
}

TEST_F(LocatedTriplesTest, debugPrints) {
using LT = LocatedTriple;

Expand Down
40 changes: 27 additions & 13 deletions test/PrefilterExpressionIndexTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Chair of Algorithms and Data Structures
// Author: Hannes Baumann <[email protected]>

#include <gtest/gtest.h>
#include <gmock/gmock.h>

#include <vector>

Expand Down Expand Up @@ -157,18 +157,32 @@ class PrefilterExpressionOnMetadataTest : public ::testing::Test {

//______________________________________________________________________________
TEST_F(PrefilterExpressionOnMetadataTest, testBlockFormatForDebugging) {
EXPECT_EQ(
"#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) "
"Triple: I:0 V:10 D:33.000000 V:0\nnum. rows: 0.\n",
(std::stringstream() << b5).str());
EXPECT_EQ(
"#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) "
"Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n",
(std::stringstream() << b11).str());
EXPECT_EQ(
"#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) "
"Triple: V:17 V:10 D:33.000000 V:0\nnum. rows: 0.\n",
(std::stringstream() << b21).str());
auto toString = [](const CompressedBlockMetadata& b) {
return (std::stringstream{} << b).str();
};

auto matcher = [&toString](const std::string& substring) {
return ::testing::ResultOf(toString, ::testing::HasSubstr(substring));
};
EXPECT_THAT(
b5,
matcher(
"#BlockMetadata\n(first) Triple: I:0 V:10 D:33.000000 V:0\n(last) "
"Triple: I:0 V:10 D:33.000000 V:0\nnum. rows: 0.\n"));
EXPECT_THAT(
b11,
matcher(
"#BlockMetadata\n(first) Triple: I:-4 V:10 D:33.000000 V:0\n(last) "
"Triple: D:2.000000 V:10 D:33.000000 V:0\nnum. rows: 0.\n"));
EXPECT_THAT(
b21,
matcher(
"#BlockMetadata\n(first) Triple: V:14 V:10 D:33.000000 V:0\n(last) "
"Triple: V:17 V:10 D:33.000000 V:0\nnum. rows: 0.\n"));

auto blockWithGraphInfo = b21;
blockWithGraphInfo.graphInfo_.emplace({IntId(12), IntId(13)});
EXPECT_THAT(blockWithGraphInfo, matcher("Graphs: I:12, I:13\n"));
}

// Test Relational Expressions
Expand Down

0 comments on commit 82ccc51

Please sign in to comment.