Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spatial join bounding box #1546

Merged
merged 25 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e9ed3e4
add BoundingBox Algorithm
Jonathan24680 Oct 18, 2024
ada2978
Sonarqube issues
Jonathan24680 Nov 3, 2024
eeadcd7
Sonarqube issues
Jonathan24680 Nov 4, 2024
127ba53
dont use cpp 20 version constant of pi
Jonathan24680 Nov 4, 2024
dcb60dd
Merge branch 'master' into SpatialJoinBoundingBox
Jonathan24680 Nov 4, 2024
be09a86
part of first rework
Jonathan24680 Nov 8, 2024
c4ddec6
rework of some parts of the tests
Jonathan24680 Nov 9, 2024
122372b
mr feedback
Jonathan24680 Nov 18, 2024
12f3c13
Merge branch 'master' into SpatialJoinBoundingBox
Jonathan24680 Nov 18, 2024
213b02e
remove second definition of function
Jonathan24680 Nov 19, 2024
ecda249
add new macros AD_LOG vs LOG
joka921 Nov 19, 2024
97d039b
Merge branch 'AD_LOG' of https://github.com/joka921/QLever into AD_LOG
Jonathan24680 Nov 19, 2024
603ec46
Merge branch 'AD_LOG' into SpatialJoinBoundingBox
Jonathan24680 Nov 19, 2024
3831da8
PR feedback
Jonathan24680 Nov 20, 2024
9ffaa0a
Apply the suggestion by Hannah.
joka921 Nov 20, 2024
032eade
Clang-format.
joka921 Nov 20, 2024
d50e2cc
Merge branch 'AD_LOG' of https://github.com/joka921/QLever into AD_LOG
Jonathan24680 Nov 20, 2024
24e31c6
Merge branch 'AD_LOG' into SpatialJoinBoundingBox
Jonathan24680 Nov 20, 2024
8285355
merge log change and fix allocator
Jonathan24680 Nov 20, 2024
56e1618
PR feedback
Jonathan24680 Nov 20, 2024
7d8b98b
pr feedback
Jonathan24680 Nov 21, 2024
89491a1
PR feedback
Jonathan24680 Nov 21, 2024
8743986
merge master into SpatialJoinBoundingBox
Jonathan24680 Nov 22, 2024
23f2ed2
PR feedback
Jonathan24680 Nov 22, 2024
ce5db45
MR feedback
Jonathan24680 Nov 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/engine/SpatialJoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ size_t SpatialJoin::getCostEstimate() {
if (childLeft_ && childRight_) {
size_t inputEstimate =
childLeft_->getSizeEstimate() * childRight_->getSizeEstimate();
if (useBaselineAlgorithm_) {
if (algorithm_ == Algorithm::Baseline) {
return inputEstimate * inputEstimate;
} else {
// Let n be the size of the left table and m the size of the right table.
Expand Down Expand Up @@ -336,11 +336,21 @@ PreparedSpatialJoinParams SpatialJoin::prepareJoin() const {
// ____________________________________________________________________________
Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) {
SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(),
addDistToResult_, config_};
if (useBaselineAlgorithm_) {
addDistToResult_, config_, this};
if (algorithm_ == Algorithm::Baseline) {
return algorithms.BaselineAlgorithm();
} else {
} else if (algorithm_ == Algorithm::S2Geometry) {
return algorithms.S2geometryAlgorithm();
} else {
AD_CORRECTNESS_CHECK(algorithm_ == Algorithm::BoundingBox);
// as the BoundingBoxAlgorithms only works for max distance and not for
// nearest neighbors, S2geometry gets called as a backup, if the query is
// asking for the nearest neighbors
if (std::get_if<MaxDistanceConfig>(&config_)) {
return algorithms.BoundingBoxAlgorithm();
} else {
return algorithms.S2geometryAlgorithm();
}
}
}

Expand Down
14 changes: 10 additions & 4 deletions src/engine/SpatialJoin.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,20 @@ class SpatialJoin : public Operation {
// purposes
std::optional<size_t> getMaxResults() const;

void selectAlgorithm(bool useBaselineAlgorithm) {
useBaselineAlgorithm_ = useBaselineAlgorithm;
}
// options which can be used for the algorithm, which calculates the result
enum class Algorithm { Baseline, S2Geometry, BoundingBox };

void selectAlgorithm(Algorithm algorithm) { algorithm_ = algorithm; }

std::pair<size_t, size_t> onlyForTestingGetConfig() const {
return std::pair{getMaxDist().value_or(-1), getMaxResults().value_or(-1)};
}

std::variant<NearestNeighborsConfig, MaxDistanceConfig>
onlyForTestingGetActualConfig() const {
return config_;
}

std::shared_ptr<QueryExecutionTree> onlyForTestingGetLeftChild() const {
return childLeft_;
}
Expand Down Expand Up @@ -135,5 +141,5 @@ class SpatialJoin : public Operation {
// between the two objects
bool addDistToResult_ = true;
const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally";
bool useBaselineAlgorithm_ = false;
Algorithm algorithm_ = Algorithm::S2Geometry;
};
305 changes: 303 additions & 2 deletions src/engine/SpatialJoinAlgorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,23 @@
#include <s2/s2point_index.h>
#include <s2/util/units/length-units.h>

#include <cmath>

#include "util/GeoSparqlHelpers.h"

using namespace BoostGeometryNamespace;

// ____________________________________________________________________________
SpatialJoinAlgorithms::SpatialJoinAlgorithms(
QueryExecutionContext* qec, PreparedSpatialJoinParams params,
bool addDistToResult,
std::variant<NearestNeighborsConfig, MaxDistanceConfig> config)
std::variant<NearestNeighborsConfig, MaxDistanceConfig> config,
std::optional<SpatialJoin*> spatialJoin)
: qec_{qec},
params_{std::move(params)},
addDistToResult_{addDistToResult},
config_{std::move(config)} {}
config_{std::move(config)},
spatialJoin_{spatialJoin} {}

// ____________________________________________________________________________
std::optional<GeoPoint> SpatialJoinAlgorithms::getPoint(const IdTable* restable,
Expand Down Expand Up @@ -223,3 +229,298 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() {
return Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
}

// ____________________________________________________________________________
std::vector<Box> SpatialJoinAlgorithms::computeBoundingBox(
const Point& startPoint) const {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns, maxDist, maxResults] = params_;
AD_CORRECTNESS_CHECK(maxDist.has_value(),
"Max distance must have a value for this operation");
// haversine function
auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; };

// inverse haversine function
auto archaversine = [](double theta) { return std::acos(1 - 2 * theta); };

// safety buffer for numerical inaccuracies
double maxDistInMetersBuffer;
if (maxDist.value() < 10) {
maxDistInMetersBuffer = 10;
} else if (static_cast<double>(maxDist.value()) <
static_cast<double>(std::numeric_limits<long long>::max()) /
1.02) {
maxDistInMetersBuffer = 1.01 * static_cast<double>(maxDist.value());
} else {
maxDistInMetersBuffer =
static_cast<double>(std::numeric_limits<long long>::max());
}

// for large distances, where the lower calculation would just result in
// a single bounding box for the whole planet, do an optimized version
if (static_cast<double>(maxDist.value()) > circumferenceMax_ / 4.0 &&
static_cast<double>(maxDist.value()) < circumferenceMax_ / 2.01) {
return computeBoundingBoxForLargeDistances(startPoint);
}

// compute latitude bound
double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_);
double upperLatBound = startPoint.get<1>() + maxDistInDegrees;
double lowerLatBound = startPoint.get<1>() - maxDistInDegrees;

auto southPoleReached = isAPoleTouched(lowerLatBound).at(1);
auto northPoleReached = isAPoleTouched(upperLatBound).at(0);

if (southPoleReached || northPoleReached) {
return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))};
}

// compute longitude bound. For an explanation of the calculation and the
// naming convention see my master thesis
double alpha = maxDistInMetersBuffer / radius_;
double gamma = (90 - std::abs(startPoint.get<1>())) * (2 * M_PI / 360);
joka921 marked this conversation as resolved.
Show resolved Hide resolved
double beta = std::acos(std::cos(gamma) / std::cos(alpha));
double delta = 0;
if (maxDistInMetersBuffer > circumferenceMax_ / 20) {
// use law of cosines
delta = std::acos((std::cos(alpha) - std::cos(gamma) * std::cos(beta)) /
(std::sin(gamma) * std::sin(beta)));
} else {
// use law of haversines for numerical stability
delta = archaversine((haversine(alpha - haversine(gamma - beta))) /
(std::sin(gamma) * std::sin(beta)));
}
Jonathan24680 marked this conversation as resolved.
Show resolved Hide resolved
double lonRange = delta * 360 / (2 * M_PI);
double leftLonBound = startPoint.get<0>() - lonRange;
double rightLonBound = startPoint.get<0>() + lonRange;
// test for "overflows" and create two bounding boxes if necessary
if (leftLonBound < -180) {
auto box1 =
Box(Point(-180, lowerLatBound), Point(rightLonBound, upperLatBound));
auto box2 = Box(Point(leftLonBound + 360, lowerLatBound),
Point(180, upperLatBound));
return {box1, box2};
} else if (rightLonBound > 180) {
auto box1 =
Box(Point(leftLonBound, lowerLatBound), Point(180, upperLatBound));
auto box2 = Box(Point(-180, lowerLatBound),
Point(rightLonBound - 360, upperLatBound));
return {box1, box2};
}
// default case, when no bound has an "overflow"
return {Box(Point(leftLonBound, lowerLatBound),
Point(rightLonBound, upperLatBound))};
}

// ____________________________________________________________________________
std::vector<Box> SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances(
const Point& startPoint) const {
const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns, maxDist, maxResults] = params_;
AD_CORRECTNESS_CHECK(maxDist.has_value(),
"Max distance must have a value for this operation");

// point on the opposite side of the globe
Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1);
if (antiPoint.get<0>() > 180) {
antiPoint.set<0>(antiPoint.get<0>() - 360);
}
// for an explanation of the formula see the master thesis. Divide by two two
// only consider the distance from the point to the antiPoint, subtract
// maxDist and a safety margine from that
double antiDist =
(circumferenceMin_ / 2.0) - static_cast<double>(maxDist.value()) * 1.01;
// use the bigger circumference as an additional safety margin, use 2.01
// instead of 2.0 because of rounding inaccuracies in floating point
// operations
double distToAntiPoint = (360 / circumferenceMax_) * (antiDist / 2.01);
double upperBound = antiPoint.get<1>() + distToAntiPoint;
double lowerBound = antiPoint.get<1>() - distToAntiPoint;
double leftBound = antiPoint.get<0>() - distToAntiPoint;
double rightBound = antiPoint.get<0>() + distToAntiPoint;
bool northPoleTouched = false;
bool southPoleTouched = false;
bool boxCrosses180Longitude = false; // if the 180 to -180 line is touched
// if a pole is crossed, ignore the part after the crossing
if (upperBound > 90) {
upperBound = 90;
northPoleTouched = true;
}
if (lowerBound < -90) {
lowerBound = -90;
southPoleTouched = true;
}
if (leftBound < -180) {
leftBound += 360;
}
if (rightBound > 180) {
rightBound -= 360;
}
if (rightBound < leftBound) {
boxCrosses180Longitude = true;
}
// compute bounding boxes using the anti bounding box from above
std::vector<Box> boxes;
if (!northPoleTouched) {
// add upper bounding box(es)
if (boxCrosses180Longitude) {
boxes.emplace_back(Point(leftBound, upperBound), Point(180, 90));
boxes.emplace_back(Point(-180, upperBound), Point(rightBound, 90));
} else {
boxes.emplace_back(Point(leftBound, upperBound), Point(rightBound, 90));
}
}
if (!southPoleTouched) {
// add lower bounding box(es)
if (boxCrosses180Longitude) {
boxes.emplace_back(Point(leftBound, -90), Point(180, lowerBound));
boxes.emplace_back(Point(-180, -90), Point(rightBound, lowerBound));
} else {
boxes.emplace_back(Point(leftBound, -90), Point(rightBound, lowerBound));
}
}
// add the box(es) inbetween the longitude lines
if (boxCrosses180Longitude) {
// only one box needed to cover the longitudes
boxes.emplace_back(Point(rightBound, -90), Point(leftBound, 90));
} else {
// two boxes needed, one left and one right of the anti bounding box
boxes.emplace_back(Point(-180, -90), Point(leftBound, 90));
boxes.emplace_back(Point(rightBound, -90), Point(180, 90));
}
return boxes;
}

// ____________________________________________________________________________
bool SpatialJoinAlgorithms::isContainedInBoundingBoxes(
const std::vector<Box>& boundingBox, Point point) const {
convertToNormalCoordinates(point);

return std::ranges::any_of(boundingBox, [point](const Box& aBox) {
return boost::geometry::covered_by(point, aBox);
});
}

// ____________________________________________________________________________
void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) const {
// correct lon and lat bounds if necessary
while (point.get<0>() < -180) {
point.set<0>(point.get<0>() + 360);
}
while (point.get<0>() > 180) {
point.set<0>(point.get<0>() - 360);
}
if (point.get<1>() < -90) {
point.set<1>(-90);
} else if (point.get<1>() > 90) {
point.set<1>(90);
}
}

// ____________________________________________________________________________
std::array<bool, 2> SpatialJoinAlgorithms::isAPoleTouched(
Jonathan24680 marked this conversation as resolved.
Show resolved Hide resolved
const double& latitude) const {
bool northPoleReached = false;
bool southPoleReached = false;
if (latitude >= 90) {
northPoleReached = true;
}
if (latitude <= -90) {
southPoleReached = true;
}
return std::array{northPoleReached, southPoleReached};
}

// ____________________________________________________________________________
Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() {
auto printWarning = [alreadyWarned = false,
&spatialJoin = spatialJoin_]() mutable {
if (!alreadyWarned) {
std::string warning =
"The input to a spatial join contained at least one element, "
"that is not a point geometry and is thus skipped. Note that "
"QLever currently only accepts point geometries for the "
"spatial joins";
AD_LOG_WARN << warning << std::endl;
alreadyWarned = true;
if (spatialJoin.has_value()) {
AD_CORRECTNESS_CHECK(spatialJoin.value() != nullptr);
spatialJoin.value()->addWarning(warning);
}
}
};
Jonathan24680 marked this conversation as resolved.
Show resolved Hide resolved
Jonathan24680 marked this conversation as resolved.
Show resolved Hide resolved

const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol,
rightJoinCol, numColumns, maxDist, maxResults] = params_;
IdTable result{numColumns, qec_->getAllocator()};

// create r-tree for smaller result table
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have to measure, It might be faster to create the index for the bigger IdTable.
But as everything else is symmetric, you should be able to easily switch this for experiments.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a point, which is already on my list for evaluations. I would postpone it until the evaluation phase if that's fine with you

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolutely.

auto smallerResult = idTableLeft;
auto otherResult = idTableRight;
bool leftResSmaller = true;
auto smallerResJoinCol = leftJoinCol;
auto otherResJoinCol = rightJoinCol;
if (idTableLeft->numRows() > idTableRight->numRows()) {
std::swap(smallerResult, otherResult);
leftResSmaller = false;
std::swap(smallerResJoinCol, otherResJoinCol);
}
Jonathan24680 marked this conversation as resolved.
Show resolved Hide resolved

bgi::rtree<Value, bgi::quadratic<16>, bgi::indexable<Value>,
bgi::equal_to<Value>, ad_utility::AllocatorWithLimit<Value>>
rtree(bgi::quadratic<16>{}, bgi::indexable<Value>{},
bgi::equal_to<Value>{}, qec_->getAllocator());
for (size_t i = 0; i < smallerResult->numRows(); i++) {
// get point of row i
auto geopoint = getPoint(smallerResult, i, smallerResJoinCol);

if (!geopoint) {
printWarning();
continue;
}

Point p(geopoint.value().getLng(), geopoint.value().getLat());

// add every point together with the row number into the rtree
rtree.insert(std::make_pair(std::move(p), i));
}
std::vector<Value, ad_utility::AllocatorWithLimit<Value>> results{
qec_->getAllocator()};
for (size_t i = 0; i < otherResult->numRows(); i++) {
auto geopoint1 = getPoint(otherResult, i, otherResJoinCol);

if (!geopoint1) {
printWarning();
continue;
}

Point p(geopoint1.value().getLng(), geopoint1.value().getLat());

// query the other rtree for every point using the following bounding box
std::vector<Box> bbox = computeBoundingBox(p);
results.clear();

std::ranges::for_each(bbox, [&](const Box& bbox) {
rtree.query(bgi::intersects(bbox), std::back_inserter(results));
});

std::ranges::for_each(results, [&](const Value& res) {
size_t rowLeft = res.second;
size_t rowRight = i;
if (!leftResSmaller) {
std::swap(rowLeft, rowRight);
}
auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight,
leftJoinCol, rightJoinCol);
AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int);
if (distance.getInt() <= maxDist) {
addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft,
rowRight, distance);
}
});
}
auto resTable =
Result(std::move(result), std::vector<ColumnIndex>{},
Result::getMergedLocalVocab(*resultLeft, *resultRight));
return resTable;
}
Loading
Loading