Skip to content

Commit

Permalink
SubStr improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Annika Greif committed Nov 29, 2024
1 parent 52ef1f5 commit f15bf94
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 44 deletions.
27 changes: 16 additions & 11 deletions src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,11 @@ ExportQueryExecutionTrees::idToStringAndTypeForEncodedValue(Id id) {
std::optional<LiteralOrIri>
ExportQueryExecutionTrees::idToLiteralOrIriForEncodedValue(
Id id, bool onlyReturnLiteralsWithXsdString) {
if(onlyReturnLiteralsWithXsdString){
return std::nullopt;
}
auto optionalStringAndType = idToStringAndTypeForEncodedValue(id);
if (!optionalStringAndType || onlyReturnLiteralsWithXsdString) {
if (!optionalStringAndType) {
return std::nullopt;
}

Expand All @@ -363,29 +366,31 @@ ExportQueryExecutionTrees::idToLiteralOrIriForEncodedValue(

// _____________________________________________________________________________
std::optional<LiteralOrIri> ExportQueryExecutionTrees::handleIriOrLiteral(
const LiteralOrIri& word, bool onlyReturnLiterals,
LiteralOrIri word, bool onlyReturnLiterals,
bool onlyReturnLiteralsWithXsdString) {
auto datatypeIsXSDString = [](const LiteralOrIri& word) {
return word.hasDatatype() &&
std::string_view(
reinterpret_cast<const char*>(word.getDatatype().data()),
word.getDatatype().size()) == XSD_STRING;
asStringViewUnsafe(word.getDatatype()) == XSD_STRING;
};

if (onlyReturnLiterals && !word.isLiteral()) {
return std::nullopt;
if (!word.isLiteral()) {
if(onlyReturnLiterals || onlyReturnLiteralsWithXsdString){
return std::nullopt;
}
return word;
}

if (onlyReturnLiteralsWithXsdString) {
if (word.isLiteral() &&
(!word.hasDatatype() || datatypeIsXSDString(word))) {
if (!word.hasDatatype() || datatypeIsXSDString(word)) {
return word;
}
return std::nullopt;
}
if (word.isLiteral() && word.hasDatatype() && !datatypeIsXSDString(word)) {

if (word.hasDatatype() && !datatypeIsXSDString(word)) {
return LiteralOrIri{
ad_utility::triple_component::Literal::literalWithNormalizedContent(
word.getContent())};
std::move(word.getContent()))};
}
return word;
}
Expand Down
18 changes: 7 additions & 11 deletions src/engine/ExportQueryExecutionTrees.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,10 @@ class ExportQueryExecutionTrees {
static std::optional<std::pair<std::string, const char*>>
idToStringAndTypeForEncodedValue(Id id);

// Converts an Id to a LiteralOrIri based on its type and value.
// For VocabIndex or LocalVocabIndex: Return Literal or Iri. If
// `onlyReturnLiteralsWithXsdString` is true, return only literals (no IRIs)
// with no datatype or datatype `xsd:string`; otherwise, return any literal,
// but strip datatypes other than `xsd:string`. For Double, Int, Bool, Date,
// or GeoPoint: Return the literal without the datatype. If
// `onlyReturnLiteralsWithXsdString` is true return `std::nullopt`. For
// Undefined Id: Always return `std::nullopt`
// Convert the `id` to a `LiteralOrIri`. Datatypes are always stripped unless
// they are `xsd:string`, so literals with a non-`xsd:string` datatype (this
// includes IDs that directly store their value, like Doubles) are returned
// without a datatype. If `onlyReturnLiteralsWithXsdString` is true, all IRIs
// and all literals with a non-`xsd:string` datatype (including encoded IDs)
// yield `std::nullopt`. These semantics are useful for the string expressions
// in StringExpressions.cpp.
template <bool returnOnlyLiterals = false>
static std::optional<LiteralOrIri> idToLiteralOrIri(
const Index& index, Id id, const LocalVocab& localVocab,
Expand All @@ -88,13 +84,13 @@ class ExportQueryExecutionTrees {
// thrown.
// If `onlyReturnLiteralsWithXsdString` is `true`, returns `std::nullopt`.
// If `onlyReturnLiteralsWithXsdString` is `false`, removes datatypes from
// literals (e.g., `42^^xsd:integer` becomes `"42"`).
// literals (e.g. the integer `42` is converted to the plain literal `"42"`).
static std::optional<LiteralOrIri> idToLiteralOrIriForEncodedValue(
Id id, bool onlyReturnLiteralsWithXsdString = false);

// Checks and processes a LiteralOrIri based on the given flags.
// A helper function for the `idToLiteralOrIri` function. Checks and processes a LiteralOrIri based on the given parameters.
static std::optional<LiteralOrIri> handleIriOrLiteral(
const LiteralOrIri& word, bool onlyReturnLiterals,
LiteralOrIri word, bool onlyReturnLiterals,
bool onlyReturnLiteralsWithXsdString);

// Acts as a helper to retrieve an LiteralOrIri object
Expand Down
24 changes: 21 additions & 3 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,30 @@ std::optional<std::string> StringValueGetter::operator()(
// ____________________________________________________________________________
std::optional<LiteralOrIri> LiteralOrIriValueGetter::operator()(
Id id, const EvaluationContext* context) const {
// true means that immediately returns nullopt for everything that is not a
// literal
return ExportQueryExecutionTrees::idToLiteralOrIri<false>(
return ExportQueryExecutionTrees::idToLiteralOrIri(
context->_qec.getIndex(), id, context->_localVocab);
}

// ____________________________________________________________________________
std::optional<LiteralOrIri>
LiteralOrIriValueGetterWithXsdStringFilter::operator()(
    Id id, const EvaluationContext* context) const {
  // Delegate to the generic conversion with the `xsd:string` filter enabled
  // (last argument).
  constexpr bool onlyReturnLiteralsWithXsdString = true;
  const auto& index = context->_qec.getIndex();
  return ExportQueryExecutionTrees::idToLiteralOrIri(
      index, id, context->_localVocab, onlyReturnLiteralsWithXsdString);
}

// ____________________________________________________________________________
// Filter an already materialized `LiteralOrIri`: return `s` unchanged if it is
// a literal that has no datatype or the datatype `xsd:string`, and
// `std::nullopt` for everything else (IRIs and literals with other datatypes).
std::optional<LiteralOrIri>
LiteralOrIriValueGetterWithXsdStringFilter::operator()(
    const LiteralOrIri& s, const EvaluationContext*) const {
  // IRIs are never returned by this getter. This check has to come first:
  // NOTE(review): `hasDatatype()`/`getDatatype()` presumably go through
  // `getLiteral()` (like `hasLanguageTag()` does), which throws for
  // non-literals -- confirm against LiteralOrIri.cpp.
  if (!s.isLiteral()) {
    return std::nullopt;
  }

  // Literals without a datatype pass the filter as they are.
  if (!s.hasDatatype()) {
    return s;
  }

  // Literals with a datatype pass only if that datatype is `xsd:string`.
  if (asStringViewUnsafe(s.getDatatype()) == XSD_STRING) {
    return s;
  }
  return std::nullopt;
}

// ____________________________________________________________________________
template <auto isSomethingFunction, auto prefix>
Id IsSomethingValueGetter<isSomethingFunction, prefix>::operator()(
Expand Down
11 changes: 11 additions & 0 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ struct LiteralOrIriValueGetter : Mixin<LiteralOrIriValueGetter> {
}
};

// Same as `LiteralOrIriValueGetter`, but only literals with the datatype
// `xsd:string` or with no datatype are returned; literals with any other
// datatype yield `std::nullopt`.
struct LiteralOrIriValueGetterWithXsdStringFilter
    : Mixin<LiteralOrIriValueGetterWithXsdStringFilter> {
  using Mixin<LiteralOrIriValueGetterWithXsdStringFilter>::operator();

  // Overload for inputs that are given as an ID.
  std::optional<LiteralOrIri> operator()(
      ValueId id, const EvaluationContext* context) const;

  // Overload for inputs that already are a `LiteralOrIri`.
  std::optional<LiteralOrIri> operator()(
      const LiteralOrIri& s, const EvaluationContext* context) const;
};

// Value getter for `isBlank`.
struct IsBlankNodeValueGetter : Mixin<IsBlankNodeValueGetter> {
using Mixin<IsBlankNodeValueGetter>::operator();
Expand Down
62 changes: 43 additions & 19 deletions src/engine/sparqlExpressions/StringExpressions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,20 +211,6 @@ class SubstrImpl {
}

const auto& str = asStringViewUnsafe(s.value().getContent());
std::optional<std::variant<Iri, std::string>> descriptor;

if (s->isLiteral()) {
if (s->hasLanguageTag()) {
descriptor = std::string(asStringViewUnsafe(s->getLanguageTag()));
} else if (s->hasDatatype()) {
descriptor =
ad_utility::triple_component::Iri::fromIrirefWithoutBrackets(
asStringViewUnsafe(s->getDatatype()));
}
} else {
descriptor = std::nullopt;
}

// Clamp the number such that it is in `[0, str.size()]`. That way we end up
// with valid arguments for the `getUTF8Substring` method below for both
// starting position and length since all the other corner cases have been
Expand All @@ -239,14 +225,52 @@ class SubstrImpl {
return static_cast<size_t>(n);
};

return toLiteralWithDescriptor(
ad_utility::getUTF8Substring(str, clamp(startInt), clamp(lengthInt)),
descriptor);
s.value().getLiteral().setSubstr(clamp(startInt), clamp(lengthInt));
return s.value();
}
};

// Expression class for the SPARQL function `SUBSTR`. The value getter for the
// first argument is chosen when the expression is constructed:
// * If the first argument is a `STR()` expression, the `STR()` is removed, its
//   single child becomes the first argument, and
//   `LiteralOrIriValueGetterWithXsdStringFilter` is used.
// * Otherwise the first argument is used as is with `LiteralOrIriValueGetter`.
// All further calls are forwarded to the expression selected that way.
class SubstrExpressionImpl : public SparqlExpression {
 private:
  // Variant for a first argument that was wrapped in `STR()`.
  using FilteredSubstr =
      NARY<3, FV<SubstrImpl, LiteralOrIriValueGetterWithXsdStringFilter,
                 NumericValueGetter, NumericValueGetter>>;
  // Variant for any other first argument.
  using PlainSubstr = NARY<3, FV<SubstrImpl, LiteralOrIriValueGetter,
                                 NumericValueGetter, NumericValueGetter>>;

  // The expression that does the actual work.
  SparqlExpression::Ptr impl_;

 public:
  // Construct from exactly three children: the string argument `child`,
  // followed by the two numeric arguments in `children`.
  explicit SubstrExpressionImpl(
      SparqlExpression::Ptr child,
      std::same_as<SparqlExpression::Ptr> auto... children)
      requires(sizeof...(children) + 1 == 3) {
    AD_CORRECTNESS_CHECK(child != nullptr);

    // Common case first: no `STR()` around the first argument.
    if (!child->isStrExpression()) {
      impl_ = std::make_unique<PlainSubstr>(std::move(child),
                                            std::move(children)...);
      return;
    }

    // Unwrap the `STR()` and continue with its only argument.
    auto strArguments = std::move(*child).moveChildrenOut();
    AD_CORRECTNESS_CHECK(strArguments.size() == 1);
    impl_ = std::make_unique<FilteredSubstr>(std::move(strArguments.at(0)),
                                             std::move(children)...);
  }

  // Forward the evaluation to the selected implementation.
  ExpressionResult evaluate(EvaluationContext* context) const override {
    return impl_->evaluate(context);
  }

  // Forward the cache key computation to the selected implementation.
  std::string getCacheKey(const VariableToColumnMap& varColMap) const override {
    return impl_->getCacheKey(varColMap);
  }
};

using SubstrExpression = NARY<3, FV<SubstrImpl, LiteralOrIriValueGetter,
NumericValueGetter, NumericValueGetter>>;
using SubstrExpression = SubstrExpressionImpl;

// STRSTARTS
[[maybe_unused]] auto strStartsImpl = [](std::string_view text,
Expand Down
9 changes: 9 additions & 0 deletions src/parser/Literal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,4 +135,13 @@ Literal Literal::fromStringRepresentation(std::string internal) {
return Literal{std::move(internal), endIdx + 1};
}

// __________________________________________
void Literal::setSubstr(std::size_t start, std::size_t length){
auto contentWithoutParentesis = content_.substr(1, beginOfSuffix_ - 2);
auto shortenedContent = ad_utility::getUTF8Substring(contentWithoutParentesis, start, length);
auto suffix = content_.substr(beginOfSuffix_);
content_ = absl::StrCat("\"", shortenedContent, "\"", suffix);
beginOfSuffix_ = content_.size() - suffix.size();
}

} // namespace ad_utility::triple_component
3 changes: 3 additions & 0 deletions src/parser/Literal.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,8 @@ class Literal {
static Literal literalWithoutQuotes(
std::string_view rdfContentWithoutQuotes,
std::optional<std::variant<Iri, std::string>> descriptor = std::nullopt);

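// Replace the content of this literal (the part between the quotes) in place
// by its substring that is determined by `start` and `length`. The suffix
// after the closing quote is kept unchanged.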
void setSubstr(std::size_t start, std::size_t length);
};
} // namespace ad_utility::triple_component
10 changes: 10 additions & 0 deletions src/parser/LiteralOrIri.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ const Literal& LiteralOrIri::getLiteral() const {
return std::get<Literal>(data_);
}

// __________________________________________
Literal& LiteralOrIri::getLiteral() {
  // Hand out mutable access to the stored literal if there is one.
  if (isLiteral()) {
    return std::get<Literal>(data_);
  }
  // This object does not hold a literal, so there is nothing to return.
  AD_THROW(
      "LiteralOrIri object does not contain an Literal object and "
      "thus cannot return it");
}

// __________________________________________
bool LiteralOrIri::hasLanguageTag() const {
return getLiteral().hasLanguageTag();
Expand Down
4 changes: 4 additions & 0 deletions src/parser/LiteralOrIri.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ class alignas(16) LiteralOrIri {
// otherwise
const Literal& getLiteral() const;

// Return a modifiable reference to the contained Literal object if available, throw exception
// otherwise. Allows the caller to modify the Literal object e.g. for SubStr in StringExpressions.cpp
Literal& getLiteral();

// Create a new LiteralOrIri based on a Literal object
explicit LiteralOrIri(Literal literal);

Expand Down

0 comments on commit f15bf94

Please sign in to comment.