From 81eb71bb078ef7f0a85994e92ccfb43689276c7b Mon Sep 17 00:00:00 2001 From: MJedr Date: Thu, 6 Oct 2022 11:46:59 +0200 Subject: [PATCH] parser: add separate grammar for > and < date operators ref: cern-sis/issues-inspire#134 --- inspire_query_parser/ast.py | 8 ++ inspire_query_parser/parser.py | 28 +++- .../visitors/elastic_search_visitor.py | 6 + .../visitors/restructuring_visitor.py | 6 + tests/test_elastic_search_visitor.py | 18 +++ tests/test_parser_functionality.py | 134 ++++++++++-------- tests/test_restructuring_visitor.py | 25 +--- 7 files changed, 141 insertions(+), 84 deletions(-) diff --git a/inspire_query_parser/ast.py b/inspire_query_parser/ast.py index 41efa83..3ffa8e0 100644 --- a/inspire_query_parser/ast.py +++ b/inspire_query_parser/ast.py @@ -168,6 +168,14 @@ class GreaterThanOp(UnaryOp): pass +class GreaterThanDateOp(UnaryOp): + pass + + +class LessThanDateOp(UnaryOp): + pass + + class LessThanOp(UnaryOp): pass diff --git a/inspire_query_parser/parser.py b/inspire_query_parser/parser.py index f031bab..d247f54 100644 --- a/inspire_query_parser/parser.py +++ b/inspire_query_parser/parser.py @@ -367,7 +367,7 @@ class SimpleValueWithColonUnit(SimpleValueUnit): class SimpleDateValueUnit(LeafRule): - grammar = re.compile(r"[\d*\-\.\/]{4,10}(?=($|\s|\)))", re.UNICODE) + grammar = re.compile(r"[\d*\-\.\/\_]{1,10}(?=($|\s|\)))", re.UNICODE) date_specifiers_regex = re.compile(r"({})\s*(-\s*\d+)?".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE) string_month_date_regex = re.compile(MONTH_REGEX, re.IGNORECASE) @@ -555,6 +555,7 @@ def parse(cls, parser, text, pos): GreaterEqualOp, LessEqualOp, GreaterThanOp, + GreaterThanDateOp, LessThanOp, ComplexValue ] @@ -600,7 +601,6 @@ def parse(cls, parser, text, pos): SimpleValueNegation, SimpleValue, SimpleDateValueNegation, - SimpleDateValue, ] ) @@ -652,7 +652,15 @@ class GreaterThanOp(UnaryRule): Supports queries like author-count > 2000 or date after 10-2000. """ - grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue]) + grammar = omit(re.compile(r">", re.IGNORECASE)), attr('op', [SimpleValue]) + + +class GreaterThanDateOp(UnaryRule): + """Greater than operator. + + Supports queries like author-count > 2000 or date after 10-2000. + """ + grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr('op', [SimpleDateValue]) class GreaterEqualOp(UnaryRule): @@ -673,7 +681,15 @@ class LessThanOp(UnaryRule): Supports queries like author-count < 100 or date before 1984. """ - grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue]) + grammar = omit(re.compile(r"<", re.IGNORECASE)), attr('op', [SimpleValue]) + + +class LessThanDateOp(UnaryRule): + """Less than operator. + + Supports queries like author-count < 100 or date before 1984. + """ + grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr('op', [SimpleDateValue]) class LessEqualOp(UnaryRule): @@ -740,8 +756,8 @@ class DateValue(UnaryRule): (optional(omit(Literal("="))), RangeOp), GreaterEqualOp, LessEqualOp, - GreaterThanOp, - LessThanOp, + GreaterThanDateOp, + LessThanDateOp, ( optional(omit(Literal("="))), [ diff --git a/inspire_query_parser/visitors/elastic_search_visitor.py b/inspire_query_parser/visitors/elastic_search_visitor.py index 7107725..6a61be4 100644 --- a/inspire_query_parser/visitors/elastic_search_visitor.py +++ b/inspire_query_parser/visitors/elastic_search_visitor.py @@ -649,12 +649,18 @@ def visit_range_op(self, node, fieldnames): def visit_greater_than_op(self, node, fieldnames): return self._generate_range_queries(force_list(fieldnames), {'gt': node.op.value}) + def visit_greater_than_date_op(self, node, fieldnames): + return self._generate_range_queries(force_list(fieldnames), {'gt': node.op.value}) + def visit_greater_equal_than_op(self, node, fieldnames): return self._generate_range_queries(force_list(fieldnames), {'gte': node.op.value}) def visit_less_than_op(self, node, fieldnames): return self._generate_range_queries(force_list(fieldnames), {'lt': node.op.value}) + def visit_less_than_date_op(self, node, fieldnames): + return self._generate_range_queries(force_list(fieldnames), {'lt': node.op.value}) + def visit_less_equal_than_op(self, node, fieldnames): return self._generate_range_queries(force_list(fieldnames), {'lte': node.op.value}) diff --git a/inspire_query_parser/visitors/restructuring_visitor.py b/inspire_query_parser/visitors/restructuring_visitor.py index bdb2225..6e6435c 100644 --- a/inspire_query_parser/visitors/restructuring_visitor.py +++ b/inspire_query_parser/visitors/restructuring_visitor.py @@ -268,6 +268,9 @@ def visit_value(self, node): def visit_range_op(self, node): return ast.RangeOp(node.left.accept(self), node.right.accept(self)) + def visit_greater_than_date_op(self, node): + return ast.GreaterThanDateOp(node.op.accept(self)) + def visit_greater_than_op(self, node): return ast.GreaterThanOp(node.op.accept(self)) @@ -279,6 +282,9 @@ def visit_greater_equal_op(self, node): return ast.GreaterEqualThanOp(value) def visit_less_than_op(self, node): + return ast.LessThanDateOp(node.op.accept(self)) + + def visit_less_than_date_op(self, node): return ast.LessThanOp(node.op.accept(self)) def visit_less_equal_op(self, node): diff --git a/tests/test_elastic_search_visitor.py b/tests/test_elastic_search_visitor.py index 2c8101c..e33040f 100644 --- a/tests/test_elastic_search_visitor.py +++ b/tests/test_elastic_search_visitor.py @@ -3235,3 +3235,21 @@ def test_elastic_search_visitor_complex_query(): } generated_es_query = _parse_query(query_str) assert generated_es_query == expected_es_query + + +def test_elastic_search_visitor_regression_greater_than_for_non_date(): + query_str = "t after something" + expected_es_query = { + "match": {"titles.full_title": {"query": "after something", "operator": "and"}} + } + generated_es_query = _parse_query(query_str) + assert generated_es_query == expected_es_query + + +def test_elastic_search_visitor_regression_less_than_for_non_date(): + query_str = "t before something" + expected_es_query = { + "match": {"titles.full_title": {"query": "before something", "operator": "and"}} + } + generated_es_query = _parse_query(query_str) + assert generated_es_query == expected_es_query diff --git a/tests/test_parser_functionality.py b/tests/test_parser_functionality.py index d0f9dec..078a307 100644 --- a/tests/test_parser_functionality.py +++ b/tests/test_parser_functionality.py @@ -27,6 +27,7 @@ from inspire_query_parser.parser import (And, BooleanQuery, ComplexValue, DateValue, EmptyQuery, Expression, GreaterEqualOp, GreaterThanOp, + GreaterThanDateOp, InspireDateKeyword, InspireKeyword, InvenioKeywordQuery, LessEqualOp, LessThanOp, MalformedQueryWords, @@ -1762,64 +1763,6 @@ ("", Query([EmptyQuery()])), (" ", Query([EmptyQuery()])), # G, GE, LT, LE, E queries - ( - "date > 2000-10 and < 2000-12", - Query( - [ - Statement( - BooleanQuery( - Expression( - SimpleQuery( - SpiresDateKeywordQuery( - InspireDateKeyword("date"), - DateValue( - GreaterThanOp(SimpleDateValue("2000-10")) - ), - ) - ) - ), - And(), - Statement( - Expression( - SimpleQuery( - Value(LessThanOp(SimpleDateValue("2000-12"))) - ) - ) - ), - ) - ) - ] - ), - ), - ( - "date after 10/2000 and before 2000-12", - Query( - [ - Statement( - BooleanQuery( - Expression( - SimpleQuery( - SpiresDateKeywordQuery( - InspireDateKeyword("date"), - DateValue( - GreaterThanOp(SimpleDateValue("10/2000")) - ), - ) - ) - ), - And(), - Statement( - Expression( - SimpleQuery( - Value(LessThanOp(SimpleDateValue("2000-12"))) - ) - ) - ), - ) - ) - ] - ), - ), ( "date >= nov 2000 and d<=2005", Query( @@ -2070,7 +2013,7 @@ SpiresDateKeywordQuery( InspireDateKeyword("date-updated"), DateValue( - GreaterThanOp(SimpleDateValue("yesterday - 2")) + GreaterThanDateOp(SimpleDateValue("yesterday - 2")) ), ) ) @@ -2335,3 +2278,76 @@ def test_parser_functionality(query_str, expected_parse_tree): parser = StatefulParser() _, parse_tree = parser.parse(query_str, Query) assert parse_tree == expected_parse_tree + + +@pytest.mark.parametrize( + ["query_str", "expected_parse_tree"], + { + ( + "date > 2000-10 and < 2000-12", + Query( + [ + Statement( + BooleanQuery( + Expression( + SimpleQuery( + SpiresDateKeywordQuery( + InspireDateKeyword("date"), + DateValue( + GreaterThanOp(SimpleDateValue("2000-10")) + ), + ) + ) + ), + And(), + Statement( + Expression( + SimpleQuery( + Value(LessThanOp(SimpleDateValue("2000-12"))) + ) + ) + ), + ) + ) + ] + ), + ), + ( + "date after 10/2000 and before 2000-12", + Query( + [ + Statement( + BooleanQuery( + Expression( + SimpleQuery( + SpiresDateKeywordQuery( + InspireDateKeyword("date"), + DateValue( + GreaterThanOp(SimpleDateValue("10/2000")) + ), + ) + ) + ), + And(), + Statement( + Expression( + SimpleQuery( + Value(LessThanOp(SimpleDateValue("2000-12"))) + ) + ) + ), + ) + ) + ] + ), + ), + }, +) +@pytest.mark.xfail( + reason="the queries are not correct, should be fixed by https://github.com/cern-sis/issues-inspire/issues/150 " +) +def test_parser_functionality_regressions(query_str, expected_parse_tree): + print("Parsing: " + query_str) + parser = StatefulParser() + _, parse_tree = parser.parse(query_str, Query) + assert parse_tree == expected_parse_tree diff --git a/tests/test_restructuring_visitor.py b/tests/test_restructuring_visitor.py index 93982cd..c38a20c 100644 --- a/tests/test_restructuring_visitor.py +++ b/tests/test_restructuring_visitor.py @@ -30,6 +30,7 @@ from inspire_query_parser import parser from inspire_query_parser.ast import (AndOp, EmptyQuery, ExactMatchValue, GreaterEqualThanOp, GreaterThanOp, + GreaterThanDateOp,LessThanDateOp, Keyword, KeywordOp, LessEqualThanOp, LessThanOp, MalformedQuery, NestedKeywordOp, NotOp, OrOp, @@ -354,17 +355,11 @@ # G, GE, LT, LE, E queries ( 'date > 2000-10 and date < 2000-12', - AndOp( - KeywordOp(Keyword('date'), GreaterThanOp(Value('2000-10'))), - KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))) - ) + AndOp(KeywordOp(Keyword('date'), GreaterThanDateOp(Value('2000-10'))), KeywordOp(Keyword('date'), LessThanOp(Value('2000-12')))) ), ( 'date after 10/2000 and date before 2000-12', - AndOp( - KeywordOp(Keyword('date'), GreaterThanOp(Value('10/2000'))), - KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))) - ) + AndOp(KeywordOp(Keyword('date'), GreaterThanDateOp(Value('10/2000'))), KeywordOp(Keyword('date'), LessThanOp(Value('2000-12')))) ), ( 'date >= nov 2000 and d<=2005', @@ -445,22 +440,14 @@ 'du > yesterday - 2', KeywordOp( Keyword('date-updated'), - GreaterThanOp(Value(str((date.today() - relativedelta(days=3))))) + GreaterThanDateOp(Value(str((date.today() - relativedelta(days=3))))) ) ), # Wildcard queries ( 'find a \'o*aigh\' and t "alge*" and date >2013', - AndOp( - KeywordOp(Keyword('author'), PartialMatchValue('o*aigh', contains_wildcard=True)), - AndOp( - KeywordOp(Keyword('title'), ExactMatchValue('alge*' - - )), - KeywordOp(Keyword('date'), GreaterThanOp(Value('2013'))) - ) - ) + AndOp(KeywordOp(Keyword('author'), PartialMatchValue('o*aigh')), AndOp(KeywordOp(Keyword('title'), ExactMatchValue('alge*')), KeywordOp(Keyword('date'), GreaterThanDateOp(Value('2013'))))) ), ( 'a *alge | a alge* | a o*aigh', @@ -711,7 +698,7 @@ def test_foo_bar(): ) ), ('find cc italy', KeywordOp(Keyword('country'), Value('italy'))), - ('fin date > today', KeywordOp(Keyword('date'), GreaterThanOp(Value(str(date.today()))))), + ('fin date > today', KeywordOp(Keyword('date'), GreaterThanDateOp(Value(str(date.today()))))), ('find r atlas-conf-*', KeywordOp(Keyword('reportnumber'), Value('atlas-conf-*', contains_wildcard=True))), ( 'find caption "Diagram for the fermion flow violating process"',