diff --git a/include/yaramod/parser/location.h b/include/yaramod/parser/location.h index 36b093b0..93b589c9 100644 --- a/include/yaramod/parser/location.h +++ b/include/yaramod/parser/location.h @@ -21,13 +21,19 @@ class Location */ struct Position { + Position() : line(1), column(0) {} Position(std::size_t line, std::size_t column) : line(line), column(column) {} - std::size_t getLine() { return line; } - std::size_t getColumn() { return column; } + std::size_t getLine() const { return line; } + std::size_t getColumn() const { return column; } std::size_t line; std::size_t column; + friend std::ostream& operator<<(std::ostream& os, const Position& position) + { + os << position.getLine() << '.' << position.getColumn(); + return os; + } }; Location() : Location(std::string{}) {} @@ -53,8 +59,7 @@ class Location void addColumn(std::size_t count) { - _begin.line = _end.line; - _begin.column = _end.column; + _begin = _end; _end.column += count; } @@ -63,6 +68,12 @@ class Location _begin = {1, 0}; _end = {1, 0}; } + + void setBegin(const Position& begin) + { + _begin.line = begin.line; + _begin.column = begin.column - 1; + } /// @} /// @name Getters @@ -82,8 +93,10 @@ class Location { if (!location.isUnnamed()) os << location.getFilePath() << ':'; - os << location.begin().line << '.' << location.begin().column; - if (location.begin().column < location.end().column) + os << location.begin(); + if (location.begin().line != location.end().line) + os << '-' << location.end(); + else if (location.begin().column < location.end().column) os << '-' << location.end().column; return os; } diff --git a/include/yaramod/parser/parser_driver.h b/include/yaramod/parser/parser_driver.h index 38299248..460ff0cb 100644 --- a/include/yaramod/parser/parser_driver.h +++ b/include/yaramod/parser/parser_driver.h @@ -179,6 +179,7 @@ class ParserDriver private: std::string _strLiteral; ///< Currently processed string literal. + Location::Position _positionBegin; ///< Variable storing the position the currently processed token begins at. std::string _indent; ///< Variable storing current indentation std::string _comment; ///< For incremental construction of parsed comments std::string _regexpClass; ///< Currently processed regular expression class. diff --git a/include/yaramod/types/hex_string.h b/include/yaramod/types/hex_string.h index 6e9101eb..bc25e157 100644 --- a/include/yaramod/types/hex_string.h +++ b/include/yaramod/types/hex_string.h @@ -45,6 +45,8 @@ class HexStringUnit /// @{ virtual std::string getText() const = 0; virtual std::size_t getLength() const = 0; + virtual TokenIt getFirstTokenIt() const = 0; + virtual TokenIt getLastTokenIt() const = 0; /// @} /// @name Detection methods @@ -78,6 +80,8 @@ class HexString : public String /// @{ explicit HexString(const std::shared_ptr& ts, const std::vector>& units); explicit HexString(const std::shared_ptr& ts, std::vector>&& units); + explicit HexString(const std::shared_ptr& ts, TokenIt leftBracket, const std::vector>& units, TokenIt rightBracket); + explicit HexString(const std::shared_ptr& ts, TokenIt leftBracket, std::vector>&& units, TokenIt rightBracket); /// @} /// @name Virtual methods. @@ -109,7 +113,9 @@ class HexString : public String bool empty() const { return _units.empty(); } private: + std::optional _leftBracket; std::vector> _units; ///< Units in the hex string + std::optional _rightBracket; }; /** @@ -148,9 +154,10 @@ class HexStringNibble : public HexStringUnit assert(output <= 0xf); return output; } - /// @} - virtual std::size_t getLength() const override { return 1; } + virtual TokenIt getFirstTokenIt() const override { return _value; } + virtual TokenIt getLastTokenIt() const override { return _value; } + /// @} private: TokenIt _value; ///< Value of the nibble @@ -173,6 +180,8 @@ class HexStringWildcard : public HexStringUnit /// @{ virtual std::string getText() const override { return "?"; } virtual std::size_t getLength() const override { return 1; } + virtual TokenIt getFirstTokenIt() const override { return _value; } + virtual TokenIt getLastTokenIt() const override { return _value; } /// @} private: TokenIt _value; ///< Value of the nibble @@ -189,9 +198,27 @@ class HexStringJump : public HexStringUnit public: /// @name Constructors /// @{ - HexStringJump() : HexStringUnit(Type::Jump) {} - HexStringJump(TokenIt low) : HexStringUnit(Type::Jump), _low(low), _high() {} - HexStringJump(TokenIt low, TokenIt high) : HexStringUnit(Type::Jump), _low(low), _high(high) {} + HexStringJump(TokenIt leftBracket, TokenIt rightBracket) + : HexStringUnit(Type::Jump) + , _leftBracket(leftBracket) + , _rightBracket(rightBracket) + { + } + HexStringJump(TokenIt leftBracket, TokenIt low, TokenIt rightBracket) + : HexStringUnit(Type::Jump) + , _leftBracket(leftBracket) + , _low(low) + , _rightBracket(rightBracket) + { + } + HexStringJump(TokenIt leftBracket, TokenIt low, TokenIt high, TokenIt rightBracket) + : HexStringUnit(Type::Jump) + , _leftBracket(leftBracket) + , _low(low) + , _high(high) + , _rightBracket(rightBracket) + { + } /// @} /// @name Virtual methods @@ -233,10 +260,14 @@ class HexStringJump : public HexStringUnit return _high.value()->getUInt(); return std::nullopt; } + virtual TokenIt getFirstTokenIt() const override { return _leftBracket; } + virtual TokenIt getLastTokenIt() const override { return _rightBracket; } /// @} private: + TokenIt _leftBracket; std::optional _low, _high; ///< Low and high bounds of the jump. + TokenIt _rightBracket; }; /** @@ -282,6 +313,8 @@ class HexStringOr : public HexStringUnit /// @name Getters /// @{ const std::vector> getSubstrings() const { return _substrings; } + virtual TokenIt getFirstTokenIt() const override { assert(!_substrings.empty()); return _substrings.front()->getFirstTokenIt(); } + virtual TokenIt getLastTokenIt() const override { assert(!_substrings.empty()); return _substrings.back()->getLastTokenIt(); } /// @} /// @name Iterators diff --git a/src/builder/yara_hex_string_builder.cpp b/src/builder/yara_hex_string_builder.cpp index 4f1fe2bf..7f6436a5 100644 --- a/src/builder/yara_hex_string_builder.cpp +++ b/src/builder/yara_hex_string_builder.cpp @@ -203,16 +203,22 @@ YaraHexStringBuilder::YaraHexStringBuilder(const std::shared_ptr& t */ std::shared_ptr YaraHexStringBuilder::get(const std::shared_ptr& acceptor /*= nullptr*/, bool addHexParentheses /*= true*/) const { + std::optional leftBracket, rightBracket; if (addHexParentheses) { - _tokenStream->emplace(_tokenStream->begin(), HEX_START_BRACKET, "{"); - _tokenStream->emplace_back(HEX_END_BRACKET, "}"); + leftBracket = _tokenStream->emplace(_tokenStream->begin(), HEX_START_BRACKET, "{"); + rightBracket = _tokenStream->emplace_back(HEX_END_BRACKET, "}"); } if (acceptor) { acceptor->moveAppend(_tokenStream.get()); - return std::make_shared(acceptor, _units); + if (leftBracket && rightBracket) + return std::make_shared(acceptor, *leftBracket, _units, *rightBracket); + else + return std::make_shared(acceptor, _units); } + else if (leftBracket && rightBracket) + return std::make_shared(std::move(_tokenStream), *leftBracket, _units, *rightBracket); else return std::make_shared(std::move(_tokenStream), _units); } @@ -309,10 +315,10 @@ YaraHexStringBuilder wildcardHigh(std::uint8_t low) YaraHexStringBuilder jumpVarying() { auto ts = std::make_shared(); - ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); + auto left = ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); ts->emplace_back(DASH, "-"); - ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); - return YaraHexStringBuilder(ts, std::make_shared()); + auto right = ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); + return YaraHexStringBuilder(ts, std::make_shared(left, right)); } /** @@ -328,11 +334,11 @@ YaraHexStringBuilder jumpVarying() YaraHexStringBuilder jumpFixed(std::uint64_t value) { auto ts = std::make_shared(); - ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); + auto left = ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); TokenIt t = ts->emplace_back(HEX_NIBBLE, value); - ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); + auto right = ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); - return YaraHexStringBuilder(ts, std::make_shared(t, t)); + return YaraHexStringBuilder(ts, std::make_shared(left, t, t, right)); } /** @@ -348,12 +354,12 @@ YaraHexStringBuilder jumpFixed(std::uint64_t value) YaraHexStringBuilder jumpVaryingRange(std::uint64_t low) { auto ts = std::make_shared(); - ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); + auto left = ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); TokenIt t = ts->emplace_back(HEX_NIBBLE, low); ts->emplace_back(DASH, "-"); - ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); + auto right = ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); - return YaraHexStringBuilder(ts, std::make_shared(t)); + return YaraHexStringBuilder(ts, std::make_shared(left, t, right)); } /** @@ -369,13 +375,13 @@ YaraHexStringBuilder jumpVaryingRange(std::uint64_t low) YaraHexStringBuilder jumpRange(std::uint64_t low, std::uint64_t high) { auto ts = std::make_shared(); - ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); + auto left = ts->emplace_back(HEX_JUMP_LEFT_BRACKET, "["); TokenIt t1 = ts->emplace_back(HEX_NIBBLE, low); ts->emplace_back(DASH, "-"); TokenIt t2 = ts->emplace_back(HEX_NIBBLE, high); - ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); + auto right = ts->emplace_back(HEX_JUMP_RIGHT_BRACKET, "]"); - return YaraHexStringBuilder(ts, std::make_shared(t1, t2)); + return YaraHexStringBuilder(ts, std::make_shared(left, t1, t2, right)); } /** diff --git a/src/parser/parser_driver.cpp b/src/parser/parser_driver.cpp index d314bbee..d685b148 100644 --- a/src/parser/parser_driver.cpp +++ b/src/parser/parser_driver.cpp @@ -227,6 +227,7 @@ void ParserDriver::defineTokens() // $str tokens are not delegated with return Value but stored in _strLiteral _parser.token(R"(\")").states("@default").enter_state("$str").action([&](std::string_view) -> Value { _strLiteral.clear(); + _positionBegin = currentFileContext()->getLocation().begin(); _escapedContent = false; return {}; }); @@ -241,6 +242,7 @@ void ParserDriver::defineTokens() _parser.token(R"(\\[^\"tnx\\])").states("$str").action([&](std::string_view str) -> Value { error_handle(currentFileContext()->getLocation(), "Syntax error: Unknown escaped sequence '" + std::string{str} + "'"); return {}; }); _parser.token(R"(([^\\"])+)").states("$str").action([&](std::string_view str) -> Value { _strLiteral += std::string{str}; return {}; }); _parser.token(R"(\")").states("$str").symbol("STRING_LITERAL").description("\"").enter_state("@default").action([&](std::string_view) -> Value { + currentFileContext()->getLocation().setBegin(_positionBegin); auto strIt = emplace_back(STRING_LITERAL, _strLiteral); if (_escapedContent) strIt->markEscaped(); @@ -597,7 +599,7 @@ void ParserDriver::defineGrammar() }, "hex_string", "RCB", "hex_string_mods", [&](auto&& args) -> Value { args[3].getTokenIt()->setType(HEX_END_BRACKET); - auto hexString = std::make_shared(currentFileContext()->getTokenStream(), std::move(args[2].getMultipleHexUnits())); + auto hexString = std::make_shared(currentFileContext()->getTokenStream(), args[0].getTokenIt(), std::move(args[2].getMultipleHexUnits()), args[3].getTokenIt()); hexString->setModifiers(std::move(args[4].getStringMods())); return hexString; } @@ -802,22 +804,22 @@ void ParserDriver::defineGrammar() .production("LSQB", "HEX_INTEGER", "RSQB", [](auto&& args) -> Value { args[0].getTokenIt()->setType(HEX_JUMP_LEFT_BRACKET); args[2].getTokenIt()->setType(HEX_JUMP_RIGHT_BRACKET); - return std::make_shared(args[1].getTokenIt(), args[1].getTokenIt()); + return std::make_shared(args[0].getTokenIt(), args[1].getTokenIt(), args[1].getTokenIt(), args[2].getTokenIt()); }) .production("LSQB", "HEX_INTEGER", "DASH", "HEX_INTEGER", "RSQB", [](auto&& args) -> Value { args[0].getTokenIt()->setType(HEX_JUMP_LEFT_BRACKET); args[4].getTokenIt()->setType(HEX_JUMP_RIGHT_BRACKET); - return std::make_shared(args[1].getTokenIt(), args[3].getTokenIt()); + return std::make_shared(args[0].getTokenIt(), args[1].getTokenIt(), args[3].getTokenIt(), args[4].getTokenIt()); }) .production("LSQB", "HEX_INTEGER", "DASH", "RSQB", [](auto&& args) -> Value { args[0].getTokenIt()->setType(HEX_JUMP_LEFT_BRACKET); args[3].getTokenIt()->setType(HEX_JUMP_RIGHT_BRACKET); - return std::make_shared(args[1].getTokenIt()); + return std::make_shared(args[0].getTokenIt(), args[1].getTokenIt(), args[3].getTokenIt()); }) .production("LSQB", "DASH", "RSQB", [](auto&& args) -> Value { args[0].getTokenIt()->setType(HEX_JUMP_LEFT_BRACKET); args[2].getTokenIt()->setType(HEX_JUMP_RIGHT_BRACKET); - return std::make_shared(); + return std::make_shared(args[0].getTokenIt(), args[2].getTokenIt()); }) ; diff --git a/src/types/hex_string.cpp b/src/types/hex_string.cpp index 2281a8d5..5f382b83 100644 --- a/src/types/hex_string.cpp +++ b/src/types/hex_string.cpp @@ -29,6 +29,26 @@ HexString::HexString(const std::shared_ptr& ts, std::vector& ts, TokenIt leftBracket, const std::vector>& units, TokenIt rightBracket) + : String(ts, String::Type::Hex), _leftBracket(leftBracket), _units(units), _rightBracket(rightBracket) +{ +} + +/** + * Constructor. + * + * @param units Units of the hex string. + */ +HexString::HexString(const std::shared_ptr& ts, TokenIt leftBracket, std::vector>&& units, TokenIt rightBracket) + : String(ts, String::Type::Hex), _leftBracket(leftBracket), _units(std::move(units)), _rightBracket(rightBracket) +{ +} + /** * Return the string representation of the hex string. * @@ -72,15 +92,22 @@ std::string HexString::getPureText() const TokenIt HexString::getFirstTokenIt() const { - return _tokenStream->begin(); + if (_leftBracket) + return *_leftBracket; + else if (_units.empty()) + return _tokenStream->begin(); + else + return _units.front()->getFirstTokenIt(); } TokenIt HexString::getLastTokenIt() const { - if (_units.empty()) + if (_rightBracket) + return *_rightBracket; + else if (_units.empty()) return _tokenStream->begin(); else - return std::prev(_tokenStream->end()); + return _units.back()->getLastTokenIt(); } /** diff --git a/tests/cpp/parser_tests.cpp b/tests/cpp/parser_tests.cpp index dd4cb3f7..00bf0330 100644 --- a/tests/cpp/parser_tests.cpp +++ b/tests/cpp/parser_tests.cpp @@ -580,7 +580,7 @@ TEST_F(ParserTests, HexStringWithSimpleOrWorks) { prepareInput( R"( -rule hex_string_with_simple_or_jump +rule hex_string_with_simple_or { strings: $1 = { 01 23 ( AB | CD ) 45 56 } @@ -593,7 +593,7 @@ rule hex_string_with_simple_or_jump ASSERT_EQ(1u, driver.getParsedFile().getRules().size()); const auto& rule = driver.getParsedFile().getRules()[0]; - EXPECT_EQ("hex_string_with_simple_or_jump", rule->getName()); + EXPECT_EQ("hex_string_with_simple_or", rule->getName()); EXPECT_EQ(Rule::Modifier::None, rule->getModifier()); auto strings = rule->getStrings(); @@ -611,7 +611,7 @@ TEST_F(ParserTests, HexStringWithMultibyteSimpleOrWorks) { prepareInput( R"( -rule hex_string_with_multibyte_simple_or_jump +rule hex_string_with_multibyte_simple_or { strings: $1 = { 01 23 ( AB CD EF | AA BB | EE | FF FF ) 45 56 } @@ -624,7 +624,7 @@ rule hex_string_with_multibyte_simple_or_jump ASSERT_EQ(1u, driver.getParsedFile().getRules().size()); const auto& rule = driver.getParsedFile().getRules()[0]; - EXPECT_EQ("hex_string_with_multibyte_simple_or_jump", rule->getName()); + EXPECT_EQ("hex_string_with_multibyte_simple_or", rule->getName()); EXPECT_EQ(Rule::Modifier::None, rule->getModifier()); auto strings = rule->getStrings(); @@ -1873,7 +1873,7 @@ rule dummy_rule { EXPECT_EQ(0u, driver.getParsedFile().getRules().size()); ASSERT_EQ(0u, driver.getParsedFile().getImports().size()); - EXPECT_EQ("Error at 2.15: Unrecognized module 'module' imported", err.getErrorMessage()); + EXPECT_EQ("Error at 2.8-15: Unrecognized module 'module' imported", err.getErrorMessage()); } } @@ -2987,7 +2987,7 @@ rule dummy_rule { EXPECT_EQ(0u, driverNoAvastSymbols.getParsedFile().getRules().size()); ASSERT_EQ(0u, driverNoAvastSymbols.getParsedFile().getImports().size()); - EXPECT_EQ("Error at 2.19: Unrecognized module 'androguard' imported", err.getErrorMessage()); + EXPECT_EQ("Error at 2.8-19: Unrecognized module 'androguard' imported", err.getErrorMessage()); } } @@ -3042,7 +3042,7 @@ rule dummy_rule { EXPECT_EQ(0u, driverNoAvastSymbols.getParsedFile().getRules().size()); ASSERT_EQ(0u, driverNoAvastSymbols.getParsedFile().getImports().size()); - EXPECT_EQ("Error at 2.14: Unrecognized module 'phish' imported", err.getErrorMessage()); + EXPECT_EQ("Error at 2.8-14: Unrecognized module 'phish' imported", err.getErrorMessage()); } } @@ -4311,7 +4311,7 @@ rule public_rule { catch (const ParserError& err) { EXPECT_EQ(0u, driver.getParsedFile().getRules().size()); - EXPECT_EQ("Error at 1.11: Syntax error: Unknown symbol on input, expected one of @end, global, private, rule, import, include", err.getErrorMessage()); + EXPECT_EQ("Error at 1.8-11: Syntax error: Unknown symbol on input, expected one of @end, global, private, rule, import, include", err.getErrorMessage()); } } diff --git a/tests/python/test_parser.py b/tests/python/test_parser.py index 33d734a0..125f0c4f 100644 --- a/tests/python/test_parser.py +++ b/tests/python/test_parser.py @@ -1646,12 +1646,12 @@ def test_string_locations(self): self.assertEqual(string1.token_assign.location.end.column, 12) self.assertEqual(string1.token_first.type, yaramod.TokenType.StringLiteral) self.assertEqual(string1.token_first.location.begin.line, 3) - # self.assertEqual(string1.token_first.location.begin.column, 14) # FIXME: 14 != 27 + self.assertEqual(string1.token_first.location.begin.column, 14) # FIXME: 14 != 27 self.assertEqual(string1.token_first.location.end.line, 3) self.assertEqual(string1.token_first.location.end.column, 27) self.assertEqual(string1.token_last.type, yaramod.TokenType.StringLiteral) self.assertEqual(string1.token_last.location.begin.line, 3) - # self.assertEqual(string1.token_last.location.begin.column, 14) # FIXME: 14 != 27 + self.assertEqual(string1.token_last.location.begin.column, 14) # FIXME: 14 != 27 self.assertEqual(string1.token_last.location.end.line, 3) self.assertEqual(string1.token_last.location.end.column, 27) @@ -1659,8 +1659,8 @@ def test_string_locations(self): self.assertEqual(string2.identifier, '$2') self.assertEqual(string2.location.begin.line, 4) self.assertEqual(string2.location.begin.column, 10) - # self.assertEqual(string2.location.end.line, 4) # FIXME: 4 != 8 - # self.assertEqual(string2.location.end.column, 40) # FIXME: 2 != 40 + self.assertEqual(string2.location.end.line, 4) # FIXME: 4 != 8 + self.assertEqual(string2.location.end.column, 40) # FIXME: 2 != 40 self.assertEqual(string2.token_id.type, yaramod.TokenType.StringIdAfterNewline) self.assertEqual(string2.token_id.location.begin.line, 4) self.assertEqual(string2.token_id.location.begin.column, 9) @@ -1671,16 +1671,16 @@ def test_string_locations(self): self.assertEqual(string2.token_assign.location.begin.column, 12) self.assertEqual(string2.token_assign.location.end.line, 4) self.assertEqual(string2.token_assign.location.end.column, 12) - # self.assertEqual(string2.token_first.type, yaramod.TokenType.HexStartBracket) # FIXME: not a TokenType.Rule - # self.assertEqual(string2.token_first.location.begin.line, 4) # FIXME: 4 != 1 - # self.assertEqual(string2.token_first.location.begin.column, 14) # FIXME: 14 != 40 - # self.assertEqual(string2.token_first.location.end.line, 4) # FIXME: 4 != 1 - # self.assertEqual(string2.token_first.location.end.column, 14) # FIXME: 14 != 4 - # self.assertEqual(string2.token_last.type, yaramod.TokenType.HexEndBracket) # FIXME: not a TokenType.NewLine - # self.assertEqual(string2.token_last.location.begin.line, 4) # FIXME: 4 != 8 - # self.assertEqual(string2.token_last.location.begin.column, 40) # FIXME: 40 != 2 - # self.assertEqual(string2.token_last.location.end.line, 4) # FIXME: 4 != 8 - # self.assertEqual(string2.token_last.location.end.column, 40) # FIXME: 40 != 2 + self.assertEqual(string2.token_first.type, yaramod.TokenType.HexStartBracket) # FIXME: not a TokenType.Rule + self.assertEqual(string2.token_first.location.begin.line, 4) # FIXME: 4 != 1 + self.assertEqual(string2.token_first.location.begin.column, 14) # FIXME: 14 != 40 + self.assertEqual(string2.token_first.location.end.line, 4) # FIXME: 4 != 1 + self.assertEqual(string2.token_first.location.end.column, 14) # FIXME: 14 != 4 + self.assertEqual(string2.token_last.type, yaramod.TokenType.HexEndBracket) # FIXME: not a TokenType.NewLine + self.assertEqual(string2.token_last.location.begin.line, 4) # FIXME: 4 != 8 + self.assertEqual(string2.token_last.location.begin.column, 40) # FIXME: 40 != 2 + self.assertEqual(string2.token_last.location.end.line, 4) # FIXME: 4 != 8 + self.assertEqual(string2.token_last.location.end.column, 40) # FIXME: 40 != 2 string3 = rule.strings[2] self.assertEqual(string3.identifier, '$3')