Scanner: Generate error on unbalanced RLO/LRO/PDF override markers.

2023-10-03 13:03:40 +00:00 · 2020-11-18 14:35:16 +01:00 · 2020-11-18 14:35:16 +01:00 · 381c63ec99
commit 381c63ec99
parent 67b7267735
31 changed files with 333 additions and 7 deletions
--- a/Changelog.md
+++ b/Changelog.md
@ -6,6 +6,7 @@ Language Features:
 * Code generator: Support conversion from calldata slices to memory and storage arrays.
 * The fallback function can now also have a single ``calldata`` argument (equaling ``msg.data``) and return ``bytes memory`` (which will not be ABI-encoded but returned as-is).
 * Wasm backend: Add ``i32.select`` and ``i64.select`` instructions.
 * Scanner: Generate a parser error when comments or unicode strings contain an unbalanced or underflowing set of unicode direction override markers (LRO, RLO, LRE, RLE, PDF).
 Compiler Features:
 * Build System: Optionally support dynamic loading of Z3 and use that mechanism for Linux release builds.
--- a/liblangutil/CharStream.h
+++ b/liblangutil/CharStream.h
@ -98,6 +98,20 @@ public:
 	std::tuple<int, int> translatePositionToLineColumn(int _position) const;
 	///@}
 	/// Compares the octets of @a _sequence against the stream contents starting at the
 	/// current position. The stream position itself is not modified by this function's own code.
 	/// @returns false if isPastEndOfInput(_sequence.size()) holds or if any octet differs,
 	///          true if all octets of the sequence match.
 	bool prefixMatch(std::string_view _sequence)
 	{
 		// Bail out before reading anything if the lookahead is reported as out of range.
 		if (isPastEndOfInput(_sequence.size()))
 			return false;
 		size_t lookahead = 0;
 		for (char const expected: _sequence)
 		{
 			if (expected != get(lookahead))
 				return false;
 			++lookahead;
 		}
 		return true;
 	}
 private:
 	std::string m_source;
 	std::string m_name;
--- a/liblangutil/Scanner.cpp
+++ b/liblangutil/Scanner.cpp
@ -54,9 +54,10 @@
 #include <liblangutil/Exceptions.h>
 #include <liblangutil/Scanner.h>
-#include <algorithm>
+#include <boost/algorithm/string/classification.hpp>
 #include <optional>
-#include <ostream>
+#include <string_view>
 #include <tuple>
 using namespace std;
@ -79,6 +80,8 @@ string to_string(ScannerError _errorCode)
 		case ScannerError::IllegalExponent: return "Invalid exponent.";
 		case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
 		case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
 		case ScannerError::DirectionalOverrideUnderflow: return "Unicode direction override underflow in comment or string literal.";
 		case ScannerError::DirectionalOverrideMismatch: return "Mismatching directional override markers in comment or string literal.";
 		default:
 			solAssert(false, "Unhandled case in to_string(ScannerError)");
 			return "";
@ -271,12 +274,61 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
 	return sourcePos() != startPosition;
 }
 namespace
 {
 /// Re-scans the bytes between @a _startPosition and the stream's position on entry, looking for the
 /// UTF-8 encoded direction markers LRO, RLO, LRE and RLE (each opens one level) and PDF (closes one level).
 ///
 /// Note that on underflow the function returns early without restoring the stream position;
 /// in all other cases the stream is set back to the position it had on entry.
 ///
 /// @returns ScannerError::DirectionalOverrideUnderflow as soon as more levels were closed than opened,
 ///          ScannerError::DirectionalOverrideMismatch if levels are still open at the end of the range,
 ///          and ScannerError::NoError if all markers are paired.
 static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
 {
 	// Each entry pairs the UTF-8 byte sequence of a marker with its effect on the nesting depth.
 	static array<pair<string_view, int>, 5> constexpr markers{
 		pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
 		pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
 		pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
 		pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
 		pair<string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting)
 	};
 	size_t const rangeEnd = _stream.position();
 	_stream.setPosition(_startPosition);
 	int depth = 0;
 	size_t offset = _startPosition;
 	while (offset < rangeEnd)
 	{
 		// prefixMatch() tests at the stream's current position, so move the stream to each byte offset in turn.
 		_stream.setPosition(offset);
 		for (auto const& marker: markers)
 			if (_stream.prefixMatch(marker.first))
 				depth += marker.second;
 		// A closing marker without a matching opener: report immediately.
 		if (depth < 0)
 			return ScannerError::DirectionalOverrideUnderflow;
 		++offset;
 	}
 	_stream.setPosition(rangeEnd);
 	if (depth > 0)
 		return ScannerError::DirectionalOverrideMismatch;
 	return ScannerError::NoError;
 }
 }
 /// Advances until isUnicodeLinebreak() holds or advance() reports failure, then checks the
 /// range that was skipped for unbalanced directional override markers.
 /// @returns Token::Whitespace if validateBiDiMarkup() reports no error, otherwise the
 ///          result of setError() for the reported ScannerError.
 Token Scanner::skipSingleLineComment()
 {
 	// Line terminator is not part of the comment. If it is a
 	// non-ascii line terminator, it will result in a parser error.
 	size_t startPosition = m_source->position();
 	while (!isUnicodeLinebreak())
-		if (!advance()) break;
+		if (!advance())
 			break;
 	// Validate everything between startPosition and the current stream position.
 	ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
 	if (unicodeDirectionError != ScannerError::NoError)
 		return setError(unicodeDirectionError);
 	return Token::Whitespace;
 }
@ -349,16 +401,21 @@ size_t Scanner::scanSingleLineDocComment()
 Token Scanner::skipMultiLineComment()
 {
 	size_t startPosition = m_source->position();
 	while (!isSourcePastEndOfInput())
 	{
-		char ch = m_char;
+		char prevChar = m_char;
 		advance();
 		// If we have reached the end of the multi-line comment, we
 		// consume the '/' and insert a whitespace. This way all
 		// multi-line comments are treated as whitespace.
-		if (ch == '*' && m_char == '/')
+		if (prevChar == '*' && m_char == '/')
 		{
 			ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
 			if (unicodeDirectionError != ScannerError::NoError)
 				return setError(unicodeDirectionError);
 			m_char = ' ';
 			return Token::Whitespace;
 		}
@ -785,6 +842,7 @@ bool Scanner::isUnicodeLinebreak()
 Token Scanner::scanString(bool const _isUnicode)
 {
 	size_t startPosition = m_source->position();
 	char const quote = m_char;
 	advance();  // consume quote
 	LiteralScope literal(this, LITERAL_TYPE_STRING);
@ -812,6 +870,14 @@ Token Scanner::scanString(bool const _isUnicode)
 	}
 	if (m_char != quote)
 		return setError(ScannerError::IllegalStringEndQuote);
 	if (_isUnicode)
 	{
 		ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
 		if (unicodeDirectionError != ScannerError::NoError)
 			return setError(unicodeDirectionError);
 	}
 	literal.complete();
 	advance();  // consume quote
 	return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
--- a/liblangutil/Scanner.h
+++ b/liblangutil/Scanner.h
@ -89,6 +89,9 @@ enum class ScannerError
 	IllegalExponent,
 	IllegalNumberEnd,
 	DirectionalOverrideUnderflow,
 	DirectionalOverrideMismatch,
 	OctalNotAllowed,
 };
@ -183,6 +186,7 @@ public:
 	///@}
 private:
 	inline Token setError(ScannerError _error) noexcept
 	{
 		m_tokens[NextNext].error = _error;
--- a/scripts/check_style.sh
+++ b/scripts/check_style.sh
@ -6,7 +6,10 @@
 REPO_ROOT="$(dirname "$0")"/..
 cd $REPO_ROOT
-WHITESPACE=$(git grep -n -I -E "^.*[[:space:]]+$" | grep -v "test/libsolidity/ASTJSON\|test/libsolidity/ASTRecoveryTests\|test/compilationTests/zeppelin/LICENSE")
+WHITESPACE=$(git grep -n -I -E "^.*[[:space:]]+$" |
  grep -v "test/libsolidity/ASTJSON\|test/libsolidity/ASTRecoveryTests\|test/compilationTests/zeppelin/LICENSE" |
  grep -v -E "test/libsolidity/syntaxTests/comments/unicode_direction_override_1.sol"
 )
 if [[ "$WHITESPACE" != "" ]]
 then
--- a/scripts/test_antlr_grammar.sh
+++ b/scripts/test_antlr_grammar.sh
@ -116,7 +116,10 @@ done < <(
  grep -riL -E \
    "^\/\/ (Syntax|Type|Declaration)Error|^\/\/ ParserError (2837|3716|3997|5333|6275|6281|6933|7319)|^==== Source:" \
    "${ROOT_DIR}/test/libsolidity/syntaxTests" \
-    "${ROOT_DIR}/test/libsolidity/semanticTests" \
+    "${ROOT_DIR}/test/libsolidity/semanticTests" |
      grep -v -E 'comments/.*_direction_override.*.sol' |
      grep -v -E 'literals/.*_direction_override.*.sol'
      # Skipping the unicode tests as I couldn't adapt the lexical grammar to recursively count RLO/LRO/PDF markers.
 )
 YUL_FILES=()
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_1.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_1.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF
        /*underflow ‬*/
    }
 }
 // ----
 // ParserError 8936: (71-83): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_2.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_2.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF PDF
        /*underflow ‬‬*/
    }
 }
 // ----
 // ParserError 8936: (75-87): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_3.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_3.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO
        /*overflow ‮*/
    }
 }
 // ----
 // ParserError 8936: (71-86): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_4.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_4.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO RLO
        /*overflow ‮‮*/
    }
 }
 // ----
 // ParserError 8936: (75-93): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_5.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_5.sol
@ -0,0 +1,14 @@
 contract C {
    function f() public pure
    {
        // RLO PDF
        /*ok ‮‬*/
        // RLO RLO PDF PDF
        /*ok ‮‮‬‬*/
        // RLO RLO RLO PDF PDF PDF
        /*ok ‮‮‮‬‬‬*/
    }
 }
 // ----
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_6.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_6.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF RLO
        /*overflow ‬‮*/
    }
 }
 // ----
 // ParserError 8936: (75-86): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_7.sol
+++ b/test/libsolidity/syntaxTests/comments/multiline_unicode_direction_override_7.sol
@ -0,0 +1,7 @@
 contract C {
    function f() public pure {
        /* LRO‭ LRE‪ RLE ‫  PDF‬ RLO‮ PDF ‬ PDF‬
    }
 }
 // ----
 // ParserError 8936: (52-115): Expected multi-line comment-terminator.
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_1.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_1.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF
        // underflow ‬
    }
 }
 // ----
 // ParserError 8936: (71-84): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_2.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_2.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF PDF
        // underflow ‬‬
    }
 }
 // ----
 // ParserError 8936: (75-88): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_3.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_3.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO
        // overflow ‮
    }
 }
 // ----
 // ParserError 8936: (71-86): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_4.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_4.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO RLO
        // overflow ‮‮
    }
 }
 // ----
 // ParserError 8936: (75-93): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_5.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_5.sol
@ -0,0 +1,14 @@
 contract C {
    function f() public pure
    {
        // RLO PDF
        // ok ‮‬
        // RLO RLO PDF PDF
        // ok ‮‮‬‬
        // RLO RLO RLO PDF PDF PDF
        // ok ‮‮‮‬‬‬
    }
 }
 // ----
--- a/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_6.sol
+++ b/test/libsolidity/syntaxTests/comments/singleline_unicode_direction_override_6.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF RLO
        // underflow ‬‮
    }
 }
 // ----
 // ParserError 8936: (75-88): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/comments/unicode_direction_in_source_1.sol
+++ b/test/libsolidity/syntaxTests/comments/unicode_direction_in_source_1.sol
@ -0,0 +1,8 @@
 contract C {
    function f(bool b) public pure
    {
        if ‬(b) { return; }
    }
 }
 // ----
 // ParserError 2314: (65-66): Expected '(' but got 'ILLEGAL'
--- a/test/libsolidity/syntaxTests/comments/unicode_direction_in_source_2.sol
+++ b/test/libsolidity/syntaxTests/comments/unicode_direction_in_source_2.sol
@ -0,0 +1,8 @@
 contract C {
    function f(bool b) public pure
    {
        uint a = 10; ‬
    }
 }
 // ----
 // ParserError 8936: (75-76): Invalid token.
--- a/test/libsolidity/syntaxTests/comments/unicode_direction_override_1.sol
+++ b/test/libsolidity/syntaxTests/comments/unicode_direction_override_1.sol
@ -0,0 +1,10 @@
 contract TimelockUpgrade {
    function confirmUpgrade() external {
        uint256 m;
        uint256 d;
        (/*year*/,/*month‮*/,d/*yad*/,m/*‬‬hour*/,/*minute*/,/*second*/) = BokkyDateTime.timestampToDateTime(block.timestamp);
    }
 }
 // ----
 // ParserError 8936: (128-139): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_1.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_1.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF
        bytes memory s = unicode"underflow ‬";
    }
 }
 // ----
 // ParserError 8936: (88-106): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_2.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_2.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF PDF
        bytes memory m = unicode"underflow ‬‬";
    }
 }
 // ----
 // ParserError 8936: (92-110): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_3.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_3.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO
        bytes memory m = unicode"overflow ‮";
    }
 }
 // ----
 // ParserError 8936: (88-108): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_4.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_4.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // RLO RLO
        bytes memory m = unicode"overflow ‮‮";
    }
 }
 // ----
 // ParserError 8936: (92-115): Mismatching directional override markers in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_5.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_5.sol
@ -0,0 +1,14 @@
 contract C {
    function f() public pure
    {
        // RLO PDF
        bytes memory m = unicode" ok ‮‬";
        // RLO RLO PDF PDF
        m = unicode" ok ‮‮‬‬";
        // RLO RLO RLO PDF PDF PDF
        m = unicode" ok ‮‮‮‬‬‬";
    }
 }
 // ----
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_6.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_6.sol
@ -0,0 +1,9 @@
 contract C {
    function f() public pure
    {
        // PDF RLO
        bytes memory m = unicode" underflow ‬‮";
    }
 }
 // ----
 // ParserError 8936: (92-111): Unicode direction override underflow in comment or string literal.
--- a/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_7.sol
+++ b/test/libsolidity/syntaxTests/literals/unicode_string_direction_override_7.sol
@ -0,0 +1,13 @@
 contract C {
    function f() public pure
    {
        // LRO PDF RLO PDF
        bytes memory m = unicode"‭ ok ‬‮‬";
        // lre rle pdf pdf
        m = unicode"lre‪ rle‫ pdf‬ pdf‬";
        // lre lro pdf pdf
        m = unicode"lre‪ lro‭ pdf‬ pdf‬";
    }
 }
 // ----
--- a/test/libyul/yulSyntaxTests/invalid/unicode_comment_direction_override.sol
+++ b/test/libyul/yulSyntaxTests/invalid/unicode_comment_direction_override.sol
@ -0,0 +1,6 @@
 {
    // pop 1
    // underflow ‬
 }
 // ----
 // ParserError 1856: (19-32): Literal or identifier expected.
--- a/test/libyul/yulSyntaxTests/invalid/unicode_string_direction_override.sol
+++ b/test/libyul/yulSyntaxTests/invalid/unicode_string_direction_override.sol
@ -0,0 +1,6 @@
 {
    // pop 1
    let s := unicode"underflow ‬";
 }
 // ----
 // ParserError 1856: (35-47): Literal or identifier expected.