diff --git a/Changelog.md b/Changelog.md index 5d78caeba..7541aad9c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -8,6 +8,7 @@ Breaking changes: * Parser: Disallow ``gwei`` as identifier. * Parser: Disallow dot syntax for ``value`` and ``gas``. * Parser: Disallow non-printable characters in string literals. + * Parser: Introduce Unicode string literals: ``unicode"😃"``. * Parser: NatSpec comments on variables are only allowed for public state variables. * Parser: Remove the ``finney`` and ``szabo`` denominations. * Parser: Remove the identifier ``now`` (replaced by ``block.timestamp``). diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index e526abb35..3e125f04e 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -509,7 +509,7 @@ void Scanner::scanToken() { case '"': case '\'': - token = scanString(); + token = scanString(false); break; case '<': // < <= << <<= @@ -684,6 +684,18 @@ void Scanner::scanToken() else token = setError(ScannerError::IllegalToken); } + else if (token == Token::Unicode) + { + // reset + m = 0; + n = 0; + + // Special quoted unicode string must follow + if (m_char == '"' || m_char == '\'') + token = scanString(true); + else + token = setError(ScannerError::IllegalToken); + } } else if (isDecimalDigit(m_char)) token = scanNumber(); @@ -775,7 +787,7 @@ bool Scanner::isUnicodeLinebreak() return false; } -Token Scanner::scanString() +Token Scanner::scanString(bool const _isUnicode) { char const quote = m_char; advance(); // consume quote @@ -791,11 +803,13 @@ Token Scanner::scanString() } else { - // Report error on non-printable characters in string literals. + // Report error on non-printable characters in string literals, however + // allow anything for unicode string literals, because their validity will + // be verified later (in the syntax checker). // // We are using a manual range and not isprint() to avoid // any potential complications with locale. 
- if (static_cast(c) <= 0x1f || static_cast(c) >= 0x7f) + if (!_isUnicode && (static_cast(c) <= 0x1f || static_cast(c) >= 0x7f)) return setError(ScannerError::IllegalCharacterInString); addLiteralChar(c); } @@ -804,7 +818,7 @@ Token Scanner::scanString() return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote - return Token::StringLiteral; + return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral; } Token Scanner::scanHexString() diff --git a/liblangutil/Scanner.h b/liblangutil/Scanner.h index e72ac3ba5..24ec098ab 100644 --- a/liblangutil/Scanner.h +++ b/liblangutil/Scanner.h @@ -229,7 +229,7 @@ private: Token scanNumber(char _charSeen = 0); std::tuple scanIdentifierOrKeyword(); - Token scanString(); + Token scanString(bool const _isUnicode); Token scanHexString(); /// Scans a single line comment and returns its corrected end position. size_t scanSingleLineDocComment(); diff --git a/liblangutil/Token.h b/liblangutil/Token.h index 0daceb68d..366c354cd 100644 --- a/liblangutil/Token.h +++ b/liblangutil/Token.h @@ -190,6 +190,7 @@ namespace solidity::langutil K(Throw, "throw", 0) \ K(Try, "try", 0) \ K(Type, "type", 0) \ + K(Unicode, "unicode", 0) \ K(Using, "using", 0) \ K(View, "view", 0) \ K(Virtual, "virtual", 0) \ @@ -227,6 +228,7 @@ namespace solidity::langutil K(FalseLiteral, "false", 0) \ T(Number, nullptr, 0) \ T(StringLiteral, nullptr, 0) \ + T(UnicodeStringLiteral, nullptr, 0) \ T(HexStringLiteral, nullptr, 0) \ T(CommentLiteral, nullptr, 0) \ \ diff --git a/libsolidity/analysis/SyntaxChecker.cpp b/libsolidity/analysis/SyntaxChecker.cpp index 7f27e11ab..c0380ffe5 100644 --- a/libsolidity/analysis/SyntaxChecker.cpp +++ b/libsolidity/analysis/SyntaxChecker.cpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include @@ -37,7 +39,7 @@ using namespace std; using namespace solidity; using namespace solidity::langutil; using namespace solidity::frontend; - +using namespace solidity::util; bool 
SyntaxChecker::checkSyntax(ASTNode const& _astRoot) { @@ -217,6 +219,13 @@ bool SyntaxChecker::visit(Throw const& _throwStatement) bool SyntaxChecker::visit(Literal const& _literal) { + if ((_literal.token() == Token::UnicodeStringLiteral) && !validateUTF8(_literal.value())) + m_errorReporter.syntaxError( + 8452_error, + _literal.location(), + "Invalid UTF-8 sequence found" + ); + if (_literal.token() != Token::Number) return true; diff --git a/libsolidity/ast/ASTJsonConverter.cpp b/libsolidity/ast/ASTJsonConverter.cpp index 193d61375..5403b05c5 100644 --- a/libsolidity/ast/ASTJsonConverter.cpp +++ b/libsolidity/ast/ASTJsonConverter.cpp @@ -920,6 +920,8 @@ string ASTJsonConverter::literalTokenKind(Token _token) return "number"; case Token::StringLiteral: return "string"; + case Token::UnicodeStringLiteral: + return "unicodeString"; case Token::HexStringLiteral: return "hexString"; case Token::TrueLiteral: diff --git a/libsolidity/ast/ASTJsonImporter.cpp b/libsolidity/ast/ASTJsonImporter.cpp index d3b54d687..2d1fda322 100644 --- a/libsolidity/ast/ASTJsonImporter.cpp +++ b/libsolidity/ast/ASTJsonImporter.cpp @@ -943,6 +943,8 @@ Token ASTJsonImporter::literalTokenKind(Json::Value const& _node) tok = Token::Number; else if (_node["kind"].asString() == "string") tok = Token::StringLiteral; + else if (_node["kind"].asString() == "unicodeString") + tok = Token::UnicodeStringLiteral; else if (_node["kind"].asString() == "hexString") tok = Token::HexStringLiteral; else if (_node["kind"].asString() == "bool") diff --git a/libsolidity/ast/TypeProvider.cpp b/libsolidity/ast/TypeProvider.cpp index c7ef3cfbd..3a06b2339 100644 --- a/libsolidity/ast/TypeProvider.cpp +++ b/libsolidity/ast/TypeProvider.cpp @@ -349,6 +349,7 @@ TypePointer TypeProvider::forLiteral(Literal const& _literal) case Token::Number: return rationalNumber(_literal); case Token::StringLiteral: + case Token::UnicodeStringLiteral: case Token::HexStringLiteral: return stringLiteral(_literal.value()); default: diff 
--git a/libsolidity/parsing/Parser.cpp b/libsolidity/parsing/Parser.cpp index 16ed2ae37..cfb7de46e 100644 --- a/libsolidity/parsing/Parser.cpp +++ b/libsolidity/parsing/Parser.cpp @@ -1782,6 +1782,7 @@ ASTPointer Parser::parsePrimaryExpression() } break; case Token::StringLiteral: + case Token::UnicodeStringLiteral: case Token::HexStringLiteral: { string literal = m_scanner->currentLiteral();