Add check that regular and unicode string literals are well formatted

This commit is contained in:
Alex Beregszaszi 2020-07-02 17:39:04 +01:00
parent 6fe8e63eee
commit 6eb60bc8cd
9 changed files with 39 additions and 7 deletions

View File

@ -8,6 +8,7 @@ Breaking changes:
* Parser: Disallow ``gwei`` as identifier.
* Parser: Disallow dot syntax for ``value`` and ``gas``.
* Parser: Disallow non-printable characters in string literals.
* Parser: Introduce Unicode string literals: ``unicode"😃"``.
* Parser: NatSpec comments on variables are only allowed for public state variables.
* Parser: Remove the ``finney`` and ``szabo`` denominations.
* Parser: Remove the identifier ``now`` (replaced by ``block.timestamp``).

View File

@ -509,7 +509,7 @@ void Scanner::scanToken()
{
case '"':
case '\'':
token = scanString();
token = scanString(false);
break;
case '<':
// < <= << <<=
@ -684,6 +684,18 @@ void Scanner::scanToken()
else
token = setError(ScannerError::IllegalToken);
}
else if (token == Token::Unicode)
{
// reset
m = 0;
n = 0;
// Special quoted unicode string must follow
if (m_char == '"' || m_char == '\'')
token = scanString(true);
else
token = setError(ScannerError::IllegalToken);
}
}
else if (isDecimalDigit(m_char))
token = scanNumber();
@ -775,7 +787,7 @@ bool Scanner::isUnicodeLinebreak()
return false;
}
Token Scanner::scanString()
Token Scanner::scanString(bool const _isUnicode)
{
char const quote = m_char;
advance(); // consume quote
@ -791,11 +803,13 @@ Token Scanner::scanString()
}
else
{
// Report error on non-printable characters in string literals.
// Report an error on non-printable characters in regular string literals.
// Unicode string literals may contain anything here, because their validity
// is verified later (in the syntax checker).
//
// We are using a manual range and not isprint() to avoid
// any potential complications with locale.
if (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f)
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
return setError(ScannerError::IllegalCharacterInString);
addLiteralChar(c);
}
@ -804,7 +818,7 @@ Token Scanner::scanString()
return setError(ScannerError::IllegalStringEndQuote);
literal.complete();
advance(); // consume quote
return Token::StringLiteral;
return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
}
Token Scanner::scanHexString()

View File

@ -229,7 +229,7 @@ private:
Token scanNumber(char _charSeen = 0);
std::tuple<Token, unsigned, unsigned> scanIdentifierOrKeyword();
Token scanString();
Token scanString(bool const _isUnicode);
Token scanHexString();
/// Scans a single line comment and returns its corrected end position.
size_t scanSingleLineDocComment();

View File

@ -190,6 +190,7 @@ namespace solidity::langutil
K(Throw, "throw", 0) \
K(Try, "try", 0) \
K(Type, "type", 0) \
K(Unicode, "unicode", 0) \
K(Using, "using", 0) \
K(View, "view", 0) \
K(Virtual, "virtual", 0) \
@ -227,6 +228,7 @@ namespace solidity::langutil
K(FalseLiteral, "false", 0) \
T(Number, nullptr, 0) \
T(StringLiteral, nullptr, 0) \
T(UnicodeStringLiteral, nullptr, 0) \
T(HexStringLiteral, nullptr, 0) \
T(CommentLiteral, nullptr, 0) \
\

View File

@ -28,6 +28,8 @@
#include <liblangutil/ErrorReporter.h>
#include <liblangutil/SemVerHandler.h>
#include <libsolutil/UTF8.h>
#include <boost/algorithm/string.hpp>
#include <memory>
@ -37,7 +39,7 @@ using namespace std;
using namespace solidity;
using namespace solidity::langutil;
using namespace solidity::frontend;
using namespace solidity::util;
bool SyntaxChecker::checkSyntax(ASTNode const& _astRoot)
{
@ -217,6 +219,13 @@ bool SyntaxChecker::visit(Throw const& _throwStatement)
bool SyntaxChecker::visit(Literal const& _literal)
{
if ((_literal.token() == Token::UnicodeStringLiteral) && !validateUTF8(_literal.value()))
m_errorReporter.syntaxError(
8452_error,
_literal.location(),
"Invalid UTF-8 sequence found"
);
if (_literal.token() != Token::Number)
return true;

View File

@ -920,6 +920,8 @@ string ASTJsonConverter::literalTokenKind(Token _token)
return "number";
case Token::StringLiteral:
return "string";
case Token::UnicodeStringLiteral:
return "unicodeString";
case Token::HexStringLiteral:
return "hexString";
case Token::TrueLiteral:

View File

@ -943,6 +943,8 @@ Token ASTJsonImporter::literalTokenKind(Json::Value const& _node)
tok = Token::Number;
else if (_node["kind"].asString() == "string")
tok = Token::StringLiteral;
else if (_node["kind"].asString() == "unicodeString")
tok = Token::UnicodeStringLiteral;
else if (_node["kind"].asString() == "hexString")
tok = Token::HexStringLiteral;
else if (_node["kind"].asString() == "bool")

View File

@ -349,6 +349,7 @@ TypePointer TypeProvider::forLiteral(Literal const& _literal)
case Token::Number:
return rationalNumber(_literal);
case Token::StringLiteral:
case Token::UnicodeStringLiteral:
case Token::HexStringLiteral:
return stringLiteral(_literal.value());
default:

View File

@ -1782,6 +1782,7 @@ ASTPointer<Expression> Parser::parsePrimaryExpression()
}
break;
case Token::StringLiteral:
case Token::UnicodeStringLiteral:
case Token::HexStringLiteral:
{
string literal = m_scanner->currentLiteral();