/*
* This file is part of solidity.
*
* solidity is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* solidity is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with solidity. If not, see <http://www.gnu.org/licenses/>.
*
* This file is derived from the file "scanner.cc", which was part of the
* V8 project. The original copyright header follows:
*
* Copyright 2006-2012, the V8 project authors. All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @author Christian <c@ethdev.com>
* @date 2014
* Solidity scanner.
*/
#include <liblangutil/Common.h>
#include <liblangutil/Exceptions.h>
#include <liblangutil/Scanner.h>
#include <boost/algorithm/string/classification.hpp>
#include <optional>
#include <string_view>
#include <tuple>
#include <array>
namespace solidity::langutil
{
/// Translates a scanner error code into its human-readable diagnostic message.
/// An error code without a dedicated message trips an assertion.
std::string to_string(ScannerError _errorCode)
{
	std::string message;
	switch (_errorCode)
	{
	case ScannerError::NoError:
		message = "No error.";
		break;
	case ScannerError::IllegalToken:
		message = "Invalid token.";
		break;
	case ScannerError::IllegalHexString:
		message = "Expected even number of hex-nibbles.";
		break;
	case ScannerError::IllegalHexDigit:
		message = "Hexadecimal digit missing or invalid.";
		break;
	case ScannerError::IllegalCommentTerminator:
		message = "Expected multi-line comment-terminator.";
		break;
	case ScannerError::IllegalEscapeSequence:
		message = "Invalid escape sequence.";
		break;
	case ScannerError::UnicodeCharacterInNonUnicodeString:
		message = "Invalid character in string. If you are trying to use Unicode characters, use a unicode\"...\" string literal.";
		break;
	case ScannerError::IllegalCharacterInString:
		message = "Invalid character in string.";
		break;
	case ScannerError::IllegalStringEndQuote:
		message = "Expected string end-quote.";
		break;
	case ScannerError::IllegalNumberSeparator:
		message = "Invalid use of number separator '_'.";
		break;
	case ScannerError::IllegalExponent:
		message = "Invalid exponent.";
		break;
	case ScannerError::IllegalNumberEnd:
		message = "Identifier-start is not allowed at end of a number.";
		break;
	case ScannerError::OctalNotAllowed:
		message = "Octal numbers not allowed.";
		break;
	case ScannerError::DirectionalOverrideUnderflow:
		message = "Unicode direction override underflow in comment or string literal.";
		break;
	case ScannerError::DirectionalOverrideMismatch:
		message = "Mismatching directional override markers in comment or string literal.";
		break;
	default:
		solAssert(false, "Unhandled case in to_string(ScannerError)");
		break;
	}
	return message;
}
std::ostream& operator<<(std::ostream& os, ScannerError _errorCode)
{
2019-04-18 11:17:11 +00:00
return os << to_string(_errorCode);
}
/// Scoped helper for literal recording. Automatically drops the literal
/// if aborting the scanning before it's complete.
2019-04-18 11:17:11 +00:00
enum LiteralType
{
2014-11-30 22:25:42 +00:00
LITERAL_TYPE_STRING,
LITERAL_TYPE_NUMBER, // not really different from string type in behaviour
LITERAL_TYPE_COMMENT
};
class LiteralScope
{
public:
2019-04-18 11:17:11 +00:00
explicit LiteralScope(Scanner* _self, enum LiteralType _type):
m_type(_type),
m_scanner(_self),
m_complete(false)
2014-11-30 22:25:42 +00:00
{
if (_type == LITERAL_TYPE_COMMENT)
2020-01-22 19:10:56 +00:00
m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
2014-11-30 22:25:42 +00:00
else
2020-01-22 19:10:56 +00:00
m_scanner->m_tokens[Scanner::NextNext].literal.clear();
2014-11-30 22:25:42 +00:00
}
~LiteralScope()
{
if (!m_complete)
{
if (m_type == LITERAL_TYPE_COMMENT)
2020-01-22 19:10:56 +00:00
m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
2014-11-30 22:25:42 +00:00
else
2020-01-22 19:10:56 +00:00
m_scanner->m_tokens[Scanner::NextNext].literal.clear();
2014-11-30 22:25:42 +00:00
}
}
void complete() { m_complete = true; }
private:
enum LiteralType m_type;
Scanner* m_scanner;
bool m_complete;
2019-04-18 11:17:11 +00:00
};
void Scanner::reset()
{
2021-07-14 10:53:39 +00:00
m_source.reset();
m_kind = ScannerKind::Solidity;
2021-07-14 10:53:39 +00:00
m_char = m_source.get();
2014-10-09 10:28:37 +00:00
skipWhitespace();
2014-12-01 00:05:55 +00:00
next();
next();
2020-01-22 19:10:56 +00:00
next();
}
void Scanner::setPosition(size_t _offset)
{
2021-07-14 10:53:39 +00:00
m_char = m_source.setPosition(_offset);
scanToken();
next();
2020-01-22 19:10:56 +00:00
next();
}
bool Scanner::scanHexByte(char& o_scannedByte)
{
2014-10-09 10:28:37 +00:00
char x = 0;
for (size_t i = 0; i < 2; i++)
2014-10-16 12:08:54 +00:00
{
int d = hexValue(m_char);
2014-10-16 12:08:54 +00:00
if (d < 0)
{
2014-10-09 10:28:37 +00:00
rollback(i);
return false;
}
2020-06-05 12:30:57 +00:00
x = static_cast<char>(x * 16 + d);
2014-10-09 10:28:37 +00:00
advance();
}
2014-11-05 13:20:56 +00:00
o_scannedByte = x;
2014-10-09 10:28:37 +00:00
return true;
}
/// Reads exactly four hexadecimal digits and returns the number they denote.
/// @returns an empty optional if one of the four characters is not a hex
/// digit. In that case the digits consumed so far are rolled back.
std::optional<unsigned> Scanner::scanUnicode()
{
	unsigned value = 0;
	size_t consumed = 0;
	while (consumed < 4)
	{
		int const digit = hexValue(m_char);
		if (digit < 0)
		{
			rollback(consumed);
			return std::nullopt;
		}
		value = value * 16 + static_cast<unsigned>(digit);
		advance();
		++consumed;
	}
	return value;
}
/// Appends @a codepoint to the current literal, encoded as UTF-8.
///
/// Code points up to U+FFFF produce one to three bytes, exactly as before.
/// Code points above U+FFFF now produce the four-byte form instead of being
/// truncated into a malformed three-byte sequence.
/// Neither surrogates (U+D800..U+DFFF) nor values above U+10FFFF are
/// rejected here.
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
{
	if (codepoint <= 0x7f)
		addLiteralChar(static_cast<char>(codepoint));
	else if (codepoint <= 0x7ff)
	{
		addLiteralChar(static_cast<char>(0xc0u | (codepoint >> 6u)));
		addLiteralChar(static_cast<char>(0x80u | (codepoint & 0x3fu)));
	}
	else if (codepoint <= 0xffff)
	{
		addLiteralChar(static_cast<char>(0xe0u | (codepoint >> 12u)));
		addLiteralChar(static_cast<char>(0x80u | ((codepoint >> 6u) & 0x3fu)));
		addLiteralChar(static_cast<char>(0x80u | (codepoint & 0x3fu)));
	}
	else
	{
		// Supplementary planes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		addLiteralChar(static_cast<char>(0xf0u | (codepoint >> 18u)));
		addLiteralChar(static_cast<char>(0x80u | ((codepoint >> 12u) & 0x3fu)));
		addLiteralChar(static_cast<char>(0x80u | ((codepoint >> 6u) & 0x3fu)));
		addLiteralChar(static_cast<char>(0x80u | (codepoint & 0x3fu)));
	}
}
void Scanner::rescan()
{
size_t rollbackTo = 0;
2020-01-22 19:10:56 +00:00
if (m_skippedComments[Current].literal.empty())
rollbackTo = static_cast<size_t>(m_tokens[Current].location.start);
else
rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start);
2021-07-14 10:53:39 +00:00
m_char = m_source.rollback(m_source.position() - rollbackTo);
next();
next();
2020-01-22 19:10:56 +00:00
next();
}
// Compile-time check that there are at most 0x100 (256) token kinds, so that
// tokens can be stored in a byte.
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
/// Advances the scanner by one token: shifts the look-ahead window
/// (Next becomes Current, NextNext becomes Next), scans a fresh NextNext
/// token and returns the new current token.
Token Scanner::next()
{
	// Tokens and skipped comments are shifted in lock-step.
	m_tokens[Current] = std::move(m_tokens[Next]);
	m_skippedComments[Current] = std::move(m_skippedComments[Next]);

	m_tokens[Next] = std::move(m_tokens[NextNext]);
	m_skippedComments[Next] = std::move(m_skippedComments[NextNext]);

	scanToken();

	return m_tokens[Current].token;
}
/// Consumes the current character and then decides between two tokens:
/// if the following character equals @a _next, the result is that of
/// selectToken(_then); otherwise @a _else is returned.
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
	advance();
	return m_char == _next ? selectToken(_then) : _else;
}
bool Scanner::skipWhitespace()
{
size_t const startPosition = sourcePos();
while (isWhiteSpace(m_char))
2014-10-16 12:08:54 +00:00
advance();
2014-10-09 10:28:37 +00:00
// Return whether or not we skipped any characters.
2015-08-31 16:44:29 +00:00
return sourcePos() != startPosition;
}
/// Consumes whitespace but stops in front of anything that
/// isUnicodeLinebreak() recognises as a line break.
/// @returns true iff the source position changed, i.e. something was skipped.
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
{
	size_t const positionBefore = sourcePos();
	for (; isWhiteSpace(m_char) && !isUnicodeLinebreak(); advance())
	{
	}
	return positionBefore != sourcePos();
}
namespace
{

/// Scans the bytes in [_startPosition, current stream position) for the
/// UTF-8 encodings of RLO/LRO/RLE/LRE/PDF and keeps track of the script
/// writing direction override depth.
///
/// The stream position is always restored to where it was on entry,
/// including on the underflow path (which previously left the stream
/// rewound inside the scanned range).
///
/// @returns ScannerError::NoError if all directional overrides are paired,
/// ScannerError::DirectionalOverrideUnderflow if a PDF occurs without a
/// preceding override, and ScannerError::DirectionalOverrideMismatch if
/// overrides are still open at the end of the range.
ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
{
	static std::array<std::pair<std::string_view, int>, 5> constexpr directionalSequences{
		std::pair<std::string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
		std::pair<std::string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
		std::pair<std::string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
		std::pair<std::string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
		std::pair<std::string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting)
	};

	size_t const endPosition = _stream.position();
	ScannerError result = ScannerError::NoError;
	int directionOverrideDepth = 0;

	for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
	{
		_stream.setPosition(currentPos);

		for (auto const& [sequence, depthChange]: directionalSequences)
			if (_stream.prefixMatch(sequence))
				directionOverrideDepth += depthChange;

		if (directionOverrideDepth < 0)
		{
			result = ScannerError::DirectionalOverrideUnderflow;
			break;
		}
	}

	if (result == ScannerError::NoError && directionOverrideDepth > 0)
		result = ScannerError::DirectionalOverrideMismatch;

	// Single exit: the caller's character cursor assumes the stream is still
	// at the end of the scanned range, whatever the outcome.
	_stream.setPosition(endPosition);
	return result;
}

}
/// Skips the remainder of a single-line comment and validates the
/// directional override markers inside the skipped range.
/// @returns Token::Whitespace, or the error token if the markers are unbalanced.
Token Scanner::skipSingleLineComment()
{
	// Line terminator is not part of the comment. If it is a
	// non-ascii line terminator, it will result in a parser error.
	size_t const commentStart = m_source.position();
	while (!isUnicodeLinebreak() && advance())
	{
	}

	ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
	return bidiError == ScannerError::NoError ? Token::Whitespace : setError(bidiError);
}
/// @returns true iff the current character is a line feed or a carriage return.
bool Scanner::atEndOfLine() const
{
	switch (m_char)
	{
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
/// Consumes one line terminator ("\n", "\r" or "\r\n") if the scanner is
/// positioned at one.
/// @returns true iff a line terminator was consumed.
bool Scanner::tryScanEndOfLine()
{
	if (m_char != '\n' && m_char != '\r')
		return false;

	bool const wasCarriageReturn = m_char == '\r';
	bool const moreInput = advance();
	// Treat "\r\n" as a single line terminator.
	if (wasCarriageReturn && moreInput && m_char == '\n')
		advance();
	return true;
}
size_t Scanner::scanSingleLineDocComment()
{
LiteralScope literal(this, LITERAL_TYPE_COMMENT);
2021-07-14 10:53:39 +00:00
size_t endPosition = m_source.position();
skipWhitespaceExceptUnicodeLinebreak();
while (!isSourcePastEndOfInput())
{
2021-07-14 10:53:39 +00:00
endPosition = m_source.position();
if (tryScanEndOfLine())
{
// Check if next line is also a single-line comment.
// If any whitespaces were skipped, use source position before.
if (!skipWhitespaceExceptUnicodeLinebreak())
2021-07-14 10:53:39 +00:00
endPosition = m_source.position();
2021-07-14 10:53:39 +00:00
if (!m_source.isPastEndOfInput(3) &&
m_source.get(0) == '/' &&
m_source.get(1) == '/' &&
m_source.get(2) == '/')
{
2021-07-14 10:53:39 +00:00
if (!m_source.isPastEndOfInput(4) && m_source.get(3) == '/')
2020-06-13 00:02:32 +00:00
break; // "////" is not a documentation comment
2021-07-14 10:53:39 +00:00
m_char = m_source.advanceAndGet(3);
if (atEndOfLine())
continue;
addCommentLiteralChar('\n');
}
else
break; // next line is not a documentation comment, we are done
}
else if (isUnicodeLinebreak())
// Any line terminator that is not '\n' is considered to end the
// comment.
break;
2014-11-21 08:09:39 +00:00
addCommentLiteralChar(m_char);
advance();
}
literal.complete();
return endPosition;
}
/// Skips a multi-line comment up to and including its terminating "*/" and
/// validates the directional override markers inside the skipped range.
/// @returns Token::Whitespace on success, otherwise the error token (for
/// unbalanced markers or a missing terminator).
Token Scanner::skipMultiLineComment()
{
	size_t const commentStart = m_source.position();
	for (char previous = m_char; !isSourcePastEndOfInput(); previous = m_char)
	{
		advance();
		if (previous != '*' || m_char != '/')
			continue;

		// End of the comment reached.
		ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);

		// Replace the closing '/' by a blank. This way all multi-line
		// comments are treated as whitespace.
		m_char = ' ';
		return Token::Whitespace;
	}

	// Unterminated multi-line comment.
	return setError(ScannerError::IllegalCommentTerminator);
}
/// Scans the text of a "/**" documentation comment (the opener has already
/// been consumed by the caller) and records it as a comment literal.
/// A single leading '*' on continuation lines is dropped and lines are
/// joined with '\n'.
/// @returns Token::CommentLiteral, or the error token if the terminating
/// "*/" is missing.
Token Scanner::scanMultiLineDocComment()
{
	LiteralScope literal(this, LITERAL_TYPE_COMMENT);
	bool endFound = false;
	bool charsAdded = false;

	// Skip leading whitespace on the first line, but stay in front of the line end.
	while (isWhiteSpace(m_char) && !atEndOfLine())
		advance();

	while (!isSourcePastEndOfInput())
	{
		// handle newlines in multiline comments
		if (atEndOfLine())
		{
			skipWhitespace();
			if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '*')
			{ // it is unknown if this leads to the end of the comment
				addCommentLiteralChar('*');
				advance();
			}
			else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) != '/')
			{ // skip first '*' in subsequent lines
				m_char = m_source.advanceAndGet(1);
				if (atEndOfLine()) // ignores empty lines
					continue;
				if (charsAdded)
					addCommentLiteralChar('\n'); // corresponds to the end of previous line
			}
			else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/')
			{ // if after newline the comment ends, don't insert the newline
				m_char = m_source.advanceAndGet(2);
				endFound = true;
				break;
			}
			else if (charsAdded)
				addCommentLiteralChar('\n');
		}

		// Terminator in the middle of a line.
		if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/')
		{
			m_char = m_source.advanceAndGet(2);
			endFound = true;
			break;
		}
		addCommentLiteralChar(m_char);
		charsAdded = true;
		advance();
	}
	literal.complete();
	if (!endFound)
		return setError(ScannerError::IllegalCommentTerminator);
	else
		return Token::CommentLiteral;
}
/// Scans everything that can start with '/': the operators "/" and "/=",
/// plain comments ("//", "////...", "/* */", "/**/", "/***...") and
/// documentation comments ("///", "/**").
/// Documentation comments are stored in the NextNext skipped-comment slot.
/// @returns Token::Whitespace for any comment, Token::Div / Token::AssignDiv
/// for the operators, or an error token.
Token Scanner::scanSlash()
{
	int firstSlashPosition = static_cast<int>(sourcePos());
	advance();
	if (m_char == '/')
	{
		if (!advance()) /* double slash comment directly before EOS */
			return Token::Whitespace;
		else if (m_char == '/')
		{
			advance(); //consume the last '/' at ///

			// "////"
			if (m_char == '/')
				return skipSingleLineComment();
			// doxygen style /// comment
			m_skippedComments[NextNext].location.start = firstSlashPosition;
			m_skippedComments[NextNext].location.sourceName = m_sourceName;
			m_skippedComments[NextNext].token = Token::CommentLiteral;
			m_skippedComments[NextNext].location.end = static_cast<int>(scanSingleLineDocComment());
			return Token::Whitespace;
		}
		else
			return skipSingleLineComment();
	}
	else if (m_char == '*')
	{
		// doxygen style /** natspec comment
		if (!advance()) /* slash star comment before EOS */
			return setError(ScannerError::IllegalCommentTerminator);
		else if (m_char == '*')
		{
			advance(); //consume the last '*' at /**

			// "/**/"
			if (m_char == '/')
			{
				advance(); //skip the closing slash
				return Token::Whitespace;
			}
			// "/***"
			if (m_char == '*')
				// "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
				return skipMultiLineComment();
			// we actually have a multiline documentation comment
			m_skippedComments[NextNext].location.start = firstSlashPosition;
			m_skippedComments[NextNext].location.sourceName = m_sourceName;
			Token comment = scanMultiLineDocComment();
			m_skippedComments[NextNext].location.end = static_cast<int>(sourcePos());
			m_skippedComments[NextNext].token = comment;
			if (comment == Token::Illegal)
				return Token::Illegal; // error already set
			else
				return Token::Whitespace;
		}
		else
			return skipMultiLineComment();
	}
	else if (m_char == '=')
		return selectToken(Token::AssignDiv);
	else
		return Token::Div;
}
void Scanner::scanToken()
{
2020-01-22 19:10:56 +00:00
m_tokens[NextNext] = {};
m_skippedComments[NextNext] = {};
Token token;
// M and N are for the purposes of grabbing different type sizes
unsigned m = 0;
unsigned n = 0;
2014-10-16 12:08:54 +00:00
do
{
// Remember the position of the next token
m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
2014-10-16 12:08:54 +00:00
switch (m_char)
{
case '"':
case '\'':
token = scanString(false);
2014-10-16 12:08:54 +00:00
break;
case '<':
// < <= << <<=
advance();
if (m_char == '=')
2015-02-10 08:52:19 +00:00
token = selectToken(Token::LessThanOrEqual);
2014-10-16 12:08:54 +00:00
else if (m_char == '<')
token = selectToken('=', Token::AssignShl, Token::SHL);
2014-10-16 12:08:54 +00:00
else
token = Token::LessThan;
2014-10-16 12:08:54 +00:00
break;
case '>':
// > >= >> >>= >>> >>>=
advance();
if (m_char == '=')
2015-02-10 08:52:19 +00:00
token = selectToken(Token::GreaterThanOrEqual);
2014-10-16 12:08:54 +00:00
else if (m_char == '>')
{
// >> >>= >>> >>>=
advance();
if (m_char == '=')
token = selectToken(Token::AssignSar);
2014-10-16 12:08:54 +00:00
else if (m_char == '>')
token = selectToken('=', Token::AssignShr, Token::SHR);
2014-10-16 12:08:54 +00:00
else
token = Token::SAR;
}
else
token = Token::GreaterThan;
2014-10-16 12:08:54 +00:00
break;
case '=':
// = == =>
advance();
if (m_char == '=')
2015-02-10 08:52:19 +00:00
token = selectToken(Token::Equal);
2014-10-16 12:08:54 +00:00
else if (m_char == '>')
2020-08-27 10:42:00 +00:00
token = selectToken(Token::DoubleArrow);
2014-10-16 12:08:54 +00:00
else
token = Token::Assign;
2014-10-16 12:08:54 +00:00
break;
case '!':
// ! !=
advance();
if (m_char == '=')
2015-02-10 08:52:19 +00:00
token = selectToken(Token::NotEqual);
2014-10-16 12:08:54 +00:00
else
token = Token::Not;
2014-10-16 12:08:54 +00:00
break;
case '+':
// + ++ +=
advance();
if (m_char == '+')
token = selectToken(Token::Inc);
2014-10-16 12:08:54 +00:00
else if (m_char == '=')
token = selectToken(Token::AssignAdd);
2014-10-16 12:08:54 +00:00
else
token = Token::Add;
2014-10-16 12:08:54 +00:00
break;
case '-':
// - -- -= ->
2014-10-16 12:08:54 +00:00
advance();
if (m_char == '-')
token = selectToken(Token::Dec);
2014-10-16 12:08:54 +00:00
else if (m_char == '=')
token = selectToken(Token::AssignSub);
else if (m_char == '>')
token = selectToken(Token::RightArrow);
2014-10-16 12:08:54 +00:00
else
token = Token::Sub;
2014-10-16 12:08:54 +00:00
break;
case '*':
// * ** *=
advance();
if (m_char == '*')
token = selectToken(Token::Exp);
else if (m_char == '=')
token = selectToken(Token::AssignMul);
else
token = Token::Mul;
2014-10-16 12:08:54 +00:00
break;
case '%':
// % %=
token = selectToken('=', Token::AssignMod, Token::Mod);
2014-10-16 12:08:54 +00:00
break;
case '/':
// / // /* /=
token = scanSlash();
2014-10-16 12:08:54 +00:00
break;
case '&':
// & && &=
advance();
if (m_char == '&')
token = selectToken(Token::And);
2014-10-16 12:08:54 +00:00
else if (m_char == '=')
token = selectToken(Token::AssignBitAnd);
2014-10-16 12:08:54 +00:00
else
token = Token::BitAnd;
2014-10-16 12:08:54 +00:00
break;
case '|':
// | || |=
advance();
if (m_char == '|')
token = selectToken(Token::Or);
2014-10-16 12:08:54 +00:00
else if (m_char == '=')
token = selectToken(Token::AssignBitOr);
2014-10-16 12:08:54 +00:00
else
token = Token::BitOr;
2014-10-16 12:08:54 +00:00
break;
case '^':
// ^ ^=
token = selectToken('=', Token::AssignBitXor, Token::BitXor);
2014-10-16 12:08:54 +00:00
break;
case '.':
// . Number
advance();
if (isDecimalDigit(m_char))
2014-11-05 07:40:21 +00:00
token = scanNumber('.');
2014-10-16 12:08:54 +00:00
else
token = Token::Period;
2014-10-16 12:08:54 +00:00
break;
case ':':
// : :=
advance();
if (m_char == '=')
token = selectToken(Token::AssemblyAssign);
else
token = Token::Colon;
2014-10-16 12:08:54 +00:00
break;
case ';':
token = selectToken(Token::Semicolon);
2014-10-16 12:08:54 +00:00
break;
case ',':
token = selectToken(Token::Comma);
2014-10-16 12:08:54 +00:00
break;
case '(':
token = selectToken(Token::LParen);
2014-10-16 12:08:54 +00:00
break;
case ')':
token = selectToken(Token::RParen);
2014-10-16 12:08:54 +00:00
break;
case '[':
token = selectToken(Token::LBrack);
2014-10-16 12:08:54 +00:00
break;
case ']':
token = selectToken(Token::RBrack);
2014-10-16 12:08:54 +00:00
break;
case '{':
token = selectToken(Token::LBrace);
2014-10-16 12:08:54 +00:00
break;
case '}':
token = selectToken(Token::RBrace);
2014-10-16 12:08:54 +00:00
break;
case '?':
token = selectToken(Token::Conditional);
2014-10-16 12:08:54 +00:00
break;
case '~':
token = selectToken(Token::BitNot);
2014-10-16 12:08:54 +00:00
break;
default:
if (isIdentifierStart(m_char))
{
std::tie(token, m, n) = scanIdentifierOrKeyword();
2018-10-04 11:03:55 +00:00
// Special case for hexadecimal literals
if (token == Token::Hex)
{
// reset
m = 0;
n = 0;
// Special quoted hex string must follow
if (m_char == '"' || m_char == '\'')
token = scanHexString();
else
token = setError(ScannerError::IllegalToken);
}
else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
{
// reset
m = 0;
n = 0;
// Special quoted hex string must follow
if (m_char == '"' || m_char == '\'')
token = scanString(true);
else
token = setError(ScannerError::IllegalToken);
}
}
else if (isDecimalDigit(m_char))
2014-11-05 07:40:21 +00:00
token = scanNumber();
2014-10-16 12:08:54 +00:00
else if (skipWhitespace())
token = Token::Whitespace;
2014-10-16 12:08:54 +00:00
else if (isSourcePastEndOfInput())
token = Token::EOS;
else
token = selectErrorToken(ScannerError::IllegalToken);
2014-10-16 12:08:54 +00:00
break;
2014-10-09 10:28:37 +00:00
}
2014-10-16 12:08:54 +00:00
// Continue scanning for tokens as long as we're just skipping
// whitespace.
2014-10-09 10:28:37 +00:00
}
while (token == Token::Whitespace);
m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
2021-06-29 12:38:59 +00:00
m_tokens[NextNext].location.sourceName = m_sourceName;
2020-01-22 19:10:56 +00:00
m_tokens[NextNext].token = token;
m_tokens[NextNext].extendedTokenInfo = std::make_tuple(m, n);
}
bool Scanner::scanEscape()
{
2014-10-09 10:28:37 +00:00
char c = m_char;
2014-10-09 10:28:37 +00:00
// Skip escaped newlines.
if (tryScanEndOfLine())
2014-10-09 10:28:37 +00:00
return true;
advance();
2014-10-16 12:08:54 +00:00
switch (c)
{
2014-10-09 10:28:37 +00:00
case '\'': // fall through
2014-10-16 21:49:45 +00:00
case '"': // fall through
2014-10-16 12:08:54 +00:00
case '\\':
break;
2014-10-16 21:49:45 +00:00
case 'n':
2014-10-16 12:08:54 +00:00
c = '\n';
break;
2014-10-16 21:49:45 +00:00
case 'r':
2014-10-16 12:08:54 +00:00
c = '\r';
break;
2014-10-16 21:49:45 +00:00
case 't':
2014-10-16 12:08:54 +00:00
c = '\t';
break;
case 'u':
{
if (auto const codepoint = scanUnicode(); codepoint.has_value())
2019-04-18 11:17:11 +00:00
addUnicodeAsUTF8(*codepoint);
else
return false;
return true;
}
2014-10-16 21:49:45 +00:00
case 'x':
2014-11-05 13:20:56 +00:00
if (!scanHexByte(c))
return false;
2014-10-09 10:28:37 +00:00
break;
default:
return false;
2014-10-09 10:28:37 +00:00
}
2014-10-16 21:49:45 +00:00
2014-10-09 10:28:37 +00:00
addLiteralChar(c);
return true;
}
bool Scanner::isUnicodeLinebreak()
{
if (0x0a <= m_char && m_char <= 0x0d)
// line feed, vertical tab, form feed, carriage return
return true;
2021-07-14 10:53:39 +00:00
if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
// NEL - U+0085, C2 85 in utf8
return true;
2021-07-14 10:53:39 +00:00
if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && (
uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9
))
// LS - U+2028, E2 80 A8 in utf8
// PS - U+2029, E2 80 A9 in utf8
return true;
2020-06-13 00:02:32 +00:00
return false;
}
/// Scans a quoted string literal; the scanner must be positioned at the
/// opening quote, which also determines the closing quote.
/// @param _isUnicode if true, any character is accepted and the directional
/// override markers of the literal are validated; otherwise only the
/// characters 0x20..0x7e are allowed outside of escape sequences.
/// @returns Token::StringLiteral or Token::UnicodeStringLiteral on success,
/// otherwise an error token.
Token Scanner::scanString(bool const _isUnicode)
{
	size_t const literalStart = m_source.position();
	char const quote = m_char;
	advance(); // consume quote

	LiteralScope literal(this, LITERAL_TYPE_STRING);
	while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
	{
		char const current = m_char;
		advance();

		if (current == '\\')
		{
			if (isSourcePastEndOfInput() || !scanEscape())
				return setError(ScannerError::IllegalEscapeSequence);
			continue;
		}

		// Report error on non-printable characters in string literals, however
		// allow anything for unicode string literals, because their validity will
		// be verified later (in the syntax checker).
		//
		// We are using a manual range and not isprint() to avoid
		// any potential complications with locale.
		unsigned const code = static_cast<unsigned>(current);
		if (!_isUnicode && (code <= 0x1f || code >= 0x7f))
			return setError(
				m_kind == ScannerKind::Yul ?
				ScannerError::IllegalCharacterInString :
				ScannerError::UnicodeCharacterInNonUnicodeString
			);

		addLiteralChar(current);
	}

	if (m_char != quote)
		return setError(ScannerError::IllegalStringEndQuote);

	if (_isUnicode)
	{
		ScannerError const bidiError = validateBiDiMarkup(m_source, literalStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);
	}

	literal.complete();
	advance(); // consume quote
	if (_isUnicode)
		return Token::UnicodeStringLiteral;
	return Token::StringLiteral;
}
/// Scans the quoted part of a hex string literal; the scanner must be
/// positioned at the opening quote. The content consists of pairs of hex
/// digits; a single underscore is allowed between two pairs.
/// @returns Token::HexStringLiteral on success, otherwise an error token.
Token Scanner::scanHexString()
{
	char const quote = m_char;
	advance(); // consume quote

	LiteralScope literal(this, LITERAL_TYPE_STRING);
	// An underscore is only valid directly after a complete hex byte.
	bool separatorAllowed = false;
	while (m_char != quote && !isSourcePastEndOfInput())
	{
		char byte = m_char;
		if (scanHexByte(byte))
		{
			addLiteralChar(byte);
			separatorAllowed = true;
			continue;
		}

		// scanHexByte failed, so `byte` still holds the offending character.
		if (byte != '_')
			return setError(ScannerError::IllegalHexString);

		advance();
		if (!separatorAllowed || m_char == quote)
			return setError(ScannerError::IllegalNumberSeparator);
		separatorAllowed = false;
	}

	if (m_char != quote)
		return setError(ScannerError::IllegalStringEndQuote);

	literal.complete();
	advance(); // consume quote
	return Token::HexStringLiteral;
}
// Parse for regex [:digit:]+(_[:digit:]+)*
void Scanner::scanDecimalDigits()
{
// MUST begin with a decimal digit.
if (!isDecimalDigit(m_char))
return;
2017-10-25 08:12:07 +00:00
// May continue with decimal digit or underscore for grouping.
2019-04-18 11:17:11 +00:00
do
addLiteralCharAndAdvance();
2021-07-14 10:53:39 +00:00
while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
2017-10-25 08:12:07 +00:00
// Defer further validation of underscore to SyntaxChecker.
}
/// Scans a number literal: decimal (with optional fraction and exponent)
/// or hexadecimal ("0x...").
/// @param _charSeen '.' if the caller has already consumed a leading
/// decimal point, 0 otherwise.
/// @returns Token::Number on success, otherwise an error token. Underscores
/// are kept in the literal; their placement is validated later.
Token Scanner::scanNumber(char _charSeen)
{
	enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
	LiteralScope literal(this, LITERAL_TYPE_NUMBER);
	if (_charSeen == '.')
	{
		// we have already seen a decimal point of the float
		addLiteralChar('.');
		if (m_char == '_')
			return setError(ScannerError::IllegalToken);
		scanDecimalDigits(); // we know we have at least one digit
	}
	else
	{
		solAssert(_charSeen == 0, "");
		// if the first character is '0' we must check for octals and hex
		if (m_char == '0')
		{
			addLiteralCharAndAdvance();
			// either 0, 0exxx, 0Exxx, 0.xxx or a hex number
			if (m_char == 'x')
			{
				// hex number
				kind = HEX;
				addLiteralCharAndAdvance();
				if (!isHexDigit(m_char))
					return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x'

				while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation
					addLiteralCharAndAdvance();
			}
			else if (isDecimalDigit(m_char))
				// We do not allow octal numbers
				return setError(ScannerError::OctalNotAllowed);
		}
		// Parse decimal digits and allow trailing fractional part.
		if (kind == DECIMAL)
		{
			scanDecimalDigits(); // optional
			if (m_char == '.')
			{
				if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
				{
					// Assume the input may be a floating point number with leading '_' in fraction part.
					// Recover by consuming it all. No error is raised here; scanning continues
					// with the check below and the result is still a number token.
					addLiteralCharAndAdvance(); // '.'
					addLiteralCharAndAdvance(); // '_'
					scanDecimalDigits();
				}
				// The look-ahead inspects the character after the current one, so the
				// bounds check has to cover that offset as well.
				if (m_source.isPastEndOfInput(1) || !isDecimalDigit(m_source.get(1)))
				{
					// A '.' has to be followed by a number.
					literal.complete();
					return Token::Number;
				}
				addLiteralCharAndAdvance();
				scanDecimalDigits();
			}
		}
	}

	// scan exponent, if any
	if (m_char == 'e' || m_char == 'E')
	{
		solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
		if (kind != DECIMAL)
			return setError(ScannerError::IllegalExponent);
		else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
		{
			// Recover from wrongly placed underscore as delimiter in literal with scientific
			// notation by consuming until the end.
			addLiteralCharAndAdvance(); // 'e'
			addLiteralCharAndAdvance(); // '_'
			scanDecimalDigits();
			literal.complete();
			return Token::Number;
		}
		// scan exponent
		addLiteralCharAndAdvance(); // 'e' | 'E'
		if (m_char == '+' || m_char == '-')
			addLiteralCharAndAdvance();
		if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E'
			return setError(ScannerError::IllegalExponent);
		scanDecimalDigits();
	}
	// The source character immediately following a numeric literal must
	// not be an identifier start or a decimal digit; see ECMA-262
	// section 7.8.3, page 17 (note that we read only one decimal digit
	// if the value is 0).
	if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
		return setError(ScannerError::IllegalNumberEnd);
	literal.complete();
	return Token::Number;
}
std::tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
{
solAssert(isIdentifierStart(m_char), "");
LiteralScope literal(this, LITERAL_TYPE_STRING);
2014-10-09 10:28:37 +00:00
addLiteralCharAndAdvance();
// Scan the rest of the identifier characters.
while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul))
2014-10-09 10:28:37 +00:00
addLiteralCharAndAdvance();
literal.complete();
auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal);
if (m_kind == ScannerKind::Yul)
{
2020-07-27 18:11:38 +00:00
// Turn Solidity identifier into a Yul keyword
if (m_tokens[NextNext].literal == "leave")
return std::make_tuple(Token::Leave, 0, 0);
// Turn non-Yul keywords into identifiers.
if (!TokenTraits::isYulKeyword(std::get<0>(token)))
return std::make_tuple(Token::Identifier, 0, 0);
}
return token;
}
} // namespace solidity::langutil