2014-10-20 14:37:04 +00:00
|
|
|
/*
|
2019-02-13 15:56:46 +00:00
|
|
|
* This file is part of solidity.
|
|
|
|
*
|
|
|
|
* solidity is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* solidity is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with solidity. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*
|
|
|
|
* This file is derived from the file "scanner.cc", which was part of the
|
|
|
|
* V8 project. The original copyright header follows:
|
|
|
|
*
|
|
|
|
* Copyright 2006-2012, the V8 project authors. All rights reserved.
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are
|
|
|
|
* met:
|
|
|
|
*
|
|
|
|
* * Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials provided
|
|
|
|
* with the distribution.
|
|
|
|
* * Neither the name of Google Inc. nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived
|
|
|
|
* from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2014-10-20 14:37:04 +00:00
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* @author Christian <c@ethdev.com>
|
|
|
|
* @date 2014
|
|
|
|
* Solidity scanner.
|
|
|
|
*/
|
2014-10-06 15:13:52 +00:00
|
|
|
|
2019-02-21 00:04:34 +00:00
|
|
|
#include <liblangutil/Common.h>
|
2018-11-14 13:59:30 +00:00
|
|
|
#include <liblangutil/Exceptions.h>
|
|
|
|
#include <liblangutil/Scanner.h>
|
2019-10-28 10:39:30 +00:00
|
|
|
|
2020-11-18 13:35:16 +00:00
|
|
|
#include <boost/algorithm/string/classification.hpp>
|
|
|
|
|
2019-10-28 10:39:30 +00:00
|
|
|
#include <optional>
|
2020-11-18 13:35:16 +00:00
|
|
|
#include <string_view>
|
2018-11-14 16:11:55 +00:00
|
|
|
#include <tuple>
|
2014-10-06 15:13:52 +00:00
|
|
|
|
2014-10-24 17:06:30 +00:00
|
|
|
using namespace std;
|
|
|
|
|
2021-05-27 15:41:04 +00:00
|
|
|
namespace solidity::langutil
|
|
|
|
{
|
2019-12-11 16:31:36 +00:00
|
|
|
|
|
|
|
/// Maps a scanner error code to its human-readable message.
///
/// @param _errorCode the error code to describe.
/// @returns the message associated with @a _errorCode. Codes without a dedicated
///          message hit solAssert(false, ...); should that return, an empty string is produced.
string to_string(ScannerError _errorCode)
{
	char const* message = "";
	switch (_errorCode)
	{
	case ScannerError::NoError:
		message = "No error.";
		break;
	case ScannerError::IllegalToken:
		message = "Invalid token.";
		break;
	case ScannerError::IllegalHexString:
		message = "Expected even number of hex-nibbles.";
		break;
	case ScannerError::IllegalHexDigit:
		message = "Hexadecimal digit missing or invalid.";
		break;
	case ScannerError::IllegalCommentTerminator:
		message = "Expected multi-line comment-terminator.";
		break;
	case ScannerError::IllegalEscapeSequence:
		message = "Invalid escape sequence.";
		break;
	case ScannerError::IllegalCharacterInString:
		message = "Invalid character in string.";
		break;
	case ScannerError::IllegalStringEndQuote:
		message = "Expected string end-quote.";
		break;
	case ScannerError::IllegalNumberSeparator:
		message = "Invalid use of number separator '_'.";
		break;
	case ScannerError::IllegalExponent:
		message = "Invalid exponent.";
		break;
	case ScannerError::IllegalNumberEnd:
		message = "Identifier-start is not allowed at end of a number.";
		break;
	case ScannerError::OctalNotAllowed:
		message = "Octal numbers not allowed.";
		break;
	case ScannerError::DirectionalOverrideUnderflow:
		message = "Unicode direction override underflow in comment or string literal.";
		break;
	case ScannerError::DirectionalOverrideMismatch:
		message = "Mismatching directional override markers in comment or string literal.";
		break;
	default:
		solAssert(false, "Unhandled case in to_string(ScannerError)");
		break;
	}
	return message;
}
|
2014-11-30 22:25:42 +00:00
|
|
|
|
2019-04-18 11:17:11 +00:00
|
|
|
|
2019-12-11 16:31:36 +00:00
|
|
|
/// Streams the textual description of @a _errorCode (as produced by to_string) into @a os.
/// @returns @a os, to allow chaining.
ostream& operator<<(ostream& os, ScannerError _errorCode)
{
	os << to_string(_errorCode);
	return os;
}
|
2014-11-30 22:25:42 +00:00
|
|
|
|
|
|
|
/// Scoped helper for literal recording. Automatically drops the literal
|
|
|
|
/// if aborting the scanning before it's complete.
|
2019-04-18 11:17:11 +00:00
|
|
|
enum LiteralType
|
|
|
|
{
|
2014-11-30 22:25:42 +00:00
|
|
|
LITERAL_TYPE_STRING,
|
|
|
|
LITERAL_TYPE_NUMBER, // not really different from string type in behaviour
|
|
|
|
LITERAL_TYPE_COMMENT
|
|
|
|
};
|
|
|
|
|
|
|
|
class LiteralScope
|
|
|
|
{
|
|
|
|
public:
|
2019-04-18 11:17:11 +00:00
|
|
|
explicit LiteralScope(Scanner* _self, enum LiteralType _type):
|
|
|
|
m_type(_type),
|
|
|
|
m_scanner(_self),
|
|
|
|
m_complete(false)
|
2014-11-30 22:25:42 +00:00
|
|
|
{
|
|
|
|
if (_type == LITERAL_TYPE_COMMENT)
|
2020-01-22 19:10:56 +00:00
|
|
|
m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
|
2014-11-30 22:25:42 +00:00
|
|
|
else
|
2020-01-22 19:10:56 +00:00
|
|
|
m_scanner->m_tokens[Scanner::NextNext].literal.clear();
|
2014-11-30 22:25:42 +00:00
|
|
|
}
|
|
|
|
~LiteralScope()
|
|
|
|
{
|
|
|
|
if (!m_complete)
|
|
|
|
{
|
|
|
|
if (m_type == LITERAL_TYPE_COMMENT)
|
2020-01-22 19:10:56 +00:00
|
|
|
m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
|
2014-11-30 22:25:42 +00:00
|
|
|
else
|
2020-01-22 19:10:56 +00:00
|
|
|
m_scanner->m_tokens[Scanner::NextNext].literal.clear();
|
2014-11-30 22:25:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
void complete() { m_complete = true; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
enum LiteralType m_type;
|
|
|
|
Scanner* m_scanner;
|
|
|
|
bool m_complete;
|
2019-04-18 11:17:11 +00:00
|
|
|
};
|
2014-11-30 22:25:42 +00:00
|
|
|
|
2018-11-29 00:58:15 +00:00
|
|
|
/// Takes ownership of @a _source by moving it into a freshly allocated shared
/// CharStream and then restarts scanning via reset().
void Scanner::reset(CharStream _source)
{
	auto ownedStream = make_shared<CharStream>(std::move(_source));
	m_source = std::move(ownedStream);
	reset();
}
|
|
|
|
|
2019-04-18 11:17:11 +00:00
|
|
|
/// Adopts the shared character stream @a _source and restarts scanning via reset().
/// A null @a _source is rejected by an assertion.
void Scanner::reset(shared_ptr<CharStream> _source)
{
	solAssert(_source != nullptr, "You MUST provide a CharStream when resetting.");
	m_source = std::move(_source);
	reset();
}
|
|
|
|
|
2014-12-03 16:45:12 +00:00
|
|
|
void Scanner::reset()
|
|
|
|
{
|
2018-11-28 15:19:22 +00:00
|
|
|
m_source->reset();
|
2020-07-10 15:05:52 +00:00
|
|
|
m_kind = ScannerKind::Solidity;
|
2018-11-28 15:19:22 +00:00
|
|
|
m_char = m_source->get();
|
2014-10-09 10:28:37 +00:00
|
|
|
skipWhitespace();
|
2014-12-01 00:05:55 +00:00
|
|
|
next();
|
2019-04-24 11:16:43 +00:00
|
|
|
next();
|
2020-01-22 19:10:56 +00:00
|
|
|
next();
|
2019-04-24 11:16:43 +00:00
|
|
|
}
|
|
|
|
|
2019-05-27 14:13:27 +00:00
|
|
|
void Scanner::setPosition(size_t _offset)
|
|
|
|
{
|
|
|
|
m_char = m_source->setPosition(_offset);
|
|
|
|
scanToken();
|
|
|
|
next();
|
2020-01-22 19:10:56 +00:00
|
|
|
next();
|
2019-05-27 14:13:27 +00:00
|
|
|
}
|
|
|
|
|
2014-11-05 13:20:56 +00:00
|
|
|
bool Scanner::scanHexByte(char& o_scannedByte)
|
2014-10-06 15:13:52 +00:00
|
|
|
{
|
2014-10-09 10:28:37 +00:00
|
|
|
char x = 0;
|
2020-06-02 13:45:03 +00:00
|
|
|
for (size_t i = 0; i < 2; i++)
|
2014-10-16 12:08:54 +00:00
|
|
|
{
|
2014-11-21 16:08:35 +00:00
|
|
|
int d = hexValue(m_char);
|
2014-10-16 12:08:54 +00:00
|
|
|
if (d < 0)
|
|
|
|
{
|
2014-10-09 10:28:37 +00:00
|
|
|
rollback(i);
|
|
|
|
return false;
|
|
|
|
}
|
2020-06-05 12:30:57 +00:00
|
|
|
x = static_cast<char>(x * 16 + d);
|
2014-10-09 10:28:37 +00:00
|
|
|
advance();
|
|
|
|
}
|
2014-11-05 13:20:56 +00:00
|
|
|
o_scannedByte = x;
|
2014-10-09 10:28:37 +00:00
|
|
|
return true;
|
2014-10-06 15:13:52 +00:00
|
|
|
}
|
|
|
|
|
2019-10-28 10:39:30 +00:00
|
|
|
std::optional<unsigned> Scanner::scanUnicode()
|
2016-06-07 18:23:19 +00:00
|
|
|
{
|
|
|
|
unsigned x = 0;
|
2020-06-02 13:45:03 +00:00
|
|
|
for (size_t i = 0; i < 4; i++)
|
2016-06-07 18:23:19 +00:00
|
|
|
{
|
|
|
|
int d = hexValue(m_char);
|
|
|
|
if (d < 0)
|
|
|
|
{
|
|
|
|
rollback(i);
|
2019-04-18 11:17:11 +00:00
|
|
|
return {};
|
2016-06-07 18:23:19 +00:00
|
|
|
}
|
2020-06-05 12:30:57 +00:00
|
|
|
x = x * 16 + static_cast<unsigned>(d);
|
2016-06-07 18:23:19 +00:00
|
|
|
advance();
|
|
|
|
}
|
2019-04-18 11:17:11 +00:00
|
|
|
return x;
|
2016-06-07 18:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// This supports codepoints between 0000 and FFFF.
|
2016-08-01 13:10:46 +00:00
|
|
|
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
|
2016-06-07 18:23:19 +00:00
|
|
|
{
|
|
|
|
if (codepoint <= 0x7f)
|
2020-06-02 13:45:03 +00:00
|
|
|
addLiteralChar(char(codepoint));
|
2016-06-07 18:23:19 +00:00
|
|
|
else if (codepoint <= 0x7ff)
|
|
|
|
{
|
2020-06-02 13:45:03 +00:00
|
|
|
addLiteralChar(char(0xc0u | (codepoint >> 6u)));
|
|
|
|
addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
|
2016-06-07 18:23:19 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-06-02 13:45:03 +00:00
|
|
|
addLiteralChar(char(0xe0u | (codepoint >> 12u)));
|
|
|
|
addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
|
|
|
|
addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
|
2016-06-07 18:23:19 +00:00
|
|
|
}
|
|
|
|
}
|
2014-10-06 15:13:52 +00:00
|
|
|
|
2019-04-24 11:16:43 +00:00
|
|
|
void Scanner::rescan()
|
|
|
|
{
|
|
|
|
size_t rollbackTo = 0;
|
2020-01-22 19:10:56 +00:00
|
|
|
if (m_skippedComments[Current].literal.empty())
|
2020-06-02 13:45:03 +00:00
|
|
|
rollbackTo = static_cast<size_t>(m_tokens[Current].location.start);
|
2019-04-24 11:16:43 +00:00
|
|
|
else
|
2020-06-02 13:45:03 +00:00
|
|
|
rollbackTo = static_cast<size_t>(m_skippedComments[Current].location.start);
|
|
|
|
m_char = m_source->rollback(m_source->position() - rollbackTo);
|
2019-04-24 11:16:43 +00:00
|
|
|
next();
|
|
|
|
next();
|
2020-01-22 19:10:56 +00:00
|
|
|
next();
|
2019-04-24 11:16:43 +00:00
|
|
|
}
|
|
|
|
|
2014-10-06 15:13:52 +00:00
|
|
|
// Ensure that tokens can be stored in a byte.
|
2018-10-22 14:48:21 +00:00
|
|
|
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
|
2014-10-06 15:13:52 +00:00
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Advances the scanner by one token: the Next and NextNext slots (tokens and
/// skipped comments alike) are shifted down by one position and a new token is
/// scanned via scanToken().
///
/// @returns the token that is current after the shift.
Token Scanner::next()
{
	// Shift the skipped comments ...
	m_skippedComments[Current] = std::move(m_skippedComments[Next]);
	m_skippedComments[Next] = std::move(m_skippedComments[NextNext]);

	// ... and, independently, the tokens themselves.
	m_tokens[Current] = std::move(m_tokens[Next]);
	m_tokens[Next] = std::move(m_tokens[NextNext]);

	scanToken();

	return m_tokens[Current].token;
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Consumes the current character and decides between two tokens depending on
/// the character that follows.
///
/// @param _next the character that selects @a _then.
/// @param _then token chosen when the following character equals @a _next; it is
///              passed through the single-argument selectToken overload.
/// @param _else token returned otherwise; the following character is left untouched.
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
	advance();
	if (m_char != _next)
		return _else;
	return selectToken(_then);
}
|
|
|
|
|
2014-10-06 15:13:52 +00:00
|
|
|
bool Scanner::skipWhitespace()
|
|
|
|
{
|
2020-06-02 13:45:03 +00:00
|
|
|
size_t const startPosition = sourcePos();
|
2014-11-21 16:08:35 +00:00
|
|
|
while (isWhiteSpace(m_char))
|
2014-10-16 12:08:54 +00:00
|
|
|
advance();
|
2014-10-09 10:28:37 +00:00
|
|
|
// Return whether or not we skipped any characters.
|
2015-08-31 16:44:29 +00:00
|
|
|
return sourcePos() != startPosition;
|
2014-10-06 15:13:52 +00:00
|
|
|
}
|
|
|
|
|
2020-04-28 08:34:07 +00:00
|
|
|
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
|
2014-12-18 12:27:25 +00:00
|
|
|
{
|
2020-06-02 13:45:03 +00:00
|
|
|
size_t const startPosition = sourcePos();
|
2018-09-06 09:05:35 +00:00
|
|
|
while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
|
2014-12-18 12:27:25 +00:00
|
|
|
advance();
|
2020-04-28 08:34:07 +00:00
|
|
|
// Return whether or not we skipped any characters.
|
|
|
|
return sourcePos() != startPosition;
|
2014-12-18 12:27:25 +00:00
|
|
|
}
|
|
|
|
|
2020-11-18 13:35:16 +00:00
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
|
|
|
|
///
|
|
|
|
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
|
|
|
|
/// and error code in case the input's lexical parser state is invalid and this error should be reported
|
|
|
|
/// to the user.
|
|
|
|
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
|
|
|
|
{
|
|
|
|
static array<pair<string_view, int>, 5> constexpr directionalSequences{
|
|
|
|
pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
|
|
|
|
pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
|
|
|
|
pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
|
|
|
|
pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
|
|
|
|
pair<string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting
|
|
|
|
};
|
|
|
|
|
|
|
|
size_t endPosition = _stream.position();
|
|
|
|
_stream.setPosition(_startPosition);
|
|
|
|
|
|
|
|
int directionOverrideDepth = 0;
|
|
|
|
|
|
|
|
for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
|
|
|
|
{
|
|
|
|
_stream.setPosition(currentPos);
|
|
|
|
|
|
|
|
for (auto const& [sequence, depthChange]: directionalSequences)
|
|
|
|
if (_stream.prefixMatch(sequence))
|
|
|
|
directionOverrideDepth += depthChange;
|
|
|
|
|
|
|
|
if (directionOverrideDepth < 0)
|
|
|
|
return ScannerError::DirectionalOverrideUnderflow;
|
|
|
|
}
|
|
|
|
|
|
|
|
_stream.setPosition(endPosition);
|
|
|
|
|
|
|
|
return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatch : ScannerError::NoError;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Skips the remainder of a single-line comment and validates the directional
/// override markers contained in the skipped range.
///
/// @returns Token::Whitespace on success, otherwise the result of setError()
///          for the directional-override error found.
Token Scanner::skipSingleLineComment()
{
	size_t const commentStart = m_source->position();

	// Line terminator is not part of the comment. If it is a
	// non-ascii line terminator, it will result in a parser error.
	while (!isUnicodeLinebreak() && advance())
	{
	}

	ScannerError const bidiStatus = validateBiDiMarkup(*m_source, commentStart);
	if (bidiStatus == ScannerError::NoError)
		return Token::Whitespace;

	return setError(bidiStatus);
}
|
|
|
|
|
2019-08-05 09:53:24 +00:00
|
|
|
bool Scanner::atEndOfLine() const
|
|
|
|
{
|
|
|
|
return m_char == '\n' || m_char == '\r';
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Scanner::tryScanEndOfLine()
|
|
|
|
{
|
|
|
|
if (m_char == '\n')
|
|
|
|
{
|
|
|
|
advance();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (m_char == '\r')
|
|
|
|
{
|
|
|
|
if (advance() && m_char == '\n')
|
|
|
|
advance();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-06-02 13:45:03 +00:00
|
|
|
size_t Scanner::scanSingleLineDocComment()
|
2014-11-18 17:50:40 +00:00
|
|
|
{
|
2014-11-27 17:57:50 +00:00
|
|
|
LiteralScope literal(this, LITERAL_TYPE_COMMENT);
|
2020-06-02 13:45:03 +00:00
|
|
|
size_t endPosition = m_source->position();
|
2018-09-06 09:05:35 +00:00
|
|
|
|
|
|
|
skipWhitespaceExceptUnicodeLinebreak();
|
|
|
|
|
2014-11-27 17:57:50 +00:00
|
|
|
while (!isSourcePastEndOfInput())
|
2014-11-18 17:50:40 +00:00
|
|
|
{
|
2020-01-25 16:53:48 +00:00
|
|
|
endPosition = m_source->position();
|
2019-08-05 09:53:24 +00:00
|
|
|
if (tryScanEndOfLine())
|
2014-11-27 17:57:50 +00:00
|
|
|
{
|
2020-01-25 16:53:48 +00:00
|
|
|
// Check if next line is also a single-line comment.
|
|
|
|
// If any whitespaces were skipped, use source position before.
|
2020-04-28 08:34:07 +00:00
|
|
|
if (!skipWhitespaceExceptUnicodeLinebreak())
|
2020-01-25 16:53:48 +00:00
|
|
|
endPosition = m_source->position();
|
|
|
|
|
2018-11-28 15:19:22 +00:00
|
|
|
if (!m_source->isPastEndOfInput(3) &&
|
|
|
|
m_source->get(0) == '/' &&
|
|
|
|
m_source->get(1) == '/' &&
|
|
|
|
m_source->get(2) == '/')
|
2014-11-27 17:57:50 +00:00
|
|
|
{
|
2020-06-13 00:02:32 +00:00
|
|
|
if (!m_source->isPastEndOfInput(4) && m_source->get(3) == '/')
|
|
|
|
break; // "////" is not a documentation comment
|
2018-11-28 15:19:22 +00:00
|
|
|
m_char = m_source->advanceAndGet(3);
|
2020-04-28 08:34:07 +00:00
|
|
|
if (atEndOfLine())
|
|
|
|
continue;
|
|
|
|
addCommentLiteralChar('\n');
|
2014-11-27 17:57:50 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
break; // next line is not a documentation comment, we are done
|
|
|
|
}
|
2018-09-06 09:05:35 +00:00
|
|
|
else if (isUnicodeLinebreak())
|
|
|
|
// Any line terminator that is not '\n' is considered to end the
|
|
|
|
// comment.
|
|
|
|
break;
|
2014-11-21 08:09:39 +00:00
|
|
|
addCommentLiteralChar(m_char);
|
2014-11-18 17:50:40 +00:00
|
|
|
advance();
|
|
|
|
}
|
2014-11-21 16:08:35 +00:00
|
|
|
literal.complete();
|
2020-01-25 16:53:48 +00:00
|
|
|
return endPosition;
|
2014-11-18 17:50:40 +00:00
|
|
|
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Skips a multi-line comment up to its "*/" terminator and validates the
/// directional override markers contained in it.
///
/// @returns Token::Whitespace if the terminator was found and the markers are valid,
///          otherwise the result of setError() - with the directional-override error, or
///          with ScannerError::IllegalCommentTerminator when the input ends first.
Token Scanner::skipMultiLineComment()
{
	size_t const commentStart = m_source->position();

	while (!isSourcePastEndOfInput())
	{
		char const previous = m_char;
		advance();

		if (previous != '*' || m_char != '/')
			continue;

		// We have reached the end of the multi-line comment.
		ScannerError const bidiStatus = validateBiDiMarkup(*m_source, commentStart);
		if (bidiStatus != ScannerError::NoError)
			return setError(bidiStatus);

		// Replace the closing '/' by a whitespace. This way all
		// multi-line comments are treated as whitespace.
		m_char = ' ';
		return Token::Whitespace;
	}

	// Unterminated multi-line comment.
	return setError(ScannerError::IllegalCommentTerminator);
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Scans the body of a multi-line documentation comment into the comment literal.
/// Leading whitespace on the first line is skipped; on subsequent lines a single
/// leading '*' is dropped, and line breaks are recorded as '\n' once some
/// content has been added.
///
/// @returns Token::CommentLiteral if the closing "*/" was found, otherwise the
///          result of setError(ScannerError::IllegalCommentTerminator).
Token Scanner::scanMultiLineDocComment()
{
	LiteralScope literal(this, LITERAL_TYPE_COMMENT);
	bool terminatorSeen = false;
	bool contentAdded = false;

	// True iff at least two characters are available and the first one is '*'.
	auto const starAhead = [this]() {
		return !m_source->isPastEndOfInput(1) && m_source->get(0) == '*';
	};
	// True iff the upcoming two characters are the comment terminator "*/".
	auto const terminatorAhead = [&]() {
		return starAhead() && m_source->get(1) == '/';
	};

	for (; isWhiteSpace(m_char) && !atEndOfLine(); advance())
	{
	}

	while (!isSourcePastEndOfInput())
	{
		// handle newlines in multi-line comments
		if (atEndOfLine())
		{
			skipWhitespace();
			if (starAhead() && m_source->get(1) == '*')
			{
				// it is unknown if this leads to the end of the comment
				addCommentLiteralChar('*');
				advance();
			}
			else if (starAhead() && m_source->get(1) != '/')
			{
				// skip first '*' in subsequent lines
				m_char = m_source->advanceAndGet(1);
				if (atEndOfLine()) // ignores empty lines
					continue;
				if (contentAdded)
					addCommentLiteralChar('\n'); // corresponds to the end of previous line
			}
			else if (terminatorAhead())
			{
				// if after newline the comment ends, don't insert the newline
				m_char = m_source->advanceAndGet(2);
				terminatorSeen = true;
				break;
			}
			else if (contentAdded)
				addCommentLiteralChar('\n');
		}

		if (terminatorAhead())
		{
			m_char = m_source->advanceAndGet(2);
			terminatorSeen = true;
			break;
		}

		addCommentLiteralChar(m_char);
		contentAdded = true;
		advance();
	}

	literal.complete();

	return terminatorSeen ? Token::CommentLiteral : setError(ScannerError::IllegalCommentTerminator);
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
/// Scans everything that can start with a '/': the division operators "/" and "/=",
/// regular comments ("//", "/* */") and documentation comments ("///", "/** */").
/// Documentation comments are stored in the NextNext skipped-comment slot.
///
/// @returns Token::Div or the result of selectToken(Token::AssignDiv) for the operators,
///          Token::Whitespace for successfully scanned comments, and an error token
///          for unterminated multi-line comments.
Token Scanner::scanSlash()
{
	int const firstSlashPosition = static_cast<int>(sourcePos());
	advance();

	switch (m_char)
	{
	case '/':
	{
		if (!advance()) /* double slash comment directly before EOS */
			return Token::Whitespace;
		if (m_char != '/')
			return skipSingleLineComment();

		advance(); // consume the last '/' at ///

		// "////"
		if (m_char == '/')
			return skipSingleLineComment();

		// doxygen style /// comment
		auto& docComment = m_skippedComments[NextNext];
		docComment.location.start = firstSlashPosition;
		docComment.location.source = m_source;
		docComment.token = Token::CommentLiteral;
		docComment.location.end = static_cast<int>(scanSingleLineDocComment());
		return Token::Whitespace;
	}
	case '*':
	{
		// doxygen style /** natspec comment
		if (!advance()) /* slash star comment before EOS */
			return setError(ScannerError::IllegalCommentTerminator);
		if (m_char != '*')
			return skipMultiLineComment();

		advance(); // consume the last '*' at /**

		// "/**/"
		if (m_char == '/')
		{
			advance(); // skip the closing slash
			return Token::Whitespace;
		}

		// "/***"
		// "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
		if (m_char == '*')
			return skipMultiLineComment();

		// we actually have a multiline documentation comment
		auto& docComment = m_skippedComments[NextNext];
		docComment.location.start = firstSlashPosition;
		docComment.location.source = m_source;
		Token const scanned = scanMultiLineDocComment();
		docComment.location.end = static_cast<int>(sourcePos());
		docComment.token = scanned;

		// On Token::Illegal the error has already been set.
		return scanned == Token::Illegal ? Token::Illegal : Token::Whitespace;
	}
	case '=':
		return selectToken(Token::AssignDiv);
	default:
		return Token::Div;
	}
}
|
|
|
|
|
2014-11-30 21:43:40 +00:00
|
|
|
void Scanner::scanToken()
|
2014-10-06 15:13:52 +00:00
|
|
|
{
|
2020-01-22 19:10:56 +00:00
|
|
|
m_tokens[NextNext] = {};
|
|
|
|
m_skippedComments[NextNext] = {};
|
2016-02-08 21:43:22 +00:00
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
Token token;
|
2016-02-12 21:01:27 +00:00
|
|
|
// M and N are for the purposes of grabbing different type sizes
|
2016-02-15 16:34:45 +00:00
|
|
|
unsigned m;
|
|
|
|
unsigned n;
|
2014-10-16 12:08:54 +00:00
|
|
|
do
|
|
|
|
{
|
|
|
|
// Remember the position of the next token
|
2020-06-02 13:45:03 +00:00
|
|
|
m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
|
2014-10-16 12:08:54 +00:00
|
|
|
switch (m_char)
|
|
|
|
{
|
|
|
|
case '"':
|
|
|
|
case '\'':
|
2020-07-02 16:39:04 +00:00
|
|
|
token = scanString(false);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '<':
|
|
|
|
// < <= << <<=
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
2015-02-10 08:52:19 +00:00
|
|
|
token = selectToken(Token::LessThanOrEqual);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '<')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken('=', Token::AssignShl, Token::SHL);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::LessThan;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '>':
|
|
|
|
// > >= >> >>= >>> >>>=
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
2015-02-10 08:52:19 +00:00
|
|
|
token = selectToken(Token::GreaterThanOrEqual);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '>')
|
|
|
|
{
|
|
|
|
// >> >>= >>> >>>=
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::AssignSar);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '>')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken('=', Token::AssignShr, Token::SHR);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
|
|
|
token = Token::SAR;
|
|
|
|
}
|
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::GreaterThan;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '=':
|
|
|
|
// = == =>
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
2015-02-10 08:52:19 +00:00
|
|
|
token = selectToken(Token::Equal);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '>')
|
2020-08-27 10:42:00 +00:00
|
|
|
token = selectToken(Token::DoubleArrow);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Assign;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '!':
|
|
|
|
// ! !=
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
2015-02-10 08:52:19 +00:00
|
|
|
token = selectToken(Token::NotEqual);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Not;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '+':
|
|
|
|
// + ++ +=
|
|
|
|
advance();
|
|
|
|
if (m_char == '+')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Inc);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '=')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::AssignAdd);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Add;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '-':
|
2020-08-12 17:56:24 +00:00
|
|
|
// - -- -= ->
|
2014-10-16 12:08:54 +00:00
|
|
|
advance();
|
|
|
|
if (m_char == '-')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Dec);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '=')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::AssignSub);
|
2020-08-12 17:56:24 +00:00
|
|
|
else if (m_char == '>')
|
|
|
|
token = selectToken(Token::RightArrow);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Sub;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '*':
|
2015-02-08 11:23:17 +00:00
|
|
|
// * ** *=
|
|
|
|
advance();
|
|
|
|
if (m_char == '*')
|
|
|
|
token = selectToken(Token::Exp);
|
|
|
|
else if (m_char == '=')
|
|
|
|
token = selectToken(Token::AssignMul);
|
|
|
|
else
|
|
|
|
token = Token::Mul;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '%':
|
|
|
|
// % %=
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken('=', Token::AssignMod, Token::Mod);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '/':
|
|
|
|
// / // /* /=
|
2014-12-18 16:30:10 +00:00
|
|
|
token = scanSlash();
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '&':
|
|
|
|
// & && &=
|
|
|
|
advance();
|
|
|
|
if (m_char == '&')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::And);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '=')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::AssignBitAnd);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::BitAnd;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '|':
|
|
|
|
// | || |=
|
|
|
|
advance();
|
|
|
|
if (m_char == '|')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Or);
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (m_char == '=')
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::AssignBitOr);
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::BitOr;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '^':
|
|
|
|
// ^ ^=
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken('=', Token::AssignBitXor, Token::BitXor);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '.':
|
|
|
|
// . Number
|
|
|
|
advance();
|
2014-11-21 16:08:35 +00:00
|
|
|
if (isDecimalDigit(m_char))
|
2014-11-05 07:40:21 +00:00
|
|
|
token = scanNumber('.');
|
2014-10-16 12:08:54 +00:00
|
|
|
else
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Period;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case ':':
|
2019-02-18 14:07:15 +00:00
|
|
|
// : :=
|
|
|
|
advance();
|
|
|
|
if (m_char == '=')
|
|
|
|
token = selectToken(Token::AssemblyAssign);
|
|
|
|
else
|
|
|
|
token = Token::Colon;
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case ';':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Semicolon);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case ',':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Comma);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '(':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::LParen);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case ')':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::RParen);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '[':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::LBrack);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case ']':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::RBrack);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '{':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::LBrace);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '}':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::RBrace);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '?':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::Conditional);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
case '~':
|
2015-02-09 13:00:12 +00:00
|
|
|
token = selectToken(Token::BitNot);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
|
|
|
default:
|
2016-02-09 21:43:23 +00:00
|
|
|
if (isIdentifierStart(m_char))
|
2016-08-16 14:31:23 +00:00
|
|
|
{
|
2016-02-15 16:34:45 +00:00
|
|
|
tie(token, m, n) = scanIdentifierOrKeyword();
|
2016-08-16 14:31:23 +00:00
|
|
|
|
2018-10-04 11:03:55 +00:00
|
|
|
// Special case for hexadecimal literals
|
2020-08-27 14:53:45 +00:00
|
|
|
if (token == Token::Hex)
|
2016-08-16 14:31:23 +00:00
|
|
|
{
|
|
|
|
// reset
|
|
|
|
m = 0;
|
|
|
|
n = 0;
|
|
|
|
|
|
|
|
// Special quoted hex string must follow
|
|
|
|
if (m_char == '"' || m_char == '\'')
|
|
|
|
token = scanHexString();
|
|
|
|
else
|
2018-11-23 15:47:34 +00:00
|
|
|
token = setError(ScannerError::IllegalToken);
|
2016-08-16 14:31:23 +00:00
|
|
|
}
|
2020-07-10 15:20:04 +00:00
|
|
|
else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
|
2020-07-02 16:39:04 +00:00
|
|
|
{
|
|
|
|
// reset
|
|
|
|
m = 0;
|
|
|
|
n = 0;
|
|
|
|
|
|
|
|
// Special quoted unicode string must follow
|
|
|
|
if (m_char == '"' || m_char == '\'')
|
|
|
|
token = scanString(true);
|
|
|
|
else
|
|
|
|
token = setError(ScannerError::IllegalToken);
|
|
|
|
}
|
2016-08-16 14:31:23 +00:00
|
|
|
}
|
2014-11-21 16:08:35 +00:00
|
|
|
else if (isDecimalDigit(m_char))
|
2014-11-05 07:40:21 +00:00
|
|
|
token = scanNumber();
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (skipWhitespace())
|
2015-02-09 13:00:12 +00:00
|
|
|
token = Token::Whitespace;
|
2014-10-16 12:08:54 +00:00
|
|
|
else if (isSourcePastEndOfInput())
|
|
|
|
token = Token::EOS;
|
|
|
|
else
|
2018-11-23 15:47:34 +00:00
|
|
|
token = selectErrorToken(ScannerError::IllegalToken);
|
2014-10-16 12:08:54 +00:00
|
|
|
break;
|
2014-10-09 10:28:37 +00:00
|
|
|
}
|
2014-10-16 12:08:54 +00:00
|
|
|
// Continue scanning for tokens as long as we're just skipping
|
|
|
|
// whitespace.
|
2014-10-09 10:28:37 +00:00
|
|
|
}
|
2015-02-09 13:00:12 +00:00
|
|
|
while (token == Token::Whitespace);
|
2020-06-02 13:45:03 +00:00
|
|
|
m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
|
2020-02-05 23:04:18 +00:00
|
|
|
m_tokens[NextNext].location.source = m_source;
|
2020-01-22 19:10:56 +00:00
|
|
|
m_tokens[NextNext].token = token;
|
|
|
|
m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n);
|
2014-10-06 15:13:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Scanner::scanEscape()
|
|
|
|
{
|
2014-10-09 10:28:37 +00:00
|
|
|
char c = m_char;
|
2019-08-05 09:53:24 +00:00
|
|
|
|
2014-10-09 10:28:37 +00:00
|
|
|
// Skip escaped newlines.
|
2019-08-05 09:53:24 +00:00
|
|
|
if (tryScanEndOfLine())
|
2014-10-09 10:28:37 +00:00
|
|
|
return true;
|
2019-08-05 09:53:24 +00:00
|
|
|
advance();
|
|
|
|
|
2014-10-16 12:08:54 +00:00
|
|
|
switch (c)
|
|
|
|
{
|
2014-10-09 10:28:37 +00:00
|
|
|
case '\'': // fall through
|
2014-10-16 21:49:45 +00:00
|
|
|
case '"': // fall through
|
2014-10-16 12:08:54 +00:00
|
|
|
case '\\':
|
|
|
|
break;
|
2014-10-16 21:49:45 +00:00
|
|
|
case 'n':
|
2014-10-16 12:08:54 +00:00
|
|
|
c = '\n';
|
|
|
|
break;
|
2014-10-16 21:49:45 +00:00
|
|
|
case 'r':
|
2014-10-16 12:08:54 +00:00
|
|
|
c = '\r';
|
|
|
|
break;
|
2014-10-16 21:49:45 +00:00
|
|
|
case 't':
|
2014-10-16 12:08:54 +00:00
|
|
|
c = '\t';
|
|
|
|
break;
|
2016-06-07 18:23:19 +00:00
|
|
|
case 'u':
|
|
|
|
{
|
2019-10-28 10:39:30 +00:00
|
|
|
if (auto const codepoint = scanUnicode(); codepoint.has_value())
|
2019-04-18 11:17:11 +00:00
|
|
|
addUnicodeAsUTF8(*codepoint);
|
|
|
|
else
|
2016-06-07 18:23:19 +00:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
2014-10-16 21:49:45 +00:00
|
|
|
case 'x':
|
2014-11-05 13:20:56 +00:00
|
|
|
if (!scanHexByte(c))
|
2014-10-20 12:00:37 +00:00
|
|
|
return false;
|
2014-10-09 10:28:37 +00:00
|
|
|
break;
|
2018-09-06 09:05:35 +00:00
|
|
|
default:
|
|
|
|
return false;
|
2014-10-09 10:28:37 +00:00
|
|
|
}
|
2014-10-16 21:49:45 +00:00
|
|
|
|
2014-10-09 10:28:37 +00:00
|
|
|
addLiteralChar(c);
|
|
|
|
return true;
|
2014-10-06 15:13:52 +00:00
|
|
|
}
|
|
|
|
|
2018-09-06 09:05:35 +00:00
|
|
|
bool Scanner::isUnicodeLinebreak()
|
|
|
|
{
|
|
|
|
if (0x0a <= m_char && m_char <= 0x0d)
|
|
|
|
// line feed, vertical tab, form feed, carriage return
|
|
|
|
return true;
|
2020-06-13 00:02:32 +00:00
|
|
|
if (!m_source->isPastEndOfInput(1) && uint8_t(m_source->get(0)) == 0xc2 && uint8_t(m_source->get(1)) == 0x85)
|
2018-09-06 09:05:35 +00:00
|
|
|
// NEL - U+0085, C2 85 in utf8
|
|
|
|
return true;
|
2020-06-13 00:02:32 +00:00
|
|
|
if (!m_source->isPastEndOfInput(2) && uint8_t(m_source->get(0)) == 0xe2 && uint8_t(m_source->get(1)) == 0x80 && (
|
2018-11-28 15:19:22 +00:00
|
|
|
uint8_t(m_source->get(2)) == 0xa8 || uint8_t(m_source->get(2)) == 0xa9
|
2018-09-06 09:05:35 +00:00
|
|
|
))
|
|
|
|
// LS - U+2028, E2 80 A8 in utf8
|
|
|
|
// PS - U+2029, E2 80 A9 in utf8
|
|
|
|
return true;
|
2020-06-13 00:02:32 +00:00
|
|
|
return false;
|
2018-09-06 09:05:35 +00:00
|
|
|
}
|
|
|
|
|
2020-07-02 16:39:04 +00:00
|
|
|
// Scans a quoted string literal; m_char must be the opening quote (' or ").
// Returns Token::UnicodeStringLiteral if _isUnicode is set and
// Token::StringLiteral otherwise. On failure the result of setError() is
// returned for: an invalid escape sequence, a character outside 0x20..0x7e in
// a non-unicode string, a missing closing quote, or a failed bidi-markup
// validation (unicode strings only).
Token Scanner::scanString(bool const _isUnicode)
{
	size_t literalStart = m_source->position();
	char const delimiter = m_char;
	advance(); // step over the opening quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);

	for (;;)
	{
		// The literal stops at the closing quote, at the end of the input or
		// at any line break.
		if (m_char == delimiter || isSourcePastEndOfInput() || isUnicodeLinebreak())
			break;

		char const current = m_char;
		advance();

		if (current == '\\')
		{
			if (isSourcePastEndOfInput() || !scanEscape())
				return setError(ScannerError::IllegalEscapeSequence);
			continue;
		}

		// Report error on non-printable characters in string literals, however
		// allow anything for unicode string literals, because their validity will
		// be verified later (in the syntax checker).
		//
		// A manual range is used instead of isprint() to avoid any potential
		// complications with locale.
		unsigned const code = static_cast<unsigned>(current);
		bool const isPrintableAscii = code > 0x1f && code < 0x7f;
		if (!_isUnicode && !isPrintableAscii)
			return setError(ScannerError::IllegalCharacterInString);
		addLiteralChar(current);
	}

	// The loop may also have ended because of end of input or a line break.
	if (m_char != delimiter)
		return setError(ScannerError::IllegalStringEndQuote);

	if (_isUnicode)
	{
		ScannerError const bidiError = validateBiDiMarkup(*m_source, literalStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);
	}

	literal.complete();
	advance(); // step over the closing quote

	if (_isUnicode)
		return Token::UnicodeStringLiteral;
	else
		return Token::StringLiteral;
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
// Scans the quoted part of a hex string literal; m_char must be the opening
// quote. The content consists of hex bytes (as accepted by scanHexByte()),
// optionally separated by single underscores. An underscore is rejected at the
// start, directly after another underscore and directly before the closing quote.
// Returns Token::HexStringLiteral or the result of setError() on failure.
Token Scanner::scanHexString()
{
	char const delimiter = m_char;
	advance(); // step over the opening quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);

	// Only true directly after a successfully scanned hex byte.
	bool separatorPermitted = false;
	while (m_char != delimiter && !isSourcePastEndOfInput())
	{
		char scanned = m_char;

		if (scanHexByte(scanned))
		{
			addLiteralChar(scanned);
			separatorPermitted = true;
			continue;
		}

		// Not a hex byte: the only other thing allowed here is a separator.
		if (scanned != '_')
			return setError(ScannerError::IllegalHexString);

		advance();
		if (m_char == delimiter || !separatorPermitted)
			return setError(ScannerError::IllegalNumberSeparator);
		separatorPermitted = false;
	}

	// The loop may also have ended because the input ran out.
	if (m_char != delimiter)
		return setError(ScannerError::IllegalStringEndQuote);

	literal.complete();
	advance(); // step over the closing quote
	return Token::HexStringLiteral;
}
|
|
|
|
|
2018-08-03 14:13:52 +00:00
|
|
|
// Parse for regex [:digit:]+(_[:digit:]+)*
|
2014-10-06 15:13:52 +00:00
|
|
|
void Scanner::scanDecimalDigits()
|
|
|
|
{
|
2018-08-03 14:13:52 +00:00
|
|
|
// MUST begin with a decimal digit.
|
|
|
|
if (!isDecimalDigit(m_char))
|
|
|
|
return;
|
2017-10-25 08:12:07 +00:00
|
|
|
|
2018-08-03 14:13:52 +00:00
|
|
|
// May continue with decimal digit or underscore for grouping.
|
2019-04-18 11:17:11 +00:00
|
|
|
do
|
|
|
|
addLiteralCharAndAdvance();
|
2018-11-28 15:19:22 +00:00
|
|
|
while (!m_source->isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
|
2017-10-25 08:12:07 +00:00
|
|
|
|
2018-08-03 14:13:52 +00:00
|
|
|
// Defer further validation of underscore to SyntaxChecker.
|
2014-10-06 15:13:52 +00:00
|
|
|
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
// Scans a number literal into the current literal buffer.
// _charSeen is either '.', meaning the caller has already consumed a leading
// decimal point, or 0 (asserted).
// Returns Token::Number, or the result of setError() for malformed input
// (underscore directly after a leading '.', "0x" without hex digit, a decimal
// digit directly after a leading '0', a malformed exponent, or a digit /
// identifier start directly following the number).
Token Scanner::scanNumber(char _charSeen)
{
	enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
	LiteralScope literal(this, LITERAL_TYPE_NUMBER);

	if (_charSeen != '.')
	{
		solAssert(_charSeen == 0, "");

		// A leading '0' is special: it may introduce a hex number and must not
		// be followed by another decimal digit.
		if (m_char == '0')
		{
			addLiteralCharAndAdvance();
			// What follows is either nothing, an exponent, a fraction or 'x'.
			if (m_char == 'x')
			{
				kind = HEX;
				addLiteralCharAndAdvance();
				// At least one hex digit is required after the 'x'.
				if (!isHexDigit(m_char))
					return setError(ScannerError::IllegalHexDigit);

				// Underscores are kept in the literal for later validation.
				while (isHexDigit(m_char) || m_char == '_')
					addLiteralCharAndAdvance();
			}
			else if (isDecimalDigit(m_char))
				// Octal numbers are not allowed.
				return setError(ScannerError::OctalNotAllowed);
		}

		// Integer part (possibly empty at this point) and optional fraction.
		if (kind == DECIMAL)
		{
			scanDecimalDigits();
			if (m_char == '.')
			{
				bool const underscoreAfterPoint = !m_source->isPastEndOfInput(1) && m_source->get(1) == '_';
				if (underscoreAfterPoint)
				{
					// Looks like a fraction that starts with '_'. Take the
					// '.', the '_' and any digits behind them into the literal.
					addLiteralCharAndAdvance(); // '.'
					addLiteralCharAndAdvance(); // '_'
					scanDecimalDigits();
				}

				bool const digitAfterCurrent = !m_source->isPastEndOfInput() && isDecimalDigit(m_source->get(1));
				if (!digitAfterCurrent)
				{
					// A '.' has to be followed by a number; the literal ends here.
					literal.complete();
					return Token::Number;
				}

				addLiteralCharAndAdvance();
				scanDecimalDigits();
			}
		}
	}
	else
	{
		// The decimal point of the number has already been consumed by the caller.
		addLiteralChar('.');
		if (m_char == '_')
			return setError(ScannerError::IllegalToken);
		scanDecimalDigits(); // the caller checked that a digit follows
	}

	// Optional exponent.
	bool const exponentAhead = m_char == 'e' || m_char == 'E';
	if (exponentAhead)
	{
		solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
		if (kind != DECIMAL)
			return setError(ScannerError::IllegalExponent);

		if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_')
		{
			// Recover from a wrongly placed underscore right after the exponent
			// marker by consuming up to the end of the digits.
			addLiteralCharAndAdvance(); // 'e'
			addLiteralCharAndAdvance(); // '_'
			scanDecimalDigits();
			literal.complete();
			return Token::Number;
		}

		addLiteralCharAndAdvance(); // 'e' | 'E'
		if (m_char == '+' || m_char == '-')
			addLiteralCharAndAdvance();
		// At least one decimal digit is required after 'e'/'E' and the sign.
		if (!isDecimalDigit(m_char))
			return setError(ScannerError::IllegalExponent);
		scanDecimalDigits();
	}

	// The source character immediately following a numeric literal must
	// not be an identifier start or a decimal digit; see ECMA-262
	// section 7.8.3, page 17 (note that we read only one decimal digit
	// if the value is 0).
	bool const badFollower = isDecimalDigit(m_char) || isIdentifierStart(m_char);
	if (badFollower)
		return setError(ScannerError::IllegalNumberEnd);

	literal.complete();
	return Token::Number;
}
|
|
|
|
|
2018-10-22 14:48:21 +00:00
|
|
|
// Scans an identifier or keyword starting at m_char (which must be an
// identifier start character, asserted) and classifies it via
// TokenTraits::fromIdentifierOrKeyword().
// When scanning Yul, '.' is accepted inside the name, "leave" is turned into
// Token::Leave, and every keyword that is not a Yul keyword is reported as a
// plain Token::Identifier.
tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
{
	solAssert(isIdentifierStart(m_char), "");
	LiteralScope literal(this, LITERAL_TYPE_STRING);

	// The first character is known to be valid; collect the remaining ones.
	addLiteralCharAndAdvance();
	for (;;)
	{
		bool const belongsToName = isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul);
		if (!belongsToName)
			break;
		addLiteralCharAndAdvance();
	}
	literal.complete();

	auto const& scannedText = m_tokens[NextNext].literal;
	auto const classified = TokenTraits::fromIdentifierOrKeyword(scannedText);

	// Outside of Yul the classification is used unchanged.
	if (m_kind != ScannerKind::Yul)
		return classified;

	// Turn Solidity identifier into a Yul keyword
	if (scannedText == "leave")
		return std::make_tuple(Token::Leave, 0, 0);

	// Turn non-Yul keywords into identifiers.
	if (!TokenTraits::isYulKeyword(std::get<0>(classified)))
		return std::make_tuple(Token::Identifier, 0, 0);

	return classified;
}
|
2019-12-11 16:31:36 +00:00
|
|
|
|
|
|
|
} // namespace solidity::langutil
|