2014-10-20 14:37:04 +00:00
/*
2019-02-13 15:56:46 +00:00
* This file is part of solidity .
*
* solidity is free software : you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation , either version 3 of the License , or
* ( at your option ) any later version .
*
* solidity is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with solidity . If not , see < http : //www.gnu.org/licenses/>.
*
* This file is derived from the file " scanner.cc " , which was part of the
* V8 project . The original copyright header follows :
*
* Copyright 2006 - 2012 , the V8 project authors . All rights reserved .
* Redistribution and use in source and binary forms , with or without
* modification , are permitted provided that the following conditions are
* met :
*
* * Redistributions of source code must retain the above copyright
* notice , this list of conditions and the following disclaimer .
* * Redistributions in binary form must reproduce the above
* copyright notice , this list of conditions and the following
* disclaimer in the documentation and / or other materials provided
* with the distribution .
* * Neither the name of Google Inc . nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission .
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* " AS IS " AND ANY EXPRESS OR IMPLIED WARRANTIES , INCLUDING , BUT NOT
* LIMITED TO , THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED . IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT , INDIRECT , INCIDENTAL ,
* SPECIAL , EXEMPLARY , OR CONSEQUENTIAL DAMAGES ( INCLUDING , BUT NOT
* LIMITED TO , PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES ; LOSS OF USE ,
* DATA , OR PROFITS ; OR BUSINESS INTERRUPTION ) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY , WHETHER IN CONTRACT , STRICT LIABILITY , OR TORT
* ( INCLUDING NEGLIGENCE OR OTHERWISE ) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE , EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE .
2014-10-20 14:37:04 +00:00
*/
/**
* @ author Christian < c @ ethdev . com >
* @ date 2014
* Solidity scanner .
*/
2014-10-06 15:13:52 +00:00
2019-02-21 00:04:34 +00:00
# include <liblangutil/Common.h>
2018-11-14 13:59:30 +00:00
# include <liblangutil/Exceptions.h>
# include <liblangutil/Scanner.h>
2019-10-28 10:39:30 +00:00
2020-11-18 13:35:16 +00:00
# include <boost/algorithm/string/classification.hpp>
2019-10-28 10:39:30 +00:00
# include <optional>
2020-11-18 13:35:16 +00:00
# include <string_view>
2018-11-14 16:11:55 +00:00
# include <tuple>
2021-09-16 14:33:28 +00:00
# include <array>
2014-10-06 15:13:52 +00:00
2014-10-24 17:06:30 +00:00
2021-05-27 15:41:04 +00:00
namespace solidity : : langutil
{
2019-12-11 16:31:36 +00:00
2023-07-12 08:05:47 +00:00
/// Maps a ScannerError code to its human-readable diagnostic message.
/// Each enumerator listed in the switch returns a fixed message; any other
/// value reaches the default branch, which trips solAssert.
std : : string to_string ( ScannerError _errorCode )
2018-11-23 15:47:34 +00:00
{
switch ( _errorCode )
{
case ScannerError : : NoError : return " No error. " ;
case ScannerError : : IllegalToken : return " Invalid token. " ;
2019-09-06 13:29:51 +00:00
case ScannerError : : IllegalHexString : return " Expected even number of hex-nibbles. " ;
2018-11-23 15:47:34 +00:00
case ScannerError : : IllegalHexDigit : return " Hexadecimal digit missing or invalid. " ;
case ScannerError : : IllegalCommentTerminator : return " Expected multi-line comment-terminator. " ;
case ScannerError : : IllegalEscapeSequence : return " Invalid escape sequence. " ;
2022-05-15 13:24:23 +00:00
case ScannerError : : UnicodeCharacterInNonUnicodeString : return " Invalid character in string. If you are trying to use Unicode characters, use a unicode \" ... \" string literal. " ;
2020-07-15 14:06:43 +00:00
case ScannerError : : IllegalCharacterInString : return " Invalid character in string. " ;
2018-11-23 15:47:34 +00:00
case ScannerError : : IllegalStringEndQuote : return " Expected string end-quote. " ;
case ScannerError : : IllegalNumberSeparator : return " Invalid use of number separator '_'. " ;
case ScannerError : : IllegalExponent : return " Invalid exponent. " ;
case ScannerError : : IllegalNumberEnd : return " Identifier-start is not allowed at end of a number. " ;
case ScannerError : : OctalNotAllowed : return " Octal numbers not allowed. " ;
2020-11-18 13:35:16 +00:00
case ScannerError : : DirectionalOverrideUnderflow : return " Unicode direction override underflow in comment or string literal. " ;
case ScannerError : : DirectionalOverrideMismatch : return " Mismatching directional override markers in comment or string literal. " ;
2018-11-23 15:47:34 +00:00
default :
// Not one of the codes above: assert, then return an empty string.
solAssert ( false , " Unhandled case in to_string(ScannerError) " ) ;
return " " ;
}
}
2014-11-30 22:25:42 +00:00
2019-04-18 11:17:11 +00:00
2023-07-12 08:05:47 +00:00
std : : ostream & operator < < ( std : : ostream & os , ScannerError _errorCode )
2018-11-23 15:47:34 +00:00
{
2019-04-18 11:17:11 +00:00
return os < < to_string ( _errorCode ) ;
2018-11-23 15:47:34 +00:00
}
2014-11-30 22:25:42 +00:00
/// Scoped helper for literal recording. Automatically drops the literal
/// if aborting the scanning before it's complete.
2019-04-18 11:17:11 +00:00
enum LiteralType
{
2014-11-30 22:25:42 +00:00
LITERAL_TYPE_STRING ,
LITERAL_TYPE_NUMBER , // not really different from string type in behaviour
LITERAL_TYPE_COMMENT
} ;
class LiteralScope
{
public :
2019-04-18 11:17:11 +00:00
explicit LiteralScope ( Scanner * _self , enum LiteralType _type ) :
m_type ( _type ) ,
m_scanner ( _self ) ,
m_complete ( false )
2014-11-30 22:25:42 +00:00
{
if ( _type = = LITERAL_TYPE_COMMENT )
2020-01-22 19:10:56 +00:00
m_scanner - > m_skippedComments [ Scanner : : NextNext ] . literal . clear ( ) ;
2014-11-30 22:25:42 +00:00
else
2020-01-22 19:10:56 +00:00
m_scanner - > m_tokens [ Scanner : : NextNext ] . literal . clear ( ) ;
2014-11-30 22:25:42 +00:00
}
~ LiteralScope ( )
{
if ( ! m_complete )
{
if ( m_type = = LITERAL_TYPE_COMMENT )
2020-01-22 19:10:56 +00:00
m_scanner - > m_skippedComments [ Scanner : : NextNext ] . literal . clear ( ) ;
2014-11-30 22:25:42 +00:00
else
2020-01-22 19:10:56 +00:00
m_scanner - > m_tokens [ Scanner : : NextNext ] . literal . clear ( ) ;
2014-11-30 22:25:42 +00:00
}
}
void complete ( ) { m_complete = true ; }
private :
enum LiteralType m_type ;
Scanner * m_scanner ;
bool m_complete ;
2019-04-18 11:17:11 +00:00
} ;
2014-11-30 22:25:42 +00:00
2014-12-03 16:45:12 +00:00
void Scanner : : reset ( )
{
2021-07-14 10:53:39 +00:00
m_source . reset ( ) ;
2020-07-10 15:05:52 +00:00
m_kind = ScannerKind : : Solidity ;
2021-07-14 10:53:39 +00:00
m_char = m_source . get ( ) ;
2014-10-09 10:28:37 +00:00
skipWhitespace ( ) ;
2014-12-01 00:05:55 +00:00
next ( ) ;
2019-04-24 11:16:43 +00:00
next ( ) ;
2020-01-22 19:10:56 +00:00
next ( ) ;
2019-04-24 11:16:43 +00:00
}
2019-05-27 14:13:27 +00:00
void Scanner : : setPosition ( size_t _offset )
{
2021-07-14 10:53:39 +00:00
m_char = m_source . setPosition ( _offset ) ;
2019-05-27 14:13:27 +00:00
scanToken ( ) ;
next ( ) ;
2020-01-22 19:10:56 +00:00
next ( ) ;
2019-05-27 14:13:27 +00:00
}
2014-11-05 13:20:56 +00:00
bool Scanner : : scanHexByte ( char & o_scannedByte )
2014-10-06 15:13:52 +00:00
{
2014-10-09 10:28:37 +00:00
char x = 0 ;
2020-06-02 13:45:03 +00:00
for ( size_t i = 0 ; i < 2 ; i + + )
2014-10-16 12:08:54 +00:00
{
2014-11-21 16:08:35 +00:00
int d = hexValue ( m_char ) ;
2014-10-16 12:08:54 +00:00
if ( d < 0 )
{
2014-10-09 10:28:37 +00:00
rollback ( i ) ;
return false ;
}
2020-06-05 12:30:57 +00:00
x = static_cast < char > ( x * 16 + d ) ;
2014-10-09 10:28:37 +00:00
advance ( ) ;
}
2014-11-05 13:20:56 +00:00
o_scannedByte = x ;
2014-10-09 10:28:37 +00:00
return true ;
2014-10-06 15:13:52 +00:00
}
2019-10-28 10:39:30 +00:00
std : : optional < unsigned > Scanner : : scanUnicode ( )
2016-06-07 18:23:19 +00:00
{
unsigned x = 0 ;
2020-06-02 13:45:03 +00:00
for ( size_t i = 0 ; i < 4 ; i + + )
2016-06-07 18:23:19 +00:00
{
int d = hexValue ( m_char ) ;
if ( d < 0 )
{
rollback ( i ) ;
2019-04-18 11:17:11 +00:00
return { } ;
2016-06-07 18:23:19 +00:00
}
2020-06-05 12:30:57 +00:00
x = x * 16 + static_cast < unsigned > ( d ) ;
2016-06-07 18:23:19 +00:00
advance ( ) ;
}
2019-04-18 11:17:11 +00:00
return x ;
2016-06-07 18:23:19 +00:00
}
// This supports codepoints between 0000 and FFFF.
2016-08-01 13:10:46 +00:00
void Scanner : : addUnicodeAsUTF8 ( unsigned codepoint )
2016-06-07 18:23:19 +00:00
{
if ( codepoint < = 0x7f )
2020-06-02 13:45:03 +00:00
addLiteralChar ( char ( codepoint ) ) ;
2016-06-07 18:23:19 +00:00
else if ( codepoint < = 0x7ff )
{
2020-06-02 13:45:03 +00:00
addLiteralChar ( char ( 0xc0u | ( codepoint > > 6u ) ) ) ;
addLiteralChar ( char ( 0x80u | ( codepoint & 0x3fu ) ) ) ;
2016-06-07 18:23:19 +00:00
}
else
{
2020-06-02 13:45:03 +00:00
addLiteralChar ( char ( 0xe0u | ( codepoint > > 12u ) ) ) ;
addLiteralChar ( char ( 0x80u | ( ( codepoint > > 6u ) & 0x3fu ) ) ) ;
addLiteralChar ( char ( 0x80u | ( codepoint & 0x3fu ) ) ) ;
2016-06-07 18:23:19 +00:00
}
}
2014-10-06 15:13:52 +00:00
2019-04-24 11:16:43 +00:00
void Scanner : : rescan ( )
{
size_t rollbackTo = 0 ;
2020-01-22 19:10:56 +00:00
if ( m_skippedComments [ Current ] . literal . empty ( ) )
2020-06-02 13:45:03 +00:00
rollbackTo = static_cast < size_t > ( m_tokens [ Current ] . location . start ) ;
2019-04-24 11:16:43 +00:00
else
2020-06-02 13:45:03 +00:00
rollbackTo = static_cast < size_t > ( m_skippedComments [ Current ] . location . start ) ;
2021-07-14 10:53:39 +00:00
m_char = m_source . rollback ( m_source . position ( ) - rollbackTo ) ;
2019-04-24 11:16:43 +00:00
next ( ) ;
next ( ) ;
2020-01-22 19:10:56 +00:00
next ( ) ;
2019-04-24 11:16:43 +00:00
}
2014-10-06 15:13:52 +00:00
// Ensure that tokens can be stored in a byte.
2018-10-22 14:48:21 +00:00
BOOST_STATIC_ASSERT ( TokenTraits : : count ( ) < = 0x100 ) ;
2014-10-06 15:13:52 +00:00
2018-10-22 14:48:21 +00:00
/// Advances the scanner by one token: shifts the token and skipped-comment
/// lookahead slots down by one (Next -> Current, NextNext -> Next) and scans a
/// fresh token via scanToken().
/// @returns the token that is now current.
Token Scanner::next()
{
	auto const shiftLookahead = [](auto& _slots) {
		_slots[Current] = std::move(_slots[Next]);
		_slots[Next] = std::move(_slots[NextNext]);
	};

	shiftLookahead(m_tokens);
	shiftLookahead(m_skippedComments);

	scanToken();
	return m_tokens[Current].token;
}
2018-10-22 14:48:21 +00:00
/// Consumes the current character and then decides between two tokens based
/// on the character that follows.
/// @returns selectToken(_then) if the following character equals @a _next,
/// otherwise @a _else (the following character is then left unconsumed by this function).
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
	advance();
	if (m_char != _next)
		return _else;
	return selectToken(_then);
}
2014-10-06 15:13:52 +00:00
bool Scanner : : skipWhitespace ( )
{
2020-06-02 13:45:03 +00:00
size_t const startPosition = sourcePos ( ) ;
2014-11-21 16:08:35 +00:00
while ( isWhiteSpace ( m_char ) )
2014-10-16 12:08:54 +00:00
advance ( ) ;
2014-10-09 10:28:37 +00:00
// Return whether or not we skipped any characters.
2015-08-31 16:44:29 +00:00
return sourcePos ( ) ! = startPosition ;
2014-10-06 15:13:52 +00:00
}
2020-04-28 08:34:07 +00:00
bool Scanner : : skipWhitespaceExceptUnicodeLinebreak ( )
2014-12-18 12:27:25 +00:00
{
2020-06-02 13:45:03 +00:00
size_t const startPosition = sourcePos ( ) ;
2018-09-06 09:05:35 +00:00
while ( isWhiteSpace ( m_char ) & & ! isUnicodeLinebreak ( ) )
2014-12-18 12:27:25 +00:00
advance ( ) ;
2020-04-28 08:34:07 +00:00
// Return whether or not we skipped any characters.
return sourcePos ( ) ! = startPosition ;
2014-12-18 12:27:25 +00:00
}
2020-11-18 13:35:16 +00:00
namespace
{

/// Scans the bytes in [_startPosition, _stream.position()) for the UTF-8 encodings of
/// LRO/RLO/LRE/RLE/PDF and keeps track of the script writing direction override depth.
///
/// The stream position is moved while scanning and is restored to its value on entry
/// before returning, on success as well as on every error path.
///
/// @returns ScannerError::NoError if all directional markers are properly paired,
/// ScannerError::DirectionalOverrideUnderflow if a PDF occurs while no override/embedding is open,
/// and ScannerError::DirectionalOverrideMismatch if overrides/embeddings are still open at the end.
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
{
	static std::array<std::pair<std::string_view, int>, 5> constexpr directionalSequences{
		std::pair<std::string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
		std::pair<std::string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
		std::pair<std::string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
		std::pair<std::string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
		std::pair<std::string_view, int>{"\xE2\x80\xAC", -1} // U+202C (PDF - Pop Directional Formatting)
	};

	size_t const endPosition = _stream.position();
	ScannerError result = ScannerError::NoError;
	int directionOverrideDepth = 0;

	for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
	{
		_stream.setPosition(currentPos);
		for (auto const& [sequence, depthChange]: directionalSequences)
			if (_stream.prefixMatch(sequence))
				directionOverrideDepth += depthChange;

		if (directionOverrideDepth < 0)
		{
			// A PDF without a matching opener. Do not return from here directly:
			// the stream must be put back to endPosition first.
			result = ScannerError::DirectionalOverrideUnderflow;
			break;
		}
	}

	if (result == ScannerError::NoError && directionOverrideDepth > 0)
		result = ScannerError::DirectionalOverrideMismatch;

	// Always leave the stream where the caller had it; the caller's current
	// character corresponds to this position.
	_stream.setPosition(endPosition);
	return result;
}

}
2018-10-22 14:48:21 +00:00
/// Skips the remainder of a single-line comment and checks the skipped range
/// for unbalanced Unicode directional markers.
///
/// The line terminator is not part of the comment and is left unconsumed. If
/// it is a non-ASCII line terminator, it will result in a parser error.
///
/// @returns Token::Whitespace, or the result of setError() if
/// validateBiDiMarkup() reports a problem in the comment.
Token Scanner::skipSingleLineComment()
{
	size_t const commentStart = m_source.position();

	// Stop in front of a line break, or when advance() reports failure.
	while (!isUnicodeLinebreak() && advance())
	{
	}

	ScannerError const bidiStatus = validateBiDiMarkup(m_source, commentStart);
	if (bidiStatus == ScannerError::NoError)
		return Token::Whitespace;
	return setError(bidiStatus);
}
2019-08-05 09:53:24 +00:00
/// @returns true if the current character (m_char) is a line feed or a carriage return.
/// Does not consume anything.
bool Scanner : : atEndOfLine ( ) const
{
return m_char = = ' \n ' | | m_char = = ' \r ' ;
}
/// Consumes one line ending if the scanner is currently at one.
/// Recognised forms: a line feed, a carriage return, or a carriage return
/// immediately followed by a line feed (both consumed together).
/// @returns true if a line ending was consumed, false (nothing consumed) otherwise.
bool Scanner : : tryScanEndOfLine ( )
{
if ( m_char = = ' \n ' )
{
advance ( ) ;
return true ;
}
if ( m_char = = ' \r ' )
{
// Treat CR LF as a single line ending; a lone CR also counts.
if ( advance ( ) & & m_char = = ' \n ' )
advance ( ) ;
return true ;
}
return false ;
}
2020-06-02 13:45:03 +00:00
/// Records the text of a single-line documentation comment into the comment
/// literal, merging directly following documentation-comment lines into it.
///
/// After each consumed line ending the next line is inspected: if it starts
/// with exactly three slashes (a fourth slash ends the comment) the three
/// slashes are skipped and scanning continues; otherwise the comment is done.
/// A newline character is added between merged lines, except when the
/// continued line is empty.
///
/// @returns the source position recorded as the end of the comment (see the
/// endPosition updates below).
size_t Scanner : : scanSingleLineDocComment ( )
2014-11-18 17:50:40 +00:00
{
2014-11-27 17:57:50 +00:00
LiteralScope literal ( this , LITERAL_TYPE_COMMENT ) ;
2021-07-14 10:53:39 +00:00
size_t endPosition = m_source . position ( ) ;
2018-09-06 09:05:35 +00:00
skipWhitespaceExceptUnicodeLinebreak ( ) ;
2014-11-27 17:57:50 +00:00
while ( ! isSourcePastEndOfInput ( ) )
2014-11-18 17:50:40 +00:00
{
2021-07-14 10:53:39 +00:00
// Position in front of the character (or line ending) handled in this iteration.
endPosition = m_source . position ( ) ;
2019-08-05 09:53:24 +00:00
if ( tryScanEndOfLine ( ) )
2014-11-27 17:57:50 +00:00
{
2020-01-25 16:53:48 +00:00
// Check if next line is also a single-line comment.
// If any whitespaces were skipped, use source position before.
2020-04-28 08:34:07 +00:00
if ( ! skipWhitespaceExceptUnicodeLinebreak ( ) )
2021-07-14 10:53:39 +00:00
endPosition = m_source . position ( ) ;
2020-01-25 16:53:48 +00:00
2021-07-14 10:53:39 +00:00
if ( ! m_source . isPastEndOfInput ( 3 ) & &
m_source . get ( 0 ) = = ' / ' & &
m_source . get ( 1 ) = = ' / ' & &
m_source . get ( 2 ) = = ' / ' )
2014-11-27 17:57:50 +00:00
{
2021-07-14 10:53:39 +00:00
if ( ! m_source . isPastEndOfInput ( 4 ) & & m_source . get ( 3 ) = = ' / ' )
2020-06-13 00:02:32 +00:00
break ; // "////" is not a documentation comment
2021-07-14 10:53:39 +00:00
// Skip the three slashes introducing the continuation line.
m_char = m_source . advanceAndGet ( 3 ) ;
2020-04-28 08:34:07 +00:00
// An empty continuation line contributes nothing, not even a newline.
if ( atEndOfLine ( ) )
continue ;
addCommentLiteralChar ( ' \n ' ) ;
2014-11-27 17:57:50 +00:00
}
else
break ; // next line is not a documentation comment, we are done
}
2018-09-06 09:05:35 +00:00
else if ( isUnicodeLinebreak ( ) )
// Any line terminator that is not '\n' is considered to end the
// comment.
break ;
2014-11-21 08:09:39 +00:00
addCommentLiteralChar ( m_char ) ;
2014-11-18 17:50:40 +00:00
advance ( ) ;
}
2014-11-21 16:08:35 +00:00
// Keep the recorded literal; without this the LiteralScope destructor would clear it.
literal . complete ( ) ;
2020-01-25 16:53:48 +00:00
return endPosition ;
2014-11-18 17:50:40 +00:00
}
2018-10-22 14:48:21 +00:00
/// Skips a (non-documentation) multi-line comment up to and including its
/// terminator and validates the Unicode directional markers inside it.
///
/// @returns Token::Whitespace on success; the result of setError() with the
/// error reported by validateBiDiMarkup() if the markers are unbalanced, or
/// with ScannerError::IllegalCommentTerminator if the input ends before the
/// terminator is found.
Token Scanner : : skipMultiLineComment ( )
2014-10-06 15:13:52 +00:00
{
2021-07-14 10:53:39 +00:00
size_t startPosition = m_source . position ( ) ;
2014-10-16 12:08:54 +00:00
while ( ! isSourcePastEndOfInput ( ) )
{
2020-11-18 13:35:16 +00:00
// Remember the previous character so the two-character terminator can be detected.
char prevChar = m_char ;
2014-10-09 10:28:37 +00:00
advance ( ) ;
2014-10-16 21:49:45 +00:00
2014-10-09 10:28:37 +00:00
// If we have reached the end of the multi-line comment, we
// consume the '/' and insert a whitespace. This way all
// multi-line comments are treated as whitespace.
2020-11-18 13:35:16 +00:00
if ( prevChar = = ' * ' & & m_char = = ' / ' )
2014-10-16 12:08:54 +00:00
{
2021-07-14 10:53:39 +00:00
ScannerError unicodeDirectionError = validateBiDiMarkup ( m_source , startPosition ) ;
2020-11-18 13:35:16 +00:00
if ( unicodeDirectionError ! = ScannerError : : NoError )
return setError ( unicodeDirectionError ) ;
2014-10-09 10:28:37 +00:00
m_char = ' ' ;
2015-02-09 13:00:12 +00:00
return Token : : Whitespace ;
2014-10-09 10:28:37 +00:00
}
}
// Unterminated multi-line comment.
2018-11-23 15:47:34 +00:00
return setError ( ScannerError : : IllegalCommentTerminator ) ;
2014-10-06 15:13:52 +00:00
}
2018-10-22 14:48:21 +00:00
/// Records the text of a multi-line documentation comment into the comment
/// literal, up to and including the comment terminator.
///
/// Leading whitespace on the first line is skipped. At each line ending the
/// following whitespace is skipped and a single leading star on the new line
/// is dropped; lines are joined with a newline character once at least one
/// character has been recorded.
///
/// @returns Token::CommentLiteral if the terminator was found, otherwise the
/// result of setError(ScannerError::IllegalCommentTerminator).
Token Scanner : : scanMultiLineDocComment ( )
2014-12-17 17:53:18 +00:00
{
LiteralScope literal ( this , LITERAL_TYPE_COMMENT ) ;
bool endFound = false ;
2014-12-18 12:27:25 +00:00
// Becomes true once the first character has been added to the literal.
bool charsAdded = false ;
2014-12-17 17:53:18 +00:00
2019-08-05 09:53:24 +00:00
while ( isWhiteSpace ( m_char ) & & ! atEndOfLine ( ) )
2018-09-06 09:05:35 +00:00
advance ( ) ;
2014-12-17 17:53:18 +00:00
while ( ! isSourcePastEndOfInput ( ) )
{
2022-08-18 11:43:16 +00:00
// handle newlines in multiline comments
2019-08-05 09:53:24 +00:00
if ( atEndOfLine ( ) )
2014-12-17 17:53:18 +00:00
{
skipWhitespace ( ) ;
2021-07-14 10:53:39 +00:00
if ( ! m_source . isPastEndOfInput ( 1 ) & & m_source . get ( 0 ) = = ' * ' & & m_source . get ( 1 ) = = ' * ' )
2016-11-30 16:28:07 +00:00
{ // it is unknown if this leads to the end of the comment
addCommentLiteralChar ( ' * ' ) ;
advance ( ) ;
}
2021-07-14 10:53:39 +00:00
else if ( ! m_source . isPastEndOfInput ( 1 ) & & m_source . get ( 0 ) = = ' * ' & & m_source . get ( 1 ) ! = ' / ' )
2014-12-18 12:27:25 +00:00
{ // skip first '*' in subsequent lines
2021-07-14 10:53:39 +00:00
m_char = m_source . advanceAndGet ( 1 ) ;
2020-04-28 08:34:07 +00:00
if ( atEndOfLine ( ) ) // ignores empty lines
continue ;
2014-12-18 12:27:25 +00:00
if ( charsAdded )
2020-04-28 08:34:07 +00:00
addCommentLiteralChar ( ' \n ' ) ; // corresponds to the end of previous line
2014-12-18 12:27:25 +00:00
}
2021-07-14 10:53:39 +00:00
else if ( ! m_source . isPastEndOfInput ( 1 ) & & m_source . get ( 0 ) = = ' * ' & & m_source . get ( 1 ) = = ' / ' )
2014-12-18 12:27:25 +00:00
{ // if after newline the comment ends, don't insert the newline
2021-07-14 10:53:39 +00:00
m_char = m_source . advanceAndGet ( 2 ) ;
2014-12-18 12:27:25 +00:00
endFound = true ;
break ;
2014-12-17 17:53:18 +00:00
}
2014-12-18 15:48:25 +00:00
else if ( charsAdded )
2014-12-17 17:53:18 +00:00
addCommentLiteralChar ( ' \n ' ) ;
}
2021-07-14 10:53:39 +00:00
// Terminator in the middle of a line: consume both characters and stop.
if ( ! m_source . isPastEndOfInput ( 1 ) & & m_source . get ( 0 ) = = ' * ' & & m_source . get ( 1 ) = = ' / ' )
2014-12-17 17:53:18 +00:00
{
2021-07-14 10:53:39 +00:00
m_char = m_source . advanceAndGet ( 2 ) ;
2014-12-17 17:53:18 +00:00
endFound = true ;
break ;
}
addCommentLiteralChar ( m_char ) ;
2014-12-18 12:27:25 +00:00
charsAdded = true ;
2014-12-17 17:53:18 +00:00
advance ( ) ;
}
// The literal is kept in both outcomes, including the unterminated case.
literal . complete ( ) ;
if ( ! endFound )
2018-11-23 15:47:34 +00:00
return setError ( ScannerError : : IllegalCommentTerminator ) ;
2014-12-17 17:53:18 +00:00
else
2015-02-09 13:00:12 +00:00
return Token : : CommentLiteral ;
2014-12-17 17:53:18 +00:00
}
2018-10-22 14:48:21 +00:00
/// Scans everything that can start with a slash: single-line comments,
/// multi-line comments, documentation comments of both kinds, the
/// divide-assign operator and the plain division operator.
///
/// Documentation comments are stored in m_skippedComments[NextNext] (location,
/// source name and token) while the function itself reports Token::Whitespace
/// for them, as it does for ordinary comments.
///
/// @returns Token::Whitespace for any successfully skipped or recorded
/// comment, Token::AssignDiv / Token::Div for the operators, and an error
/// token (from setError(), skipSingleLineComment() or skipMultiLineComment())
/// if a comment is malformed.
Token Scanner : : scanSlash ( )
2014-12-18 16:30:10 +00:00
{
2020-06-02 13:45:03 +00:00
// Start offset of the whole construct, used as the start of a documentation comment.
int firstSlashPosition = static_cast < int > ( sourcePos ( ) ) ;
2014-12-18 16:30:10 +00:00
advance ( ) ;
if ( m_char = = ' / ' )
{
if ( ! advance ( ) ) /* double slash comment directly before EOS */
2018-09-06 09:05:35 +00:00
return Token : : Whitespace ;
2014-12-18 16:30:10 +00:00
else if ( m_char = = ' / ' )
{
2020-06-13 00:02:32 +00:00
advance ( ) ; //consume the last '/' at ///
// "////"
if ( m_char = = ' / ' )
return skipSingleLineComment ( ) ;
2014-12-18 16:30:10 +00:00
// doxygen style /// comment
2020-01-22 19:10:56 +00:00
m_skippedComments [ NextNext ] . location . start = firstSlashPosition ;
2021-06-29 12:38:59 +00:00
m_skippedComments [ NextNext ] . location . sourceName = m_sourceName ;
2020-01-25 16:53:48 +00:00
m_skippedComments [ NextNext ] . token = Token : : CommentLiteral ;
2020-06-02 13:45:03 +00:00
m_skippedComments [ NextNext ] . location . end = static_cast < int > ( scanSingleLineDocComment ( ) ) ;
2015-02-09 13:00:12 +00:00
return Token : : Whitespace ;
2014-12-18 16:30:10 +00:00
}
else
return skipSingleLineComment ( ) ;
}
else if ( m_char = = ' * ' )
{
// doxygen style /** natspec comment
if ( ! advance ( ) ) /* slash star comment before EOS */
2018-11-23 15:47:34 +00:00
return setError ( ScannerError : : IllegalCommentTerminator ) ;
2014-12-18 16:30:10 +00:00
else if ( m_char = = ' * ' )
{
2015-01-05 15:37:43 +00:00
advance ( ) ; //consume the last '*' at /**
2018-09-06 09:05:35 +00:00
// "/**/"
if ( m_char = = ' / ' )
2015-01-05 15:37:43 +00:00
{
2018-09-06 09:05:35 +00:00
advance ( ) ; //skip the closing slash
return Token : : Whitespace ;
2015-01-05 15:37:43 +00:00
}
2020-06-13 00:02:32 +00:00
// "/***"
if ( m_char = = ' * ' )
// "/***/" may be interpreted as empty natspec or skipped; skipping is simpler
return skipMultiLineComment ( ) ;
2018-09-06 09:05:35 +00:00
// we actually have a multiline documentation comment
2020-01-22 19:10:56 +00:00
m_skippedComments [ NextNext ] . location . start = firstSlashPosition ;
2021-06-29 12:38:59 +00:00
m_skippedComments [ NextNext ] . location . sourceName = m_sourceName ;
2020-06-13 00:02:32 +00:00
Token comment = scanMultiLineDocComment ( ) ;
2020-06-02 13:45:03 +00:00
m_skippedComments [ NextNext ] . location . end = static_cast < int > ( sourcePos ( ) ) ;
2020-01-22 19:10:56 +00:00
m_skippedComments [ NextNext ] . token = comment ;
2018-11-23 15:47:34 +00:00
if ( comment = = Token : : Illegal )
return Token : : Illegal ; // error already set
2018-09-06 09:05:35 +00:00
else
return Token : : Whitespace ;
2014-12-18 16:30:10 +00:00
}
else
return skipMultiLineComment ( ) ;
}
else if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
return selectToken ( Token : : AssignDiv ) ;
2014-12-18 16:30:10 +00:00
else
2015-02-09 13:00:12 +00:00
return Token : : Div ;
2014-12-18 16:30:10 +00:00
}
2014-11-30 21:43:40 +00:00
/// Scans the next token from the source into the NextNext lookahead slot.
///
/// Clears m_tokens[NextNext] and m_skippedComments[NextNext], then dispatches
/// on the current character. The dispatch is repeated for as long as the
/// result is Token::Whitespace, so whitespace and comments never surface as
/// tokens. Finally the token, its location (start, end, source name) and the
/// extended token info (m, n) are stored in m_tokens[NextNext].
void Scanner : : scanToken ( )
2014-10-06 15:13:52 +00:00
{
2020-01-22 19:10:56 +00:00
m_tokens [ NextNext ] = { } ;
m_skippedComments [ NextNext ] = { } ;
2016-02-08 21:43:22 +00:00
2018-10-22 14:48:21 +00:00
Token token ;
2016-02-12 21:01:27 +00:00
// M and N are for the purposes of grabbing different type sizes.
// They are filled by scanIdentifierOrKeyword() and stored as extendedTokenInfo below.
2022-07-01 10:09:08 +00:00
unsigned m = 0 ;
unsigned n = 0 ;
2014-10-16 12:08:54 +00:00
do
{
// Remember the position of the next token
2020-06-02 13:45:03 +00:00
m_tokens [ NextNext ] . location . start = static_cast < int > ( sourcePos ( ) ) ;
2014-10-16 12:08:54 +00:00
switch ( m_char )
{
case ' " ' :
case ' \' ' :
2020-07-02 16:39:04 +00:00
token = scanString ( false ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' < ' :
// < <= << <<=
advance ( ) ;
if ( m_char = = ' = ' )
2015-02-10 08:52:19 +00:00
token = selectToken ( Token : : LessThanOrEqual ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' < ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( ' = ' , Token : : AssignShl , Token : : SHL ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : LessThan ;
2014-10-16 12:08:54 +00:00
break ;
case ' > ' :
// > >= >> >>= >>> >>>=
advance ( ) ;
if ( m_char = = ' = ' )
2015-02-10 08:52:19 +00:00
token = selectToken ( Token : : GreaterThanOrEqual ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' > ' )
{
// >> >>= >>> >>>=
advance ( ) ;
if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : AssignSar ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' > ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( ' = ' , Token : : AssignShr , Token : : SHR ) ;
2014-10-16 12:08:54 +00:00
else
token = Token : : SAR ;
}
else
2015-02-09 13:00:12 +00:00
token = Token : : GreaterThan ;
2014-10-16 12:08:54 +00:00
break ;
case ' = ' :
// = == =>
advance ( ) ;
if ( m_char = = ' = ' )
2015-02-10 08:52:19 +00:00
token = selectToken ( Token : : Equal ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' > ' )
2020-08-27 10:42:00 +00:00
token = selectToken ( Token : : DoubleArrow ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : Assign ;
2014-10-16 12:08:54 +00:00
break ;
case ' ! ' :
// ! !=
advance ( ) ;
if ( m_char = = ' = ' )
2015-02-10 08:52:19 +00:00
token = selectToken ( Token : : NotEqual ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : Not ;
2014-10-16 12:08:54 +00:00
break ;
case ' + ' :
// + ++ +=
advance ( ) ;
if ( m_char = = ' + ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Inc ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : AssignAdd ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : Add ;
2014-10-16 12:08:54 +00:00
break ;
case ' - ' :
2020-08-12 17:56:24 +00:00
// - -- -= ->
2014-10-16 12:08:54 +00:00
advance ( ) ;
if ( m_char = = ' - ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Dec ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : AssignSub ) ;
2020-08-12 17:56:24 +00:00
else if ( m_char = = ' > ' )
token = selectToken ( Token : : RightArrow ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : Sub ;
2014-10-16 12:08:54 +00:00
break ;
case ' * ' :
2015-02-08 11:23:17 +00:00
// * ** *=
advance ( ) ;
if ( m_char = = ' * ' )
token = selectToken ( Token : : Exp ) ;
else if ( m_char = = ' = ' )
token = selectToken ( Token : : AssignMul ) ;
else
token = Token : : Mul ;
2014-10-16 12:08:54 +00:00
break ;
case ' % ' :
// % %=
2015-02-09 13:00:12 +00:00
token = selectToken ( ' = ' , Token : : AssignMod , Token : : Mod ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' / ' :
// / // /* /=
2014-12-18 16:30:10 +00:00
token = scanSlash ( ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' & ' :
// & && &=
advance ( ) ;
if ( m_char = = ' & ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : And ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : AssignBitAnd ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : BitAnd ;
2014-10-16 12:08:54 +00:00
break ;
case ' | ' :
// | || |=
advance ( ) ;
if ( m_char = = ' | ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Or ) ;
2014-10-16 12:08:54 +00:00
else if ( m_char = = ' = ' )
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : AssignBitOr ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : BitOr ;
2014-10-16 12:08:54 +00:00
break ;
case ' ^ ' :
// ^ ^=
2015-02-09 13:00:12 +00:00
token = selectToken ( ' = ' , Token : : AssignBitXor , Token : : BitXor ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' . ' :
// . Number
advance ( ) ;
2014-11-21 16:08:35 +00:00
if ( isDecimalDigit ( m_char ) )
2014-11-05 07:40:21 +00:00
token = scanNumber ( ' . ' ) ;
2014-10-16 12:08:54 +00:00
else
2015-02-09 13:00:12 +00:00
token = Token : : Period ;
2014-10-16 12:08:54 +00:00
break ;
case ' : ' :
2019-02-18 14:07:15 +00:00
// : :=
advance ( ) ;
if ( m_char = = ' = ' )
token = selectToken ( Token : : AssemblyAssign ) ;
else
token = Token : : Colon ;
2014-10-16 12:08:54 +00:00
break ;
case ' ; ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Semicolon ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' , ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Comma ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' ( ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : LParen ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' ) ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : RParen ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' [ ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : LBrack ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' ] ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : RBrack ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' { ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : LBrace ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' } ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : RBrace ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' ? ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : Conditional ) ;
2014-10-16 12:08:54 +00:00
break ;
case ' ~ ' :
2015-02-09 13:00:12 +00:00
token = selectToken ( Token : : BitNot ) ;
2014-10-16 12:08:54 +00:00
break ;
default :
2016-02-09 21:43:23 +00:00
if ( isIdentifierStart ( m_char ) )
2016-08-16 14:31:23 +00:00
{
2023-07-12 08:05:47 +00:00
std : : tie ( token , m , n ) = scanIdentifierOrKeyword ( ) ;
2016-08-16 14:31:23 +00:00
2018-10-04 11:03:55 +00:00
// Special case for hexadecimal literals
2020-08-27 14:53:45 +00:00
if ( token = = Token : : Hex )
2016-08-16 14:31:23 +00:00
{
// reset
m = 0 ;
n = 0 ;
// Special quoted hex string must follow
if ( m_char = = ' " ' | | m_char = = ' \' ' )
token = scanHexString ( ) ;
else
2018-11-23 15:47:34 +00:00
token = setError ( ScannerError : : IllegalToken ) ;
2016-08-16 14:31:23 +00:00
}
2020-07-10 15:20:04 +00:00
// Special case for unicode string literals (not applied when scanning Yul)
else if ( token = = Token : : Unicode & & m_kind ! = ScannerKind : : Yul )
2020-07-02 16:39:04 +00:00
{
// reset
m = 0 ;
n = 0 ;
// Special quoted unicode string must follow
if ( m_char = = ' " ' | | m_char = = ' \' ' )
token = scanString ( true ) ;
else
token = setError ( ScannerError : : IllegalToken ) ;
}
2016-08-16 14:31:23 +00:00
}
2014-11-21 16:08:35 +00:00
else if ( isDecimalDigit ( m_char ) )
2014-11-05 07:40:21 +00:00
token = scanNumber ( ) ;
2014-10-16 12:08:54 +00:00
else if ( skipWhitespace ( ) )
2015-02-09 13:00:12 +00:00
token = Token : : Whitespace ;
2014-10-16 12:08:54 +00:00
else if ( isSourcePastEndOfInput ( ) )
token = Token : : EOS ;
else
2018-11-23 15:47:34 +00:00
token = selectErrorToken ( ScannerError : : IllegalToken ) ;
2014-10-16 12:08:54 +00:00
break ;
2014-10-09 10:28:37 +00:00
}
2014-10-16 12:08:54 +00:00
// Continue scanning for tokens as long as we're just skipping
// whitespace.
2014-10-09 10:28:37 +00:00
}
2015-02-09 13:00:12 +00:00
while ( token = = Token : : Whitespace ) ;
2020-06-02 13:45:03 +00:00
// Publish the finished token together with its end offset and metadata.
m_tokens [ NextNext ] . location . end = static_cast < int > ( sourcePos ( ) ) ;
2021-06-29 12:38:59 +00:00
m_tokens [ NextNext ] . location . sourceName = m_sourceName ;
2020-01-22 19:10:56 +00:00
m_tokens [ NextNext ] . token = token ;
2023-07-12 08:05:47 +00:00
m_tokens [ NextNext ] . extendedTokenInfo = std : : make_tuple ( m , n ) ;
2014-10-06 15:13:52 +00:00
}
bool Scanner : : scanEscape ( )
{
2014-10-09 10:28:37 +00:00
char c = m_char ;
2019-08-05 09:53:24 +00:00
2014-10-09 10:28:37 +00:00
// Skip escaped newlines.
2019-08-05 09:53:24 +00:00
if ( tryScanEndOfLine ( ) )
2014-10-09 10:28:37 +00:00
return true ;
2019-08-05 09:53:24 +00:00
advance ( ) ;
2014-10-16 12:08:54 +00:00
switch ( c )
{
2014-10-09 10:28:37 +00:00
case ' \' ' : // fall through
2014-10-16 21:49:45 +00:00
case ' " ' : // fall through
2014-10-16 12:08:54 +00:00
case ' \\ ' :
break ;
2014-10-16 21:49:45 +00:00
case ' n ' :
2014-10-16 12:08:54 +00:00
c = ' \n ' ;
break ;
2014-10-16 21:49:45 +00:00
case ' r ' :
2014-10-16 12:08:54 +00:00
c = ' \r ' ;
break ;
2014-10-16 21:49:45 +00:00
case ' t ' :
2014-10-16 12:08:54 +00:00
c = ' \t ' ;
break ;
2016-06-07 18:23:19 +00:00
case ' u ' :
{
2019-10-28 10:39:30 +00:00
if ( auto const codepoint = scanUnicode ( ) ; codepoint . has_value ( ) )
2019-04-18 11:17:11 +00:00
addUnicodeAsUTF8 ( * codepoint ) ;
else
2016-06-07 18:23:19 +00:00
return false ;
return true ;
}
2014-10-16 21:49:45 +00:00
case ' x ' :
2014-11-05 13:20:56 +00:00
if ( ! scanHexByte ( c ) )
2014-10-20 12:00:37 +00:00
return false ;
2014-10-09 10:28:37 +00:00
break ;
2018-09-06 09:05:35 +00:00
default :
return false ;
2014-10-09 10:28:37 +00:00
}
2014-10-16 21:49:45 +00:00
2014-10-09 10:28:37 +00:00
addLiteralChar ( c ) ;
return true ;
2014-10-06 15:13:52 +00:00
}
2018-09-06 09:05:35 +00:00
bool Scanner : : isUnicodeLinebreak ( )
{
if ( 0x0a < = m_char & & m_char < = 0x0d )
// line feed, vertical tab, form feed, carriage return
return true ;
2021-07-14 10:53:39 +00:00
if ( ! m_source . isPastEndOfInput ( 1 ) & & uint8_t ( m_source . get ( 0 ) ) = = 0xc2 & & uint8_t ( m_source . get ( 1 ) ) = = 0x85 )
2018-09-06 09:05:35 +00:00
// NEL - U+0085, C2 85 in utf8
return true ;
2021-07-14 10:53:39 +00:00
if ( ! m_source . isPastEndOfInput ( 2 ) & & uint8_t ( m_source . get ( 0 ) ) = = 0xe2 & & uint8_t ( m_source . get ( 1 ) ) = = 0x80 & & (
uint8_t ( m_source . get ( 2 ) ) = = 0xa8 | | uint8_t ( m_source . get ( 2 ) ) = = 0xa9
2018-09-06 09:05:35 +00:00
) )
// LS - U+2028, E2 80 A8 in utf8
// PS - U+2029, E2 80 A9 in utf8
return true ;
2020-06-13 00:02:32 +00:00
return false ;
2018-09-06 09:05:35 +00:00
}
2020-07-02 16:39:04 +00:00
// Scans a quoted string literal; m_char must be the opening quote, and the same character
// terminates the literal.
//
// _isUnicode selects the unicode flavour: the printable-ASCII restriction is lifted, the
// content is passed through validateBiDiMarkup(), and Token::UnicodeStringLiteral is
// returned instead of Token::StringLiteral.
//
// Any failure is reported through setError(): a bad escape sequence, a disallowed
// character, a missing closing quote (end of input or a line break came first), or a
// bidi markup error.
Token Scanner::scanString(bool const _isUnicode)
{
	size_t const literalStart = m_source.position();
	char const quote = m_char;
	advance(); // consume quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);

	// We are using a manual range and not isprint() to avoid
	// any potential complications with locale.
	auto const isPrintableAscii = [](char _c) {
		auto const value = static_cast<unsigned>(_c);
		return value > 0x1f && value < 0x7f;
	};

	while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
	{
		char const current = m_char;
		advance();

		if (current == '\\')
		{
			if (isSourcePastEndOfInput() || !scanEscape())
				return setError(ScannerError::IllegalEscapeSequence);
			continue;
		}

		// Report error on non-printable characters in string literals, however
		// allow anything for unicode string literals, because their validity will
		// be verified later (in the syntax checker).
		if (!_isUnicode && !isPrintableAscii(current))
			return setError(
				m_kind == ScannerKind::Yul ?
				ScannerError::IllegalCharacterInString :
				ScannerError::UnicodeCharacterInNonUnicodeString
			);

		addLiteralChar(current);
	}

	// The loop also stops at end of input and at line breaks; neither closes the literal.
	if (m_char != quote)
		return setError(ScannerError::IllegalStringEndQuote);

	if (_isUnicode)
	{
		ScannerError const bidiError = validateBiDiMarkup(m_source, literalStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);
	}

	literal.complete();
	advance(); // consume quote
	if (_isUnicode)
		return Token::UnicodeStringLiteral;
	return Token::StringLiteral;
}
2018-10-22 14:48:21 +00:00
// Scans the quoted part of a hex string literal; m_char must be the opening quote.
//
// The content is a sequence of hex bytes as accepted by scanHexByte(), each of which is
// added to the literal. A single '_' is allowed between two bytes, but not at the start,
// not twice in a row and not directly before the closing quote. Returns
// Token::HexStringLiteral, or the result of setError() on malformed input.
Token Scanner::scanHexString()
{
	char const delimiter = m_char;
	advance(); // consume quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);

	// True only directly after a successfully scanned byte.
	bool separatorPermitted = false;
	while (m_char != delimiter && !isSourcePastEndOfInput())
	{
		char current = m_char;
		if (scanHexByte(current))
		{
			addLiteralChar(current);
			separatorPermitted = true;
			continue;
		}

		if (current != '_')
			return setError(ScannerError::IllegalHexString);

		advance();
		if (!separatorPermitted || m_char == delimiter)
			return setError(ScannerError::IllegalNumberSeparator);
		separatorPermitted = false;
	}

	// Running out of input does not terminate the literal.
	if (m_char != delimiter)
		return setError(ScannerError::IllegalStringEndQuote);

	literal.complete();
	advance(); // consume quote
	return Token::HexStringLiteral;
}
2018-08-03 14:13:52 +00:00
// Parse for regex [:digit:]+(_[:digit:]+)*
2014-10-06 15:13:52 +00:00
void Scanner : : scanDecimalDigits ( )
{
2018-08-03 14:13:52 +00:00
// MUST begin with a decimal digit.
if ( ! isDecimalDigit ( m_char ) )
return ;
2017-10-25 08:12:07 +00:00
2018-08-03 14:13:52 +00:00
// May continue with decimal digit or underscore for grouping.
2019-04-18 11:17:11 +00:00
do
addLiteralCharAndAdvance ( ) ;
2021-07-14 10:53:39 +00:00
while ( ! m_source . isPastEndOfInput ( ) & & ( isDecimalDigit ( m_char ) | | m_char = = ' _ ' ) ) ;
2017-10-25 08:12:07 +00:00
2018-08-03 14:13:52 +00:00
// Defer further validation of underscore to SyntaxChecker.
2014-10-06 15:13:52 +00:00
}
2018-10-22 14:48:21 +00:00
// Scans a number literal: a decimal number with optional fractional part and exponent,
// or a hexadecimal number introduced by "0x".
//
// _charSeen is either '.', meaning the caller has already consumed the decimal point of a
// literal that starts with it, or 0, meaning scanning starts at the current character
// (anything else trips the assertion below).
//
// Returns Token::Number on success and the result of setError() on malformed input.
// Underscores are kept in the literal text; apart from the few checks below, their
// placement is not validated here.
Token Scanner::scanNumber(char _charSeen)
{
	enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
	LiteralScope literal(this, LITERAL_TYPE_NUMBER);
	if (_charSeen == '.')
	{
		// we have already seen a decimal point of the float
		addLiteralChar('.');
		if (m_char == '_')
			return setError(ScannerError::IllegalToken);
		scanDecimalDigits(); // we know we have at least one digit
	}
	else
	{
		solAssert(_charSeen == 0, "");
		// if the first character is '0' we must check for octals and hex
		if (m_char == '0')
		{
			addLiteralCharAndAdvance();
			// either 0, 0exxx, 0Exxx, 0.xxx or a hex number
			if (m_char == 'x')
			{
				// hex number
				kind = HEX;
				addLiteralCharAndAdvance();
				if (!isHexDigit(m_char))
					return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x'

				while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation
					addLiteralCharAndAdvance();
			}
			else if (isDecimalDigit(m_char))
				// We do not allow octal numbers
				return setError(ScannerError::OctalNotAllowed);
		}
		// Parse decimal digits and allow trailing fractional part.
		if (kind == DECIMAL)
		{
			scanDecimalDigits(); // optional
			if (m_char == '.')
			{
				if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
				{
					// Assume the input may be a floating point number with leading '_' in fraction part.
					// Recover by consuming the '.', the '_' and the digits that follow. No error is
					// raised at this point.
					// NOTE(review): presumably the misplaced underscore is rejected by a later
					// validation step - confirm.
					addLiteralCharAndAdvance(); // '.'
					addLiteralCharAndAdvance(); // '_'
					scanDecimalDigits();
				}
				// The check below looks one character ahead, so the bounds check has to cover
				// that position as well (same pattern as the other get(1) lookaheads here).
				if (m_source.isPastEndOfInput(1) || !isDecimalDigit(m_source.get(1)))
				{
					// A '.' has to be followed by a number. Otherwise the literal ends here
					// and the current character is not consumed.
					literal.complete();
					return Token::Number;
				}
				addLiteralCharAndAdvance();
				scanDecimalDigits();
			}
		}
	}
	// scan exponent, if any
	if (m_char == 'e' || m_char == 'E')
	{
		// The hex loop above consumes 'e' and 'E' as hex digits, so they cannot show up here.
		solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
		if (kind != DECIMAL)
			return setError(ScannerError::IllegalExponent);
		else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
		{
			// Recover from wrongly placed underscore as delimiter in literal with scientific
			// notation by consuming until the end.
			addLiteralCharAndAdvance(); // 'e'
			addLiteralCharAndAdvance(); // '_'
			scanDecimalDigits();
			literal.complete();
			return Token::Number;
		}
		// scan exponent
		addLiteralCharAndAdvance(); // 'e' | 'E'
		if (m_char == '+' || m_char == '-')
			addLiteralCharAndAdvance();
		if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E'
			return setError(ScannerError::IllegalExponent);
		scanDecimalDigits();
	}
	// The source character immediately following a numeric literal must
	// not be an identifier start or a decimal digit; see ECMA-262
	// section 7.8.3, page 17 (note that we read only one decimal digit
	// if the value is 0).
	if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
		return setError(ScannerError::IllegalNumberEnd);
	literal.complete();
	return Token::Number;
}
2023-07-12 08:05:47 +00:00
std : : tuple < Token , unsigned , unsigned > Scanner : : scanIdentifierOrKeyword ( )
2014-10-06 15:13:52 +00:00
{
2016-02-09 21:43:23 +00:00
solAssert ( isIdentifierStart ( m_char ) , " " ) ;
2014-11-27 17:57:50 +00:00
LiteralScope literal ( this , LITERAL_TYPE_STRING ) ;
2014-10-09 10:28:37 +00:00
addLiteralCharAndAdvance ( ) ;
// Scan the rest of the identifier characters.
2020-07-10 15:05:52 +00:00
while ( isIdentifierPart ( m_char ) | | ( m_char = = ' . ' & & m_kind = = ScannerKind : : Yul ) )
2014-10-09 10:28:37 +00:00
addLiteralCharAndAdvance ( ) ;
2014-11-21 16:08:35 +00:00
literal . complete ( ) ;
2020-07-10 15:20:04 +00:00
auto const token = TokenTraits : : fromIdentifierOrKeyword ( m_tokens [ NextNext ] . literal ) ;
if ( m_kind = = ScannerKind : : Yul )
{
2020-07-27 18:11:38 +00:00
// Turn Solidity identifier into a Yul keyword
if ( m_tokens [ NextNext ] . literal = = " leave " )
return std : : make_tuple ( Token : : Leave , 0 , 0 ) ;
2020-07-10 15:20:04 +00:00
// Turn non-Yul keywords into identifiers.
if ( ! TokenTraits : : isYulKeyword ( std : : get < 0 > ( token ) ) )
return std : : make_tuple ( Token : : Identifier , 0 , 0 ) ;
}
return token ;
2014-10-06 15:13:52 +00:00
}
2019-12-11 16:31:36 +00:00
} // namespace solidity::langutil