/* * This file is part of solidity. * * solidity is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * solidity is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with solidity. If not, see . * * This file is derived from the file "scanner.cc", which was part of the * V8 project. The original copyright header follows: * * Copyright 2006-2012, the V8 project authors. All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * @author Christian * @date 2014 * Solidity scanner. */ #include #include #include #include #include #include #include using namespace std; namespace solidity::langutil { string to_string(ScannerError _errorCode) { switch (_errorCode) { case ScannerError::NoError: return "No error."; case ScannerError::IllegalToken: return "Invalid token."; case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles."; case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid."; case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator."; case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence."; case ScannerError::IllegalCharacterInString: return "Invalid character in string."; case ScannerError::IllegalStringEndQuote: return "Expected string end-quote."; case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'."; case ScannerError::IllegalExponent: return "Invalid exponent."; case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number."; case ScannerError::OctalNotAllowed: return "Octal numbers not allowed."; default: solAssert(false, "Unhandled case in to_string(ScannerError)"); return ""; } } ostream& operator<<(ostream& os, ScannerError _errorCode) { return os << to_string(_errorCode); } /// Scoped helper for literal recording. Automatically drops the literal /// if aborting the scanning before it's complete. enum LiteralType { LITERAL_TYPE_STRING, LITERAL_TYPE_NUMBER, // not really different from string type in behaviour LITERAL_TYPE_COMMENT }; class LiteralScope { public: explicit LiteralScope(Scanner* _self, enum LiteralType _type): m_type(_type), m_scanner(_self), m_complete(false) { if (_type == LITERAL_TYPE_COMMENT) m_scanner->m_skippedComments[Scanner::NextNext].literal.clear(); else m_scanner->m_tokens[Scanner::NextNext].literal.clear(); } ~LiteralScope() { if (!m_complete) { if (m_type == LITERAL_TYPE_COMMENT) m_scanner->m_skippedComments[Scanner::NextNext].literal.clear(); else m_scanner->m_tokens[Scanner::NextNext].literal.clear(); } } void complete() { m_complete = true; } private: enum LiteralType m_type; Scanner* m_scanner; bool m_complete; }; void Scanner::reset(CharStream _source) { m_source = make_shared(std::move(_source)); reset(); } void Scanner::reset(shared_ptr _source) { solAssert(_source.get() != nullptr, "You MUST provide a CharStream when resetting."); m_source = std::move(_source); reset(); } void Scanner::reset() { m_source->reset(); m_kind = ScannerKind::Solidity; m_char = m_source->get(); skipWhitespace(); next(); next(); next(); } void Scanner::setPosition(size_t _offset) { m_char = m_source->setPosition(_offset); scanToken(); next(); next(); } bool Scanner::scanHexByte(char& o_scannedByte) { char x = 0; for (size_t i = 0; i < 2; i++) { int d = hexValue(m_char); if (d < 0) { rollback(i); return false; } x = static_cast(x * 16 + d); advance(); } o_scannedByte = x; return true; } std::optional Scanner::scanUnicode() { unsigned x = 0; for (size_t i = 0; i < 4; i++) { int d = hexValue(m_char); if (d < 0) { rollback(i); return {}; } x = x * 16 + static_cast(d); advance(); } return x; } // This supports codepoints between 0000 and FFFF. void Scanner::addUnicodeAsUTF8(unsigned codepoint) { if (codepoint <= 0x7f) addLiteralChar(char(codepoint)); else if (codepoint <= 0x7ff) { addLiteralChar(char(0xc0u | (codepoint >> 6u))); addLiteralChar(char(0x80u | (codepoint & 0x3fu))); } else { addLiteralChar(char(0xe0u | (codepoint >> 12u))); addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu))); addLiteralChar(char(0x80u | (codepoint & 0x3fu))); } } void Scanner::rescan() { size_t rollbackTo = 0; if (m_skippedComments[Current].literal.empty()) rollbackTo = static_cast(m_tokens[Current].location.start); else rollbackTo = static_cast(m_skippedComments[Current].location.start); m_char = m_source->rollback(m_source->position() - rollbackTo); next(); next(); next(); } // Ensure that tokens can be stored in a byte. BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100); Token Scanner::next() { m_tokens[Current] = std::move(m_tokens[Next]); m_tokens[Next] = std::move(m_tokens[NextNext]); m_skippedComments[Current] = std::move(m_skippedComments[Next]); m_skippedComments[Next] = std::move(m_skippedComments[NextNext]); scanToken(); return m_tokens[Current].token; } Token Scanner::selectToken(char _next, Token _then, Token _else) { advance(); if (m_char == _next) return selectToken(_then); else return _else; } bool Scanner::skipWhitespace() { size_t const startPosition = sourcePos(); while (isWhiteSpace(m_char)) advance(); // Return whether or not we skipped any characters. return sourcePos() != startPosition; } bool Scanner::skipWhitespaceExceptUnicodeLinebreak() { size_t const startPosition = sourcePos(); while (isWhiteSpace(m_char) && !isUnicodeLinebreak()) advance(); // Return whether or not we skipped any characters. return sourcePos() != startPosition; } Token Scanner::skipSingleLineComment() { // Line terminator is not part of the comment. If it is a // non-ascii line terminator, it will result in a parser error. while (!isUnicodeLinebreak()) if (!advance()) break; return Token::Whitespace; } bool Scanner::atEndOfLine() const { return m_char == '\n' || m_char == '\r'; } bool Scanner::tryScanEndOfLine() { if (m_char == '\n') { advance(); return true; } if (m_char == '\r') { if (advance() && m_char == '\n') advance(); return true; } return false; } size_t Scanner::scanSingleLineDocComment() { LiteralScope literal(this, LITERAL_TYPE_COMMENT); size_t endPosition = m_source->position(); skipWhitespaceExceptUnicodeLinebreak(); while (!isSourcePastEndOfInput()) { endPosition = m_source->position(); if (tryScanEndOfLine()) { // Check if next line is also a single-line comment. // If any whitespaces were skipped, use source position before. if (!skipWhitespaceExceptUnicodeLinebreak()) endPosition = m_source->position(); if (!m_source->isPastEndOfInput(3) && m_source->get(0) == '/' && m_source->get(1) == '/' && m_source->get(2) == '/') { if (!m_source->isPastEndOfInput(4) && m_source->get(3) == '/') break; // "////" is not a documentation comment m_char = m_source->advanceAndGet(3); if (atEndOfLine()) continue; addCommentLiteralChar('\n'); } else break; // next line is not a documentation comment, we are done } else if (isUnicodeLinebreak()) // Any line terminator that is not '\n' is considered to end the // comment. break; addCommentLiteralChar(m_char); advance(); } literal.complete(); return endPosition; } Token Scanner::skipMultiLineComment() { while (!isSourcePastEndOfInput()) { char ch = m_char; advance(); // If we have reached the end of the multi-line comment, we // consume the '/' and insert a whitespace. This way all // multi-line comments are treated as whitespace. if (ch == '*' && m_char == '/') { m_char = ' '; return Token::Whitespace; } } // Unterminated multi-line comment. return setError(ScannerError::IllegalCommentTerminator); } Token Scanner::scanMultiLineDocComment() { LiteralScope literal(this, LITERAL_TYPE_COMMENT); bool endFound = false; bool charsAdded = false; while (isWhiteSpace(m_char) && !atEndOfLine()) advance(); while (!isSourcePastEndOfInput()) { //handle newlines in multline comments if (atEndOfLine()) { skipWhitespace(); if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '*') { // it is unknown if this leads to the end of the comment addCommentLiteralChar('*'); advance(); } else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) != '/') { // skip first '*' in subsequent lines m_char = m_source->advanceAndGet(1); if (atEndOfLine()) // ignores empty lines continue; if (charsAdded) addCommentLiteralChar('\n'); // corresponds to the end of previous line } else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { // if after newline the comment ends, don't insert the newline m_char = m_source->advanceAndGet(2); endFound = true; break; } else if (charsAdded) addCommentLiteralChar('\n'); } if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { m_char = m_source->advanceAndGet(2); endFound = true; break; } addCommentLiteralChar(m_char); charsAdded = true; advance(); } literal.complete(); if (!endFound) return setError(ScannerError::IllegalCommentTerminator); else return Token::CommentLiteral; } Token Scanner::scanSlash() { int firstSlashPosition = static_cast(sourcePos()); advance(); if (m_char == '/') { if (!advance()) /* double slash comment directly before EOS */ return Token::Whitespace; else if (m_char == '/') { advance(); //consume the last '/' at /// // "////" if (m_char == '/') return skipSingleLineComment(); // doxygen style /// comment m_skippedComments[NextNext].location.start = firstSlashPosition; m_skippedComments[NextNext].location.source = m_source; m_skippedComments[NextNext].token = Token::CommentLiteral; m_skippedComments[NextNext].location.end = static_cast(scanSingleLineDocComment()); return Token::Whitespace; } else return skipSingleLineComment(); } else if (m_char == '*') { // doxygen style /** natspec comment if (!advance()) /* slash star comment before EOS */ return setError(ScannerError::IllegalCommentTerminator); else if (m_char == '*') { advance(); //consume the last '*' at /** // "/**/" if (m_char == '/') { advance(); //skip the closing slash return Token::Whitespace; } // "/***" if (m_char == '*') // "/***/" may be interpreted as empty natspec or skipped; skipping is simpler return skipMultiLineComment(); // we actually have a multiline documentation comment m_skippedComments[NextNext].location.start = firstSlashPosition; m_skippedComments[NextNext].location.source = m_source; Token comment = scanMultiLineDocComment(); m_skippedComments[NextNext].location.end = static_cast(sourcePos()); m_skippedComments[NextNext].token = comment; if (comment == Token::Illegal) return Token::Illegal; // error already set else return Token::Whitespace; } else return skipMultiLineComment(); } else if (m_char == '=') return selectToken(Token::AssignDiv); else return Token::Div; } void Scanner::scanToken() { m_tokens[NextNext] = {}; m_skippedComments[NextNext] = {}; Token token; // M and N are for the purposes of grabbing different type sizes unsigned m; unsigned n; do { // Remember the position of the next token m_tokens[NextNext].location.start = static_cast(sourcePos()); switch (m_char) { case '"': case '\'': token = scanString(false); break; case '<': // < <= << <<= advance(); if (m_char == '=') token = selectToken(Token::LessThanOrEqual); else if (m_char == '<') token = selectToken('=', Token::AssignShl, Token::SHL); else token = Token::LessThan; break; case '>': // > >= >> >>= >>> >>>= advance(); if (m_char == '=') token = selectToken(Token::GreaterThanOrEqual); else if (m_char == '>') { // >> >>= >>> >>>= advance(); if (m_char == '=') token = selectToken(Token::AssignSar); else if (m_char == '>') token = selectToken('=', Token::AssignShr, Token::SHR); else token = Token::SAR; } else token = Token::GreaterThan; break; case '=': // = == => advance(); if (m_char == '=') token = selectToken(Token::Equal); else if (m_char == '>') token = selectToken(Token::DoubleArrow); else token = Token::Assign; break; case '!': // ! != advance(); if (m_char == '=') token = selectToken(Token::NotEqual); else token = Token::Not; break; case '+': // + ++ += advance(); if (m_char == '+') token = selectToken(Token::Inc); else if (m_char == '=') token = selectToken(Token::AssignAdd); else token = Token::Add; break; case '-': // - -- -= -> advance(); if (m_char == '-') token = selectToken(Token::Dec); else if (m_char == '=') token = selectToken(Token::AssignSub); else if (m_char == '>') token = selectToken(Token::RightArrow); else token = Token::Sub; break; case '*': // * ** *= advance(); if (m_char == '*') token = selectToken(Token::Exp); else if (m_char == '=') token = selectToken(Token::AssignMul); else token = Token::Mul; break; case '%': // % %= token = selectToken('=', Token::AssignMod, Token::Mod); break; case '/': // / // /* /= token = scanSlash(); break; case '&': // & && &= advance(); if (m_char == '&') token = selectToken(Token::And); else if (m_char == '=') token = selectToken(Token::AssignBitAnd); else token = Token::BitAnd; break; case '|': // | || |= advance(); if (m_char == '|') token = selectToken(Token::Or); else if (m_char == '=') token = selectToken(Token::AssignBitOr); else token = Token::BitOr; break; case '^': // ^ ^= token = selectToken('=', Token::AssignBitXor, Token::BitXor); break; case '.': // . Number advance(); if (isDecimalDigit(m_char)) token = scanNumber('.'); else token = Token::Period; break; case ':': // : := advance(); if (m_char == '=') token = selectToken(Token::AssemblyAssign); else token = Token::Colon; break; case ';': token = selectToken(Token::Semicolon); break; case ',': token = selectToken(Token::Comma); break; case '(': token = selectToken(Token::LParen); break; case ')': token = selectToken(Token::RParen); break; case '[': token = selectToken(Token::LBrack); break; case ']': token = selectToken(Token::RBrack); break; case '{': token = selectToken(Token::LBrace); break; case '}': token = selectToken(Token::RBrace); break; case '?': token = selectToken(Token::Conditional); break; case '~': token = selectToken(Token::BitNot); break; default: if (isIdentifierStart(m_char)) { tie(token, m, n) = scanIdentifierOrKeyword(); // Special case for hexadecimal literals if (token == Token::Hex) { // reset m = 0; n = 0; // Special quoted hex string must follow if (m_char == '"' || m_char == '\'') token = scanHexString(); else token = setError(ScannerError::IllegalToken); } else if (token == Token::Unicode && m_kind != ScannerKind::Yul) { // reset m = 0; n = 0; // Special quoted hex string must follow if (m_char == '"' || m_char == '\'') token = scanString(true); else token = setError(ScannerError::IllegalToken); } } else if (isDecimalDigit(m_char)) token = scanNumber(); else if (skipWhitespace()) token = Token::Whitespace; else if (isSourcePastEndOfInput()) token = Token::EOS; else token = selectErrorToken(ScannerError::IllegalToken); break; } // Continue scanning for tokens as long as we're just skipping // whitespace. } while (token == Token::Whitespace); m_tokens[NextNext].location.end = static_cast(sourcePos()); m_tokens[NextNext].location.source = m_source; m_tokens[NextNext].token = token; m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n); } bool Scanner::scanEscape() { char c = m_char; // Skip escaped newlines. if (tryScanEndOfLine()) return true; advance(); switch (c) { case '\'': // fall through case '"': // fall through case '\\': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'u': { if (auto const codepoint = scanUnicode(); codepoint.has_value()) addUnicodeAsUTF8(*codepoint); else return false; return true; } case 'x': if (!scanHexByte(c)) return false; break; default: return false; } addLiteralChar(c); return true; } bool Scanner::isUnicodeLinebreak() { if (0x0a <= m_char && m_char <= 0x0d) // line feed, vertical tab, form feed, carriage return return true; if (!m_source->isPastEndOfInput(1) && uint8_t(m_source->get(0)) == 0xc2 && uint8_t(m_source->get(1)) == 0x85) // NEL - U+0085, C2 85 in utf8 return true; if (!m_source->isPastEndOfInput(2) && uint8_t(m_source->get(0)) == 0xe2 && uint8_t(m_source->get(1)) == 0x80 && ( uint8_t(m_source->get(2)) == 0xa8 || uint8_t(m_source->get(2)) == 0xa9 )) // LS - U+2028, E2 80 A8 in utf8 // PS - U+2029, E2 80 A9 in utf8 return true; return false; } Token Scanner::scanString(bool const _isUnicode) { char const quote = m_char; advance(); // consume quote LiteralScope literal(this, LITERAL_TYPE_STRING); while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak()) { char c = m_char; advance(); if (c == '\\') { if (isSourcePastEndOfInput() || !scanEscape()) return setError(ScannerError::IllegalEscapeSequence); } else { // Report error on non-printable characters in string literals, however // allow anything for unicode string literals, because their validity will // be verified later (in the syntax checker). // // We are using a manual range and not isprint() to avoid // any potential complications with locale. if (!_isUnicode && (static_cast(c) <= 0x1f || static_cast(c) >= 0x7f)) return setError(ScannerError::IllegalCharacterInString); addLiteralChar(c); } } if (m_char != quote) return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral; } Token Scanner::scanHexString() { char const quote = m_char; advance(); // consume quote LiteralScope literal(this, LITERAL_TYPE_STRING); bool allowUnderscore = false; while (m_char != quote && !isSourcePastEndOfInput()) { char c = m_char; if (scanHexByte(c)) { addLiteralChar(c); allowUnderscore = true; } else if (c == '_') { advance(); if (!allowUnderscore || m_char == quote) return setError(ScannerError::IllegalNumberSeparator); allowUnderscore = false; } else return setError(ScannerError::IllegalHexString); } if (m_char != quote) return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote return Token::HexStringLiteral; } // Parse for regex [:digit:]+(_[:digit:]+)* void Scanner::scanDecimalDigits() { // MUST begin with a decimal digit. if (!isDecimalDigit(m_char)) return; // May continue with decimal digit or underscore for grouping. do addLiteralCharAndAdvance(); while (!m_source->isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); // Defer further validation of underscore to SyntaxChecker. } Token Scanner::scanNumber(char _charSeen) { enum { DECIMAL, HEX, BINARY } kind = DECIMAL; LiteralScope literal(this, LITERAL_TYPE_NUMBER); if (_charSeen == '.') { // we have already seen a decimal point of the float addLiteralChar('.'); if (m_char == '_') return setError(ScannerError::IllegalToken); scanDecimalDigits(); // we know we have at least one digit } else { solAssert(_charSeen == 0, ""); // if the first character is '0' we must check for octals and hex if (m_char == '0') { addLiteralCharAndAdvance(); // either 0, 0exxx, 0Exxx, 0.xxx or a hex number if (m_char == 'x') { // hex number kind = HEX; addLiteralCharAndAdvance(); if (!isHexDigit(m_char)) return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x' while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation addLiteralCharAndAdvance(); } else if (isDecimalDigit(m_char)) // We do not allow octal numbers return setError(ScannerError::OctalNotAllowed); } // Parse decimal digits and allow trailing fractional part. if (kind == DECIMAL) { scanDecimalDigits(); // optional if (m_char == '.') { if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Assume the input may be a floating point number with leading '_' in fraction part. // Recover by consuming it all but returning `Illegal` right away. addLiteralCharAndAdvance(); // '.' addLiteralCharAndAdvance(); // '_' scanDecimalDigits(); } if (m_source->isPastEndOfInput() || !isDecimalDigit(m_source->get(1))) { // A '.' has to be followed by a number. literal.complete(); return Token::Number; } addLiteralCharAndAdvance(); scanDecimalDigits(); } } } // scan exponent, if any if (m_char == 'e' || m_char == 'E') { solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); if (kind != DECIMAL) return setError(ScannerError::IllegalExponent); else if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Recover from wrongly placed underscore as delimiter in literal with scientific // notation by consuming until the end. addLiteralCharAndAdvance(); // 'e' addLiteralCharAndAdvance(); // '_' scanDecimalDigits(); literal.complete(); return Token::Number; } // scan exponent addLiteralCharAndAdvance(); // 'e' | 'E' if (m_char == '+' || m_char == '-') addLiteralCharAndAdvance(); if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E' return setError(ScannerError::IllegalExponent); scanDecimalDigits(); } // The source character immediately following a numeric literal must // not be an identifier start or a decimal digit; see ECMA-262 // section 7.8.3, page 17 (note that we read only one decimal digit // if the value is 0). if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) return setError(ScannerError::IllegalNumberEnd); literal.complete(); return Token::Number; } tuple Scanner::scanIdentifierOrKeyword() { solAssert(isIdentifierStart(m_char), ""); LiteralScope literal(this, LITERAL_TYPE_STRING); addLiteralCharAndAdvance(); // Scan the rest of the identifier characters. while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul)) addLiteralCharAndAdvance(); literal.complete(); auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal); if (m_kind == ScannerKind::Yul) { // Turn Solidity identifier into a Yul keyword if (m_tokens[NextNext].literal == "leave") return std::make_tuple(Token::Leave, 0, 0); // Turn non-Yul keywords into identifiers. if (!TokenTraits::isYulKeyword(std::get<0>(token))) return std::make_tuple(Token::Identifier, 0, 0); } return token; } } // namespace solidity::langutil