/* * This file is part of solidity. * * solidity is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * solidity is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with solidity. If not, see . * * This file is derived from the file "scanner.cc", which was part of the * V8 project. The original copyright header follows: * * Copyright 2006-2012, the V8 project authors. All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * @author Christian * @date 2014 * Solidity scanner. */ #include #include #include #include #include #include using namespace std; namespace langutil { std::string to_string(ScannerError _errorCode) { switch (_errorCode) { case ScannerError::NoError: return "No error."; case ScannerError::IllegalToken: return "Invalid token."; case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles within double-quotes."; case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid."; case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator."; case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence."; case ScannerError::IllegalStringEndQuote: return "Expected string end-quote."; case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'."; case ScannerError::IllegalExponent: return "Invalid exponent."; case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number."; case ScannerError::OctalNotAllowed: return "Octal numbers not allowed."; default: solAssert(false, "Unhandled case in to_string(ScannerError)"); return ""; } } std::ostream& operator<<(std::ostream& os, ScannerError _errorCode) { os << to_string(_errorCode); return os; } /// Scoped helper for literal recording. Automatically drops the literal /// if aborting the scanning before it's complete. enum LiteralType { LITERAL_TYPE_STRING, LITERAL_TYPE_NUMBER, // not really different from string type in behaviour LITERAL_TYPE_COMMENT }; class LiteralScope { public: explicit LiteralScope(Scanner* _self, enum LiteralType _type): m_type(_type) , m_scanner(_self) , m_complete(false) { if (_type == LITERAL_TYPE_COMMENT) m_scanner->m_nextSkippedComment.literal.clear(); else m_scanner->m_nextToken.literal.clear(); } ~LiteralScope() { if (!m_complete) { if (m_type == LITERAL_TYPE_COMMENT) m_scanner->m_nextSkippedComment.literal.clear(); else m_scanner->m_nextToken.literal.clear(); } } void complete() { m_complete = true; } private: enum LiteralType m_type; Scanner* m_scanner; bool m_complete; }; // end of LiteralScope class void Scanner::reset(CharStream _source) { m_source = make_shared(std::move(_source)); reset(); } void Scanner::reset(std::shared_ptr _source) { solAssert(_source.get() != nullptr, "You MUST provide a CharStream when resetting."); m_source = _source; reset(); } void Scanner::reset() { m_source->reset(); m_char = m_source->get(); skipWhitespace(); scanToken(); next(); } bool Scanner::scanHexByte(char& o_scannedByte) { char x = 0; for (int i = 0; i < 2; i++) { int d = hexValue(m_char); if (d < 0) { rollback(i); return false; } x = x * 16 + d; advance(); } o_scannedByte = x; return true; } bool Scanner::scanUnicode(unsigned & o_codepoint) { unsigned x = 0; for (int i = 0; i < 4; i++) { int d = hexValue(m_char); if (d < 0) { rollback(i); return false; } x = x * 16 + d; advance(); } o_codepoint = x; return true; } // This supports codepoints between 0000 and FFFF. void Scanner::addUnicodeAsUTF8(unsigned codepoint) { if (codepoint <= 0x7f) addLiteralChar(codepoint); else if (codepoint <= 0x7ff) { addLiteralChar(0xc0 | (codepoint >> 6)); addLiteralChar(0x80 | (codepoint & 0x3f)); } else { addLiteralChar(0xe0 | (codepoint >> 12)); addLiteralChar(0x80 | ((codepoint >> 6) & 0x3f)); addLiteralChar(0x80 | (codepoint & 0x3f)); } } // Ensure that tokens can be stored in a byte. BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100); Token Scanner::next() { m_currentToken = m_nextToken; m_skippedComment = m_nextSkippedComment; scanToken(); return m_currentToken.token; } Token Scanner::selectToken(char _next, Token _then, Token _else) { advance(); if (m_char == _next) return selectToken(_then); else return _else; } bool Scanner::skipWhitespace() { int const startPosition = sourcePos(); while (isWhiteSpace(m_char)) advance(); // Return whether or not we skipped any characters. return sourcePos() != startPosition; } void Scanner::skipWhitespaceExceptUnicodeLinebreak() { while (isWhiteSpace(m_char) && !isUnicodeLinebreak()) advance(); } Token Scanner::skipSingleLineComment() { // Line terminator is not part of the comment. If it is a // non-ascii line terminator, it will result in a parser error. while (!isUnicodeLinebreak()) if (!advance()) break; return Token::Whitespace; } Token Scanner::scanSingleLineDocComment() { LiteralScope literal(this, LITERAL_TYPE_COMMENT); advance(); //consume the last '/' at /// skipWhitespaceExceptUnicodeLinebreak(); while (!isSourcePastEndOfInput()) { if (isLineTerminator(m_char)) { // check if next line is also a documentation comment skipWhitespace(); if (!m_source->isPastEndOfInput(3) && m_source->get(0) == '/' && m_source->get(1) == '/' && m_source->get(2) == '/') { addCommentLiteralChar('\n'); m_char = m_source->advanceAndGet(3); } else break; // next line is not a documentation comment, we are done } else if (isUnicodeLinebreak()) // Any line terminator that is not '\n' is considered to end the // comment. break; addCommentLiteralChar(m_char); advance(); } literal.complete(); return Token::CommentLiteral; } Token Scanner::skipMultiLineComment() { advance(); while (!isSourcePastEndOfInput()) { char ch = m_char; advance(); // If we have reached the end of the multi-line comment, we // consume the '/' and insert a whitespace. This way all // multi-line comments are treated as whitespace. if (ch == '*' && m_char == '/') { m_char = ' '; return Token::Whitespace; } } // Unterminated multi-line comment. return setError(ScannerError::IllegalCommentTerminator); } Token Scanner::scanMultiLineDocComment() { LiteralScope literal(this, LITERAL_TYPE_COMMENT); bool endFound = false; bool charsAdded = false; while (isWhiteSpace(m_char) && !isLineTerminator(m_char)) advance(); while (!isSourcePastEndOfInput()) { //handle newlines in multline comments if (isLineTerminator(m_char)) { skipWhitespace(); if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '*') { // it is unknown if this leads to the end of the comment addCommentLiteralChar('*'); advance(); } else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) != '/') { // skip first '*' in subsequent lines if (charsAdded) addCommentLiteralChar('\n'); m_char = m_source->advanceAndGet(2); } else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { // if after newline the comment ends, don't insert the newline m_char = m_source->advanceAndGet(2); endFound = true; break; } else if (charsAdded) addCommentLiteralChar('\n'); } if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { m_char = m_source->advanceAndGet(2); endFound = true; break; } addCommentLiteralChar(m_char); charsAdded = true; advance(); } literal.complete(); if (!endFound) return setError(ScannerError::IllegalCommentTerminator); else return Token::CommentLiteral; } Token Scanner::scanSlash() { int firstSlashPosition = sourcePos(); advance(); if (m_char == '/') { if (!advance()) /* double slash comment directly before EOS */ return Token::Whitespace; else if (m_char == '/') { // doxygen style /// comment Token comment; m_nextSkippedComment.location.start = firstSlashPosition; comment = scanSingleLineDocComment(); m_nextSkippedComment.location.end = sourcePos(); m_nextSkippedComment.token = comment; return Token::Whitespace; } else return skipSingleLineComment(); } else if (m_char == '*') { // doxygen style /** natspec comment if (!advance()) /* slash star comment before EOS */ return setError(ScannerError::IllegalCommentTerminator); else if (m_char == '*') { advance(); //consume the last '*' at /** // "/**/" if (m_char == '/') { advance(); //skip the closing slash return Token::Whitespace; } // we actually have a multiline documentation comment Token comment; m_nextSkippedComment.location.start = firstSlashPosition; comment = scanMultiLineDocComment(); m_nextSkippedComment.location.end = sourcePos(); m_nextSkippedComment.token = comment; if (comment == Token::Illegal) return Token::Illegal; // error already set else return Token::Whitespace; } else return skipMultiLineComment(); } else if (m_char == '=') return selectToken(Token::AssignDiv); else return Token::Div; } void Scanner::scanToken() { m_nextToken.error = ScannerError::NoError; m_nextToken.literal.clear(); m_nextToken.extendedTokenInfo = make_tuple(0, 0); m_nextSkippedComment.literal.clear(); m_nextSkippedComment.extendedTokenInfo = make_tuple(0, 0); Token token; // M and N are for the purposes of grabbing different type sizes unsigned m; unsigned n; do { // Remember the position of the next token m_nextToken.location.start = sourcePos(); switch (m_char) { case '"': case '\'': token = scanString(); break; case '<': // < <= << <<= advance(); if (m_char == '=') token = selectToken(Token::LessThanOrEqual); else if (m_char == '<') token = selectToken('=', Token::AssignShl, Token::SHL); else token = Token::LessThan; break; case '>': // > >= >> >>= >>> >>>= advance(); if (m_char == '=') token = selectToken(Token::GreaterThanOrEqual); else if (m_char == '>') { // >> >>= >>> >>>= advance(); if (m_char == '=') token = selectToken(Token::AssignSar); else if (m_char == '>') token = selectToken('=', Token::AssignShr, Token::SHR); else token = Token::SAR; } else token = Token::GreaterThan; break; case '=': // = == => advance(); if (m_char == '=') token = selectToken(Token::Equal); else if (m_char == '>') token = selectToken(Token::Arrow); else token = Token::Assign; break; case '!': // ! != advance(); if (m_char == '=') token = selectToken(Token::NotEqual); else token = Token::Not; break; case '+': // + ++ += advance(); if (m_char == '+') token = selectToken(Token::Inc); else if (m_char == '=') token = selectToken(Token::AssignAdd); else token = Token::Add; break; case '-': // - -- -= advance(); if (m_char == '-') token = selectToken(Token::Dec); else if (m_char == '=') token = selectToken(Token::AssignSub); else token = Token::Sub; break; case '*': // * ** *= advance(); if (m_char == '*') token = selectToken(Token::Exp); else if (m_char == '=') token = selectToken(Token::AssignMul); else token = Token::Mul; break; case '%': // % %= token = selectToken('=', Token::AssignMod, Token::Mod); break; case '/': // / // /* /= token = scanSlash(); break; case '&': // & && &= advance(); if (m_char == '&') token = selectToken(Token::And); else if (m_char == '=') token = selectToken(Token::AssignBitAnd); else token = Token::BitAnd; break; case '|': // | || |= advance(); if (m_char == '|') token = selectToken(Token::Or); else if (m_char == '=') token = selectToken(Token::AssignBitOr); else token = Token::BitOr; break; case '^': // ^ ^= token = selectToken('=', Token::AssignBitXor, Token::BitXor); break; case '.': // . Number advance(); if (isDecimalDigit(m_char)) token = scanNumber('.'); else token = Token::Period; break; case ':': token = selectToken(Token::Colon); break; case ';': token = selectToken(Token::Semicolon); break; case ',': token = selectToken(Token::Comma); break; case '(': token = selectToken(Token::LParen); break; case ')': token = selectToken(Token::RParen); break; case '[': token = selectToken(Token::LBrack); break; case ']': token = selectToken(Token::RBrack); break; case '{': token = selectToken(Token::LBrace); break; case '}': token = selectToken(Token::RBrace); break; case '?': token = selectToken(Token::Conditional); break; case '~': token = selectToken(Token::BitNot); break; default: if (isIdentifierStart(m_char)) { tie(token, m, n) = scanIdentifierOrKeyword(); // Special case for hexadecimal literals if (token == Token::Hex) { // reset m = 0; n = 0; // Special quoted hex string must follow if (m_char == '"' || m_char == '\'') token = scanHexString(); else token = setError(ScannerError::IllegalToken); } } else if (isDecimalDigit(m_char)) token = scanNumber(); else if (skipWhitespace()) token = Token::Whitespace; else if (isSourcePastEndOfInput()) token = Token::EOS; else token = selectErrorToken(ScannerError::IllegalToken); break; } // Continue scanning for tokens as long as we're just skipping // whitespace. } while (token == Token::Whitespace); m_nextToken.location.end = sourcePos(); m_nextToken.token = token; m_nextToken.extendedTokenInfo = make_tuple(m, n); } bool Scanner::scanEscape() { char c = m_char; advance(); // Skip escaped newlines. if (isLineTerminator(c)) return true; switch (c) { case '\'': // fall through case '"': // fall through case '\\': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'u': { unsigned codepoint; if (!scanUnicode(codepoint)) return false; addUnicodeAsUTF8(codepoint); return true; } case 'x': if (!scanHexByte(c)) return false; break; default: return false; } addLiteralChar(c); return true; } bool Scanner::isUnicodeLinebreak() { if (0x0a <= m_char && m_char <= 0x0d) // line feed, vertical tab, form feed, carriage return return true; else if (!m_source->isPastEndOfInput(1) && uint8_t(m_source->get(0)) == 0xc2 && uint8_t(m_source->get(1)) == 0x85) // NEL - U+0085, C2 85 in utf8 return true; else if (!m_source->isPastEndOfInput(2) && uint8_t(m_source->get(0)) == 0xe2 && uint8_t(m_source->get(1)) == 0x80 && ( uint8_t(m_source->get(2)) == 0xa8 || uint8_t(m_source->get(2)) == 0xa9 )) // LS - U+2028, E2 80 A8 in utf8 // PS - U+2029, E2 80 A9 in utf8 return true; else return false; } Token Scanner::scanString() { char const quote = m_char; advance(); // consume quote LiteralScope literal(this, LITERAL_TYPE_STRING); while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak()) { char c = m_char; advance(); if (c == '\\') { if (isSourcePastEndOfInput() || !scanEscape()) return setError(ScannerError::IllegalEscapeSequence); } else addLiteralChar(c); } if (m_char != quote) return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote return Token::StringLiteral; } Token Scanner::scanHexString() { char const quote = m_char; advance(); // consume quote LiteralScope literal(this, LITERAL_TYPE_STRING); while (m_char != quote && !isSourcePastEndOfInput()) { char c = m_char; if (!scanHexByte(c)) // can only return false if hex-byte is incomplete (only one hex digit instead of two) return setError(ScannerError::IllegalHexString); addLiteralChar(c); } if (m_char != quote) return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote return Token::StringLiteral; } // Parse for regex [:digit:]+(_[:digit:]+)* void Scanner::scanDecimalDigits() { // MUST begin with a decimal digit. if (!isDecimalDigit(m_char)) return; // May continue with decimal digit or underscore for grouping. do addLiteralCharAndAdvance(); while (!m_source->isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); // Defer further validation of underscore to SyntaxChecker. } Token Scanner::scanNumber(char _charSeen) { enum { DECIMAL, HEX, BINARY } kind = DECIMAL; LiteralScope literal(this, LITERAL_TYPE_NUMBER); if (_charSeen == '.') { // we have already seen a decimal point of the float addLiteralChar('.'); if (m_char == '_') return setError(ScannerError::IllegalToken); scanDecimalDigits(); // we know we have at least one digit } else { solAssert(_charSeen == 0, ""); // if the first character is '0' we must check for octals and hex if (m_char == '0') { addLiteralCharAndAdvance(); // either 0, 0exxx, 0Exxx, 0.xxx or a hex number if (m_char == 'x') { // hex number kind = HEX; addLiteralCharAndAdvance(); if (!isHexDigit(m_char)) return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x' while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation addLiteralCharAndAdvance(); } else if (isDecimalDigit(m_char)) // We do not allow octal numbers return setError(ScannerError::OctalNotAllowed); } // Parse decimal digits and allow trailing fractional part. if (kind == DECIMAL) { scanDecimalDigits(); // optional if (m_char == '.') { if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Assume the input may be a floating point number with leading '_' in fraction part. // Recover by consuming it all but returning `Illegal` right away. addLiteralCharAndAdvance(); // '.' addLiteralCharAndAdvance(); // '_' scanDecimalDigits(); } if (m_source->isPastEndOfInput() || !isDecimalDigit(m_source->get(1))) { // A '.' has to be followed by a number. literal.complete(); return Token::Number; } addLiteralCharAndAdvance(); scanDecimalDigits(); } } } // scan exponent, if any if (m_char == 'e' || m_char == 'E') { solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); if (kind != DECIMAL) return setError(ScannerError::IllegalExponent); else if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Recover from wrongly placed underscore as delimiter in literal with scientific // notation by consuming until the end. addLiteralCharAndAdvance(); // 'e' addLiteralCharAndAdvance(); // '_' scanDecimalDigits(); literal.complete(); return Token::Number; } // scan exponent addLiteralCharAndAdvance(); // 'e' | 'E' if (m_char == '+' || m_char == '-') addLiteralCharAndAdvance(); if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E' return setError(ScannerError::IllegalExponent); scanDecimalDigits(); } // The source character immediately following a numeric literal must // not be an identifier start or a decimal digit; see ECMA-262 // section 7.8.3, page 17 (note that we read only one decimal digit // if the value is 0). if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) return setError(ScannerError::IllegalNumberEnd); literal.complete(); return Token::Number; } tuple Scanner::scanIdentifierOrKeyword() { solAssert(isIdentifierStart(m_char), ""); LiteralScope literal(this, LITERAL_TYPE_STRING); addLiteralCharAndAdvance(); // Scan the rest of the identifier characters. while (isIdentifierPart(m_char)) //get full literal addLiteralCharAndAdvance(); literal.complete(); return TokenTraits::fromIdentifierOrKeyword(m_nextToken.literal); } }