From 569e0c53f276eccbd9452988910aa8f3b4bcf13f Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Fri, 16 Jun 2017 17:13:18 +0100 Subject: [PATCH] Implement strict UTF-8 validation --- Changelog.md | 3 ++- libdevcore/UTF8.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Changelog.md b/Changelog.md index cfedf1fc4..5f4ec10b9 100644 --- a/Changelog.md +++ b/Changelog.md @@ -13,13 +13,14 @@ Features: * Inline Assembly: function definitions and function calls. * Code Generator: Added the Whiskers template system. * Remove obsolete Why3 output. + * Type Checker: Enforce strict UTF-8 validation. Bugfixes: * Code generator: Use ``REVERT`` instead of ``INVALID`` for generated input validation routines. * Type Checker: Fix address literals not being treated as compile-time constants. * Type Checker: Disallow invoking the same modifier multiple times. - * Type Checker: Make UTF8-validation a bit more sloppy to include more valid sequences. * Type Checker: Do not treat strings that look like addresses as addresses. + * Type Checker: Support valid, but incorrectly rejected UTF-8 sequences. * Fixed crash concerning non-callable types. * Unused variable warnings no longer issued for variables used inside inline assembly. * Code Generator: Fix ABI encoding of empty literal string. diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 449ccc5db..793bc0808 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -27,6 +27,50 @@ namespace dev { +namespace +{ + +/// Validate the first two bytes of a multi-byte sequence against Unicode chapter 3 Table 3-7 (byte1 is the lead byte, byte2 the byte that follows it); returns false for overlong forms, UTF-16 surrogates and code points above U+10FFFF. NOTE(review): the `case a ... b` ranges below are a GNU extension, not standard C++ - confirm that every supported compiler accepts them. +bool isWellFormed(unsigned char byte1, unsigned char byte2) +{ + switch (byte1) + { + case 0xc0 ... 0xc1: + return false; + case 0xc2 ... 0xdf: + break; + case 0xe0: + if (byte2 < 0xa0) + return false; + break; + case 0xe1 ... 0xec: + break; + case 0xed: + if (byte2 > 0x9f) + return false; + break; + case 0xee ... 
0xef: + break; + case 0xf0: + if (byte2 < 0x90) + return false; + break; + case 0xf1 ... 0xf3: + break; + case 0xf4: + if (byte2 > 0x8f) + return false; + break; + case 0xf5 ... 0xf7: + default: + /// Lead bytes 0xf5 to 0xf7 would start code points above U+10FFFF; anything below 0xc0 or above 0xf7 + /// cannot start a multi-byte sequence under Table 3-6 anyway. + return false; + } + return true; +} + +} bool validateUTF8(std::string const& _input, size_t& _invalidPosition) { @@ -36,6 +80,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) for (; i < length; i++) { + // Check for Unicode Chapter 3 Table 3-6 conformity; bytes below 0x80 are single-byte (ASCII) and need no further checks. if ((unsigned char)_input[i] < 0x80) continue; @@ -67,6 +112,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) valid = false; break; } + + // Check for Unicode Chapter 3 Table 3-7 conformity: on the first pass (j == 0) the previous byte and the current byte are validated as a pair. + if ((j == 0) && !isWellFormed(_input[i - 1], _input[i])) + { + valid = false; + break; + } } } @@ -77,5 +129,4 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) return false; } - }