Merge pull request #12804 from ethereum/benchmark-differ

Benchmark differ
2023-10-03 13:03:40 +00:00 · 2022-04-07 09:24:01 +02:00 · 2022-04-07 09:24:01 +02:00 · 52f5ffd876
commit 52f5ffd876
parent 31b5485779 8c9856c52c
5 changed files with 1235 additions and 0 deletions
--- a/scripts/externalTests/benchmark_diff.py
+++ b/scripts/externalTests/benchmark_diff.py
@ -0,0 +1,453 @@
+#!/usr/bin/env python3
+
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from textwrap import dedent
+from typing import Any, Mapping, Optional, Set, Sequence, Union
+import json
+import sys
+
+
+class DiffMode(Enum):
+    """How the diff is structured. The values are the choices accepted for the `diff_mode` argument."""
+    # Keep the structure of the input JSON and replace the values with differences.
+    IN_PLACE = 'inplace'
+    # Build tables, assuming a 3-level project/preset/attribute structure.
+    TABLE = 'table'
+
+
+class DifferenceStyle(Enum):
+    """How numeric differences are presented. The values are the choices accepted by `--style`."""
+    # Plain difference: value after minus value before.
+    ABSOLUTE = 'absolute'
+    # The difference divided by the absolute value before.
+    RELATIVE = 'relative'
+    # Like RELATIVE but formatted as a percentage string, with emphasis in markdown output.
+    HUMANIZED = 'humanized'
+
+
+class OutputFormat(Enum):
+    """Format of the printed diff. The values are the choices accepted by `--output-format`."""
+    # Raw JSON.
+    JSON = 'json'
+    # Plain-text table with separator rows above and below the data.
+    CONSOLE = 'console'
+    # Markdown table followed by a legend; humanized values get backticks, bold and emoji.
+    MARKDOWN = 'markdown'
+
+
+# Default for the --precision option: the `ndigits` passed to round() for relative differences.
+DEFAULT_RELATIVE_PRECISION = 4
+
+# Difference style used when --style is not given, depending on the diff mode.
+DEFAULT_DIFFERENCE_STYLE = {
+    DiffMode.IN_PLACE: DifferenceStyle.ABSOLUTE,
+    DiffMode.TABLE: DifferenceStyle.HUMANIZED,
+}
+# Sanity checks: every key is a DiffMode member and every value is a DifferenceStyle member.
+# NOTE: They only validate the entries that are present; they do not check that every mode has a default.
+assert all(t in DiffMode for t in DEFAULT_DIFFERENCE_STYLE)
+assert all(d in DifferenceStyle for d in DEFAULT_DIFFERENCE_STYLE.values())
+
+# Output format used when --output-format is not given, depending on the diff mode.
+DEFAULT_OUTPUT_FORMAT = {
+    DiffMode.IN_PLACE: OutputFormat.JSON,
+    DiffMode.TABLE: OutputFormat.CONSOLE,
+}
+# Sanity checks: every key is a DiffMode member and every value is an OutputFormat member.
+assert all(m in DiffMode for m in DEFAULT_OUTPUT_FORMAT)
+assert all(o in OutputFormat for o in DEFAULT_OUTPUT_FORMAT.values())
+
+
+class ValidationError(Exception):
+    """Base class of the exceptions defined in this script."""
+    pass
+
+
+class CommandLineError(ValidationError):
+    """Raised for an unsupported combination of command-line options; main() reports it and returns 1."""
+    pass
+
+
+class BenchmarkDiffer:
+    difference_style: DifferenceStyle
+    relative_precision: Optional[int]
+    output_format: OutputFormat
+
+    def __init__(
+        self,
+        difference_style: DifferenceStyle,
+        relative_precision: Optional[int],
+        output_format: OutputFormat,
+    ):
+        self.difference_style = difference_style
+        self.relative_precision = relative_precision
+        self.output_format = output_format
+
+    def run(self, before: Any, after: Any) -> Optional[Union[dict, str, int, float]]:
+        if not isinstance(before, dict) or not isinstance(after, dict):
+            return self._diff_scalars(before, after)
+
+        if before.get('version') != after.get('version'):
+            return self._humanize_diff('!V')
+
+        diff = {}
+        for key in (set(before) | set(after)) - {'version'}:
+            value_diff = self.run(before.get(key), after.get(key))
+            if value_diff not in [None, {}]:
+                diff[key] = value_diff
+
+        return diff
+
+    def _diff_scalars(self, before: Any, after: Any) -> Optional[Union[str, int, float, dict]]:
+        assert not isinstance(before, dict) or not isinstance(after, dict)
+
+        if before is None and after is None:
+            return {}
+        if before is None:
+            return self._humanize_diff('!B')
+        if after is None:
+            return self._humanize_diff('!A')
+        if not isinstance(before, (int, float)) or not isinstance(after, (int, float)):
+            return self._humanize_diff('!T')
+
+        number_diff = self._diff_numbers(before, after)
+        if self.difference_style != DifferenceStyle.HUMANIZED:
+            return number_diff
+
+        return self._humanize_diff(number_diff)
+
+    def _diff_numbers(self, value_before: Union[int, float], value_after: Union[int, float]) -> Union[str, int, float]:
+        diff: Union[str, int, float]
+
+        if self.difference_style == DifferenceStyle.ABSOLUTE:
+            diff = value_after - value_before
+            if isinstance(diff, float) and diff.is_integer():
+                diff = int(diff)
+
+            return diff
+
+        if value_before == 0:
+            if value_after > 0:
+                return '+INF'
+            elif value_after < 0:
+                return '-INF'
+            else:
+                return 0
+
+        diff = (value_after - value_before) / abs(value_before)
+        if self.relative_precision is not None:
+            rounded_diff = round(diff, self.relative_precision)
+            if rounded_diff == 0 and diff < 0:
+                diff = '-0'
+            elif rounded_diff == 0 and diff > 0:
+                diff = '+0'
+            else:
+                diff = rounded_diff
+
+        if isinstance(diff, float) and diff.is_integer():
+            diff = int(diff)
+
+        return diff
+
+    def _humanize_diff(self, diff: Union[str, int, float]) -> str:
+        def wrap(value: str, symbol: str):
+            return f"{symbol}{value}{symbol}"
+
+        markdown = (self.output_format == OutputFormat.MARKDOWN)
+
+        if isinstance(diff, str) and diff.startswith('!'):
+            return wrap(diff, '`' if markdown else '')
+
+        value: Union[str, int, float]
+        if isinstance(diff, (int, float)):
+            value = diff * 100
+            if isinstance(value, float) and self.relative_precision is not None:
+                # The multiplication can result in new significant digits appearing. We need to reround.
+                # NOTE: round() works fine with negative precision.
+                value = round(value, self.relative_precision - 2)
+                if isinstance(value, float) and value.is_integer():
+                    value = int(value)
+            suffix = ''
+            prefix = ''
+            if diff < 0:
+                prefix = ''
+                if markdown:
+                    suffix += ' ✅'
+            elif diff > 0:
+                prefix = '+'
+                if markdown:
+                    suffix += ' ❌'
+            important = (diff != 0)
+        else:
+            value = diff
+            important = False
+            prefix = ''
+            suffix = ''
+
+        return wrap(
+            wrap(
+                f"{prefix}{value}%{suffix}",
+                '`' if markdown else ''
+            ),
+            '**' if important and markdown else ''
+        )
+
+
+# Immutable container mapping a column name to the sequence of values in that column.
+# NOTE(review): not referenced anywhere else in the visible part of this file - confirm whether it is still needed.
+@dataclass(frozen=True)
+class DiffTable:
+    columns: Mapping[str, Sequence[Union[int, float, str]]]
+
+
+class DiffTableSet:
+    table_headers: Sequence[str]
+    row_headers: Sequence[str]
+    column_headers: Sequence[str]
+
+    # Cells is a nested dict rather than a 3D array so that conversion to JSON is straightforward
+    cells: Mapping[str, Mapping[str, Mapping[str, Union[int, float, str]]]] # preset -> project -> attribute
+
+    def __init__(self, diff: dict):
+        self.table_headers = sorted(self._find_all_preset_names(diff))
+        self.column_headers = sorted(self._find_all_attribute_names(diff))
+        self.row_headers = sorted(project for project in diff)
+
+        # All dimensions must have unique values
+        assert len(self.table_headers) == len(set(self.table_headers))
+        assert len(self.column_headers) == len(set(self.column_headers))
+        assert len(self.row_headers) == len(set(self.row_headers))
+
+        self.cells = {
+            preset: {
+                project: {
+                    attribute: self._cell_content(diff, project, preset, attribute)
+                    for attribute in self.column_headers
+                }
+                for project in self.row_headers
+            }
+            for preset in self.table_headers
+        }
+
+    def calculate_row_column_width(self) -> int:
+        return max(len(h) for h in self.row_headers)
+
+    def calculate_column_widths(self, table_header: str) -> Sequence[int]:
+        assert table_header in self.table_headers
+
+        return [
+            max(
+                len(column_header),
+                max(
+                    len(str(self.cells[table_header][row_header][column_header]))
+                    for row_header in self.row_headers
+                )
+            )
+            for column_header in self.column_headers
+        ]
+
+    @classmethod
+    def _find_all_preset_names(cls, diff: dict) -> Set[str]:
+        return {
+            preset
+            for project, project_diff in diff.items()
+            if isinstance(project_diff, dict)
+            for preset in project_diff
+        }
+
+    @classmethod
+    def _find_all_attribute_names(cls, diff: dict) -> Set[str]:
+        return {
+            attribute
+            for project, project_diff in diff.items()
+            if isinstance(project_diff, dict)
+            for preset, preset_diff in project_diff.items()
+            if isinstance(preset_diff, dict)
+            for attribute in preset_diff
+        }
+
+    @classmethod
+    def _cell_content(cls, diff: dict, project: str, preset: str, attribute: str) -> str:
+        assert project in diff
+
+        if isinstance(diff[project], str):
+            return diff[project]
+        if preset not in diff[project]:
+            return ''
+        if isinstance(diff[project][preset], str):
+            return diff[project][preset]
+        if attribute not in diff[project][preset]:
+            return ''
+
+        return diff[project][preset][attribute]
+
+
+class DiffTableFormatter:
+    LEGEND = dedent("""
+        `!V` = version mismatch
+        `!B` = no value in the "before" version
+        `!A` = no value in the "after" version
+        `!T` = one or both values were not numeric and could not be compared
+        `-0` = very small negative value rounded to zero
+        `+0` = very small positive value rounded to zero
+    """)
+
+    @classmethod
+    def run(cls, diff_table_set: DiffTableSet, output_format: OutputFormat):
+        if output_format == OutputFormat.JSON:
+            return json.dumps(diff_table_set.cells, indent=4, sort_keys=True)
+        else:
+            assert output_format in {OutputFormat.CONSOLE, OutputFormat.MARKDOWN}
+
+            output = ''
+            for table_header in diff_table_set.table_headers:
+                column_widths = ([
+                    diff_table_set.calculate_row_column_width(),
+                    *diff_table_set.calculate_column_widths(table_header)
+                ])
+
+                if output_format == OutputFormat.MARKDOWN:
+                    output += f'\n### `{table_header}`\n'
+                else:
+                    output += f'\n{table_header.upper()}\n'
+
+                if output_format == OutputFormat.CONSOLE:
+                    output += cls._format_separator_row(column_widths, output_format) + '\n'
+                output += cls._format_data_row(['project', *diff_table_set.column_headers], column_widths) + '\n'
+                output += cls._format_separator_row(column_widths, output_format) + '\n'
+
+                for row_header in diff_table_set.row_headers:
+                    row = [
+                        diff_table_set.cells[table_header][row_header][column_header]
+                        for column_header in diff_table_set.column_headers
+                    ]
+                    output += cls._format_data_row([row_header, *row], column_widths) + '\n'
+
+                if output_format == OutputFormat.CONSOLE:
+                    output += cls._format_separator_row(column_widths, output_format) + '\n'
+
+            if output_format == OutputFormat.MARKDOWN:
+                output += f'\n{cls.LEGEND}\n'
+            return output
+
+    @classmethod
+    def _format_separator_row(cls, widths: Sequence[int], output_format: OutputFormat):
+        assert output_format in {OutputFormat.CONSOLE, OutputFormat.MARKDOWN}
+
+        if output_format == OutputFormat.MARKDOWN:
+            return '|:' + ':|-'.join('-' * width for width in widths) + ':|'
+        else:
+            return '|-' + '-|-'.join('-' * width for width in widths) + '-|'
+
+    @classmethod
+    def _format_data_row(cls, cells: Sequence[Union[int, float, str]], widths: Sequence[int]):
+        assert len(cells) == len(widths)
+
+        return '| ' + ' | '.join(str(cell).rjust(width) for cell, width in zip(cells, widths)) + ' |'
+
+
+# Immutable set of validated command-line options, as returned by process_commandline().
+@dataclass(frozen=True)
+class CommandLineOptions:
+    diff_mode: DiffMode
+    # Paths to the JSON files with the original and the new benchmark results.
+    report_before: Path
+    report_after: Path
+    # Taken from the command line or, if not specified there, from the per-mode defaults.
+    difference_style: DifferenceStyle
+    relative_precision: int
+    output_format: OutputFormat
+
+
+def process_commandline() -> CommandLineOptions:
+    script_description = (
+        "Compares summarized benchmark reports and outputs JSON with the same structure but listing only differences. "
+        "Can also print the output as markdown table and format the values to make differences stand out more."
+    )
+
+    parser = ArgumentParser(description=script_description)
+    parser.add_argument(
+        dest='diff_mode',
+        choices=[m.value for m in DiffMode],
+        help=(
+            "Diff mode: "
+            f"'{DiffMode.IN_PLACE.value}' preserves input JSON structure and replace values with differences; "
+            f"'{DiffMode.TABLE.value}' creates a table assuming 3-level project/preset/attribute structure."
+        )
+    )
+    parser.add_argument(dest='report_before', help="Path to a JSON file containing original benchmark results.")
+    parser.add_argument(dest='report_after', help="Path to a JSON file containing new benchmark results.")
+    parser.add_argument(
+        '--style',
+        dest='difference_style',
+        choices=[s.value for s in DifferenceStyle],
+        help=(
+            "How to present numeric differences: "
+            f"'{DifferenceStyle.ABSOLUTE.value}' subtracts new from original; "
+            f"'{DifferenceStyle.RELATIVE.value}' also divides by the original; "
+            f"'{DifferenceStyle.HUMANIZED.value}' is like relative but value is a percentage and "
+            "positive/negative changes are emphasized. "
+            f"(default: '{DEFAULT_DIFFERENCE_STYLE[DiffMode.IN_PLACE]}' in '{DiffMode.IN_PLACE.value}' mode, "
+            f"'{DEFAULT_DIFFERENCE_STYLE[DiffMode.TABLE]}' in '{DiffMode.TABLE.value}' mode)"
+        )
+    )
+    # NOTE: Negative values are valid for precision. round() handles them in a sensible way.
+    parser.add_argument(
+        '--precision',
+        dest='relative_precision',
+        type=int,
+        default=DEFAULT_RELATIVE_PRECISION,
+        help=(
+            "Number of significant digits for relative differences. "
+            f"Note that with --style={DifferenceStyle.HUMANIZED.value} the rounding is applied "
+            "**before** converting the value to a percentage so you need to add 2. "
+            f"Has no effect when used together with --style={DifferenceStyle.ABSOLUTE.value}. "
+            f"(default: {DEFAULT_RELATIVE_PRECISION})"
+        )
+    )
+    parser.add_argument(
+        '--output-format',
+        dest='output_format',
+        choices=[o.value for o in OutputFormat],
+        help=(
+            "The format to use for the diff: "
+            f"'{OutputFormat.JSON.value}' is raw JSON; "
+            f"'{OutputFormat.CONSOLE.value}' is a table with human-readable values that will look good in the console output. "
+            f"'{OutputFormat.MARKDOWN.value}' is similar '{OutputFormat.CONSOLE.value}' but adjusted to "
+            "render as proper markdown and with extra elements (legend, emoji to make non-zero values stand out more, etc)."
+            f"(default: '{DEFAULT_OUTPUT_FORMAT[DiffMode.IN_PLACE]}' in '{DiffMode.IN_PLACE.value}' mode, "
+            f"'{DEFAULT_OUTPUT_FORMAT[DiffMode.TABLE]}' in '{DiffMode.TABLE.value}' mode)"
+        )
+    )
+
+    options = parser.parse_args()
+
+    if options.difference_style is not None:
+        difference_style = DifferenceStyle(options.difference_style)
+    else:
+        difference_style = DEFAULT_DIFFERENCE_STYLE[DiffMode(options.diff_mode)]
+
+    if options.output_format is not None:
+        output_format = OutputFormat(options.output_format)
+    else:
+        output_format = DEFAULT_OUTPUT_FORMAT[DiffMode(options.diff_mode)]
+
+    processed_options = CommandLineOptions(
+        diff_mode=DiffMode(options.diff_mode),
+        report_before=Path(options.report_before),
+        report_after=Path(options.report_after),
+        difference_style=difference_style,
+        relative_precision=options.relative_precision,
+        output_format=output_format,
+    )
+
+    if processed_options.diff_mode == DiffMode.IN_PLACE and processed_options.output_format != OutputFormat.JSON:
+        raise CommandLineError(
+            f"Only the '{OutputFormat.JSON.value}' output format is supported in the '{DiffMode.IN_PLACE.value}' mode."
+        )
+
+    return processed_options
+
+
+def main():
+    try:
+        options = process_commandline()
+
+        differ = BenchmarkDiffer(options.difference_style, options.relative_precision, options.output_format)
+        diff = differ.run(
+            json.loads(options.report_before.read_text('utf-8')),
+            json.loads(options.report_after.read_text('utf-8')),
+        )
+
+        if options.diff_mode == DiffMode.IN_PLACE:
+            print(json.dumps(diff, indent=4, sort_keys=True))
+        else:
+            assert options.diff_mode == DiffMode.TABLE
+            print(DiffTableFormatter.run(DiffTableSet(diff), options.output_format))
+
+        return 0
+    except CommandLineError as exception:
+        print(f"ERROR: {exception}", file=sys.stderr)
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/test/scripts/fixtures/summarized-benchmark-diff-develop-branch-humanized.md
+++ b/test/scripts/fixtures/summarized-benchmark-diff-develop-branch-humanized.md
@ -0,0 +1,75 @@
+
+### `ir-no-optimize`
+|   project |  bytecode_size | deployment_gas |     method_gas |
+|:---------:|---------------:|---------------:|---------------:|
+|    bleeps |                |                |                |
+|    colony |                |                |                |
+| elementfi |                |                |           `0%` |
+|       ens |           `!A` |           `!A` |           `!A` |
+|     euler | **`+1.43% ❌`** |           `0%` | **`+2.47% ❌`** |
+|    gnosis |           `!B` |           `!B` |           `!B` |
+|  zeppelin |                |                |                |
+
+### `ir-optimize-evm+yul`
+|   project |   bytecode_size |  deployment_gas | method_gas |
+|:---------:|----------------:|----------------:|-----------:|
+|    bleeps |  **`+0.53% ❌`** |            `0%` |      `-0%` |
+|    colony |            `!A` |            `!A` |       `!A` |
+| elementfi |                 |                 |            |
+|       ens |            `!A` |            `!A` |       `!A` |
+|     euler | **`+12.64% ❌`** | **`+11.98% ❌`** |       `0%` |
+|    gnosis |            `!B` |            `!B` |       `!B` |
+|  zeppelin |                 |                 |            |
+
+### `ir-optimize-evm-only`
+|   project | bytecode_size | deployment_gas | method_gas |
+|:---------:|--------------:|---------------:|-----------:|
+|    bleeps |               |                |            |
+|    colony |               |                |            |
+| elementfi |          `!B` |           `!B` |       `!B` |
+|       ens |          `!A` |           `!A` |       `!A` |
+|     euler |          `!V` |           `!V` |       `!V` |
+|    gnosis |          `!B` |           `!B` |       `!B` |
+|  zeppelin |               |                |            |
+
+### `legacy-no-optimize`
+|   project | bytecode_size | deployment_gas | method_gas |
+|:---------:|--------------:|---------------:|-----------:|
+|    bleeps |               |                |            |
+|    colony |          `!B` |           `!B` |       `!B` |
+| elementfi |          `!A` |           `!B` |            |
+|       ens |          `!A` |           `!A` |       `!A` |
+|     euler |          `!V` |           `!V` |       `!V` |
+|    gnosis |          `!B` |           `!B` |       `!B` |
+|  zeppelin |               |                |            |
+
+### `legacy-optimize-evm+yul`
+|   project | bytecode_size | deployment_gas | method_gas |
+|:---------:|--------------:|---------------:|-----------:|
+|    bleeps |          `0%` |           `0%` |       `0%` |
+|    colony |          `0%` |                |            |
+| elementfi |          `!A` |           `!B` |            |
+|       ens |          `!A` |           `!A` |       `!A` |
+|     euler |          `!V` |           `!V` |       `!V` |
+|    gnosis |          `!B` |           `!B` |       `!B` |
+|  zeppelin |          `0%` |           `0%` |            |
+
+### `legacy-optimize-evm-only`
+|   project | bytecode_size | deployment_gas | method_gas |
+|:---------:|--------------:|---------------:|-----------:|
+|    bleeps |               |                |            |
+|    colony |               |                |            |
+| elementfi |          `!A` |           `!A` |       `!A` |
+|       ens |          `!A` |           `!A` |       `!A` |
+|     euler |          `!V` |           `!V` |       `!V` |
+|    gnosis |          `!B` |           `!B` |       `!B` |
+|  zeppelin |               |                |            |
+
+
+`!V` = version mismatch
+`!B` = no value in the "before" version
+`!A` = no value in the "after" version
+`!T` = one or both values were not numeric and could not be compared
+`-0` = very small negative value rounded to zero
+`+0` = very small positive value rounded to zero
+
--- a/test/scripts/fixtures/summarized-benchmarks-branch.json
+++ b/test/scripts/fixtures/summarized-benchmarks-branch.json
@ -0,0 +1,100 @@
+{
+    "bleeps": {
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 132868,
+            "deployment_gas": 0,
+            "method_gas": 39289198,
+            "version": "bb90cd0"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 137869,
+            "deployment_gas": 0,
+            "method_gas": 38863224,
+            "version": "bb90cd0"
+        }
+    },
+    "colony": {
+        "legacy-no-optimize": {
+            "bytecode_size": 664190,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "573399b"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 363606,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "573399b"
+        }
+    },
+    "elementfi": {
+        "legacy-no-optimize": {
+            "bytecode_size": null,
+            "deployment_gas": 69200158,
+            "method_gas": null,
+            "version": "87f8b5e"
+        },
+        "legacy-optimize-evm+yul": {
+            "deployment_gas": 40951128,
+            "version": "87f8b5e"
+        },
+        "ir-optimize-evm-only": {},
+        "ir-no-optimize": {
+            "deployment_gas": null,
+            "method_gas": 2777867251,
+            "version": "87f8b5e"
+        }
+    },
+    "euler": {
+        "ir-no-optimize": {
+            "bytecode_size": 328540,
+            "deployment_gas": 61591870,
+            "method_gas": 3537419168,
+            "version": "2ef99fc"
+        },
+        "legacy-no-optimize": {
+            "bytecode_size": 328540,
+            "deployment_gas": 62590688,
+            "method_gas": 3537419168,
+            "version": "2ef99fc"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 182190,
+            "deployment_gas": 35236828,
+            "method_gas": 2777867251,
+            "version": "2ef99fc"
+        },
+        "legacy-optimize-evm-only": {
+            "bytecode_size": 205211,
+            "deployment_gas": 39459629,
+            "method_gas": 2978467272,
+            "version": "2ef99fc"
+        },
+        "ir-optimize-evm-only": {
+            "bytecode_size": 205211,
+            "deployment_gas": 39459629,
+            "method_gas": 2978467272,
+            "version": "2ef99fc"
+        },
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 205211,
+            "deployment_gas": 39459629,
+            "method_gas": 2777867251
+        }
+    },
+    "gnosis": {
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 56069,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "ea09294"
+        }
+    },
+    "zeppelin": {
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 510428,
+            "deployment_gas": 94501114,
+            "version": "af7ec04"
+        }
+    }
+}
--- a/test/scripts/fixtures/summarized-benchmarks-develop.json
+++ b/test/scripts/fixtures/summarized-benchmarks-develop.json
@ -0,0 +1,99 @@
+{
+    "bleeps": {
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 132165,
+            "deployment_gas": 0,
+            "method_gas": 39289935,
+            "version": "bb90cd0"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 137869,
+            "deployment_gas": 0,
+            "method_gas": 38863224,
+            "version": "bb90cd0"
+        }
+    },
+    "colony": {
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 363606,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "573399b"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 363606,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "573399b"
+        }
+    },
+    "elementfi": {
+        "legacy-no-optimize": {
+            "bytecode_size": 890560,
+            "deployment_gas": null,
+            "method_gas": null,
+            "version": "87f8b5e"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 536668,
+            "version": "87f8b5e"
+        },
+        "legacy-optimize-evm-only": {},
+        "ir-no-optimize": {
+            "bytecode_size": null,
+            "method_gas": 2777867251,
+            "version": "87f8b5e"
+        }
+    },
+    "euler": {
+        "ir-no-optimize": {
+            "bytecode_size": 323909,
+            "deployment_gas": 61591870,
+            "method_gas": 3452105184,
+            "version": "2ef99fc"
+        },
+        "legacy-no-optimize": {
+            "bytecode_size": 323909,
+            "deployment_gas": 61591870,
+            "method_gas": 3452105184,
+            "version": "c23e8bd"
+        },
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 182190,
+            "deployment_gas": 35236828,
+            "method_gas": 2777867251,
+            "version": "c23e8bd"
+        },
+        "legacy-optimize-evm-only": {
+            "bytecode_size": 202106,
+            "deployment_gas": 38790600,
+            "method_gas": 2907368790,
+            "version": "v1.2.3"
+        },
+        "ir-optimize-evm-only": {
+            "bytecode_size": 182190,
+            "deployment_gas": 35236828,
+            "method_gas": 2777867251
+        },
+        "ir-optimize-evm+yul": {
+            "bytecode_size": 182190,
+            "deployment_gas": 35236828,
+            "method_gas": 2777867251
+        }
+    },
+    "ens": {
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 156937,
+            "deployment_gas": 30073789,
+            "method_gas": 105365362,
+            "version": "v0.0.8"
+        }
+    },
+    "zeppelin": {
+        "legacy-optimize-evm+yul": {
+            "bytecode_size": 510428,
+            "deployment_gas": 94501114,
+            "version": "af7ec04"
+        }
+    }
+}
--- a/test/scripts/test_externalTests_benchmark_diff.py
+++ b/test/scripts/test_externalTests_benchmark_diff.py
@ -0,0 +1,508 @@
+#!/usr/bin/env python3
+
+from textwrap import dedent
+import json
+import unittest
+
+from unittest_helpers import FIXTURE_DIR, load_fixture
+
+# NOTE: This test file only works with scripts/ added to PYTHONPATH so pylint can't find the imports
+# pragma pylint: disable=import-error
+from externalTests.benchmark_diff import BenchmarkDiffer, DifferenceStyle, DiffTableSet, DiffTableFormatter, OutputFormat
+# pragma pylint: enable=import-error
+
+# Fixture reports used by the tests below as the "before" (develop) and "after" (branch) inputs.
+SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-develop.json'
+SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-branch.json'
+
+# Expected Markdown rendering of the humanized develop -> branch diff.
+# NOTE: Unlike the JSON fixtures, this one is loaded once, at import time, and later compared
+# directly with the output of DiffTableFormatter.run().
+SUMMARIZED_DIFF_HUMANIZED_MD_PATH = FIXTURE_DIR / 'summarized-benchmark-diff-develop-branch-humanized.md'
+SUMMARIZED_DIFF_HUMANIZED_MD = load_fixture(SUMMARIZED_DIFF_HUMANIZED_MD_PATH)
+
+
+class TestBenchmarkDiff(unittest.TestCase):
+    """End-to-end check of `BenchmarkDiffer.run()` against the summarized benchmark fixtures."""
+
+    def setUp(self):
+        # Allow long diffs in assertion failure messages; the compared dicts are large.
+        self.maxDiff = 10000
+
+    def test_benchmark_diff(self):
+        # Diffs the "develop" report (before) against the "branch" report (after) using absolute
+        # differences and compares the result with the hand-written expectation below.
+        # Marker strings used in the expectation:
+        #   "!A" = no value in "after", "!B" = no value in "before", "!V" = version mismatch.
+        # Numeric expectations are spelled out as subtractions (presumably `after - before` using
+        # the raw fixture values - verify against the fixtures) to keep their origin visible.
+        report_before = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH))
+        report_after = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH))
+        expected_diff = {
+            "bleeps": {
+                "ir-optimize-evm+yul": {
+                    # Numerical difference -> negative/positive/zero.
+                    # Zeros are not skipped to differentiate them from missing values.
+                    "bytecode_size": 132868 - 132165,
+                    "deployment_gas": 0,
+                    "method_gas": 39289198 - 39289935,
+                },
+                "legacy-optimize-evm+yul": {
+                    # No differences within preset -> zeros still present.
+                    "bytecode_size": 0,
+                    "deployment_gas": 0,
+                    "method_gas": 0,
+                },
+            },
+            "colony": {
+                # Preset missing on one side -> replace dict with string
+                "ir-optimize-evm+yul": "!A",
+                "legacy-no-optimize": "!B",
+                "legacy-optimize-evm+yul": {
+                    "bytecode_size": 0,
+                    # Attribute missing on both sides -> skip
+                    #"deployment_gas":
+                    #"method_gas":
+                },
+            },
+            "elementfi": {
+                "legacy-no-optimize": {
+                    # Attributes null on one side -> replace value with string
+                    "bytecode_size": "!A",
+                    "deployment_gas": "!B",
+                    # Attribute null on both sides -> skip
+                    #"method_gas":
+                },
+                "legacy-optimize-evm+yul": {
+                    # Attributes missing on one side -> replace value with string
+                    "bytecode_size": "!A",
+                    "deployment_gas": "!B",
+                    # Attribute missing on both sides -> skip
+                    #"method_gas":
+                },
+                "ir-no-optimize": {
+                    # Attributes missing on one side, null on the other -> skip
+                    #"bytecode_size":
+                    #"deployment_gas":
+                    "method_gas": 0,
+                },
+                # Empty preset missing on one side -> replace dict with string
+                "legacy-optimize-evm-only": "!A",
+                "ir-optimize-evm-only": "!B",
+            },
+            "euler": {
+                # Matching versions -> show attributes, skip version
+                "ir-no-optimize": {
+                    "bytecode_size": 328540 - 323909,
+                    "deployment_gas": 0,
+                    "method_gas": 3537419168 - 3452105184,
+                },
+                # Different versions, different values -> replace whole preset with string
+                "legacy-no-optimize": "!V",
+                # Different versions, same values -> replace whole preset with string
+                "legacy-optimize-evm+yul": "!V",
+                # Different versions (not a commit hash), different values -> replace whole preset with string
+                "legacy-optimize-evm-only": "!V",
+                # Version missing on one side -> replace whole preset with string
+                "ir-optimize-evm-only": "!V",
+                # Version missing on both sides -> assume same version
+                "ir-optimize-evm+yul": {
+                    "bytecode_size": 205211 - 182190,
+                    "deployment_gas": 39459629 - 35236828,
+                    "method_gas": 0,
+                },
+            },
+            "zeppelin": {
+                "legacy-optimize-evm+yul": {
+                    # Whole project identical -> attributes still present, with zeros
+                    "bytecode_size": 0,
+                    "deployment_gas": 0,
+                    # Field missing on both sides -> skip
+                    #"method_gas":
+                }
+            },
+            # Empty project missing on one side -> replace its dict with a string
+            "gnosis": "!B",
+            "ens": "!A",
+        }
+        differ = BenchmarkDiffer(DifferenceStyle.ABSOLUTE, None, OutputFormat.JSON)
+        self.assertEqual(differ.run(report_before, report_after), expected_diff)
+
+
+class TestBenchmarkDiffer(unittest.TestCase):
+    """Table-driven tests of `BenchmarkDiffer.run()` on small hand-made inputs.
+
+    Each case is a `(before, after, expected_diff)` tuple. By default every case is checked both
+    as a bare value and wrapped in one, two and three levels of nested dicts.
+    """
+
+    def setUp(self):
+        # Allow long diffs in assertion failure messages.
+        self.maxDiff = 10000
+
+    @staticmethod
+    def _nest(value, levels):
+        """Wrap `value` in one single-key dict per entry of `levels`.
+
+        Keys are applied in iteration order, so the first level ends up innermost:
+        `_nest(1, ['p', 's'])` gives `{'s': {'p': 1}}`. An empty `levels` returns `value` as is.
+        """
+        nested_value = value
+        for level in levels:
+            nested_value = {level: nested_value}
+
+        return nested_value
+
+    def _assert_single_value_diff_matches(self, differ, cases, nest_result=True, nestings=None):
+        """Run `differ` on every case at every nesting depth and assert on the result.
+
+        `cases` is an iterable of `(before, after, expected_diff)` tuples. `before` and `after`
+        are always wrapped with `_nest()`. The expected diff is wrapped the same way only when
+        `nest_result` is true; otherwise it is compared as given. `nestings` defaults to zero to
+        three levels of wrapping.
+        """
+        if nestings is None:
+            nestings = [[], ['p'], ['p', 's'], ['p', 's', 'a']]
+
+        for levels in nestings:
+            for (before, after, expected_diff) in cases:
+                self.assertEqual(
+                    differ.run(self._nest(before, levels), self._nest(after, levels)),
+                    self._nest(expected_diff, levels) if nest_result else expected_diff,
+                    f'Wrong diff for {self._nest(before, levels)} vs {self._nest(after, levels)}'
+                )
+
+    def test_empty(self):
+        # Two empty dicts diff to an empty dict at every nesting depth, hence nest_result=False.
+        for style in DifferenceStyle:
+            differ = BenchmarkDiffer(style, None, OutputFormat.JSON)
+            self._assert_single_value_diff_matches(differ, [({}, {}, {})], nest_result=False)
+
+    def test_null(self):
+        # A value that is null on both sides is skipped, leaving an empty diff at every depth.
+        for style in DifferenceStyle:
+            differ = BenchmarkDiffer(style, None, OutputFormat.JSON)
+            self._assert_single_value_diff_matches(differ, [(None, None, {})], nest_result=False)
+
+    def test_number_diff_absolute_json(self):
+        # NOTE: Despite the "_json" suffix this runs for every OutputFormat.
+        # Expected value is `after - before`. The last four cases show that the precision
+        # argument (4) does not round absolute differences.
+        for output_format in OutputFormat:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(DifferenceStyle.ABSOLUTE, 4, output_format),
+                [
+                    (2,   2,    0),
+                    (2,   5,    3),
+                    (5,   2,   -3),
+                    (2.0, 2.0,  0),
+                    (2,   2.0,  0),
+                    (2.0, 2,    0),
+                    (2,   2.5,  2.5 - 2),
+                    (2.5, 2,    2 - 2.5),
+
+                    (0,   0,    0),
+                    (0,   2,    2),
+                    (0,   -2,  -2),
+
+                    (-3, -1,    2),
+                    (-1, -3,   -2),
+                    (2,   0,   -2),
+                    (-2,  0,    2),
+
+                    (1.00006, 1,  1 - 1.00006),
+                    (1, 1.00006,  1.00006 - 1),
+                    (1.00004, 1, 1 - 1.00004),
+                    (1, 1.00004, 1.00004 - 1),
+                ],
+            )
+
+    def test_number_diff_json(self):
+        # NOTE: Despite the generic name this covers DifferenceStyle.RELATIVE, for every OutputFormat.
+        # Expected values are ratios rounded to the requested 4 digits (e.g. 0.6667). The sign
+        # follows the direction of the change even when `before` is negative. A zero `before`
+        # yields the strings '+INF'/'-INF' and non-zero changes that round to zero yield '+0'/'-0'.
+        for output_format in OutputFormat:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(DifferenceStyle.RELATIVE, 4, output_format),
+                [
+                    (2,   2,   0),
+                    (2,   5,   (5 - 2) / 2),
+                    (5,   2,   (2 - 5) / 5),
+                    (2.0, 2.0, 0),
+                    (2,   2.0, 0),
+                    (2.0, 2,   0),
+                    (2,   2.5, (2.5 - 2) / 2),
+                    (2.5, 2,   (2 - 2.5) / 2.5),
+
+                    (0,   0,   0),
+                    (0,   2,   '+INF'),
+                    (0,   -2,  '-INF'),
+
+                    (-3, -1,   0.6667),
+                    (-1, -3,  -2),
+                    (2,   0,  -1),
+                    (-2,  0,   1),
+
+                    (1.00006, 1,   -0.0001),
+                    (1, 1.00006,    0.0001),
+                    (1.000004, 1, '-0'),
+                    (1, 1.000004, '+0'),
+                ],
+            )
+
+    def test_number_diff_humanized_json_and_console(self):
+        # Humanized style renders the relative difference as a percentage string with an explicit
+        # sign for non-zero changes. JSON and console output are expected to be identical.
+        for output_format in [OutputFormat.JSON, OutputFormat.CONSOLE]:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4, output_format),
+                [
+                    (2,   2,      '0%'),
+                    (2,   5,   '+150%'),
+                    (5,   2,    '-60%'),
+                    (2.0, 2.0,    '0%'),
+                    (2,   2.0,    '0%'),
+                    (2.0, 2,      '0%'),
+                    (2,   2.5,  '+25%'),
+                    (2.5, 2,    '-20%'),
+
+                    (0,   0,      '0%'),
+                    (0,   2,   '+INF%'),
+                    (0,   -2,  '-INF%'),
+
+                    (-3, -1, '+66.67%'),
+                    (-1, -3,   '-200%'),
+                    (2,   0,   '-100%'),
+                    (-2,  0,   '+100%'),
+
+                    (1.00006, 1,  '-0.01%'),
+                    (1, 1.00006,  '+0.01%'),
+                    (1.000004, 1,    '-0%'),
+                    (1, 1.000004,    '+0%'),
+                ],
+            )
+
+    def test_number_diff_humanized_markdown(self):
+        # Markdown output wraps every value in backticks. Finite non-zero changes are additionally
+        # bold and tagged with an emoji (increase -> cross, decrease -> check mark), while zero,
+        # +/-INF and values rounded to +/-0 stay plain.
+        self._assert_single_value_diff_matches(
+            BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4, OutputFormat.MARKDOWN),
+            [
+                (2,   2,             '`0%`'),
+                (2,   5,   '**`+150% ❌`**'),
+                (5,   2,    '**`-60% ✅`**'),
+                (2.0, 2.0,           '`0%`'),
+                (2,   2.0,           '`0%`'),
+                (2.0, 2,             '`0%`'),
+                (2,   2.5,  '**`+25% ❌`**'),
+                (2.5, 2,    '**`-20% ✅`**'),
+
+                (0,   0,             '`0%`'),
+                (0,   2,          '`+INF%`'),
+                (0,   -2,         '`-INF%`'),
+
+                (-3, -1, '**`+66.67% ❌`**'),
+                (-1, -3,   '**`-200% ✅`**'),
+                (2,   0,   '**`-100% ✅`**'),
+                (-2,  0,   '**`+100% ❌`**'),
+
+                (1.00006, 1,  '**`-0.01% ✅`**'),
+                (1, 1.00006,  '**`+0.01% ❌`**'),
+                (1.000004, 1,           '`-0%`'),
+                (1, 1.000004,           '`+0%`'),
+            ],
+        )
+
+    def test_type_mismatch(self):
+        # Values that cannot be compared numerically produce '!T' in every style. This includes
+        # two equal strings, not only pairs of different types.
+        for style in DifferenceStyle:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(style, 4, OutputFormat.JSON),
+                [
+                    (1, {}, '!T'),
+                    ({}, 1, '!T'),
+                    (1.5, {}, '!T'),
+                    ({}, 1.5, '!T'),
+                    ('1', {}, '!T'),
+                    ({}, '1', '!T'),
+                    (1, '1', '!T'),
+                    ('1', 1, '!T'),
+                    (1.5, '1', '!T'),
+                    ('1', 1.5, '!T'),
+                    ('1', '1', '!T'),
+                ],
+            )
+
+    def test_version_mismatch(self):
+        # Dicts whose 'version' entries differ collapse to '!V', whatever the remaining content
+        # is (equal or different values, numbers, strings or nested dicts).
+        for style in DifferenceStyle:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(style, 4, OutputFormat.JSON),
+                [
+                    ({'a': 123, 'version': 1}, {'a': 123, 'version': 2}, '!V'),
+                    ({'a': 123, 'version': 2}, {'a': 123, 'version': 1}, '!V'),
+                    ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 'b'}, '!V'),
+                    ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 1}, '!V'),
+
+                    ({'a': 'a', 'version': 1}, {'a': 'a', 'version': 2}, '!V'),
+                    ({'a': {}, 'version': 1}, {'a': {}, 'version': 2}, '!V'),
+                    ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 1}, 'version': 2}, '!V'),
+
+                    ({'a': 123, 'version': 1}, {'a': 456, 'version': 2}, '!V'),
+                    ({'a': 'a', 'version': 1}, {'a': 'b', 'version': 2}, '!V'),
+                    ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 2}, 'version': 2}, '!V'),
+                ],
+            )
+
+    def test_missing(self):
+        # A value that is null or absent on exactly one side is reported as '!A' (nothing in
+        # "after") or '!B' (nothing in "before"). A whole missing subtree collapses to one marker.
+        for style in DifferenceStyle:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(style, None, OutputFormat.JSON),
+                [
+                    (1, None, '!A'),
+                    (None, 1, '!B'),
+                    ('1', None, '!A'),
+                    (None, '1', '!B'),
+                    ({}, None, '!A'),
+                    (None, {}, '!B'),
+
+                    ({'x': 1}, {}, {'x': '!A'}),
+                    ({}, {'x': 1}, {'x': '!B'}),
+                    ({'x': 1}, {'x': None}, {'x': '!A'}),
+                    ({'x': None}, {'x': 1}, {'x': '!B'}),
+                    ({'x': 1}, {'y': 1}, {'x': '!A', 'y': '!B'}),
+
+                    ({'x': {}}, {}, {'x': '!A'}),
+                    ({}, {'x': {}}, {'x': '!B'}),
+                    ({'p': {'x': {}}}, {}, {'p': '!A'}),
+                    ({}, {'p': {'x': {}}}, {'p': '!B'}),
+                ],
+            )
+
+    def test_missing_vs_null(self):
+        # A key that is null on one side and absent on the other counts as missing on both sides
+        # and is skipped, so the diff stays empty at every nesting depth.
+        for style in DifferenceStyle:
+            self._assert_single_value_diff_matches(
+                BenchmarkDiffer(style, None, OutputFormat.JSON),
+                [
+                    ({'a': None}, {}, {}),
+                    ({}, {'a': None}, {}),
+                ],
+                nest_result=False,
+            )
+
+
+class TestDiffTableFormatter(unittest.TestCase):
+    """Tests of `DiffTableFormatter.run()` for the JSON, console and Markdown output formats.
+
+    The diff passed in is keyed project -> preset -> attribute, while the expected tables are
+    grouped by preset, with one row per project and one column per attribute.
+    """
+
+    def setUp(self):
+        # Allow long diffs in assertion failure messages.
+        self.maxDiff = 10000
+
+        # Small hand-made reports covering: a plain numeric change (project A / preset X),
+        # a version mismatch (project A / preset Y), attributes present on only one side
+        # (project B), presets present on only one side (project C) and whole projects present
+        # on only one side (projects D and E).
+        self.report_before = {
+            'project A': {
+                'preset X': {'A1':  99, 'A2': 50, 'version': 1},
+                'preset Y': {'A1':   0, 'A2': 50, 'version': 1},
+            },
+            'project B': {
+                'preset X': {           'A2': 50},
+                'preset Y': {'A1':   0},
+            },
+            'project C': {
+                'preset X': {'A1':   0, 'A2': 50, 'version': 1},
+            },
+            'project D': {
+                'preset X': {'A1': 999},
+            },
+        }
+        self.report_after = {
+            'project A': {
+                'preset X': {'A1': 100, 'A2':  50, 'version': 1},
+                'preset Y': {'A1': 500, 'A2': 500, 'version': 2},
+            },
+            'project B': {
+                'preset X': {'A1':   0},
+                'preset Y': {           'A2': 50},
+            },
+            'project C': {
+                'preset Y': {'A1':   0, 'A2': 50, 'version': 1},
+            },
+            'project E': {
+                'preset Y': {           'A2': 999},
+            },
+        }
+
+    def test_diff_table_formatter(self):
+        # Full-size check: the humanized Markdown diff of the summarized benchmark fixtures must
+        # match the stored Markdown fixture exactly.
+        report_before = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH))
+        report_after = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH))
+        differ = BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4, OutputFormat.MARKDOWN)
+        diff = differ.run(report_before, report_after)
+
+        self.assertEqual(DiffTableFormatter.run(DiffTableSet(diff), OutputFormat.MARKDOWN), SUMMARIZED_DIFF_HUMANIZED_MD)
+
+    def test_diff_table_formatter_json_absolute(self):
+        # JSON output is the table set serialized as preset -> project -> attribute. A marker
+        # that replaced a whole preset or project in the diff ('!A', '!B', '!V') is expected in
+        # every attribute cell of the affected row.
+        differ = BenchmarkDiffer(DifferenceStyle.ABSOLUTE, 4, OutputFormat.JSON)
+        diff = differ.run(self.report_before, self.report_after)
+
+        # The backslash after the opening quotes suppresses the leading newline: the expected
+        # JSON text starts directly with '{' and has no trailing newline.
+        expected_formatted_table = dedent("""\
+            {
+                "preset X": {
+                    "project A": {
+                        "A1": 1,
+                        "A2": 0
+                    },
+                    "project B": {
+                        "A1": "!B",
+                        "A2": "!A"
+                    },
+                    "project C": {
+                        "A1": "!A",
+                        "A2": "!A"
+                    },
+                    "project D": {
+                        "A1": "!A",
+                        "A2": "!A"
+                    },
+                    "project E": {
+                        "A1": "!B",
+                        "A2": "!B"
+                    }
+                },
+                "preset Y": {
+                    "project A": {
+                        "A1": "!V",
+                        "A2": "!V"
+                    },
+                    "project B": {
+                        "A1": "!A",
+                        "A2": "!B"
+                    },
+                    "project C": {
+                        "A1": "!B",
+                        "A2": "!B"
+                    },
+                    "project D": {
+                        "A1": "!A",
+                        "A2": "!A"
+                    },
+                    "project E": {
+                        "A1": "!B",
+                        "A2": "!B"
+                    }
+                }
+            }"""
+        )
+        self.assertEqual(DiffTableFormatter.run(DiffTableSet(diff), OutputFormat.JSON), expected_formatted_table)
+
+    def test_diff_table_formatter_console_relative(self):
+        # Console output: one table per preset with an upper-cased preset name as the heading.
+        # Unlike the JSON expectation, this literal starts with a newline, so the formatter
+        # output is expected to begin with one as well.
+        differ = BenchmarkDiffer(DifferenceStyle.RELATIVE, 4, OutputFormat.CONSOLE)
+        diff = differ.run(self.report_before, self.report_after)
+
+        expected_formatted_table = dedent("""
+            PRESET X
+            |-----------|--------|----|
+            |   project |     A1 | A2 |
+            |-----------|--------|----|
+            | project A | 0.0101 |  0 |
+            | project B |     !B | !A |
+            | project C |     !A | !A |
+            | project D |     !A | !A |
+            | project E |     !B | !B |
+            |-----------|--------|----|
+
+            PRESET Y
+            |-----------|----|----|
+            |   project | A1 | A2 |
+            |-----------|----|----|
+            | project A | !V | !V |
+            | project B | !A | !B |
+            | project C | !B | !B |
+            | project D | !A | !A |
+            | project E | !B | !B |
+            |-----------|----|----|
+        """)
+        self.assertEqual(DiffTableFormatter.run(DiffTableSet(diff), OutputFormat.CONSOLE), expected_formatted_table)
+
+    def test_diff_table_formatter_markdown_humanized(self):
+        # Markdown output: one table per preset under a '###' heading, followed by a legend that
+        # explains the marker strings. The expected text starts with a newline here too.
+        differ = BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4, OutputFormat.MARKDOWN)
+        diff = differ.run(self.report_before, self.report_after)
+
+        expected_formatted_table = dedent("""
+            ### `preset X`
+            |   project |             A1 |   A2 |
+            |:---------:|---------------:|-----:|
+            | project A | **`+1.01% ❌`** | `0%` |
+            | project B |           `!B` | `!A` |
+            | project C |           `!A` | `!A` |
+            | project D |           `!A` | `!A` |
+            | project E |           `!B` | `!B` |
+
+            ### `preset Y`
+            |   project |   A1 |   A2 |
+            |:---------:|-----:|-----:|
+            | project A | `!V` | `!V` |
+            | project B | `!A` | `!B` |
+            | project C | `!B` | `!B` |
+            | project D | `!A` | `!A` |
+            | project E | `!B` | `!B` |
+
+
+            `!V` = version mismatch
+            `!B` = no value in the "before" version
+            `!A` = no value in the "after" version
+            `!T` = one or both values were not numeric and could not be compared
+            `-0` = very small negative value rounded to zero
+            `+0` = very small positive value rounded to zero
+
+        """)
+        self.assertEqual(DiffTableFormatter.run(DiffTableSet(diff), OutputFormat.MARKDOWN), expected_formatted_table)