benchmark_diff: A Python script for diffing summarized benchmarks from external tests

This commit is contained in:
Kamil Śliwak 2022-02-09 18:05:38 +01:00
parent 0944e6853f
commit ee5e878ad7
4 changed files with 720 additions and 0 deletions

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Optional, Union
import json
import sys
class DifferenceStyle(Enum):
    """Supported ways of presenting a numeric difference in the diff output."""
    ABSOLUTE = 'absolute'
    RELATIVE = 'relative'
    HUMANIZED = 'humanized'


# Default number of significant digits used when rounding relative differences.
DEFAULT_RELATIVE_PRECISION = 4
DEFAULT_DIFFERENCE_STYLE = DifferenceStyle.ABSOLUTE


class ValidationError(Exception):
    """Base class for input-validation errors reported by this script."""


class CommandLineError(ValidationError):
    """Raised when command-line arguments fail validation."""


class BenchmarkDiffer:
    """Computes a structural diff of two summarized benchmark reports.

    Reports are nested dicts of scalars. The diff mirrors the input structure
    but keeps only entries that differ. Special string markers are used where
    a numeric difference cannot be computed:
      '!V' - 'version' fields disagree, '!T' - non-numeric value,
      '!B' - value missing/null in the "before" report,
      '!A' - value missing/null in the "after" report.
    """

    # How numeric differences are rendered (absolute/relative/humanized).
    difference_style: DifferenceStyle
    # Significant digits for relative differences; None disables rounding.
    relative_precision: Optional[int]

    def __init__(
        self,
        difference_style: DifferenceStyle,
        relative_precision: Optional[int],
    ):
        self.difference_style = difference_style
        self.relative_precision = relative_precision

    def run(self, before: Any, after: Any) -> Optional[Union[dict, str, int, float]]:
        """Recursively diff two report fragments.

        Returns a dict of per-key differences for dict inputs (empty dict when
        nothing differs), or a scalar/marker diff otherwise.
        """
        if isinstance(before, dict) and isinstance(after, dict):
            # Mismatched versions make value comparisons meaningless.
            if before.get('version') != after.get('version'):
                return self._humanize_diff('!V')

            keys = (before.keys() | after.keys()) - {'version'}
            child_diffs = {key: self.run(before.get(key), after.get(key)) for key in keys}
            # Drop children that turned out identical (None or empty dict).
            return {key: child for key, child in child_diffs.items() if child not in (None, {})}

        return self._diff_scalars(before, after)

    def _diff_scalars(self, before: Any, after: Any) -> Optional[Union[str, int, float, dict]]:
        """Diff two non-dict leaves; emit a marker when a number diff is impossible."""
        assert not (isinstance(before, dict) and isinstance(after, dict))

        if before is None and after is None:
            return {}
        if before is None:
            return self._humanize_diff('!B')
        if after is None:
            return self._humanize_diff('!A')

        both_numeric = isinstance(before, (int, float)) and isinstance(after, (int, float))
        if not both_numeric:
            return self._humanize_diff('!T')

        number_diff = self._diff_numbers(before, after)
        if self.difference_style == DifferenceStyle.HUMANIZED:
            return self._humanize_diff(number_diff)
        return number_diff

    def _diff_numbers(self, value_before: Union[int, float], value_after: Union[int, float]) -> Union[str, int, float]:
        """Return the difference of two numbers in the configured style.

        Relative differences are taken against abs(before); '+INF'/'-INF' mark
        changes from zero and '+0'/'-0' preserve the sign of changes that
        round away to nothing.
        """
        if self.difference_style == DifferenceStyle.ABSOLUTE:
            return self._collapse_to_int(value_after - value_before)

        if value_before == 0:
            # A relative change from zero is not well defined.
            if value_after > 0:
                return '+INF'
            if value_after < 0:
                return '-INF'
            return 0

        diff: Union[str, int, float] = (value_after - value_before) / abs(value_before)
        if self.relative_precision is not None:
            rounded = round(diff, self.relative_precision)
            if rounded == 0 and diff != 0:
                # Keep the sign visible even when the value rounds to zero.
                return '-0' if diff < 0 else '+0'
            diff = rounded
        return self._collapse_to_int(diff)

    @staticmethod
    def _collapse_to_int(value: Union[str, int, float]) -> Union[str, int, float]:
        """Turn a float with no fractional part into an int (e.g. 3.0 -> 3)."""
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    def _humanize_diff(self, diff: Union[str, int, float]) -> str:
        """Render a relative difference as a percentage string, e.g. '+150%'.

        '!'-prefixed markers pass through untouched; other strings ('+INF',
        '-0', ...) already carry their own sign and only get the '%' suffix.
        """
        if isinstance(diff, str) and diff.startswith('!'):
            return diff

        value: Union[str, int, float]
        if isinstance(diff, (int, float)):
            value = diff * 100
            if isinstance(value, float) and self.relative_precision is not None:
                # The multiplication can result in new significant digits appearing. We need to reround.
                # NOTE: round() works fine with negative precision.
                value = round(value, self.relative_precision - 2)
            value = self._collapse_to_int(value)
            # Negative numbers already carry a '-' sign; only '+' needs adding.
            prefix = '+' if diff > 0 else ''
        else:
            value = diff
            prefix = ''

        return f"{prefix}{value}%"
@dataclass(frozen=True)
class CommandLineOptions:
    """Validated command-line options of the script, with defaults applied."""
    # Path to the JSON file containing the original benchmark results.
    report_before: Path
    # Path to the JSON file containing the new benchmark results.
    report_after: Path
    # How numeric differences should be presented in the output.
    difference_style: DifferenceStyle
    # Number of significant digits for relative differences.
    relative_precision: int
def process_commandline() -> CommandLineOptions:
    """Parse command-line arguments and return validated options.

    Returns:
        CommandLineOptions with the default difference style substituted
        when --style is not given.
    """
    script_description = (
        "Compares summarized benchmark reports and outputs JSON with the same structure but listing only differences."
    )

    parser = ArgumentParser(description=script_description)
    parser.add_argument(dest='report_before', help="Path to a JSON file containing original benchmark results.")
    parser.add_argument(dest='report_after', help="Path to a JSON file containing new benchmark results.")
    parser.add_argument(
        '--style',
        dest='difference_style',
        choices=[s.value for s in DifferenceStyle],
        help=(
            "How to present numeric differences: "
            # FIX: the code computes (new - original), not the other way around.
            f"'{DifferenceStyle.ABSOLUTE.value}' subtracts original from new; "
            f"'{DifferenceStyle.RELATIVE.value}' also divides by the original; "
            f"'{DifferenceStyle.HUMANIZED.value}' is like relative but value is a percentage and "
            "positive/negative changes are emphasized. "
            # FIX: use .value so the help shows 'absolute' rather than the
            # enum's 'DifferenceStyle.ABSOLUTE' rendering.
            f"(default: '{DEFAULT_DIFFERENCE_STYLE.value}')."
        )
    )
    # NOTE: Negative values are valid for precision. round() handles them in a sensible way.
    parser.add_argument(
        '--precision',
        dest='relative_precision',
        type=int,
        default=DEFAULT_RELATIVE_PRECISION,
        help=(
            "Number of significant digits for relative differences. "
            f"Note that with --style={DifferenceStyle.HUMANIZED.value} the rounding is applied "
            "**before** converting the value to a percentage so you need to add 2. "
            f"Has no effect when used together with --style={DifferenceStyle.ABSOLUTE.value}. "
            f"(default: {DEFAULT_RELATIVE_PRECISION})"
        )
    )

    options = parser.parse_args()

    # argparse leaves difference_style as None when --style is not passed.
    if options.difference_style is not None:
        difference_style = DifferenceStyle(options.difference_style)
    else:
        difference_style = DEFAULT_DIFFERENCE_STYLE

    processed_options = CommandLineOptions(
        report_before=Path(options.report_before),
        report_after=Path(options.report_after),
        difference_style=difference_style,
        relative_precision=options.relative_precision,
    )
    return processed_options
def main():
    """Script entry point: diff the two reports and print the result as JSON.

    Returns a process exit code: 0 on success, 1 on a command-line error.
    """
    try:
        options = process_commandline()
        differ = BenchmarkDiffer(options.difference_style, options.relative_precision)
        report_before = json.loads(options.report_before.read_text('utf-8'))
        report_after = json.loads(options.report_after.read_text('utf-8'))
        print(json.dumps(differ.run(report_before, report_after), indent=4, sort_keys=True))
    except CommandLineError as exception:
        print(f"ERROR: {exception}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,100 @@
{
"bleeps": {
"ir-optimize-evm+yul": {
"bytecode_size": 132868,
"deployment_gas": 0,
"method_gas": 39289198,
"version": "bb90cd0"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 137869,
"deployment_gas": 0,
"method_gas": 38863224,
"version": "bb90cd0"
}
},
"colony": {
"legacy-no-optimize": {
"bytecode_size": 664190,
"deployment_gas": null,
"method_gas": null,
"version": "573399b"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 363606,
"deployment_gas": null,
"method_gas": null,
"version": "573399b"
}
},
"elementfi": {
"legacy-no-optimize": {
"bytecode_size": null,
"deployment_gas": 69200158,
"method_gas": null,
"version": "87f8b5e"
},
"legacy-optimize-evm+yul": {
"deployment_gas": 40951128,
"version": "87f8b5e"
},
"ir-optimize-evm-only": {},
"ir-no-optimize": {
"deployment_gas": null,
"method_gas": 2777867251,
"version": "87f8b5e"
}
},
"euler": {
"ir-no-optimize": {
"bytecode_size": 328540,
"deployment_gas": 61591870,
"method_gas": 3537419168,
"version": "2ef99fc"
},
"legacy-no-optimize": {
"bytecode_size": 328540,
"deployment_gas": 62590688,
"method_gas": 3537419168,
"version": "2ef99fc"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 182190,
"deployment_gas": 35236828,
"method_gas": 2777867251,
"version": "2ef99fc"
},
"legacy-optimize-evm-only": {
"bytecode_size": 205211,
"deployment_gas": 39459629,
"method_gas": 2978467272,
"version": "2ef99fc"
},
"ir-optimize-evm-only": {
"bytecode_size": 205211,
"deployment_gas": 39459629,
"method_gas": 2978467272,
"version": "2ef99fc"
},
"ir-optimize-evm+yul": {
"bytecode_size": 205211,
"deployment_gas": 39459629,
"method_gas": 2777867251
}
},
"gnosis": {
"ir-optimize-evm+yul": {
"bytecode_size": 56069,
"deployment_gas": null,
"method_gas": null,
"version": "ea09294"
}
},
"zeppelin": {
"legacy-optimize-evm+yul": {
"bytecode_size": 510428,
"deployment_gas": 94501114,
"version": "af7ec04"
}
}
}

View File

@@ -0,0 +1,99 @@
{
"bleeps": {
"ir-optimize-evm+yul": {
"bytecode_size": 132165,
"deployment_gas": 0,
"method_gas": 39289935,
"version": "bb90cd0"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 137869,
"deployment_gas": 0,
"method_gas": 38863224,
"version": "bb90cd0"
}
},
"colony": {
"ir-optimize-evm+yul": {
"bytecode_size": 363606,
"deployment_gas": null,
"method_gas": null,
"version": "573399b"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 363606,
"deployment_gas": null,
"method_gas": null,
"version": "573399b"
}
},
"elementfi": {
"legacy-no-optimize": {
"bytecode_size": 890560,
"deployment_gas": null,
"method_gas": null,
"version": "87f8b5e"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 536668,
"version": "87f8b5e"
},
"legacy-optimize-evm-only": {},
"ir-no-optimize": {
"bytecode_size": null,
"method_gas": 2777867251,
"version": "87f8b5e"
}
},
"euler": {
"ir-no-optimize": {
"bytecode_size": 323909,
"deployment_gas": 61591870,
"method_gas": 3452105184,
"version": "2ef99fc"
},
"legacy-no-optimize": {
"bytecode_size": 323909,
"deployment_gas": 61591870,
"method_gas": 3452105184,
"version": "c23e8bd"
},
"legacy-optimize-evm+yul": {
"bytecode_size": 182190,
"deployment_gas": 35236828,
"method_gas": 2777867251,
"version": "c23e8bd"
},
"legacy-optimize-evm-only": {
"bytecode_size": 202106,
"deployment_gas": 38790600,
"method_gas": 2907368790,
"version": "v1.2.3"
},
"ir-optimize-evm-only": {
"bytecode_size": 182190,
"deployment_gas": 35236828,
"method_gas": 2777867251
},
"ir-optimize-evm+yul": {
"bytecode_size": 182190,
"deployment_gas": 35236828,
"method_gas": 2777867251
}
},
"ens": {
"legacy-optimize-evm+yul": {
"bytecode_size": 156937,
"deployment_gas": 30073789,
"method_gas": 105365362,
"version": "v0.0.8"
}
},
"zeppelin": {
"legacy-optimize-evm+yul": {
"bytecode_size": 510428,
"deployment_gas": 94501114,
"version": "af7ec04"
}
}
}

View File

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
import json
import unittest
from unittest_helpers import FIXTURE_DIR, load_fixture
# NOTE: This test file only works with scripts/ added to PYTHONPATH so pylint can't find the imports
# pragma pylint: disable=import-error
from externalTests.benchmark_diff import BenchmarkDiffer, DifferenceStyle
# pragma pylint: enable=import-error
# Fixture reports used by the end-to-end diff test below.
SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-develop.json'
SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-branch.json'


class TestBenchmarkDiff(unittest.TestCase):
    """End-to-end test: diff two full fixture reports in ABSOLUTE style."""

    def setUp(self):
        # The expected diff is a large nested dict; raise the limit so
        # assertion failures are printed in full.
        self.maxDiff = 10000

    def test_benchmark_diff(self):
        report_before = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH))
        report_after = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH))
        expected_diff = {
            "bleeps": {
                "ir-optimize-evm+yul": {
                    # Numerical difference -> negative/positive/zero.
                    # Zeros are not skipped to differentiate them from missing values.
                    "bytecode_size": 132868 - 132165,
                    "deployment_gas": 0,
                    "method_gas": 39289198 - 39289935,
                },
                "legacy-optimize-evm+yul": {
                    # No differences within preset -> zeros still present.
                    "bytecode_size": 0,
                    "deployment_gas": 0,
                    "method_gas": 0,
                },
            },
            "colony": {
                # Preset missing on one side -> replace dict with string
                "ir-optimize-evm+yul": "!A",
                "legacy-no-optimize": "!B",
                "legacy-optimize-evm+yul": {
                    "bytecode_size": 0,
                    # Attribute missing on both sides -> skip
                    #"deployment_gas":
                    #"method_gas":
                },
            },
            "elementfi": {
                "legacy-no-optimize": {
                    # Attributes null on one side -> replace value with string
                    "bytecode_size": "!A",
                    "deployment_gas": "!B",
                    # Attribute null on both sides -> skip
                    #"method_gas":
                },
                "legacy-optimize-evm+yul": {
                    # Attributes missing on one side -> replace value with string
                    "bytecode_size": "!A",
                    "deployment_gas": "!B",
                    # Attribute missing on both sides -> skip
                    #"method_gas":
                },
                "ir-no-optimize": {
                    # Attributes missing on one side, null on the other -> skip
                    #"bytecode_size":
                    #"deployment_gas":
                    "method_gas": 0,
                },
                # Empty preset missing on one side -> replace dict with string
                "legacy-optimize-evm-only": "!A",
                "ir-optimize-evm-only": "!B",
            },
            "euler": {
                # Matching versions -> show attributes, skip version
                "ir-no-optimize": {
                    "bytecode_size": 328540 - 323909,
                    "deployment_gas": 0,
                    "method_gas": 3537419168 - 3452105184,
                },
                # Different versions, different values -> replace whole preset with string
                "legacy-no-optimize": "!V",
                # Different versions, same values -> replace whole preset with string
                "legacy-optimize-evm+yul": "!V",
                # Different versions (not a commit hash), different values -> replace whole preset with string
                "legacy-optimize-evm-only": "!V",
                # Version missing on one side -> replace whole preset with string
                "ir-optimize-evm-only": "!V",
                # Version missing on both sides -> assume same version
                "ir-optimize-evm+yul": {
                    "bytecode_size": 205211 - 182190,
                    "deployment_gas": 39459629 - 35236828,
                    "method_gas": 0,
                },
            },
            "zeppelin": {
                "legacy-optimize-evm+yul": {
                    # Whole project identical -> attributes still present, with zeros
                    "bytecode_size": 0,
                    "deployment_gas": 0,
                    # Field missing on both sides -> skip
                    #"method_gas":
                }
            },
            # Empty project missing on one side -> replace its dict with a string
            "gnosis": "!B",
            "ens": "!A",
        }
        differ = BenchmarkDiffer(DifferenceStyle.ABSOLUTE, None)
        self.assertEqual(differ.run(report_before, report_after), expected_diff)
class TestBenchmarkDiffer(unittest.TestCase):
    """Unit tests for BenchmarkDiffer covering each diff style and marker."""

    def setUp(self):
        # Failures compare nested dicts; raise the limit so they print in full.
        self.maxDiff = 10000

    @staticmethod
    def _nest(value, levels):
        """Wrap `value` in nested single-key dicts, innermost key first.

        E.g. _nest(1, ['p', 's']) == {'s': {'p': 1}}.
        """
        nested_value = value
        for level in levels:
            nested_value = {level: nested_value}
        return nested_value

    def _assert_single_value_diff_matches(self, differ, cases, nest_result=True, nestings=None):
        """Check each (before, after, expected) case at several nesting depths.

        When nest_result is True the expected diff is wrapped in the same
        nesting as the inputs; otherwise it is compared as-is (used for
        results that collapse, e.g. empty dicts).
        """
        if nestings is None:
            nestings = [[], ['p'], ['p', 's'], ['p', 's', 'a']]

        for levels in nestings:
            for (before, after, expected_diff) in cases:
                self.assertEqual(
                    differ.run(self._nest(before, levels), self._nest(after, levels)),
                    self._nest(expected_diff, levels) if nest_result else expected_diff,
                    f'Wrong diff for {self._nest(before, levels)} vs {self._nest(after, levels)}'
                )

    def test_empty(self):
        # Two empty reports always diff to an empty dict, regardless of style.
        for style in DifferenceStyle:
            differ = BenchmarkDiffer(style, None)
            self._assert_single_value_diff_matches(differ, [({}, {}, {})], nest_result=False)

    def test_null(self):
        # Null on both sides collapses to an empty diff, regardless of style.
        for style in DifferenceStyle:
            differ = BenchmarkDiffer(style, None)
            self._assert_single_value_diff_matches(differ, [(None, None, {})], nest_result=False)

    def test_number_diff_absolute_json(self):
        # ABSOLUTE style: plain subtraction, floats collapsed to ints when whole.
        self._assert_single_value_diff_matches(
            BenchmarkDiffer(DifferenceStyle.ABSOLUTE, 4),
            [
                (2, 2, 0),
                (2, 5, 3),
                (5, 2, -3),
                (2.0, 2.0, 0),
                (2, 2.0, 0),
                (2.0, 2, 0),
                (2, 2.5, 2.5 - 2),
                (2.5, 2, 2 - 2.5),
                (0, 0, 0),
                (0, 2, 2),
                (0, -2, -2),
                (-3, -1, 2),
                (-1, -3, -2),
                (2, 0, -2),
                (-2, 0, 2),
                (1.00006, 1, 1 - 1.00006),
                (1, 1.00006, 1.00006 - 1),
                (1.00004, 1, 1 - 1.00004),
                (1, 1.00004, 1.00004 - 1),
            ],
        )

    def test_number_diff_json(self):
        # RELATIVE style: division by abs(before), INF markers for zero base,
        # signed-zero markers when rounding erases a real change.
        self._assert_single_value_diff_matches(
            BenchmarkDiffer(DifferenceStyle.RELATIVE, 4),
            [
                (2, 2, 0),
                (2, 5, (5 - 2) / 2),
                (5, 2, (2 - 5) / 5),
                (2.0, 2.0, 0),
                (2, 2.0, 0),
                (2.0, 2, 0),
                (2, 2.5, (2.5 - 2) / 2),
                (2.5, 2, (2 - 2.5) / 2.5),
                (0, 0, 0),
                (0, 2, '+INF'),
                (0, -2, '-INF'),
                (-3, -1, 0.6667),
                (-1, -3, -2),
                (2, 0, -1),
                (-2, 0, 1),
                (1.00006, 1, -0.0001),
                (1, 1.00006, 0.0001),
                (1.000004, 1, '-0'),
                (1, 1.000004, '+0'),
            ],
        )

    def test_number_diff_humanized_json(self):
        # HUMANIZED style: relative difference rendered as a signed percentage.
        self._assert_single_value_diff_matches(
            BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4),
            [
                (2, 2, '0%'),
                (2, 5, '+150%'),
                (5, 2, '-60%'),
                (2.0, 2.0, '0%'),
                (2, 2.0, '0%'),
                (2.0, 2, '0%'),
                (2, 2.5, '+25%'),
                (2.5, 2, '-20%'),
                (0, 0, '0%'),
                (0, 2, '+INF%'),
                (0, -2, '-INF%'),
                (-3, -1, '+66.67%'),
                (-1, -3, '-200%'),
                (2, 0, '-100%'),
                (-2, 0, '+100%'),
                (1.00006, 1, '-0.01%'),
                (1, 1.00006, '+0.01%'),
                (1.000004, 1, '-0%'),
                (1, 1.000004, '+0%'),
            ],
        )

    def test_type_mismatch(self):
        # Any non-numeric operand (including two equal strings) yields '!T'.
        for style in DifferenceStyle:
            self._assert_single_value_diff_matches(
                BenchmarkDiffer(style, 4),
                [
                    (1, {}, '!T'),
                    ({}, 1, '!T'),
                    (1.5, {}, '!T'),
                    ({}, 1.5, '!T'),
                    ('1', {}, '!T'),
                    ({}, '1', '!T'),
                    (1, '1', '!T'),
                    ('1', 1, '!T'),
                    (1.5, '1', '!T'),
                    ('1', 1.5, '!T'),
                    ('1', '1', '!T'),
                ],
            )

    def test_version_mismatch(self):
        # Differing 'version' fields replace the whole dict with '!V',
        # even when the remaining values are identical.
        for style in DifferenceStyle:
            self._assert_single_value_diff_matches(
                BenchmarkDiffer(style, 4),
                [
                    ({'a': 123, 'version': 1}, {'a': 123, 'version': 2}, '!V'),
                    ({'a': 123, 'version': 2}, {'a': 123, 'version': 1}, '!V'),
                    ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 'b'}, '!V'),
                    ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 1}, '!V'),
                    ({'a': 'a', 'version': 1}, {'a': 'a', 'version': 2}, '!V'),
                    ({'a': {}, 'version': 1}, {'a': {}, 'version': 2}, '!V'),
                    ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 1}, 'version': 2}, '!V'),
                    ({'a': 123, 'version': 1}, {'a': 456, 'version': 2}, '!V'),
                    ({'a': 'a', 'version': 1}, {'a': 'b', 'version': 2}, '!V'),
                    ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 2}, 'version': 2}, '!V'),
                ],
            )

    def test_missing(self):
        # '!A' marks values present before but gone after; '!B' the reverse.
        for style in DifferenceStyle:
            self._assert_single_value_diff_matches(
                BenchmarkDiffer(style, None),
                [
                    (1, None, '!A'),
                    (None, 1, '!B'),
                    ('1', None, '!A'),
                    (None, '1', '!B'),
                    ({}, None, '!A'),
                    (None, {}, '!B'),
                    ({'x': 1}, {}, {'x': '!A'}),
                    ({}, {'x': 1}, {'x': '!B'}),
                    ({'x': 1}, {'x': None}, {'x': '!A'}),
                    ({'x': None}, {'x': 1}, {'x': '!B'}),
                    ({'x': 1}, {'y': 1}, {'x': '!A', 'y': '!B'}),
                    ({'x': {}}, {}, {'x': '!A'}),
                    ({}, {'x': {}}, {'x': '!B'}),
                    ({'p': {'x': {}}}, {}, {'p': '!A'}),
                    ({}, {'p': {'x': {}}}, {'p': '!B'}),
                ],
            )

    def test_missing_vs_null(self):
        # A missing key and an explicit null are treated as equal (no diff).
        for style in DifferenceStyle:
            self._assert_single_value_diff_matches(
                BenchmarkDiffer(style, None),
                [
                    ({'a': None}, {}, {}),
                    ({}, {'a': None}, {}),
                ],
                nest_result=False,
            )