Unverified Commit 3519bec6 authored by Jim Graham's avatar Jim Graham Committed by GitHub

Save results of A/B test runs in a JSON file for future processing (#56416)

parent da66a45e
...@@ -167,8 +167,10 @@ An example of a local engine architecture is `android_debug_unopt_x86`. ...@@ -167,8 +167,10 @@ An example of a local engine architecture is `android_debug_unopt_x86`.
You can run an A/B test that compares the performance of the default engine You can run an A/B test that compares the performance of the default engine
against a local engine build. The test runs the same benchmark a specified against a local engine build. The test runs the same benchmark a specified
number of times against both engines, then outputs a tab-separated spreadsheet number of times against both engines, then outputs a tab-separated spreadsheet
with the results. The results can be copied to a Google Spreadsheet for further with the results and stores them in a JSON file for future reference. The
inspection. results can be copied to a Google Spreadsheet for further inspection and the
JSON file can be reprocessed with the summarize.dart command for more detailed
output.
Example: Example:
...@@ -183,6 +185,11 @@ The `--ab=10` tells the runner to run an A/B test 10 times. ...@@ -183,6 +185,11 @@ The `--ab=10` tells the runner to run an A/B test 10 times.
`--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt` `--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt`
engine build. `--local-engine` is required for A/B test. engine build. `--local-engine` is required for A/B test.
`--ab-result-file=filename` can be used to provide an alternate location to output
the JSON results file (defaults to `ABresults#.json`). A single `#` character can be
used to indicate where to insert a serial number if a file with that name already
exists; otherwise, an existing file of that name will be overwritten.
A/B can run exactly one task. Multiple tasks are not supported. A/B can run exactly one task. Multiple tasks are not supported.
Example output: Example output:
...@@ -203,6 +210,21 @@ the default engine. Values less than 1.0 indicate a slow-down. For example, ...@@ -203,6 +210,21 @@ the default engine. Values less than 1.0 indicate a slow-down. For example,
0.5x means the local engine is twice as slow as the default engine, and 2.0x 0.5x means the local engine is twice as slow as the default engine, and 2.0x
means it's twice as fast. Higher is better. means it's twice as fast. Higher is better.
Summarize tool example:
```sh
../../bin/cache/dart-sdk/bin/dart bin/summarize.dart --[no-]tsv-table --[no-]raw-summary \
ABresults.json ABresults1.json ABresults2.json ...
```
`--[no-]tsv-table` tells the tool to print the summary in a table with tabs for easy spreadsheet
entry. (defaults to on)
`--[no-]raw-summary` tells the tool to print all per-run data collected by the A/B test formatted
with tabs for easy spreadsheet entry. (defaults to on)
Multiple trailing filenames can be specified and each such results file will be processed in turn.
# Reproducing broken builds locally # Reproducing broken builds locally
To reproduce the breakage locally `git checkout` the corresponding Flutter To reproduce the breakage locally `git checkout` the corresponding Flutter
......
...@@ -125,7 +125,7 @@ Future<void> _runABTest() async { ...@@ -125,7 +125,7 @@ Future<void> _runABTest() async {
print('$taskName A/B test. Will run $runsPerTest times.'); print('$taskName A/B test. Will run $runsPerTest times.');
final ABTest abTest = ABTest(); final ABTest abTest = ABTest(localEngine, taskName);
for (int i = 1; i <= runsPerTest; i++) { for (int i = 1; i <= runsPerTest; i++) {
section('Run #$i'); section('Run #$i');
...@@ -168,6 +168,10 @@ Future<void> _runABTest() async { ...@@ -168,6 +168,10 @@ Future<void> _runABTest() async {
print(abTest.printSummary()); print(abTest.printSummary());
} }
} }
abTest.finalize();
final File jsonFile = _uniqueFile(args['ab-result-file'] as String ?? 'ABresults#.json');
jsonFile.writeAsString(const JsonEncoder.withIndent(' ').convert(abTest.jsonMap));
if (!silent) { if (!silent) {
section('Raw results'); section('Raw results');
...@@ -176,6 +180,23 @@ Future<void> _runABTest() async { ...@@ -176,6 +180,23 @@ Future<void> _runABTest() async {
section('Final A/B results'); section('Final A/B results');
print(abTest.printSummary()); print(abTest.printSummary());
print('');
print('Results saved to ${jsonFile.path}');
}
/// Returns a [File] built from [filenameTemplate] that does not collide with
/// an existing file.
///
/// If the template contains exactly one `#`, that character marks where a
/// serial number may be inserted: the first candidate omits the number
/// entirely, then `1`, `2`, ... are tried until an unused name is found.
/// Templates with zero (or more than one) `#` are returned verbatim, in
/// which case an existing file of that name will be overwritten.
File _uniqueFile(String filenameTemplate) {
  final List<String> parts = filenameTemplate.split('#');
  if (parts.length != 2) {
    // No single '#' marker; use the template as the literal filename.
    return File(filenameTemplate);
  }
  // First try the name with the '#' simply removed.
  File file = File('${parts[0]}${parts[1]}');
  for (int i = 1; file.existsSync(); i++) {
    file = File('${parts[0]}$i${parts[1]}');
  }
  return file;
}
void addTasks({ void addTasks({
...@@ -245,6 +266,12 @@ final ArgParser _argParser = ArgParser() ...@@ -245,6 +266,12 @@ final ArgParser _argParser = ArgParser()
} }
}, },
) )
..addOption(
'ab-result-file',
help: 'The filename in which to place the json encoded results of an A/B test.\n'
'The filename may contain a single # character to be replaced by a sequence\n'
'number if the name already exists.',
)
..addFlag( ..addFlag(
'all', 'all',
abbr: 'a', abbr: 'a',
......
// Copyright 2014 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:convert';
import 'dart:io';
import 'package:args/args.dart';
import 'package:flutter_devicelab/framework/ab.dart';
import 'package:flutter_devicelab/framework/utils.dart';
// Command-line option names shared between the parser and `main`.
// Declared `const` so they cannot be reassigned at runtime; they were
// previously mutable top-level variables.
const String kRawSummaryOpt = 'raw-summary';
const String kTabTableOpt = 'tsv-table';
const String kAsciiTableOpt = 'ascii-table';
/// Reports [error] followed by the command-line usage on stderr, and marks
/// the process as failed by setting a non-zero [exitCode] (without throwing,
/// so the caller can simply `return`).
void _usage(String error) {
  stderr
    ..writeln(error)
    ..writeln('Usage:\n')
    ..writeln(_argParser.usage);
  exitCode = 1;
}
/// Entry point for the summarize tool.
///
/// Parses [rawArgs], then loads each A/B results JSON file named on the
/// command line (defaulting to `ABresults.json` when none are given) and
/// prints the summaries selected by the flags for each file in turn.
/// Stops at the first missing or unparsable file.
Future<void> main(List<String> rawArgs) async {
  ArgResults args;
  try {
    args = _argParser.parse(rawArgs);
  } on FormatException catch (error) {
    _usage('${error.message}\n');
    return;
  }
  final List<String> jsonFiles = args.rest.isEmpty
      ? <String>[ 'ABresults.json' ]
      : args.rest;
  for (final String filename in jsonFiles) {
    final File file = File(filename);
    if (!file.existsSync()) {
      _usage('File "$filename" does not exist');
      return;
    }
    ABTest abTest;
    try {
      abTest = ABTest.fromJsonMap(
        jsonDecode(await file.readAsString()) as Map<String, dynamic>
      );
    } catch (error) {
      // Deliberately broad: malformed JSON and an unexpected schema (failed
      // casts inside fromJsonMap) are both reported the same way.
      _usage('Could not parse json file "$filename"');
      return;
    }
    if (args[kRawSummaryOpt] as bool) {
      section('Raw results for "$filename"');
      print(abTest.rawResults());
    }
    if (args[kTabTableOpt] as bool) {
      section('A/B comparison for "$filename"');
      print(abTest.printSummary());
    }
    if (args[kAsciiTableOpt] as bool) {
      section('Formatted summary for "$filename"');
      print(abTest.asciiSummary());
    }
  }
}
/// Command-line options for the `summarize.dart` command.
///
/// All three output flags default to on; each can be disabled with its
/// `--no-` form. The order of the `addFlag` calls determines the order the
/// flags appear in the generated usage text.
final ArgParser _argParser = ArgParser()
// Space-padded table formatted for terminal display.
..addFlag(
kAsciiTableOpt,
defaultsTo: true,
help: 'Prints the summary in a table formatted nicely for terminal output.',
)
// Tab-separated table for pasting into a spreadsheet.
..addFlag(
kTabTableOpt,
defaultsTo: true,
help: 'Prints the summary in a table with tabs for easy spreadsheet entry.',
)
// All per-run measurements, also tab-separated.
..addFlag(
kRawSummaryOpt,
defaultsTo: true,
help: 'Prints all per-run data collected by the A/B test formatted with\n'
'tabs for easy spreadsheet entry.',
);
...@@ -5,12 +5,53 @@ ...@@ -5,12 +5,53 @@
import 'dart:math' as math; import 'dart:math' as math;
import 'package:meta/meta.dart'; import 'package:meta/meta.dart';
// Key names used when serializing an [ABTest] to (and from) its JSON
// results file.
const String kBenchmarkTypeKeyName = 'benchmark_type';
const String kBenchmarkVersionKeyName = 'version';
const String kLocalEngineKeyName = 'local_engine';
const String kTaskNameKeyName = 'task_name';
const String kRunStartKeyName = 'run_start';
const String kRunEndKeyName = 'run_end';
const String kAResultsKeyName = 'default_results';
const String kBResultsKeyName = 'local_engine_results';

// Values identifying the file format and its revision, so readers can
// validate that a JSON file really contains A/B results.
const String kBenchmarkResultsType = 'A/B summaries';
const String kBenchmarkABVersion = '1.0';
/// How a value is aligned within its column when rendering a summary table.
enum FieldJustification { LEFT, RIGHT, CENTER }
/// Collects data from an A/B test and produces a summary for human evaluation. /// Collects data from an A/B test and produces a summary for human evaluation.
/// ///
/// See [printSummary] for more. /// See [printSummary] for more.
class ABTest { class ABTest {
final Map<String, List<double>> _aResults = <String, List<double>>{}; ABTest(this.localEngine, this.taskName)
final Map<String, List<double>> _bResults = <String, List<double>>{}; : runStart = DateTime.now(),
_aResults = <String, List<double>>{},
_bResults = <String, List<double>>{};
ABTest.fromJsonMap(Map<String, dynamic> jsonResults)
: localEngine = jsonResults[kLocalEngineKeyName] as String,
taskName = jsonResults[kTaskNameKeyName] as String,
runStart = DateTime.parse(jsonResults[kRunStartKeyName] as String),
_runEnd = DateTime.parse(jsonResults[kRunEndKeyName] as String),
_aResults = _convertFrom(jsonResults[kAResultsKeyName] as Map<String, dynamic>),
_bResults = _convertFrom(jsonResults[kBResultsKeyName] as Map<String, dynamic>);
final String localEngine;
final String taskName;
final DateTime runStart;
DateTime _runEnd;
DateTime get runEnd => _runEnd;
final Map<String, List<double>> _aResults;
final Map<String, List<double>> _bResults;
/// Rebuilds one results map from decoded JSON, converting each score key's
/// value list back into a strongly typed `List<double>`.
static Map<String, List<double>> _convertFrom(dynamic results) {
  final Map<String, dynamic> resultMap = results as Map<String, dynamic>;
  final Map<String, List<double>> converted = <String, List<double>>{};
  for (final MapEntry<String, dynamic> entry in resultMap.entries) {
    converted[entry.key] = (entry.value as List<dynamic>).cast<double>();
  }
  return converted;
}
/// Adds the result of a single A run of the benchmark. /// Adds the result of a single A run of the benchmark.
/// ///
...@@ -18,6 +59,9 @@ class ABTest { ...@@ -18,6 +59,9 @@ class ABTest {
/// ///
/// [result] is expected to be a serialization of [TaskResult]. /// [result] is expected to be a serialization of [TaskResult].
void addAResult(Map<String, dynamic> result) { void addAResult(Map<String, dynamic> result) {
if (_runEnd != null) {
throw StateError('Cannot add results to ABTest after it is finalized');
}
_addResult(result, _aResults); _addResult(result, _aResults);
} }
...@@ -27,9 +71,115 @@ class ABTest { ...@@ -27,9 +71,115 @@ class ABTest {
/// ///
/// [result] is expected to be a serialization of [TaskResult]. /// [result] is expected to be a serialization of [TaskResult].
void addBResult(Map<String, dynamic> result) { void addBResult(Map<String, dynamic> result) {
if (_runEnd != null) {
throw StateError('Cannot add results to ABTest after it is finalized');
}
_addResult(result, _bResults); _addResult(result, _bResults);
} }
/// Records the end time of the test run.
///
/// Once the end time is set, [addAResult] and [addBResult] refuse further
/// results (they throw [StateError]). Must be called before [jsonMap] is
/// read, since serialization includes the end timestamp.
void finalize() {
_runEnd = DateTime.now();
}
/// Serializes this test's metadata and collected scores into a
/// JSON-encodable map, using the key names declared at the top of this file.
///
/// NOTE(review): this reads [runEnd], so [finalize] must have been called
/// first; otherwise `toIso8601String()` is invoked on a null end time.
Map<String, dynamic> get jsonMap => <String, dynamic>{
kBenchmarkTypeKeyName: kBenchmarkResultsType,
kBenchmarkVersionKeyName: kBenchmarkABVersion,
kLocalEngineKeyName: localEngine,
kTaskNameKeyName: taskName,
kRunStartKeyName: runStart.toIso8601String(),
kRunEndKeyName: runEnd.toIso8601String(),
kAResultsKeyName: _aResults,
kBResultsKeyName: _bResults,
};
/// Grows each entry of [lengths] to accommodate the corresponding string in
/// [results]; a null cell leaves its column width untouched.
static void updateColumnLengths(List<int> lengths, List<String> results) {
  for (int i = 0; i < lengths.length; i++) {
    final String cell = results[i];
    if (cell != null) {
      lengths[i] = math.max(lengths[i], cell.length);
    }
  }
}
/// Appends one table row to [buffer], ending with a newline.
///
/// Each of [values] is padded to the column width in [lengths] according to
/// the matching entry in [aligns]; a null value renders as blank space.
/// Every column after the first is preceded by a single separator space.
static void formatResult(StringBuffer buffer,
List<int> lengths,
List<FieldJustification> aligns,
List<String> values) {
for (int column = 0; column < lengths.length; column++) {
final int len = lengths[column];
String value = values[column];
if (value == null) {
// Missing cell: fill the column with spaces.
value = ''.padRight(len);
} else {
switch (aligns[column]) {
case FieldJustification.LEFT:
value = value.padRight(len);
break;
case FieldJustification.RIGHT:
value = value.padLeft(len);
break;
case FieldJustification.CENTER:
// Center: left-pad to the midpoint, then right-fill to the full width.
value = value.padLeft((len + value.length) ~/2);
value = value.padRight(len);
break;
}
}
if (column > 0) {
// Columns after the first get one leading separator space.
value = value.padLeft(len+1);
}
buffer.write(value);
}
buffer.writeln();
}
/// Returns the summary as a table of space-padded, justified columns,
/// suitable for display on a terminal.
///
/// For the tab-separated form intended for pasting into a Google
/// Spreadsheet, see [printSummary] instead.
String asciiSummary() {
final Map<String, _ScoreSummary> summariesA = _summarize(_aResults);
final Map<String, _ScoreSummary> summariesB = _summarize(_bResults);
// Take the union of both key sets so a metric measured by only one side
// still gets a row (with blanks for the missing side).
final List<List<String>> tableRows = <List<String>>[
for (final String scoreKey in <String>{...summariesA.keys, ...summariesB.keys})
<String>[
scoreKey,
summariesA[scoreKey]?.averageString, summariesA[scoreKey]?.noiseString,
summariesB[scoreKey]?.averageString, summariesB[scoreKey]?.noiseString,
summariesA[scoreKey]?.improvementOver(summariesB[scoreKey]),
],
];
final List<String> titles = <String>[
'Score',
'Average A', '(noise)',
'Average B', '(noise)',
'Speed-up'
];
final List<FieldJustification> alignments = <FieldJustification>[
FieldJustification.LEFT,
FieldJustification.RIGHT, FieldJustification.LEFT,
FieldJustification.RIGHT, FieldJustification.LEFT,
FieldJustification.CENTER
];
// Size each column to the widest of its title and any data cell.
final List<int> lengths = List<int>.filled(6, 0);
updateColumnLengths(lengths, titles);
for (final List<String> row in tableRows) {
updateColumnLengths(lengths, row);
}
final StringBuffer buffer = StringBuffer();
// The 'Score' heading is centered even though its data is left-justified.
formatResult(buffer, lengths,
<FieldJustification>[
FieldJustification.CENTER,
...alignments.skip(1),
], titles);
for (final List<String> row in tableRows) {
formatResult(buffer, lengths, alignments, row);
}
return buffer.toString();
}
/// Returns unprocessed data collected by the A/B test formatted as /// Returns unprocessed data collected by the A/B test formatted as
/// a tab-separated spreadsheet. /// a tab-separated spreadsheet.
String rawResults() { String rawResults() {
...@@ -83,19 +233,19 @@ class ABTest { ...@@ -83,19 +233,19 @@ class ABTest {
buffer.write('$scoreKey\t'); buffer.write('$scoreKey\t');
if (summaryA != null) { if (summaryA != null) {
buffer.write('${summaryA.average.toStringAsFixed(2)} (${_ratioToPercent(summaryA.noise)})\t'); buffer.write('${summaryA.averageString} ${summaryA.noiseString}\t');
} else { } else {
buffer.write('\t'); buffer.write('\t');
} }
if (summaryB != null) { if (summaryB != null) {
buffer.write('${summaryB.average.toStringAsFixed(2)} (${_ratioToPercent(summaryB.noise)})\t'); buffer.write('${summaryB.averageString} ${summaryB.noiseString}\t');
} else { } else {
buffer.write('\t'); buffer.write('\t');
} }
if (summaryA != null && summaryB != null) { if (summaryA != null && summaryB != null) {
buffer.write('${(summaryA.average / summaryB.average).toStringAsFixed(2)}x\t'); buffer.write('${summaryA.improvementOver(summaryB)}\t');
} }
buffer.writeln(); buffer.writeln();
...@@ -117,6 +267,13 @@ class _ScoreSummary { ...@@ -117,6 +267,13 @@ class _ScoreSummary {
/// The noise (standard deviation divided by [average]) in the collected /// The noise (standard deviation divided by [average]) in the collected
/// values. /// values.
final double noise; final double noise;
/// The [average] rendered with two decimal places.
String get averageString => average.toStringAsFixed(2);

/// The [noise] rendered as a parenthesized percentage.
String get noiseString => '(${_ratioToPercent(noise)})';

/// Describes how much faster this score is than [other] (e.g. `2.00x`),
/// or the empty string when [other] is absent.
String improvementOver(_ScoreSummary other) {
  if (other == null) {
    return '';
  }
  return '${(average / other.average).toStringAsFixed(2)}x';
}
} }
void _addResult(Map<String, dynamic> result, Map<String, List<double>> results) { void _addResult(Map<String, dynamic> result, Map<String, List<double>> results) {
......
...@@ -8,7 +8,7 @@ import 'common.dart'; ...@@ -8,7 +8,7 @@ import 'common.dart';
void main() { void main() {
test('ABTest', () { test('ABTest', () {
final ABTest ab = ABTest(); final ABTest ab = ABTest('engine', 'test');
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ab.addAResult(<String, dynamic>{ ab.addAResult(<String, dynamic>{
...@@ -28,6 +28,7 @@ void main() { ...@@ -28,6 +28,7 @@ void main() {
'benchmarkScoreKeys': <String>['i', 'k'], 'benchmarkScoreKeys': <String>['i', 'k'],
}); });
} }
ab.finalize();
expect( expect(
ab.rawResults(), ab.rawResults(),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment