Unverified Commit a30c46dc authored by Yegor's avatar Yegor Committed by GitHub

Remove outliers in Web benchmarks to reduce noise; add visualization (#54883)

* web benchmarks: separate outliers to reduce noise; add visualization
parent 20803507
......@@ -36,7 +36,7 @@ class BenchDynamicClipOnStaticPicture extends SceneBuilderRecorder {
// If the scrollable extent is too small, the benchmark may end up
// scrolling the picture out of the clip area entirely, resulting in
// bogus metric values.
const double maxScrollExtent = kMaxSampleCount * kScrollDelta;
const double maxScrollExtent = kTotalSampleCount * kScrollDelta;
const double pictureHeight = kRows * kRowHeight;
if (maxScrollExtent > pictureHeight) {
throw Exception(
......
......@@ -16,28 +16,16 @@ import 'package:flutter/scheduler.dart';
import 'package:flutter/rendering.dart';
import 'package:flutter/widgets.dart';
/// Minimum number of samples collected by a benchmark irrespective of noise
/// levels.
const int kMinSampleCount = 50;
/// Maximum number of samples collected by a benchmark irrespective of noise
/// levels.
/// The number of samples from warm-up iterations.
///
/// If the noise doesn't settle down before we reach the max we'll report noisy
/// results assuming the benchmark is simply always noisy.
const int kMaxSampleCount = 10 * kMinSampleCount;
/// We warm-up the benchmark prior to measuring to allow JIT and caches to settle.
const int _kWarmUpSampleCount = 200;
/// The number of samples used to extract metrics, such as noise, means,
/// max/min values.
///
/// Keep this constant in sync with the same constant defined in `dev/devicelab/lib/framework/browser.dart`.
const int _kMeasuredSampleCount = 10;
/// The number of samples we use to collect statistics from.
const int _kMeasuredSampleCount = 100;
/// Maximum tolerated noise level.
///
/// A benchmark continues running until a noise level below this threshold is
/// reached.
const double _kNoiseThreshold = 0.05; // 5%
/// The total number of samples collected by a benchmark.
const int kTotalSampleCount = _kWarmUpSampleCount + _kMeasuredSampleCount;
/// Measures the amount of time [action] takes.
Duration timeAction(VoidCallback action) {
......@@ -321,8 +309,7 @@ abstract class WidgetRecorder extends Recorder implements FrameRecorder {
///
/// The widget must create its own animation to drive the benchmark. The
/// animation should continue indefinitely. The benchmark harness will stop
/// pumping frames automatically as soon as the noise levels are sufficiently
/// low.
/// pumping frames automatically.
Widget createWidget();
@override
......@@ -503,54 +490,186 @@ class _WidgetBuildRecorderHostState extends State<_WidgetBuildRecorderHost> {
class Timeseries {
Timeseries(this.name);
/// The label of this timeseries used for debugging and result inspection.
final String name;
/// List of all the values that have been recorded.
///
/// This list has no limit.
final List<num> _allValues = <num>[];
/// List of values that are being used for measurement purposes.
///
/// [average], [standardDeviation] and [noise] are all based on this list, not
/// the [_allValues] list.
final List<num> _measuredValues = <num>[];
final List<double> _allValues = <double>[];
/// The total amount of data collected, including ones that were dropped
/// because of the sample size limit.
int get count => _allValues.length;
/// Computes the average value of the measured values.
double get average => _computeAverage(name, _measuredValues);
/// Computes the standard deviation of the measured values.
double get standardDeviation =>
_computeStandardDeviationForPopulation(name, _measuredValues);
/// Computes noise as a multiple of the [average] value.
///
/// This value can be multiplied by 100.0 to get noise as a percentage of
/// the average.
/// Extracts useful statistics out of this timeseries.
///
/// If [average] is zero, treats the result as perfect score, returns zero.
double get noise => average > 0.0 ? standardDeviation / average : 0.0;
/// See [TimeseriesStats] for more details.
/// Extracts useful statistics out of this timeseries.
///
/// See [TimeseriesStats] for more details.
TimeseriesStats computeStats() {
  // The first few values are from the warm-up phase. They are kept only so
  // they can be visualized; they never participate in the statistics.
  final List<double> warmUpValues = _allValues.sublist(0, _allValues.length - _kMeasuredSampleCount);

  // The tail of the series is what we actually measure.
  final List<double> candidateValues = _allValues.sublist(_allValues.length - _kMeasuredSampleCount);

  // Average and standard deviation over all measured values, outliers
  // included. These are only used to derive the outlier cut-off.
  final double dirtyAverage = _computeAverage(name, candidateValues);
  final double dirtyStandardDeviation = _computeStandardDeviationForPopulation(name, candidateValues);

  // Any value that's higher than this is considered an outlier.
  final double outlierCutOff = dirtyAverage + dirtyStandardDeviation;

  // Materialize the two partitions eagerly: each one is traversed several
  // times below (average, standard deviation, length), and a lazy `where`
  // iterable would redo the filtering on every traversal.
  final List<double> cleanValues = candidateValues.where((double value) => value <= outlierCutOff).toList();
  final List<double> outliers = candidateValues.where((double value) => value > outlierCutOff).toList();

  // Final statistics.
  final double cleanAverage = _computeAverage(name, cleanValues);
  // If there are no outliers (e.g. all samples are identical, which makes the
  // cut-off equal to the average), report the clean average rather than
  // letting [_computeAverage] throw on an empty list.
  final double outlierAverage = outliers.isNotEmpty ? _computeAverage(name, outliers) : cleanAverage;
  final double standardDeviation = _computeStandardDeviationForPopulation(name, cleanValues);
  final double noise = cleanAverage > 0.0 ? standardDeviation / cleanAverage : 0.0;

  // Annotate every sample, warm-up values included, for visualization.
  final List<AnnotatedSample> annotatedValues = <AnnotatedSample>[
    for (final double warmUpValue in warmUpValues)
      AnnotatedSample(
        magnitude: warmUpValue,
        isOutlier: warmUpValue > outlierCutOff,
        isWarmUpValue: true,
      ),
    for (final double candidate in candidateValues)
      AnnotatedSample(
        magnitude: candidate,
        isOutlier: candidate > outlierCutOff,
        isWarmUpValue: false,
      ),
  ];

  return TimeseriesStats(
    name: name,
    average: cleanAverage,
    outlierCutOff: outlierCutOff,
    outlierAverage: outlierAverage,
    standardDeviation: standardDeviation,
    noise: noise,
    cleanSampleCount: cleanValues.length,
    outlierSampleCount: outliers.length,
    samples: annotatedValues,
  );
}
/// Adds a value to this timeseries.
void add(num value) {
void add(double value) {
if (value < 0.0) {
throw StateError(
'Timeseries $name: negative metric values are not supported. Got: $value',
);
}
_measuredValues.add(value);
_allValues.add(value);
// Don't let the [_measuredValues] list grow beyond [_kMeasuredSampleCount].
if (_measuredValues.length > _kMeasuredSampleCount) {
_measuredValues.removeAt(0);
}
}
/// Various statistics about a [Timeseries].
///
/// See the docs on the individual fields for more details.
@sealed
class TimeseriesStats {
  const TimeseriesStats({
    @required this.name,
    @required this.average,
    @required this.outlierCutOff,
    @required this.outlierAverage,
    @required this.standardDeviation,
    @required this.noise,
    @required this.cleanSampleCount,
    @required this.outlierSampleCount,
    @required this.samples,
  });

  /// The label used to refer to the corresponding timeseries.
  final String name;

  /// The average value of the measured samples without outliers.
  final double average;

  /// The standard deviation in the measured samples without outliers.
  final double standardDeviation;

  /// The noise as a multiple of the [average] value, taken from clean samples.
  ///
  /// This value can be multiplied by 100.0 to get noise as a percentage of
  /// the average.
  ///
  /// If [average] is zero, treats the result as perfect score, returns zero.
  final double noise;

  /// The maximum value a sample can have without being considered an outlier.
  ///
  /// See [Timeseries.computeStats] for details on how this value is computed.
  final double outlierCutOff;

  /// The average of outlier samples.
  ///
  /// This value can be used to judge how badly we jank, when we jank.
  ///
  /// Another useful metric is the difference between [outlierAverage] and
  /// [average]. The smaller the value, the more predictable the performance
  /// of the corresponding benchmark.
  final double outlierAverage;

  /// The number of measured samples after outliers are removed.
  final int cleanSampleCount;

  /// The number of outliers.
  final int outlierSampleCount;

  /// All collected samples, annotated with statistical information.
  ///
  /// See [AnnotatedSample] for more details.
  final List<AnnotatedSample> samples;

  /// Renders the stats as a multi-line human-readable summary, including
  /// sample counts, averages, and the noise percentage.
  @override
  String toString() {
    final StringBuffer buffer = StringBuffer();
    buffer.writeln(
      '$name: (samples: $cleanSampleCount clean/$outlierSampleCount outliers/'
      '${cleanSampleCount + outlierSampleCount} measured/'
      '${samples.length} total)');
    buffer.writeln(' | average: $average μs');
    buffer.writeln(' | outlier average: $outlierAverage μs');
    buffer.writeln(' | noise: ${_ratioToPercent(noise)}');
    return buffer.toString();
  }
}
/// Annotates a single measurement with statistical information.
@sealed
class AnnotatedSample {
  const AnnotatedSample({
    @required this.magnitude,
    @required this.isOutlier,
    @required this.isWarmUpValue,
  });

  /// The non-negative raw result of the measurement.
  final double magnitude;

  /// Whether this sample was considered an outlier.
  ///
  /// See [Timeseries.computeStats] for how outliers are identified.
  final bool isOutlier;

  /// Whether this sample was taken during the warm-up phase.
  ///
  /// If this value is `true`, this sample does not participate in
  /// statistical computations. However, the sample would still be
  /// shown in the visualization of results so that the benchmark
  /// can be inspected manually to make sure there's a predictable
  /// warm-up regression slope.
  final bool isWarmUpValue;
}
/// Base class for a profile collected from running a benchmark.
class Profile {
Profile({@required this.name}) : assert(name != null);
......@@ -572,7 +691,7 @@ class Profile {
}
void addDataPoint(String key, Duration duration) {
scoreData.putIfAbsent(key, () => Timeseries(key)).add(duration.inMicroseconds);
scoreData.putIfAbsent(key, () => Timeseries(key)).add(duration.inMicroseconds.toDouble());
}
/// Decides whether the data collected so far is sufficient to stop, or
......@@ -584,56 +703,15 @@ class Profile {
/// method will return true (asking the benchmark to continue collecting
/// data).
bool shouldContinue() {
// If we haven't recorded anything yet, we don't wanna stop now.
// If there are no `Timeseries` in the `scoreData`, then we haven't
// recorded anything yet. Don't stop.
if (scoreData.isEmpty) {
return true;
}
// Accumulates all the messages to be printed when the final decision is to
// stop collecting data.
final StringBuffer buffer = StringBuffer();
final Iterable<bool> shouldContinueList = scoreData.keys.map((String key) {
final Timeseries timeseries = scoreData[key];
// Collect enough data points before considering to stop.
if (timeseries.count < kMinSampleCount) {
return true;
}
// Is it still too noisy?
if (timeseries.noise > _kNoiseThreshold) {
// If the timeseries has enough data, stop it, even if it's noisy under
// the assumption that this benchmark is always noisy and there's nothing
// we can do about it.
if (timeseries.count > kMaxSampleCount) {
buffer.writeln(
'WARNING: Noise of benchmark "$name.$key" did not converge below '
'${_ratioToPercent(_kNoiseThreshold)}. Stopping because it reached the '
'maximum number of samples $kMaxSampleCount. Noise level is '
'${_ratioToPercent(timeseries.noise)}.',
);
return false;
} else {
return true;
}
}
buffer.writeln(
'SUCCESS: Benchmark "$name.$key" converged below ${_ratioToPercent(_kNoiseThreshold)}. '
'Noise level is ${_ratioToPercent(timeseries.noise)}.',
);
return false;
});
// If any of the score data needs to continue to be collected, we should
// return true.
final bool finalDecision =
shouldContinueList.any((bool element) => element);
if (!finalDecision) {
print(buffer.toString());
}
return finalDecision;
// We have recorded something, but do we have enough samples? If every
// timeseries has collected enough samples, stop the benchmark.
return !scoreData.keys.every((String key) => scoreData[key].count >= kTotalSampleCount);
}
/// Returns a JSON representation of the profile that will be sent to the
......@@ -647,9 +725,12 @@ class Profile {
for (final String key in scoreData.keys) {
scoreKeys.add('$key.average');
scoreKeys.add('$key.outlierAverage');
final Timeseries timeseries = scoreData[key];
json['$key.average'] = timeseries.average;
json['$key.noise'] = timeseries.noise;
final TimeseriesStats stats = timeseries.computeStats();
json['$key.average'] = stats.average;
json['$key.outlierAverage'] = stats.outlierAverage;
json['$key.noise'] = stats.noise;
}
json.addAll(extraData);
......@@ -663,9 +744,8 @@ class Profile {
buffer.writeln('name: $name');
for (final String key in scoreData.keys) {
final Timeseries timeseries = scoreData[key];
buffer.writeln('$key: (samples=${timeseries.count})');
buffer.writeln(' | average: ${timeseries.average} μs');
buffer.writeln(' | noise: ${_ratioToPercent(timeseries.noise)}');
final TimeseriesStats stats = timeseries.computeStats();
buffer.writeln(stats.toString());
}
for (final String key in extraData.keys) {
final dynamic value = extraData[key];
......@@ -683,12 +763,12 @@ class Profile {
}
/// Computes the arithmetic mean (or average) of given [values].
///
/// [label] identifies the data series in error messages.
///
/// Throws a [StateError] if [values] is empty, since an average of nothing
/// is undefined.
double _computeAverage(String label, Iterable<double> values) {
  if (values.isEmpty) {
    throw StateError('$label: attempted to compute an average of an empty value list.');
  }
  final double sum = values.reduce((double a, double b) => a + b);
  return sum / values.length;
}
......@@ -699,14 +779,14 @@ double _computeAverage(String label, Iterable<num> values) {
/// Computes the population standard deviation of the given [population].
///
/// [label] identifies the data series in error messages.
///
/// Throws a [StateError] if [population] is empty.
///
/// See also:
///
///  * https://en.wikipedia.org/wiki/Standard_deviation
double _computeStandardDeviationForPopulation(String label, Iterable<double> population) {
  if (population.isEmpty) {
    throw StateError('$label: attempted to compute the standard deviation of empty population.');
  }
  final double mean = _computeAverage(label, population);
  final double sumOfSquaredDeltas = population.fold<double>(
    0.0,
    (double previous, double value) {
      // `delta * delta` keeps the accumulator a double; the original relied
      // on mutating the lambda parameter to coerce `math.pow`'s num result.
      final double delta = value - mean;
      return previous + delta * delta;
    },
  );
  return math.sqrt(sumOfSquaredDeltas / population.length);
}
......
......@@ -5,6 +5,7 @@
import 'dart:async';
import 'dart:convert' show json;
import 'dart:html' as html;
import 'dart:math' as math;
import 'package:macrobenchmarks/src/web/bench_text_layout.dart';
import 'package:macrobenchmarks/src/web/bench_text_out_of_picture_bounds.dart';
......@@ -83,6 +84,7 @@ Future<void> _runBenchmark(String benchmarkName) async {
if (!_client.isInManualMode) {
await _client.sendProfileData(profile);
} else {
_printResultsToScreen(profile);
print(profile);
}
} catch (error, stackTrace) {
......@@ -121,6 +123,120 @@ void _fallbackToManual(String error) {
}
}
/// Visualizes results on the Web page for manual inspection.
///
/// Replaces the document body with a fresh one containing the profile name,
/// per-score statistics, and a rendered chart for each timeseries.
void _printResultsToScreen(Profile profile) {
  html.document.body.remove();
  final html.BodyElement resultsBody = html.BodyElement();
  html.document.body = resultsBody;
  resultsBody.appendHtml('<h2>${profile.name}</h2>');
  for (final String scoreKey in profile.scoreData.keys) {
    final Timeseries timeseries = profile.scoreData[scoreKey];
    resultsBody.appendHtml('<h2>$scoreKey</h2>');
    resultsBody.appendHtml('<pre>${timeseries.computeStats()}</pre>');
    resultsBody.append(TimeseriesVisualization(timeseries).render());
  }
}
/// Draws timeseries data and statistics on a canvas.
class TimeseriesVisualization {
  TimeseriesVisualization(this._timeseries) {
    _stats = _timeseries.computeStats();
    _canvas = html.CanvasElement();
    _screenWidth = html.window.screen.width;
    // Size the backing store for the device pixel ratio so the chart stays
    // crisp on high-DPI screens; CSS keeps the on-screen size at
    // [_kCanvasHeight] logical pixels.
    _canvas.width = _screenWidth;
    _canvas.height = (_kCanvasHeight * html.window.devicePixelRatio).round();
    _canvas.style
      ..width = '100%'
      ..height = '${_kCanvasHeight}px'
      ..outline = '1px solid green';
    _ctx = _canvas.context2D;

    // The amount of vertical space available on the chart. Because some
    // outliers can be huge they can dwarf all the useful values. So we
    // limit it to 1.5 x the biggest non-outlier.
    _maxValueChartRange = 1.5 * _stats.samples
      .where((AnnotatedSample sample) => !sample.isOutlier)
      .map<double>((AnnotatedSample sample) => sample.magnitude)
      .fold<double>(0, math.max);
  }

  // Chart height in logical (CSS) pixels.
  static const double _kCanvasHeight = 200;

  // The timeseries being visualized.
  final Timeseries _timeseries;
  // Statistics computed once in the constructor from [_timeseries].
  TimeseriesStats _stats;
  html.CanvasElement _canvas;
  html.CanvasRenderingContext2D _ctx;
  int _screenWidth;

  // Used to normalize benchmark values to chart height.
  double _maxValueChartRange;

  /// Converts a sample value to vertical canvas coordinates.
  ///
  /// This does not work for horizontal coordinates.
  double _normalized(double value) {
    return _kCanvasHeight * value / _maxValueChartRange;
  }

  /// A utility for drawing lines.
  void drawLine(num x1, num y1, num x2, num y2) {
    _ctx.beginPath();
    _ctx.moveTo(x1, y1);
    _ctx.lineTo(x2, y2);
    _ctx.stroke();
  }

  /// Renders the timeseries into a `<canvas>` and returns the canvas element.
  html.CanvasElement render() {
    // Flip the y-axis so bars grow upward from the bottom, and scale for the
    // device pixel ratio set up in the constructor.
    _ctx.translate(0, _kCanvasHeight * html.window.devicePixelRatio);
    _ctx.scale(1, -html.window.devicePixelRatio);
    final double barWidth = _screenWidth / _stats.samples.length;
    double xOffset = 0;
    for (int i = 0; i < _stats.samples.length; i++) {
      final AnnotatedSample sample = _stats.samples[i];

      if (sample.isWarmUpValue) {
        // Put gray background behind warm-up samples.
        _ctx.fillStyle = 'rgba(200,200,200,1)';
        _ctx.fillRect(xOffset, 0, barWidth, _normalized(_maxValueChartRange));
      }

      if (sample.magnitude > _maxValueChartRange) {
        // The sample value is so big it doesn't fit on the chart. Paint it purple.
        _ctx.fillStyle = 'rgba(100,50,100,0.8)';
      } else if (sample.isOutlier) {
        // The sample is an outlier, color it light red.
        _ctx.fillStyle = 'rgba(255,50,50,0.6)';
      } else {
        // A non-outlier sample, color it light blue.
        _ctx.fillStyle = 'rgba(50,50,255,0.6)';
      }

      // `barWidth - 1` leaves a 1px gap between adjacent bars.
      _ctx.fillRect(xOffset, 0, barWidth - 1, _normalized(sample.magnitude));
      xOffset += barWidth;
    }

    // Draw a horizontal solid line corresponding to the average.
    _ctx.lineWidth = 1;
    drawLine(0, _normalized(_stats.average), _screenWidth, _normalized(_stats.average));

    // Draw a horizontal dashed line corresponding to the outlier cut off.
    _ctx.setLineDash(<num>[5, 5]);
    drawLine(0, _normalized(_stats.outlierCutOff), _screenWidth, _normalized(_stats.outlierCutOff));

    // Draw a light red band that shows the noise (1 stddev in each direction).
    _ctx.fillStyle = 'rgba(255,50,50,0.3)';
    _ctx.fillRect(
      0,
      _normalized(_stats.average * (1 - _stats.noise)),
      _screenWidth,
      _normalized(2 * _stats.average * _stats.noise),
    );

    return _canvas;
  }
}
/// Implements the client REST API for the local benchmark server.
///
/// The local server is optional. If it is not available the benchmark UI must
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment