#!/usr/bin/env python -u
# Copyright (C) 2019 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of Apple Inc. ("Apple") nor the names of
# its contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import argparse
import json
from webkitpy.benchmark_runner.benchmark_results import BenchmarkResults

try:
    from scipy import stats
except:
    print "ERROR: scipy package is not installed. Run `pip install scipy`"
    sys.exit(1)

try:
    import numpy
except:
    print "ERROR: numpy package is not installed. Run `pip install numpy`"
    sys.exit(1)


def readJSONFile(path):
    with open(path, 'r') as contents:
        result = json.loads(contents.read())
        if 'debugOutput' in result:
            del result['debugOutput']
        return result

Speedometer2 = "Speedometer2"
JetStream2 = "JetStream2"
PLT5 = "PLT5"
MotionMark = "MotionMark"
MotionMark1_1 = "MotionMark-1.1"
MotionMark1_1_1 = "MotionMark-1.1.1"
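# Sentinel key used in the per-subtest breakdown dictionaries to record the unit
# of the reported values (e.g. "ms" or "pts") alongside the subtest entries.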
unitMarker = "__unit__"


def speedometer2Breakdown(jsonObject):
    breakdown = BenchmarkResults(jsonObject)
    result = {}
    result[unitMarker] = "ms"
    for test in breakdown._results["Speedometer-2"]["tests"].keys():
        result[test] = breakdown._results["Speedometer-2"]["tests"][test]["metrics"]["Time"]["Total"]["current"]
    return result


def jetStream2Breakdown(jsonObject):
    breakdown = BenchmarkResults(jsonObject)
    result = {}
    result[unitMarker] = "pts"
    for test in breakdown._results["JetStream2.0"]["tests"].keys():
        result[test] = breakdown._results["JetStream2.0"]["tests"][test]["metrics"]["Score"][None]["current"]
    return result


def motionMarkBreakdown(jsonObject):
    breakdown = BenchmarkResults(jsonObject)
    result = {}
    result[unitMarker] = "pts"
    if detectMotionMark(jsonObject):
        name = "MotionMark"
    elif detectMotionMark1_1(jsonObject):
        name = "MotionMark-1.1"
    else:
        name = "MotionMark-1.1.1"
    for test in breakdown._results[name]["tests"].keys():
        result[test] = breakdown._results[name]["tests"][test]["metrics"]["Score"][None]["current"]
    return result


def plt5Breakdown(jsonObject):
    nameMapping = {}
    for mappings in jsonObject["urls"]:
        for key in mappings.keys():
            nameMapping[key] = mappings[key]
    result = {}
    result[unitMarker] = "ms"
    for test in jsonObject["iterations"][0]["warm"].keys():
        if test == "Geometric":
            continue
        result["warm--" + nameMapping[test]] = []
        result["cold--" + nameMapping[test]] = []
    for payload in jsonObject["iterations"]:
        warmTests = payload["warm"]
        coldTests = payload["cold"]
        for test in warmTests.keys():
            if test == "Geometric":
                continue
            result["warm--" + nameMapping[test]].append(warmTests[test]["Geometric"])
            result["cold--" + nameMapping[test]].append(coldTests[test]["Geometric"])
    return result


def displayStr(value):
    return "{:.6f}".format(float(value))


def computeMultipleHypothesesSignificance(a, b):
    # This uses the Benjamini-Hochberg procedure, based on the False Discovery Rate,
    # to compute significance when testing multiple hypotheses.
    # Read more here:
    # - https://en.wikipedia.org/wiki/False_discovery_rate
    # - https://www.stat.berkeley.edu/~mgoldman/Section0402.pdf
    # The procedure is best suited to independent variables. We know subtests aren't
    # fully independent, but it's a reasonable approximation.
    # We use this instead of Bonferroni because we control for almost the same
    # false positive error rate (marking a result as significant when it's not), but with a much
    # lower false negative error rate (not marking something as significant when it is).
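    # Illustrative example (made-up numbers, not from any particular run): with five
    # subtests whose sorted p-values are [0.004, 0.01, 0.03, 0.04, 0.2], the rank-k
    # threshold is k * 0.05 / 5, i.e. [0.01, 0.02, 0.03, 0.04, 0.05]. Walking from the
    # largest rank down, the first p-value at or below its threshold is 0.04 (rank 4),
    # so that subtest and every subtest with a smaller p-value are marked significant.
    # The function returns a dict mapping each subtest name to that boolean verdict.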
    sortedPValues = []
    reversePValueMap = {}
    for key in a.keys():
        if key == unitMarker:
            continue
        (tStatistic, pValue) = stats.ttest_ind(a[key], b[key], equal_var=False)
        sortedPValues.append(pValue)
        if pValue not in reversePValueMap:
            reversePValueMap[pValue] = []
        reversePValueMap[pValue].append(key)
    sortedPValues.sort()
    assert sortedPValues[0] <= sortedPValues[-1]
    isSignificant = False
    result = {}
    rank = float(len(sortedPValues))
    for pValue in reversed(sortedPValues):
        assert rank >= 1.0
        threshold = (rank * .05) / float(len(sortedPValues))
        if pValue <= threshold:
            isSignificant = True
        assert len(reversePValueMap[pValue]) > 0
        for test in reversePValueMap[pValue]:
            result[test] = isSignificant
        rank = rank - 1.0
    return result


def dumpBreakdowns(a, b):
    nameLength = len("subtest")
    aLength = len(a[unitMarker])
    bLength = len(b[unitMarker])
    ratioLength = len("b / a")
    pValueHeader = "pValue (significance using False Discovery Rate)"
    pLength = len(pValueHeader)
    isSignificant = computeMultipleHypothesesSignificance(a, b)
    for key in a.keys():
        if key == unitMarker:
            continue
        nameLength = max(nameLength, len(key))
        aLength = max(aLength, len(displayStr(numpy.mean(a[key]))))
        bLength = max(bLength, len(displayStr(numpy.mean(b[key]))))
        ratioLength = max(ratioLength, len(displayStr(numpy.mean(b[key]) / numpy.mean(a[key]))))
        (tStatistic, pValue) = stats.ttest_ind(a[key], b[key], equal_var=False)
        significantStr = ""
        if isSignificant[key]:
            significantStr = " (significant)"
        pLength = max(pLength, len(displayStr(pValue)) + len(significantStr))
    aLength += 2
    bLength += 2
    nameLength += 2
    ratioLength += 2
    pLength += 2
    strings = []
    strings.append("|{key:^{nameLength}}|{aScore:^{aLength}} |{bScore:^{bLength}} |{compare:^{ratioLength}}|{pMarker:^{pLength}}|".format(key="subtest", aScore=a[unitMarker], bScore=b[unitMarker], nameLength=nameLength, aLength=aLength, bLength=bLength, compare="b / a", ratioLength=ratioLength, pMarker=pValueHeader, pLength=pLength))
    for key in a.keys():
        if key == unitMarker:
            continue
        aScore = numpy.mean(a[key])
        bScore = numpy.mean(b[key])
        (tStatistic, pValue) = stats.ttest_ind(a[key], b[key], equal_var=False)
        significantStr = ""
        if isSignificant[key]:
            significantStr = " (significant)"
        strings.append("| {key:{nameLength}}|{aScore:{aLength}} |{bScore:{bLength}} |{compare:{ratioLength}}| {pValue:<{pLength}}|".format(key=key, aScore=displayStr(aScore), bScore=displayStr(bScore), nameLength=nameLength - 1, aLength=aLength, bLength=bLength, ratioLength=ratioLength, compare=displayStr(bScore / aScore), pValue=displayStr(pValue) + significantStr, pLength=pLength - 1))
    maxLen = 0
    for s in strings:
        maxLen = max(maxLen, len(s))
    verticalSeparator = "-" * maxLen
    strings.insert(0, verticalSeparator)
    strings.insert(2, verticalSeparator)
    strings.append(verticalSeparator)
    print "\n"
    for s in strings:
        print(s)
    print "\n"


def writeCSV(a, b, fileName):
    strings = []
    result = ""
    result += "test_name, {}, {}, b_divided_by_a, pValue, is_significant_using_False_Discovery_Rate\n".format("a_in_" + a[unitMarker], "b_in_" + b[unitMarker])
    isSignificant = computeMultipleHypothesesSignificance(a, b)
    for key in a.keys():
        if key == unitMarker:
            continue
        aScore = numpy.mean(a[key])
        bScore = numpy.mean(b[key])
        (tStatistic, pValue) = stats.ttest_ind(a[key], b[key], equal_var=False)
        significantStr = "No"
        if isSignificant[key]:
            significantStr = "Yes"
        result += "{},{},{},{},{},{}\n".format(key, displayStr(aScore), displayStr(bScore), displayStr(bScore / aScore), displayStr(pValue), significantStr)
    f = open(fileName, "w")
    f.write(result)
    f.close()


def detectJetStream2(payload):
    return "JetStream2.0" in payload


def JetStream2Results(payload):
    assert detectJetStream2(payload)
    js = payload["JetStream2.0"]
    iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
    results = []
    for i in range(iterations):
        scores = []
        for test in js["tests"].keys():
            scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
        geomean = stats.gmean(scores)
        results.append(geomean)
    return results


def detectSpeedometer2(payload):
    return "Speedometer-2" in payload


def Speedometer2Results(payload):
    assert detectSpeedometer2(payload)
    results = []
    for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
        results.append(numpy.mean(arr))
    return results


def detectPLT5(payload):
    if "iterations" not in payload:
        return False
    iterations = payload["iterations"]
    if not isinstance(iterations, list):
        return False
    if not len(iterations):
        return False
    if "cold" not in iterations[0]:
        return False
    if "warm" not in iterations[0]:
        return False
    if "Geometric" not in iterations[0]:
        return False
    return True


def PLT5Results(payload):
    assert detectPLT5(payload)
    results = []
    for obj in payload["iterations"]:
        results.append(obj["Geometric"])
    return results


def detectMotionMark(payload):
    return "MotionMark" in payload


def detectMotionMark1_1(payload):
    return "MotionMark-1.1" in payload


def detectMotionMark1_1_1(payload):
    return "MotionMark-1.1.1" in payload


def motionMarkResults(payload):
    assert detectMotionMark(payload) or detectMotionMark1_1(payload) or detectMotionMark1_1_1(payload)
    if detectMotionMark(payload):
        payload = payload["MotionMark"]
    elif detectMotionMark1_1(payload):
        payload = payload["MotionMark-1.1"]
    else:
        payload = payload["MotionMark-1.1.1"]
    testNames = payload["tests"].keys()
    # The "current" array holds one score per iteration, so its length is the iteration count.
    iterations = len(payload["tests"][testNames[0]]["metrics"]["Score"]["current"])
    results = []
    for i in range(iterations):
        scores = []
        for test in testNames:
            scores.append(payload["tests"][test]["metrics"]["Score"]["current"][i])
        results.append(stats.gmean(scores))
    return results


def detectBenchmark(payload):
    if detectJetStream2(payload):
        return JetStream2
    if detectSpeedometer2(payload):
        return Speedometer2
    if detectPLT5(payload):
        return PLT5
    if detectMotionMark(payload):
        return MotionMark
    if detectMotionMark1_1(payload):
        return MotionMark1_1
    if detectMotionMark1_1_1(payload):
        return MotionMark1_1_1
    return None


def biggerIsBetter(benchmarkType):
    if benchmarkType == JetStream2:
        return True
    if benchmarkType == Speedometer2:
        return True
    if benchmarkType == MotionMark:
        return True
    if benchmarkType == MotionMark1_1:
        return True
    if benchmarkType == MotionMark1_1_1:
        return True
    if benchmarkType == PLT5:
        return False
    print "Should not be reached."
    assert False


def ttest(benchmarkType, a, b):
    # We use a two-tailed Welch's t-test, which does not assume equal variances between the two samples.
    (tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
    aMean = numpy.mean(a)
    bMean = numpy.mean(b)
    print "a mean = {:.5f}".format(aMean)
    print "b mean = {:.5f}".format(bMean)
    print "pValue = {:.10f}".format(pValue)
    if biggerIsBetter(benchmarkType):
        print "(Bigger means are better.)"
        if aMean > bMean:
            print "{:.3f} times worse".format((aMean / bMean))
        else:
            print "{:.3f} times better".format((bMean / aMean))
    else:
        print "(Smaller means are better.)"
        if aMean > bMean:
            print "{:.3f} times better".format((aMean / bMean))
        else:
            print "{:.3f} times worse".format((bMean / aMean))
    if pValue <= 0.05:
        print "Results ARE significant"
    else:
        print "Results ARE NOT significant"


def getOptions():
    parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")
    parser.add_argument("-a",
                        type=str,
                        required=True,
                        help="a of a/b. Path to JSON results file.")
    parser.add_argument("-b",
                        type=str,
                        required=True,
                        help="b of a/b. Path to JSON results file.")
    parser.add_argument("--csv",
                        type=str,
                        required=False,
                        help="Path to write a csv file containing subtest breakdown.")
    parser.add_argument("--breakdown", action="store_true",
                        default=False, help="Print a per subtest breakdown.")
    return parser.parse_known_args()[0]
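

# Example invocation (the script name and JSON file names below are illustrative;
# use whatever this file is saved as and your own result files):
#   python compare_results.py -a baseline.json -b candidate.json --breakdown --csv breakdown.csv
# This runs Welch's t-test on the overall scores and, with --breakdown / --csv, also reports
# the per-subtest comparison with False Discovery Rate significance.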


def main():
    args = getOptions()
    a = readJSONFile(args.a)
    b = readJSONFile(args.b)
    typeA = detectBenchmark(a)
    typeB = detectBenchmark(b)
    if typeA != typeB:
        print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
        sys.exit(1)
    if not (typeA and typeB):
        print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
        sys.exit(1)
    if typeA == JetStream2:
        if args.breakdown:
            dumpBreakdowns(jetStream2Breakdown(a), jetStream2Breakdown(b))
        ttest(typeA, JetStream2Results(a), JetStream2Results(b))
        if args.csv:
            writeCSV(jetStream2Breakdown(a), jetStream2Breakdown(b), args.csv)
    elif typeA == Speedometer2:
        if args.breakdown:
            dumpBreakdowns(speedometer2Breakdown(a), speedometer2Breakdown(b))
        ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
        if args.csv:
            writeCSV(speedometer2Breakdown(a), speedometer2Breakdown(b), args.csv)
    elif typeA == MotionMark or typeA == MotionMark1_1 or typeA == MotionMark1_1_1:
        if args.breakdown:
            dumpBreakdowns(motionMarkBreakdown(a), motionMarkBreakdown(b))
        ttest(typeA, motionMarkResults(a), motionMarkResults(b))
        if args.csv:
            writeCSV(motionMarkBreakdown(a), motionMarkBreakdown(b), args.csv)
    elif typeA == PLT5:
        if args.breakdown:
            dumpBreakdowns(plt5Breakdown(a), plt5Breakdown(b))
        ttest(typeA, PLT5Results(a), PLT5Results(b))
        if args.csv:
            writeCSV(plt5Breakdown(a), plt5Breakdown(b), args.csv)
    else:
        print "Unknown benchmark type"
        sys.exit(1)


if __name__ == "__main__":
    main()