Tools/Scripts/compare-results - WebKit - Git at Google

 #!/usr/bin/env python -u

 # Copyright (C) 2019 Apple Inc. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 # 3.  Neither the name of Apple Inc. ("Apple") nor the names of
 #     its contributors may be used to endorse or promote products derived
 #     from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import sys
 import argparse
 import json

 try:
     from scipy import stats
 except:
     print "ERROR: scipy package is not installed. Run `pip install scipy`"
     sys.exit(1)

 try:
     import numpy
 except:
     print "ERROR: numpy package is not installed. Run `pip install numpy`"
     sys.exit(1)

 def readJSONFile(path):
     with open(path, 'r') as contents:
         return json.loads(contents.read())

 Speedometer2 = "Speedometer2"
 JetStream2 = "JetStream2"
 PLT5 = "PLT5"
 MotionMark = "MotionMark"
 MotionMark1_1 = "MotionMark-1.1"

 def detectJetStream2(payload):
     return "JetStream2.0" in payload

 def JetStream2Results(payload):
     assert detectJetStream2(payload)

     js = payload["JetStream2.0"]
     iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
     results = []
     for i in range(iterations):
         scores = []
         for test in js["tests"].keys():
             scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
         geomean = stats.gmean(scores)

         results.append(geomean)

     return results

 def detectSpeedometer2(payload):
     return "Speedometer-2" in payload

 def Speedometer2Results(payload):
     assert detectSpeedometer2(payload)
     results = []
     for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
         results.append(numpy.mean(arr))
     return results

 def detectPLT5(payload):
     if "iterations" not in payload:
         return False
     iterations = payload["iterations"]
     if not isinstance(iterations, list):
         return False
     if not len(iterations):
         return False
     if "cold" not in iterations[0]:
         return False
     if "warm" not in iterations[0]:
         return False
     if "Geometric" not in iterations[0]:
         return False
     return True

 def PLT5Results(payload):
     assert detectPLT5(payload)
     results = []
     for obj in payload["iterations"]:
         results.append(obj["Geometric"])
     return results

 def detectMotionMark(payload):
     return "MotionMark" in payload

 def detectMotionMark1_1(payload):
     return "MotionMark-1.1" in payload

 def motionMarkResults(payload):
     assert detectMotionMark(payload) or detectMotionMark1_1(payload)
     if detectMotionMark(payload):
         payload = payload["MotionMark"]
     else:
         payload = payload["MotionMark-1.1"]
     testNames = payload["tests"].keys()
     numTests = len(payload["tests"][testNames[0]]["metrics"]["Score"]["current"])
     results = []
     for i in range(numTests):
         scores = []
         for test in testNames:
             scores.append(payload["tests"][test]["metrics"]["Score"]["current"][i])
         results.append(stats.gmean(scores))

     return results

 def motionMark1_1Results(payload):
     return motionMarkResults(payload)

 def detectBenchmark(payload):
     if detectJetStream2(payload):
         return JetStream2
     if detectSpeedometer2(payload):
         return Speedometer2
     if detectPLT5(payload):
         return PLT5
     if detectMotionMark(payload):
         return MotionMark
     if detectMotionMark1_1(payload):
         return MotionMark1_1
     return None

 def biggerIsBetter(benchmarkType):
     if benchmarkType == JetStream2:
         return True
     if benchmarkType == Speedometer2:
         return True
     if benchmarkType == MotionMark:
         return True
     if benchmarkType == MotionMark1_1:
         return True
     if benchmarkType == PLT5:
         return False

     print "Should not be reached."
     assert False

 def ttest(benchmarkType, a, b):
     # We use two-tailed Welch's
     (tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
     aMean = numpy.mean(a)
     bMean = numpy.mean(b)
     print "a mean = {:.5f}".format(aMean)
     print "b mean = {:.5f}".format(bMean)

     print "pValue = {:.10f}".format(pValue)

     if biggerIsBetter(benchmarkType):
         print "(Bigger means are better.)"
         if aMean > bMean:
             print "{:.3f} times worse".format((aMean / bMean))
         else:
             print "{:.3f} times better".format((bMean / aMean))
     else:
         print "(Smaller means are better.)"
         if aMean > bMean:
             print "{:.3f} times better".format((aMean / bMean))
         else:
             print "{:.3f} times worse".format((bMean / aMean))

     if pValue <= 0.05:
         print "Results ARE significant"
     else:
         print "Results ARE NOT significant"

 def getOptions():
     parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")

     parser.add_argument("-a",
         type=str,
         required=True,
         help="a of a/b. Path to JSON results file.")

     parser.add_argument("-b",
         type=str,
         required=True,
         help="b of a/b. Path to JSON results file.")

     return parser.parse_known_args()[0]


 def main():
     args = getOptions()

     a = readJSONFile(args.a)
     b = readJSONFile(args.b)

     typeA = detectBenchmark(a)
     typeB = detectBenchmark(b)

     if typeA != typeB:
         print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
         sys.exit(1)

     if not (typeA and typeB):
         print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
         sys.exit(1)

     if typeA == JetStream2:
         ttest(typeA, JetStream2Results(a), JetStream2Results(b))
     elif typeA == Speedometer2:
         ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
     elif typeA == MotionMark:
         ttest(typeA, motionMarkResults(a), motionMarkResults(b))
     elif typeA == MotionMark1_1:
         ttest(typeA, motionMark1_1Results(a), motionMark1_1Results(b))
     elif typeA == PLT5:
         ttest(typeA, PLT5Results(a), PLT5Results(b))
     else:
         print "Unknown benchmark type"
         sys.exit(1)


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python -u

	# Copyright (C) 2019 Apple Inc. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	# 3. Neither the name of Apple Inc. ("Apple") nor the names of
	# its contributors may be used to endorse or promote products derived
	# from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	import sys
	import argparse
	import json

	try:
	from scipy import stats
	except:
	print "ERROR: scipy package is not installed. Run `pip install scipy`"
	sys.exit(1)

	try:
	import numpy
	except:
	print "ERROR: numpy package is not installed. Run `pip install numpy`"
	sys.exit(1)

	def readJSONFile(path):
	with open(path, 'r') as contents:
	return json.loads(contents.read())

	Speedometer2 = "Speedometer2"
	JetStream2 = "JetStream2"
	PLT5 = "PLT5"
	MotionMark = "MotionMark"
	MotionMark1_1 = "MotionMark-1.1"

	def detectJetStream2(payload):
	return "JetStream2.0" in payload

	def JetStream2Results(payload):
	assert detectJetStream2(payload)

	js = payload["JetStream2.0"]
	iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
	results = []
	for i in range(iterations):
	scores = []
	for test in js["tests"].keys():
	scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
	geomean = stats.gmean(scores)

	results.append(geomean)

	return results

	def detectSpeedometer2(payload):
	return "Speedometer-2" in payload

	def Speedometer2Results(payload):
	assert detectSpeedometer2(payload)
	results = []
	for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
	results.append(numpy.mean(arr))
	return results

	def detectPLT5(payload):
	if "iterations" not in payload:
	return False
	iterations = payload["iterations"]
	if not isinstance(iterations, list):
	return False
	if not len(iterations):
	return False
	if "cold" not in iterations[0]:
	return False
	if "warm" not in iterations[0]:
	return False
	if "Geometric" not in iterations[0]:
	return False
	return True

	def PLT5Results(payload):
	assert detectPLT5(payload)
	results = []
	for obj in payload["iterations"]:
	results.append(obj["Geometric"])
	return results

	def detectMotionMark(payload):
	return "MotionMark" in payload

	def detectMotionMark1_1(payload):
	return "MotionMark-1.1" in payload

	def motionMarkResults(payload):
	assert detectMotionMark(payload) or detectMotionMark1_1(payload)
	if detectMotionMark(payload):
	payload = payload["MotionMark"]
	else:
	payload = payload["MotionMark-1.1"]
	testNames = payload["tests"].keys()
	numTests = len(payload["tests"][testNames[0]]["metrics"]["Score"]["current"])
	results = []
	for i in range(numTests):
	scores = []
	for test in testNames:
	scores.append(payload["tests"][test]["metrics"]["Score"]["current"][i])
	results.append(stats.gmean(scores))

	return results

	def motionMark1_1Results(payload):
	return motionMarkResults(payload)

	def detectBenchmark(payload):
	if detectJetStream2(payload):
	return JetStream2
	if detectSpeedometer2(payload):
	return Speedometer2
	if detectPLT5(payload):
	return PLT5
	if detectMotionMark(payload):
	return MotionMark
	if detectMotionMark1_1(payload):
	return MotionMark1_1
	return None

	def biggerIsBetter(benchmarkType):
	if benchmarkType == JetStream2:
	return True
	if benchmarkType == Speedometer2:
	return True
	if benchmarkType == MotionMark:
	return True
	if benchmarkType == MotionMark1_1:
	return True
	if benchmarkType == PLT5:
	return False

	print "Should not be reached."
	assert False

	def ttest(benchmarkType, a, b):
	# We use two-tailed Welch's
	(tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
	aMean = numpy.mean(a)
	bMean = numpy.mean(b)
	print "a mean = {:.5f}".format(aMean)
	print "b mean = {:.5f}".format(bMean)

	print "pValue = {:.10f}".format(pValue)

	if biggerIsBetter(benchmarkType):
	print "(Bigger means are better.)"
	if aMean > bMean:
	print "{:.3f} times worse".format((aMean / bMean))
	else:
	print "{:.3f} times better".format((bMean / aMean))
	else:
	print "(Smaller means are better.)"
	if aMean > bMean:
	print "{:.3f} times better".format((aMean / bMean))
	else:
	print "{:.3f} times worse".format((bMean / aMean))

	if pValue <= 0.05:
	print "Results ARE significant"
	else:
	print "Results ARE NOT significant"

	def getOptions():
	parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")

	parser.add_argument("-a",
	type=str,
	required=True,
	help="a of a/b. Path to JSON results file.")

	parser.add_argument("-b",
	type=str,
	required=True,
	help="b of a/b. Path to JSON results file.")

	return parser.parse_known_args()[0]


	def main():
	args = getOptions()

	a = readJSONFile(args.a)
	b = readJSONFile(args.b)

	typeA = detectBenchmark(a)
	typeB = detectBenchmark(b)

	if typeA != typeB:
	print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
	sys.exit(1)

	if not (typeA and typeB):
	print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
	sys.exit(1)

	if typeA == JetStream2:
	ttest(typeA, JetStream2Results(a), JetStream2Results(b))
	elif typeA == Speedometer2:
	ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
	elif typeA == MotionMark:
	ttest(typeA, motionMarkResults(a), motionMarkResults(b))
	elif typeA == MotionMark1_1:
	ttest(typeA, motionMark1_1Results(a), motionMark1_1Results(b))
	elif typeA == PLT5:
	ttest(typeA, PLT5Results(a), PLT5Results(b))
	else:
	print "Unknown benchmark type"
	sys.exit(1)


	if __name__ == "__main__":
	main()