blob: f776c664d9a262959f37a2648cf09b76add09397 [file] [log] [blame]
# Copyright (C) 2019 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import datetime
import contextlib
import json
import re
import requests
import threading
import urllib
import xmltodict
from resultsdbpy.controller.commit import Commit
from xml.parsers.expat import ExpatError
class SCMException(RuntimeError):
pass
class Repository(object):
DEFAULT_BRANCH = 'master'
def __init__(self, key):
self.key = key
def commit_for_id(self, id, branch=None):
raise NotImplementedError()
def url_for_commit(self, commit):
return None
class HTTPRepository(Repository):
def __init__(self, key, url, redis=None, username=None, password=None):
super(HTTPRepository, self).__init__(key)
self.url = url
self._username = username
self._password = password
self._session = None
self._session_depth = 0
self.redis = redis # In many cases, we can greatly reduce the number of requests made by caching requests in Redis.
@contextlib.contextmanager
def session(self):
if not self._session:
self._session = requests.Session()
if self._username and self._password:
self._session.auth = (self._username, self._password)
self._session.mount(self.url, requests.adapters.HTTPAdapter(max_retries=5))
self._session_depth += 1
yield
self._session_depth -= 1
if self._session_depth <= 0:
self._session.close()
self._session = None
def request(self, url, method='GET', **kwargs):
with self.session():
return self._session.request(method=method, url=url, **kwargs)
def cache_result(self, key, function, ex=60 * 60 * 24 * 2):
result = None
if self.redis:
result = self.redis.get(key)
if result is None:
result = function()
else:
result = result.decode('utf-8')
if result and self.redis:
self.redis.set(key, result, ex=ex)
return result
class StashRepository(HTTPRepository):
# Stash's timestamps are accurate to the millisecond.
COMMIT_TIMESTAMP_CONVERSION = 1000
def __init__(self, url, **kwargs):
PROJECTS = '/projects'
if len(url.split(PROJECTS)) != 2:
raise Exception(f'{url} is not a Stash URL')
self.base_url = url
super(StashRepository, self).__init__(None, ('/rest/api/1.0' + PROJECTS).join(url.split(PROJECTS)), **kwargs)
with self.session():
try:
stash_data = self.get()
except ValueError:
stash_data = {}
self.key = stash_data.get('slug', None)
if not self.key:
raise Exception(f'{self.url} is not an Stash repository')
def get(self, args=None, cache=True):
url = self.url + ('' if not args else f'/{args}')
def callback(url=url, obj=self):
response = obj.request(url)
if response.status_code == 200:
return response.text
return None
result = self.cache_result(
f'repository:{url}',
callback,
) if cache else callback(url)
if result:
return json.loads(result)
return {}
def commit_for_id(self, id, branch=Repository.DEFAULT_BRANCH):
try:
with self.session():
commit_data = self.get(f'commits/{id}')
if not commit_data:
raise SCMException(f'Commit {id} does not exist on branch {branch}')
timestamp = int(commit_data['committerTimestamp']) // self.COMMIT_TIMESTAMP_CONVERSION
branch_filter = urllib.parse.quote(f'refs/heads/{branch}')
commits_between_commit_and_branch_head = self.get(f"commits?since={commit_data['id']}&until={branch_filter}&limit=1", cache=False)
if commits_between_commit_and_branch_head['isLastPage'] and commits_between_commit_and_branch_head['size'] == 0:
# We may have missed the case the commit in question is the HEAD of the branch
commits_at_branch_head = self.get(f'commits?until={branch_filter}&limit=20', cache=False)
if all([commit['id'] != commit_data['id'] for commit in commits_at_branch_head['values']]):
raise SCMException(f'{id} exists, but not on {branch}')
# Ordering commits by timestamp in git is a bit problematic because multiple commits can share a timestamp.
# Generally, if your parent shares your timestamp, your order needs to be incremented.
order = 0
loop_data = commit_data
while len(loop_data['parents']) == 1 and int(loop_data['parents'][0]['committerTimestamp']) // self.COMMIT_TIMESTAMP_CONVERSION == timestamp:
order += 1
loop_data = self.get(f"commits/{loop_data['parents'][0]['id']}")
return Commit(
repository_id=self.key,
id=commit_data['id'],
branch=branch,
timestamp=timestamp,
order=order,
committer=commit_data['committer']['emailAddress'],
message=commit_data['message'],
)
except ValueError:
raise SCMException(f'Failed to connect to {self.url}')
def url_for_commit(self, commit):
return f'{self.base_url}/commits/{commit}'
class SVNRepository(HTTPRepository):
DEFAULT_BRANCH = 'trunk'
DAV_XML = """<?xml version="1.0" encoding="utf-8"?>
<propfind xmlns="DAV:">
<prop>
<resourcetype xmlns="DAV:"/>
<getcontentlength xmlns="DAV:"/>
<deadprop-count xmlns="http://subversion.tigris.org/xmlns/dav/"/>
<version-name xmlns="DAV:"/>
<creationdate xmlns="DAV:"/>
<creator-displayname xmlns="DAV:"/>
</prop></propfind>"""
def get(self, branch=DEFAULT_BRANCH, revision=None):
url = self.url
if revision:
url = url + f"!svn/rvr/{revision}/{branch if branch == self.DEFAULT_BRANCH else 'branches/' + branch}"
def callback(url=url, obj=self, with_revision=True if revision else False):
if with_revision:
# Constructed this from WireShark and an svn info command, modify with caution!
response = self.request(
method='PROPFIND',
url=url,
headers={
'Content-Type': 'text/xml',
'Accept-Encoding': 'gzip',
'Depth': '0',
},
data=obj.DAV_XML)
else:
response = self.request(self.url)
if response.status_code in [200, 207]:
return response.text
return None
result = self.cache_result(
f'repository:{url}',
callback,
)
if result:
return xmltodict.parse(result)
return {}
def __init__(self, url, trac_url=None, **kwargs):
super(SVNRepository, self).__init__(None, url, **kwargs)
self.trac_url = trac_url
with self.session():
self.key = self.get().get('svn', {}).get('index').get('@base')
if not self.key:
raise Exception(f'{self.url} is not an SVN repository')
def commit_for_id(self, id, branch=DEFAULT_BRANCH):
with self.session():
try:
commit_data = self.get(branch=branch, revision=id)['D:multistatus']['D:response']['D:propstat'][0]['D:prop']
if commit_data['lp1:version-name'] != str(id):
raise SCMException(f'Revision {id} does not exist on branch {branch}')
# Of the form '2018-09-24T22:03:34.436217Z'
timestamp = datetime.datetime.strptime(commit_data['lp1:creationdate'], '%Y-%m-%dT%H:%M:%S.%fZ')
return Commit(
repository_id=self.key,
id=str(id),
branch=branch,
timestamp=timestamp,
order=0,
committer=commit_data['lp1:creator-displayname'],
)
except ExpatError:
raise SCMException(f'Failed to connect to {self.url}')
except KeyError:
raise SCMException(f'Revision {id} does not exist on branch {branch}')
def url_for_commit(self, commit):
return f'{self.url}/!svn/bc/{commit}/'
class WebKitRepository(SVNRepository):
CHANGELOGS = (
'ChangeLog',
'Source/bmalloc/ChangeLog',
'Source/JavaScriptCore/ChangeLog',
'Source/ThirdParty/ANGLE/ChangeLog',
'Source/ThirdParty/ChangeLog',
'Source/ThirdParty/libwebrtc/ChangeLog',
'Source/WebCore/ChangeLog',
'Source/WebCore/PAL/ChangeLog',
'Source/WebCore/platform/gtk/po/ChangeLog',
'Source/WebDriver/ChangeLog',
'Source/WebInspectorUI/ChangeLog',
'Source/WebKit/ChangeLog',
'Source/WebKitLegacy/cf/ChangeLog',
'Source/WebKitLegacy/ChangeLog',
'Source/WebKitLegacy/ios/ChangeLog',
'Source/WebKitLegacy/mac/ChangeLog',
'Source/WebKitLegacy/win/ChangeLog',
'Source/WTF/ChangeLog',
'Tools/ChangeLog',
'Examples/ChangeLog',
'JSTests/ChangeLog',
'LayoutTests/ChangeLog',
'LayoutTests/imported/mozilla/ChangeLog',
'LayoutTests/imported/w3c/ChangeLog',
'PerformanceTests/ChangeLog',
'PerformanceTests/SunSpider/ChangeLog',
'WebDriverTests/ChangeLog',
'WebKitLibraries/ChangeLog',
'Websites/browserbench.org/ChangeLog',
'Websites/bugs.webkit.org/ChangeLog',
'Websites/perf.webkit.org/ChangeLog',
'Websites/planet.webkit.org/ChangeLog',
'Websites/webkit.org/ChangeLog',
'Websites/webkit.org/specs/CSSVisualEffects/ChangeLog',
)
CHANGE_LOG_THREADS = 6
FIRST_LINE_REGEX = re.compile(r'(?P<date>\d+-\d+-\d+)\s+(?P<name>\S.+\S)\s+<(?P<email>\S+)>')
def __init__(self, **kwargs):
super(WebKitRepository, self).__init__('https://svn.webkit.org/repository/webkit/', **kwargs)
def commit_for_id(self, id, branch=SVNRepository.DEFAULT_BRANCH):
with self.session():
result = super(WebKitRepository, self).commit_for_id(id, branch=branch)
# Find the last commit on the given branch so we can diff the changelogs
previous_id = self.get(branch=branch, revision=int(id) - 1).get(
'D:multistatus', {}).get('D:response', {}).get('D:propstat', [{}])[0].get('D:prop', {}).get('lp1:version-name', None)
branch_url = f"{self.url}{branch if branch == self.DEFAULT_BRANCH else 'branches/' + branch}"
changelogs = {}
request_threads = []
# Compare changelog from previous commit to the changelog from this commit
def diff_changelogs(changelog_list):
for changelog in changelog_list:
current_url = f'{branch_url}/{changelog}/?p={id}'
previous_url = f'{branch_url}/{changelog}/?p={previous_id}' if previous_id else None
current_response = self.request(current_url)
if current_response.status_code != 200:
continue
previous_response = self.request(previous_url) if previous_url else None
if previous_response and previous_response.status_code != 200:
continue
previous_response_length = 0
if previous_response:
previous_response_length = len(previous_response.text)
if len(current_response.text) == previous_response_length:
continue
if not previous_response_length or len(current_response.text) < previous_response_length:
changelogs[changelog] = current_response.text
else:
changelogs[changelog] = current_response.text[0:-len(previous_response.text)]
# Because we're mostly blocked on network I/O, threads can speed this up ~4x
shards = [[] for _ in range(self.CHANGE_LOG_THREADS)]
index = 0
for changelog in self.CHANGELOGS:
shards[index].append(changelog)
index += 1
if index >= len(shards):
index = 0
for shard in shards:
request_threads.append(threading.Thread(
target=diff_changelogs,
kwargs=dict(changelog_list=shard),
))
request_threads[-1].start()
for thread in request_threads:
thread.join()
commit_message = []
for changelog in self.CHANGELOGS:
partial_message = changelogs.get(changelog, None)
if not partial_message:
continue
lines_in_message = partial_message.split('\n')
# The first line of the commit message might contain the real committer, in the case
# commit queue committed a change on the behalf of someone
start_of_message_shard = 0
match = self.FIRST_LINE_REGEX.match(lines_in_message[0])
if match:
if match.group('email') != result.committer:
result.committer = match.group('email')
start_of_message_shard += 1
while not lines_in_message[start_of_message_shard]:
start_of_message_shard += 1
# WebKit commit messages are often split across multiple changelogs with the first few lines duplicated.
start_of_full_message = 0
while start_of_message_shard < len(lines_in_message) and start_of_full_message < len(commit_message):
if lines_in_message[start_of_message_shard].startswith(' ' * 8):
line = lines_in_message[start_of_message_shard][8:]
else:
line = lines_in_message[start_of_message_shard]
if line != commit_message[start_of_full_message]:
break
start_of_message_shard += 1
start_of_full_message += 1
for line in lines_in_message[start_of_message_shard:]:
commit_message.append(line[8:] if line.startswith(' ' * 8) else line)
result.message = str('\n'.join(commit_message)) if commit_message else None
return result
def url_for_commit(self, commit):
return f'https://trac.webkit.org/changeset/{commit}/{self.key}'