# Copyright (C) 2019 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import calendar
import hashlib
import io
import json
import time
import zipfile
from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
from collections import OrderedDict
from datetime import datetime
from resultsdbpy.controller.commit import Commit
from resultsdbpy.controller.configuration import Configuration
from resultsdbpy.model.commit_context import CommitContext
from resultsdbpy.model.configuration_context import ClusteredByConfiguration
from resultsdbpy.model.upload_context import UploadContext


def _get_time(input_time):
    if isinstance(input_time, datetime):
        return calendar.timegm(input_time.timetuple())
    if input_time:
        return int(input_time)
    return None


class ArchiveContext(object):
DEFAULT_LIMIT = 10
CHUNK_SIZE = 10 * 1024 * 1024 # Cassandra doesn't do well with data blobs of more than 10 MB
MEMORY_LIMIT = 2 * 1024 * 1024 * 1024 # Don't allow more than 2 gigs of archives in memory at one time
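    # For example, a 25 MB archive is stored as three ArchiveChunks rows of
    # 10 MB, 10 MB and 5 MB, all sharing the archive's MD5 digest as their key.
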
class ArchiveMetaDataByCommit(ClusteredByConfiguration):
__table_name__ = 'archive_metadata_by_commit'
suite = columns.Text(partition_key=True, required=True)
branch = columns.Text(partition_key=True, required=True)
uuid = columns.BigInt(primary_key=True, required=True, clustering_order='DESC')
sdk = columns.Text(primary_key=True, required=True)
start_time = columns.BigInt(primary_key=True, required=True)
digest = columns.Text(required=True)
size = columns.BigInt(required=True)

        def unpack(self):
return dict(
uuid=self.uuid,
start_time=self.start_time,
digest=self.digest,
size=self.size,
)

    # According to https://cwiki.apache.org/confluence/display/CASSANDRA2/CassandraLimitations, we should shard
    # large data blobs.
class ArchiveChunks(Model):
__table_name__ = 'archive_chunks'
digest = columns.Text(partition_key=True, required=True)
index = columns.Integer(primary_key=True, required=True)
chunk = columns.Blob(required=True)

    @classmethod
def assert_zipfile(cls, archive):
if not isinstance(archive, io.BytesIO):
raise TypeError(f'Archive expected {io.BytesIO}, got {type(archive)} instead')
if not zipfile.is_zipfile(archive):
            raise TypeError('Archive is not a zipfile')

    @classmethod
def open_zipfile(cls, archive):
cls.assert_zipfile(archive)
return zipfile.ZipFile(archive, mode='r')
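
    # A minimal usage sketch (the file name is hypothetical); note that the zip
    # bytes must be wrapped in io.BytesIO, since assert_zipfile rejects anything else:
    #
    #   with open('results.zip', 'rb') as handle:
    #       archive = io.BytesIO(handle.read())
    #   with ArchiveContext.open_zipfile(archive) as unpacked:
    #       print(unpacked.namelist())
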
def __init__(self, configuration_context, commit_context, ttl_seconds=None):
self.configuration_context = configuration_context
self.commit_context = commit_context
self.cassandra = self.configuration_context.cassandra
self.ttl_seconds = ttl_seconds
with self:
self.cassandra.create_table(self.ArchiveMetaDataByCommit)
self.cassandra.create_table(self.ArchiveChunks)
self.cassandra.create_table(UploadContext.SuitesByConfiguration)

    def __enter__(self):
        self.configuration_context.__enter__()
        self.commit_context.__enter__()

    def __exit__(self, *args, **kwargs):
        self.commit_context.__exit__(*args, **kwargs)
        self.configuration_context.__exit__(*args, **kwargs)

    def register(self, archive, configuration, commits, suite, timestamp=None):
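        """Store a zipped archive for the given configuration, commits, and suite.

        The archive is split into CHUNK_SIZE blobs in archive_chunks, and a
        metadata row (MD5 digest, size, start time) is written per branch so
        find_archive() can later locate and reassemble it.
        """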
self.assert_zipfile(archive)
timestamp = _get_time(timestamp) or time.time()
with self:
uuid = self.commit_context.uuid_for_commits(commits)
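            # The uuid encodes the commit timestamp (scaled by Commit.TIMESTAMP_TO_UUID_MULTIPLIER),
            # so the TTL below counts down from when the commits landed rather than from
            # upload time, and all archives for a given commit age out together.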
ttl = int((uuid // Commit.TIMESTAMP_TO_UUID_MULTIPLIER) + self.ttl_seconds - time.time()) if self.ttl_seconds else None
for branch in self.commit_context.branch_keys_for_commits(commits):
self.configuration_context.register_configuration(configuration, branch=branch, timestamp=timestamp)
self.configuration_context.insert_row_with_configuration(
UploadContext.SuitesByConfiguration.__table_name__, configuration, suite=suite, branch=branch, ttl=ttl,
)
# Breaking up the archive into chunks
index = 0
size = len(archive.getvalue())
digest = hashlib.md5(archive.getvalue()).hexdigest()
archive.seek(0)
while size > index * self.CHUNK_SIZE:
self.cassandra.insert_row(
self.ArchiveChunks.__table_name__,
digest=digest, index=index,
chunk=archive.read(self.CHUNK_SIZE),
)
index += 1
self.configuration_context.insert_row_with_configuration(
self.ArchiveMetaDataByCommit.__table_name__, configuration=configuration, suite=suite,
branch=branch, uuid=uuid, ttl=ttl,
sdk=configuration.sdk or '?', start_time=timestamp,
digest=digest,
size=size,
)
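
    # A usage sketch, assuming an ArchiveContext wired to live configuration and
    # commit contexts (all names below are hypothetical):
    #
    #   context.register(archive, configuration, commits, suite='layout-tests')
    #   context.find_archive(configurations=[configuration], suite='layout-tests')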

    def find_archive(
self, configurations=None, suite=None, recent=True,
branch=None, begin=None, end=None,
begin_query_time=None, end_query_time=None,
limit=DEFAULT_LIMIT,
):
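        """Fetch and reassemble archives matching the given configurations and suite.

        Chunks are re-joined and verified against the stored MD5 digest and size.
        Returns a dict mapping each configuration to a list of dicts holding the
        archive (io.BytesIO), uuid, and start_time; raises RuntimeError if the
        combined archive size would exceed MEMORY_LIMIT.
        """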
if not configurations:
configurations = []
if not isinstance(suite, str):
raise TypeError(f'Expected type {str}, got {type(suite)}')
with self:
metadata_by_config = {}
for configuration in configurations:
metadata_by_config.update({config: [value.unpack() for value in values] for config, values in self.configuration_context.select_from_table_with_configurations(
self.ArchiveMetaDataByCommit.__table_name__, configurations=[configuration], recent=recent,
suite=suite, sdk=configuration.sdk, branch=branch or self.commit_context.DEFAULT_BRANCH_KEY,
uuid__gte=CommitContext.convert_to_uuid(begin),
uuid__lte=CommitContext.convert_to_uuid(end, CommitContext.timestamp_to_uuid()),
start_time__gte=_get_time(begin_query_time), start_time__lte=_get_time(end_query_time),
limit=limit,
).items()})
memory_used = 0
for values in metadata_by_config.values():
for value in values:
if not value.get('digest'):
continue
memory_used += value.get('size', 0)
if memory_used > self.MEMORY_LIMIT:
raise RuntimeError('Hit soft-memory cap when fetching archives, aborting')
result = {}
for config, values in metadata_by_config.items():
for value in values:
if not value.get('digest'):
continue
rows = self.cassandra.select_from_table(
self.ArchiveChunks.__table_name__,
digest=value.get('digest'),
limit=1 + int(value.get('size', 0) / self.CHUNK_SIZE),
)
if len(rows) == 0:
continue
digest = hashlib.md5()
archive = io.BytesIO()
archive_size = 0
for row in rows:
archive_size += len(row.chunk)
digest.update(row.chunk)
archive.write(row.chunk)
if archive_size != value.get('size', 0) or value.get('digest', '') != digest.hexdigest():
raise RuntimeError('Failed to reconstruct archive from chunks')
archive.seek(0)
result.setdefault(config, [])
result[config].append(dict(
archive=archive,
uuid=value['uuid'],
start_time=value['start_time'],
))
return result

    @classmethod
def _files_for_archive(cls, archive):
master = None
files = set()
for item in archive.namelist():
if item.startswith('__'):
continue
if not master:
master = item
continue
files.add(item[len(master):])
return master, sorted([*files])
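
    # Illustrative sketch (hypothetical archive layout): for a namelist of
    #   ['run-1234/', 'run-1234/index.html', 'run-1234/logs/err.txt', '__MACOSX/x']
    # the first non-dunder entry becomes the master prefix and the remaining
    # entries are returned relative to it: ('run-1234/', ['index.html', 'logs/err.txt'])
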
def ls(self, *args, **kwargs):
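        """List the file names inside each archive matched by find_archive()."""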
candidates = self.find_archive(*args, **kwargs)
result = {}
for config, archives in candidates.items():
result[config] = []
for archive in archives:
result[config].append(dict(
uuid=archive['uuid'],
start_time=archive['start_time'],
files=set(),
))
with self.open_zipfile(archive['archive']) as unpacked:
_, result[config][-1]['files'] = self._files_for_archive(unpacked)
return result

    def file(self, path=None, **kwargs):
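        """Extract file content from each archive matched by find_archive().

        With no `path`, returns the full file list; with a trailing '/', returns
        the names under that directory; otherwise returns the named file's bytes.
        """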
candidates = self.find_archive(**kwargs)
result = {}
for config, archives in candidates.items():
result[config] = []
for archive in archives:
file = None
with self.open_zipfile(archive['archive']) as unpacked:
master, files = self._files_for_archive(unpacked)
if not path:
file = files
elif path[-1] == '/':
file = [f[len(path):] for f in files if f.startswith(path)]
elif path in files:
file = unpacked.open(master + path).read()
if file:
result[config].append(dict(
uuid=archive['uuid'],
start_time=archive['start_time'],
file=file,
))
return result