results.webkit.org: Handle duplicate archives
https://bugs.webkit.org/show_bug.cgi?id=204860

Reviewed by Stephanie Lewis.

* resultsdbpy/resultsdbpy/controller/archive_controller.py:
(ArchiveController): Pass query time bounds to ArchiveContext,
de-duplicate identical archives by digest.
* resultsdbpy/resultsdbpy/model/archive_context.py:
(ArchiveContext): Only unpack identical archives once, pass the digest to the caller.


git-svn-id: http://svn.webkit.org/repository/webkit/trunk@253132 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/Tools/ChangeLog b/Tools/ChangeLog
index f322aa3..ace151c 100644
--- a/Tools/ChangeLog
+++ b/Tools/ChangeLog
@@ -1,5 +1,18 @@
 2019-12-04  Jonathan Bedard  <jbedard@apple.com>
 
+        results.webkit.org: Handle duplicate archives
+        https://bugs.webkit.org/show_bug.cgi?id=204860
+
+        Reviewed by Stephanie Lewis.
+
+        * resultsdbpy/resultsdbpy/controller/archive_controller.py:
+        (ArchiveController): Pass query time bounds to ArchiveContext,
+        de-duplicate identical archives by digest.
+        * resultsdbpy/resultsdbpy/model/archive_context.py:
+        (ArchiveContext): Only unpack identical archives once, pass the digest to the caller.
+
+2019-12-04  Jonathan Bedard  <jbedard@apple.com>
+
         Python 3: Add support in webkitpy.benchmark_runner
         https://bugs.webkit.org/show_bug.cgi?id=204784
 
diff --git a/Tools/resultsdbpy/resultsdbpy/controller/archive_controller.py b/Tools/resultsdbpy/resultsdbpy/controller/archive_controller.py
index 04e0451..93f5b9f 100644
--- a/Tools/resultsdbpy/resultsdbpy/controller/archive_controller.py
+++ b/Tools/resultsdbpy/resultsdbpy/controller/archive_controller.py
@@ -20,6 +20,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import hashlib
 import io
 import json
 import time
@@ -66,19 +67,22 @@
             suites = set(suite)
 
         result = None
-        filename = 'archive.zip'
+        filename = None
+        digest = None
         with self.archive_context, self.upload_context:
             for suite in suites:
                 for configuration, archives in self.archive_context.find_archive(
                     configurations=configurations, suite=suite, branch=branch[0],
                     begin=begin, end=end, recent=recent, limit=2,
+                    begin_query_time=begin_query_time, end_query_time=end_query_time,
                 ).items():
                     for archive in archives:
-                        if archive.get('archive'):
-                            if result:
+                        if archive.get('archive') and archive.get('digest'):
+                            if digest and digest != archive.get('digest'):
                                 abort(400, description='Multiple archives matching the specified criteria')
                             result = archive.get('archive')
                             filename = f'{configuration}@{archive["uuid"]}'.replace(' ', '_').replace('.', '-')
+                            digest = archive.get('digest')
 
         if not result:
             abort(404, description='No archives matching the specified criteria')
diff --git a/Tools/resultsdbpy/resultsdbpy/model/archive_context.py b/Tools/resultsdbpy/resultsdbpy/model/archive_context.py
index 5874de4..e726caf 100644
--- a/Tools/resultsdbpy/resultsdbpy/model/archive_context.py
+++ b/Tools/resultsdbpy/resultsdbpy/model/archive_context.py
@@ -177,35 +177,40 @@
                     if memory_used > self.MEMORY_LIMIT:
                         raise RuntimeError('Hit soft-memory cap when fetching archives, aborting')
 
+            archive_by_digest = {}
             result = {}
             for config, values in metadata_by_config.items():
                 for value in values:
                     if not value.get('digest'):
                         continue
 
-                    rows = self.cassandra.select_from_table(
-                        self.ArchiveChunks.__table_name__,
-                        digest=value.get('digest'),
-                        limit=1 + int(value.get('size', 0) / self.CHUNK_SIZE),
-                    )
-                    if len(rows) == 0:
-                        continue
+                    if not archive_by_digest.get(value.get('digest')):
+                        rows = self.cassandra.select_from_table(
+                            self.ArchiveChunks.__table_name__,
+                            digest=value.get('digest'),
+                            limit=1 + int(value.get('size', 0) / self.CHUNK_SIZE),
+                        )
+                        if len(rows) == 0:
+                            continue
 
-                    digest = hashlib.md5()
-                    archive = io.BytesIO()
-                    archive_size = 0
-                    for row in rows:
-                        archive_size += len(row.chunk)
-                        digest.update(row.chunk)
-                        archive.write(row.chunk)
+                        digest = hashlib.md5()
+                        archive = io.BytesIO()
+                        archive_size = 0
+                        for row in rows:
+                            archive_size += len(row.chunk)
+                            digest.update(row.chunk)
+                            archive.write(row.chunk)
 
-                    if archive_size != value.get('size', 0) or value.get('digest', '') != digest.hexdigest():
-                        raise RuntimeError('Failed to reconstruct archive from chunks')
+                        if archive_size != value.get('size', 0) or value.get('digest', '') != digest.hexdigest():
+                            raise RuntimeError('Failed to reconstruct archive from chunks')
 
-                    archive.seek(0)
+                        archive_by_digest[value.get('digest')] = archive
+
+                    archive_by_digest.get(value.get('digest')).seek(0)
                     result.setdefault(config, [])
                     result[config].append(dict(
-                        archive=archive,
+                        archive=archive_by_digest.get(value.get('digest')),
+                        digest=value.get('digest'),
                         uuid=value['uuid'],
                         start_time=value['start_time'],
                     ))