darin@apple.com | fe915df | 2020-08-02 15:12:40 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # |
| 3 | # Copyright (C) 2020 Apple Inc. All rights reserved. |
| 4 | # |
| 5 | # Redistribution and use in source and binary forms, with or without |
| 6 | # modification, are permitted provided that the following conditions |
| 7 | # are met: |
| 8 | # 1. Redistributions of source code must retain the above copyright |
| 9 | # notice, this list of conditions and the following disclaimer. |
| 10 | # 2. Redistributions in binary form must reproduce the above copyright |
| 11 | # notice, this list of conditions and the following disclaimer in the |
| 12 | # documentation and/or other materials provided with the distribution. |
| 13 | # |
| 14 | # THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY |
| 15 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 16 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 17 | # DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| 18 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 19 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 20 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| 21 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 22 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 23 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 24 | |
| 25 | import argparse, hashlib, os, sys |
| 26 | from collections import defaultdict |
| 27 | |
# Number of bytes read from a file per chunk while hashing.
BLOCK_SIZE = 65536


def digest_file(path, block_size=BLOCK_SIZE):
    """Return the raw MD5 digest of the contents of the file at ``path``.

    The file is opened in binary mode and read ``block_size`` bytes at a
    time, so a large file is never held in memory in one piece.  MD5 is used
    here only to group files with identical contents.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        # iter() with a sentinel keeps calling read() until it returns b"",
        # which is what a binary file yields at end of file.
        for chunk in iter(lambda: handle.read(block_size), b""):
            digest.update(chunk)
    return digest.digest()


def find_duplicates(root):
    """Return the groups of files under ``root`` that share the same contents.

    Every file reachable through os.walk(root) is hashed.  The result is a
    sorted list of groups; each group is a sorted list of two or more paths,
    expressed relative to ``root``, whose contents hash to the same digest.
    Files with unique contents are omitted.

    Files that cannot be read (for example dangling symbolic links or files
    without read permission) are reported on stderr and skipped instead of
    aborting the whole scan.
    """
    paths_by_digest = defaultdict(list)

    for subroot, _directories, filenames in os.walk(root):
        for filename in filenames:
            full_path = os.path.join(subroot, filename)
            try:
                digest = digest_file(full_path)
            except (IOError, OSError) as error:
                # IOError is listed separately because on Python 2 it is not
                # a subclass of OSError.
                sys.stderr.write("Skipping {}: {}\n".format(full_path, error))
                continue
            # relpath, unlike slicing off len(root) + 1 characters, stays
            # correct when root ends with a separator (e.g. root is "/").
            paths_by_digest[digest].append(os.path.relpath(full_path, root))

    return sorted(
        sorted(paths) for paths in paths_by_digest.values() if len(paths) > 1
    )


def main():
    """Print each group of identical files under the current directory."""
    parser = argparse.ArgumentParser(description='Find any files that have the same contents.')
    # No options are defined; parsing still provides --help and rejects
    # unexpected arguments.
    parser.parse_args()

    for file_list in find_duplicates(os.getcwd()):
        print(file_list)


if __name__ == "__main__":
    main()