#!/usr/bin/env python
#
# Copyright (C) 2020 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse, hashlib, os
from collections import defaultdict

parser = argparse.ArgumentParser(description='Find any files that have the same contents.')
args = parser.parse_args()

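# Map each content digest to the list of relative paths whose contents hash to it.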
filesByDigest = defaultdict(list)

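# Walk the tree rooted at the current working directory, hashing every file.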
root = os.getcwd()
for subroot, directories, files in os.walk(os.getcwd()):
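    # Track each path relative to the starting directory so the report stays readable.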
    prefix = subroot[len(root) + 1:]
    for file in files:
        path = os.path.join(prefix, file)
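        # Hash the contents in fixed-size chunks so large files never sit in memory all at once.
        # MD5 is used only to compare contents here, not for anything security sensitive.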
        blockSize = 65536
        hash = hashlib.md5()
        with open(os.path.join(subroot, file), "rb") as handle:
            bytes = handle.read(blockSize)
            while len(bytes) > 0:
                hash.update(bytes)
                bytes = handle.read(blockSize)
        filesByDigest[hash.digest()].append(path)

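# Any digest shared by more than one path marks a group of files with identical contents.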
duplicates = list()

for fileList in filesByDigest.values():
    if len(fileList) != 1:
        duplicates.append(sorted(fileList))

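# Print each group of duplicates, sorted to keep the output deterministic.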
for fileList in sorted(duplicates):
    print(fileList)