| #! /usr/bin/env python |
| |
| # Copyright (C) 2016-2019 Apple Inc. All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| # This tool processes the Unicode Character Database file CaseFolding.txt to create |
| # the canonicalization table as described in the ECMAScript 6 standard in section |
| # "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2. |
| |
| import optparse |
| import os |
| import re |
| import sys |
| |
# Text written verbatim at the top of the generated C++ file: the license
# block, a "generated file" notice, the includes the tables rely on, and the
# opening of the JSC::Yarr namespaces (closed again by `footer`).
header = """/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NOT EDIT! - This file was generated by generateYarrCanonicalizeUnicode

#include "config.h"
#include "YarrCanonicalize.h"

namespace JSC { namespace Yarr {

"""

# Text written verbatim at the end of the generated file; it closes the two
# namespaces that `header` opens.
footer = """} } // JSC::Yarr
"""
| |
# Largest code point the generated tables cover; the tables describe every
# code point from 0 up to and including this value.
MaxUnicode = 0x10FFFF

# Matches a "<code>; <status>; <mapping>" line whose status is C or S and
# captures the two hexadecimal fields as the named groups `code` and
# `mapping`. Lines with any other status letter do not match.
commonAndSimpleLinesRE = re.compile(
    r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)",
    flags=re.IGNORECASE,
)
| |
def openOrExit(path, mode):
    """Open `path` as a text file, or report the failure and exit.

    When the file is opened for writing, appending or exclusive creation,
    any missing parent directories are created first. On Python 3 the file
    is opened with UTF-8 encoding; on Python 2 the platform default is used.

    Args:
        path: Path of the file to open.
        mode: Mode string passed to open(), e.g. "r" or "w".

    Returns:
        The opened file object.

    Raises:
        SystemExit: with status 1, after printing a diagnostic, if a
            directory cannot be created or the file cannot be opened.
    """
    try:
        dirname = os.path.dirname(path)
        # Only modes that can create the file need its directory to exist;
        # creating directories for a read would just leave empty clutter.
        createsFile = any(flag in mode for flag in "wax")
        # A bare file name has an empty dirname, and os.makedirs("") raises,
        # so skip directory creation when there is no directory component.
        if createsFile and dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except (IOError, OSError) as e:
        # On Python 2 os.makedirs raises OSError, which is unrelated to
        # IOError there; on Python 3 the two names are the same class.
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        # sys.exit works even when the site module's exit() helper is absent.
        sys.exit(1)
| |
class Canonicalize:
    """Build case-canonicalization tables from case-folding data.

    Code points are grouped by the code point they fold to. createTables()
    then classifies every group and writes the result as C++ arrays.
    """

    def __init__(self):
        # Maps a folding target to the list of code points that fold to it.
        self.canonicalGroups = {}

    def addMapping(self, code, mapping):
        """Record that code point `code` folds to code point `mapping`."""
        self.canonicalGroups.setdefault(mapping, []).append(code)

    def readCaseFolding(self, file):
        """Populate the groups from an iterable of case-folding text lines.

        Everything from the first '#' on a line is discarded. Lines that are
        then empty, or that commonAndSimpleLinesRE does not match, are
        skipped. Every code point in 0..MaxUnicode that never appeared as
        the source of a matched line is finally recorded as folding to
        itself.
        """
        explicitlyMapped = set()
        for rawLine in file:
            content = rawLine.split('#', 1)[0].rstrip()
            if not content:
                continue

            match = commonAndSimpleLinesRE.match(content)
            if match is None:
                continue

            source = int(match.group('code'), 16)
            target = int(match.group('mapping'), 16)
            explicitlyMapped.add(source)
            self.addMapping(source, target)

        # Code points without an explicit entry are their own folding target.
        for codePoint in range(MaxUnicode + 1):
            if codePoint not in explicitlyMapped:
                self.addMapping(codePoint, codePoint)

    def createTables(self, file):
        """Classify every group and write the C++ tables to `file`.

        Each code point is tagged with a "<type>:<value>" string:
          * a group of one code point is CanonicalizeUnique, value 0;
          * a group of more than two is CanonicalizeSet, the value being the
            index of the group's array in the emitted set table;
          * a pair of adjacent code points is CanonicalizeAlternatingAligned
            when the lower one is even, otherwise
            CanonicalizeAlternatingUnaligned, value 0;
          * any other pair is CanonicalizeRangeLo (lower member) and
            CanonicalizeRangeHi (higher member), the value being their
            distance.
        Consecutive code points carrying the same tag are merged into one
        range entry. Groups with more than one member are sorted in place.
        """
        typeInfo = [""] * (MaxUnicode + 1)
        characterSets = []

        # Pass 1: tag every code point according to the shape of its group.
        for mapping in sorted(self.canonicalGroups):
            members = self.canonicalGroups[mapping]
            memberCount = len(members)
            if memberCount == 1:
                typeInfo[members[0]] = "CanonicalizeUnique:0"
                continue

            members.sort()
            if memberCount > 2:
                setTag = "CanonicalizeSet:%d" % len(characterSets)
                for member in members:
                    typeInfo[member] = setTag
                characterSets.append(members)
                continue

            low, high = members[0], members[1]
            delta = high - low
            if delta != 1:
                typeInfo[low] = "CanonicalizeRangeLo:%d" % delta
                typeInfo[high] = "CanonicalizeRangeHi:%d" % delta
            else:
                if low & 1:
                    alternatingTag = "CanonicalizeAlternatingUnaligned:0"
                else:
                    alternatingTag = "CanonicalizeAlternatingAligned:0"
                typeInfo[low] = typeInfo[high] = alternatingTag

        # Pass 2: run-length encode the tags into (begin, end, tag) ranges.
        # The loop runs one step past MaxUnicode so the final run is flushed.
        ranges = []
        runStart = 0
        for codePoint in range(1, MaxUnicode + 2):
            if codePoint <= MaxUnicode and typeInfo[codePoint] == typeInfo[runStart]:
                continue
            ranges.append((runStart, codePoint - 1, typeInfo[runStart]))
            runStart = codePoint

        # Pass 3: emit one zero-terminated array per multi-member set ...
        for index, members in enumerate(characterSets):
            characters = "".join("0x{character:04x}, ".format(character=ch) for ch in members)
            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=characters))

        # ... then the table that points at those arrays ...
        file.write("\n")
        file.write("static constexpr size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")
        for setNumber, _ in enumerate(characterSets):
            file.write(" unicodeCharacterSet{setNumber:d},\n".format(setNumber=setNumber))
        file.write("};\n")
        file.write("\n")

        # ... and finally the range table covering 0..MaxUnicode.
        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(ranges)))
        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")
        for begin, end, tag in ranges:
            typeAndValue = tag.split(":")
            file.write(" {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=begin, end=end, value=int(typeAndValue[1]), type=typeAndValue[0]))
        file.write("};\n")
        file.write("\n")
| |
| |
if __name__ == "__main__":
    # Command-line entry point: read the case-folding data named by the first
    # argument and write the generated tables to the file named by the second.
    parser = optparse.OptionParser(usage="usage: %prog <CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>")

    caseFoldingTxtPath, canonicalizeHPath = args

    canonicalize = Canonicalize()

    # Parse the input completely before touching the output, so a failure
    # while reading does not leave a truncated output file behind. The
    # `with` blocks close each file even if parsing or generation raises.
    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
        canonicalize.readCaseFolding(caseFoldingTxtFile)

    with openOrExit(canonicalizeHPath, "w") as canonicalizeHFile:
        canonicalizeHFile.write(header)
        canonicalize.createTables(canonicalizeHFile)
        canonicalizeHFile.write(footer)

    # sys.exit does not depend on the site module's exit() helper.
    sys.exit(0)