| #!/usr/bin/env python3 |
| |
| # Copyright (C) 2017 Apple Inc. All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
# This tool processes Unicode Character Database (UCD) files to create the Unicode
# property tables (General_Category, Script, Script_Extensions and binary properties)
# used by YARR to implement RegExp Unicode property escapes, i.e. \p{...} and \P{...}.
| |
| import sys |
| import copy |
| import optparse |
| import os |
| import re |
| from hasher import stringHash |
| |
| header = """/* |
| * Copyright (C) 2017-2022 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
// DO NOT EDIT! - This file was generated by """ + __file__ + """
| """ |
| |
| |
| footer = """ |
| """ |
| |
RequiredUCDFiles = ["DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", "ScriptExtensions.txt", "Scripts.txt", "UnicodeData.txt", "emoji-data.txt"]
| UCDDirectoryPath = None |
| |
| SupportedBinaryProperties = [ |
| "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable", |
| "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded", |
| "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated", |
| "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation", |
| "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start", |
| "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase", |
| "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical", |
| "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase", |
| "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"] |
| |
| lastASCIICodePoint = 0x7f |
| firstUnicodeCodePoint = 0x80 |
| MaxUnicode = 0x10ffff |
| MaxBMP = 0xffff |
| commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE) |
| aliases = None |
| |
| |
| def openOrExit(path, mode): |
| try: |
| if sys.version_info.major >= 3: |
| return open(path, mode, encoding="UTF-8") |
| else: |
| return open(path, mode) |
| except IOError as e: |
| print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror)) |
| exit(1) |
| |
| |
| def openUCDFileOrExit(path): |
| if not UCDDirectoryPath: |
| exit(1) |
| |
| return openOrExit(os.path.join(UCDDirectoryPath, path), 'r') |
| |
| |
| def verifyUCDFilesExist(): |
| if not UCDDirectoryPath: |
| exit(1) |
| |
| missingFileCount = 0 |
| for file in RequiredUCDFiles: |
| fullPath = os.path.join(UCDDirectoryPath, file) |
| if not os.path.exists(fullPath): |
| print("Couldn't find UCD file {0} at {1}".format(file, fullPath)) |
| missingFileCount = missingFileCount + 1 |
| if missingFileCount: |
| exit(1) |
| |
| |
| def ceilingToPowerOf2(size): |
| powerOf2 = 1 |
| while size > powerOf2: |
| powerOf2 = powerOf2 << 1 |
| |
| return powerOf2 |
| |
| |
| class Aliases: |
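    """Property and property value aliases parsed from PropertyAliases.txt and PropertyValueAliases.txt.

    Holds the alias list for each long property name, plus forward and reverse maps between the
    General_Category and Script value names and their aliases.
    """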
| def __init__(self): |
| self.globalNameToAliases = {} |
| self.generalCategoryToAliases = {} |
| self.aliasToGeneralCategory = {} |
| self.scriptToAliases = {} |
| self.aliasToScript = {} |
| |
| def parsePropertyAliasesFile(self, file): |
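        """Parse PropertyAliases.txt lines of the form "abbreviation ; long_name [; other aliases]"."""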
| for line in file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| aliases = [fields[0].strip()] |
| fullName = fields[1].strip() |
| for otherAlias in fields[2:]: |
| aliases.append(otherAlias.strip()) |
| |
| if fullName in self.globalNameToAliases: |
| print("Error, already an alias for {}".format(fullName)) |
| else: |
| self.globalNameToAliases[fullName] = aliases |
| |
| def parsePropertyValueAliasesFile(self, file): |
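        """Parse PropertyValueAliases.txt lines of the form "property ; short_value ; long_value [; other aliases]".

        Only General_Category (gc) and Script (sc) value aliases are recorded; other properties are skipped.
        """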
| for line in file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| propertyType = fields[0].strip() |
| |
| if propertyType == "gc": |
| mapToModify = self.generalCategoryToAliases |
| reverseMapToModify = self.aliasToGeneralCategory |
| elif propertyType == "sc": |
| mapToModify = self.scriptToAliases |
| reverseMapToModify = self.aliasToScript |
| else: |
| continue |
| |
| primaryAlias = fields[1].strip() |
| fullName = fields[2].strip() |
| aliases = [primaryAlias] |
| for otherAlias in fields[3:]: |
| aliases.append(otherAlias.strip()) |
| |
| if fullName in mapToModify: |
| print("Error, already an {} alias for {}".format(propertyType, fullName)) |
| else: |
| mapToModify[fullName] = aliases |
| if reverseMapToModify != None: |
| reverseMapToModify[primaryAlias] = fullName |
| |
| def globalAliasesFor(self, name): |
| if name not in self.globalNameToAliases: |
| return [] |
| return self.globalNameToAliases[name] |
| |
| def generalCategoryAliasesFor(self, name): |
| if name not in self.generalCategoryToAliases: |
| return "" |
| return self.generalCategoryToAliases[name] |
| |
| def generalCategoryForAlias(self, name): |
| if name not in self.aliasToGeneralCategory: |
| return "" |
| return self.aliasToGeneralCategory[name] |
| |
| def scriptAliasesFor(self, name): |
| if name not in self.scriptToAliases: |
| return "" |
| return self.scriptToAliases[name] |
| |
| def scriptNameForAlias(self, name): |
| if name not in self.aliasToScript: |
| return "" |
| return self.aliasToScript[name] |
| |
| |
| class PropertyData: |
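    """The set of code points that match one property value.

    ASCII code points (<= 0x7f) are kept in matches/ranges and all other code points in
    unicodeMatches/unicodeRanges; adjacent code points are coalesced into ranges as they are
    added. dump() emits a createCharacterClass<index>() factory for the generated header and
    createAndDumpHashTable() emits the name -> character class index lookup tables.
    """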
| allPropertyData = [] |
| |
| def __init__(self, name): |
| self.name = name |
| self.aliases = [] |
| self.index = len(PropertyData.allPropertyData) |
| self.hasBMPCharacters = False |
| self.hasNonBMPCharacters = False |
| self.matches = [] |
| self.ranges = [] |
| self.unicodeMatches = [] |
| self.unicodeRanges = [] |
| self.codePointCount = 0 |
| PropertyData.allPropertyData.append(self) |
| |
| def setAliases(self, aliases): |
| self.aliases = aliases |
| |
| def makeCopy(self): |
| result = copy.deepcopy(self) |
| result.index = len(PropertyData.allPropertyData) |
| PropertyData.allPropertyData.append(result) |
| return result |
| |
| def getIndex(self): |
| return self.index |
| |
| def getCreateFuncName(self): |
| return "createCharacterClass{}".format(self.index) |
| |
| def addMatch(self, codePoint): |
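        """Add a single code point to this property.

        Code points adjacent to the last match or range are coalesced into a range. Additions that
        arrive out of ascending order fall back to the slower addMatchUnordered() path.
        """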
| if codePoint <= MaxBMP: |
| self.hasBMPCharacters = True |
| else: |
| self.hasNonBMPCharacters = True |
| if codePoint <= lastASCIICodePoint: |
| if (len(self.matches) and self.matches[-1] > codePoint) or (len(self.ranges) and self.ranges[-1][1] > codePoint): |
| self.addMatchUnordered(codePoint) |
| return |
| |
| self.codePointCount = self.codePointCount + 1 |
| if len(self.matches) and self.matches[-1] == (codePoint - 1): |
| lowCodePoint = self.matches.pop() |
| self.ranges.append((lowCodePoint, codePoint)) |
| elif len(self.ranges) and self.ranges[-1][1] == (codePoint - 1): |
| priorRange = self.ranges.pop() |
| self.ranges.append((priorRange[0], codePoint)) |
| else: |
| self.matches.append(codePoint) |
| else: |
| if (len(self.unicodeMatches) and self.unicodeMatches[-1] > codePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > codePoint): |
| self.addMatchUnordered(codePoint) |
| return |
| |
| self.codePointCount = self.codePointCount + 1 |
| if len(self.unicodeMatches) and self.unicodeMatches[-1] == (codePoint - 1): |
| lowCodePoint = self.unicodeMatches.pop() |
| self.unicodeRanges.append((lowCodePoint, codePoint)) |
| elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (codePoint - 1): |
| priorRange = self.unicodeRanges.pop() |
| self.unicodeRanges.append((priorRange[0], codePoint)) |
| else: |
| self.unicodeMatches.append(codePoint) |
| |
| def addRange(self, lowCodePoint, highCodePoint): |
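        """Add an inclusive range of code points, splitting it at 0x7f when it straddles the ASCII boundary."""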
| if lowCodePoint <= MaxBMP: |
| self.hasBMPCharacters = True |
| if highCodePoint > MaxBMP: |
| self.hasNonBMPCharacters = True |
| if highCodePoint <= lastASCIICodePoint: |
| if (len(self.matches) and self.matches[-1] > lowCodePoint) or (len(self.ranges) and self.ranges[-1][1] > lowCodePoint): |
| self.addRangeUnordered(lowCodePoint, highCodePoint) |
| return |
| |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| if len(self.matches) and self.matches[-1] == (lowCodePoint - 1): |
| lowCodePoint = self.matches.pop() |
| elif len(self.ranges) and self.ranges[-1][1] == (lowCodePoint - 1): |
| priorRange = self.ranges.pop() |
| lowCodePoint = priorRange[0] |
| self.ranges.append((lowCodePoint, highCodePoint)) |
| elif lowCodePoint <= lastASCIICodePoint: |
| if lowCodePoint == lastASCIICodePoint: |
| self.addMatch(lowCodePoint) |
| else: |
| self.addRange(lowCodePoint, lastASCIICodePoint) |
| if highCodePoint == firstUnicodeCodePoint: |
| self.addMatch(highCodePoint) |
| else: |
| self.addRange(firstUnicodeCodePoint, highCodePoint) |
| else: |
| if (len(self.unicodeMatches) and self.unicodeMatches[-1] > lowCodePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > lowCodePoint): |
| self.addRangeUnordered(lowCodePoint, highCodePoint) |
| return |
| |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| if len(self.unicodeMatches) and self.unicodeMatches[-1] == (lowCodePoint - 1): |
| lowCodePoint = self.unicodeMatches.pop() |
| self.unicodeRanges.append((lowCodePoint, highCodePoint)) |
| elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (lowCodePoint - 1): |
| priorRange = self.unicodeRanges.pop() |
| self.unicodeRanges.append((priorRange[0], highCodePoint)) |
| else: |
| self.unicodeRanges.append((lowCodePoint, highCodePoint)) |
| |
| def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges): |
| if codePoint in matches: |
| return |
| insertLocation = None |
| lowCodePoint = None |
| highCodePoint = None |
| for idx in range(len(matches)): |
| match = matches[idx] |
| if codePoint == match + 1: |
| lowCodePoint = match |
| if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1: |
| highCodePoint = matches[idx + 1] |
| del matches[idx + 1] |
| self.codePointCount = self.codePointCount - 1 |
| else: |
| highCodePoint = codePoint |
| del matches[idx] |
| self.codePointCount = self.codePointCount - 1 |
| break |
| elif codePoint == match - 1: |
| lowCodePoint = codePoint |
| highCodePoint = match |
| del matches[idx] |
| self.codePointCount = self.codePointCount - 1 |
| break |
| elif codePoint < match: |
| insertLocation = idx |
| break |
| |
| if insertLocation is None: |
| insertLocation = len(matches) |
| if lowCodePoint is None: |
| lowCodePoint = codePoint |
| highCodePoint = codePoint |
| |
| for idx in range(len(ranges)): |
| cur_range = ranges[idx] |
| if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]: |
| return |
| if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1): |
| while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1): |
| cur_range = ranges[idx] |
| lowCodePoint = min(lowCodePoint, cur_range[0]) |
| highCodePoint = max(highCodePoint, cur_range[1]) |
| del ranges[idx] |
| self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1 |
| |
| ranges.insert(idx, (lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| return |
| elif highCodePoint < cur_range[0]: |
| if lowCodePoint != highCodePoint: |
| ranges.insert(idx, (lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| return |
| break |
| |
| if lowCodePoint != highCodePoint: |
| ranges.append((lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| return |
| |
| if insertLocation is not None: |
| matches.insert(insertLocation, codePoint) |
| self.codePointCount = self.codePointCount + 1 |
| |
| def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges): |
| if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]: |
| for idx in range(len(matches)): |
| match = matches[idx] |
| if lowCodePoint <= match and highCodePoint >= match: |
| while idx < len(matches) and highCodePoint >= matches[idx]: |
| del matches[idx] |
| self.codePointCount = self.codePointCount - 1 |
| if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1: |
| highCodePoint = matches[idx + 1] |
| del matches[idx + 1] |
| self.codePointCount = self.codePointCount - 1 |
| break |
| elif lowCodePoint == match + 1: |
| lowCodePoint = match |
| while idx < len(matches) and highCodePoint >= matches[idx]: |
| del matches[idx] |
| self.codePointCount = self.codePointCount - 1 |
| |
| if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1: |
| highCodePoint = matches[idx + 1] |
| del matches[idx + 1] |
| self.codePointCount = self.codePointCount - 1 |
| break |
| elif highCodePoint == match - 1: |
| highCodePoint = match |
| del matches[idx] |
| self.codePointCount = self.codePointCount - 1 |
| break |
| elif highCodePoint < match: |
| break |
| |
| for idx in range(len(ranges)): |
| cur_range = ranges[idx] |
| if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]: |
| return |
| if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1): |
| while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1): |
| cur_range = ranges[idx] |
| lowCodePoint = min(lowCodePoint, cur_range[0]) |
| highCodePoint = max(highCodePoint, cur_range[1]) |
| del ranges[idx] |
| self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1 |
| |
| ranges.insert(idx, (lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| return |
| elif highCodePoint < cur_range[0]: |
| ranges.insert(idx, (lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| return |
| |
| ranges.append((lowCodePoint, highCodePoint)) |
| self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1 |
| |
| def addMatchUnordered(self, codePoint): |
| if codePoint <= lastASCIICodePoint: |
| self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges) |
| else: |
| self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges) |
| |
| def addRangeUnordered(self, lowCodePoint, highCodePoint): |
| if highCodePoint <= lastASCIICodePoint: |
| self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges) |
| elif lowCodePoint >= firstUnicodeCodePoint: |
| self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges) |
| else: |
| if lowCodePoint == lastASCIICodePoint: |
| self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges) |
| else: |
                # The ASCII portion of a straddling range belongs in the ASCII matches/ranges lists.
                self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.matches, self.ranges)
| if highCodePoint == firstUnicodeCodePoint: |
| self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges) |
| else: |
| self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges) |
| |
| def removeMatchFromRanges(self, codePoint, ranges): |
| for idx in range(len(ranges)): |
| cur_range = ranges[idx] |
| if cur_range[0] <= codePoint and codePoint <= cur_range[1]: |
| ranges.pop(idx) |
| if cur_range[0] < codePoint and codePoint < cur_range[1]: |
| lowRange = (cur_range[0], codePoint - 1) |
| highRange = (codePoint + 1, cur_range[1]) |
| # Since list.insert inserts before the index given, handle the high range first |
| if highRange[0] == highRange[1]: |
| self.addMatchUnordered(highRange[0]) |
| else: |
| ranges.insert(idx, highRange) |
| if lowRange[0] == lowRange[1]: |
| self.addMatchUnordered(lowRange[0]) |
| else: |
| ranges.insert(idx, lowRange) |
| else: |
| if cur_range[0] == codePoint: |
| cur_range = (codePoint + 1, cur_range[1]) |
| else: |
| cur_range = (cur_range[0], codePoint - 1) |
| if cur_range[0] == cur_range[1]: |
| self.addMatchUnordered(cur_range[0]) |
| else: |
| ranges.insert(idx, cur_range) |
| self.codePointCount = self.codePointCount - 1 |
| return |
| |
| def removeMatch(self, codePoint): |
| if codePoint <= lastASCIICodePoint: |
| if codePoint in self.matches: |
| self.matches.remove(codePoint) |
| self.codePointCount = self.codePointCount - 1 |
| else: |
| self.removeMatchFromRanges(codePoint, self.ranges) |
| else: |
| if codePoint in self.unicodeMatches: |
| self.unicodeMatches.remove(codePoint) |
| self.codePointCount = self.codePointCount - 1 |
| else: |
| self.removeMatchFromRanges(codePoint, self.unicodeRanges) |
| |
| def dumpMatchData(self, file, valuesPerLine, dataList, formatter): |
| valuesThisLine = 0 |
| firstValue = True |
| |
| file.write("{") |
| for elem in dataList: |
| if firstValue: |
| firstValue = False |
| else: |
| file.write(", ") |
| valuesThisLine = valuesThisLine + 1 |
| if valuesThisLine > valuesPerLine: |
| file.write("\n ") |
| valuesThisLine = 1 |
| formatter(file, elem) |
| file.write("}") |
| |
| def dump(self, file, commaAfter): |
| file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName())) |
| file.write(" // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount)) |
| file.write(" auto characterClass = makeUnique<CharacterClass>(\n") |
| file.write(" std::initializer_list<UChar32>(") |
| self.dumpMatchData(file, 8, self.matches, lambda file, match: (file.write("{0:0=#4x}".format(match)))) |
| file.write("),\n") |
| file.write(" std::initializer_list<CharacterRange>(") |
| self.dumpMatchData(file, 4, self.ranges, lambda file, range: (file.write("{{{0:0=#4x}, {1:0=#4x}}}".format(range[0], range[1])))) |
| file.write("),\n") |
| file.write(" std::initializer_list<UChar32>(") |
| self.dumpMatchData(file, 8, self.unicodeMatches, lambda file, match: (file.write("{0:0=#6x}".format(match)))) |
| file.write("),\n") |
| file.write(" std::initializer_list<CharacterRange>(") |
| self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1])))) |
| file.write("),\n") |
| |
| file.write(" CharacterClassWidths::{});\n".format(("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(self.hasNonBMPCharacters) * 2 + int(self.hasBMPCharacters)])) |
| file.write(" return characterClass;\n}\n\n") |
| |
| @classmethod |
| def dumpAll(cls, file): |
| for propertyData in cls.allPropertyData: |
| propertyData.dump(file, propertyData != cls.allPropertyData[-1]) |
| |
| file.write("using CreateCharacterClass = std::unique_ptr<CharacterClass> (*)();\n") |
| file.write("static CreateCharacterClass createFunctions[{}] = {{\n ".format(len(cls.allPropertyData))) |
| functionsOnThisLine = 0 |
| for propertyData in cls.allPropertyData: |
| file.write(" {},".format(propertyData.getCreateFuncName())) |
| functionsOnThisLine = functionsOnThisLine + 1 |
| if functionsOnThisLine == 4: |
| file.write("\n ") |
| functionsOnThisLine = 0 |
| |
| file.write("};\n\n") |
| |
| @classmethod |
    def createAndDumpHashTable(cls, file, propertyDict, tablePrefix):
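        """Write the HashIndex / HashValue / HashTable definitions used to look up property names.

        The index table is sized to a power of two at least twice the number of property names.
        Each name and alias is placed at stringHash(key) modulo the table size; colliding keys are
        chained through overflow slots appended past the end of the table via the "next" field.
        Each value entry maps a name to the index of its createCharacterClass function.
        """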
| propertyKeys = propertyDict.keys() |
| numberOfKeys = len(propertyKeys) |
| hashSize = ceilingToPowerOf2(numberOfKeys * 2) |
| hashMask = hashSize - 1 |
| hashTable = [None] * hashSize |
| valueTable = [] |
| tableSize = hashSize |
| |
| keyValuesToHash = [] |
| for propertyName in propertyKeys: |
| propertyData = propertyDict[propertyName] |
| keyValuesToHash.append((propertyName, propertyData.getIndex())) |
| for alias in propertyData.aliases: |
| keyValuesToHash.append((alias, propertyData.getIndex())) |
| |
| for keyValue in keyValuesToHash: |
| key = keyValue[0] |
| hash = stringHash(key) % hashSize |
| while hashTable[hash] is not None: |
| if hashTable[hash][1] is not None: |
| hash = hashTable[hash][1] |
| else: |
| hashTable[hash] = (hashTable[hash][0], tableSize) |
| hashTable.append(None) |
| hash = tableSize |
| tableSize = tableSize + 1 |
| |
| hashTable[hash] = (len(valueTable), None) |
| valueTable.append((key, keyValue[1])) |
| |
| file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(hashTable))) |
| |
| for tableIndex in hashTable: |
| value = -1 |
| next = -1 |
| if tableIndex is not None: |
| value = tableIndex[0] |
| if tableIndex[1] is not None: |
| next = tableIndex[1] |
| |
| file.write(" {{ {}, {} }},\n".format(value, next)) |
| |
| file.write("};\n\n") |
| |
| file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(valueTable))) |
| for value in valueTable: |
| file.write(" {{ \"{}\", {} }},\n".format(value[0], value[1])) |
| file.write("};\n\n") |
| |
| file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix)) |
| file.write(" {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(valueTable), hashMask, tablePrefix, tablePrefix)) |
| |
| |
| class Scripts: |
| def __init__(self): |
| self.allPropertyData = [] |
| self.scriptsByName = {} |
| self.scriptExtensionsByName = {} |
| self.unknownScript = PropertyData("Unknown") |
| self.unknownScript.setAliases(aliases.scriptAliasesFor("Unknown")) |
| self.allPropertyData.append(self.unknownScript) |
| self.scriptsParsed = False |
| |
| def parseScriptsFile(self, file): |
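        """Parse Scripts.txt lines of the form "code_point(s) ; Script_Name", creating one PropertyData per script.

        Code points not assigned to any script are accumulated into the Unknown script.
        """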
| currentScriptName = None |
| currentPropertyData = None |
        # To calculate the Unknown script, we gather all the codePoints assigned to scripts as ranges,
        # sort them, and then walk the list to create the inverse of the assigned ranges.
| assignedCodePointRanges = [] |
| |
| for line in file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| codePoints = fields[0].strip() |
| scriptName = fields[1].strip() |
| |
| if scriptName != currentScriptName: |
| currentScriptName = scriptName |
| currentPropertyData = PropertyData(scriptName) |
| currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName)) |
| self.allPropertyData.append(currentPropertyData) |
| self.scriptsByName[scriptName] = currentPropertyData |
| |
| dotDot = codePoints.find("..") |
| if dotDot == -1: |
| codePoint = int(codePoints, 16) |
| currentPropertyData.addMatch(codePoint) |
| assignedCodePointRanges.append((codePoint, codePoint)) |
| else: |
| lowCodePoint = int(codePoints[:dotDot], 16) |
| highCodePoint = int(codePoints[dotDot + 2:], 16) |
| currentPropertyData.addRange(lowCodePoint, highCodePoint) |
| assignedCodePointRanges.append((lowCodePoint, highCodePoint)) |
| |
| assignedCodePointRanges.sort(key=lambda range: range[0]) |
| lastAssignedCodePoint = 0 |
| |
| for range in assignedCodePointRanges: |
| if range[0] - lastAssignedCodePoint > 1: |
| if range[0] - lastAssignedCodePoint == 2: |
| self.unknownScript.addMatch(lastAssignedCodePoint + 1) |
| else: |
| self.unknownScript.addRange(lastAssignedCodePoint + 1, range[0] - 1) |
| lastAssignedCodePoint = range[1] |
| |
| if lastAssignedCodePoint < MaxUnicode: |
| if MaxUnicode - lastAssignedCodePoint == 1: |
| self.unknownScript.addMatch(MaxUnicode) |
| else: |
| self.unknownScript.addRange(lastAssignedCodePoint + 1, MaxUnicode) |
| |
| self.scriptsParsed = True |
| |
| def parseScriptExtensionsFile(self, file): |
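        """Parse ScriptExtensions.txt lines of the form "code_point(s) ; space-separated script aliases".

        Every script's extension set starts out as a copy of its Scripts.txt set. Code points listed
        here are added to each named script's extension set and removed from the Common and Inherited
        copies, since an explicit Script_Extensions entry overrides those defaults. Scripts with no
        extension entries reuse their Scripts.txt data unchanged.
        """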
| currentPropertyData = None |
| |
| if not self.scriptsParsed: |
| print("Error: parsing ScriptExtensions.txt before Scripts.txt") |
| exit(1) |
| |
        commonScriptExtensionPropertyData = None
        inheritedScriptExtensionPropertyData = None

        scriptName = "Common"
        if scriptName in self.scriptsByName:
            commonScriptExtensionPropertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            commonScriptExtensionPropertyData = PropertyData(scriptName)
            commonScriptExtensionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
            self.allPropertyData.append(commonScriptExtensionPropertyData)
        self.scriptExtensionsByName[scriptName] = commonScriptExtensionPropertyData
| |
| scriptName = "Inherited" |
| if scriptName in self.scriptsByName: |
| inheritedScriptExtensionPropertyData = self.scriptsByName[scriptName].makeCopy() |
| else: |
| inheritedScriptExtensionPropertyData = PropertyData(scriptName) |
| inheritedScriptExtensionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName)) |
| self.allPropertyData.append(inheritedScriptExtensionPropertyData) |
| self.scriptExtensionsByName[scriptName] = inheritedScriptExtensionPropertyData |
| |
| for line in file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| codePoints = fields[0].strip() |
| scriptAliasList = fields[1].strip() |
| |
| for scriptAlias in scriptAliasList.split(' '): |
| scriptName = aliases.scriptNameForAlias(scriptAlias) |
| currentPropertyData = None |
| |
| if scriptName not in self.scriptExtensionsByName: |
| currentPropertyData = self.scriptsByName[scriptName].makeCopy() |
| self.allPropertyData.append(currentPropertyData) |
| self.scriptExtensionsByName[scriptName] = currentPropertyData |
| else: |
| currentPropertyData = self.scriptExtensionsByName[scriptName] |
| |
| dotDot = codePoints.find("..") |
| if dotDot == -1: |
| codePoint = int(codePoints, 16) |
| currentPropertyData.addMatch(codePoint) |
                    commonScriptExtensionPropertyData.removeMatch(codePoint)
| inheritedScriptExtensionPropertyData.removeMatch(codePoint) |
| else: |
| lowCodePoint = int(codePoints[:dotDot], 16) |
| highCodePoint = int(codePoints[dotDot + 2:], 16) |
| currentPropertyData.addRange(lowCodePoint, highCodePoint) |
| for codePoint in range(lowCodePoint, highCodePoint + 1): |
                        commonScriptExtensionPropertyData.removeMatch(codePoint)
| inheritedScriptExtensionPropertyData.removeMatch(codePoint) |
| |
| # For the scripts that don't have any additional extension codePoints, copy the script |
| # data to the script extension with the same name |
| for scriptName, propertyData in self.scriptsByName.items(): |
| if scriptName not in self.scriptExtensionsByName: |
| self.scriptExtensionsByName[scriptName] = propertyData |
| |
| def dump(self, file): |
| file.write("// Scripts:\n") |
| PropertyData.createAndDumpHashTable(file, self.scriptsByName, "script") |
| |
| file.write("// Script_Extensions:\n") |
| PropertyData.createAndDumpHashTable(file, self.scriptExtensionsByName, "scriptExtension") |
| |
| |
| class GeneralCategory: |
| def __init__(self, file): |
| self.file = file |
| self.allPropertyData = [] |
| self.propertyDataByCategory = {} |
| self.createSpecialPropertyData("Any", (0, MaxUnicode)) |
| self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint)) |
| self.assignedPropertyData = self.createSpecialPropertyData("Assigned") |
        self.unassignedPropertyData = self.findPropertyGroupFor("Cn")[1]
| self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1] |
| self.lastAddedCodePoint = 0 |
| |
| def createSpecialPropertyData(self, name, range=None): |
| propertyData = PropertyData(name) |
| self.allPropertyData.append(propertyData) |
| self.propertyDataByCategory[name] = propertyData |
| if range: |
| propertyData.addRange(range[0], range[1]) |
| |
| return propertyData |
| |
| def findPropertyGroupFor(self, categoryAlias): |
| category = aliases.generalCategoryForAlias(categoryAlias) |
| allCategoryAliases = aliases.generalCategoryAliasesFor(category) |
| categoryGroupAlias = categoryAlias[0] |
| categoryGroup = aliases.generalCategoryForAlias(categoryGroupAlias) |
| allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup) |
| groupPropertyData = None |
| propertyData = None |
| |
| if categoryGroup not in self.propertyDataByCategory: |
| groupPropertyData = PropertyData(categoryGroup) |
| groupPropertyData.setAliases(allCategoryGroupAlias) |
| self.allPropertyData.append(groupPropertyData) |
| self.propertyDataByCategory[categoryGroup] = groupPropertyData |
| else: |
| groupPropertyData = self.propertyDataByCategory[categoryGroup] |
| |
| if category not in self.propertyDataByCategory: |
| propertyData = PropertyData(category) |
| propertyData.setAliases(allCategoryAliases) |
| self.allPropertyData.append(propertyData) |
| self.propertyDataByCategory[category] = propertyData |
| else: |
| propertyData = self.propertyDataByCategory[category] |
| |
| return (groupPropertyData, propertyData) |
| |
| def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None): |
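        """Record the next code point (or range) for the given General_Category alias.

        Code points must be added in ascending order; any gap since the last added code point is
        recorded as Cn (Unassigned). Ll, Lt and Lu also feed the LC (Cased_Letter) group, and every
        listed code point is added to the Assigned property.
        """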
| if codePoint - self.lastAddedCodePoint > 1: |
| propertyDatas = self.findPropertyGroupFor("Cn") |
| if codePoint - self.lastAddedCodePoint == 2: |
| propertyDatas[0].addMatch(self.lastAddedCodePoint + 1) |
| propertyDatas[1].addMatch(self.lastAddedCodePoint + 1) |
| else: |
| propertyDatas[0].addRange(self.lastAddedCodePoint + 1, codePoint - 1) |
| propertyDatas[1].addRange(self.lastAddedCodePoint + 1, codePoint - 1) |
| |
| propertyDatas = self.findPropertyGroupFor(categoryAlias) |
| if highCodePoint: |
| propertyDatas[0].addRange(codePoint, highCodePoint) |
| propertyDatas[1].addRange(codePoint, highCodePoint) |
| if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu": |
| self.casedLetterPropertyData.addRange(codePoint, highCodePoint) |
| self.assignedPropertyData.addRange(codePoint, highCodePoint) |
| |
| self.lastAddedCodePoint = highCodePoint |
| else: |
| propertyDatas[0].addMatch(codePoint) |
| propertyDatas[1].addMatch(codePoint) |
| if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu": |
| self.casedLetterPropertyData.addMatch(codePoint) |
| self.assignedPropertyData.addMatch(codePoint) |
| |
| self.lastAddedCodePoint = codePoint |
| |
| def parse(self): |
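        """Parse UnicodeData.txt, where each line is "code_point;name;general_category;..." in code point order.

        A pair of consecutive lines whose name fields end in ", First>" and ", Last>" describes one
        large range (for example the CJK ideograph and Hangul syllable blocks). Gaps between lines,
        and everything after the last listed code point, are recorded as Cn (Unassigned).
        """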
| lastLineFirstOfRange = None |
| lastLineCodePoint = 0 |
| for line in self.file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| codePoint = int(fields[0].strip(), 16) |
| description = fields[1].strip() |
| categoryAlias = fields[2].strip() |
| |
| if lastLineFirstOfRange: |
| if description[-5:-1] == "Last": |
| self.addNextCodePoints(categoryAlias, lastLineFirstOfRange, codePoint) |
| lastLineFirstOfRange = None |
| continue |
| else: |
| print("Malformed First..Last pair of lines") |
| |
| if description[-6:-1] == "First": |
| lastLineFirstOfRange = codePoint |
| continue |
| |
| self.addNextCodePoints(categoryAlias, codePoint) |
| |
| if self.lastAddedCodePoint < MaxUnicode: |
| propertyDatas = self.findPropertyGroupFor("Cn") |
| if MaxUnicode - self.lastAddedCodePoint == 1: |
| propertyDatas[0].addMatch(MaxUnicode) |
| propertyDatas[1].addMatch(MaxUnicode) |
| else: |
| propertyDatas[0].addRange(self.lastAddedCodePoint + 1, MaxUnicode) |
| propertyDatas[1].addRange(self.lastAddedCodePoint + 1, MaxUnicode) |
| |
| def dump(self, file): |
| file.write("// General_Category:\n") |
| PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory") |
| |
| |
| class BinaryProperty: |
| def __init__(self): |
| self.allPropertyData = [] |
| self.propertyDataByProperty = {} |
| |
| def parsePropertyFile(self, file): |
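        """Parse a UCD file whose lines have the form "code_point(s) ; Property_Name".

        This handles PropList.txt, the derived property files and emoji-data.txt; only properties
        listed in SupportedBinaryProperties are collected, everything else is skipped.
        """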
| currentPropertyName = None |
| currentPropertyData = None |
| |
| for line in file: |
| line = line.split('#', 1)[0] |
| line = line.rstrip() |
| if (not len(line)): |
| continue |
| |
| fields = line.split(';') |
| if (not fields): |
| continue |
| |
| codePoints = fields[0].strip() |
| propertyName = fields[1].strip() |
| |
| if propertyName != currentPropertyName: |
| if propertyName not in SupportedBinaryProperties: |
| continue |
| |
| currentPropertyName = propertyName |
| currentPropertyData = PropertyData(propertyName) |
| currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName)) |
| self.allPropertyData.append(currentPropertyData) |
| self.propertyDataByProperty[propertyName] = currentPropertyData |
| |
| dotDot = codePoints.find("..") |
| if dotDot == -1: |
| currentPropertyData.addMatch(int(codePoints, 16)) |
| else: |
| currentPropertyData.addRange(int(codePoints[:dotDot], 16), int(codePoints[dotDot + 2:], 16)) |
| |
| def dump(self, file): |
| file.write("// binary properties:\n") |
| PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty") |
| |
| if __name__ == "__main__": |
| parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>") |
| (options, args) = parser.parse_args() |
| |
| if len(args) != 2: |
| parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>") |
| |
| UCDDirectoryPath = args[0] |
    unicodePropertyDataHPath = args[1]
| |
| verifyUCDFilesExist() |
| |
| propertyAliasesFile = openUCDFileOrExit("PropertyAliases.txt") |
| propertyValueAliasesFile = openUCDFileOrExit("PropertyValueAliases.txt") |
| scriptsFile = openUCDFileOrExit("Scripts.txt") |
| scriptExtensionsFile = openUCDFileOrExit("ScriptExtensions.txt") |
| unicodeDataFile = openUCDFileOrExit("UnicodeData.txt") |
| derivedBinaryPropertiesFile = openUCDFileOrExit("DerivedBinaryProperties.txt") |
| derivedCorePropertiesFile = openUCDFileOrExit("DerivedCoreProperties.txt") |
| derivedNormalizationPropertiesFile = openUCDFileOrExit("DerivedNormalizationProps.txt") |
| propListFile = openUCDFileOrExit("PropList.txt") |
| emojiDataFile = openUCDFileOrExit("emoji-data.txt") |
| |
| aliases = Aliases() |
| |
    propertyDataHFile = openOrExit(unicodePropertyDataHPath, "w")
| |
| propertyDataHFile.write(header) |
| |
| aliases.parsePropertyAliasesFile(propertyAliasesFile) |
| aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile) |
| |
| generalCategory = GeneralCategory(unicodeDataFile) |
| generalCategory.parse() |
| |
| binaryProperty = BinaryProperty() |
| binaryProperty.parsePropertyFile(derivedBinaryPropertiesFile) |
| binaryProperty.parsePropertyFile(derivedCorePropertiesFile) |
| binaryProperty.parsePropertyFile(derivedNormalizationPropertiesFile) |
| binaryProperty.parsePropertyFile(propListFile) |
| binaryProperty.parsePropertyFile(emojiDataFile) |
| |
| scripts = Scripts() |
| scripts.parseScriptsFile(scriptsFile) |
| scripts.parseScriptExtensionsFile(scriptExtensionsFile) |
| |
| PropertyData.dumpAll(propertyDataHFile) |
| generalCategory.dump(propertyDataHFile) |
| binaryProperty.dump(propertyDataHFile) |
| scripts.dump(propertyDataHFile) |
| |
| propertyDataHFile.write(footer) |
| |
| exit(0) |