Source/JavaScriptCore/yarr/generateYarrUnicodePropertyTables.py - WebKit - Git at Google

 #!/usr/bin/env python3

 # Copyright (C) 2017 Apple Inc. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

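 # This tool processes the Unicode Character Database file CaseFolding.txt to create
 # a canonicalization table as described in the ECMAScript 6 standard in section
 # "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.
 # NOTE(review): the code below reads the UCD files listed in RequiredUCDFiles and builds
 # per-property code point tables rather than a CaseFolding.txt canonicalization table;
 # this description looks inherited from another generator -- confirm and update.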

 import sys
 import copy
 import optparse
 import os
 import re
 from hasher import stringHash

 # License block plus a "generated by <this script>" notice (built from __file__).
 # Presumably written at the start of the generated output; the code that emits it
 # is outside the visible part of this file.
 header = """/*
 * Copyright (C) 2017-2020 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 // DO NO EDIT! - This file was generated by """ + __file__ + """
 """


 # Trailing text counterpart of `header`; currently just a newline.
 footer = """
 """

 # File names that verifyUCDFilesExist() looks for inside UCDDirectoryPath.
 RequiredUCDFiles = ["DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", "ScriptExtensions.txt", "UnicodeData.txt", "emoji-data.txt"]
 # Directory that holds the UCD files. While this is unset, openUCDFileOrExit() and
 # verifyUCDFilesExist() exit with status 1. It is assigned outside the visible part of this file.
 UCDDirectoryPath = None

 # Binary property names; presumably the set this generator emits tables for
 # (the code that consumes this list is outside the visible part of this file).
 SupportedBinaryProperties = [
     "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable",
     "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded",
     "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated",
     "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation",
     "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start",
     "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase",
     "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical",
     "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase",
     "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"]

 # PropertyData stores code points <= lastASCIICodePoint in matches/ranges and everything
 # from firstUnicodeCodePoint upward in unicodeMatches/unicodeRanges.
 lastASCIICodePoint = 0x7f
 firstUnicodeCodePoint = 0x80
 # Highest code point; Scripts.parseScriptsFile() uses it as the upper end of the "Unknown" script.
 MaxUnicode = 0x10ffff
 # Code points above MaxBMP set PropertyData.hasNonBMPCharacters; those at or below set hasBMPCharacters.
 MaxBMP = 0xffff
 # Matches "<hex code> ; C or S ; <hex mapping>" lines. Not referenced in the visible part of this file.
 commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)
 # Module-wide alias lookup object used by Scripts (its scriptAliasesFor / scriptNameForAlias
 # methods are called); presumably an Aliases instance assigned outside the visible part of this file.
 aliases = None


 def openOrExit(path, mode):
     """Open `path` with `mode` and return the file object.

     On Python 3 the file is opened with UTF-8 encoding; older interpreters use
     the plain open(). If opening raises IOError, a diagnostic is printed and the
     process exits with status 1.
     """
     extraArguments = {}
     if sys.version_info.major >= 3:
         extraArguments["encoding"] = "UTF-8"

     try:
         return open(path, mode, **extraArguments)
     except IOError as error:
         print("I/O error opening {0}, ({1}): {2}".format(path, error.errno, error.strerror))
         exit(1)


 def openUCDFileOrExit(path):
     """Open the UCD file named `path` inside UCDDirectoryPath for reading.

     Exits with status 1 (without a message) when UCDDirectoryPath is unset;
     open failures are handled by openOrExit().
     """
     if not UCDDirectoryPath:
         exit(1)

     fullPath = os.path.join(UCDDirectoryPath, path)
     return openOrExit(fullPath, 'r')


 def verifyUCDFilesExist():
     """Check that every file in RequiredUCDFiles exists under UCDDirectoryPath.

     Prints one line per missing file and exits with status 1 if any file is
     missing. Also exits with status 1 when UCDDirectoryPath is unset.
     """
     if not UCDDirectoryPath:
         exit(1)

     anyMissing = False
     for fileName in RequiredUCDFiles:
         candidate = os.path.join(UCDDirectoryPath, fileName)
         if os.path.exists(candidate):
             continue
         print("Couldn't find UCD file {0} at {1}".format(fileName, candidate))
         anyMissing = True

     if anyMissing:
         exit(1)


 def ceilingToPowerOf2(size):
     """Return the smallest power of two that is >= size (1 for any size <= 1)."""
     candidate = 1
     while candidate < size:
         candidate <<= 1
     return candidate


 class Aliases:
     """Alias tables built from ';'-separated alias lines.

     parsePropertyAliasesFile() fills globalNameToAliases (full property name ->
     list of aliases). parsePropertyValueAliasesFile() fills the general category
     ("gc") and script ("sc") tables, plus reverse maps from the primary alias
     back to the full name.
     """

     def __init__(self):
         self.globalNameToAliases = {}
         self.generalCategoryToAliases = {}
         self.aliasToGeneralCategory = {}
         self.scriptToAliases = {}
         self.aliasToScript = {}

     @staticmethod
     def _dataFields(file):
         """Yield the stripped ';'-separated fields of every non-empty line, ignoring '#' comments."""
         for line in file:
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue
             yield [field.strip() for field in line.split(';')]

     def parsePropertyAliasesFile(self, file):
         """Record the aliases of each property found in `file` (an iterable of lines).

         Each data line is "alias ; fullName [; otherAlias ...]". Lines with fewer
         than two fields are skipped. A full name seen a second time is reported
         and its first entry is kept.
         """
         for fields in self._dataFields(file):
             # str.split never returns an empty list, so test for the fields we index.
             if len(fields) < 2:
                 continue

             fullName = fields[1]
             propertyAliases = [fields[0]] + fields[2:]

             if fullName in self.globalNameToAliases:
                 print("Error, already an alias for {}".format(fullName))
             else:
                 self.globalNameToAliases[fullName] = propertyAliases

     def parsePropertyValueAliasesFile(self, file):
         """Record general category ("gc") and script ("sc") value aliases from `file`.

         Each data line is "type ; primaryAlias ; fullName [; otherAlias ...]".
         Other property types, and gc/sc lines with fewer than three fields, are
         skipped. A full name seen a second time is reported and its first entry
         is kept.
         """
         tablesByPropertyType = {
             "gc": (self.generalCategoryToAliases, self.aliasToGeneralCategory),
             "sc": (self.scriptToAliases, self.aliasToScript),
         }

         for fields in self._dataFields(file):
             propertyType = fields[0]
             tables = tablesByPropertyType.get(propertyType)
             if tables is None or len(fields) < 3:
                 continue

             nameToAliases, aliasToName = tables
             primaryAlias = fields[1]
             fullName = fields[2]

             if fullName in nameToAliases:
                 print("Error, already an {} alias for {}".format(propertyType, fullName))
             else:
                 nameToAliases[fullName] = [primaryAlias] + fields[3:]
                 # Only the primary alias is reverse-mapped.
                 aliasToName[primaryAlias] = fullName

     def globalAliasesFor(self, name):
         """Return the alias list of property `name`, or [] when unknown."""
         return self.globalNameToAliases.get(name, [])

     def generalCategoryAliasesFor(self, name):
         """Return the alias list of general category `name`, or "" when unknown."""
         return self.generalCategoryToAliases.get(name, "")

     def generalCategoryForAlias(self, name):
         """Return the full general category name for primary alias `name`, or "" when unknown."""
         return self.aliasToGeneralCategory.get(name, "")

     def scriptAliasesFor(self, name):
         """Return the alias list of script `name`, or "" when unknown."""
         return self.scriptToAliases.get(name, "")

     def scriptNameForAlias(self, name):
         """Return the full script name for primary alias `name`, or "" when unknown."""
         return self.aliasToScript.get(name, "")


 class PropertyData:
     allPropertyData = []

     def __init__(self, name):
         self.name = name
         self.aliases = []
         self.index = len(PropertyData.allPropertyData)
         self.hasBMPCharacters = False
         self.hasNonBMPCharacters = False
         self.matches = []
         self.ranges = []
         self.unicodeMatches = []
         self.unicodeRanges = []
         self.codePointCount = 0
         PropertyData.allPropertyData.append(self)

     def setAliases(self, aliases):
         """Replace this property's alias collection with `aliases` (stored as given, not copied)."""
         self.aliases = aliases

     def makeCopy(self):
         result = copy.deepcopy(self)
         result.index = len(PropertyData.allPropertyData)
         PropertyData.allPropertyData.append(result)
         return result

     def getIndex(self):
         """Return this property's position in PropertyData.allPropertyData at registration time."""
         return self.index

     def getCreateFuncName(self):
         return "createCharacterClass{}".format(self.index)

     def addMatch(self, codePoint):
         if codePoint <= MaxBMP:
             self.hasBMPCharacters = True
         else:
             self.hasNonBMPCharacters = True
         if codePoint <= lastASCIICodePoint:
             if (len(self.matches) and self.matches[-1] > codePoint) or (len(self.ranges) and self.ranges[-1][1] > codePoint):
                 self.addMatchUnordered(codePoint)
                 return

             self.codePointCount = self.codePointCount + 1
             if len(self.matches) and self.matches[-1] == (codePoint - 1):
                 lowCodePoint = self.matches.pop()
                 self.ranges.append((lowCodePoint, codePoint))
             elif len(self.ranges) and self.ranges[-1][1] == (codePoint - 1):
                 priorRange = self.ranges.pop()
                 self.ranges.append((priorRange[0], codePoint))
             else:
                 self.matches.append(codePoint)
         else:
             if (len(self.unicodeMatches) and self.unicodeMatches[-1] > codePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > codePoint):
                 self.addMatchUnordered(codePoint)
                 return

             self.codePointCount = self.codePointCount + 1
             if len(self.unicodeMatches) and self.unicodeMatches[-1] == (codePoint - 1):
                 lowCodePoint = self.unicodeMatches.pop()
                 self.unicodeRanges.append((lowCodePoint, codePoint))
             elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (codePoint - 1):
                 priorRange = self.unicodeRanges.pop()
                 self.unicodeRanges.append((priorRange[0], codePoint))
             else:
                 self.unicodeMatches.append(codePoint)

     def addRange(self, lowCodePoint, highCodePoint):
         if lowCodePoint <= MaxBMP:
             self.hasBMPCharacters = True
         if highCodePoint > MaxBMP:
             self.hasNonBMPCharacters = True
         if highCodePoint <= lastASCIICodePoint:
             if (len(self.matches) and self.matches[-1] > lowCodePoint) or (len(self.ranges) and self.ranges[-1][1] > lowCodePoint):
                 self.addRangeUnordered(lowCodePoint, highCodePoint)
                 return

             self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
             if len(self.matches) and self.matches[-1] == (lowCodePoint - 1):
                 lowCodePoint = self.matches.pop()
             elif len(self.ranges) and self.ranges[-1][1] == (lowCodePoint - 1):
                 priorRange = self.ranges.pop()
                 lowCodePoint = priorRange[0]
             self.ranges.append((lowCodePoint, highCodePoint))
         elif lowCodePoint <= lastASCIICodePoint:
             if lowCodePoint == lastASCIICodePoint:
                 self.addMatch(lowCodePoint)
             else:
                 self.addRange(lowCodePoint, lastASCIICodePoint)
             if highCodePoint == firstUnicodeCodePoint:
                 self.addMatch(highCodePoint)
             else:
                 self.addRange(firstUnicodeCodePoint, highCodePoint)
         else:
             if (len(self.unicodeMatches) and self.unicodeMatches[-1] > lowCodePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > lowCodePoint):
                 self.addRangeUnordered(lowCodePoint, highCodePoint)
                 return

             self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
             if len(self.unicodeMatches) and self.unicodeMatches[-1] == (lowCodePoint - 1):
                 lowCodePoint = self.unicodeMatches.pop()
                 self.unicodeRanges.append((lowCodePoint, highCodePoint))
             elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (lowCodePoint - 1):
                 priorRange = self.unicodeRanges.pop()
                 self.unicodeRanges.append((priorRange[0], highCodePoint))
             else:
                 self.unicodeRanges.append((lowCodePoint, highCodePoint))

     def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges):
         """Add codePoint to the given matches/ranges lists when it is not past their tails.

         Works in three stages: coalesce with a directly adjacent single match,
         merge the result into overlapping or adjacent ranges, and otherwise insert
         the code point into `matches`. Both lists are modified in place and
         self.codePointCount is adjusted for every entry removed or added.
         The insertion index is the position of the first larger match, so
         `matches` is assumed to be in ascending order.
         """
         # Already stored as a single match: nothing to do.
         if codePoint in matches:
             return
         insertLocation = None
         lowCodePoint = None
         highCodePoint = None
         # Stage 1: look for a single match directly below or above codePoint.
         for idx in range(len(matches)):
             match = matches[idx]
             if codePoint == match + 1:
                 # codePoint directly follows this match; they will form a range.
                 lowCodePoint = match
                 if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1:
                     # The next match directly follows codePoint too: absorb it as well.
                     highCodePoint = matches[idx + 1]
                     del matches[idx + 1]
                     self.codePointCount = self.codePointCount - 1
                 else:
                     highCodePoint = codePoint
                 del matches[idx]
                 self.codePointCount = self.codePointCount - 1
                 break
             elif codePoint == match - 1:
                 # codePoint directly precedes this match.
                 lowCodePoint = codePoint
                 highCodePoint = match
                 del matches[idx]
                 self.codePointCount = self.codePointCount - 1
                 break
             elif codePoint < match:
                 # First larger match: codePoint would be inserted before it.
                 insertLocation = idx
                 break

         if insertLocation is None:
             insertLocation = len(matches)
         if lowCodePoint is None:
             # No adjacent match was found; treat codePoint as a one-element span.
             lowCodePoint = codePoint
             highCodePoint = codePoint

         # Stage 2: merge the span lowCodePoint..highCodePoint into the range list.
         for idx in range(len(ranges)):
             cur_range = ranges[idx]
             # Fully covered by an existing range: nothing to add.
             if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                 return
             if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                 # Overlaps or touches this range: swallow it and every following
                 # range that the growing span reaches, un-counting each one.
                 while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                     cur_range = ranges[idx]
                     lowCodePoint = min(lowCodePoint, cur_range[0])
                     highCodePoint = max(highCodePoint, cur_range[1])
                     del ranges[idx]
                     self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1

                 # Put the merged range where the swallowed ones were and count it in full.
                 ranges.insert(idx, (lowCodePoint, highCodePoint))
                 self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                 return
             elif highCodePoint < cur_range[0]:
                 # Span lies entirely below this range.
                 if lowCodePoint != highCodePoint:
                     ranges.insert(idx, (lowCodePoint, highCodePoint))
                     self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                     return
                 # A lone code point is stored in `matches` below instead.
                 break

         # Span lies above every range (or there are no ranges).
         if lowCodePoint != highCodePoint:
             ranges.append((lowCodePoint, highCodePoint))
             self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
             return

         # Stage 3: isolated code point; store it as a single match.
         # insertLocation is never None here because it is defaulted above.
         if insertLocation is not None:
             matches.insert(insertLocation, codePoint)
             self.codePointCount = self.codePointCount + 1

     def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges):
         """Add lowCodePoint..highCodePoint to the given lists when it is not past their tails.

         Single matches covered by or adjacent to the new range are removed from
         `matches` (and un-counted), then the range is merged with overlapping or
         adjacent entries of `ranges` or inserted at its position. Both lists are
         modified in place and self.codePointCount is kept in step.
         """
         # Only scan the matches when the new range can reach at least one of them.
         if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]:
             for idx in range(len(matches)):
                 match = matches[idx]
                 if lowCodePoint <= match and highCodePoint >= match:
                     # Drop every single match that the new range covers.
                     while idx < len(matches) and highCodePoint >= matches[idx]:
                         del matches[idx]
                         self.codePointCount = self.codePointCount - 1
                     # NOTE(review): after the loop above, matches[idx] is the first
                     # surviving match above highCodePoint, yet this compares
                     # matches[idx + 1]; confirm whether idx + 1 is intended.
                     if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                         highCodePoint = matches[idx + 1]
                         del matches[idx + 1]
                         self.codePointCount = self.codePointCount - 1
                     break
                 elif lowCodePoint == match + 1:
                     # A match directly precedes the range: extend the range down to it.
                     lowCodePoint = match
                     # Removes that match and any others now covered by the range.
                     while idx < len(matches) and highCodePoint >= matches[idx]:
                         del matches[idx]
                         self.codePointCount = self.codePointCount - 1

                     # NOTE(review): same idx + 1 question as in the branch above.
                     if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                         highCodePoint = matches[idx + 1]
                         del matches[idx + 1]
                         self.codePointCount = self.codePointCount - 1
                     break
                 elif highCodePoint == match - 1:
                     # A match directly follows the range: extend the range up to it.
                     highCodePoint = match
                     del matches[idx]
                     self.codePointCount = self.codePointCount - 1
                     break
                 elif highCodePoint < match:
                     # Remaining matches are above the range.
                     break

         for idx in range(len(ranges)):
             cur_range = ranges[idx]
             # Fully covered by an existing range: nothing to add.
             if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                 return
             if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                 # Overlaps or touches this range: swallow it and every following
                 # range that the growing span reaches, un-counting each one.
                 while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                     cur_range = ranges[idx]
                     lowCodePoint = min(lowCodePoint, cur_range[0])
                     highCodePoint = max(highCodePoint, cur_range[1])
                     del ranges[idx]
                     self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1

                 # Put the merged range where the swallowed ones were and count it in full.
                 ranges.insert(idx, (lowCodePoint, highCodePoint))
                 self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                 return
             elif highCodePoint < cur_range[0]:
                 # Entirely below this range: insert in front of it.
                 ranges.insert(idx, (lowCodePoint, highCodePoint))
                 self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                 return

         # Above every existing range (or there are none).
         ranges.append((lowCodePoint, highCodePoint))
         self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1

     def addMatchUnordered(self, codePoint):
         if codePoint <= lastASCIICodePoint:
             self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges)
         else:
             self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges)

     def addRangeUnordered(self, lowCodePoint, highCodePoint):
         if highCodePoint <= lastASCIICodePoint:
             self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges)
         elif lowCodePoint >= firstUnicodeCodePoint:
             self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)
         else:
             if lowCodePoint == lastASCIICodePoint:
                 self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges)
             else:
                 self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.unicodeMatches, self.ranges)
             if highCodePoint == firstUnicodeCodePoint:
                 self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges)
             else:
                 self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

     def removeMatchFromRanges(self, codePoint, ranges):
         for idx in range(len(ranges)):
             cur_range = ranges[idx]
             if cur_range[0] <= codePoint and codePoint <= cur_range[1]:
                 ranges.pop(idx)
                 if cur_range[0] < codePoint and codePoint < cur_range[1]:
                     lowRange = (cur_range[0], codePoint - 1)
                     highRange = (codePoint + 1, cur_range[1])
                     # Since list.insert inserts before the index given, handle the high range first
                     if highRange[0] == highRange[1]:
                         self.addMatchUnordered(highRange[0])
                     else:
                         ranges.insert(idx, highRange)
                     if lowRange[0] == lowRange[1]:
                         self.addMatchUnordered(lowRange[0])
                     else:
                         ranges.insert(idx, lowRange)
                 else:
                     if cur_range[0] == codePoint:
                         cur_range = (codePoint + 1, cur_range[1])
                     else:
                         cur_range = (cur_range[0], codePoint - 1)
                     if cur_range[0] == cur_range[1]:
                         self.addMatchUnordered(cur_range[0])
                     else:
                         ranges.insert(idx, cur_range)
                 self.codePointCount = self.codePointCount - 1
                 return

     def removeMatch(self, codePoint):
         if codePoint <= lastASCIICodePoint:
             if codePoint in self.matches:
                 self.matches.remove(codePoint)
                 self.codePointCount = self.codePointCount - 1
             else:
                 self.removeMatchFromRanges(codePoint, self.ranges)
         else:
             if codePoint in self.unicodeMatches:
                 self.unicodeMatches.remove(codePoint)
                 self.codePointCount = self.codePointCount - 1
             else:
                 self.removeMatchFromRanges(codePoint, self.unicodeRanges)

     def dumpMatchData(self, file, valuesPerLine, dataList, formatter):
         valuesThisLine = 0
         firstValue = True

         file.write("{")
         for elem in dataList:
             if firstValue:
                 firstValue = False
             else:
                 file.write(", ")
             valuesThisLine = valuesThisLine + 1
             if valuesThisLine > valuesPerLine:
                 file.write("\n                 ")
                 valuesThisLine = 1
             formatter(file, elem)
         file.write("}")

     def dump(self, file, commaAfter):
         file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName()))
         file.write("    // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount))
         file.write("    auto characterClass = makeUnique<CharacterClass>(\n")
         file.write("        std::initializer_list<UChar32>(")
         self.dumpMatchData(file, 8, self.matches, lambda file, match: (file.write("{0:0=#4x}".format(match))))
         file.write("),\n")
         file.write("        std::initializer_list<CharacterRange>(")
         self.dumpMatchData(file, 4, self.ranges, lambda file, range: (file.write("{{{0:0=#4x}, {1:0=#4x}}}".format(range[0], range[1]))))
         file.write("),\n")
         file.write("        std::initializer_list<UChar32>(")
         self.dumpMatchData(file, 8, self.unicodeMatches, lambda file, match: (file.write("{0:0=#6x}".format(match))))
         file.write("),\n")
         file.write("        std::initializer_list<CharacterRange>(")
         self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1]))))
         file.write("),\n")

         file.write("        CharacterClassWidths::{});\n".format(("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(self.hasNonBMPCharacters) * 2 + int(self.hasBMPCharacters)]))
         file.write("    return characterClass;\n}\n\n")

     @classmethod
     def dumpAll(cls, file):
         for propertyData in cls.allPropertyData:
             propertyData.dump(file, propertyData != cls.allPropertyData[-1])

         file.write("using CreateCharacterClass = std::unique_ptr<CharacterClass> (*)();\n")
         file.write("static CreateCharacterClass createFunctions[{}] = {{\n   ".format(len(cls.allPropertyData)))
         functionsOnThisLine = 0
         for propertyData in cls.allPropertyData:
             file.write(" {},".format(propertyData.getCreateFuncName()))
             functionsOnThisLine = functionsOnThisLine + 1
             if functionsOnThisLine == 4:
                 file.write("\n   ")
                 functionsOnThisLine = 0

         file.write("};\n\n")

     @classmethod
     def createAndDumpHashTable(self, file, propertyDict, tablePrefix):
         """Write a chained hash table mapping property names and aliases to property indices.

         Three C++ definitions are written: `<tablePrefix>TableIndex` (slots of
         {value index, next slot}, -1 meaning none), `<tablePrefix>TableValue`
         ({key string, property index} entries) and `<tablePrefix>HashTable`,
         which ties them together. The primary slot count is
         ceilingToPowerOf2(2 * len(propertyDict)); colliding keys are chained into
         extra slots appended after the primary ones.
         """
         hashSize = ceilingToPowerOf2(len(propertyDict.keys()) * 2)
         hashMask = hashSize - 1
         slots = [None] * hashSize
         entries = []

         # Every property is reachable by its name and by each of its aliases.
         keysAndIndices = []
         for propertyName in propertyDict.keys():
             propertyData = propertyDict[propertyName]
             keysAndIndices.append((propertyName, propertyData.getIndex()))
             keysAndIndices.extend((alias, propertyData.getIndex()) for alias in propertyData.aliases)

         for key, propertyIndex in keysAndIndices:
             slot = stringHash(key) % hashSize
             # Walk the collision chain to its end, growing it by one slot if needed.
             while slots[slot] is not None:
                 valueIndex, nextSlot = slots[slot]
                 if nextSlot is None:
                     nextSlot = len(slots)
                     slots[slot] = (valueIndex, nextSlot)
                     slots.append(None)
                 slot = nextSlot

             slots[slot] = (len(entries), None)
             entries.append((key, propertyIndex))

         file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(slots)))

         for slotContents in slots:
             if slotContents is None:
                 valueIndex, nextSlot = -1, -1
             else:
                 valueIndex = slotContents[0]
                 nextSlot = -1 if slotContents[1] is None else slotContents[1]

             file.write("    {{ {}, {} }},\n".format(valueIndex, nextSlot))

         file.write("};\n\n")

         file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(entries)))
         for key, propertyIndex in entries:
             file.write("    {{ \"{}\", {} }},\n".format(key, propertyIndex))
         file.write("};\n\n")

         file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix))
         file.write("    {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(entries), hashMask, tablePrefix, tablePrefix))


 class Scripts:
     def __init__(self):
         """Start with a single, empty "Unknown" script; no script file has been parsed yet."""
         self.allPropertyData = []
         self.scriptsByName = {}
         self.scriptExtensionsByName = {}

         unknown = PropertyData("Unknown")
         unknown.setAliases(aliases.scriptAliasesFor("Unknown"))
         self.unknownScript = unknown
         self.allPropertyData.append(unknown)

         # Set by parseScriptsFile(); parseScriptExtensionsFile() refuses to run before that.
         self.scriptsParsed = False

     def parseScriptsFile(self, file):
         """Parse "codePoint(s) ; ScriptName" lines and build one PropertyData per script.

         `file` is an iterable of lines; text after '#' is ignored, as are blank
         lines and lines without a ';' separator. Code points are hexadecimal,
         either a single value or "low..high". A new PropertyData is started
         whenever the script name differs from the previous line's. Every code
         point from 0 through MaxUnicode that no line assigns is added to the
         "Unknown" script. Sets self.scriptsParsed when done.
         """
         currentScriptName = None
         currentPropertyData = None
         # To calculate Unknown, gather all the code points assigned to a script as ranges,
         # sort them, and then walk the list to create the inverse of the assigned ranges.
         assignedCodePointRanges = []

         for line in file:
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue

             fields = line.split(';')
             # str.split never returns an empty list, so test for the fields we index.
             if len(fields) < 2:
                 continue

             codePoints = fields[0].strip()
             scriptName = fields[1].strip()

             if scriptName != currentScriptName:
                 currentScriptName = scriptName
                 currentPropertyData = PropertyData(scriptName)
                 currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
                 self.allPropertyData.append(currentPropertyData)
                 self.scriptsByName[scriptName] = currentPropertyData

             lowText, separator, highText = codePoints.partition("..")
             if not separator:
                 codePoint = int(codePoints, 16)
                 currentPropertyData.addMatch(codePoint)
                 assignedCodePointRanges.append((codePoint, codePoint))
             else:
                 lowCodePoint = int(lowText, 16)
                 highCodePoint = int(highText, 16)
                 currentPropertyData.addRange(lowCodePoint, highCodePoint)
                 assignedCodePointRanges.append((lowCodePoint, highCodePoint))

         def addUnknown(lowCodePoint, highCodePoint):
             # Add the inclusive gap to the Unknown script; an empty gap adds nothing.
             if lowCodePoint > highCodePoint:
                 return
             if lowCodePoint == highCodePoint:
                 self.unknownScript.addMatch(lowCodePoint)
             else:
                 self.unknownScript.addRange(lowCodePoint, highCodePoint)

         assignedCodePointRanges.sort(key=lambda assigned: assigned[0])
         # Start just below code point 0 so that an unassigned U+0000 is reported too.
         lastAssignedCodePoint = -1

         for assignedLow, assignedHigh in assignedCodePointRanges:
             addUnknown(lastAssignedCodePoint + 1, assignedLow - 1)
             # max() keeps the scan from moving backwards if ranges overlap.
             lastAssignedCodePoint = max(lastAssignedCodePoint, assignedHigh)

         addUnknown(lastAssignedCodePoint + 1, MaxUnicode)

         self.scriptsParsed = True

     def parseScriptExtensionsFile(self, file):
         """Build the Script_Extensions tables from ScriptExtensions.txt.

         Must be called after Scripts.txt has been parsed (self.scriptsParsed),
         because a script's extension table is created as a copy of its Script
         table and then grows with the code points listed in this file.

         file -- iterable of text lines of the form
                 "<codePoint>[..<codePoint>] ; <alias> [<alias> ...] # comment"

         Side effects: fills self.scriptExtensionsByName and appends every newly
         created table to self.allPropertyData. Every listed code point is removed
         from the Common and Inherited extension tables. Scripts that never appear
         in the file share their Script table object rather than getting a copy.

         Exits the process with status 1 if Scripts.txt has not been parsed yet.
         """
         if not self.scriptsParsed:
             print("Error: parsing ScriptExtensions.txt before Scripts.txt")
             # sys.exit() instead of the site-provided exit(), which is meant for
             # interactive use and is not guaranteed to be defined.
             sys.exit(1)

         # Common and Inherited always get their own extension tables (in this order),
         # since code points are taken away from them while the file is processed.
         seededPropertyData = []
         for scriptName in ("Common", "Inherited"):
             if scriptName in self.scriptsByName:
                 propertyData = self.scriptsByName[scriptName].makeCopy()
             else:
                 propertyData = PropertyData(scriptName)
                 propertyData.setAliases(aliases.scriptAliasesFor(scriptName))
             self.allPropertyData.append(propertyData)
             self.scriptExtensionsByName[scriptName] = propertyData
             seededPropertyData.append(propertyData)
         commonPropertyData, inheritedPropertyData = seededPropertyData

         for line in file:
             # Drop the trailing comment and surrounding whitespace; skip blank lines.
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue

             fields = line.split(';')
             if len(fields) < 2:
                 # No script list on this line, so there is nothing to record.
                 continue

             # The code point field is the same for every alias on the line, so it is
             # parsed once here instead of once per alias.
             codePoints = fields[0].strip()
             dotDot = codePoints.find("..")
             isSingleCodePoint = dotDot == -1
             if isSingleCodePoint:
                 lowCodePoint = highCodePoint = int(codePoints, 16)
             else:
                 lowCodePoint = int(codePoints[:dotDot], 16)
                 highCodePoint = int(codePoints[dotDot + 2:], 16)

             # split() without an argument tolerates repeated spaces between aliases.
             for scriptAlias in fields[1].split():
                 scriptName = aliases.scriptNameForAlias(scriptAlias)

                 if scriptName not in self.scriptExtensionsByName:
                     # First extension entry for this script: start from its Script table.
                     currentPropertyData = self.scriptsByName[scriptName].makeCopy()
                     self.allPropertyData.append(currentPropertyData)
                     self.scriptExtensionsByName[scriptName] = currentPropertyData
                 else:
                     currentPropertyData = self.scriptExtensionsByName[scriptName]

                 if isSingleCodePoint:
                     currentPropertyData.addMatch(lowCodePoint)
                 else:
                     currentPropertyData.addRange(lowCodePoint, highCodePoint)

                 # Code points with explicit extensions no longer count as Common/Inherited.
                 for codePoint in range(lowCodePoint, highCodePoint + 1):
                     commonPropertyData.removeMatch(codePoint)
                     inheritedPropertyData.removeMatch(codePoint)

         # For the scripts that don't have any additional extension code points, reuse
         # the script data as the script extension with the same name.
         for scriptName, propertyData in self.scriptsByName.items():
             self.scriptExtensionsByName.setdefault(scriptName, propertyData)

     def dump(self, file):
         """Write the Script and Script_Extensions hash tables to file.

         Each table is preceded by a one-line "// ..." banner; the Script table is
         written first, then the Script_Extensions table.
         """
         sections = (
             ("// Scripts:\n", self.scriptsByName, "script"),
             ("// Script_Extensions:\n", self.scriptExtensionsByName, "scriptExtension"),
         )
         for banner, tableByName, tableKind in sections:
             file.write(banner)
             PropertyData.createAndDumpHashTable(file, tableByName, tableKind)


 class GeneralCategory:
     """Builds the General_Category property tables from a UnicodeData.txt style file.

     Besides one table per category and one per category group, three special
     tables are maintained: "Any" (0..MaxUnicode), "ASCII" (0..lastASCIICodePoint)
     and "Assigned" (every code point passed to addNextCodePoints). Code points
     that fall in gaps between entries are recorded under the "Cn" category.
     """

     def __init__(self, file):
         """Create the special tables; file is the iterable of lines read by parse()."""
         self.file = file
         self.allPropertyData = []
         self.propertyDataByCategory = {}
         self.createSpecialPropertyData("Any", (0, MaxUnicode))
         self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint))
         self.assignedPropertyData = self.createSpecialPropertyData("Assigned")
         # NOTE: the attribute name is misspelled ("Propery"); it is kept as-is so
         # that any code referring to it keeps working.
         self.unassignedProperyData = self.findPropertyGroupFor("Cn")[1]
         self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1]
         # Gap detection starts from 0, so U+0000 itself is never treated as a gap.
         self.lastAddedCodePoint = 0

     def createSpecialPropertyData(self, name, range=None):
         """Create, register and return a table called name.

         range -- optional (low, high) pair of code points used to pre-fill the table.
         """
         propertyData = PropertyData(name)
         self.allPropertyData.append(propertyData)
         self.propertyDataByCategory[name] = propertyData
         if range:
             propertyData.addRange(range[0], range[1])

         return propertyData

     def _getOrCreatePropertyData(self, name, aliasList):
         # Return the table registered under name, creating and registering it first if needed.
         if name not in self.propertyDataByCategory:
             propertyData = PropertyData(name)
             propertyData.setAliases(aliasList)
             self.allPropertyData.append(propertyData)
             self.propertyDataByCategory[name] = propertyData
         return self.propertyDataByCategory[name]

     @staticmethod
     def _addCodePoints(propertyDatas, lowCodePoint, highCodePoint):
         # Add lowCodePoint..highCodePoint to each table, as a match when it is a single code point.
         for propertyData in propertyDatas:
             if lowCodePoint == highCodePoint:
                 propertyData.addMatch(lowCodePoint)
             else:
                 propertyData.addRange(lowCodePoint, highCodePoint)

     def findPropertyGroupFor(self, categoryAlias):
         """Return (groupPropertyData, propertyData) for a category alias.

         The group is looked up from the first character of categoryAlias. Missing
         tables are created on demand, the group table before the category table.
         """
         # All alias lookups are done before any table is created.
         category = aliases.generalCategoryForAlias(categoryAlias)
         allCategoryAliases = aliases.generalCategoryAliasesFor(category)
         categoryGroup = aliases.generalCategoryForAlias(categoryAlias[0])
         allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup)

         groupPropertyData = self._getOrCreatePropertyData(categoryGroup, allCategoryGroupAlias)
         propertyData = self._getOrCreatePropertyData(category, allCategoryAliases)

         return (groupPropertyData, propertyData)

     def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None):
         """Record codePoint, or codePoint..highCodePoint, under categoryAlias.

         Entries are expected in ascending order: any code points skipped since the
         previous call are recorded as "Cn" first. The code points are added to the
         category table, its group table, the cased-letter table (for Ll, Lt and Lu)
         and the "Assigned" table.
         """
         if codePoint - self.lastAddedCodePoint > 1:
             self._addCodePoints(self.findPropertyGroupFor("Cn"), self.lastAddedCodePoint + 1, codePoint - 1)

         propertyDatas = list(self.findPropertyGroupFor(categoryAlias))
         if categoryAlias in ("Ll", "Lt", "Lu"):
             propertyDatas.append(self.casedLetterPropertyData)
         propertyDatas.append(self.assignedPropertyData)

         # Compare against None explicitly: a code point value of 0 must not be
         # mistaken for "no upper bound given".
         if highCodePoint is not None:
             for propertyData in propertyDatas:
                 propertyData.addRange(codePoint, highCodePoint)
             self.lastAddedCodePoint = highCodePoint
         else:
             for propertyData in propertyDatas:
                 propertyData.addMatch(codePoint)
             self.lastAddedCodePoint = codePoint

     def parse(self):
         """Read self.file and fill the category tables.

         Each line is "<codePoint>;<description>;<categoryAlias>;...". A range is
         given as two consecutive lines whose descriptions end in "First" and "Last"
         followed by one closing character (e.g. "<..., First>" / "<..., Last>").
         Everything after the last entry up to MaxUnicode is recorded as "Cn".
         """
         firstOfRange = None
         for line in self.file:
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue

             fields = line.split(';')
             if len(fields) < 3:
                 # Not enough fields to carry a category; nothing to record.
                 continue

             codePoint = int(fields[0].strip(), 16)
             description = fields[1].strip()
             categoryAlias = fields[2].strip()

             # "is not None" so that a range starting at code point 0 is still recognized.
             if firstOfRange is not None:
                 if description[-5:-1] == "Last":
                     self.addNextCodePoints(categoryAlias, firstOfRange, codePoint)
                     firstOfRange = None
                     continue
                 print("Malformed First..Last pair of lines")
                 # Forget the dangling First so that the warning is printed once and
                 # no later Last line gets paired with it; this line is then handled
                 # like any other.
                 firstOfRange = None

             if description[-6:-1] == "First":
                 firstOfRange = codePoint
                 continue

             self.addNextCodePoints(categoryAlias, codePoint)

         if self.lastAddedCodePoint < MaxUnicode:
             self._addCodePoints(self.findPropertyGroupFor("Cn"), self.lastAddedCodePoint + 1, MaxUnicode)

     def dump(self, file):
         """Write the General_Category hash table to file, preceded by a banner comment."""
         file.write("// General_Category:\n")
         PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory")


 class BinaryProperty:
     """Collects the code points of the supported binary properties.

     parsePropertyFile() may be called once per input file; the tables of all
     calls accumulate in propertyDataByProperty, keyed by property name.
     """

     def __init__(self):
         self.allPropertyData = []
         self.propertyDataByProperty = {}

     def parsePropertyFile(self, file):
         """Add the code points listed in file to the per-property tables.

         file -- iterable of text lines of the form
                 "<codePoint>[..<codePoint>] ; <propertyName> # comment"

         Lines for properties that are not in SupportedBinaryProperties are ignored,
         as are blank lines, comment lines and lines without a property field.
         """
         currentPropertyName = None
         currentPropertyData = None

         for line in file:
             # Drop the trailing comment and surrounding whitespace; skip blank lines.
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue

             fields = line.split(';')
             if len(fields) < 2:
                 # No property name on this line, so there is nothing to record.
                 continue

             codePoints = fields[0].strip()
             propertyName = fields[1].strip()

             if propertyName != currentPropertyName:
                 if propertyName not in SupportedBinaryProperties:
                     continue

                 currentPropertyName = propertyName
                 if propertyName in self.propertyDataByProperty:
                     # The property was already seen (earlier in this file or in a
                     # previous one): keep adding to its table instead of creating
                     # a second table that would replace it.
                     currentPropertyData = self.propertyDataByProperty[propertyName]
                 else:
                     currentPropertyData = PropertyData(propertyName)
                     currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName))
                     self.allPropertyData.append(currentPropertyData)
                     self.propertyDataByProperty[propertyName] = currentPropertyData

             dotDot = codePoints.find("..")
             if dotDot == -1:
                 currentPropertyData.addMatch(int(codePoints, 16))
             else:
                 currentPropertyData.addRange(int(codePoints[:dotDot], 16), int(codePoints[dotDot + 2:], 16))

     def dump(self, file):
         """Write the binary property hash table to file, preceded by a banner comment."""
         file.write("// binary properties:\n")
         PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty")

 if __name__ == "__main__":
     # Command-line entry point: read the Unicode Character Database files found in
     # <UCD-Directory> and write the generated tables to <YarrUnicodePropertyData.h>.
     parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>")
     (options, args) = parser.parse_args()

     if len(args) != 2:
         parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>")

     # These are module-level names. verifyUCDFilesExist() and openUCDFileOrExit()
     # take no directory argument, so they presumably read UCDDirectoryPath as a
     # global -- keep the name unchanged.
     UCDDirectoryPath = args[0]
     unicodeProertyDataHPath = args[1]

     verifyUCDFilesExist()

     # All inputs are opened before the output file, so a missing input is
     # reported before the output file is opened for writing.
     propertyAliasesFile = openUCDFileOrExit("PropertyAliases.txt")
     propertyValueAliasesFile = openUCDFileOrExit("PropertyValueAliases.txt")
     scriptsFile = openUCDFileOrExit("Scripts.txt")
     scriptExtensionsFile = openUCDFileOrExit("ScriptExtensions.txt")
     unicodeDataFile = openUCDFileOrExit("UnicodeData.txt")
     derivedBinaryPropertiesFile = openUCDFileOrExit("DerivedBinaryProperties.txt")
     derivedCorePropertiesFile = openUCDFileOrExit("DerivedCoreProperties.txt")
     derivedNormalizationPropertiesFile = openUCDFileOrExit("DerivedNormalizationProps.txt")
     propListFile = openUCDFileOrExit("PropList.txt")
     emojiDataFile = openUCDFileOrExit("emoji-data.txt")

     # "aliases" is referenced as a global by the parser classes, so it has to
     # exist under exactly this name before any of them is used.
     aliases = Aliases()

     propertyDataHFile = openOrExit(unicodeProertyDataHPath, "w")

     propertyDataHFile.write(header)

     aliases.parsePropertyAliasesFile(propertyAliasesFile)
     aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile)

     generalCategory = GeneralCategory(unicodeDataFile)
     generalCategory.parse()

     binaryProperty = BinaryProperty()
     binaryProperty.parsePropertyFile(derivedBinaryPropertiesFile)
     binaryProperty.parsePropertyFile(derivedCorePropertiesFile)
     binaryProperty.parsePropertyFile(derivedNormalizationPropertiesFile)
     binaryProperty.parsePropertyFile(propListFile)
     binaryProperty.parsePropertyFile(emojiDataFile)

     # Scripts.txt has to be parsed before ScriptExtensions.txt.
     scripts = Scripts()
     scripts.parseScriptsFile(scriptsFile)
     scripts.parseScriptExtensionsFile(scriptExtensionsFile)

     PropertyData.dumpAll(propertyDataHFile)
     generalCategory.dump(propertyDataHFile)
     binaryProperty.dump(propertyDataHFile)
     scripts.dump(propertyDataHFile)

     propertyDataHFile.write(footer)

     # Close the output explicitly so the generated header is completely written
     # before the process exits, then release the inputs.
     propertyDataHFile.close()
     for inputFile in (propertyAliasesFile, propertyValueAliasesFile, scriptsFile,
                       scriptExtensionsFile, unicodeDataFile, derivedBinaryPropertiesFile,
                       derivedCorePropertiesFile, derivedNormalizationPropertiesFile,
                       propListFile, emojiDataFile):
         inputFile.close()

     # sys.exit() instead of the site-provided exit(), which is meant for
     # interactive use and is not guaranteed to be defined.
     sys.exit(0)