blob: 97af189e1b31b8b9d0c013a603177eb7e432be75 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright (C) 2017 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This tool processes the Unicode Character Database file CaseFolding.txt to create
# canonicalization table as described in ECMAScript 6 standard in section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.
import sys
import copy
import optparse
import os
import re
from hasher import stringHash
# Banner written at the very top of the generated file: the license text followed by a
# "do not edit" notice that names this script (via __file__) as the generator.
header = """/*
* Copyright (C) 2017-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// DO NOT EDIT! - This file was generated by """ + __file__ + """
"""
# Text written after all generated tables; currently a single newline.
footer = """
"""
# UCD input files that verifyUCDFilesExist() checks for before any parsing starts.
# Scripts.txt is listed because the main script opens it like the others; without it
# a missing Scripts.txt was only detected later, when the file was opened.
RequiredUCDFiles = ["DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", "ScriptExtensions.txt", "Scripts.txt", "UnicodeData.txt", "emoji-data.txt"]

# Directory holding the UCD files; assigned from the command line by the main script.
UCDDirectoryPath = None

# Binary properties that BinaryProperty.parsePropertyFile() keeps; every other
# property name found in the input files is skipped.
SupportedBinaryProperties = [
    "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable",
    "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded",
    "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated",
    "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation",
    "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start",
    "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase",
    "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical",
    "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase",
    "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"]

# PropertyData keeps code points <= lastASCIICodePoint in its matches/ranges lists and
# code points >= firstUnicodeCodePoint in its unicodeMatches/unicodeRanges lists.
lastASCIICodePoint = 0x7f
firstUnicodeCodePoint = 0x80
# Highest code point covered by the generated tables.
MaxUnicode = 0x10ffff
# Code points above MaxBMP mark a PropertyData as having non-BMP characters.
MaxBMP = 0xffff

# Matches "<code> ; C|S ; <mapping>" lines. Not referenced elsewhere in the visible part of this file.
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

# The shared Aliases instance; created by the main script before any parser class is constructed.
aliases = None
def openOrExit(path, mode):
    """Open path with the given mode, or report the failure and exit.

    On Python 3 the file is opened as UTF-8 text; on Python 2 the platform
    default is used because open() has no encoding parameter there.

    Returns the open file object. If the open raises IOError, the error is
    written to stderr and the process exits with status 1.
    """
    try:
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except IOError as e:
        # Diagnostics go to stderr so they cannot be mistaken for regular output.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        # sys.exit rather than the exit() builtin, which only exists when the site module is loaded.
        sys.exit(1)
def openUCDFileOrExit(path):
    """Open the UCD file named path, relative to UCDDirectoryPath, for reading.

    Exits with status 1 when UCDDirectoryPath has not been set; a failure to
    open the file is handled by openOrExit().
    """
    if not UCDDirectoryPath:
        # Previously this exited without saying why.
        sys.stderr.write("UCD directory path is not set, cannot open {0}\n".format(path))
        sys.exit(1)
    return openOrExit(os.path.join(UCDDirectoryPath, path), 'r')
def verifyUCDFilesExist():
    """Check that every file in RequiredUCDFiles exists under UCDDirectoryPath.

    Every missing file is reported before exiting, so a single run lists them
    all. Exits with status 1 when the directory is unset or any file is missing.
    """
    if not UCDDirectoryPath:
        # Previously this exited without saying why.
        sys.stderr.write("UCD directory path is not set\n")
        sys.exit(1)
    missingFileCount = 0
    for fileName in RequiredUCDFiles:
        fullPath = os.path.join(UCDDirectoryPath, fileName)
        if not os.path.exists(fullPath):
            sys.stderr.write("Couldn't find UCD file {0} at {1}\n".format(fileName, fullPath))
            missingFileCount += 1
    if missingFileCount:
        sys.exit(1)
def ceilingToPowerOf2(size):
    """Return the smallest power of two that is not less than size.

    Any size of 1 or less yields 1.
    """
    candidate = 1
    while candidate < size:
        candidate *= 2
    return candidate
class Aliases:
    """Alias tables read from PropertyAliases.txt and PropertyValueAliases.txt.

    Property names map to their aliases (globalNameToAliases). For the "gc"
    and "sc" property types, value names map to their aliases and each primary
    alias maps back to its full value name.
    """

    def __init__(self):
        self.globalNameToAliases = {}
        self.generalCategoryToAliases = {}
        self.aliasToGeneralCategory = {}
        self.scriptToAliases = {}
        self.aliasToScript = {}

    @staticmethod
    def _dataFields(file, minimumFieldCount):
        """Yield the stripped ';'-separated fields of each data line in file.

        Text after '#' is dropped, blank lines are skipped, and so are lines
        with fewer than minimumFieldCount fields (these used to raise
        IndexError in the parsers below).
        """
        for line in file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue
            fields = [field.strip() for field in line.split(';')]
            if len(fields) < minimumFieldCount:
                continue
            yield fields

    def parsePropertyAliasesFile(self, file):
        """Read "<alias> ; <full name> [; <other alias>]..." lines into globalNameToAliases."""
        for fields in self._dataFields(file, 2):
            fullName = fields[1]
            # The first column is the first alias; any columns after the full name follow it.
            aliasList = [fields[0]] + fields[2:]
            if fullName in self.globalNameToAliases:
                print("Error, already an alias for {}".format(fullName))
            else:
                self.globalNameToAliases[fullName] = aliasList

    def parsePropertyValueAliasesFile(self, file):
        """Read "<type> ; <alias> ; <full name> [; <other alias>]..." lines.

        Only the "gc" and "sc" property types are kept; every other line is ignored.
        """
        for fields in self._dataFields(file, 3):
            propertyType = fields[0]
            if propertyType == "gc":
                mapToModify = self.generalCategoryToAliases
                reverseMapToModify = self.aliasToGeneralCategory
            elif propertyType == "sc":
                mapToModify = self.scriptToAliases
                reverseMapToModify = self.aliasToScript
            else:
                continue
            primaryAlias = fields[1]
            fullName = fields[2]
            aliasList = [primaryAlias] + fields[3:]
            if fullName in mapToModify:
                print("Error, already an {} alias for {}".format(propertyType, fullName))
            else:
                mapToModify[fullName] = aliasList
                # Only the primary alias gets a reverse entry.
                reverseMapToModify[primaryAlias] = fullName

    def globalAliasesFor(self, name):
        """Return the alias list of property name, or [] when it is unknown."""
        if name not in self.globalNameToAliases:
            return []
        return self.globalNameToAliases[name]

    def generalCategoryAliasesFor(self, name):
        """Return the alias list of General_Category value name, or "" when it is unknown."""
        if name not in self.generalCategoryToAliases:
            return ""
        return self.generalCategoryToAliases[name]

    def generalCategoryForAlias(self, name):
        """Return the full General_Category name for primary alias name, or "" when it is unknown."""
        if name not in self.aliasToGeneralCategory:
            return ""
        return self.aliasToGeneralCategory[name]

    def scriptAliasesFor(self, name):
        """Return the alias list of script name, or "" when it is unknown."""
        if name not in self.scriptToAliases:
            return ""
        return self.scriptToAliases[name]

    def scriptNameForAlias(self, name):
        """Return the full script name for primary alias name, or "" when it is unknown."""
        if name not in self.aliasToScript:
            return ""
        return self.aliasToScript[name]
class PropertyData:
    """The set of code points for one property value, and its C++ emitter.

    Code points <= lastASCIICodePoint are kept in matches (single code points)
    and ranges (inclusive (low, high) tuples); all others are kept in
    unicodeMatches and unicodeRanges. The add methods have a fast path for
    input arriving in increasing order and hand anything else to the
    "unordered" routines, which search the lists and merge as needed.
    codePointCount tracks the number of code points held.
    """

    # Every PropertyData created so far, in creation order; an instance's index is its position here.
    allPropertyData = []

    def __init__(self, name):
        self.name = name
        self.aliases = []
        self.index = len(PropertyData.allPropertyData)
        self.hasBMPCharacters = False
        self.hasNonBMPCharacters = False
        self.matches = []
        self.ranges = []
        self.unicodeMatches = []
        self.unicodeRanges = []
        self.codePointCount = 0
        PropertyData.allPropertyData.append(self)

    def setAliases(self, aliases):
        """Record the alternative names written next to self.name in the hash tables."""
        self.aliases = aliases

    def makeCopy(self):
        """Return a deep copy registered in allPropertyData under a new index."""
        result = copy.deepcopy(self)
        result.index = len(PropertyData.allPropertyData)
        PropertyData.allPropertyData.append(result)
        return result

    def getIndex(self):
        return self.index

    def getCreateFuncName(self):
        """Return the name of the generated C++ factory function for this set."""
        return "createCharacterClass{}".format(self.index)

    def addMatch(self, codePoint):
        """Add a single code point, merging it with an adjacent trailing match or range."""
        if codePoint <= MaxBMP:
            self.hasBMPCharacters = True
        else:
            self.hasNonBMPCharacters = True
        if codePoint <= lastASCIICodePoint:
            # '>=' rather than '>': a code point equal to the last entry is a duplicate and
            # must go through the unordered path, which ignores it, instead of being
            # appended and counted twice.
            if (len(self.matches) and self.matches[-1] >= codePoint) or (len(self.ranges) and self.ranges[-1][1] >= codePoint):
                self.addMatchUnordered(codePoint)
                return
            self.codePointCount = self.codePointCount + 1
            if len(self.matches) and self.matches[-1] == (codePoint - 1):
                lowCodePoint = self.matches.pop()
                self.ranges.append((lowCodePoint, codePoint))
            elif len(self.ranges) and self.ranges[-1][1] == (codePoint - 1):
                priorRange = self.ranges.pop()
                self.ranges.append((priorRange[0], codePoint))
            else:
                self.matches.append(codePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] >= codePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] >= codePoint):
                self.addMatchUnordered(codePoint)
                return
            self.codePointCount = self.codePointCount + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (codePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, codePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (codePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], codePoint))
            else:
                self.unicodeMatches.append(codePoint)

    def addRange(self, lowCodePoint, highCodePoint):
        """Add the inclusive range lowCodePoint..highCodePoint.

        A range crossing the ASCII boundary is split into an ASCII part and a
        non-ASCII part, each added separately.
        """
        if lowCodePoint <= MaxBMP:
            self.hasBMPCharacters = True
        if highCodePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if highCodePoint <= lastASCIICodePoint:
            # '>=' so that a range starting on an existing entry is merged, not appended on top of it.
            if (len(self.matches) and self.matches[-1] >= lowCodePoint) or (len(self.ranges) and self.ranges[-1][1] >= lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.matches) and self.matches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.matches.pop()
            elif len(self.ranges) and self.ranges[-1][1] == (lowCodePoint - 1):
                priorRange = self.ranges.pop()
                lowCodePoint = priorRange[0]
            self.ranges.append((lowCodePoint, highCodePoint))
        elif lowCodePoint <= lastASCIICodePoint:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatch(lowCodePoint)
            else:
                self.addRange(lowCodePoint, lastASCIICodePoint)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatch(highCodePoint)
            else:
                self.addRange(firstUnicodeCodePoint, highCodePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] >= lowCodePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] >= lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, highCodePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (lowCodePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], highCodePoint))
            else:
                self.unicodeRanges.append((lowCodePoint, highCodePoint))

    def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges):
        """Insert codePoint into the given matches/ranges pair at its sorted position.

        A neighbouring match turns the code point into a two or three element
        range; the result is then merged with any range it touches. A code
        point that is already present is ignored.
        """
        if codePoint in matches:
            return
        insertLocation = None
        lowCodePoint = None
        highCodePoint = None
        for idx in range(len(matches)):
            match = matches[idx]
            if codePoint == match + 1:
                lowCodePoint = match
                if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1:
                    # codePoint bridges two matches: absorb the upper one as well.
                    highCodePoint = matches[idx + 1]
                    del matches[idx + 1]
                    self.codePointCount = self.codePointCount - 1
                else:
                    highCodePoint = codePoint
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint == match - 1:
                lowCodePoint = codePoint
                highCodePoint = match
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint < match:
                insertLocation = idx
                break
        if insertLocation is None:
            insertLocation = len(matches)
        if lowCodePoint is None:
            lowCodePoint = codePoint
            highCodePoint = codePoint
        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                return
            if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                # Overlapping or adjacent: swallow this range and any following ones that also touch.
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    cur_range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, cur_range[0])
                    highCodePoint = max(highCodePoint, cur_range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < cur_range[0]:
                if lowCodePoint != highCodePoint:
                    ranges.insert(idx, (lowCodePoint, highCodePoint))
                    self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                    return
                break
        if lowCodePoint != highCodePoint:
            ranges.append((lowCodePoint, highCodePoint))
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            return
        if insertLocation is not None:
            matches.insert(insertLocation, codePoint)
            self.codePointCount = self.codePointCount + 1

    def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges):
        """Insert an inclusive range into the given matches/ranges pair.

        Matches covered by the range are removed, then the range is merged
        with any range it overlaps or touches.
        """
        if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]:
            for idx in range(len(matches)):
                match = matches[idx]
                if lowCodePoint <= match and highCodePoint >= match:
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1
                    # NOTE(review): after the deletions above matches[idx] is the first match
                    # above highCodePoint, so this idx + 1 adjacency test looks like it can
                    # never succeed - confirm whether matches[idx] was intended.
                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif lowCodePoint == match + 1:
                    lowCodePoint = match
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1
                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint == match - 1:
                    highCodePoint = match
                    del matches[idx]
                    self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint < match:
                    break
        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                return
            if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    cur_range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, cur_range[0])
                    highCodePoint = max(highCodePoint, cur_range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < cur_range[0]:
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
        ranges.append((lowCodePoint, highCodePoint))
        self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1

    def addMatchUnordered(self, codePoint):
        """Add codePoint to the ASCII or non-ASCII lists regardless of insertion order."""
        if codePoint <= lastASCIICodePoint:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges)
        else:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges)

    def addRangeUnordered(self, lowCodePoint, highCodePoint):
        """Add an inclusive range regardless of insertion order, splitting it at the ASCII boundary."""
        if highCodePoint <= lastASCIICodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges)
        elif lowCodePoint >= firstUnicodeCodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)
        else:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges)
            else:
                # The ASCII half belongs with the ASCII lists. This call used to pass
                # self.unicodeMatches, which left covered ASCII matches in place next to
                # the new range and counted them twice.
                self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.matches, self.ranges)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

    def removeMatchFromRanges(self, codePoint, ranges):
        """Remove codePoint from whichever range in ranges contains it, if any.

        The containing range is split around codePoint. A remaining piece of
        one code point is stored as a match, a longer piece stays a range.
        """
        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if cur_range[0] <= codePoint and codePoint <= cur_range[1]:
                ranges.pop(idx)
                # Uncount the whole range and count each surviving piece again as it is
                # re-added. addMatchUnordered() counts the single code points itself, which
                # the previous flat "minus one" did not allow for, so the count drifted upwards.
                self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1
                remainingPieces = []
                if cur_range[0] < codePoint:
                    remainingPieces.append((cur_range[0], codePoint - 1))
                if codePoint < cur_range[1]:
                    remainingPieces.append((codePoint + 1, cur_range[1]))
                # Since list.insert inserts before the index given, handle the high piece first
                for piece in reversed(remainingPieces):
                    if piece[0] == piece[1]:
                        self.addMatchUnordered(piece[0])
                    else:
                        ranges.insert(idx, piece)
                        self.codePointCount = self.codePointCount + (piece[1] - piece[0]) + 1
                return

    def removeMatch(self, codePoint):
        """Remove codePoint from this set; does nothing when it is not present."""
        if codePoint <= lastASCIICodePoint:
            if codePoint in self.matches:
                self.matches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.ranges)
        else:
            if codePoint in self.unicodeMatches:
                self.unicodeMatches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.unicodeRanges)

    def dumpMatchData(self, file, valuesPerLine, dataList, formatter):
        """Write dataList as a brace-enclosed, comma-separated list.

        formatter(file, elem) writes one element; a line break is written
        before each element that would exceed valuesPerLine on the current line.
        """
        valuesThisLine = 0
        firstValue = True
        file.write("{")
        for elem in dataList:
            if firstValue:
                firstValue = False
            else:
                file.write(", ")
            valuesThisLine = valuesThisLine + 1
            if valuesThisLine > valuesPerLine:
                file.write("\n ")
                valuesThisLine = 1
            formatter(file, elem)
        file.write("}")

    def dump(self, file, commaAfter):
        """Write the C++ factory function that builds this set's CharacterClass.

        commaAfter is accepted for the caller's benefit but not used here.
        """
        file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName()))
        file.write(" // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount))
        file.write(" auto characterClass = makeUnique<CharacterClass>(\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.matches, lambda file, match: (file.write("{0:0=#4x}".format(match))))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.ranges, lambda file, range: (file.write("{{{0:0=#4x}, {1:0=#4x}}}".format(range[0], range[1]))))
        file.write("),\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.unicodeMatches, lambda file, match: (file.write("{0:0=#6x}".format(match))))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1]))))
        file.write("),\n")
        # Index into the tuple: bit 0 is "has BMP characters", bit 1 is "has non-BMP characters".
        file.write(" CharacterClassWidths::{});\n".format(("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(self.hasNonBMPCharacters) * 2 + int(self.hasBMPCharacters)]))
        file.write(" return characterClass;\n}\n\n")

    @classmethod
    def dumpAll(cls, file):
        """Write every factory function, then the createFunctions table indexed by PropertyData index."""
        for propertyData in cls.allPropertyData:
            propertyData.dump(file, propertyData != cls.allPropertyData[-1])
        file.write("using CreateCharacterClass = std::unique_ptr<CharacterClass> (*)();\n")
        file.write("static CreateCharacterClass createFunctions[{}] = {{\n ".format(len(cls.allPropertyData)))
        functionsOnThisLine = 0
        for propertyData in cls.allPropertyData:
            file.write(" {},".format(propertyData.getCreateFuncName()))
            functionsOnThisLine = functionsOnThisLine + 1
            if functionsOnThisLine == 4:
                file.write("\n ")
                functionsOnThisLine = 0
        file.write("};\n\n")

    @classmethod
    def createAndDumpHashTable(cls, file, propertyDict, tablePrefix):
        """Write a chained hash table mapping names and aliases to PropertyData indexes.

        propertyDict maps a name to its PropertyData. The index table starts
        with a power-of-two number of primary slots; a key whose slot is taken
        follows the slot's "next" links and is stored in a slot appended to
        the end of the table.
        """
        propertyKeys = propertyDict.keys()
        numberOfKeys = len(propertyKeys)
        hashSize = ceilingToPowerOf2(numberOfKeys * 2)
        hashMask = hashSize - 1
        hashTable = [None] * hashSize
        valueTable = []
        tableSize = hashSize
        keyValuesToHash = []
        for propertyName in propertyKeys:
            propertyData = propertyDict[propertyName]
            keyValuesToHash.append((propertyName, propertyData.getIndex()))
            for alias in propertyData.aliases:
                keyValuesToHash.append((alias, propertyData.getIndex()))
        for key, propertyIndex in keyValuesToHash:
            slot = stringHash(key) % hashSize
            while hashTable[slot] is not None:
                if hashTable[slot][1] is not None:
                    slot = hashTable[slot][1]
                else:
                    # End of the chain: link it to a new overflow slot at the end of the table.
                    hashTable[slot] = (hashTable[slot][0], tableSize)
                    hashTable.append(None)
                    slot = tableSize
                    tableSize = tableSize + 1
            hashTable[slot] = (len(valueTable), None)
            valueTable.append((key, propertyIndex))
        file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(hashTable)))
        for tableIndex in hashTable:
            # -1 marks an empty slot and the end of a chain.
            value = -1
            nextSlot = -1
            if tableIndex is not None:
                value = tableIndex[0]
                if tableIndex[1] is not None:
                    nextSlot = tableIndex[1]
            file.write(" {{ {}, {} }},\n".format(value, nextSlot))
        file.write("};\n\n")
        file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(valueTable)))
        for value in valueTable:
            file.write(" {{ \"{}\", {} }},\n".format(value[0], value[1]))
        file.write("};\n\n")
        file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix))
        file.write(" {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(valueTable), hashMask, tablePrefix, tablePrefix))
class Scripts:
    """Per-script code point sets read from Scripts.txt and ScriptExtensions.txt.

    scriptsByName holds the Script property data and scriptExtensionsByName
    the Script_Extensions data. The module-level aliases object must be
    populated before an instance is created.
    """

    def __init__(self):
        self.allPropertyData = []
        self.scriptsByName = {}
        self.scriptExtensionsByName = {}
        self.unknownScript = PropertyData("Unknown")
        self.unknownScript.setAliases(aliases.scriptAliasesFor("Unknown"))
        self.allPropertyData.append(self.unknownScript)
        self.scriptsParsed = False

    @staticmethod
    def _dataFields(file):
        """Yield (code points, value) for each data line of file.

        Text after '#' is dropped; blank lines and lines without a second
        ';'-separated field are skipped.
        """
        for line in file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue
            fields = line.split(';')
            if len(fields) < 2:
                continue
            yield fields[0].strip(), fields[1].strip()

    @staticmethod
    def _parseCodePoints(codePoints):
        """Return (low, high) for "XXXX..YYYY", or (codePoint, None) for a single "XXXX"."""
        low, dotDot, high = codePoints.partition("..")
        if not dotDot:
            return int(codePoints, 16), None
        return int(low, 16), int(high, 16)

    def parseScriptsFile(self, file):
        """Read Scripts.txt, filling scriptsByName and the "Unknown" script."""
        currentScriptName = None
        currentPropertyData = None
        # To calculate the Unknown script, we gather all the code points assigned to a script
        # as ranges, sort them, and then walk the list to create the inverse of the assigned ranges.
        assignedCodePointRanges = []
        for codePoints, scriptName in self._dataFields(file):
            if scriptName != currentScriptName:
                currentScriptName = scriptName
                currentPropertyData = PropertyData(scriptName)
                currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
                self.allPropertyData.append(currentPropertyData)
                self.scriptsByName[scriptName] = currentPropertyData
            lowCodePoint, highCodePoint = self._parseCodePoints(codePoints)
            if highCodePoint is None:
                currentPropertyData.addMatch(lowCodePoint)
                assignedCodePointRanges.append((lowCodePoint, lowCodePoint))
            else:
                currentPropertyData.addRange(lowCodePoint, highCodePoint)
                assignedCodePointRanges.append((lowCodePoint, highCodePoint))
        assignedCodePointRanges.sort(key=lambda assigned: assigned[0])
        lastAssignedCodePoint = 0
        for assignedLow, assignedHigh in assignedCodePointRanges:
            gap = assignedLow - lastAssignedCodePoint
            if gap == 2:
                self.unknownScript.addMatch(lastAssignedCodePoint + 1)
            elif gap > 2:
                self.unknownScript.addRange(lastAssignedCodePoint + 1, assignedLow - 1)
            lastAssignedCodePoint = assignedHigh
        if lastAssignedCodePoint < MaxUnicode:
            if MaxUnicode - lastAssignedCodePoint == 1:
                self.unknownScript.addMatch(MaxUnicode)
            else:
                self.unknownScript.addRange(lastAssignedCodePoint + 1, MaxUnicode)
        self.scriptsParsed = True

    def _createScriptExtensionBase(self, scriptName):
        """Create the Script_Extensions entry for scriptName as a copy of its script data.

        Used for "Common" and "Inherited". When the script is not in
        scriptsByName an empty PropertyData is created instead.
        """
        if scriptName in self.scriptsByName:
            propertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            propertyData = PropertyData(scriptName)
            propertyData.setAliases(aliases.scriptAliasesFor(scriptName))
        self.allPropertyData.append(propertyData)
        self.scriptExtensionsByName[scriptName] = propertyData
        return propertyData

    def parseScriptExtensionsFile(self, file):
        """Read ScriptExtensions.txt, filling scriptExtensionsByName.

        Each listed script gets a copy of its Script data with the extension
        code points added; those code points are removed from the "Common"
        and "Inherited" copies. Must be called after parseScriptsFile().
        Exits with status 1 when called too early or when the file names a
        script alias that Scripts.txt / PropertyValueAliases.txt did not define.
        """
        if not self.scriptsParsed:
            print("Error: parsing ScriptExtensions.txt before Scripts.txt")
            sys.exit(1)
        commonScriptExtensionPropertyData = self._createScriptExtensionBase("Common")
        inheritedScriptExtensionPropertyData = self._createScriptExtensionBase("Inherited")
        for codePoints, scriptAliasList in self._dataFields(file):
            lowCodePoint, highCodePoint = self._parseCodePoints(codePoints)
            # split() without an argument tolerates repeated spaces between aliases.
            for scriptAlias in scriptAliasList.split():
                scriptName = aliases.scriptNameForAlias(scriptAlias)
                if scriptName in self.scriptExtensionsByName:
                    currentPropertyData = self.scriptExtensionsByName[scriptName]
                else:
                    if scriptName not in self.scriptsByName:
                        # Previously this surfaced as an unexplained KeyError.
                        print("Error: unknown script alias {0} in ScriptExtensions.txt".format(scriptAlias))
                        sys.exit(1)
                    currentPropertyData = self.scriptsByName[scriptName].makeCopy()
                    self.allPropertyData.append(currentPropertyData)
                    self.scriptExtensionsByName[scriptName] = currentPropertyData
                if highCodePoint is None:
                    currentPropertyData.addMatch(lowCodePoint)
                    commonScriptExtensionPropertyData.removeMatch(lowCodePoint)
                    inheritedScriptExtensionPropertyData.removeMatch(lowCodePoint)
                else:
                    currentPropertyData.addRange(lowCodePoint, highCodePoint)
                    for codePoint in range(lowCodePoint, highCodePoint + 1):
                        commonScriptExtensionPropertyData.removeMatch(codePoint)
                        inheritedScriptExtensionPropertyData.removeMatch(codePoint)
        # For the scripts that don't have any additional extension code points, share the
        # script data with the script extension of the same name.
        for scriptName, propertyData in self.scriptsByName.items():
            if scriptName not in self.scriptExtensionsByName:
                self.scriptExtensionsByName[scriptName] = propertyData

    def dump(self, file):
        """Write the script and script extension name hash tables."""
        file.write("// Scripts:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptsByName, "script")
        file.write("// Script_Extensions:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptExtensionsByName, "scriptExtension")
class GeneralCategory:
    """General_Category code point sets built from UnicodeData.txt.

    Each code point is added to its category, to the category's group (looked
    up from the first letter of the category alias), and to "Assigned"; Ll, Lt
    and Lu are also added to the LC data. Gaps between listed code points are
    added to "Cn". The module-level aliases object must be populated before an
    instance is created.
    """

    def __init__(self, file):
        self.file = file
        self.allPropertyData = []
        self.propertyDataByCategory = {}
        self.createSpecialPropertyData("Any", (0, MaxUnicode))
        self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint))
        self.assignedPropertyData = self.createSpecialPropertyData("Assigned")
        self.unassignedProperyData = self.findPropertyGroupFor("Cn")[1]
        self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1]
        self.lastAddedCodePoint = 0

    def createSpecialPropertyData(self, name, range=None):
        """Create and register a PropertyData called name, optionally seeded with a (low, high) range."""
        propertyData = PropertyData(name)
        self.allPropertyData.append(propertyData)
        self.propertyDataByCategory[name] = propertyData
        if range:
            propertyData.addRange(range[0], range[1])
        return propertyData

    def _propertyDataFor(self, category, categoryAliases):
        """Return the PropertyData registered for category, creating it on first use."""
        if category not in self.propertyDataByCategory:
            propertyData = PropertyData(category)
            propertyData.setAliases(categoryAliases)
            self.allPropertyData.append(propertyData)
            self.propertyDataByCategory[category] = propertyData
        return self.propertyDataByCategory[category]

    def findPropertyGroupFor(self, categoryAlias):
        """Return (group PropertyData, category PropertyData) for a category alias such as "Lu".

        The group is looked up from the first character of the alias. Both
        entries are created on first use, the group before the category.
        """
        category = aliases.generalCategoryForAlias(categoryAlias)
        allCategoryAliases = aliases.generalCategoryAliasesFor(category)
        categoryGroup = aliases.generalCategoryForAlias(categoryAlias[0])
        allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup)
        groupPropertyData = self._propertyDataFor(categoryGroup, allCategoryGroupAlias)
        propertyData = self._propertyDataFor(category, allCategoryAliases)
        return (groupPropertyData, propertyData)

    def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None):
        """Add codePoint, or codePoint..highCodePoint when given, to the sets for categoryAlias.

        Code points skipped since the previous call are first added to "Cn"
        and its group.
        """
        gap = codePoint - self.lastAddedCodePoint
        if gap > 1:
            for unassigned in self.findPropertyGroupFor("Cn"):
                if gap == 2:
                    unassigned.addMatch(self.lastAddedCodePoint + 1)
                else:
                    unassigned.addRange(self.lastAddedCodePoint + 1, codePoint - 1)
        targets = list(self.findPropertyGroupFor(categoryAlias))
        if categoryAlias in ("Ll", "Lt", "Lu"):
            targets.append(self.casedLetterPropertyData)
        targets.append(self.assignedPropertyData)
        # Compare against None: a code point is an integer and 0 would be falsy.
        if highCodePoint is not None:
            for propertyData in targets:
                propertyData.addRange(codePoint, highCodePoint)
            self.lastAddedCodePoint = highCodePoint
        else:
            for propertyData in targets:
                propertyData.addMatch(codePoint)
            self.lastAddedCodePoint = codePoint

    def parse(self):
        """Read self.file and populate the category sets.

        A description ending in "First>" starts a range that the next line,
        ending in "Last>", completes. Lines with fewer than three fields are
        skipped.
        """
        lastLineFirstOfRange = None
        for line in self.file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue
            fields = line.split(';')
            if len(fields) < 3:
                continue
            codePoint = int(fields[0].strip(), 16)
            description = fields[1].strip()
            categoryAlias = fields[2].strip()
            if lastLineFirstOfRange is not None:
                if description[-5:-1] == "Last":
                    self.addNextCodePoints(categoryAlias, lastLineFirstOfRange, codePoint)
                    lastLineFirstOfRange = None
                    continue
                print("Malformed First..Last pair of lines")
                # Forget the dangling "First" so it cannot pair with a later, unrelated "Last".
                lastLineFirstOfRange = None
            if description[-6:-1] == "First":
                lastLineFirstOfRange = codePoint
                continue
            self.addNextCodePoints(categoryAlias, codePoint)
        if self.lastAddedCodePoint < MaxUnicode:
            # Everything after the last listed code point is unassigned.
            for unassigned in self.findPropertyGroupFor("Cn"):
                if MaxUnicode - self.lastAddedCodePoint == 1:
                    unassigned.addMatch(MaxUnicode)
                else:
                    unassigned.addRange(self.lastAddedCodePoint + 1, MaxUnicode)

    def dump(self, file):
        """Write the General_Category name hash table."""
        file.write("// General_Category:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory")
class BinaryProperty:
    """Code point sets for the binary properties listed in SupportedBinaryProperties.

    The module-level aliases object must be populated before files are parsed.
    """

    def __init__(self):
        self.allPropertyData = []
        self.propertyDataByProperty = {}

    def parsePropertyFile(self, file):
        """Read "<code points> ; <property>" lines from file.

        Text after '#' is dropped. Unsupported properties and lines without a
        property field are skipped. A property that shows up again, in this
        file or a later one, keeps adding to its existing PropertyData rather
        than replacing it.
        """
        currentPropertyName = None
        currentPropertyData = None
        for line in file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue
            fields = line.split(';')
            if len(fields) < 2:
                continue
            codePoints = fields[0].strip()
            propertyName = fields[1].strip()
            if propertyName != currentPropertyName:
                if propertyName not in SupportedBinaryProperties:
                    continue
                currentPropertyName = propertyName
                currentPropertyData = self.propertyDataByProperty.get(propertyName)
                if currentPropertyData is None:
                    currentPropertyData = PropertyData(propertyName)
                    currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName))
                    self.allPropertyData.append(currentPropertyData)
                    self.propertyDataByProperty[propertyName] = currentPropertyData
            lowCodePoint, dotDot, highCodePoint = codePoints.partition("..")
            if not dotDot:
                currentPropertyData.addMatch(int(codePoints, 16))
            else:
                currentPropertyData.addRange(int(lowCodePoint, 16), int(highCodePoint, 16))

    def dump(self, file):
        """Write the binary property name hash table."""
        file.write("// binary properties:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty")
if __name__ == "__main__":
    # Usage: <script> <UCD-Directory> <YarrUnicodePropertyData.h>
    # Parses the UCD files and writes the generated property data to the given path.
    parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>")
    # Module-level globals read by openUCDFileOrExit() and verifyUCDFilesExist().
    UCDDirectoryPath = args[0]
    unicodeProertyDataHPath = args[1]
    verifyUCDFilesExist()

    # Each input is opened in a with block so it is closed as soon as it has been parsed.
    # aliases is the module-level global used by the parser classes below, so it must be
    # populated before any of them is constructed.
    aliases = Aliases()
    with openUCDFileOrExit("PropertyAliases.txt") as propertyAliasesFile:
        aliases.parsePropertyAliasesFile(propertyAliasesFile)
    with openUCDFileOrExit("PropertyValueAliases.txt") as propertyValueAliasesFile:
        aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile)

    # The parse order (general category, binary properties, scripts) determines the
    # PropertyData indexes and is the same as before.
    with openUCDFileOrExit("UnicodeData.txt") as unicodeDataFile:
        generalCategory = GeneralCategory(unicodeDataFile)
        generalCategory.parse()

    binaryProperty = BinaryProperty()
    for binaryPropertyFileName in ("DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "emoji-data.txt"):
        with openUCDFileOrExit(binaryPropertyFileName) as binaryPropertyFile:
            binaryProperty.parsePropertyFile(binaryPropertyFile)

    scripts = Scripts()
    with openUCDFileOrExit("Scripts.txt") as scriptsFile:
        scripts.parseScriptsFile(scriptsFile)
    with openUCDFileOrExit("ScriptExtensions.txt") as scriptExtensionsFile:
        scripts.parseScriptExtensionsFile(scriptExtensionsFile)

    # The output is opened only after all parsing succeeded, so a failed run no longer
    # leaves a file holding just the banner, and it is closed before exiting.
    with openOrExit(unicodeProertyDataHPath, "w") as propertyDataHFile:
        propertyDataHFile.write(header)
        PropertyData.dumpAll(propertyDataHFile)
        generalCategory.dump(propertyDataHFile)
        binaryProperty.dump(propertyDataHFile)
        scripts.dump(propertyDataHFile)
        propertyDataHFile.write(footer)
    sys.exit(0)