blob: ff9f8ecb3a7b71417bcc76699feafb521d04c1a6 [file] [log] [blame]
/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecCJK.h"
#include "EncodingTables.h"
#include <mutex>
#include <wtf/text/CodePointIterator.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
namespace WebCore {
// Selects which legacy CJK encoding a TextCodecCJK instance implements.
// registerCodecs() creates one codec per enumerator, keyed by the canonical encoding name.
enum class TextCodecCJK::Encoding : uint8_t {
// Registered under "EUC-JP".
EUC_JP,
// Registered under "ISO-2022-JP".
ISO2022JP,
// Registered under "Shift_JIS".
Shift_JIS,
// Registered under "EUC-KR".
EUC_KR,
// Registered under "Big5".
Big5,
// Registered under "GBK".
GBK,
// Registered under "gb18030".
GB18030
};
TextCodecCJK::TextCodecCJK(Encoding encoding)
: m_encoding(encoding)
{
checkEncodingTableInvariants();
}
// Registers every label handled by this codec with the supplied registrar.
// Each group below starts with the canonical encoding name; every entry of the group
// (the canonical name included) is registered as an alias of that first entry.
void TextCodecCJK::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // https://encoding.spec.whatwg.org/#names-and-labels
    auto registerGroup = [&registrar] (std::initializer_list<const char*> names) {
        const char* canonicalName = *names.begin();
        for (const char* name : names)
            registrar(name, canonicalName);
    };

    registerGroup({
        "Big5",
        "big5-hkscs",
        "cn-big5",
        "csbig5",
        "x-x-big5"
    });

    registerGroup({
        "EUC-JP",
        "cseucpkdfmtjapanese",
        "x-euc-jp"
    });

    registerGroup({
        "Shift_JIS",
        "csshiftjis",
        "ms932",
        "ms_kanji",
        "shift-jis",
        "sjis",
        "windows-31j",
        "x-sjis"
    });

    registerGroup({
        "EUC-KR",
        "cseuckr",
        "csksc56011987",
        "iso-ir-149",
        "korean",
        "ks_c_5601-1987",
        "ks_c_5601-1989",
        "ksc5601",
        "ksc_5601",
        "windows-949",

        // Not part of the specification; kept because WebKit has historically accepted them.
        "x-windows-949",
        "x-uhc",
    });

    registerGroup({
        "ISO-2022-JP",
        "csiso2022jp"
    });

    registerGroup({
        "GBK",
        "chinese",
        "csgb2312",
        "csiso58gb231280",
        "gb2312",
        "gb_2312",
        "gb_2312-80",
        "iso-ir-58",
        "x-gbk",

        // Not part of the specification; kept because WebKit has historically accepted them.
        "cn-gb",
        "csgb231280",
        "x-euc-cn",
        "euc-cn",
        "cp936",
        "ms936",
        "gb2312-1980",
        "windows-936",
        "windows-936-2000"
    });

    registerGroup({
        "gb18030",

        // Not part of the specification; kept because WebKit has historically accepted them.
        "ibm-1392",
        "windows-54936"
    });
}
// Registers one factory per canonical encoding name; each factory creates a
// TextCodecCJK configured for the matching Encoding value.
void TextCodecCJK::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("EUC-JP", [] { return makeUnique<TextCodecCJK>(Encoding::EUC_JP); });
    registrar("Big5", [] { return makeUnique<TextCodecCJK>(Encoding::Big5); });
    registrar("Shift_JIS", [] { return makeUnique<TextCodecCJK>(Encoding::Shift_JIS); });
    registrar("EUC-KR", [] { return makeUnique<TextCodecCJK>(Encoding::EUC_KR); });
    registrar("ISO-2022-JP", [] { return makeUnique<TextCodecCJK>(Encoding::ISO2022JP); });
    registrar("GBK", [] { return makeUnique<TextCodecCJK>(Encoding::GBK); });
    registrar("gb18030", [] { return makeUnique<TextCodecCJK>(Encoding::GB18030); });
}
using JIS0208EncodeIndex = std::array<std::pair<UChar, uint16_t>, sizeof(jis0208()) / sizeof(jis0208()[0])>;

// Returns the jis0208 index with each pair swapped to (code point, pointer) and
// stably sorted by code point, so encoders can look pointers up by code point.
static const JIS0208EncodeIndex& jis0208EncodeIndex()
{
    // Built on first use rather than at compile time: a static table would make the
    // binary much larger, and this index is often never needed.
    static JIS0208EncodeIndex* encodeIndex;
    static std::once_flag initializationFlag;
    std::call_once(initializationFlag, [&] {
        encodeIndex = new JIS0208EncodeIndex;
        const auto& decodeIndex = jis0208();
        size_t position = 0;
        for (const auto& entry : decodeIndex) {
            (*encodeIndex)[position] = { entry.second, entry.first };
            ++position;
        }
        stableSortByFirst(*encodeIndex);
    });
    return *encodeIndex;
}
// Shared decoding loop for the byte-at-a-time decoders.
//
// Feeds any byte left in m_prependedByte, then each input byte, to byteParser.
// Whenever byteParser reports an error, sawError is set and U+FFFD is appended;
// if stopOnError is set, m_lead is cleared and the text decoded so far is returned.
// After each input byte, a byte that the parser stored in m_prependedByte is re-parsed.
// When flush is set and a lead byte is still pending, it is reported as an error.
String TextCodecCJK::decodeCommon(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError, const Function<SawError(uint8_t, StringBuilder&)>& byteParser)
{
    StringBuilder output;
    output.reserveCapacity(length);

    // Parses one byte; returns true when decoding has to stop immediately.
    auto consumeByte = [&] (uint8_t byte) {
        if (byteParser(byte, output) == SawError::No)
            return false;
        sawError = true;
        output.append(replacementCharacter);
        if (!stopOnError)
            return false;
        m_lead = 0x00;
        return true;
    };

    // Re-parses the byte the parser asked to see again, if there is one.
    auto consumePrependedByte = [&] {
        return m_prependedByte && consumeByte(*std::exchange(m_prependedByte, std::nullopt));
    };

    if (consumePrependedByte())
        return output.toString();

    for (size_t position = 0; position < length; ++position) {
        if (consumeByte(bytes[position]) || consumePrependedByte())
            return output.toString();
    }

    if (flush && m_lead) {
        // The input ended in the middle of a multi-byte sequence.
        m_lead = 0x00;
        sawError = true;
        output.append(replacementCharacter);
    }
    return output.toString();
}
// Maps a jis0208 index pointer to its code point via the sorted jis0208 table.
// Callers treat an empty result as "no code point for this pointer".
static std::optional<UChar> codePointJIS0208(uint16_t pointer)
{
    const auto& index = jis0208();
    return findFirstInSortedPairs(index, pointer);
}
// Maps a jis0212 index pointer to its code point via the sorted jis0212 table.
// Callers treat an empty result as "no code point for this pointer".
static std::optional<UChar> codePointJIS0212(uint16_t pointer)
{
    const auto& index = jis0212();
    return findFirstInSortedPairs(index, pointer);
}
// https://encoding.spec.whatwg.org/#euc-jp-decoder
//
// Decodes EUC-JP bytes into a String. State carried across calls: m_lead (pending lead
// byte), m_jis0212 (set while decoding a three-byte 0x8F sequence) and m_prependedByte
// (an ASCII byte to re-parse after a failed sequence).
//
// bytes/length: input buffer. flush: the input is complete, so a dangling lead is an error.
// stopOnError: return right after the first error. sawError: set to true if any error occurs.
String TextCodecCJK::eucJPDecode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    String decoded = decodeCommon(bytes, length, flush, stopOnError, sawError, [this] (uint8_t byte, StringBuilder& result) {
        if (uint8_t lead = std::exchange(m_lead, 0x00)) {
            if (lead == 0x8E && byte >= 0xA1 && byte <= 0xDF) {
                // Half-width katakana.
                result.appendCharacter(0xFF61 - 0xA1 + byte);
                return SawError::No;
            }
            if (lead == 0x8F && byte >= 0xA1 && byte <= 0xFE) {
                // Start of a JIS X 0212 sequence; this byte becomes the new lead.
                m_jis0212 = true;
                m_lead = byte;
                return SawError::No;
            }

            // The specification unsets the jis0212 flag unconditionally at this point.
            // Clearing it only on the in-range path would let a failed 0x8F sequence
            // (e.g. 8F A1 41) leak the flag into the next two-byte sequence.
            bool useJIS0212 = std::exchange(m_jis0212, false);
            if (lead >= 0xA1 && lead <= 0xFE && byte >= 0xA1 && byte <= 0xFE) {
                uint16_t pointer = (lead - 0xA1) * 94 + byte - 0xA1;
                if (auto codePoint = useJIS0212 ? codePointJIS0212(pointer) : codePointJIS0208(pointer)) {
                    result.append(*codePoint);
                    return SawError::No;
                }
            }
            // An ASCII byte that ended a bad sequence is re-parsed on its own.
            if (isASCII(byte))
                m_prependedByte = byte;
            return SawError::Yes;
        }
        if (isASCII(byte)) {
            result.append(static_cast<char>(byte));
            return SawError::No;
        }
        if (byte == 0x8E || byte == 0x8F || (byte >= 0xA1 && byte <= 0xFE)) {
            m_lead = byte;
            return SawError::No;
        }
        return SawError::Yes;
    });

    // decodeCommon clears m_lead on flush and when it stops on an error. The jis0212 flag
    // only has meaning while a lead byte is pending, so it must not outlive the lead.
    if (!m_lead)
        m_jis0212 = false;
    return decoded;
}
// https://encoding.spec.whatwg.org/#euc-jp-encoder
//
// Encodes the string as EUC-JP. Code points without an encoding are passed to
// unencodableHandler, which appends its substitute bytes to the output itself.
static Vector<uint8_t> eucJPEncode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
    Vector<uint8_t> encoded;
    encoded.reserveInitialCapacity(string.length());

    auto upconverted = string.upconvertedCharacters();
    const UChar* begin = upconverted.get();
    const UChar* end = begin + string.length();
    for (WTF::CodePointIterator<UChar> cursor(begin, end); !cursor.atEnd(); ++cursor) {
        auto codePoint = *cursor;

        if (isASCII(codePoint)) {
            encoded.append(codePoint);
            continue;
        }

        // YEN SIGN and OVERLINE take the byte values of backslash and tilde.
        if (codePoint == 0x00A5 || codePoint == 0x203E) {
            encoded.append(codePoint == 0x00A5 ? 0x5C : 0x7E);
            continue;
        }

        // Half-width katakana: 0x8E followed by a single byte.
        if (codePoint >= 0xFF61 && codePoint <= 0xFF9F) {
            encoded.append(0x8E);
            encoded.append(codePoint - 0xFF61 + 0xA1);
            continue;
        }

        // MINUS SIGN is encoded as FULLWIDTH HYPHEN-MINUS.
        if (codePoint == 0x2212)
            codePoint = 0xFF0D;

        if (auto pointer = findFirstInSortedPairs(jis0208EncodeIndex(), codePoint)) {
            encoded.append(*pointer / 94 + 0xA1);
            encoded.append(*pointer % 94 + 0xA1);
        } else
            unencodableHandler(codePoint, encoded);
    }
    return encoded;
}
// https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
//
// Decodes ISO-2022-JP bytes into a String. State carried across calls:
// m_iso2022JPDecoderState / m_iso2022JPDecoderOutputState (current and last selected
// state), m_iso2022JPOutput (whether the previous escape sequence produced no output yet),
// m_lead, and up to two bytes to re-parse (m_prependedByte, m_iso2022JPSecondPrependedByte).
//
// bytes/length: input buffer. flush: the input is complete, so unfinished sequences are errors.
// stopOnError: return right after the first error. sawError: set to true if any error occurs.
String TextCodecCJK::iso2022JPDecode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    auto byteParser = [&] (uint8_t byte, StringBuilder& result) {
        switch (m_iso2022JPDecoderState) {
        case ISO2022JPDecoderState::ASCII:
            if (byte == 0x1B) {
                m_iso2022JPDecoderState = ISO2022JPDecoderState::EscapeStart;
                break;
            }
            if (byte <= 0x7F && byte != 0x0E && byte != 0x0F && byte != 0x1B) {
                m_iso2022JPOutput = false;
                result.append(byte);
                break;
            }
            m_iso2022JPOutput = false;
            return SawError::Yes;
        case ISO2022JPDecoderState::Roman:
            if (byte == 0x1B) {
                m_iso2022JPDecoderState = ISO2022JPDecoderState::EscapeStart;
                break;
            }
            if (byte == 0x5C) {
                m_iso2022JPOutput = false;
                result.append(static_cast<UChar>(0x00A5));
                break;
            }
            if (byte == 0x7E) {
                m_iso2022JPOutput = false;
                result.append(static_cast<UChar>(0x203E));
                break;
            }
            if (byte <= 0x7F && byte != 0x0E && byte != 0x0F && byte != 0x1B && byte != 0x5C && byte != 0x7E) {
                m_iso2022JPOutput = false;
                result.append(byte);
                break;
            }
            m_iso2022JPOutput = false;
            return SawError::Yes;
        case ISO2022JPDecoderState::Katakana:
            if (byte == 0x1B) {
                m_iso2022JPDecoderState = ISO2022JPDecoderState::EscapeStart;
                break;
            }
            if (byte >= 0x21 && byte <= 0x5F) {
                m_iso2022JPOutput = false;
                result.append(static_cast<UChar>(0xFF61 - 0x21 + byte));
                break;
            }
            m_iso2022JPOutput = false;
            return SawError::Yes;
        case ISO2022JPDecoderState::LeadByte:
            if (byte == 0x1B) {
                m_iso2022JPDecoderState = ISO2022JPDecoderState::EscapeStart;
                break;
            }
            if (byte >= 0x21 && byte <= 0x7E) {
                m_iso2022JPOutput = false;
                m_lead = byte;
                m_iso2022JPDecoderState = ISO2022JPDecoderState::TrailByte;
                break;
            }
            m_iso2022JPOutput = false;
            return SawError::Yes;
        case ISO2022JPDecoderState::TrailByte:
            if (byte == 0x1B) {
                m_iso2022JPDecoderState = ISO2022JPDecoderState::EscapeStart;
                return SawError::Yes;
            }
            m_iso2022JPDecoderState = ISO2022JPDecoderState::LeadByte;
            if (byte >= 0x21 && byte <= 0x7E) {
                uint16_t pointer = (m_lead - 0x21) * 94 + byte - 0x21;
                if (auto codePoint = codePointJIS0208(pointer)) {
                    result.append(*codePoint);
                    break;
                }
                return SawError::Yes;
            }
            return SawError::Yes;
        case ISO2022JPDecoderState::EscapeStart:
            if (byte == 0x24 || byte == 0x28) {
                m_lead = byte;
                m_iso2022JPDecoderState = ISO2022JPDecoderState::Escape;
                break;
            }
            // Not an escape sequence: report the ESC as an error and re-parse this byte
            // in the state that was active before the ESC.
            m_prependedByte = byte;
            m_iso2022JPOutput = false;
            m_iso2022JPDecoderState = m_iso2022JPDecoderOutputState;
            return SawError::Yes;
        case ISO2022JPDecoderState::Escape: {
            uint8_t lead = std::exchange(m_lead, 0x00);
            std::optional<ISO2022JPDecoderState> state;
            if (lead == 0x28) {
                if (byte == 0x42)
                    state = ISO2022JPDecoderState::ASCII;
                else if (byte == 0x4A)
                    state = ISO2022JPDecoderState::Roman;
                else if (byte == 0x49)
                    state = ISO2022JPDecoderState::Katakana;
            } else if (lead == 0x24 && (byte == 0x40 || byte == 0x42))
                state = ISO2022JPDecoderState::LeadByte;
            if (state) {
                m_iso2022JPDecoderState = *state;
                m_iso2022JPDecoderOutputState = *state;
                // Two escape sequences in a row with no output in between are an error.
                if (std::exchange(m_iso2022JPOutput, true))
                    return SawError::Yes;
                break;
            }
            // Unknown escape sequence: report the ESC as an error and re-parse both of
            // the bytes that followed it.
            m_prependedByte = lead;
            m_iso2022JPSecondPrependedByte = byte;
            m_iso2022JPOutput = false;
            m_iso2022JPDecoderState = m_iso2022JPDecoderOutputState;
            return SawError::Yes;
        }
        }
        return SawError::No;
    };

    StringBuilder result;
    result.reserveCapacity(length);

    // Parses one byte and records any error; returns true when decoding has to stop.
    auto parseAndReportError = [&] (uint8_t byte) {
        if (byteParser(byte, result) == SawError::No)
            return false;
        sawError = true;
        result.append(replacementCharacter);
        if (!stopOnError)
            return false;
        m_lead = 0x00;
        return true;
    };

    // Re-parses the bytes queued by the escape handling, first then second.
    // An error on the second byte is reported exactly like any other error; it used to
    // be dropped unless stopOnError was set, which lost a U+FFFD in replacement mode.
    auto parsePrependedBytes = [&] {
        if (m_prependedByte && parseAndReportError(*std::exchange(m_prependedByte, std::nullopt)))
            return true;
        if (m_iso2022JPSecondPrependedByte && parseAndReportError(*std::exchange(m_iso2022JPSecondPrependedByte, std::nullopt)))
            return true;
        return false;
    };

    if (parsePrependedBytes())
        return result.toString();

    for (size_t i = 0; i < length; i++) {
        if (parseAndReportError(bytes[i]) || parsePrependedBytes())
            return result.toString();
    }

    if (flush) {
        switch (m_iso2022JPDecoderState) {
        case ISO2022JPDecoderState::ASCII:
        case ISO2022JPDecoderState::Roman:
        case ISO2022JPDecoderState::Katakana:
        case ISO2022JPDecoderState::LeadByte:
            break;
        case ISO2022JPDecoderState::TrailByte:
            m_iso2022JPDecoderState = ISO2022JPDecoderState::LeadByte;
            FALLTHROUGH;
        case ISO2022JPDecoderState::EscapeStart:
            sawError = true;
            result.append(replacementCharacter);
            break;
        case ISO2022JPDecoderState::Escape:
            sawError = true;
            result.append(replacementCharacter);
            if (m_lead) {
                // The byte after the ESC is emitted as-is after the replacement character.
                ASSERT(isASCII(m_lead));
                result.append(std::exchange(m_lead, 0x00));
            }
            break;
        }
    }
    return result.toString();
}
// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder
//
// Encodes the string as ISO-2022-JP, emitting escape sequences whenever the encoder
// has to switch between the ASCII, Roman and JIS X 0208 states. The output always ends
// in the ASCII state. Unencodable code points go to unencodableHandler (after switching
// back to ASCII if the encoder was in the JIS X 0208 state).
static Vector<uint8_t> iso2022JPEncode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
    enum class State : uint8_t { ASCII, Roman, Jis0208 };
    State state { State::ASCII };

    Vector<uint8_t> result;
    result.reserveInitialCapacity(string.length());

    // Appends ESC followed by the two bytes that select a character set.
    auto appendEscapeSequence = [&result] (uint8_t second, uint8_t third) {
        result.append(0x1B);
        result.append(second);
        result.append(third);
    };

    auto switchToASCII = [&] {
        state = State::ASCII;
        appendEscapeSequence(0x28, 0x42);
    };

    auto reportUnencodable = [&] (UChar32 codePoint, Vector<uint8_t>& output) {
        if (state == State::Jis0208)
            switchToASCII();
        unencodableHandler(codePoint, output);
    };

    Function<void(UChar32)> encodeCodePoint;
    encodeCodePoint = [&] (UChar32 codePoint) {
        // SO, SI and ESC cannot be represented in the ASCII or Roman states.
        const bool isForbiddenControl = codePoint == 0x000E || codePoint == 0x000F || codePoint == 0x001B;
        if (state != State::Jis0208 && isForbiddenControl) {
            reportUnencodable(replacementCharacter, result);
            return;
        }

        if (state == State::ASCII && isASCII(codePoint)) {
            result.append(codePoint);
            return;
        }

        if (state == State::Roman) {
            // In the Roman state 0x5C and 0x7E stand for YEN SIGN and OVERLINE.
            if (isASCII(codePoint) && codePoint != 0x005C && codePoint != 0x007E) {
                result.append(codePoint);
                return;
            }
            if (codePoint == 0x00A5) {
                result.append(0x5C);
                return;
            }
            if (codePoint == 0x203E) {
                result.append(0x7E);
                return;
            }
        }

        if (isASCII(codePoint) && state != State::ASCII) {
            switchToASCII();
            encodeCodePoint(codePoint);
            return;
        }

        if ((codePoint == 0x00A5 || codePoint == 0x203E) && state != State::Roman) {
            state = State::Roman;
            appendEscapeSequence(0x28, 0x4A);
            encodeCodePoint(codePoint);
            return;
        }

        // MINUS SIGN is encoded as FULLWIDTH HYPHEN-MINUS.
        if (codePoint == 0x2212)
            codePoint = 0xFF0D;

        if (codePoint >= 0xFF61 && codePoint <= 0xFF9F) {
            // Half-width katakana are replaced by their full-width counterparts.
            // From https://encoding.spec.whatwg.org/index-iso-2022-jp-katakana.txt
            static constexpr std::array<UChar32, 63> iso2022JPKatakana {
                0x3002, 0x300C, 0x300D, 0x3001, 0x30FB, 0x30F2, 0x30A1, 0x30A3, 0x30A5, 0x30A7, 0x30A9, 0x30E3, 0x30E5, 0x30E7, 0x30C3, 0x30FC,
                0x30A2, 0x30A4, 0x30A6, 0x30A8, 0x30AA, 0x30AB, 0x30AD, 0x30AF, 0x30B1, 0x30B3, 0x30B5, 0x30B7, 0x30B9, 0x30BB, 0x30BD, 0x30BF,
                0x30C1, 0x30C4, 0x30C6, 0x30C8, 0x30CA, 0x30CB, 0x30CC, 0x30CD, 0x30CE, 0x30CF, 0x30D2, 0x30D5, 0x30D8, 0x30DB, 0x30DE, 0x30DF,
                0x30E0, 0x30E1, 0x30E2, 0x30E4, 0x30E6, 0x30E8, 0x30E9, 0x30EA, 0x30EB, 0x30EC, 0x30ED, 0x30EF, 0x30F3, 0x309B, 0x309C
            };
            static_assert(std::size(iso2022JPKatakana) == 0xFF9F - 0xFF61 + 1);
            codePoint = iso2022JPKatakana[codePoint - 0xFF61];
        }

        auto pointer = findFirstInSortedPairs(jis0208EncodeIndex(), codePoint);
        if (!pointer) {
            reportUnencodable(codePoint, result);
            return;
        }

        if (state != State::Jis0208) {
            state = State::Jis0208;
            appendEscapeSequence(0x24, 0x42);
            encodeCodePoint(codePoint);
            return;
        }

        result.append(*pointer / 94 + 0x21);
        result.append(*pointer % 94 + 0x21);
    };

    auto upconverted = string.upconvertedCharacters();
    const UChar* begin = upconverted.get();
    const UChar* end = begin + string.length();
    for (WTF::CodePointIterator<UChar> cursor(begin, end); !cursor.atEnd(); ++cursor)
        encodeCodePoint(*cursor);

    if (state != State::ASCII)
        switchToASCII();

    return result;
}
// https://encoding.spec.whatwg.org/#shift_jis-decoder
//
// Decodes Shift_JIS bytes into a String, keeping a pending lead byte in m_lead and an
// ASCII byte to re-parse in m_prependedByte. Iteration, flushing and error reporting
// are delegated to decodeCommon.
String TextCodecCJK::shiftJISDecode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    return decodeCommon(bytes, length, flush, stopOnError, sawError, [this] (uint8_t byte, StringBuilder& result) {
        const uint8_t lead = std::exchange(m_lead, 0x00);

        if (!lead) {
            if (isASCII(byte) || byte == 0x80) {
                result.append(byte);
                return SawError::No;
            }
            // Single-byte half-width katakana.
            if (byte >= 0xA1 && byte <= 0xDF) {
                result.append(static_cast<UChar>(0xFF61 - 0xA1 + byte));
                return SawError::No;
            }
            const bool isLeadByte = (byte >= 0x81 && byte <= 0x9F) || (byte >= 0xE0 && byte <= 0xFC);
            if (isLeadByte) {
                m_lead = byte;
                return SawError::No;
            }
            return SawError::Yes;
        }

        const bool isTrailByte = (byte >= 0x40 && byte <= 0x7E) || (byte >= 0x80 && byte <= 0xFC);
        if (isTrailByte) {
            const uint8_t trailOffset = byte < 0x7F ? 0x40 : 0x41;
            const uint8_t leadOffset = lead < 0xA0 ? 0x81 : 0xC1;
            uint16_t pointer = (lead - leadOffset) * 188 + byte - trailOffset;
            // Pointers 8836 to 10715 map linearly onto U+E000 and above.
            if (pointer >= 8836 && pointer <= 10715) {
                result.append(static_cast<UChar>(0xE000 - 8836 + pointer));
                return SawError::No;
            }
            if (auto codePoint = codePointJIS0208(pointer)) {
                result.append(*codePoint);
                return SawError::No;
            }
        }

        // An ASCII byte that ended a bad sequence is re-parsed on its own.
        if (isASCII(byte))
            m_prependedByte = byte;
        return SawError::Yes;
    });
}
// https://encoding.spec.whatwg.org/#shift_jis-encoder
//
// Encodes the string as Shift_JIS. Code points without an encoding are passed to
// unencodableHandler, which appends its substitute bytes to the output itself.
static Vector<uint8_t> shiftJISEncode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
    Vector<uint8_t> result;
    result.reserveInitialCapacity(string.length());

    auto characters = string.upconvertedCharacters();
    for (WTF::CodePointIterator<UChar> iterator(characters.get(), characters.get() + string.length()); !iterator.atEnd(); ++iterator) {
        auto codePoint = *iterator;
        if (isASCII(codePoint) || codePoint == 0x0080) {
            result.append(codePoint);
            continue;
        }
        // YEN SIGN and OVERLINE take the byte values of backslash and tilde.
        if (codePoint == 0x00A5) {
            result.append(0x5C);
            continue;
        }
        if (codePoint == 0x203E) {
            result.append(0x7E);
            continue;
        }
        // Half-width katakana are single bytes.
        if (codePoint >= 0xFF61 && codePoint <= 0xFF9F) {
            result.append(codePoint - 0xFF61 + 0xA1);
            continue;
        }
        // MINUS SIGN is encoded as FULLWIDTH HYPHEN-MINUS.
        if (codePoint == 0x2212)
            codePoint = 0xFF0D;

        auto range = findInSortedPairs(jis0208EncodeIndex(), codePoint);
        ASSERT(range.first + 3 >= range.second);

        // The Shift_JIS pointer is the first jis0208 pointer for the code point that is
        // outside 8272 to 8835 (https://encoding.spec.whatwg.org/#index-shift_jis-pointer).
        bool encoded = false;
        for (auto pair = range.first; pair < range.second; pair++) {
            uint16_t pointer = pair->second;
            if (pointer >= 8272 && pointer <= 8835)
                continue;
            uint8_t lead = pointer / 188;
            uint8_t leadOffset = lead < 0x1F ? 0x81 : 0xC1;
            uint8_t trail = pointer % 188;
            uint8_t offset = trail < 0x3F ? 0x40 : 0x41;
            result.append(lead + leadOffset);
            result.append(trail + offset);
            encoded = true;
            break;
        }

        // No usable pointer: either the code point is not in the index at all, or every
        // entry for it is in the excluded range. Both are unencodable rather than
        // something to drop silently.
        if (!encoded)
            unencodableHandler(codePoint, result);
    }
    return result;
}
using EUCKREncodingIndex = std::array<std::pair<UChar, uint16_t>, sizeof(eucKR()) / sizeof(eucKR()[0])>;

// Returns the euc-kr index with each pair swapped to (code point, pointer) and sorted by
// code point, so the encoder can look pointers up by code point.
static const EUCKREncodingIndex& eucKREncodingIndex()
{
    // Built on first use rather than at compile time: a static table would make the
    // binary much larger, and this index is often never needed.
    static EUCKREncodingIndex* encodeIndex;
    static std::once_flag initializationFlag;
    std::call_once(initializationFlag, [&] {
        encodeIndex = new EUCKREncodingIndex;
        const auto& decodeIndex = eucKR();
        size_t position = 0;
        for (const auto& entry : decodeIndex) {
            (*encodeIndex)[position] = { entry.second, entry.first };
            ++position;
        }
        sortByFirst(*encodeIndex);
        // Each code point is expected to appear exactly once in this index.
        ASSERT(sortedFirstsAreUnique(*encodeIndex));
    });
    return *encodeIndex;
}
// https://encoding.spec.whatwg.org/#euc-kr-encoder
//
// Encodes the string as EUC-KR. Code points without an encoding are passed to
// unencodableHandler, which appends its substitute bytes to the output itself.
static Vector<uint8_t> eucKREncode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
    Vector<uint8_t> encoded;
    encoded.reserveInitialCapacity(string.length());

    auto upconverted = string.upconvertedCharacters();
    const UChar* begin = upconverted.get();
    const UChar* end = begin + string.length();
    for (WTF::CodePointIterator<UChar> cursor(begin, end); !cursor.atEnd(); ++cursor) {
        auto codePoint = *cursor;
        if (isASCII(codePoint)) {
            encoded.append(codePoint);
            continue;
        }

        if (auto pointer = findFirstInSortedPairs(eucKREncodingIndex(), codePoint)) {
            // Lead byte from the row (190 entries per row), trail byte from the column.
            encoded.append(*pointer / 190 + 0x81);
            encoded.append(*pointer % 190 + 0x41);
        } else
            unencodableHandler(codePoint, encoded);
    }
    return encoded;
}
// https://encoding.spec.whatwg.org/#euc-kr-decoder
//
// Decodes EUC-KR bytes into a String, keeping a pending lead byte in m_lead and an
// ASCII byte to re-parse in m_prependedByte. Iteration, flushing and error reporting
// are delegated to decodeCommon.
String TextCodecCJK::eucKRDecode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    return decodeCommon(bytes, length, flush, stopOnError, sawError, [this] (uint8_t byte, StringBuilder& result) {
        const uint8_t lead = std::exchange(m_lead, 0x00);

        if (!lead) {
            if (isASCII(byte)) {
                result.append(byte);
                return SawError::No;
            }
            if (byte >= 0x81 && byte <= 0xFE) {
                m_lead = byte;
                return SawError::No;
            }
            return SawError::Yes;
        }

        if (byte >= 0x41 && byte <= 0xFE) {
            if (auto codePoint = findFirstInSortedPairs(eucKR(), (lead - 0x81) * 190 + byte - 0x41)) {
                result.append(*codePoint);
                return SawError::No;
            }
        }

        // An ASCII byte that ended a bad sequence is re-parsed on its own.
        if (isASCII(byte))
            m_prependedByte = byte;
        return SawError::Yes;
    });
}
// Number of leading big5 index entries left out of the encode index: those whose pointer
// is below (0xA1 - 0x81) * 157, see https://encoding.spec.whatwg.org/#index-big5-pointer
static constexpr size_t big5EncodeIndexSkippedEntries = 3904;

using Big5EncodeIndex = std::array<std::pair<UChar32, uint16_t>, sizeof(big5()) / sizeof(big5()[0]) - big5EncodeIndexSkippedEntries>;

// Returns the encodable part of the big5 index with each pair swapped to
// (code point, pointer) and stably sorted by code point.
static const Big5EncodeIndex& big5EncodeIndex()
{
    // Built on first use rather than at compile time: a static table would make the
    // binary much larger, and this index is often never needed.
    static Big5EncodeIndex* encodeIndex;
    static std::once_flag initializationFlag;
    std::call_once(initializationFlag, [&] {
        encodeIndex = new Big5EncodeIndex;
        const auto& decodeIndex = big5();

        // The skipped prefix must end exactly where the encodable pointers begin.
        ASSERT(decodeIndex[big5EncodeIndexSkippedEntries - 1].first == (0xA1 - 0x81) * 157 - 1);
        ASSERT(decodeIndex[big5EncodeIndexSkippedEntries].first == (0xA1 - 0x81) * 157);

        for (size_t source = big5EncodeIndexSkippedEntries; source < decodeIndex.size(); ++source) {
            const auto& entry = decodeIndex[source];
            (*encodeIndex)[source - big5EncodeIndexSkippedEntries] = { entry.second, entry.first };
        }
        stableSortByFirst(*encodeIndex);
    });
    return *encodeIndex;
}
// https://encoding.spec.whatwg.org/#big5-encoder
//
// Encodes the string as Big5. Code points without an encoding are passed to
// unencodableHandler, which appends its substitute bytes to the output itself.
static Vector<uint8_t> big5Encode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
    Vector<uint8_t> encoded;
    encoded.reserveInitialCapacity(string.length());

    auto upconverted = string.upconvertedCharacters();
    const UChar* begin = upconverted.get();
    const UChar* end = begin + string.length();
    for (WTF::CodePointIterator<UChar> cursor(begin, end); !cursor.atEnd(); ++cursor) {
        auto codePoint = *cursor;
        if (isASCII(codePoint)) {
            encoded.append(codePoint);
            continue;
        }

        auto pointerRange = findInSortedPairs(big5EncodeIndex(), codePoint);
        if (pointerRange.first == pointerRange.second) {
            unencodableHandler(codePoint, encoded);
            continue;
        }

        // These six code points use the last matching pointer; all others use the first.
        const bool usesLastPointer = codePoint == 0x2550 || codePoint == 0x255E || codePoint == 0x2561
            || codePoint == 0x256A || codePoint == 0x5341 || codePoint == 0x5345;
        uint16_t pointer = usesLastPointer ? (pointerRange.second - 1)->second : pointerRange.first->second;

        // Pointers below this bound are decode-only.
        if (pointer < 157 * (0xA1 - 0x81)) {
            unencodableHandler(codePoint, encoded);
            continue;
        }

        uint8_t lead = pointer / 157 + 0x81;
        uint8_t trail = pointer % 157;
        uint8_t trailOffset = trail < 0x3F ? 0x40 : 0x62;
        encoded.append(lead);
        encoded.append(trail + trailOffset);
    }
    return encoded;
}
// https://encoding.spec.whatwg.org/index-gb18030-ranges.txt
// Returns the gb18030 ranges table: 207 (pointer, code point) pairs, each giving the first
// pointer of a range and the code point it maps to. Entries are listed in ascending order
// of both members, which gb18030RangesCodePoint() and gb18030RangesPointer() rely on when
// they search the table with std::upper_bound.
static const std::array<std::pair<uint32_t, UChar32>, 207>& gb18030Ranges()
{
static std::array<std::pair<uint32_t, UChar32>, 207> ranges {{
{ 0, 0x0080 }, { 36, 0x00A5 }, { 38, 0x00A9 }, { 45, 0x00B2 }, { 50, 0x00B8 }, { 81, 0x00D8 }, { 89, 0x00E2 }, { 95, 0x00EB },
{ 96, 0x00EE }, { 100, 0x00F4 }, { 103, 0x00F8 }, { 104, 0x00FB }, { 105, 0x00FD }, { 109, 0x0102 }, { 126, 0x0114 }, { 133, 0x011C },
{ 148, 0x012C }, { 172, 0x0145 }, { 175, 0x0149 }, { 179, 0x014E }, { 208, 0x016C }, { 306, 0x01CF }, { 307, 0x01D1 }, { 308, 0x01D3 },
{ 309, 0x01D5 }, { 310, 0x01D7 }, { 311, 0x01D9 }, { 312, 0x01DB }, { 313, 0x01DD }, { 341, 0x01FA }, { 428, 0x0252 }, { 443, 0x0262 },
{ 544, 0x02C8 }, { 545, 0x02CC }, { 558, 0x02DA }, { 741, 0x03A2 }, { 742, 0x03AA }, { 749, 0x03C2 }, { 750, 0x03CA }, { 805, 0x0402 },
{ 819, 0x0450 }, { 820, 0x0452 }, { 7922, 0x2011 }, { 7924, 0x2017 }, { 7925, 0x201A }, { 7927, 0x201E }, { 7934, 0x2027 }, { 7943, 0x2031 },
{ 7944, 0x2034 }, { 7945, 0x2036 }, { 7950, 0x203C }, { 8062, 0x20AD }, { 8148, 0x2104 }, { 8149, 0x2106 }, { 8152, 0x210A }, { 8164, 0x2117 },
{ 8174, 0x2122 }, { 8236, 0x216C }, { 8240, 0x217A }, { 8262, 0x2194 }, { 8264, 0x219A }, { 8374, 0x2209 }, { 8380, 0x2210 }, { 8381, 0x2212 },
{ 8384, 0x2216 }, { 8388, 0x221B }, { 8390, 0x2221 }, { 8392, 0x2224 }, { 8393, 0x2226 }, { 8394, 0x222C }, { 8396, 0x222F }, { 8401, 0x2238 },
{ 8406, 0x223E }, { 8416, 0x2249 }, { 8419, 0x224D }, { 8424, 0x2253 }, { 8437, 0x2262 }, { 8439, 0x2268 }, { 8445, 0x2270 }, { 8482, 0x2296 },
{ 8485, 0x229A }, { 8496, 0x22A6 }, { 8521, 0x22C0 }, { 8603, 0x2313 }, { 8936, 0x246A }, { 8946, 0x249C }, { 9046, 0x254C }, { 9050, 0x2574 },
{ 9063, 0x2590 }, { 9066, 0x2596 }, { 9076, 0x25A2 }, { 9092, 0x25B4 }, { 9100, 0x25BE }, { 9108, 0x25C8 }, { 9111, 0x25CC }, { 9113, 0x25D0 },
{ 9131, 0x25E6 }, { 9162, 0x2607 }, { 9164, 0x260A }, { 9218, 0x2641 }, { 9219, 0x2643 }, { 11329, 0x2E82 }, { 11331, 0x2E85 }, { 11334, 0x2E89 },
{ 11336, 0x2E8D }, { 11346, 0x2E98 }, { 11361, 0x2EA8 }, { 11363, 0x2EAB }, { 11366, 0x2EAF }, { 11370, 0x2EB4 }, { 11372, 0x2EB8 }, { 11375, 0x2EBC },
{ 11389, 0x2ECB }, { 11682, 0x2FFC }, { 11686, 0x3004 }, { 11687, 0x3018 }, { 11692, 0x301F }, { 11694, 0x302A }, { 11714, 0x303F }, { 11716, 0x3094 },
{ 11723, 0x309F }, { 11725, 0x30F7 }, { 11730, 0x30FF }, { 11736, 0x312A }, { 11982, 0x322A }, { 11989, 0x3232 }, { 12102, 0x32A4 }, { 12336, 0x3390 },
{ 12348, 0x339F }, { 12350, 0x33A2 }, { 12384, 0x33C5 }, { 12393, 0x33CF }, { 12395, 0x33D3 }, { 12397, 0x33D6 }, { 12510, 0x3448 }, { 12553, 0x3474 },
{ 12851, 0x359F }, { 12962, 0x360F }, { 12973, 0x361B }, { 13738, 0x3919 }, { 13823, 0x396F }, { 13919, 0x39D1 }, { 13933, 0x39E0 }, { 14080, 0x3A74 },
{ 14298, 0x3B4F }, { 14585, 0x3C6F }, { 14698, 0x3CE1 }, { 15583, 0x4057 }, { 15847, 0x4160 }, { 16318, 0x4338 }, { 16434, 0x43AD }, { 16438, 0x43B2 },
{ 16481, 0x43DE }, { 16729, 0x44D7 }, { 17102, 0x464D }, { 17122, 0x4662 }, { 17315, 0x4724 }, { 17320, 0x472A }, { 17402, 0x477D }, { 17418, 0x478E },
{ 17859, 0x4948 }, { 17909, 0x497B }, { 17911, 0x497E }, { 17915, 0x4984 }, { 17916, 0x4987 }, { 17936, 0x499C }, { 17939, 0x49A0 }, { 17961, 0x49B8 },
{ 18664, 0x4C78 }, { 18703, 0x4CA4 }, { 18814, 0x4D1A }, { 18962, 0x4DAF }, { 19043, 0x9FA6 }, { 33469, 0xE76C }, { 33470, 0xE7C8 }, { 33471, 0xE7E7 },
{ 33484, 0xE815 }, { 33485, 0xE819 }, { 33490, 0xE81F }, { 33497, 0xE827 }, { 33501, 0xE82D }, { 33505, 0xE833 }, { 33513, 0xE83C }, { 33520, 0xE844 },
{ 33536, 0xE856 }, { 33550, 0xE865 }, { 37845, 0xF92D }, { 37921, 0xF97A }, { 37948, 0xF996 }, { 38029, 0xF9E8 }, { 38038, 0xF9F2 }, { 38064, 0xFA10 },
{ 38065, 0xFA12 }, { 38066, 0xFA15 }, { 38069, 0xFA19 }, { 38075, 0xFA22 }, { 38076, 0xFA25 }, { 38078, 0xFA2A }, { 39108, 0xFE32 }, { 39109, 0xFE45 },
{ 39113, 0xFE53 }, { 39114, 0xFE58 }, { 39115, 0xFE67 }, { 39116, 0xFE6C }, { 39265, 0xFF5F }, { 39394, 0xFFE6 }, { 189000, 0x10000 }
}};
return ranges;
}
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point
//
// Converts a four-byte gb18030 pointer to a code point using the ranges table.
// Returns std::nullopt for pointers that fall outside the mapped ranges.
static std::optional<UChar32> gb18030RangesCodePoint(uint32_t pointer)
{
    const bool isInUnmappedGap = pointer > 39419 && pointer < 189000;
    const bool isBeyondLastPointer = pointer > 1237575;
    if (isInUnmappedGap || isBeyondLastPointer)
        return std::nullopt;

    // Single exception that is not covered by the ranges table.
    if (pointer == 7457)
        return 0xE7C7;

    // Find the last range that starts at or before the pointer.
    const auto& ranges = gb18030Ranges();
    auto firstRangeAfter = std::upper_bound(ranges.begin(), ranges.end(), makeFirstAdapter(pointer), CompareFirst { });
    ASSERT(firstRangeAfter != ranges.begin());
    const auto& range = *(firstRangeAfter - 1);
    return range.second + pointer - range.first;
}
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
//
// Converts a code point to its four-byte gb18030 pointer using the ranges table.
static uint32_t gb18030RangesPointer(UChar32 codePoint)
{
    // Single exception that is not covered by the ranges table.
    if (codePoint == 0xE7C7)
        return 7457;

    // Find the last range whose first code point is at or before the given one.
    const auto& ranges = gb18030Ranges();
    auto firstRangeAfter = std::upper_bound(ranges.begin(), ranges.end(), makeSecondAdapter(codePoint), CompareSecond { });
    ASSERT(firstRangeAfter != ranges.begin());
    const auto& range = *(firstRangeAfter - 1);
    return range.first + codePoint - range.second;
}
using GB18030EncodeIndex = std::array<std::pair<UChar, uint16_t>, 23940>;

// Returns (code point, pointer) pairs for every entry of the gb18030 index, stably sorted
// by code point, so the encoder can look pointers up by code point.
static const GB18030EncodeIndex& gb18030EncodeIndex()
{
    // Built on first use rather than at compile time: a static table would make the
    // binary much larger, and this index is often never needed.
    static GB18030EncodeIndex* encodeIndex;
    static std::once_flag initializationFlag;
    std::call_once(initializationFlag, [&] {
        encodeIndex = new GB18030EncodeIndex;
        const auto& decodeIndex = gb18030();
        const auto entryCount = decodeIndex.size();
        // In the decode index the pointer is simply the position of the code point.
        for (uint16_t pointer = 0; pointer < entryCount; ++pointer)
            (*encodeIndex)[pointer] = { decodeIndex[pointer], pointer };
        stableSortByFirst(*encodeIndex);
    });
    return *encodeIndex;
}
// https://encoding.spec.whatwg.org/#gb18030-decoder
// Decodes gb18030 bytes into a String. Up to three pending bytes of a multi-byte sequence
// are kept in m_gb18030First, m_gb18030Second and m_gb18030Third between bytes (and between
// calls). Iteration over the input is delegated to decodeCommon; the pending gb18030 bytes
// are checked here after it returns because decodeCommon only knows about m_lead.
String TextCodecCJK::gb18030Decode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
// Declared before it is assigned so that the lambda can call itself recursively.
Function<SawError(uint8_t, StringBuilder&)> parseByte;
parseByte = [&] (uint8_t byte, StringBuilder& result) {
// Three bytes pending: this byte must be the final digit of a four-byte sequence.
if (m_gb18030Third) {
if (byte < 0x30 || byte > 0x39) {
// Invalid fourth byte: report an error for the sequence, then re-parse the second,
// third and current bytes from a clean state, reporting their errors inline.
sawError = true;
result.append(replacementCharacter);
m_gb18030First = 0x00;
uint8_t second = std::exchange(m_gb18030Second, 0x00);
uint8_t third = std::exchange(m_gb18030Third, 0x00);
if (parseByte(second, result) == SawError::Yes) {
sawError = true;
result.append(replacementCharacter);
}
if (parseByte(third, result) == SawError::Yes) {
sawError = true;
result.append(replacementCharacter);
}
// The outcome for the current byte is left to the caller to report.
return parseByte(byte, result);
}
uint8_t first = std::exchange(m_gb18030First, 0x00);
uint8_t second = std::exchange(m_gb18030Second, 0x00);
uint8_t third = std::exchange(m_gb18030Third, 0x00);
// Combine the four bytes into a ranges pointer and map it to a code point.
if (auto codePoint = gb18030RangesCodePoint(((first - 0x81) * 10 * 126 * 10) + ((second - 0x30) * 10 * 126) + ((third - 0x81) * 10) + byte - 0x30)) {
result.appendCharacter(*codePoint);
return SawError::No;
}
return SawError::Yes;
}
// Two bytes pending: this byte must be the third byte of a four-byte sequence.
if (m_gb18030Second) {
if (byte >= 0x81 && byte <= 0xFE) {
m_gb18030Third = byte;
return SawError::No;
}
// Invalid third byte: report an error for the sequence, then re-parse the second
// and current bytes from a clean state.
sawError = true;
result.append(replacementCharacter);
m_gb18030First = 0x00;
if (parseByte(std::exchange(m_gb18030Second, 0x00), result) == SawError::Yes) {
sawError = true;
result.append(replacementCharacter);
}
return parseByte(byte, result);
}
// One byte pending: a digit continues a four-byte sequence, anything else is treated
// as the trail byte of a two-byte sequence.
if (m_gb18030First) {
if (byte >= 0x30 && byte <= 0x39) {
m_gb18030Second = byte;
return SawError::No;
}
uint8_t lead = std::exchange(m_gb18030First, 0x00);
uint8_t offset = byte < 0x7F ? 0x40 : 0x41;
if ((byte >= 0x40 && byte <= 0x7E) || (byte >= 0x80 && byte <= 0xFE)) {
size_t pointer = (lead - 0x81) * 190 + byte - offset;
if (pointer < gb18030().size()) {
result.append(gb18030()[pointer]);
return SawError::No;
}
}
// An ASCII byte that ended a bad sequence is queued to be parsed again on its own.
if (isASCII(byte))
m_prependedByte = byte;
return SawError::Yes;
}
// No bytes pending.
if (isASCII(byte)) {
result.append(byte);
return SawError::No;
}
// 0x80 decodes to U+20AC (EURO SIGN).
if (byte == 0x80) {
result.appendCharacter(0x20AC);
return SawError::No;
}
if (byte >= 0x81 && byte <= 0xFE) {
m_gb18030First = byte;
return SawError::No;
}
return SawError::Yes;
};
auto result = decodeCommon(bytes, length, flush, stopOnError, sawError, parseByte);
// The input ended in the middle of a multi-byte sequence: drop the pending bytes and
// report a single error for them.
if (flush && (m_gb18030First || m_gb18030Second || m_gb18030Third)) {
m_gb18030First = 0x00;
m_gb18030Second = 0x00;
m_gb18030Third = 0x00;
sawError = true;
result = makeString(result, replacementCharacter);
}
return result;
}
// https://encoding.spec.whatwg.org/#gb18030-encoder
enum class IsGBK : bool { No, Yes };

// Shared encoder for GBK and gb18030. With IsGBK::Yes, U+20AC is encoded as the single
// byte 0x80 and code points outside the two-byte index are unencodable; with IsGBK::No
// such code points are encoded as four bytes through the ranges table. Unencodable code
// points are passed to unencodableHandler, which appends its substitute bytes itself.
static Vector<uint8_t> gbEncodeShared(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler, IsGBK isGBK)
{
    const bool encodingAsGBK = isGBK == IsGBK::Yes;

    Vector<uint8_t> encoded;
    encoded.reserveInitialCapacity(string.length());

    auto upconverted = string.upconvertedCharacters();
    const UChar* begin = upconverted.get();
    const UChar* end = begin + string.length();
    for (WTF::CodePointIterator<UChar> cursor(begin, end); !cursor.atEnd(); ++cursor) {
        auto codePoint = *cursor;

        if (isASCII(codePoint)) {
            encoded.append(codePoint);
            continue;
        }

        // U+E5E5 is never encodable.
        if (codePoint == 0xE5E5) {
            unencodableHandler(codePoint, encoded);
            continue;
        }

        if (encodingAsGBK && codePoint == 0x20AC) {
            encoded.append(0x80);
            continue;
        }

        // Two-byte form, using the first pointer recorded for the code point.
        auto pointerRange = findInSortedPairs(gb18030EncodeIndex(), codePoint);
        if (pointerRange.first != pointerRange.second) {
            uint16_t twoBytePointer = pointerRange.first->second;
            uint8_t lead = twoBytePointer / 190 + 0x81;
            uint8_t trail = twoBytePointer % 190;
            uint8_t trailOffset = trail < 0x3F ? 0x40 : 0x41;
            encoded.append(lead);
            encoded.append(trail + trailOffset);
            continue;
        }

        // GBK has no four-byte form.
        if (encodingAsGBK) {
            unencodableHandler(codePoint, encoded);
            continue;
        }

        // Four-byte form: split the ranges pointer into its mixed-radix digits.
        uint32_t remainder = gb18030RangesPointer(codePoint);
        uint8_t byte1 = remainder / (10 * 126 * 10);
        remainder = remainder % (10 * 126 * 10);
        uint8_t byte2 = remainder / (10 * 126);
        remainder = remainder % (10 * 126);
        uint8_t byte3 = remainder / 10;
        uint8_t byte4 = remainder % 10;
        encoded.append(byte1 + 0x81);
        encoded.append(byte2 + 0x30);
        encoded.append(byte3 + 0x81);
        encoded.append(byte4 + 0x30);
    }
    return encoded;
}
// gb18030 encoder entry point: runs gbEncodeShared with IsGBK::No, moving
// unencodableHandler into the shared implementation.
static Vector<uint8_t> gb18030Encode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
return gbEncodeShared(string, WTFMove(unencodableHandler), IsGBK::No);
}
// https://encoding.spec.whatwg.org/#gbk-decoder
// GBK has no decoder of its own here: every argument is forwarded unchanged to
// gb18030Decode and its result is returned as-is.
String TextCodecCJK::gbkDecode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
return gb18030Decode(bytes, length, flush, stopOnError, sawError);
}
// GBK encoder entry point: runs gbEncodeShared with IsGBK::Yes, moving
// unencodableHandler into the shared implementation.
static Vector<uint8_t> gbkEncode(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler)
{
return gbEncodeShared(string, WTFMove(unencodableHandler), IsGBK::Yes);
}
// Number of bytes the entity handlers reserve for the decimal digits of a code
// point before appending them with appendDecimal.
// NOTE(review): 10 presumably corresponds to the digit count of the largest
// 32-bit signed value (2147483647); confirm against the UChar32 definition.
constexpr size_t maxUChar32Digits = 10;
// Appends the base-10 textual form of |c| to |result|. The digits are first
// written into a stack buffer sized for the largest UChar32 value and then
// copied into the vector.
static void appendDecimal(UChar32 c, Vector<uint8_t>& result)
{
    constexpr auto bufferCapacity = lengthOfIntegerAsString(std::numeric_limits<UChar32>::max());
    uint8_t digits[bufferCapacity];
    writeIntegerToBuffer(c, digits);

    const auto digitCount = lengthOfIntegerAsString(c);
    result.append(digits, digitCount);
}
// Unencodable-character handler that appends "%26%23", the decimal value of
// |c|, then "%3B" to |result|. Capacity for the fixed bytes plus
// maxUChar32Digits is reserved up front so the fixed bytes can be appended
// without per-byte capacity checks.
static void urlEncodedEntityUnencodableHandler(UChar32 c, Vector<uint8_t>& result)
{
    constexpr char prefix[] = { '%', '2', '6', '%', '2', '3' };
    constexpr char suffix[] = { '%', '3', 'B' };

    result.reserveCapacity(result.size() + sizeof(prefix) + sizeof(suffix) + maxUChar32Digits);

    for (char character : prefix)
        result.uncheckedAppend(character);
    appendDecimal(c, result);
    for (char character : suffix)
        result.uncheckedAppend(character);
}
// Unencodable-character handler that appends "&#", the decimal value of |c|,
// then ";" to |result|. Capacity for the three fixed bytes plus
// maxUChar32Digits is reserved up front so the fixed bytes can be appended
// without per-byte capacity checks.
static void entityUnencodableHandler(UChar32 c, Vector<uint8_t>& result)
{
    constexpr char opener[] = { '&', '#' };
    constexpr char terminator = ';';

    result.reserveCapacity(result.size() + sizeof(opener) + sizeof(terminator) + maxUChar32Digits);

    for (char character : opener)
        result.uncheckedAppend(character);
    appendDecimal(c, result);
    result.uncheckedAppend(terminator);
}
// Unencodable-character handler that ignores the code point and appends a
// single '?' to |result|.
static void questionMarkUnencodableHandler(UChar32, Vector<uint8_t>& result)
{
result.append('?');
}
// Maps an UnencodableHandling policy to the handler function implementing it.
// A value outside the enumerators trips ASSERT_NOT_REACHED and falls back to
// the entity handler.
Function<void(UChar32, Vector<uint8_t>&)> unencodableHandler(UnencodableHandling handling)
{
    using HandlerFunction = void (*)(UChar32, Vector<uint8_t>&);

    HandlerFunction selected = nullptr;
    switch (handling) {
    case UnencodableHandling::QuestionMarks:
        selected = questionMarkUnencodableHandler;
        break;
    case UnencodableHandling::Entities:
        selected = entityUnencodableHandler;
        break;
    case UnencodableHandling::URLEncodedEntities:
        selected = urlEncodedEntityUnencodableHandler;
        break;
    }

    if (!selected) {
        ASSERT_NOT_REACHED();
        selected = entityUnencodableHandler;
    }
    return selected;
}
// https://encoding.spec.whatwg.org/#big5-decoder
//
// Decodes |length| Big5 bytes starting at |bytes| by handing a per-byte parser
// to decodeCommon; |flush|, |stopOnError| and |sawError| are forwarded to it
// untouched.
//
// The parser keeps a pending lead byte in m_lead between invocations:
//   - With no pending lead, ASCII bytes are appended directly, bytes in
//     0x81..0xFE are stored as the lead, and anything else is an error.
//   - With a pending lead, a trail byte in 0x40..0x7E or 0xA1..0xFE forms a
//     pointer. Four pointers expand to a base letter plus a combining mark;
//     all others are looked up in the big5() index.
//   - Whenever a lead/trail pair fails to produce a code point (trail byte out
//     of range OR pointer missing from the index), an ASCII trail byte is
//     stored in m_prependedByte before reporting the error, so it is not
//     swallowed together with the bad lead. (Presumably decodeCommon
//     re-processes m_prependedByte before the next input byte; that code is
//     outside this block.)
String TextCodecCJK::big5Decode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    return decodeCommon(bytes, length, flush, stopOnError, sawError, [this] (uint8_t byte, StringBuilder& result) {
        if (uint8_t lead = std::exchange(m_lead, 0x00)) {
            if ((byte >= 0x40 && byte <= 0x7E) || (byte >= 0xA1 && byte <= 0xFE)) {
                uint8_t offset = byte < 0x7F ? 0x40 : 0x62;
                uint16_t pointer = (lead - 0x81) * 157 + (byte - offset);
                // These four pointers decode to two code points each.
                if (pointer == 1133) {
                    result.appendCharacter(0x00CA);
                    result.appendCharacter(0x0304);
                    return SawError::No;
                }
                if (pointer == 1135) {
                    result.appendCharacter(0x00CA);
                    result.appendCharacter(0x030C);
                    return SawError::No;
                }
                if (pointer == 1164) {
                    result.appendCharacter(0x00EA);
                    result.appendCharacter(0x0304);
                    return SawError::No;
                }
                if (pointer == 1166) {
                    result.appendCharacter(0x00EA);
                    result.appendCharacter(0x030C);
                    return SawError::No;
                }
                if (auto codePoint = findFirstInSortedPairs(big5(), pointer)) {
                    result.appendCharacter(*codePoint);
                    return SawError::No;
                }
                // Lookup failed: fall through to the shared error path so an
                // ASCII trail byte is restored, as the spec requires.
            }
            if (isASCII(byte))
                m_prependedByte = byte;
            return SawError::Yes;
        }
        if (isASCII(byte)) {
            result.append(static_cast<char>(byte));
            return SawError::No;
        }
        if (byte >= 0x81 && byte <= 0xFE) {
            m_lead = byte;
            return SawError::No;
        }
        return SawError::Yes;
    });
}
// Decodes |length| bytes starting at |charBytes| with the decoder that matches
// m_encoding. |flush|, |stopOnError| and |sawError| are passed straight through
// to that decoder. An m_encoding value outside the enumerators trips
// ASSERT_NOT_REACHED and yields a default-constructed String.
String TextCodecCJK::decode(const char* charBytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // The per-encoding decoders operate on unsigned bytes.
    const auto* bytes = reinterpret_cast<const uint8_t*>(charBytes);

    // Cases are listed in the order the Encoding enumerators are declared.
    switch (m_encoding) {
    case Encoding::EUC_JP:
        return eucJPDecode(bytes, length, flush, stopOnError, sawError);
    case Encoding::ISO2022JP:
        return iso2022JPDecode(bytes, length, flush, stopOnError, sawError);
    case Encoding::Shift_JIS:
        return shiftJISDecode(bytes, length, flush, stopOnError, sawError);
    case Encoding::EUC_KR:
        return eucKRDecode(bytes, length, flush, stopOnError, sawError);
    case Encoding::Big5:
        return big5Decode(bytes, length, flush, stopOnError, sawError);
    case Encoding::GBK:
        return gbkDecode(bytes, length, flush, stopOnError, sawError);
    case Encoding::GB18030:
        return gb18030Decode(bytes, length, flush, stopOnError, sawError);
    }

    ASSERT_NOT_REACHED();
    return { };
}
// Encodes |string| with the encoder that matches m_encoding. Code points the
// encoder cannot represent are passed to the handler selected by |handling|.
// An m_encoding value outside the enumerators trips ASSERT_NOT_REACHED and
// yields an empty vector.
Vector<uint8_t> TextCodecCJK::encode(StringView string, UnencodableHandling handling) const
{
    auto handler = unencodableHandler(handling);

    // Cases are listed in the order the Encoding enumerators are declared.
    switch (m_encoding) {
    case Encoding::EUC_JP:
        return eucJPEncode(string, WTFMove(handler));
    case Encoding::ISO2022JP:
        return iso2022JPEncode(string, WTFMove(handler));
    case Encoding::Shift_JIS:
        return shiftJISEncode(string, WTFMove(handler));
    case Encoding::EUC_KR:
        return eucKREncode(string, WTFMove(handler));
    case Encoding::Big5:
        return big5Encode(string, WTFMove(handler));
    case Encoding::GBK:
        return gbkEncode(string, WTFMove(handler));
    case Encoding::GB18030:
        return gb18030Encode(string, WTFMove(handler));
    }

    ASSERT_NOT_REACHED();
    return { };
}
} // namespace WebCore