| /* |
| * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved. |
| * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "StreamingTextDecoderICU.h" |
| |
| #include <unicode/unorm.h> |
| #include <wtf/Assertions.h> |
| |
| using std::min; |
| |
| namespace WebCore { |
| |
| StreamingTextDecoderICU::StreamingTextDecoderICU(const TextEncoding& encoding) |
| : m_encoding(encoding) |
| , m_littleEndian(encoding.flags() & LittleEndian) |
| , m_atStart(true) |
| , m_numBufferedBytes(0) |
| , m_converterICU(0) |
| { |
| } |
| |
| static const UChar BOM = 0xFEFF; |
| static const size_t ConversionBufferSize = 16384; |
| |
| static UConverter* cachedConverterICU; |
| static TextEncodingID cachedConverterEncoding = InvalidEncoding; |
| |
| StreamingTextDecoderICU::~StreamingTextDecoderICU() |
| { |
| releaseICUConverter(); |
| } |
| |
| void StreamingTextDecoderICU::releaseICUConverter() |
| { |
| if (m_converterICU) { |
| if (cachedConverterICU != 0) |
| ucnv_close(cachedConverterICU); |
| cachedConverterICU = m_converterICU; |
| cachedConverterEncoding = m_encoding.encodingID(); |
| m_converterICU = 0; |
| } |
| } |
| |
| bool StreamingTextDecoderICU::textEncodingSupported() |
| { |
| if (!m_converterICU) |
| createICUConverter(); |
| |
| return m_converterICU; |
| } |
| |
| DeprecatedString StreamingTextDecoderICU::convertUTF16(const unsigned char* s, int length) |
| { |
| ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1); |
| |
| const unsigned char* p = s; |
| size_t len = length; |
| |
| DeprecatedString result(""); |
| |
| result.reserve(length / 2); |
| |
| if (m_numBufferedBytes != 0 && len != 0) { |
| ASSERT(m_numBufferedBytes == 1); |
| UChar c; |
| if (m_littleEndian) |
| c = m_bufferedBytes[0] | (p[0] << 8); |
| else |
| c = (m_bufferedBytes[0] << 8) | p[0]; |
| |
| if (c) |
| result.append(reinterpret_cast<DeprecatedChar*>(&c), 1); |
| |
| m_numBufferedBytes = 0; |
| p += 1; |
| len -= 1; |
| } |
| |
| while (len > 1) { |
| UChar buffer[ConversionBufferSize]; |
| int runLength = min(len / 2, ConversionBufferSize); |
| int bufferLength = 0; |
| if (m_littleEndian) { |
| for (int i = 0; i < runLength; ++i) { |
| UChar c = p[0] | (p[1] << 8); |
| p += 2; |
| if (c != BOM) |
| buffer[bufferLength++] = c; |
| } |
| } else { |
| for (int i = 0; i < runLength; ++i) { |
| UChar c = (p[0] << 8) | p[1]; |
| p += 2; |
| if (c != BOM) |
| buffer[bufferLength++] = c; |
| } |
| } |
| result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength); |
| len -= runLength * 2; |
| } |
| |
| if (len) { |
| ASSERT(m_numBufferedBytes == 0); |
| m_numBufferedBytes = 1; |
| m_bufferedBytes[0] = p[0]; |
| } |
| |
| return result; |
| } |
| |
| bool StreamingTextDecoderICU::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str) |
| { |
| ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1); |
| |
| DeprecatedString result(""); |
| result.reserve(length); |
| |
| const unsigned char* p = s; |
| size_t len = length; |
| unsigned char ored = 0; |
| while (len) { |
| UChar buffer[ConversionBufferSize]; |
| int runLength = min(len, ConversionBufferSize); |
| int bufferLength = 0; |
| for (int i = 0; i < runLength; ++i) { |
| unsigned char c = *p++; |
| ored |= c; |
| buffer[bufferLength++] = c; |
| } |
| if (ored & 0x80) |
| return false; |
| result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength); |
| len -= runLength; |
| } |
| |
| str = result; |
| return true; |
| } |
| |
| void StreamingTextDecoderICU::createICUConverter() |
| { |
| TextEncoding encoding = m_encoding.effectiveEncoding(); |
| const char* encodingName = encoding.name(); |
| |
| bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID(); |
| cachedConverterEncoding = InvalidEncoding; |
| |
| if (cachedEncodingEqual && cachedConverterICU) { |
| m_converterICU = cachedConverterICU; |
| cachedConverterICU = 0; |
| } else { |
| UErrorCode err = U_ZERO_ERROR; |
| ASSERT(!m_converterICU); |
| m_converterICU = ucnv_open(encodingName, &err); |
| #if !LOG_DISABLED |
| if (err == U_AMBIGUOUS_ALIAS_WARNING) |
| LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName); |
| #endif |
| } |
| } |
| |
| // We strip BOM characters because they can show up both at the start of content |
| // and inside content, and we never want them to end up in the decoded text. |
| void StreamingTextDecoderICU::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount) |
| { |
| ASSERT(byteCount % sizeof(UChar) == 0); |
| int start = 0; |
| int characterCount = byteCount / sizeof(UChar); |
| for (int i = 0; i != characterCount; ++i) { |
| if (BOM == characters[i]) { |
| if (start != i) |
| s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), i - start); |
| start = i + 1; |
| } |
| } |
| if (start != characterCount) |
| s.append(reinterpret_cast<const DeprecatedChar*>(&characters[start]), characterCount - start); |
| } |
| |
| DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush) |
| { |
| // Get a converter for the passed-in encoding. |
| if (!m_converterICU) { |
| createICUConverter(); |
| if (!m_converterICU) |
| return DeprecatedString(); |
| } |
| |
| DeprecatedString result(""); |
| result.reserve(len); |
| |
| UChar buffer[ConversionBufferSize]; |
| const char* source = reinterpret_cast<const char*>(chs); |
| const char* sourceLimit = source + len; |
| int32_t* offsets = NULL; |
| UErrorCode err; |
| |
| do { |
| UChar* target = buffer; |
| const UChar* targetLimit = target + ConversionBufferSize; |
| err = U_ZERO_ERROR; |
| ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err); |
| int count = target - buffer; |
| appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar)); |
| } while (err == U_BUFFER_OVERFLOW_ERROR); |
| |
| if (U_FAILURE(err)) { |
| // flush the converter so it can be reused, and not be bothered by this error. |
| do { |
| UChar *target = buffer; |
| const UChar *targetLimit = target + ConversionBufferSize; |
| err = U_ZERO_ERROR; |
| ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err); |
| } while (source < sourceLimit); |
| LOG_ERROR("ICU conversion error"); |
| return DeprecatedString(); |
| } |
| |
| return result; |
| } |
| |
| DeprecatedString StreamingTextDecoderICU::convert(const unsigned char* chs, int len, bool flush) |
| { |
| switch (m_encoding.encodingID()) { |
| case UTF16Encoding: |
| return convertUTF16(chs, len); |
| |
| case ASCIIEncoding: |
| case Latin1Encoding: |
| case WinLatin1Encoding: { |
| DeprecatedString result; |
| if (convertIfASCII(chs, len, result)) |
| return result; |
| break; |
| } |
| |
| case UTF8Encoding: |
| // If a previous run used ICU, we might have a partly converted character. |
| // If so, don't use the optimized ASCII code path. |
| if (!m_converterICU) { |
| DeprecatedString result; |
| if (convertIfASCII(chs, len, result)) |
| return result; |
| } |
| break; |
| |
| default: |
| break; |
| } |
| |
| //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000 |
| #ifdef PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE |
| DeprecatedString result; |
| int chunkSize; |
| for (int i = 0; i != len; i += chunkSize) { |
| chunkSize = len - i; |
| if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) { |
| chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE; |
| } |
| result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len)); |
| } |
| return result; |
| #else |
| return convertUsingICU(chs, len, flush); |
| #endif |
| } |
| |
| DeprecatedString StreamingTextDecoderICU::toUnicode(const char* chs, int len, bool flush) |
| { |
| ASSERT_ARG(len, len >= 0); |
| |
| if (!chs) |
| return DeprecatedString(); |
| |
| if (len <= 0 && !flush) |
| return ""; |
| |
| // Handle normal case. |
| if (!m_atStart) |
| return convert(chs, len, flush); |
| |
| // Check to see if we found a BOM. |
| int numBufferedBytes = m_numBufferedBytes; |
| int buf1Len = numBufferedBytes; |
| int buf2Len = len; |
| const unsigned char* buf1 = m_bufferedBytes; |
| const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs); |
| unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; |
| unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; |
| unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; |
| int BOMLength = 0; |
| if (c1 == 0xFF && c2 == 0xFE) { |
| if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) { |
| releaseICUConverter(); |
| m_encoding = TextEncoding(UTF16Encoding, LittleEndian); |
| m_littleEndian = true; |
| } |
| BOMLength = 2; |
| } else if (c1 == 0xFE && c2 == 0xFF) { |
| if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) { |
| releaseICUConverter(); |
| m_encoding = TextEncoding(UTF16Encoding, BigEndian); |
| m_littleEndian = false; |
| } |
| BOMLength = 2; |
| } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { |
| if (m_encoding != TextEncoding(UTF8Encoding)) { |
| releaseICUConverter(); |
| m_encoding = TextEncoding(UTF8Encoding); |
| } |
| BOMLength = 3; |
| } |
| |
| // Handle case where we found a BOM. |
| if (BOMLength != 0) { |
| ASSERT(numBufferedBytes + len >= BOMLength); |
| int skip = BOMLength - numBufferedBytes; |
| m_numBufferedBytes = 0; |
| m_atStart = false; |
| return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush); |
| } |
| |
| // Handle case where we know there is no BOM coming. |
| const int bufferSize = sizeof(m_bufferedBytes); |
| if (numBufferedBytes + len > bufferSize || flush) { |
| m_atStart = false; |
| if (numBufferedBytes == 0) { |
| return convert(chs, len, flush); |
| } |
| unsigned char bufferedBytes[sizeof(m_bufferedBytes)]; |
| memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes); |
| m_numBufferedBytes = 0; |
| return convert(bufferedBytes, numBufferedBytes, false) + convert(chs, len, flush); |
| } |
| |
| // Continue to look for the BOM. |
| memcpy(&m_bufferedBytes[numBufferedBytes], chs, len); |
| m_numBufferedBytes += len; |
| return ""; |
| } |
| |
| DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities) |
| { |
| TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID(); |
| |
| if (encoding == WinLatin1Encoding && qcs.isAllLatin1()) |
| return qcs.latin1(); |
| |
| if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) |
| && qcs.isAllASCII()) |
| return qcs.ascii(); |
| |
| // FIXME: We should see if there is "force ASCII range" mode in ICU; |
| // until then, we change the backslash into a yen sign. |
| // Encoding will change the yen sign back into a backslash. |
| DeprecatedString copy = qcs; |
| copy.replace('\\', m_encoding.backslashAsCurrencySymbol()); |
| |
| if (!m_converterICU) |
| createICUConverter(); |
| if (!m_converterICU) |
| return DeprecatedCString(); |
| |
| // FIXME: when DeprecatedString buffer is latin1, it would be nice to |
| // convert from that w/o having to allocate a unicode buffer |
| |
| char buffer[ConversionBufferSize]; |
| const UChar* source = reinterpret_cast<const UChar*>(copy.unicode()); |
| const UChar* sourceLimit = source + copy.length(); |
| |
| UErrorCode err = U_ZERO_ERROR; |
| DeprecatedString normalizedString; |
| if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) { |
| normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed |
| |
| int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err); |
| if (err == U_BUFFER_OVERFLOW_ERROR) { |
| err = U_ZERO_ERROR; |
| normalizedString.truncate(normalizedLength); |
| normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err); |
| } |
| |
| source = reinterpret_cast<const UChar*>(normalizedString.unicode()); |
| sourceLimit = source + normalizedLength; |
| } |
| |
| DeprecatedCString result(1); // for trailing zero |
| |
| if (allowEntities) |
| ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); |
| else { |
| ucnv_setSubstChars(m_converterICU, "?", 1, &err); |
| ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); |
| } |
| |
| ASSERT(U_SUCCESS(err)); |
| if (U_FAILURE(err)) |
| return DeprecatedCString(); |
| |
| do { |
| char* target = buffer; |
| char* targetLimit = target + ConversionBufferSize; |
| err = U_ZERO_ERROR; |
| ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err); |
| int count = target - buffer; |
| buffer[count] = 0; |
| result.append(buffer); |
| } while (err == U_BUFFER_OVERFLOW_ERROR); |
| |
| return result; |
| } |
| |
| |
| } // namespace WebCore |