// WebCore/platform/StreamingTextDecoderICU.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include "StreamingTextDecoderICU.h"

 #include <unicode/unorm.h>
 #include <wtf/Assertions.h>

 using std::min;

 namespace WebCore {

 // Constructs a decoder for the given encoding. No ICU converter is opened
 // here (m_converterICU starts out null); the methods that need one call
 // createICUConverter() on demand.
 StreamingTextDecoderICU::StreamingTextDecoderICU(const TextEncoding& encoding)
     : m_encoding(encoding)
     , m_littleEndian(encoding.flags() & LittleEndian) // byte order consulted by convertUTF16()
     , m_atStart(true) // toUnicode() still has to look for a byte order mark
     , m_numBufferedBytes(0)
     , m_converterICU(0)
 {
 }

 // U+FEFF, the byte order mark; the decoding paths in this file strip it from their output.
 static const UChar BOM = 0xFEFF;
 // Element count of the on-stack scratch buffers used by the conversion loops.
 static const size_t ConversionBufferSize = 16384;

 // Single-entry cache: the converter most recently handed back by
 // releaseICUConverter(), kept open so createICUConverter() can reuse it.
 static UConverter* cachedConverterICU;
 // Encoding ID recorded for the cached converter; InvalidEncoding when nothing reusable is cached.
 static TextEncodingID cachedConverterEncoding = InvalidEncoding;

 // The converter is not closed on destruction; releaseICUConverter() moves it
 // into the file-level cache instead.
 StreamingTextDecoderICU::~StreamingTextDecoderICU()
 {
     releaseICUConverter();
 }

 void StreamingTextDecoderICU::releaseICUConverter()
 {
     if (m_converterICU) {
         if (cachedConverterICU != 0)
             ucnv_close(cachedConverterICU);
         cachedConverterICU = m_converterICU;
         cachedConverterEncoding = m_encoding.encodingID();
         m_converterICU = 0;
     }
 }

 // Reports whether a converter is available for this decoder's encoding,
 // creating (or fetching from the cache) one first if none is held yet.
 bool StreamingTextDecoderICU::textEncodingSupported()
 {
     if (m_converterICU)
         return true;

     createICUConverter();
     return m_converterICU != 0;
 }

 // Decodes one chunk of UTF-16 bytes without going through ICU.
 //
 // s, length: the bytes of this chunk.
 // Byte order comes from m_littleEndian. A trailing odd byte is kept in
 // m_bufferedBytes[0] and combined with the first byte of the next chunk.
 // BOM code units (U+FEFF) are dropped wherever they occur.
 // Returns the decoded code units.
 DeprecatedString StreamingTextDecoderICU::convertUTF16(const unsigned char* s, int length)
 {
     ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);

     const unsigned char* p = s;
     size_t len = length;

     DeprecatedString result("");

     result.reserve(length / 2);

     // Finish the code unit whose first byte arrived at the end of the previous chunk.
     if (m_numBufferedBytes != 0 && len != 0) {
         ASSERT(m_numBufferedBytes == 1);
         UChar c;
         if (m_littleEndian)
             c = m_bufferedBytes[0] | (p[0] << 8);
         else
             c = (m_bufferedBytes[0] << 8) | p[0];

         // Apply the same filter as the bulk loop below: a BOM that happens to
         // straddle two chunks must be stripped too, and a NUL code unit must
         // not be treated differently just because of where the chunk split fell.
         if (c != BOM)
             result.append(reinterpret_cast<DeprecatedChar*>(&c), 1);

         m_numBufferedBytes = 0;
         p += 1;
         len -= 1;
     }

     // Convert the remaining whole code units, one scratch buffer at a time.
     while (len > 1) {
         UChar buffer[ConversionBufferSize];
         int runLength = min(len / 2, ConversionBufferSize);
         int bufferLength = 0;
         if (m_littleEndian) {
             for (int i = 0; i < runLength; ++i) {
                 UChar c = p[0] | (p[1] << 8);
                 p += 2;
                 if (c != BOM)
                     buffer[bufferLength++] = c;
             }
         } else {
             for (int i = 0; i < runLength; ++i) {
                 UChar c = (p[0] << 8) | p[1];
                 p += 2;
                 if (c != BOM)
                     buffer[bufferLength++] = c;
             }
         }
         result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
         len -= runLength * 2;
     }

     // An odd trailing byte is the first half of a code unit; keep it for the next call.
     if (len) {
         ASSERT(m_numBufferedBytes == 0);
         m_numBufferedBytes = 1;
         m_bufferedBytes[0] = p[0];
     }

     return result;
 }

 bool StreamingTextDecoderICU::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
 {
     ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);

     DeprecatedString result("");
     result.reserve(length);

     const unsigned char* p = s;
     size_t len = length;
     unsigned char ored = 0;
     while (len) {
         UChar buffer[ConversionBufferSize];
         int runLength = min(len, ConversionBufferSize);
         int bufferLength = 0;
         for (int i = 0; i < runLength; ++i) {
             unsigned char c = *p++;
             ored |= c;
             buffer[bufferLength++] = c;
         }
         if (ored & 0x80)
             return false;
         result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
         len -= runLength;
     }

     str = result;
     return true;
 }

 void StreamingTextDecoderICU::createICUConverter()
 {
     TextEncoding encoding = m_encoding.effectiveEncoding();
     const char* encodingName = encoding.name();

     bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
     cachedConverterEncoding = InvalidEncoding;

     if (cachedEncodingEqual && cachedConverterICU) {
         m_converterICU = cachedConverterICU;
         cachedConverterICU = 0;
     } else {
         UErrorCode err = U_ZERO_ERROR;
         ASSERT(!m_converterICU);
         m_converterICU = ucnv_open(encodingName, &err);
 #if !LOG_DISABLED
         if (err == U_AMBIGUOUS_ALIAS_WARNING)
             LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
 #endif
     }
 }

 // Appends the given code units to s, leaving out every U+FEFF. BOMs may occur
 // at the very start of content as well as in the middle of it, and none of
 // them should survive into the decoded text.
 //
 // characters: the code units to copy from.
 // byteCount: size of that range in bytes; must be a multiple of sizeof(UChar).
 void StreamingTextDecoderICU::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
 {
     ASSERT(byteCount % sizeof(UChar) == 0);
     const int characterCount = byteCount / sizeof(UChar);

     // [runStart, i) is the pending run of non-BOM code units not yet appended.
     int runStart = 0;
     for (int i = 0; i != characterCount; ++i) {
         if (characters[i] != BOM)
             continue;
         if (runStart != i)
             s.append(reinterpret_cast<const DeprecatedChar*>(characters + runStart), i - runStart);
         runStart = i + 1;
     }

     if (runStart != characterCount)
         s.append(reinterpret_cast<const DeprecatedChar*>(characters + runStart), characterCount - runStart);
 }

 // Decodes a chunk of bytes with the ICU converter, creating it on first use.
 //
 // chs, len: the bytes of this chunk.
 // flush: passed through to ucnv_toUnicode() as its flush argument.
 // Returns the decoded text with BOMs removed, or a default-constructed string
 // when no converter is available or ICU reports a failure.
 DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
 {
     if (!m_converterICU)
         createICUConverter();
     if (!m_converterICU)
         return DeprecatedString();

     DeprecatedString decoded("");
     decoded.reserve(len);

     UChar scratch[ConversionBufferSize];
     const UChar* const scratchEnd = scratch + ConversionBufferSize;
     const char* in = reinterpret_cast<const char*>(chs);
     const char* const inEnd = in + len;
     int32_t* const noOffsets = NULL;
     UErrorCode status;

     // Drain the input through the scratch buffer; an overflow status only
     // means the buffer filled up and there is more to convert.
     for (;;) {
         UChar* out = scratch;
         status = U_ZERO_ERROR;
         ucnv_toUnicode(m_converterICU, &out, scratchEnd, &in, inEnd, noOffsets, flush, &status);
         const int produced = out - scratch;
         appendOmittingBOM(decoded, reinterpret_cast<const UChar*>(scratch), produced * sizeof(UChar));
         if (status != U_BUFFER_OVERFLOW_ERROR)
             break;
     }

     if (!U_FAILURE(status))
         return decoded;

     // Push the rest of the input through with flush set so the converter can
     // be reused later without being affected by this error.
     do {
         UChar* out = scratch;
         status = U_ZERO_ERROR;
         ucnv_toUnicode(m_converterICU, &out, scratchEnd, &in, inEnd, noOffsets, true, &status);
     } while (in < inEnd);
     LOG_ERROR("ICU conversion error");
     return DeprecatedString();
 }

 // Picks the cheapest decoding path for one chunk: the hand-written UTF-16
 // decoder, the 7-bit fast path for Latin-1-like encodings and UTF-8, or ICU
 // for everything else and whenever the fast path declines.
 DeprecatedString StreamingTextDecoderICU::convert(const unsigned char* chs, int len, bool flush)
 {
     const TextEncodingID id = m_encoding.encodingID();

     if (id == UTF16Encoding)
         return convertUTF16(chs, len);

     const bool latin1Family = id == ASCIIEncoding || id == Latin1Encoding || id == WinLatin1Encoding;
     // If a previous run used ICU, the converter might be holding a partly
     // converted character, so UTF-8 only takes the ASCII shortcut while no
     // converter exists yet.
     const bool utf8WithoutConverter = id == UTF8Encoding && !m_converterICU;

     if (latin1Family || utf8WithoutConverter) {
         DeprecatedString ascii;
         if (convertIfASCII(chs, len, ascii))
             return ascii;
     }

     //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
 #ifdef PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
     // Test aid: feed ICU in small pieces to exercise partial-character handling.
     DeprecatedString result;
     int chunkSize;
     for (int i = 0; i != len; i += chunkSize) {
         chunkSize = len - i;
         if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
             chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
         }
         result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
     }
     return result;
 #else
     return convertUsingICU(chs, len, flush);
 #endif
 }

 // Streaming entry point: decodes the next chunk of input.
 //
 // chs, len: the bytes of this chunk; a null chs yields a default-constructed string.
 // flush: true when no more input will follow.
 //
 // While m_atStart is set, bytes are held back in m_bufferedBytes until it is
 // clear whether the stream begins with a UTF-16 or UTF-8 byte order mark. A
 // BOM overrides m_encoding and is not part of the output. Afterwards every
 // chunk goes straight to convert().
 // Returns the text decoded so far; may be empty while still looking for a BOM.
 DeprecatedString StreamingTextDecoderICU::toUnicode(const char* chs, int len, bool flush)
 {
     ASSERT_ARG(len, len >= 0);

     if (!chs)
         return DeprecatedString();

     if (len <= 0 && !flush)
         return "";

     // Handle normal case.
     if (!m_atStart)
         return convert(chs, len, flush);

     // Check to see if we found a BOM. The first three bytes of the stream are
     // read from the held-back bytes first and then from this chunk; positions
     // that do not exist yet read as 0.
     int numBufferedBytes = m_numBufferedBytes;
     int buf1Len = numBufferedBytes;
     int buf2Len = len;
     const unsigned char* buf1 = m_bufferedBytes;
     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
     unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
     int BOMLength = 0;
     if (c1 == 0xFF && c2 == 0xFE) {
         if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) {
             releaseICUConverter();
             m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
             m_littleEndian = true;
         }
         BOMLength = 2;
     } else if (c1 == 0xFE && c2 == 0xFF) {
         if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) {
             releaseICUConverter();
             m_encoding = TextEncoding(UTF16Encoding, BigEndian);
             m_littleEndian = false;
         }
         BOMLength = 2;
     } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
         if (m_encoding != TextEncoding(UTF8Encoding)) {
             releaseICUConverter();
             m_encoding = TextEncoding(UTF8Encoding);
         }
         BOMLength = 3;
     }

     // Handle case where we found a BOM: skip the part of it that lies in this chunk.
     if (BOMLength != 0) {
         ASSERT(numBufferedBytes + len >= BOMLength);
         int skip = BOMLength - numBufferedBytes;
         m_numBufferedBytes = 0;
         m_atStart = false;
         return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
     }

     // Handle case where we know there is no BOM coming.
     const int bufferSize = sizeof(m_bufferedBytes);
     if (numBufferedBytes + len > bufferSize || flush) {
         m_atStart = false;
         if (numBufferedBytes == 0) {
             return convert(chs, len, flush);
         }
         // Copy the held-back bytes out first: convert() may write to m_bufferedBytes.
         unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
         memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
         m_numBufferedBytes = 0;
         // Both calls advance the same decoder state, so they have to run in
         // stream order. The operands of '+' have no guaranteed evaluation
         // order, hence the named temporaries.
         DeprecatedString head = convert(bufferedBytes, numBufferedBytes, false);
         DeprecatedString tail = convert(chs, len, flush);
         return head + tail;
     }

     // Continue to look for the BOM.
     memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
     m_numBufferedBytes += len;
     return "";
 }

 // Encodes a string into this object's encoding.
 //
 // qcs: the text to encode.
 // allowEntities: when true, ICU's escape callback (UCNV_ESCAPE_XML_DEC) handles
 //     characters the target encoding cannot represent; when false they are
 //     substituted with '?'.
 // Returns the encoded bytes, or a default-constructed DeprecatedCString when no
 // converter could be created or the callbacks could not be installed.
 DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
 {
     TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

     // Shortcuts that need no converter at all.
     if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
         return qcs.latin1();

     if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
         && qcs.isAllASCII())
         return qcs.ascii();

     // FIXME: We should see if there is "force ASCII range" mode in ICU;
     // until then, we change the backslash into a yen sign.
     // Encoding will change the yen sign back into a backslash.
     DeprecatedString copy = qcs;
     copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

     if (!m_converterICU)
         createICUConverter();
     if (!m_converterICU)
         return DeprecatedCString();

     // FIXME: when DeprecatedString buffer is latin1, it would be nice to
     // convert from that w/o having to allocate a unicode buffer

     char buffer[ConversionBufferSize];
     const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
     const UChar* sourceLimit = source + copy.length();

     // Normalize to NFC first unless the quick check says the text already is.
     UErrorCode err = U_ZERO_ERROR;
     DeprecatedString normalizedString;
     if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
         normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed

         int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
         if (err == U_BUFFER_OVERFLOW_ERROR) {
             // Retry with the length ICU reported as necessary.
             err = U_ZERO_ERROR;
             normalizedString.truncate(normalizedLength);
             normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
         }

         source = reinterpret_cast<const UChar*>(normalizedString.unicode());
         sourceLimit = source + normalizedLength;
     }

     DeprecatedCString result(1); // for trailing zero

     if (allowEntities)
         ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
     else {
         ucnv_setSubstChars(m_converterICU, "?", 1, &err);
         ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
     }

     ASSERT(U_SUCCESS(err));
     if (U_FAILURE(err))
         return DeprecatedCString();

     do {
         char* target = buffer;
         // Keep the last byte of the buffer free for the terminating zero
         // written below: when ICU fills the output completely, target ends up
         // equal to targetLimit, and the zero must still land inside buffer.
         char* targetLimit = target + ConversionBufferSize - 1;
         err = U_ZERO_ERROR;
         ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true,  &err);
         int count = target - buffer;
         buffer[count] = 0;
         // NOTE(review): buffer is appended as a zero-terminated string, so
         // encoded output that itself contains zero bytes would presumably be
         // cut short here - confirm against DeprecatedCString::append.
         result.append(buffer);
     } while (err == U_BUFFER_OVERFLOW_ERROR);

     return result;
 }


 } // namespace WebCore
	/*
	* Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
	* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "config.h"
	#include "StreamingTextDecoderICU.h"

	#include <unicode/unorm.h>
	#include <wtf/Assertions.h>

	using std::min;

	namespace WebCore {

	// Constructs a decoder for the given encoding. No ICU converter is opened
	// here (m_converterICU starts out null); the methods that need one call
	// createICUConverter() on demand.
	StreamingTextDecoderICU::StreamingTextDecoderICU(const TextEncoding& encoding)
	: m_encoding(encoding)
	, m_littleEndian(encoding.flags() & LittleEndian)
	, m_atStart(true)
	, m_numBufferedBytes(0)
	, m_converterICU(0)
	{
	}

	// U+FEFF, the byte order mark; the decoding paths in this file strip it from their output.
	static const UChar BOM = 0xFEFF;
	// Element count of the on-stack scratch buffers used by the conversion loops.
	static const size_t ConversionBufferSize = 16384;

	// Single-entry cache: the converter most recently handed back by
	// releaseICUConverter(), kept open so createICUConverter() can reuse it.
	static UConverter* cachedConverterICU;
	// Encoding ID recorded for the cached converter; InvalidEncoding when nothing reusable is cached.
	static TextEncodingID cachedConverterEncoding = InvalidEncoding;

	// The converter is not closed on destruction; releaseICUConverter() moves it
	// into the file-level cache instead.
	StreamingTextDecoderICU::~StreamingTextDecoderICU()
	{
	releaseICUConverter();
	}

	void StreamingTextDecoderICU::releaseICUConverter()
	{
	if (m_converterICU) {
	if (cachedConverterICU != 0)
	ucnv_close(cachedConverterICU);
	cachedConverterICU = m_converterICU;
	cachedConverterEncoding = m_encoding.encodingID();
	m_converterICU = 0;
	}
	}

	// Reports whether a converter is available for this decoder's encoding,
	// creating (or fetching from the cache) one first if none is held yet.
	bool StreamingTextDecoderICU::textEncodingSupported()
	{
	    if (m_converterICU)
	        return true;

	    createICUConverter();
	    return m_converterICU != 0;
	}

	// Decodes one chunk of UTF-16 bytes without going through ICU.
	//
	// s, length: the bytes of this chunk.
	// Byte order comes from m_littleEndian. A trailing odd byte is kept in
	// m_bufferedBytes[0] and combined with the first byte of the next chunk.
	// BOM code units (U+FEFF) are dropped wherever they occur.
	// Returns the decoded code units.
	DeprecatedString StreamingTextDecoderICU::convertUTF16(const unsigned char* s, int length)
	{
	    ASSERT(m_numBufferedBytes == 0 || m_numBufferedBytes == 1);

	    const unsigned char* p = s;
	    size_t len = length;

	    DeprecatedString result("");

	    result.reserve(length / 2);

	    // Finish the code unit whose first byte arrived at the end of the previous chunk.
	    if (m_numBufferedBytes != 0 && len != 0) {
	        ASSERT(m_numBufferedBytes == 1);
	        UChar c;
	        if (m_littleEndian)
	            c = m_bufferedBytes[0] | (p[0] << 8);
	        else
	            c = (m_bufferedBytes[0] << 8) | p[0];

	        // Apply the same filter as the bulk loop below: a BOM that happens to
	        // straddle two chunks must be stripped too, and a NUL code unit must
	        // not be treated differently just because of where the chunk split fell.
	        if (c != BOM)
	            result.append(reinterpret_cast<DeprecatedChar*>(&c), 1);

	        m_numBufferedBytes = 0;
	        p += 1;
	        len -= 1;
	    }

	    // Convert the remaining whole code units, one scratch buffer at a time.
	    while (len > 1) {
	        UChar buffer[ConversionBufferSize];
	        int runLength = min(len / 2, ConversionBufferSize);
	        int bufferLength = 0;
	        if (m_littleEndian) {
	            for (int i = 0; i < runLength; ++i) {
	                UChar c = p[0] | (p[1] << 8);
	                p += 2;
	                if (c != BOM)
	                    buffer[bufferLength++] = c;
	            }
	        } else {
	            for (int i = 0; i < runLength; ++i) {
	                UChar c = (p[0] << 8) | p[1];
	                p += 2;
	                if (c != BOM)
	                    buffer[bufferLength++] = c;
	            }
	        }
	        result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
	        len -= runLength * 2;
	    }

	    // An odd trailing byte is the first half of a code unit; keep it for the next call.
	    if (len) {
	        ASSERT(m_numBufferedBytes == 0);
	        m_numBufferedBytes = 1;
	        m_bufferedBytes[0] = p[0];
	    }

	    return result;
	}

	bool StreamingTextDecoderICU::convertIfASCII(const unsigned char* s, int length, DeprecatedString& str)
	{
	ASSERT(m_numBufferedBytes == 0 \|\| m_numBufferedBytes == 1);

	DeprecatedString result("");
	result.reserve(length);

	const unsigned char* p = s;
	size_t len = length;
	unsigned char ored = 0;
	while (len) {
	UChar buffer[ConversionBufferSize];
	int runLength = min(len, ConversionBufferSize);
	int bufferLength = 0;
	for (int i = 0; i < runLength; ++i) {
	unsigned char c = *p++;
	ored \|= c;
	buffer[bufferLength++] = c;
	}
	if (ored & 0x80)
	return false;
	result.append(reinterpret_cast<DeprecatedChar*>(buffer), bufferLength);
	len -= runLength;
	}

	str = result;
	return true;
	}

	void StreamingTextDecoderICU::createICUConverter()
	{
	TextEncoding encoding = m_encoding.effectiveEncoding();
	const char* encodingName = encoding.name();

	bool cachedEncodingEqual = cachedConverterEncoding == encoding.encodingID();
	cachedConverterEncoding = InvalidEncoding;

	if (cachedEncodingEqual && cachedConverterICU) {
	m_converterICU = cachedConverterICU;
	cachedConverterICU = 0;
	} else {
	UErrorCode err = U_ZERO_ERROR;
	ASSERT(!m_converterICU);
	m_converterICU = ucnv_open(encodingName, &err);
	#if !LOG_DISABLED
	if (err == U_AMBIGUOUS_ALIAS_WARNING)
	LOG_ERROR("ICU ambiguous alias warning for encoding: %s", encodingName);
	#endif
	}
	}

	// Appends the given code units to s, leaving out every U+FEFF. BOMs may occur
	// at the very start of content as well as in the middle of it, and none of
	// them should survive into the decoded text.
	//
	// characters: the code units to copy from.
	// byteCount: size of that range in bytes; must be a multiple of sizeof(UChar).
	void StreamingTextDecoderICU::appendOmittingBOM(DeprecatedString& s, const UChar* characters, int byteCount)
	{
	    ASSERT(byteCount % sizeof(UChar) == 0);
	    const int characterCount = byteCount / sizeof(UChar);

	    // [runStart, i) is the pending run of non-BOM code units not yet appended.
	    int runStart = 0;
	    for (int i = 0; i != characterCount; ++i) {
	        if (characters[i] != BOM)
	            continue;
	        if (runStart != i)
	            s.append(reinterpret_cast<const DeprecatedChar*>(characters + runStart), i - runStart);
	        runStart = i + 1;
	    }

	    if (runStart != characterCount)
	        s.append(reinterpret_cast<const DeprecatedChar*>(characters + runStart), characterCount - runStart);
	}

	// Decodes a chunk of bytes with the ICU converter, creating it on first use.
	//
	// chs, len: the bytes of this chunk.
	// flush: passed through to ucnv_toUnicode() as its flush argument.
	// Returns the decoded text with BOMs removed, or a default-constructed string
	// when no converter is available or ICU reports a failure.
	DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
	{
	    if (!m_converterICU)
	        createICUConverter();
	    if (!m_converterICU)
	        return DeprecatedString();

	    DeprecatedString decoded("");
	    decoded.reserve(len);

	    UChar scratch[ConversionBufferSize];
	    const UChar* const scratchEnd = scratch + ConversionBufferSize;
	    const char* in = reinterpret_cast<const char*>(chs);
	    const char* const inEnd = in + len;
	    int32_t* const noOffsets = NULL;
	    UErrorCode status;

	    // Drain the input through the scratch buffer; an overflow status only
	    // means the buffer filled up and there is more to convert.
	    for (;;) {
	        UChar* out = scratch;
	        status = U_ZERO_ERROR;
	        ucnv_toUnicode(m_converterICU, &out, scratchEnd, &in, inEnd, noOffsets, flush, &status);
	        const int produced = out - scratch;
	        // The length argument is a byte count: code units times sizeof(UChar).
	        appendOmittingBOM(decoded, reinterpret_cast<const UChar*>(scratch), produced * sizeof(UChar));
	        if (status != U_BUFFER_OVERFLOW_ERROR)
	            break;
	    }

	    if (!U_FAILURE(status))
	        return decoded;

	    // Push the rest of the input through with flush set so the converter can
	    // be reused later without being affected by this error.
	    do {
	        UChar* out = scratch;
	        status = U_ZERO_ERROR;
	        ucnv_toUnicode(m_converterICU, &out, scratchEnd, &in, inEnd, noOffsets, true, &status);
	    } while (in < inEnd);
	    LOG_ERROR("ICU conversion error");
	    return DeprecatedString();
	}

	// Picks the cheapest decoding path for one chunk: the hand-written UTF-16
	// decoder, the 7-bit fast path for Latin-1-like encodings and UTF-8, or ICU
	// for everything else and whenever the fast path declines.
	DeprecatedString StreamingTextDecoderICU::convert(const unsigned char* chs, int len, bool flush)
	{
	    const TextEncodingID id = m_encoding.encodingID();

	    if (id == UTF16Encoding)
	        return convertUTF16(chs, len);

	    const bool latin1Family = id == ASCIIEncoding || id == Latin1Encoding || id == WinLatin1Encoding;
	    // If a previous run used ICU, the converter might be holding a partly
	    // converted character, so UTF-8 only takes the ASCII shortcut while no
	    // converter exists yet.
	    const bool utf8WithoutConverter = id == UTF8Encoding && !m_converterICU;

	    if (latin1Family || utf8WithoutConverter) {
	        DeprecatedString ascii;
	        if (convertIfASCII(chs, len, ascii))
	            return ascii;
	    }

	    //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000
	#ifdef PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
	    // Test aid: feed ICU in small pieces to exercise partial-character handling.
	    DeprecatedString result;
	    int chunkSize;
	    for (int i = 0; i != len; i += chunkSize) {
	        chunkSize = len - i;
	        if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
	            chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
	        }
	        result += convertUsingICU(chs + i, chunkSize, flush && (i + chunkSize == len));
	    }
	    return result;
	#else
	    return convertUsingICU(chs, len, flush);
	#endif
	}

	// Streaming entry point: decodes the next chunk of input.
	//
	// chs, len: the bytes of this chunk; a null chs yields a default-constructed string.
	// flush: true when no more input will follow.
	//
	// While m_atStart is set, bytes are held back in m_bufferedBytes until it is
	// clear whether the stream begins with a UTF-16 or UTF-8 byte order mark. A
	// BOM overrides m_encoding and is not part of the output. Afterwards every
	// chunk goes straight to convert().
	// Returns the text decoded so far; may be empty while still looking for a BOM.
	DeprecatedString StreamingTextDecoderICU::toUnicode(const char* chs, int len, bool flush)
	{
	    ASSERT_ARG(len, len >= 0);

	    if (!chs)
	        return DeprecatedString();

	    if (len <= 0 && !flush)
	        return "";

	    // Handle normal case.
	    if (!m_atStart)
	        return convert(chs, len, flush);

	    // Check to see if we found a BOM. The first three bytes of the stream are
	    // read from the held-back bytes first and then from this chunk; positions
	    // that do not exist yet read as 0.
	    int numBufferedBytes = m_numBufferedBytes;
	    int buf1Len = numBufferedBytes;
	    int buf2Len = len;
	    const unsigned char* buf1 = m_bufferedBytes;
	    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
	    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
	    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
	    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
	    int BOMLength = 0;
	    if (c1 == 0xFF && c2 == 0xFE) {
	        if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) {
	            releaseICUConverter();
	            m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
	            m_littleEndian = true;
	        }
	        BOMLength = 2;
	    } else if (c1 == 0xFE && c2 == 0xFF) {
	        if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) {
	            releaseICUConverter();
	            m_encoding = TextEncoding(UTF16Encoding, BigEndian);
	            m_littleEndian = false;
	        }
	        BOMLength = 2;
	    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
	        if (m_encoding != TextEncoding(UTF8Encoding)) {
	            releaseICUConverter();
	            m_encoding = TextEncoding(UTF8Encoding);
	        }
	        BOMLength = 3;
	    }

	    // Handle case where we found a BOM: skip the part of it that lies in this chunk.
	    if (BOMLength != 0) {
	        ASSERT(numBufferedBytes + len >= BOMLength);
	        int skip = BOMLength - numBufferedBytes;
	        m_numBufferedBytes = 0;
	        m_atStart = false;
	        return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
	    }

	    // Handle case where we know there is no BOM coming.
	    const int bufferSize = sizeof(m_bufferedBytes);
	    if (numBufferedBytes + len > bufferSize || flush) {
	        m_atStart = false;
	        if (numBufferedBytes == 0) {
	            return convert(chs, len, flush);
	        }
	        // Copy the held-back bytes out first: convert() may write to m_bufferedBytes.
	        unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
	        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
	        m_numBufferedBytes = 0;
	        // Both calls advance the same decoder state, so they have to run in
	        // stream order. The operands of '+' have no guaranteed evaluation
	        // order, hence the named temporaries.
	        DeprecatedString head = convert(bufferedBytes, numBufferedBytes, false);
	        DeprecatedString tail = convert(chs, len, flush);
	        return head + tail;
	    }

	    // Continue to look for the BOM.
	    memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
	    m_numBufferedBytes += len;
	    return "";
	}

	// Encodes a string into this object's encoding.
	//
	// qcs: the text to encode.
	// allowEntities: when true, ICU's escape callback (UCNV_ESCAPE_XML_DEC) handles
	//     characters the target encoding cannot represent; when false they are
	//     substituted with '?'.
	// Returns the encoded bytes, or a default-constructed DeprecatedCString when no
	// converter could be created or the callbacks could not be installed.
	DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
	{
	    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

	    // Shortcuts that need no converter at all.
	    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
	        return qcs.latin1();

	    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
	        && qcs.isAllASCII())
	        return qcs.ascii();

	    // FIXME: We should see if there is "force ASCII range" mode in ICU;
	    // until then, we change the backslash into a yen sign.
	    // Encoding will change the yen sign back into a backslash.
	    DeprecatedString copy = qcs;
	    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

	    if (!m_converterICU)
	        createICUConverter();
	    if (!m_converterICU)
	        return DeprecatedCString();

	    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
	    // convert from that w/o having to allocate a unicode buffer

	    char buffer[ConversionBufferSize];
	    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
	    const UChar* sourceLimit = source + copy.length();

	    // Normalize to NFC first unless the quick check says the text already is.
	    UErrorCode err = U_ZERO_ERROR;
	    DeprecatedString normalizedString;
	    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
	        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed

	        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
	        if (err == U_BUFFER_OVERFLOW_ERROR) {
	            // Retry with the length ICU reported as necessary.
	            err = U_ZERO_ERROR;
	            normalizedString.truncate(normalizedLength);
	            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
	        }

	        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
	        sourceLimit = source + normalizedLength;
	    }

	    DeprecatedCString result(1); // for trailing zero

	    if (allowEntities)
	        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
	    else {
	        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
	        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
	    }

	    ASSERT(U_SUCCESS(err));
	    if (U_FAILURE(err))
	        return DeprecatedCString();

	    do {
	        char* target = buffer;
	        // Keep the last byte of the buffer free for the terminating zero
	        // written below: when ICU fills the output completely, target ends up
	        // equal to targetLimit, and the zero must still land inside buffer.
	        char* targetLimit = target + ConversionBufferSize - 1;
	        err = U_ZERO_ERROR;
	        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err);
	        int count = target - buffer;
	        buffer[count] = 0;
	        // NOTE(review): buffer is appended as a zero-terminated string, so
	        // encoded output that itself contains zero bytes would presumably be
	        // cut short here - confirm against DeprecatedCString::append.
	        result.append(buffer);
	    } while (err == U_BUFFER_OVERFLOW_ERROR);

	    return result;
	}


	} // namespace WebCore