| /* |
| * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved. |
| * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| |
| #if !PLATFORM(IOS) |
| |
| #include "TextCodecMac.h" |
| |
| #include "CharsetData.h" |
| #include "ThreadGlobalData.h" |
| #include <wtf/Assertions.h> |
| #include <wtf/RetainPtr.h> |
| #include <wtf/Threading.h> |
| #include <wtf/text/CString.h> |
| #include <wtf/text/WTFString.h> |
| #include <wtf/unicode/CharacterNames.h> |
| |
| namespace WebCore { |
| |
| // We need to keep this because ICU doesn't support some of the encodings that we need: |
| // <http://bugs.webkit.org/show_bug.cgi?id=4195>. |
| |
// Number of UniChar elements in the stack buffer that decode() converts text into.
constexpr size_t ConversionBufferSize = 16384;
| |
| static TECConverterWrapper& cachedConverterTEC() |
| { |
| return threadGlobalData().cachedConverterTEC(); |
| } |
| |
| void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar) |
| { |
| TECTextEncodingID lastEncoding = invalidEncoding; |
| const char* lastName = 0; |
| |
| for (size_t i = 0; CharsetTable[i].name; ++i) { |
| if (CharsetTable[i].encoding != lastEncoding) { |
| lastEncoding = CharsetTable[i].encoding; |
| lastName = CharsetTable[i].name; |
| } |
| registrar(CharsetTable[i].name, lastName); |
| } |
| } |
| |
| static std::unique_ptr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData) |
| { |
| return std::make_unique<TextCodecMac>(*static_cast<const TECTextEncodingID*>(additionalData)); |
| } |
| |
| void TextCodecMac::registerCodecs(TextCodecRegistrar registrar) |
| { |
| TECTextEncodingID lastEncoding = invalidEncoding; |
| |
| for (size_t i = 0; CharsetTable[i].name; ++i) |
| if (CharsetTable[i].encoding != lastEncoding) { |
| registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding); |
| lastEncoding = CharsetTable[i].encoding; |
| } |
| } |
| |
| TextCodecMac::TextCodecMac(TECTextEncodingID encoding) |
| : m_encoding(encoding) |
| , m_numBufferedBytes(0) |
| , m_converterTEC(0) |
| { |
| } |
| |
| TextCodecMac::~TextCodecMac() |
| { |
| releaseTECConverter(); |
| } |
| |
| void TextCodecMac::releaseTECConverter() const |
| { |
| if (m_converterTEC) { |
| TECConverterWrapper& cachedConverter = cachedConverterTEC(); |
| if (cachedConverter.converter) |
| TECDisposeConverter(cachedConverter.converter); |
| cachedConverter.converter = m_converterTEC; |
| cachedConverter.encoding = m_encoding; |
| m_converterTEC = 0; |
| } |
| } |
| |
| OSStatus TextCodecMac::createTECConverter() const |
| { |
| TECConverterWrapper& cachedConverter = cachedConverterTEC(); |
| |
| bool cachedEncodingEqual = cachedConverter.encoding == m_encoding; |
| cachedConverter.encoding = invalidEncoding; |
| |
| if (cachedEncodingEqual && cachedConverter.converter) { |
| m_converterTEC = cachedConverter.converter; |
| cachedConverter.converter = 0; |
| |
| TECClearConverterContextInfo(m_converterTEC); |
| } else { |
| OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding, |
| CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat)); |
| if (status) |
| return status; |
| |
| TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask); |
| } |
| |
| return noErr; |
| } |
| |
| OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, |
| void *outputBuffer, int outputBufferLength, int& outputLength) |
| { |
| OSStatus status; |
| unsigned long bytesRead = 0; |
| unsigned long bytesWritten = 0; |
| |
| if (m_numBufferedBytes != 0) { |
| // Finish converting a partial character that's in our buffer. |
| |
| // First, fill the partial character buffer with as many bytes as are available. |
| ASSERT_WITH_SECURITY_IMPLICATION(m_numBufferedBytes < sizeof(m_bufferedBytes)); |
| const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes; |
| const int bytesToPutInBuffer = std::min(spaceInBuffer, inputBufferLength); |
| ASSERT(bytesToPutInBuffer != 0); |
| memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer); |
| |
| // Now, do a conversion on the buffer. |
| status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead, |
| reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); |
| ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer); |
| |
| if (status == kTECPartialCharErr && bytesRead == 0) { |
| // Handle the case where the partial character was not converted. |
| if (bytesToPutInBuffer >= spaceInBuffer) { |
| LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes)); |
| m_numBufferedBytes = 0; |
| status = kTECUnmappableElementErr; // should never happen, but use this error code |
| } else { |
| // Tell the caller we read all the source bytes and keep them in the buffer. |
| m_numBufferedBytes += bytesToPutInBuffer; |
| bytesRead = bytesToPutInBuffer; |
| status = noErr; |
| } |
| } else { |
| // We are done with the partial character buffer. |
| // Also, we have read some of the bytes from the main buffer. |
| if (bytesRead > m_numBufferedBytes) { |
| bytesRead -= m_numBufferedBytes; |
| } else { |
| LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr"); |
| bytesRead = 0; |
| } |
| m_numBufferedBytes = 0; |
| if (status == kTECPartialCharErr) { |
| // While there may be a partial character problem in the small buffer, |
| // we have to try again and not get confused and think there is a partial |
| // character problem in the large buffer. |
| status = noErr; |
| } |
| } |
| } else { |
| status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead, |
| static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); |
| ASSERT(static_cast<int>(bytesRead) <= inputBufferLength); |
| } |
| |
| // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus. |
| if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) |
| status = kTECOutputBufferFullStatus; |
| |
| inputLength = bytesRead; |
| outputLength = bytesWritten; |
| return status; |
| } |
| |
| String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) |
| { |
| // Get a converter for the passed-in encoding. |
| if (!m_converterTEC && createTECConverter() != noErr) |
| return String(); |
| |
| Vector<UChar> result; |
| |
| const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes); |
| int sourceLength = length; |
| bool bufferWasFull = false; |
| UniChar buffer[ConversionBufferSize]; |
| |
| while ((sourceLength || bufferWasFull) && !sawError) { |
| int bytesRead = 0; |
| int bytesWritten = 0; |
| OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten); |
| ASSERT(bytesRead <= sourceLength); |
| sourcePointer += bytesRead; |
| sourceLength -= bytesRead; |
| |
| switch (status) { |
| case noErr: |
| case kTECOutputBufferFullStatus: |
| break; |
| case kTextMalformedInputErr: |
| case kTextUndefinedElementErr: |
| // FIXME: Put FFFD character into the output string in this case? |
| TECClearConverterContextInfo(m_converterTEC); |
| if (stopOnError) { |
| sawError = true; |
| break; |
| } |
| if (sourceLength) { |
| sourcePointer += 1; |
| sourceLength -= 1; |
| } |
| break; |
| case kTECPartialCharErr: { |
| // Put the partial character into the buffer. |
| ASSERT(m_numBufferedBytes == 0); |
| const int bufferSize = sizeof(m_numBufferedBytes); |
| if (sourceLength < bufferSize) { |
| memcpy(m_bufferedBytes, sourcePointer, sourceLength); |
| m_numBufferedBytes = sourceLength; |
| } else { |
| LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength); |
| } |
| sourceLength = 0; |
| break; |
| } |
| default: |
| sawError = true; |
| return String(); |
| } |
| |
| ASSERT(!(bytesWritten % sizeof(UChar))); |
| result.append(buffer, bytesWritten / sizeof(UChar)); |
| |
| bufferWasFull = status == kTECOutputBufferFullStatus; |
| } |
| |
| if (flush) { |
| unsigned long bytesWritten = 0; |
| TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten); |
| ASSERT(!(bytesWritten % sizeof(UChar))); |
| result.append(buffer, bytesWritten / sizeof(UChar)); |
| } |
| |
| String resultString = String::adopt(WTFMove(result)); |
| |
| // <rdar://problem/3225472> |
| // Simplified Chinese pages use the code A3A0 to mean "full-width space". |
| // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice. |
| // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space). |
| if (m_encoding == kCFStringEncodingGB_18030_2000) |
| resultString.replace(0xE5E5, ideographicSpace); |
| |
| return resultString; |
| } |
| |
| CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling) |
| { |
| // FIXME: We should really use TEC here instead of CFString for consistency with the other direction. |
| |
| // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign. |
| // Encoding will change the yen sign back into a backslash. |
| String copy(characters, length); |
| copy.replace('\\', m_backslashAsCurrencySymbol); |
| RetainPtr<CFStringRef> cfs = copy.createCFString(); |
| |
| CFIndex startPos = 0; |
| CFIndex charactersLeft = CFStringGetLength(cfs.get()); |
| Vector<char> result; |
| size_t size = 0; |
| UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0; |
| while (charactersLeft > 0) { |
| CFRange range = CFRangeMake(startPos, charactersLeft); |
| CFIndex bufferLength; |
| CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength); |
| |
| result.grow(size + bufferLength); |
| unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size); |
| CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength); |
| size += bufferLength; |
| |
| if (charactersConverted != charactersLeft) { |
| unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); |
| ++charactersConverted; |
| if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate |
| UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); |
| if ((low & 0xFC00) == 0xDC00) { // is low surrogate |
| badChar <<= 10; |
| badChar += low; |
| badChar += 0x10000 - (0xD800 << 10) - 0xDC00; |
| ++charactersConverted; |
| } |
| } |
| UnencodableReplacementArray entity; |
| int entityLength = getUnencodableReplacement(badChar, handling, entity); |
| result.grow(size + entityLength); |
| memcpy(result.data() + size, entity, entityLength); |
| size += entityLength; |
| } |
| |
| startPos += charactersConverted; |
| charactersLeft -= charactersConverted; |
| } |
| return CString(result.data(), size); |
| } |
| |
| } // namespace WebCore |
| |
| #endif // !PLATFORM(IOS) |