Source/WebCore/platform/text/win/TextCodecWin.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved.
  * Copyright (C) 2010-2012 Patrick Gansterer <paroga@paroga.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  *  This library is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  Library General Public License for more details.
  *
  *  You should have received a copy of the GNU Library General Public License
  *  along with this library; see the file COPYING.LIB.  If not, write to
  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  *  Boston, MA 02110-1301, USA.
  */

 #include "config.h"
 #include "TextCodecWin.h"

 #include "COMPtr.h"
 #include <mlang.h>
 #include <windows.h>
 #include <wtf/HashMap.h>
 #include <wtf/HashSet.h>
 #include <wtf/text/CString.h>
 #include <wtf/text/StringHash.h>
 #include <wtf/text/WTFString.h>

 namespace WebCore {

 // Everything this file records about one charset reported by MLang.
 struct CharsetInfo {
     CString m_name; // Name the charset is registered under.
     String m_friendlyName; // Description string supplied by MLang.
     UINT m_codePage { 0 }; // Windows code page ID; zero until LanguageManager assigns it.
     Vector<CString> m_aliases; // Additional names that resolve to m_name.
 };

 // Populates the file-local charset tables from its constructor. The
 // constructor is private, so the only instance is the one created by the
 // befriended languageManager() accessor.
 class LanguageManager {
     LanguageManager();

     friend LanguageManager& languageManager();
 };

 // Lookup table from a Windows code page ID to the charset name recorded for it.
 // Key: code page ID. Value: charset name (which is in turn a key of knownCharsets()).
 static HashMap<UINT, CString>& codePageCharsets()
 {
     static HashMap<UINT, CString> namesByCodePage;
     return namesByCodePage;
 }

 // Lookup table from a charset name to the CharsetInfo recorded for it.
 static HashMap<String, CharsetInfo>& knownCharsets()
 {
     static HashMap<String, CharsetInfo> infoByName;
     return infoByName;
 }

 // Names of the charsets that LanguageManager registered, i.e. those whose
 // code page passed IsValidCodePage(). Each name is a key of knownCharsets().
 using CharsetSet = HashSet<String>;
 static CharsetSet& supportedCharsets()
 {
     static CharsetSet registeredNames;
     return registeredNames;
 }

 // Returns the single LanguageManager. Its constructor runs on the first call,
 // which is what fills the charset tables above.
 static LanguageManager& languageManager()
 {
     static LanguageManager instance;
     return instance;
 }

 // Enumerates the browser-visible code pages through MLang and records each
 // valid one in codePageCharsets(), knownCharsets() and supportedCharsets().
 // If MLang cannot be created or enumerated the tables are left untouched.
 LanguageManager::LanguageManager()
 {
     COMPtr<IMultiLanguage> mlang;
     if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, 0, CLSCTX_INPROC_SERVER, IID_IMultiLanguage, reinterpret_cast<LPVOID*>(&mlang))))
         return;

     COMPtr<IEnumCodePage> codePageEnumerator;
     if (FAILED(mlang->EnumCodePages(MIMECONTF_BROWSER, &codePageEnumerator)))
         return;

     HashMap<UINT, CString>& namesByCodePage = codePageCharsets();
     HashMap<String, CharsetInfo>& infoByName = knownCharsets();

     MIMECPINFO entry;
     ULONG fetchedCount;
     for (;;) {
         // Stop as soon as the enumerator fails or hands back no entry.
         if (FAILED(codePageEnumerator->Next(1, &entry, &fetchedCount)) || !fetchedCount)
             break;

         if (!IsValidCodePage(entry.uiCodePage))
             continue;

         HashMap<UINT, CString>::iterator byCodePage = namesByCodePage.find(entry.uiCodePage);

         CString webCharset(String(entry.wszWebCharset).latin1());
         if (byCodePage == namesByCodePage.end()) {
             // First time this code page is seen: create its record under the web charset name.
             CharsetInfo freshInfo;
             freshInfo.m_codePage = entry.uiCodePage;
             infoByName.set(webCharset.data(), freshInfo);
             byCodePage = namesByCodePage.set(entry.uiCodePage, webCharset).iterator;
         }
         if (byCodePage == namesByCodePage.end())
             continue;

         // Look the record up by the name stored for the code page, which may
         // differ from webCharset when the code page was seen before.
         const CString& registeredName = byCodePage->value;
         HashMap<String, CharsetInfo>::iterator record = infoByName.find(String(registeredName.data(), registeredName.length()));
         ASSERT(record != infoByName.end());

         CharsetInfo& charset = record->value;
         charset.m_name = registeredName.data();
         charset.m_friendlyName = entry.wszDescription;
         charset.m_aliases.append(webCharset);
         charset.m_aliases.append(String(entry.wszHeaderCharset).latin1());
         charset.m_aliases.append(String(entry.wszBodyCharset).latin1());
         String codePageAlias = "cp" + String::number(entry.uiCodePage);
         charset.m_aliases.append(codePageAlias.latin1());
         supportedCharsets().add(registeredName.data());
     }
 }

 // Returns the code page recorded for the charset called |name|, or CP_ACP
 // when no such charset is known.
 static UINT getCodePage(const char* name)
 {
     // The table is deliberately accessed through a const reference so that
     // find() and end() both yield const_iterator; Visual Studio otherwise
     // fails to find an operator== for the iterator/const_iterator mix.
     const HashMap<String, CharsetInfo>& registry = knownCharsets();
     HashMap<String, CharsetInfo>::const_iterator match = registry.find(name);
     if (match == registry.end())
         return CP_ACP;
     return match->value.m_codePage;
 }

 // Codec factory handed to the registrar: builds a TextCodecWin for the code
 // page that matches the encoding's name. The second argument is unused.
 static std::unique_ptr<TextCodec> newTextCodecWin(const TextEncoding& encoding, const void*)
 {
     UINT codePage = getCodePage(encoding.name());
     return makeUnique<TextCodecWin>(codePage);
 }

 // Creates a codec that converts to and from the given Windows code page.
 TextCodecWin::TextCodecWin(UINT codePage)
     : m_codePage(codePage)
 {
 }

 // Nothing to release explicitly; the compiler-generated destructor is used.
 TextCodecWin::~TextCodecWin() = default;

 // Reports every registered charset to |registrar|: first the charset's own
 // name mapped to itself, then each of its aliases mapped to that name.
 void TextCodecWin::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
 {
     languageManager(); // Make sure the charset tables have been filled.

     HashMap<String, CharsetInfo>& infoByName = knownCharsets();
     for (const String& charsetName : supportedCharsets()) {
         HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
         if (record == infoByName.end())
             continue;

         const char* canonicalName = record->value.m_name.data();
         registrar(canonicalName, canonicalName);
         for (const CString& alias : record->value.m_aliases)
             registrar(alias.data(), canonicalName);
     }
 }

 // Registers newTextCodecWin as the codec factory for every registered charset.
 void TextCodecWin::registerExtendedCodecs(TextCodecRegistrar registrar)
 {
     languageManager(); // Make sure the charset tables have been filled.

     HashMap<String, CharsetInfo>& infoByName = knownCharsets();
     for (const String& charsetName : supportedCharsets()) {
         HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
         if (record == infoByName.end())
             continue;
         registrar(record->value.m_name.data(), newTextCodecWin, 0);
     }
 }

 // Returns the dwFlags value to pass to MultiByteToWideChar for |codePage|.
 //
 // The Win32 documentation requires dwFlags to be 0 for a fixed list of code
 // pages and allows only 0 or MB_ERR_INVALID_CHARS for UTF-8; any other
 // combination makes the call fail with ERROR_INVALID_FLAGS. Every other code
 // page gets MB_PRECOMPOSED | MB_ERR_INVALID_CHARS so that invalid input is
 // reported as a failure instead of being silently converted.
 static DWORD getCodePageFlags(UINT codePage)
 {
     switch (codePage) {
     case 42: // Symbol
     case 50220:
     case 50221:
     case 50222:
     case 50225:
     case 50227:
     case 50229:
     case 52936:
     case 54936:
     case 65000: // UTF-7
         return 0;
     case 65001: // UTF-8: MB_PRECOMPOSED is rejected, MB_ERR_INVALID_CHARS is allowed.
         return MB_ERR_INVALID_CHARS;
     default:
         break;
     }

     // ISCII code pages. The upper bound used to be written as 57001, which
     // made this range test always false.
     if (codePage >= 57002 && codePage <= 57011)
         return 0;

     return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS;
 }

 // Returns a pointer to the first byte in [bytes, bytes + length) whose high
 // bit is set, or bytes + length when every byte is 7-bit ASCII.
 static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length)
 {
     const char* const end = bytes + length;
     const char* cursor = bytes;
     while (cursor < end && !(*cursor & 0x80))
         ++cursor;
     return cursor;
 }

 // Decodes as much of [bytes, bytes + length) as MultiByteToWideChar accepts
 // for |codePage|, appending the UTF-16 output to |result|.
 //
 // On return *left holds the number of trailing input bytes that were NOT
 // decoded (0 when everything was consumed). When a window fails to convert,
 // the function retries with windows of roughly half the size, so the valid
 // prefix in front of an undecodable byte still ends up in |result|.
 static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left)
 {
     // Until proven otherwise, nothing has been decoded.
     *left = length;
     if (!bytes || !length)
         return;

     DWORD flags = getCodePageFlags(codePage);

     // testLength: size of the window tried in this iteration.
     // untestedLength: number of bytes, counted from |bytes|, that are still candidates.
     // NOTE(review): |length| is a size_t narrowed to int here; presumably callers
     // never pass more than INT_MAX bytes — confirm.
     int testLength = length;
     int untestedLength = length;
     for (;;) {
         // First call only measures: with a null output buffer it returns the
         // required size, or 0 when the window cannot be converted.
         int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0);

         if (resultLength > 0) {
             int oldSize = result.size();
             result.resize(oldSize + resultLength);

             // Second call performs the conversion straight into the grown tail of |result|.
             MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength);

             // The window covered every remaining candidate byte: done. Anything
             // beyond it was excluded by an earlier failure and is reported in *left.
             if (testLength == untestedLength) {
                 *left = length - testLength;
                 break;
             }
             // Otherwise step past the bytes just decoded.
             untestedLength -= testLength;
             length -= testLength;
             bytes += testLength;
         } else {
             // The window failed: drop its last byte from the candidates.
             untestedLength = testLength - 1;
             if (!untestedLength) {
                 // A single byte failed on its own; give up with |length| bytes left.
                 *left = length;
                 break;
             }
         }
         // Next window is half of the remaining candidates, rounded up.
         testLength = (untestedLength + 1) / 2;
     }
 }

 // Decodes |length| bytes of input in this codec's code page and returns the
 // resulting string.
 //
 // bytes/length: the new chunk of input.
 // flush:        when false, a short undecoded tail is kept in m_decodeBuffer and
 //               prepended to the next call instead of being treated as an error.
 // stopOnError:  when true, return immediately after the first undecodable byte.
 // sawError:     set to true when an undecodable byte is met; never reset here.
 String TextCodecWin::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
 {
     // Bytes were held back by an earlier call: decode from the buffer with the
     // new input appended to it.
     if (!m_decodeBuffer.isEmpty()) {
         m_decodeBuffer.append(bytes, length);
         bytes = m_decodeBuffer.data();
         length = m_decodeBuffer.size();
     }

     size_t left;
     Vector<UChar, 8192> result;
     for (;;) {
         decodeInternal(result, m_codePage, bytes, length, &left);
         if (!left)
             break;

         // Fewer than 16 bytes remain and more input may follow: keep them for the
         // next call (presumably they may be an incomplete multi-byte sequence).
         if (!flush && left < 16)
             break;

         // The byte at the front of the undecoded tail is treated as invalid and
         // replaced with '?'.
         result.append(L'?');
         sawError = true;
         if (stopOnError)
             return String::adopt(result);

         // The invalid byte was the last one; nothing more to decode.
         if (left == 1)
             break;

         // Skip the invalid byte and continue with what follows it.
         bytes += length - left + 1;
         length = left - 1;
     }
     if (left && !flush) {
         // Save the undecoded tail. When |bytes| already points into
         // m_decodeBuffer the tail is moved to the front of that same buffer,
         // hence memmove rather than a fresh append.
         if (m_decodeBuffer.isEmpty())
             m_decodeBuffer.append(bytes + length - left, left);
         else {
             memmove(m_decodeBuffer.data(), bytes + length - left, left);
             m_decodeBuffer.resize(left);
         }
     } else
         m_decodeBuffer.clear();

     return String::adopt(result);
 }

 // Converts |length| UTF-16 code units to this codec's code page.
 // Returns an empty CString for null or empty input, and the literal "?" when
 // WideCharToMultiByte reports that it cannot size the output.
 // NOTE(review): the Win32 docs restrict dwFlags for some code pages; confirm
 // WC_COMPOSITECHECK is accepted for every code page that reaches this codec.
 CString TextCodecWin::encode(const UChar* characters, size_t length, UnencodableHandling)
 {
     const bool hasInput = characters && length;
     if (!hasInput)
         return CString();

     // Sizing pass: a null output buffer makes the call return the byte count needed.
     const int requiredBytes = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, 0, 0, 0, 0);

     // FIXME: We need to implement UnencodableHandling.

     if (requiredBytes < 1)
         return "?";

     // Conversion pass, writing directly into the CString's storage.
     char* destination;
     CString encoded = CString::newUninitialized(requiredBytes, destination);
     WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, destination, requiredBytes, 0, 0);
     return encoded;
 }

 // Passes the name, description and code page of each registered charset to
 // |receiver|, stopping early as soon as receive() returns false.
 void TextCodecWin::enumerateSupportedEncodings(EncodingReceiver& receiver)
 {
     languageManager(); // Make sure the charset tables have been filled.

     HashMap<String, CharsetInfo>& infoByName = knownCharsets();
     for (const String& charsetName : supportedCharsets()) {
         HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
         if (record == infoByName.end())
             continue;

         CharsetInfo& charset = record->value;
         if (!receiver.receive(charset.m_name.data(), charset.m_friendlyName.charactersWithNullTermination().data(), charset.m_codePage))
             break;
     }
 }

 } // namespace WebCore
	/*
	* Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved.
	* Copyright (C) 2010-2012 Patrick Gansterer <paroga@paroga.com>
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Library General Public License for more details.
	*
	* You should have received a copy of the GNU Library General Public License
	* along with this library; see the file COPYING.LIB. If not, write to
	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	* Boston, MA 02110-1301, USA.
	*/

	#include "config.h"
	#include "TextCodecWin.h"

	#include "COMPtr.h"
	#include <mlang.h>
	#include <windows.h>
	#include <wtf/HashMap.h>
	#include <wtf/HashSet.h>
	#include <wtf/text/CString.h>
	#include <wtf/text/StringHash.h>
	#include <wtf/text/WTFString.h>

	namespace WebCore {

	// Everything this file records about one charset reported by MLang.
	struct CharsetInfo {
	    CString m_name; // Name the charset is registered under.
	    String m_friendlyName; // Description string supplied by MLang.
	    UINT m_codePage { 0 }; // Windows code page ID; zero until LanguageManager assigns it.
	    Vector<CString> m_aliases; // Additional names that resolve to m_name.
	};

	// Populates the file-local charset tables from its constructor. The
	// constructor is private, so the only instance is the one created by the
	// befriended languageManager() accessor.
	class LanguageManager {
	    LanguageManager();

	    friend LanguageManager& languageManager();
	};

	// Lookup table from a Windows code page ID to the charset name recorded for it.
	// Key: code page ID. Value: charset name (which is in turn a key of knownCharsets()).
	static HashMap<UINT, CString>& codePageCharsets()
	{
	    static HashMap<UINT, CString> namesByCodePage;
	    return namesByCodePage;
	}

	// Lookup table from a charset name to the CharsetInfo recorded for it.
	static HashMap<String, CharsetInfo>& knownCharsets()
	{
	    static HashMap<String, CharsetInfo> infoByName;
	    return infoByName;
	}

	// Names of the charsets that LanguageManager registered, i.e. those whose
	// code page passed IsValidCodePage(). Each name is a key of knownCharsets().
	using CharsetSet = HashSet<String>;
	static CharsetSet& supportedCharsets()
	{
	    static CharsetSet registeredNames;
	    return registeredNames;
	}

	// Returns the single LanguageManager. Its constructor runs on the first call,
	// which is what fills the charset tables above.
	static LanguageManager& languageManager()
	{
	    static LanguageManager instance;
	    return instance;
	}

	// Enumerates the browser-visible code pages through MLang and records each
	// valid one in codePageCharsets(), knownCharsets() and supportedCharsets().
	// If MLang cannot be created or enumerated the tables are left untouched.
	LanguageManager::LanguageManager()
	{
	    COMPtr<IMultiLanguage> mlang;
	    if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, 0, CLSCTX_INPROC_SERVER, IID_IMultiLanguage, reinterpret_cast<LPVOID*>(&mlang))))
	        return;

	    COMPtr<IEnumCodePage> codePageEnumerator;
	    if (FAILED(mlang->EnumCodePages(MIMECONTF_BROWSER, &codePageEnumerator)))
	        return;

	    HashMap<UINT, CString>& namesByCodePage = codePageCharsets();
	    HashMap<String, CharsetInfo>& infoByName = knownCharsets();

	    MIMECPINFO entry;
	    ULONG fetchedCount;
	    for (;;) {
	        // Stop as soon as the enumerator fails or hands back no entry.
	        if (FAILED(codePageEnumerator->Next(1, &entry, &fetchedCount)) || !fetchedCount)
	            break;

	        if (!IsValidCodePage(entry.uiCodePage))
	            continue;

	        HashMap<UINT, CString>::iterator byCodePage = namesByCodePage.find(entry.uiCodePage);

	        CString webCharset(String(entry.wszWebCharset).latin1());
	        if (byCodePage == namesByCodePage.end()) {
	            // First time this code page is seen: create its record under the web charset name.
	            CharsetInfo freshInfo;
	            freshInfo.m_codePage = entry.uiCodePage;
	            infoByName.set(webCharset.data(), freshInfo);
	            byCodePage = namesByCodePage.set(entry.uiCodePage, webCharset).iterator;
	        }
	        if (byCodePage == namesByCodePage.end())
	            continue;

	        // Look the record up by the name stored for the code page, which may
	        // differ from webCharset when the code page was seen before.
	        const CString& registeredName = byCodePage->value;
	        HashMap<String, CharsetInfo>::iterator record = infoByName.find(String(registeredName.data(), registeredName.length()));
	        ASSERT(record != infoByName.end());

	        CharsetInfo& charset = record->value;
	        charset.m_name = registeredName.data();
	        charset.m_friendlyName = entry.wszDescription;
	        charset.m_aliases.append(webCharset);
	        charset.m_aliases.append(String(entry.wszHeaderCharset).latin1());
	        charset.m_aliases.append(String(entry.wszBodyCharset).latin1());
	        String codePageAlias = "cp" + String::number(entry.uiCodePage);
	        charset.m_aliases.append(codePageAlias.latin1());
	        supportedCharsets().add(registeredName.data());
	    }
	}

	// Returns the code page recorded for the charset called |name|, or CP_ACP
	// when no such charset is known.
	static UINT getCodePage(const char* name)
	{
	    // The table is deliberately accessed through a const reference so that
	    // find() and end() both yield const_iterator; Visual Studio otherwise
	    // fails to find an operator== for the iterator/const_iterator mix.
	    const HashMap<String, CharsetInfo>& registry = knownCharsets();
	    HashMap<String, CharsetInfo>::const_iterator match = registry.find(name);
	    if (match == registry.end())
	        return CP_ACP;
	    return match->value.m_codePage;
	}

	// Codec factory handed to the registrar: builds a TextCodecWin for the code
	// page that matches the encoding's name. The second argument is unused.
	static std::unique_ptr<TextCodec> newTextCodecWin(const TextEncoding& encoding, const void*)
	{
	    UINT codePage = getCodePage(encoding.name());
	    return makeUnique<TextCodecWin>(codePage);
	}

	// Creates a codec that converts to and from the given Windows code page.
	TextCodecWin::TextCodecWin(UINT codePage)
	: m_codePage(codePage)
	{
	}

	// Nothing to release explicitly; the compiler-generated destructor is used.
	TextCodecWin::~TextCodecWin() = default;

	// Reports every registered charset to |registrar|: first the charset's own
	// name mapped to itself, then each of its aliases mapped to that name.
	void TextCodecWin::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
	{
	    languageManager(); // Make sure the charset tables have been filled.

	    HashMap<String, CharsetInfo>& infoByName = knownCharsets();
	    for (const String& charsetName : supportedCharsets()) {
	        HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
	        if (record == infoByName.end())
	            continue;

	        const char* canonicalName = record->value.m_name.data();
	        registrar(canonicalName, canonicalName);
	        for (const CString& alias : record->value.m_aliases)
	            registrar(alias.data(), canonicalName);
	    }
	}

	// Registers newTextCodecWin as the codec factory for every registered charset.
	void TextCodecWin::registerExtendedCodecs(TextCodecRegistrar registrar)
	{
	    languageManager(); // Make sure the charset tables have been filled.

	    HashMap<String, CharsetInfo>& infoByName = knownCharsets();
	    for (const String& charsetName : supportedCharsets()) {
	        HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
	        if (record == infoByName.end())
	            continue;
	        registrar(record->value.m_name.data(), newTextCodecWin, 0);
	    }
	}

	// Returns the dwFlags value to pass to MultiByteToWideChar for |codePage|.
	//
	// The Win32 documentation requires dwFlags to be 0 for a fixed list of code
	// pages and allows only 0 or MB_ERR_INVALID_CHARS for UTF-8; any other
	// combination makes the call fail with ERROR_INVALID_FLAGS. Every other code
	// page gets MB_PRECOMPOSED | MB_ERR_INVALID_CHARS so that invalid input is
	// reported as a failure instead of being silently converted.
	static DWORD getCodePageFlags(UINT codePage)
	{
	    switch (codePage) {
	    case 42: // Symbol
	    case 50220:
	    case 50221:
	    case 50222:
	    case 50225:
	    case 50227:
	    case 50229:
	    case 52936:
	    case 54936:
	    case 65000: // UTF-7
	        return 0;
	    case 65001: // UTF-8: MB_PRECOMPOSED is rejected, MB_ERR_INVALID_CHARS is allowed.
	        return MB_ERR_INVALID_CHARS;
	    default:
	        break;
	    }

	    // ISCII code pages. The upper bound used to be written as 57001, which
	    // made this range test always false.
	    if (codePage >= 57002 && codePage <= 57011)
	        return 0;

	    return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS;
	}

	// Returns a pointer to the first byte in [bytes, bytes + length) whose high
	// bit is set, or bytes + length when every byte is 7-bit ASCII.
	static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length)
	{
	    const char* const end = bytes + length;
	    const char* cursor = bytes;
	    while (cursor < end && !(*cursor & 0x80))
	        ++cursor;
	    return cursor;
	}

	// Decodes as much of [bytes, bytes + length) as MultiByteToWideChar accepts
	// for |codePage|, appending the UTF-16 output to |result|.
	//
	// On return *left holds the number of trailing input bytes that were NOT
	// decoded (0 when everything was consumed). When a window fails to convert,
	// the function retries with windows of roughly half the size, so the valid
	// prefix in front of an undecodable byte still ends up in |result|.
	static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left)
	{
	    // Until proven otherwise, nothing has been decoded.
	    *left = length;
	    if (!bytes || !length)
	        return;

	    DWORD flags = getCodePageFlags(codePage);

	    // testLength: size of the window tried in this iteration.
	    // untestedLength: number of bytes, counted from |bytes|, that are still candidates.
	    // NOTE(review): |length| is a size_t narrowed to int here; presumably callers
	    // never pass more than INT_MAX bytes — confirm.
	    int testLength = length;
	    int untestedLength = length;
	    for (;;) {
	        // First call only measures: with a null output buffer it returns the
	        // required size, or 0 when the window cannot be converted.
	        int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0);

	        if (resultLength > 0) {
	            int oldSize = result.size();
	            result.resize(oldSize + resultLength);

	            // Second call performs the conversion straight into the grown tail of |result|.
	            MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength);

	            // The window covered every remaining candidate byte: done. Anything
	            // beyond it was excluded by an earlier failure and is reported in *left.
	            if (testLength == untestedLength) {
	                *left = length - testLength;
	                break;
	            }
	            // Otherwise step past the bytes just decoded.
	            untestedLength -= testLength;
	            length -= testLength;
	            bytes += testLength;
	        } else {
	            // The window failed: drop its last byte from the candidates.
	            untestedLength = testLength - 1;
	            if (!untestedLength) {
	                // A single byte failed on its own; give up with |length| bytes left.
	                *left = length;
	                break;
	            }
	        }
	        // Next window is half of the remaining candidates, rounded up.
	        testLength = (untestedLength + 1) / 2;
	    }
	}

	// Decodes |length| bytes of input in this codec's code page and returns the
	// resulting string.
	//
	// bytes/length: the new chunk of input.
	// flush:        when false, a short undecoded tail is kept in m_decodeBuffer and
	//               prepended to the next call instead of being treated as an error.
	// stopOnError:  when true, return immediately after the first undecodable byte.
	// sawError:     set to true when an undecodable byte is met; never reset here.
	String TextCodecWin::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
	{
	// Bytes were held back by an earlier call: decode from the buffer with the
	// new input appended to it.
	if (!m_decodeBuffer.isEmpty()) {
	m_decodeBuffer.append(bytes, length);
	bytes = m_decodeBuffer.data();
	length = m_decodeBuffer.size();
	}

	size_t left;
	Vector<UChar, 8192> result;
	for (;;) {
	decodeInternal(result, m_codePage, bytes, length, &left);
	if (!left)
	break;

	// Fewer than 16 bytes remain and more input may follow: keep them for the
	// next call (presumably they may be an incomplete multi-byte sequence).
	if (!flush && left < 16)
	break;

	// The byte at the front of the undecoded tail is treated as invalid and
	// replaced with '?'.
	result.append(L'?');
	sawError = true;
	if (stopOnError)
	return String::adopt(result);

	// The invalid byte was the last one; nothing more to decode.
	if (left == 1)
	break;

	// Skip the invalid byte and continue with what follows it.
	bytes += length - left + 1;
	length = left - 1;
	}
	if (left && !flush) {
	// Save the undecoded tail. When |bytes| already points into
	// m_decodeBuffer the tail is moved to the front of that same buffer,
	// hence memmove rather than a fresh append.
	if (m_decodeBuffer.isEmpty())
	m_decodeBuffer.append(bytes + length - left, left);
	else {
	memmove(m_decodeBuffer.data(), bytes + length - left, left);
	m_decodeBuffer.resize(left);
	}
	} else
	m_decodeBuffer.clear();

	return String::adopt(result);
	}

	// Converts |length| UTF-16 code units to this codec's code page.
	// Returns an empty CString for null or empty input, and the literal "?" when
	// WideCharToMultiByte reports that it cannot size the output.
	// NOTE(review): the Win32 docs restrict dwFlags for some code pages; confirm
	// WC_COMPOSITECHECK is accepted for every code page that reaches this codec.
	CString TextCodecWin::encode(const UChar* characters, size_t length, UnencodableHandling)
	{
	    if (!characters || !length)
	        return CString();

	    // Sizing pass: a null output buffer makes the call return the byte count needed.
	    int resultLength = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, 0, 0, 0, 0);

	    // FIXME: We need to implement UnencodableHandling.

	    if (resultLength <= 0)
	        return "?";

	    // Conversion pass, writing directly into the CString's storage.
	    char* characterBuffer;
	    CString result = CString::newUninitialized(resultLength, characterBuffer);

	    WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, characterBuffer, resultLength, 0, 0);

	    return result;
	}

	// Passes the name, description and code page of each registered charset to
	// |receiver|, stopping early as soon as receive() returns false.
	void TextCodecWin::enumerateSupportedEncodings(EncodingReceiver& receiver)
	{
	    languageManager(); // Make sure the charset tables have been filled.

	    HashMap<String, CharsetInfo>& infoByName = knownCharsets();
	    for (const String& charsetName : supportedCharsets()) {
	        HashMap<String, CharsetInfo>::iterator record = infoByName.find(charsetName);
	        if (record == infoByName.end())
	            continue;

	        CharsetInfo& charset = record->value;
	        if (!receiver.receive(charset.m_name.data(), charset.m_friendlyName.charactersWithNullTermination().data(), charset.m_codePage))
	            break;
	    }
	}

	} // namespace WebCore