blob: 2ceb475e5faebb670c7414475a472924952dca9e [file] [log] [blame]
/*
* Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved.
* Copyright (C) 2010-2012 Patrick Gansterer <paroga@paroga.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#include "config.h"
#include "TextCodecWin.h"
#include "COMPtr.h"
#include <mlang.h>
#include <windows.h>
#include <wtf/HashMap.h>
#include <wtf/HashSet.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringHash.h>
#include <wtf/text/WTFString.h>
namespace WebCore {
// Describes one charset discovered while enumerating code pages; the fields are
// filled in by the LanguageManager constructor.
struct CharsetInfo {
// Canonical charset name; used as the encoding name when registering.
CString m_name;
// Description text copied from MIMECPINFO::wszDescription.
String m_friendlyName;
// Windows code page identifier of this charset.
UINT m_codePage;
// Other names registered for this charset (web/header/body charset names and "cp<code page>").
Vector<CString> m_aliases;
};
// Its constructor fills the charset tables from the code page enumeration.
// The constructor is private; the befriended languageManager() function creates the instance.
class LanguageManager {
private:
LanguageManager();
friend LanguageManager& languageManager();
};
// Usage: a lookup table used to get the charset name registered for a code page ID.
// Key: code page ID. Value: charset name (which is also the key into knownCharsets()).
static HashMap<UINT, CString>& codePageCharsets()
{
    static HashMap<UINT, CString> charsetNameByCodePage;
    return charsetNameByCodePage;
}
// Usage: a table of every charset recorded during code page enumeration.
// Key: charset name. Value: the CharsetInfo collected for that charset.
static HashMap<String, CharsetInfo>& knownCharsets()
{
    static HashMap<String, CharsetInfo> charsetInfoByName;
    return charsetInfoByName;
}
// Usage: the set of charset names that are supported by the system.
// Each entry is a charset name that can be looked up in knownCharsets().
using CharsetSet = HashSet<String>;
static CharsetSet& supportedCharsets()
{
    static CharsetSet supportedNames;
    return supportedNames;
}
// Returns the single LanguageManager instance. It is constructed on the first
// call, which is what populates the charset tables.
static LanguageManager& languageManager()
{
    static LanguageManager instance;
    return instance;
}
// Enumerates the code pages MLang reports for MIMECONTF_BROWSER and records every
// valid one in codePageCharsets(), knownCharsets() and supportedCharsets().
// Returns without touching the tables if MLang cannot be created or enumerated.
LanguageManager::LanguageManager()
{
    COMPtr<IMultiLanguage> mlang;
    if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, 0, CLSCTX_INPROC_SERVER, IID_IMultiLanguage, reinterpret_cast<LPVOID*>(&mlang))))
        return;

    COMPtr<IEnumCodePage> codePageEnumerator;
    if (FAILED(mlang->EnumCodePages(MIMECONTF_BROWSER, &codePageEnumerator)))
        return;

    HashMap<UINT, CString>& charsetNames = codePageCharsets();
    HashMap<String, CharsetInfo>& charsets = knownCharsets();

    MIMECPINFO codePageInfo;
    ULONG fetchedCount;
    while (SUCCEEDED(codePageEnumerator->Next(1, &codePageInfo, &fetchedCount)) && fetchedCount) {
        if (!IsValidCodePage(codePageInfo.uiCodePage))
            continue;

        auto entry = charsetNames.find(codePageInfo.uiCodePage);
        CString webCharset(String(codePageInfo.wszWebCharset).latin1());

        if (entry == charsetNames.end()) {
            // First time this code page shows up: create its CharsetInfo and
            // remember which charset name the code page maps to.
            CharsetInfo freshInfo;
            freshInfo.m_codePage = codePageInfo.uiCodePage;
            charsets.set(webCharset.data(), freshInfo);
            entry = charsetNames.set(codePageInfo.uiCodePage, webCharset).iterator;
        }

        if (entry == charsetNames.end())
            continue;

        auto known = charsets.find(String(entry->value.data(), entry->value.length()));
        ASSERT(known != charsets.end());

        CharsetInfo& info = known->value;
        info.m_name = entry->value.data();
        info.m_friendlyName = codePageInfo.wszDescription;

        // Every name MLang reports for the code page, plus "cp<code page>", becomes an alias.
        info.m_aliases.append(webCharset);
        info.m_aliases.append(String(codePageInfo.wszHeaderCharset).latin1());
        info.m_aliases.append(String(codePageInfo.wszBodyCharset).latin1());
        String codePageAlias = "cp" + String::number(codePageInfo.uiCodePage);
        info.m_aliases.append(codePageAlias.latin1());

        supportedCharsets().add(entry->value.data());
    }
}
// Returns the code page recorded for the charset called |name|, or CP_ACP when
// the name is not in knownCharsets().
static UINT getCodePage(const char* name)
{
    // Go through a "const" reference on purpose: it makes find() and end() both
    // return const_iterator, which avoids the VS build error about "==" not being
    // found for const_iterator and iterator.
    const HashMap<String, CharsetInfo>& charsets = knownCharsets();
    HashMap<String, CharsetInfo>::const_iterator match = charsets.find(name);
    if (match == charsets.end())
        return CP_ACP;
    return match->value.m_codePage;
}
static std::unique_ptr<TextCodec> newTextCodecWin(const TextEncoding& encoding, const void*)
{
return makeUnique<TextCodecWin>(getCodePage(encoding.name()));
}
// Creates a codec bound to |codePage|; the value is stored in m_codePage and
// used by decode() and encode().
TextCodecWin::TextCodecWin(UINT codePage)
: m_codePage(codePage)
{
}
// Defaulted destructor.
TextCodecWin::~TextCodecWin() = default;
// Reports every supported charset to |registrar|: first the canonical name mapped
// to itself, then each alias mapped to the canonical name.
void TextCodecWin::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // Make sure the charset tables have been populated.
    languageManager();

    for (const String& charsetName : supportedCharsets()) {
        auto known = knownCharsets().find(charsetName);
        if (known == knownCharsets().end())
            continue;

        const CharsetInfo& info = known->value;
        registrar(info.m_name.data(), info.m_name.data());
        for (const CString& alias : info.m_aliases)
            registrar(alias.data(), info.m_name.data());
    }
}
// Registers newTextCodecWin as the codec factory for the canonical name of every
// supported charset.
void TextCodecWin::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    // Make sure the charset tables have been populated.
    languageManager();

    for (const String& charsetName : supportedCharsets()) {
        auto known = knownCharsets().find(charsetName);
        if (known == knownCharsets().end())
            continue;
        registrar(known->value.m_name.data(), newTextCodecWin, 0);
    }
}
// Returns the dwFlags value to pass to MultiByteToWideChar() for |codePage|.
//
// A fixed list of code pages only accepts dwFlags == 0; for those this returns 0.
// Every other code page gets MB_PRECOMPOSED | MB_ERR_INVALID_CHARS.
static DWORD getCodePageFlags(UINT codePage)
{
    if (codePage == 42) // Symbol
        return 0;

    // Microsoft says the flag must be 0 for the following code pages.
    // All of them other than 42 are above 50000, so smaller IDs skip the checks.
    if (codePage > 50000) {
        if ((codePage >= 50220 && codePage <= 50222)
            || codePage == 50225
            || codePage == 50227
            || codePage == 50229
            || codePage == 52936
            || codePage == 54936
            // ISCII code pages. The upper bound used to be 57001, which made
            // this range test impossible to satisfy.
            || (codePage >= 57002 && codePage <= 57011)
            || codePage == 65000 // UTF-7
            )
            return 0;
    }

    return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS;
}
// Returns a pointer to the first byte in [bytes, bytes + length) that has its high
// bit (0x80) set, or bytes + length when no byte in the range does.
static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length)
{
    const char* const end = bytes + length;
    const char* cursor = bytes;
    while (cursor < end && !(*cursor & 0x80))
        ++cursor;
    return cursor;
}
// Converts as much of |bytes| as it can with MultiByteToWideChar() and appends the
// output to |result|. On return |*left| is the number of trailing input bytes that
// were not converted (0 when the whole input was converted).
//
// When a conversion attempt fails, the attempted length is repeatedly reduced
// (roughly halved) until a prefix converts; after a successful prefix the search
// continues on the bytes following it, limited to the range not yet known to fail.
static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left)
{
// Until something is converted, every input byte counts as left over.
*left = length;
if (!bytes || !length)
return;
DWORD flags = getCodePageFlags(codePage);
// testLength: number of bytes handed to the next conversion attempt.
// untestedLength: number of bytes, counted from |bytes|, that may still convert.
// NOTE(review): |length| is a size_t narrowed to int here; confirm callers never
// pass more than INT_MAX bytes.
int testLength = length;
int untestedLength = length;
for (;;) {
// Sizing call: no output buffer is passed, and the result is used below as the
// number of UChars to make room for. A result <= 0 is treated as failure.
int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0);
if (resultLength > 0) {
int oldSize = result.size();
result.resize(oldSize + resultLength);
// Real conversion, written directly behind the existing contents of |result|.
MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength);
if (testLength == untestedLength) {
// The whole candidate range converted; anything beyond it is left over.
*left = length - testLength;
break;
}
// Only a prefix was tried: consume it and keep searching in what follows.
untestedLength -= testLength;
length -= testLength;
bytes += testLength;
} else {
// testLength bytes do not convert, so at most testLength - 1 can.
untestedLength = testLength - 1;
if (!untestedLength) {
// A single byte was tried and failed: report all remaining bytes as left over.
*left = length;
break;
}
}
// Next attempt covers half (rounded up) of the range that may still convert.
testLength = (untestedLength + 1) / 2;
}
}
// Decodes |length| bytes at |bytes| with m_codePage and returns the decoded text.
//
// Bytes kept in m_decodeBuffer by an earlier call are decoded ahead of the new
// input. A byte that cannot be decoded is replaced by '?' and |sawError| is set;
// with |stopOnError| the text decoded so far is returned at that point. When
// |flush| is false, an undecoded tail shorter than 16 bytes is not treated as an
// error but stored in m_decodeBuffer for the next call.
String TextCodecWin::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
if (!m_decodeBuffer.isEmpty()) {
// There are carried-over bytes: append the new input and decode from the combined buffer.
m_decodeBuffer.append(bytes, length);
bytes = m_decodeBuffer.data();
length = m_decodeBuffer.size();
}
size_t left;
Vector<UChar, 8192> result;
for (;;) {
decodeInternal(result, m_codePage, bytes, length, &left);
// Everything was decoded.
if (!left)
break;
// Not flushing and only a short tail remains (presumably an incomplete
// multi-byte sequence): stop here and keep the tail for the next call.
if (!flush && left < 16)
break;
// Treat the first undecoded byte as an error and emit a replacement for it.
result.append(L'?');
sawError = true;
// NOTE(review): m_decodeBuffer is left as it is on this early return.
if (stopOnError)
return String::adopt(result);
// The failing byte was the last one, so there is nothing left to retry.
if (left == 1)
break;
// Skip the failing byte and try again with the bytes after it.
bytes += length - left + 1;
length = left - 1;
}
if (left && !flush) {
// Keep the undecoded tail for the next call.
if (m_decodeBuffer.isEmpty())
m_decodeBuffer.append(bytes + length - left, left);
else {
// |bytes| points into m_decodeBuffer in this case, so the tail is moved to
// the front of the buffer in place before shrinking it.
memmove(m_decodeBuffer.data(), bytes + length - left, left);
m_decodeBuffer.resize(left);
}
} else
m_decodeBuffer.clear();
return String::adopt(result);
}
// Encodes |length| UChars at |characters| into m_codePage using WideCharToMultiByte().
// Returns an empty CString for null or empty input and "?" when the sizing call
// reports no output. The UnencodableHandling argument is not used yet.
CString TextCodecWin::encode(const UChar* characters, size_t length, UnencodableHandling)
{
    if (!(characters && length))
        return CString();

    // Sizing call: no output buffer, the result is the number of bytes needed.
    const int encodedLength = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, 0, 0, 0, 0);

    // FIXME: We need to implement UnencodableHandling.
    if (encodedLength <= 0)
        return "?";

    // Conversion call: write straight into the CString's uninitialized storage.
    char* destination;
    CString encoded = CString::newUninitialized(encodedLength, destination);
    WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, destination, encodedLength, 0, 0);
    return encoded;
}
// Passes the name, friendly name and code page of every supported charset to
// |receiver|, stopping as soon as receive() returns a false value.
void TextCodecWin::enumerateSupportedEncodings(EncodingReceiver& receiver)
{
    // Make sure the charset tables have been populated.
    languageManager();

    for (const String& charsetName : supportedCharsets()) {
        auto known = knownCharsets().find(charsetName);
        if (known == knownCharsets().end())
            continue;

        CharsetInfo& info = known->value;
        if (!receiver.receive(info.m_name.data(), info.m_friendlyName.charactersWithNullTermination().data(), info.m_codePage))
            break;
    }
}
} // namespace WebCore