// blob: 63e427e79bba85b07e374f0e7ecce2fc486e85fb [file] [log] [blame]
/*
This file is part of the KDE libraries
Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
#include "config.h"
#include "Decoder.h"
#include "HTMLNames.h"
#include "StreamingTextDecoder.h"
#include "RegularExpression.h"
using namespace WebCore;
using namespace HTMLNames;
// Helper used to guess which Japanese encoding a run of bytes is written in.
// All members are static; the class is never instantiated in this file.
class KanjiCode {
public:
    // Encodings that judge() can name. As written, judge() only ever
    // returns ASCII, JIS, EUC or SJIS.
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Inspects the first `length` bytes of `str` and returns the guessed encoding.
    static enum Type judge(const char* str, int length);

    static const int ESC;   // escape byte, 0x1b
    static const int _SS2_; // 0x8e
    // One flag byte per possible byte value: bit 0 is tested by ISkanji(),
    // bit 1 by ISkana().
    static const unsigned char kanji_map_sjis[];

    // Returns 1 when `code` is a byte value whose table entry has bit 0 set
    // (0x81-0x9f and 0xe0-0xfc); returns 0 otherwise, including for any
    // value of 0x100 or above.
    static int ISkanji(int code)
    {
        return code < 0x100 ? (kanji_map_sjis[code & 0xff] & 1) : 0;
    }

    // Returns 2 when `code` is a byte value whose table entry has bit 1 set
    // (0xa1-0xdf); returns 0 otherwise, including for any value of 0x100 or above.
    static int ISkana(int code)
    {
        return code < 0x100 ? (kanji_map_sjis[code & 0xff] & 2) : 0;
    }
};

const int KanjiCode::ESC = 0x1b;
const int KanjiCode::_SS2_ = 0x8e;

const unsigned char KanjiCode::kanji_map_sjis[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
/*
* EUC-JP is
* [0xa1 - 0xfe][0xa1 - 0xfe]
* 0x8e[0xa1 - 0xfe](SS2)
* 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
*
* Shift_Jis is
* [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
*
* Shift_Jis Hankaku Kana is
* [0xa1 - 0xdf]
*/
/*
* KanjiCode::judge() is based on judge_jcode() from jvim
* http://hp.vector.co.jp/authors/VA003457/vim/
*
* Special Thanks to Kenichi Tsuchida
*/
/*
 * Guesses the encoding of the `size` bytes starting at `str`.
 *
 * The bytes are scanned once. Some byte patterns are treated as decisive and
 * end the scan immediately (for example the escape sequences ESC $ B,
 * ESC ( B, ESC $ @ and ESC ( J yield JIS). Other patterns only add to the
 * `sjis` / `euc` score counters or tentatively set `code`.
 *
 * Returns the decided encoding. If the scan finishes with `code` still
 * ASCII, the larger of the two scores picks SJIS or EUC; on a tie ASCII is
 * returned.
 *
 * Every look-ahead (`ptr[i + 1]`, `ptr[i + 2]`) is guarded so that no byte
 * at or beyond `ptr[size]` is read.
 */
enum KanjiCode::Type KanjiCode::judge(const char *str, int size)
{
    enum Type code;
    int i;
    /* Kana Moji: set after a byte of 0x80 or above that none of the earlier
       branches handled; the following byte is then examined together with it. */
    bool bfr = false;
    int bfk = 0;        /* EUC Kana: count of consecutive 0x8e + [0xa1-0xdf] pairs */
    int sjis = 0;       /* evidence score for Shift_JIS */
    int euc = 0;        /* evidence score for EUC-JP */

    const unsigned char *ptr = (const unsigned char *) str;

    code = ASCII;

    i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
                    || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                code = JIS;
                goto breakBreak;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                    || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                code = JIS;
                goto breakBreak;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                /* Tentatively JIS; keep scanning after the escape sequence. */
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* A control character: check whether the two bytes before it
                   look like kudokuten (punctuation) or hiragana. */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* check hiragana or katakana */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                        && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }
                if (bfr) {
                    /* The previous byte was an undecided high byte; judge the
                       pair (ptr[i - 1], ptr[i]). */
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if (ptr[i] <= 0x7f) {
                        code = SJIS;
                        goto breakBreak;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        code = SJIS;
                        goto breakBreak;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    /* Only look at the trail byte when one exists; the guard
                       must be "> 1" because i < size always holds here. */
                    if ((size - i > 1)
                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        goto breakBreak;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    /* Same look-ahead guard as above. */
                    if ((size - i > 1)
                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        goto breakBreak;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }
    /* Nothing set `code` during the scan: fall back to the score counters. */
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
breakBreak:
    return (code);
}
// Constructs a decoder in its initial state:
//  - m_encoding is Latin1Encoding and m_decoder is a StreamingTextDecoder for it,
//  - enc is initialized from 0,
//  - m_type is DefaultEncoding,
//  - body is false, and
//  - beginning is true (decode() checks for a byte-order mark while it is set).
// NOTE(review): m_decoder is built from m_encoding, so this relies on
// m_encoding being declared before m_decoder in the class -- confirm in Decoder.h.
Decoder::Decoder()
: m_encoding(Latin1Encoding)
, m_decoder(new StreamingTextDecoder(m_encoding))
, enc(0)
, m_type(DefaultEncoding)
, body(false)
, beginning(true)
{
}
// Destructor. The body is empty: no explicit cleanup is performed here.
// NOTE(review): presumably m_decoder releases the StreamingTextDecoder it was
// given -- confirm against the member's type in Decoder.h.
Decoder::~Decoder()
{
}
void Decoder::setEncodingName(const char* _encoding, EncodingSource type)
{
enc = _encoding;
enc = enc.lower();
if (enc.isEmpty())
return;
TextEncoding encoding = TextEncoding(enc, type == EncodingFromMetaTag || type == EncodingFromXMLHeader);
// in case the encoding didn't exist, we keep the old one (fixes some sites specifying invalid encodings)
if (encoding.isValid()) {
enc = encoding.name();
m_encoding = encoding;
m_type = type;
m_decoder.set(new StreamingTextDecoder(m_encoding));
}
}
// Returns the contents of `enc`, the encoding name this decoder last recorded.
// NOTE(review): the returned pointer presumably refers to storage owned by
// `enc`, so it should not be kept across calls that change the encoding --
// confirm against the DeprecatedCString conversion operator.
const char* Decoder::encodingName() const
{
return enc;
}
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry `ptr` points just past the opening "<!--" and `pEnd` is the end of
// the input. On return `ptr` points just past the comment terminator, or
// equals `pEnd` if the comment is not terminated within the input.
// No byte at or beyond `pEnd` is read, and `ptr` is never moved past `pEnd`.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;
    // Allow <!-->; other browsers do.
    if (p < pEnd && *p == '>') {
        p++;
    } else {
        while (p < pEnd) {
            if (*p == '-') {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            p++;
        }
    }
    ptr = p;
}
// Finds the value of the encoding="..." (or encoding='...') attribute in `str`.
//
// Returns the index in `str` of the first character of the encoding name and
// stores the name's length in `encodingLength`. Returns -1, leaving
// `encodingLength` untouched, if "encoding" is not present, is not followed
// by '=' and a quoted value, or if the closing quotation mark is missing.
// Every index is checked against the string length before it is read.
static int findXMLEncoding(const DeprecatedCString &str, int &encodingLength)
{
    int len = str.length();

    int pos = str.find("encoding");
    if (pos == -1)
        return -1;
    pos += 8;

    // Skip spaces and stray control characters.
    while (pos != len && str[pos] <= ' ')
        ++pos;

    // Skip equals sign.
    if (pos == len || str[pos] != '=')
        return -1;
    ++pos;

    // Skip spaces and stray control characters.
    while (pos != len && str[pos] <= ' ')
        ++pos;

    // Skip quotation mark.
    if (pos == len)
        return -1;
    char quoteMark = str[pos];
    if (quoteMark != '"' && quoteMark != '\'')
        return -1;
    ++pos;

    // Find the trailing quotation mark, stopping at the end of the string
    // so an unterminated value is rejected instead of scanning past it.
    int end = pos;
    while (end != len && str[end] != quoteMark)
        ++end;
    if (end == len)
        return -1;

    encodingLength = end - pos;
    return pos;
}
// Decodes the next `len` bytes at `data` and returns the resulting text.
//
// Steps, in order:
//  1. While `beginning` is set and at least three bytes are available
//     (buffered plus new), check for a UTF-16 or UTF-8 byte-order mark and
//     switch to the implied encoding (skipped when m_type is
//     UserChosenEncoding).
//  2. While m_type is still DefaultEncoding and no body tag has been seen,
//     append the data to `buffer` and scan the whole buffer for an XML
//     header encoding, a BOM-less UTF-16 "<?xml", or a <meta ... charset=...>.
//     If the scan ends without a decision, DeprecatedString::null is
//     returned and the data stays buffered for the next call.
//  3. If the current encoding is Japanese and was neither user-chosen nor
//     auto-detected, run KanjiCode::judge() on `data` to refine it.
//  4. Decode `buffer` (plus the new data) if anything is buffered, otherwise
//     decode `data` directly.
DeprecatedString Decoder::decode(const char *data, int len)
{
    // Check for UTF-16 or UTF-8 BOM at the beginning, which is a sure sign of a Unicode encoding.
    int bufferLength = buffer.length();
    const int maximumBOMLength = 3;
    if (beginning && bufferLength + len >= maximumBOMLength) {
        if (m_type != UserChosenEncoding) {
            // Extract the first three bytes.
            // Handle the case where some of bytes are already in the buffer.
            // The last byte is always guaranteed to not be in the buffer.
            const unsigned char *udata = (const unsigned char *)data;
            unsigned char c1 = bufferLength >= 1 ? buffer[0].unicode() : *udata++;
            unsigned char c2 = bufferLength >= 2 ? buffer[1].unicode() : *udata++;
            ASSERT(bufferLength < 3);
            unsigned char c3 = *udata;

            // Check for the BOM.
            const char *autoDetectedEncoding;
            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
                autoDetectedEncoding = "ISO-10646-UCS-2";
            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
                autoDetectedEncoding = "UTF-8";
            } else {
                autoDetectedEncoding = 0;
            }

            // If we found a BOM, use the encoding it implies.
            if (autoDetectedEncoding != 0) {
                m_type = AutoDetectedEncoding;
                m_encoding = TextEncoding(autoDetectedEncoding);
                ASSERT(m_encoding.isValid());
                enc = m_encoding.name();
                m_decoder.set(new StreamingTextDecoder(m_encoding));
            }
        }
        beginning = false;
    }

    // this is not completely efficient, since the function might go
    // through the html head several times...

    bool lookForMetaTag = m_type == DefaultEncoding && !body;

    if (lookForMetaTag) {
#ifdef DECODE_DEBUG
        kdDebug(6005) << "looking for charset definition" << endl;
#endif
        { // extra level of braces to keep indenting matching original for better diff'ing
            buffer.append(data, len);
            // we still don't have an encoding, and are in the head
            // the following tags are allowed in <head>:
            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE

            // We stop scanning when a tag that is not permitted in <head>
            // is seen, rather when </head> is seen, because that more closely
            // matches behavior in other browsers; more details in
            // <http://bugzilla.opendarwin.org/show_bug.cgi?id=3590>.

            // Additionally, we ignore things that looks like tags in <title>; see
            // <http://bugzilla.opendarwin.org/show_bug.cgi?id=4560>.

            bool withinTitle = false;

            const char *ptr = buffer.latin1();
            const char *pEnd = ptr + buffer.length();
            while (ptr != pEnd) {
                if (*ptr == '<') {
                    bool isEndTag = false;
                    ptr++;

                    // Handle comments.
                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                        ptr += 3;
                        skipComment(ptr, pEnd);
                        continue;
                    }

                    // Handle XML header, which can have encoding in it.
                    if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
                        const char *headerEnd = ptr;
                        while (*headerEnd != '>' && *headerEnd != '\0')
                            headerEnd++;
                        // The header is not complete yet: wait for more data.
                        if (*headerEnd == '\0')
                            break;
                        DeprecatedCString str(ptr, headerEnd - ptr);
                        int encodingLength;
                        int pos = findXMLEncoding(str, encodingLength);
                        if (pos != -1)
                            setEncodingName(str.mid(pos, encodingLength), EncodingFromXMLHeader);
                        // No usable encoding in the header: fall back to UTF-8.
                        if (m_type != EncodingFromXMLHeader)
                            setEncodingName("UTF-8", EncodingFromXMLHeader);
                        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
                    } else if (pEnd - ptr >= 8
                               && ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x'
                               && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
                        // UTF-16 without BOM. The eight bytes are only inspected when
                        // all of them lie inside the buffer. The parity of the offset
                        // just past '<' tells the byte order.
                        setEncodingName(((ptr - buffer.latin1()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding);
                        goto found;
                    }

                    if (*ptr == '/') {
                        ptr++;
                        isEndTag = true;
                    }

                    // Collect the lower-cased tag name (at most 19 characters).
                    char tmp[20];
                    int tagLength = 0;
                    while ((((*ptr >= 'a') && (*ptr <= 'z'))
                            || ((*ptr >= 'A') && (*ptr <= 'Z'))
                            || ((*ptr >= '0') && (*ptr <= '9')))
                           && tagLength < 19) {
                        tmp[tagLength] = tolower(*ptr);
                        ptr++;
                        tagLength++;
                    }
                    tmp[tagLength] = 0;
                    AtomicString tag(tmp);

                    if (tag == titleTag)
                        withinTitle = !isEndTag;

                    if (!isEndTag && tag == metaTag) {
                        const char *tagEnd = ptr;
                        while (*tagEnd != '>' && *tagEnd != '\0')
                            tagEnd++;
                        // The tag is not complete yet: wait for more data.
                        if (*tagEnd == '\0')
                            break;
                        DeprecatedCString str(ptr, (tagEnd - ptr) + 1);
                        str = str.lower();
                        int pos = 0;
                        while (pos < (int)str.length()) {
                            if ((pos = str.find("charset", pos, false)) == -1)
                                break;
                            pos += 7;
                            // skip whitespace..
                            while (pos < (int)str.length() && str[pos] <= ' ')
                                pos++;
                            if (pos == (int)str.length())
                                break;
                            if (str[pos++] != '=')
                                continue;
                            // Skip whitespace, extra '=' and opening quotes. The length
                            // check must guard every comparison, hence the parentheses
                            // around the whole "||" chain.
                            while (pos < (int)str.length()
                                   && (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
                                pos++;

                            // end ?
                            if (pos == (int)str.length())
                                break;
                            unsigned endpos = pos;
                            while (endpos < str.length()
                                   && (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
                                       && str[endpos] != ';' && str[endpos] != '>'))
                                endpos++;
                            setEncodingName(str.mid(pos, endpos-pos), EncodingFromMetaTag);
                            // m_type only changes when the name was accepted.
                            if (m_type == EncodingFromMetaTag)
                                goto found;

                            if (endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>')
                                break;

                            pos = endpos + 1;
                        }
                    } else if (tag != scriptTag && tag != noscriptTag && tag != styleTag &&
                               tag != linkTag && tag != metaTag && tag != objectTag &&
                               tag != titleTag && tag != baseTag &&
                               (isEndTag || tag != htmlTag) && !withinTitle &&
                               (tag != headTag) && isalpha(tmp[0])) {
                        // A tag that is not allowed in <head>: stop looking for a charset.
                        body = true;
                        goto found;
                    }
                } else
                    ptr++;
            }
            // Still undecided: keep the data buffered and produce no output yet.
            return DeprecatedString::null;
        }
    }

found:
    // Do the auto-detect if our default encoding is one of the Japanese ones.
    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_encoding.isJapanese()) {
        const char *autoDetectedEncoding;
        switch (KanjiCode::judge(data, len)) {
        case KanjiCode::JIS:
            autoDetectedEncoding = "jis7";
            break;
        case KanjiCode::EUC:
            autoDetectedEncoding = "eucjp";
            break;
        case KanjiCode::SJIS:
            autoDetectedEncoding = "sjis";
            break;
        default:
            autoDetectedEncoding = NULL;
            break;
        }
        if (autoDetectedEncoding != 0) {
            setEncodingName(autoDetectedEncoding, AutoDetectedEncoding);
        }
    }

    // if we still haven't found an encoding, assume latin1
    if (!m_encoding.isValid()) {
        if (enc.isEmpty())
            enc = "iso8859-1";
        m_encoding = TextEncoding(enc);
        // be sure not to crash
        if (!m_encoding.isValid()) {
            enc = "iso8859-1";
            m_encoding = TextEncoding(Latin1Encoding);
        }
        m_decoder.set(new StreamingTextDecoder(m_encoding));
    }

    DeprecatedString out;

    if (!buffer.isEmpty()) {
        // When the charset scan ran, `data` is already part of `buffer`.
        if (!lookForMetaTag)
            buffer.append(data, len);
        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());
        buffer.truncate(0);
    } else {
        out = m_decoder->toUnicode(data, len);
    }

    return out;
}
// Runs whatever is currently held in `buffer` through the streaming decoder,
// passing `true` as toUnicode()'s third argument, and returns the result.
// `buffer` itself is left unchanged.
DeprecatedString Decoder::flush() const
{
    const char* pendingBytes = buffer.latin1();
    return m_decoder->toUnicode(pendingBytes, buffer.length(), true);
}
// -----------------------------------------------------------------------------