| /* |
| This file is part of the KDE libraries |
| |
| Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) |
| Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. |
| |
| This library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Library General Public |
| License as published by the Free Software Foundation; either |
| version 2 of the License, or (at your option) any later version. |
| |
| This library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Library General Public License for more details. |
| |
| You should have received a copy of the GNU Library General Public License |
| along with this library; see the file COPYING.LIB. If not, write to |
| the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| Boston, MA 02111-1307, USA. |
| */ |
| |
| |
| #include "config.h" |
| #include "Decoder.h" |
| |
| #include "HTMLNames.h" |
| #include "StreamingTextDecoder.h" |
| #include "RegularExpression.h" |
| |
| using namespace WebCore; |
| using namespace HTMLNames; |
| |
| class KanjiCode |
| { |
| public: |
| enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 }; |
| static enum Type judge(const char *str, int length); |
| static const int ESC; |
| static const int _SS2_; |
| static const unsigned char kanji_map_sjis[]; |
| static int ISkanji(int code) |
| { |
| if (code >= 0x100) |
| return 0; |
| return (kanji_map_sjis[code & 0xff] & 1); |
| } |
| |
| static int ISkana(int code) |
| { |
| if (code >= 0x100) |
| return 0; |
| return (kanji_map_sjis[code & 0xff] & 2); |
| } |
| |
| }; |
| |
| const int KanjiCode::ESC = 0x1b; |
| const int KanjiCode::_SS2_ = 0x8e; |
| |
| const unsigned char KanjiCode::kanji_map_sjis[] = |
| { |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 |
| }; |
| |
| /* |
| * EUC-JP is |
| * [0xa1 - 0xfe][0xa1 - 0xfe] |
| * 0x8e[0xa1 - 0xfe](SS2) |
| * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3) |
| * |
| * Shift_Jis is |
| * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc] |
| * |
| * Shift_Jis Hankaku Kana is |
| * [0xa1 - 0xdf] |
| */ |
| |
| /* |
| * KanjiCode::judge() is based on judge_jcode() from jvim |
| * http://hp.vector.co.jp/authors/VA003457/vim/ |
| * |
| * Special Thanks to Kenichi Tsuchida |
| */ |
| |
| enum KanjiCode::Type KanjiCode::judge(const char *str, int size) |
| { |
| enum Type code; |
| int i; |
| int bfr = false; /* Kana Moji */ |
| int bfk = 0; /* EUC Kana */ |
| int sjis = 0; |
| int euc = 0; |
| |
| const unsigned char *ptr = (const unsigned char *) str; |
| |
| code = ASCII; |
| |
| i = 0; |
| while (i < size) { |
| if (ptr[i] == ESC && (size - i >= 3)) { |
| if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B') |
| || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) { |
| code = JIS; |
| goto breakBreak; |
| } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@') |
| || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) { |
| code = JIS; |
| goto breakBreak; |
| } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') { |
| code = JIS; |
| i += 3; |
| } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') { |
| code = JIS; |
| i += 3; |
| } else { |
| i++; |
| } |
| bfr = false; |
| bfk = 0; |
| } else { |
| if (ptr[i] < 0x20) { |
| bfr = false; |
| bfk = 0; |
| /* ?? check kudokuten ?? && ?? hiragana ?? */ |
| if ((i >= 2) && (ptr[i - 2] == 0x81) |
| && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { |
| code = SJIS; |
| sjis += 100; /* kudokuten */ |
| } else if ((i >= 2) && (ptr[i - 2] == 0xa1) |
| && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) { |
| code = EUC; |
| euc += 100; /* kudokuten */ |
| } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { |
| sjis += 40; /* hiragana */ |
| } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { |
| euc += 40; /* hiragana */ |
| } |
| } else { |
| /* ?? check hiragana or katana ?? */ |
| if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { |
| sjis++; /* hiragana */ |
| } else if ((size - i > 1) && (ptr[i] == 0x83) |
| && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) { |
| sjis++; /* katakana */ |
| } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { |
| euc++; /* hiragana */ |
| } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { |
| euc++; /* katakana */ |
| } |
| if (bfr) { |
| if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) { |
| code = SJIS; |
| goto breakBreak; |
| } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { |
| code = SJIS; |
| goto breakBreak; |
| } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { |
| code = EUC; |
| goto breakBreak; |
| } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { |
| code = EUC; |
| goto breakBreak; |
| } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) { |
| code = SJIS; |
| goto breakBreak; |
| } else if (ptr[i] <= 0x7f) { |
| code = SJIS; |
| goto breakBreak; |
| } else { |
| if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) { |
| euc++; /* sjis hankaku kana kigo */ |
| } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) { |
| ; /* sjis hankaku kana */ |
| } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) { |
| euc++; |
| } else if (0x8e == ptr[i]) { |
| euc++; |
| } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { |
| sjis++; |
| } |
| bfr = false; |
| bfk = 0; |
| } |
| } else if (0x8e == ptr[i]) { |
| if (size - i <= 1) { |
| ; |
| } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { |
| /* EUC KANA or SJIS KANJI */ |
| if (bfk == 1) { |
| euc += 100; |
| } |
| bfk++; |
| i++; |
| } else { |
| /* SJIS only */ |
| code = SJIS; |
| goto breakBreak; |
| } |
| } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { |
| /* SJIS only */ |
| code = SJIS; |
| if ((size - i >= 1) |
| && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) |
| || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) { |
| goto breakBreak; |
| } |
| } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { |
| /* EUC only */ |
| code = EUC; |
| if ((size - i >= 1) |
| && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) { |
| goto breakBreak; |
| } |
| } else if (ptr[i] <= 0x7f) { |
| ; |
| } else { |
| bfr = true; |
| bfk = 0; |
| } |
| } |
| i++; |
| } |
| } |
| if (code == ASCII) { |
| if (sjis > euc) { |
| code = SJIS; |
| } else if (sjis < euc) { |
| code = EUC; |
| } |
| } |
| breakBreak: |
| return (code); |
| } |
| |
| Decoder::Decoder() |
| : m_encoding(Latin1Encoding) |
| , m_decoder(new StreamingTextDecoder(m_encoding)) |
| , enc(0) |
| , m_type(DefaultEncoding) |
| , body(false) |
| , beginning(true) |
| { |
| } |
| |
| Decoder::~Decoder() |
| { |
| } |
| |
| void Decoder::setEncodingName(const char* _encoding, EncodingSource type) |
| { |
| enc = _encoding; |
| enc = enc.lower(); |
| |
| if (enc.isEmpty()) |
| return; |
| |
| TextEncoding encoding = TextEncoding(enc, type == EncodingFromMetaTag || type == EncodingFromXMLHeader); |
| |
| // in case the encoding didn't exist, we keep the old one (fixes some sites specifying invalid encodings) |
| if (encoding.isValid()) { |
| enc = encoding.name(); |
| m_encoding = encoding; |
| m_type = type; |
| m_decoder.set(new StreamingTextDecoder(m_encoding)); |
| } |
| } |
| |
| const char* Decoder::encodingName() const |
| { |
| return enc; |
| } |
| |
| // Other browsers allow comments in the head section, so we need to also. |
| // It's important not to look for tags inside the comments. |
| static void skipComment(const char *&ptr, const char *pEnd) |
| { |
| const char *p = ptr; |
| // Allow <!-->; other browsers do. |
| if (*p == '>') { |
| p++; |
| } else { |
| while (p != pEnd) { |
| if (*p == '-') { |
| // This is the real end of comment, "-->". |
| if (p[1] == '-' && p[2] == '>') { |
| p += 3; |
| break; |
| } |
| // This is the incorrect end of comment that other browsers allow, "--!>". |
| if (p[1] == '-' && p[2] == '!' && p[3] == '>') { |
| p += 4; |
| break; |
| } |
| } |
| p++; |
| } |
| } |
| ptr = p; |
| } |
| |
| // Returns the position of the encoding string. |
| static int findXMLEncoding(const DeprecatedCString &str, int &encodingLength) |
| { |
| int len = str.length(); |
| |
| int pos = str.find("encoding"); |
| if (pos == -1) |
| return -1; |
| pos += 8; |
| |
| // Skip spaces and stray control characters. |
| while (str[pos] <= ' ' && pos != len) |
| ++pos; |
| |
| // Skip equals sign. |
| if (str[pos] != '=') |
| return -1; |
| ++pos; |
| |
| // Skip spaces and stray control characters. |
| while (str[pos] <= ' ' && pos != len) |
| ++pos; |
| |
| // Skip quotation mark. |
| char quoteMark = str[pos]; |
| if (quoteMark != '"' && quoteMark != '\'') |
| return -1; |
| ++pos; |
| |
| // Find the trailing quotation mark. |
| int end = pos; |
| while (str[end] != quoteMark) |
| ++end; |
| |
| if (end == len) |
| return -1; |
| |
| encodingLength = end - pos; |
| return pos; |
| } |
| |
| DeprecatedString Decoder::decode(const char *data, int len) |
| { |
| // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. |
| int bufferLength = buffer.length(); |
| const int maximumBOMLength = 3; |
| if (beginning && bufferLength + len >= maximumBOMLength) { |
| if (m_type != UserChosenEncoding) { |
| // Extract the first three bytes. |
| // Handle the case where some of bytes are already in the buffer. |
| // The last byte is always guaranteed to not be in the buffer. |
| const unsigned char *udata = (const unsigned char *)data; |
| unsigned char c1 = bufferLength >= 1 ? buffer[0].unicode() : *udata++; |
| unsigned char c2 = bufferLength >= 2 ? buffer[1].unicode() : *udata++; |
| ASSERT(bufferLength < 3); |
| unsigned char c3 = *udata; |
| |
| // Check for the BOM. |
| const char *autoDetectedEncoding; |
| if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { |
| autoDetectedEncoding = "ISO-10646-UCS-2"; |
| } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { |
| autoDetectedEncoding = "UTF-8"; |
| } else { |
| autoDetectedEncoding = 0; |
| } |
| |
| // If we found a BOM, use the encoding it implies. |
| if (autoDetectedEncoding != 0) { |
| m_type = AutoDetectedEncoding; |
| m_encoding = TextEncoding(autoDetectedEncoding); |
| ASSERT(m_encoding.isValid()); |
| enc = m_encoding.name(); |
| m_decoder.set(new StreamingTextDecoder(m_encoding)); |
| } |
| } |
| beginning = false; |
| } |
| |
| // this is not completely efficient, since the function might go |
| // through the html head several times... |
| |
| bool lookForMetaTag = m_type == DefaultEncoding && !body; |
| |
| if (lookForMetaTag) { |
| #ifdef DECODE_DEBUG |
| kdDebug(6005) << "looking for charset definition" << endl; |
| #endif |
| { // extra level of braces to keep indenting matching original for better diff'ing |
| buffer.append(data, len); |
| // we still don't have an encoding, and are in the head |
| // the following tags are allowed in <head>: |
| // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE |
| |
| // We stop scanning when a tag that is not permitted in <head> |
| // is seen, rather when </head> is seen, because that more closely |
| // matches behavior in other browsers; more details in |
| // <http://bugzilla.opendarwin.org/show_bug.cgi?id=3590>. |
| |
| // Additionally, we ignore things that looks like tags in <title>; see |
| // <http://bugzilla.opendarwin.org/show_bug.cgi?id=4560>. |
| |
| bool withinTitle = false; |
| |
| const char *ptr = buffer.latin1(); |
| const char *pEnd = ptr + buffer.length(); |
| while(ptr != pEnd) |
| { |
| if(*ptr == '<') { |
| bool end = false; |
| ptr++; |
| |
| // Handle comments. |
| if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { |
| ptr += 3; |
| skipComment(ptr, pEnd); |
| continue; |
| } |
| |
| // Handle XML header, which can have encoding in it. |
| if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { |
| const char *end = ptr; |
| while (*end != '>' && *end != '\0') end++; |
| if (*end == '\0') |
| break; |
| DeprecatedCString str(ptr, end - ptr); |
| int len; |
| int pos = findXMLEncoding(str, len); |
| if (pos != -1) |
| setEncodingName(str.mid(pos, len), EncodingFromXMLHeader); |
| if (m_type != EncodingFromXMLHeader) |
| setEncodingName("UTF-8", EncodingFromXMLHeader); |
| // continue looking for a charset - it may be specified in an HTTP-Equiv meta |
| } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') { |
| // UTF-16 without BOM |
| setEncodingName(((ptr - buffer.latin1()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding); |
| goto found; |
| } |
| |
| if(*ptr == '/') ptr++, end=true; |
| char tmp[20]; |
| int len = 0; |
| while ( |
| ((*ptr >= 'a') && (*ptr <= 'z') || |
| (*ptr >= 'A') && (*ptr <= 'Z') || |
| (*ptr >= '0') && (*ptr <= '9')) |
| && len < 19 ) |
| { |
| tmp[len] = tolower( *ptr ); |
| ptr++; |
| len++; |
| } |
| tmp[len] = 0; |
| AtomicString tag(tmp); |
| |
| if (tag == titleTag) |
| withinTitle = !end; |
| |
| if (!end && tag == metaTag) { |
| const char * end = ptr; |
| while(*end != '>' && *end != '\0') end++; |
| if ( *end == '\0' ) break; |
| DeprecatedCString str( ptr, (end-ptr)+1); |
| str = str.lower(); |
| int pos = 0; |
| while( pos < ( int ) str.length() ) { |
| if( (pos = str.find("charset", pos, false)) == -1) break; |
| pos += 7; |
| // skip whitespace.. |
| while( pos < (int)str.length() && str[pos] <= ' ' ) pos++; |
| if ( pos == ( int )str.length()) break; |
| if ( str[pos++] != '=' ) continue; |
| while ( pos < ( int )str.length() && |
| ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'') |
| pos++; |
| |
| // end ? |
| if ( pos == ( int )str.length() ) break; |
| unsigned endpos = pos; |
| while( endpos < str.length() && |
| (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' |
| && str[endpos] != ';' && str[endpos] != '>') ) |
| endpos++; |
| setEncodingName(str.mid(pos, endpos-pos), EncodingFromMetaTag); |
| if( m_type == EncodingFromMetaTag ) goto found; |
| |
| if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break; |
| |
| pos = endpos + 1; |
| } |
| } else if (tag != scriptTag && tag != noscriptTag && tag != styleTag && |
| tag != linkTag && tag != metaTag && tag != objectTag && |
| tag != titleTag && tag != baseTag && |
| (end || tag != htmlTag) && !withinTitle && |
| (tag != headTag) && isalpha(tmp[0])) { |
| body = true; |
| goto found; |
| } |
| } |
| else |
| ptr++; |
| } |
| return DeprecatedString::null; |
| } |
| } |
| |
| found: |
| // Do the auto-detect if our default encoding is one of the Japanese ones. |
| if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_encoding.isJapanese()) |
| { |
| const char *autoDetectedEncoding; |
| switch (KanjiCode::judge(data, len)) { |
| case KanjiCode::JIS: |
| autoDetectedEncoding = "jis7"; |
| break; |
| case KanjiCode::EUC: |
| autoDetectedEncoding = "eucjp"; |
| break; |
| case KanjiCode::SJIS: |
| autoDetectedEncoding = "sjis"; |
| break; |
| default: |
| autoDetectedEncoding = NULL; |
| break; |
| } |
| if (autoDetectedEncoding != 0) { |
| setEncodingName(autoDetectedEncoding, AutoDetectedEncoding); |
| } |
| } |
| |
| // if we still haven't found an encoding, assume latin1 |
| if (!m_encoding.isValid()) |
| { |
| if (enc.isEmpty()) |
| enc = "iso8859-1"; |
| m_encoding = TextEncoding(enc); |
| // be sure not to crash |
| if (!m_encoding.isValid()) { |
| enc = "iso8859-1"; |
| m_encoding = TextEncoding(Latin1Encoding); |
| } |
| m_decoder.set(new StreamingTextDecoder(m_encoding)); |
| } |
| DeprecatedString out; |
| |
| if (!buffer.isEmpty()) { |
| if (!lookForMetaTag) |
| buffer.append(data, len); |
| out = m_decoder->toUnicode(buffer.latin1(), buffer.length()); |
| buffer.truncate(0); |
| } else { |
| out = m_decoder->toUnicode(data, len); |
| } |
| |
| return out; |
| } |
| |
| DeprecatedString Decoder::flush() const |
| { |
| return m_decoder->toUnicode(buffer.latin1(), buffer.length(), true); |
| } |
| |
| // ----------------------------------------------------------------------------- |