Source/WebCore/platform/text/TextEncodingDetectorICU.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  *     * Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above
  * copyright notice, this list of conditions and the following disclaimer
  * in the documentation and/or other materials provided with the
  * distribution.
  *     * Neither the name of Google Inc. nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include "TextEncodingDetector.h"

 #include "TextEncoding.h"
 #include <unicode/ucnv.h>
 #include <unicode/ucsdet.h>

 namespace WebCore {

 bool detectTextEncoding(const char* data, size_t len,
                         const char* hintEncodingName,
                         TextEncoding* detectedEncoding)
 {
     *detectedEncoding = TextEncoding();
     int matchesCount = 0;
     UErrorCode status = U_ZERO_ERROR;
     UCharsetDetector* detector = ucsdet_open(&status);
     if (U_FAILURE(status))
         return false;
     ucsdet_enableInputFilter(detector, true);
     ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
     if (U_FAILURE(status))
         return false;

     // FIXME: A few things we can do other than improving
     // the ICU detector itself.
     // 1. Use ucsdet_detectAll and pick the most likely one given
     // "the context" (parent-encoding, referrer encoding, etc).
     // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
     // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
     // encoding with a highest confidence among the detector-specific
     // limited set of candidate encodings.
     // Below is a partial implementation of the first part of what's outlined
     // above.
     const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
     if (U_FAILURE(status)) {
         ucsdet_close(detector);
         return false;
     }

     const char* encoding = 0;
     if (hintEncodingName) {
         TextEncoding hintEncoding(hintEncodingName);
         // 10 is the minimum confidence value consistent with the codepoint
         // allocation in a given encoding. The size of a chunk passed to
         // us varies even for the same html file (apparently depending on
         // the network load). When we're given a rather short chunk, we
         // don't have a sufficiently reliable signal other than the fact that
         // the chunk is consistent with a set of encodings. So, instead of
         // setting an arbitrary threshold, we have to scan all the encodings
         // consistent with the data.
         const int32_t kThreshold = 10;
         for (int i = 0; i < matchesCount; ++i) {
             int32_t confidence = ucsdet_getConfidence(matches[i], &status);
             if (U_FAILURE(status)) {
                 status = U_ZERO_ERROR;
                 continue;
             }
             if (confidence < kThreshold)
                 break;
             const char* matchEncoding = ucsdet_getName(matches[i], &status);
             if (U_FAILURE(status)) {
                 status = U_ZERO_ERROR;
                 continue;
             }
             if (TextEncoding(matchEncoding) == hintEncoding) {
                 encoding = hintEncodingName;
                 break;
             }
         }
     }
     // If no match is found so far, just pick the top match.
     // This can happen, say, when a parent frame in EUC-JP refers to
     // a child frame in Shift_JIS and both frames do NOT specify the encoding
     // making us resort to auto-detection (when it IS turned on).
     if (!encoding && matchesCount > 0)
         encoding = ucsdet_getName(matches[0], &status);
     if (U_SUCCESS(status)) {
         *detectedEncoding = TextEncoding(encoding);
         ucsdet_close(detector);
         return true;
     }
     ucsdet_close(detector);
     return false;
 }

 }
	/*
	* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are
	* met:
	*
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above
	* copyright notice, this list of conditions and the following disclaimer
	* in the documentation and/or other materials provided with the
	* distribution.
	* * Neither the name of Google Inc. nor the names of its
	* contributors may be used to endorse or promote products derived from
	* this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "config.h"
	#include "TextEncodingDetector.h"

	#include "TextEncoding.h"
	#include <unicode/ucnv.h>
	#include <unicode/ucsdet.h>

	namespace WebCore {

	bool detectTextEncoding(const char* data, size_t len,
	const char* hintEncodingName,
	TextEncoding* detectedEncoding)
	{
	*detectedEncoding = TextEncoding();
	int matchesCount = 0;
	UErrorCode status = U_ZERO_ERROR;
	UCharsetDetector* detector = ucsdet_open(&status);
	if (U_FAILURE(status))
	return false;
	ucsdet_enableInputFilter(detector, true);
	ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
	if (U_FAILURE(status))
	return false;

	// FIXME: A few things we can do other than improving
	// the ICU detector itself.
	// 1. Use ucsdet_detectAll and pick the most likely one given
	// "the context" (parent-encoding, referrer encoding, etc).
	// 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
	// Chinese, Japanese, Russian, Korean and Hebrew) by picking the
	// encoding with a highest confidence among the detector-specific
	// limited set of candidate encodings.
	// Below is a partial implementation of the first part of what's outlined
	// above.
	const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
	if (U_FAILURE(status)) {
	ucsdet_close(detector);
	return false;
	}

	const char* encoding = 0;
	if (hintEncodingName) {
	TextEncoding hintEncoding(hintEncodingName);
	// 10 is the minimum confidence value consistent with the codepoint
	// allocation in a given encoding. The size of a chunk passed to
	// us varies even for the same html file (apparently depending on
	// the network load). When we're given a rather short chunk, we
	// don't have a sufficiently reliable signal other than the fact that
	// the chunk is consistent with a set of encodings. So, instead of
	// setting an arbitrary threshold, we have to scan all the encodings
	// consistent with the data.
	const int32_t kThreshold = 10;
	for (int i = 0; i < matchesCount; ++i) {
	int32_t confidence = ucsdet_getConfidence(matches[i], &status);
	if (U_FAILURE(status)) {
	status = U_ZERO_ERROR;
	continue;
	}
	if (confidence < kThreshold)
	break;
	const char* matchEncoding = ucsdet_getName(matches[i], &status);
	if (U_FAILURE(status)) {
	status = U_ZERO_ERROR;
	continue;
	}
	if (TextEncoding(matchEncoding) == hintEncoding) {
	encoding = hintEncodingName;
	break;
	}
	}
	}
	// If no match is found so far, just pick the top match.
	// This can happen, say, when a parent frame in EUC-JP refers to
	// a child frame in Shift_JIS and both frames do NOT specify the encoding
	// making us resort to auto-detection (when it IS turned on).
	if (!encoding && matchesCount > 0)
	encoding = ucsdet_getName(matches[0], &status);
	if (U_SUCCESS(status)) {
	*detectedEncoding = TextEncoding(encoding);
	ucsdet_close(detector);
	return true;
	}
	ucsdet_close(detector);
	return false;
	}

	}