blob: 4090b6689c16ee90bb0f129d0c48428546869770 [file] [log] [blame]
/*
* Copyright (C) 2004, 2006, 2014 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#import "config.h"
#import "TextBoundaries.h"
#import <CoreFoundation/CFStringTokenizer.h>
#import <Foundation/Foundation.h>
#import <unicode/ubrk.h>
#import <unicode/uchar.h>
#import <unicode/ustring.h>
#import <unicode/utypes.h>
#import <wtf/RetainPtr.h>
#import <wtf/text/StringView.h>
#import <wtf/text/TextBreakIterator.h>
#import <wtf/text/TextBreakIteratorInternalICU.h>
#import <wtf/unicode/CharacterNames.h>
namespace WebCore {
#if !USE(APPKIT)
// Returns true for characters that findNextWordFromIndex() steps over: no-break space,
// newline, common sentence punctuation, and anything ICU classifies as a space.
static bool isSkipCharacter(UChar32 c)
{
    switch (c) {
    case 0xA0:
    case '\n':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
        return true;
    default:
        return u_isspace(c);
    }
}
// Returns true for no-break space, newline, and anything ICU classifies as a space.
static bool isWhitespaceCharacter(UChar32 c)
{
    if (c == 0xA0 || c == '\n')
        return true;
    return u_isspace(c);
}
// Returns true when |c| ends a word in the simple word-boundary algorithm: any character
// outside CoreFoundation's predefined alphanumeric set, except the ampersand.
static bool isWordDelimitingCharacter(UChar32 c)
{
    // Ampersand is an exception added to treat AT&T as a single word (see <rdar://problem/5022264>).
    if (c == '&')
        return false;

    CFCharacterSetRef alphaNumerics = CFCharacterSetGetPredefined(kCFCharacterSetAlphaNumeric);
    return !CFCharacterSetIsLongCharacterMember(alphaNumerics, c);
}
// Returns true when |c| belongs to CoreFoundation's predefined symbol character set.
static bool isSymbolCharacter(UChar32 c)
{
    CFCharacterSetRef symbols = CFCharacterSetGetPredefined(kCFCharacterSetSymbol);
    return CFCharacterSetIsLongCharacterMember(symbols, c);
}
// These are characters that can behave as word boundaries, but can appear within words:
// the ASCII apostrophe, the right single quotation mark, and Hebrew gershayim.
static bool isAmbiguousBoundaryCharacter(UChar32 character)
{
    if (character == '\'')
        return true;
    if (character == rightSingleQuotationMark)
        return true;
    return character == hebrewPunctuationGershayim;
}
// Returns a word-boundary tokenizer positioned over the whole of |str|, or nullptr when the
// locale (or the tokenizer itself) could not be created. The locale and the tokenizer are
// function-local statics: they are created on first successful use and then reused, with the
// tokenizer simply re-pointed at each new string.
static CFStringTokenizerRef tokenizerForString(CFStringRef str)
{
    static CFLocaleRef sharedLocale = nullptr;
    static CFStringTokenizerRef sharedTokenizer = nullptr;

    if (!sharedLocale) {
        const char* localeName = currentTextBreakLocaleID();
        const UInt8* localeBytes = reinterpret_cast<const UInt8*>(localeName);
        // The identifier string borrows |localeName|'s bytes (kCFAllocatorNull deallocator); it is only
        // needed for the duration of CFLocaleCreate().
        RetainPtr<CFStringRef> localeIdentifier = adoptCF(CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, localeBytes, strlen(localeName), kCFStringEncodingASCII, false, kCFAllocatorNull));
        sharedLocale = CFLocaleCreate(kCFAllocatorDefault, localeIdentifier.get());
    }
    // Locale creation failed; it will be attempted again on the next call.
    if (!sharedLocale)
        return nullptr;

    const CFRange fullRange = CFRangeMake(0, CFStringGetLength(str));
    if (sharedTokenizer) {
        CFStringTokenizerSetString(sharedTokenizer, str, fullRange);
        return sharedTokenizer;
    }

    sharedTokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, str, fullRange, kCFStringTokenizerUnitWordBoundary, sharedLocale);
    return sharedTokenizer;
}
// Simple case: A word is a stream of characters delimited by a special set of word-delimiting characters.
//
// |position| is a UTF-16 offset into |text|. It may equal text.length(): findWordBoundary() forwards
// end-of-text positions here unchanged, and both scans below cope with that value (the forward scan
// simply does not run). On return, [*start, *end) is the UTF-16 range of the word found.
static void findSimpleWordBoundary(StringView text, int position, int* start, int* end)
{
    ASSERT(position >= 0);
    // Use <= rather than <: an offset one past the last character is a legitimate input.
    ASSERT(static_cast<unsigned>(position) <= text.length());

    // Scan backwards, one code point at a time, until the character before startPos ends the word.
    unsigned startPos = position;
    while (startPos > 0) {
        int i = startPos;
        UChar32 characterBeforeStartPos;
        U16_PREV(text, 0, i, characterBeforeStartPos);
        if (isWordDelimitingCharacter(characterBeforeStartPos)) {
            ASSERT(i >= 0);
            // The delimiter is the first character of the text; nothing precedes it.
            if (!i)
                break;
            if (!isAmbiguousBoundaryCharacter(characterBeforeStartPos))
                break;
            // An ambiguous character (e.g. an apostrophe) only stays inside the word when the
            // character before it is not itself a delimiter.
            UChar32 characterBeforeBeforeStartPos;
            U16_PREV(text, 0, i, characterBeforeBeforeStartPos);
            if (isWordDelimitingCharacter(characterBeforeBeforeStartPos))
                break;
        }
        U16_BACK_1(text, 0, startPos);
    }

    // Scan forwards, one code point at a time, until the character at endPos ends the word.
    unsigned endPos = position;
    while (endPos < text.length()) {
        UChar32 character;
        U16_GET(text, 0, endPos, text.length(), character);
        if (isWordDelimitingCharacter(character)) {
            unsigned i = endPos;
            U16_FWD_1(text, i, text.length());
            ASSERT(i <= text.length());
            // The delimiter is the last character of the text; nothing follows it.
            if (i == text.length())
                break;
            UChar32 characterAfterEndPos;
            U16_NEXT(text, i, text.length(), characterAfterEndPos);
            if (!isAmbiguousBoundaryCharacter(character))
                break;
            // Mirror of the backward case: keep an ambiguous character only when a
            // non-delimiter follows it.
            if (isWordDelimitingCharacter(characterAfterEndPos))
                break;
        }
        U16_FWD_1(text, endPos, text.length());
    }

    // The text may consist of all delimiter characters (e.g. "++++++++" or a series of emoji), and returning an empty range
    // makes no sense (and doesn't match findComplexWordBoundary() behavior).
    if (startPos == endPos && endPos < text.length()) {
        UChar32 character;
        U16_GET(text, 0, endPos, text.length(), character);
        if (isSymbolCharacter(character))
            U16_FWD_1(text, endPos, text.length());
    }

    *start = startPos;
    *end = endPos;
}
// Complex case: use CFStringTokenizer to find word boundary.
// Writes the token range containing |position| to [*start, *end). Falls back to
// findSimpleWordBoundary() when no tokenizer is available, and to the whole text when the
// tokenizer reports no token at |position|.
static void findComplexWordBoundary(StringView text, int position, int* start, int* end)
{
    RetainPtr<CFStringRef> backingString = text.createCFStringWithoutCopying();
    CFStringTokenizerRef wordTokenizer = tokenizerForString(backingString.get());
    if (!wordTokenizer) {
        // Error creating tokenizer, so just use simple function.
        findSimpleWordBoundary(text, position, start, end);
        return;
    }

    if (CFStringTokenizerGoToTokenAtIndex(wordTokenizer, position) != kCFStringTokenizerTokenNone) {
        const CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(wordTokenizer);
        *start = tokenRange.location;
        *end = tokenRange.location + tokenRange.length;
        return;
    }

    // No token found: select entire block.
    // NB: I never hit this section in all my testing.
    *start = 0;
    *end = text.length();
}
#endif
// Finds the word surrounding |position| (a UTF-16 offset into |text|, which may equal text.length())
// and writes its range to [*start, *end). With AppKit the answer comes from NSAttributedString's
// double-click range; otherwise the simple or the tokenizer-based algorithm is chosen depending on
// whether the text requires context for word breaking. Empty text yields the empty range [0, 0).
void findWordBoundary(StringView text, int position, int* start, int* end)
{
    // Nothing to examine in an empty string. Without this guard the AppKit path computes
    // text.length() - 1 on an unsigned zero, and the other path reads text[0].
    if (!text.length()) {
        *start = 0;
        *end = 0;
        return;
    }
#if USE(APPKIT)
    NSAttributedString *attributedString = [[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()];
    // Clamp to the last character so an end-of-text position is still a valid character index.
    NSRange range = [attributedString doubleClickAtIndex:std::min<unsigned>(position, text.length() - 1)];
    [attributedString release];
    *start = range.location;
    *end = range.location + range.length;
#else
    // Only the probe below uses the clamped offset; the boundary finders receive |position| as given.
    unsigned pos = position;
    if (pos == text.length() && pos)
        --pos;
    // For complex text (Thai, Japanese, Chinese), visible_units will pass the text in as a
    // single contiguous run of characters, providing as much context as is possible.
    // We only need one character to determine if the text is complex.
    UChar32 ch;
    unsigned i = pos;
    U16_NEXT(text, i, text.length(), ch);
    bool isComplex = requiresContextForWordBoundary(ch);
    // FIXME: This check improves our word boundary behavior, but doesn't actually go far enough.
    // See <rdar://problem/8853951> Take complex word boundary finding path when necessary
    if (!isComplex) {
        // Check again for complex text, at the start of the run.
        i = 0;
        U16_NEXT(text, i, text.length(), ch);
        isComplex = requiresContextForWordBoundary(ch);
    }
    if (isComplex)
        findComplexWordBoundary(text, position, start, end);
    else
        findSimpleWordBoundary(text, position, start, end);
#define LOG_WORD_BREAK 0
#if LOG_WORD_BREAK
    auto uniString = text.createCFStringWithoutCopying();
    auto foundWord = text.substring(*start, *end - *start).createCFStringWithoutCopying();
    NSLog(@"%s_BREAK '%@' (%d,%d) in '%@' (%p) at %d, length=%d", isComplex ? "COMPLEX" : "SIMPLE", foundWord.get(), *start, *end, uniString.get(), uniString.get(), position, text.length());
#endif
#endif
}
// Convenience wrapper around findWordBoundary() for callers that only need the end offset of
// the word at |position|; the start offset is computed and discarded.
void findEndWordBoundary(StringView text, int position, int* end)
{
    int discardedStart;
    findWordBoundary(text, position, &discardedStart, end);
}
// Returns the UTF-16 offset of the next word boundary from |position|, searching towards the end
// of |text| when |forward| is true and towards the start otherwise. With AppKit this defers to
// NSAttributedString; otherwise it walks ICU word-break boundaries, stepping past boundaries that
// sit on skip characters. If no break iterator is available, |position| is returned unchanged.
int findNextWordFromIndex(StringView text, int position, bool forward)
{
#if USE(APPKIT)
    NSAttributedString *wrappedText = [[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()];
    int nextIndex = [wrappedText nextWordFromIndex:position forward:forward];
    [wrappedText release];
    return nextIndex;
#else
    // This very likely won't behave exactly like the non-iPhone version, but it works
    // for the contexts in which it is used on iPhone, and in the future will be
    // tuned to improve the iPhone-specific behavior for the keyboard and text editing.
    UBreakIterator* iterator = wordBreakIterator(text);
    if (!iterator)
        return position;

    const unsigned length = text.length();
    int index = position;

    if (forward) {
        for (;;) {
            index = ubrk_following(iterator, index);
            if (index == UBRK_DONE)
                index = length;
            // Stop at the end of the text.
            if (static_cast<unsigned>(index) >= length)
                break;
            // Stop when the previous character is already a skip character.
            if (index && isSkipCharacter(text[index - 1]))
                break;
            // Stop when the boundary does not sit on a skip character.
            if (!isSkipCharacter(text[index]))
                break;
        }
        return index;
    }

    for (;;) {
        index = ubrk_preceding(iterator, index);
        if (index == UBRK_DONE)
            index = 0;
        // Stop at the start of the text.
        if (index <= 0)
            break;
        // Stop when the boundary does not sit on a skip character.
        if (!isSkipCharacter(text[index]))
            break;
        // Stop when the boundary is preceded by whitespace.
        if (isWhitespaceCharacter(text[index - 1]))
            break;
    }
    return index;
#endif
}
}