| /* |
| * Copyright (C) 2004, 2006, 2014 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #import "config.h" |
| #import "TextBoundaries.h" |
| |
| #import <CoreFoundation/CFStringTokenizer.h> |
| #import <Foundation/Foundation.h> |
| #import <unicode/ubrk.h> |
| #import <unicode/uchar.h> |
| #import <unicode/ustring.h> |
| #import <unicode/utypes.h> |
| #import <wtf/RetainPtr.h> |
| #import <wtf/text/StringView.h> |
| #import <wtf/text/TextBreakIterator.h> |
| #import <wtf/text/TextBreakIteratorInternalICU.h> |
| #import <wtf/unicode/CharacterNames.h> |
| |
| namespace WebCore { |
| |
| #if !USE(APPKIT) |
| |
// Returns true for the characters findNextWordFromIndex() treats as skippable:
// no-break space, newline, the punctuation marks . , ! ? ; : and anything ICU's
// u_isspace() reports as a space.
static bool isSkipCharacter(UChar32 c)
{
    switch (c) {
    case 0xA0:
    case '\n':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
        return true;
    default:
        return u_isspace(c);
    }
}
| |
// Returns true for no-break space, newline, and anything ICU's u_isspace() reports as a space.
static bool isWhitespaceCharacter(UChar32 c)
{
    if (c == 0xA0 || c == '\n')
        return true;
    return u_isspace(c);
}
| |
// Returns true when `c` separates words for findSimpleWordBoundary(): any character that is
// not in CoreFoundation's predefined alphanumeric set, with one exception for '&'.
static bool isWordDelimitingCharacter(UChar32 c)
{
    // Ampersand is an exception added to treat AT&T as a single word (see <rdar://problem/5022264>).
    if (c == '&')
        return false;

    CFCharacterSetRef alphanumerics = CFCharacterSetGetPredefined(kCFCharacterSetAlphaNumeric);
    return !CFCharacterSetIsLongCharacterMember(alphanumerics, c);
}
| |
// Returns true when `c` belongs to CoreFoundation's predefined symbol character set.
static bool isSymbolCharacter(UChar32 c)
{
    CFCharacterSetRef symbols = CFCharacterSetGetPredefined(kCFCharacterSetSymbol);
    return CFCharacterSetIsLongCharacterMember(symbols, c);
}
| |
// Returns true for characters that can act as word boundaries but may also appear inside a
// word: the ASCII apostrophe, the right single quotation mark, and Hebrew gershayim.
static bool isAmbiguousBoundaryCharacter(UChar32 character)
{
    if (character == '\'')
        return true;
    if (character == rightSingleQuotationMark)
        return true;
    return character == hebrewPunctuationGershayim;
}
| |
// Returns a function-static, word-boundary CFStringTokenizer that has been pointed at the
// whole of `str`. Returns nullptr if the locale cannot be created, or if tokenizer creation
// fails. Both the locale and the tokenizer are created on first use and then reused; a
// failed creation leaves the cached pointer null, so the next call tries again.
static CFStringTokenizerRef tokenizerForString(CFStringRef str)
{
    static CFLocaleRef cachedLocale = nullptr;
    if (!cachedLocale) {
        // Wrap the locale ID bytes without copying them; the CFString only lives long enough to build the locale.
        const char* localeID = currentTextBreakLocaleID();
        RetainPtr<CFStringRef> localeIDString = adoptCF(CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, reinterpret_cast<const UInt8*>(localeID), strlen(localeID), kCFStringEncodingASCII, false, kCFAllocatorNull));
        cachedLocale = CFLocaleCreate(kCFAllocatorDefault, localeIDString.get());
    }
    if (!cachedLocale)
        return nullptr;

    CFRange wholeString = CFRangeMake(0, CFStringGetLength(str));

    static CFStringTokenizerRef cachedTokenizer = nullptr;
    if (cachedTokenizer) {
        // Reuse the existing tokenizer; only its target string changes.
        CFStringTokenizerSetString(cachedTokenizer, str, wholeString);
        return cachedTokenizer;
    }

    cachedTokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, str, wholeString, kCFStringTokenizerUnitWordBoundary, cachedLocale);
    return cachedTokenizer;
}
| |
// Simple case: A word is a stream of characters delimited by a special set of word-delimiting characters.
//
// Scans backwards and forwards from `position` (a UTF-16 code unit offset into `text`) and
// writes the offsets of the surrounding word to `*start` and `*end` (end is exclusive).
// `position` may equal text.length(): findWordBoundary() forwards the caller's position
// unchanged, and in that case only the backward scan does any work.
//
// An ambiguous boundary character (see isAmbiguousBoundaryCharacter()) that has a
// non-delimiting character on its far side is kept inside the word rather than ending it.
static void findSimpleWordBoundary(StringView text, int position, int* start, int* end)
{
    ASSERT(position >= 0);
    // One-past-the-end is a valid position; every read below is guarded by a bounds check.
    ASSERT(static_cast<unsigned>(position) <= text.length());

    // Walk startPos backwards until the character before it ends the word.
    unsigned startPos = position;
    while (startPos > 0) {
        int i = startPos;
        UChar32 characterBeforeStartPos;
        U16_PREV(text, 0, i, characterBeforeStartPos);
        if (isWordDelimitingCharacter(characterBeforeStartPos)) {
            ASSERT(i >= 0);
            // The delimiter is the first character of the text: nothing precedes it, so stop here.
            if (!i)
                break;

            if (!isAmbiguousBoundaryCharacter(characterBeforeStartPos))
                break;

            // Ambiguous delimiter: it only stays in the word if the character before it is not a delimiter.
            UChar32 characterBeforeBeforeStartPos;
            U16_PREV(text, 0, i, characterBeforeBeforeStartPos);
            if (isWordDelimitingCharacter(characterBeforeBeforeStartPos))
                break;
        }
        U16_BACK_1(text, 0, startPos);
    }

    // Walk endPos forwards until the character at it ends the word.
    unsigned endPos = position;
    while (endPos < text.length()) {
        UChar32 character;
        U16_GET(text, 0, endPos, text.length(), character);
        if (isWordDelimitingCharacter(character)) {
            unsigned i = endPos;
            U16_FWD_1(text, i, text.length());
            ASSERT(i <= text.length());
            // The delimiter is the last character of the text: nothing follows it, so stop here.
            if (i == text.length())
                break;
            UChar32 characterAfterEndPos;
            U16_NEXT(text, i, text.length(), characterAfterEndPos);
            if (!isAmbiguousBoundaryCharacter(character))
                break;
            // Ambiguous delimiter: it only stays in the word if the character after it is not a delimiter.
            if (isWordDelimitingCharacter(characterAfterEndPos))
                break;
        }
        U16_FWD_1(text, endPos, text.length());
    }

    // The text may consist of all delimiter characters (e.g. "++++++++" or a series of emoji), and returning an empty range
    // makes no sense (and doesn't match findComplexWordBoundary() behavior).
    if (startPos == endPos && endPos < text.length()) {
        UChar32 character;
        U16_GET(text, 0, endPos, text.length(), character);
        if (isSymbolCharacter(character))
            U16_FWD_1(text, endPos, text.length());
    }

    *start = startPos;
    *end = endPos;
}
| |
// Complex case: use CFStringTokenizer to find word boundary.
//
// Writes the range of the token containing `position` to `*start` / `*end` (end exclusive).
// Falls back to findSimpleWordBoundary() when no tokenizer is available, and to the whole
// text when the tokenizer reports no token at `position`.
static void findComplexWordBoundary(StringView text, int position, int* start, int* end)
{
    RetainPtr<CFStringRef> cfText = text.createCFStringWithoutCopying();

    CFStringTokenizerRef wordTokenizer = tokenizerForString(cfText.get());
    if (!wordTokenizer) {
        // Error creating tokenizer, so just use simple function.
        findSimpleWordBoundary(text, position, start, end);
        return;
    }

    if (CFStringTokenizerGoToTokenAtIndex(wordTokenizer, position) != kCFStringTokenizerTokenNone) {
        CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(wordTokenizer);
        *start = tokenRange.location;
        *end = tokenRange.location + tokenRange.length;
        return;
    }

    // No token found: select entire block.
    // NB: The original author reported never hitting this path in testing.
    *start = 0;
    *end = text.length();
}
| |
| #endif |
| |
// Finds the word surrounding `position` (a UTF-16 code unit offset into `text`) and writes
// its bounds to `*start` and `*end` (end exclusive).
//
// With AppKit this defers to -[NSAttributedString doubleClickAtIndex:]. Otherwise it picks
// between findComplexWordBoundary() and findSimpleWordBoundary() based on whether the
// character at `position`, or failing that the first character of the text, requires context
// for word breaking.
//
// Empty text yields the empty range [0, 0) on both paths.
void findWordBoundary(StringView text, int position, int* start, int* end)
{
    // Nothing to inspect in empty text: the code below would read text[0] (non-AppKit) or
    // compute text.length() - 1 on an unsigned zero (AppKit).
    if (!text.length()) {
        *start = 0;
        *end = 0;
        return;
    }

#if USE(APPKIT)
    NSAttributedString *attributedString = [[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()];
    // Clamp to the last valid character index so a position at the end of the text still resolves to a word.
    NSRange range = [attributedString doubleClickAtIndex:std::min<unsigned>(position, text.length() - 1)];
    [attributedString release];
    *start = range.location;
    *end = range.location + range.length;
#else
    // `pos` is only used to pick a character to classify; the helpers below receive the original `position`.
    unsigned pos = position;
    if (pos == text.length() && pos)
        --pos;

    // For complex text (Thai, Japanese, Chinese), visible_units will pass the text in as a
    // single contiguous run of characters, providing as much context as is possible.
    // We only need one character to determine if the text is complex.
    UChar32 ch;
    unsigned i = pos;
    U16_NEXT(text, i, text.length(), ch);
    bool isComplex = requiresContextForWordBoundary(ch);

    // FIXME: This check improves our word boundary behavior, but doesn't actually go far enough.
    // See <rdar://problem/8853951> Take complex word boundary finding path when necessary
    if (!isComplex) {
        // Check again for complex text, at the start of the run.
        i = 0;
        U16_NEXT(text, i, text.length(), ch);
        isComplex = requiresContextForWordBoundary(ch);
    }

    if (isComplex)
        findComplexWordBoundary(text, position, start, end);
    else
        findSimpleWordBoundary(text, position, start, end);

#define LOG_WORD_BREAK 0
#if LOG_WORD_BREAK
    auto uniString = text.createCFStringWithoutCopying();
    auto foundWord = text.substring(*start, *end - *start).createCFStringWithoutCopying();
    NSLog(@"%s_BREAK '%@' (%d,%d) in '%@' (%p) at %d, length=%d", isComplex ? "COMPLEX" : "SIMPLE", foundWord.get(), *start, *end, uniString.get(), uniString.get(), position, text.length());
#endif

#endif
}
| |
// Convenience wrapper around findWordBoundary() for callers that only need the end offset
// of the word at `position`; the start offset is computed and discarded.
void findEndWordBoundary(StringView text, int position, int* end)
{
    int discardedStart;
    findWordBoundary(text, position, &discardedStart, end);
}
| |
// Returns the offset of the next word boundary from `position`, searching forwards when
// `forward` is true and backwards otherwise.
//
// With AppKit this defers to -[NSAttributedString nextWordFromIndex:forward:]. Otherwise it
// steps an ICU word break iterator, continuing past some boundaries that sit next to skip
// characters (see isSkipCharacter()). If no break iterator is available, `position` is
// returned unchanged.
int findNextWordFromIndex(StringView text, int position, bool forward)
{
#if USE(APPKIT)
    NSAttributedString *string = [[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()];
    int nextWord = [string nextWordFromIndex:position forward:forward];
    [string release];
    return nextWord;
#else
    // This very likely won't behave exactly like the non-iPhone version, but it works
    // for the contexts in which it is used on iPhone, and in the future will be
    // tuned to improve the iPhone-specific behavior for the keyboard and text editing.
    UBreakIterator* iterator = wordBreakIterator(text);
    if (!iterator)
        return position;

    int offset = position;
    const unsigned length = text.length();

    if (forward) {
        for (;;) {
            offset = ubrk_following(iterator, offset);
            if (offset == UBRK_DONE)
                offset = length;

            // Stop at the end of the text.
            if (static_cast<unsigned>(offset) >= length)
                break;
            // Stop when the boundary is already preceded by a skip character.
            if (offset && isSkipCharacter(text[offset - 1]))
                break;
            // Stop when the boundary does not land on a skip character.
            if (!isSkipCharacter(text[offset]))
                break;
        }
        return offset;
    }

    for (;;) {
        offset = ubrk_preceding(iterator, offset);
        if (offset == UBRK_DONE)
            offset = 0;

        // Stop at the start of the text.
        if (offset <= 0)
            break;
        // Stop when the boundary does not land on a skip character.
        if (!isSkipCharacter(text[offset]))
            break;
        // Stop when the boundary is preceded by whitespace.
        if (isWhitespaceCharacter(text[offset - 1]))
            break;
    }
    return offset;
#endif
}
| |
| } |