| /* |
| * (C) 1999 Lars Knoll (knoll@kde.org) |
| * Copyright (C) 2004-2016 Apple Inc. All rights reserved. |
| * Copyright (C) 2007-2009 Torch Mobile, Inc. |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Library General Public |
| * License as published by the Free Software Foundation; either |
| * version 2 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Library General Public License for more details. |
| * |
| * You should have received a copy of the GNU Library General Public License |
| * along with this library; see the file COPYING.LIB. If not, write to |
| * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| * Boston, MA 02110-1301, USA. |
| */ |
| |
| #include "config.h" |
| #include <wtf/text/TextBreakIterator.h> |
| |
| #include <wtf/text/LineBreakIteratorPoolICU.h> |
| #include <wtf/text/TextBreakIteratorInternalICU.h> |
| #include <wtf/text/icu/UTextProviderLatin1.h> |
| #include <wtf/text/icu/UTextProviderUTF16.h> |
| #include <atomic> |
| #include <mutex> |
| #include <unicode/ubrk.h> |
| #include <wtf/text/StringBuilder.h> |
| |
| namespace WTF { |
| |
| #if !PLATFORM(MAC) && !PLATFORM(IOS_FAMILY) |
| |
| static Variant<TextBreakIteratorICU, TextBreakIteratorPlatform> mapModeToBackingIterator(StringView string, TextBreakIterator::Mode mode, const AtomString& locale) |
| { |
| switch (mode) { |
| case TextBreakIterator::Mode::Line: |
| return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Line, locale.string().utf8().data()); |
| case TextBreakIterator::Mode::Caret: |
| return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
| case TextBreakIterator::Mode::Delete: |
| return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
| default: |
| ASSERT_NOT_REACHED(); |
| return TextBreakIteratorICU(string, TextBreakIteratorICU::Mode::Character, locale.string().utf8().data()); |
| } |
| } |
| |
| TextBreakIterator::TextBreakIterator(StringView string, Mode mode, const AtomString& locale) |
| : m_backing(mapModeToBackingIterator(string, mode, locale)) |
| , m_mode(mode) |
| , m_locale(locale) |
| { |
| } |
| |
| #endif |
| |
| // Iterator initialization |
| |
| static UBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID()) |
| { |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UBreakIterator* iterator = ubrk_open(type, locale, 0, 0, &openStatus); |
| ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); |
| return iterator; |
| } |
| |
| // Iterator text setting |
| |
| static UBreakIterator* setTextForIterator(UBreakIterator& iterator, StringView string) |
| { |
| if (string.is8Bit()) { |
| UTextWithBuffer textLocal; |
| textLocal.text = UTEXT_INITIALIZER; |
| textLocal.text.extraSize = sizeof(textLocal.buffer); |
| textLocal.text.pExtra = textLocal.buffer; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus); |
| if (U_FAILURE(openStatus)) { |
| LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus); |
| return nullptr; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| ubrk_setUText(&iterator, text, &setTextStatus); |
| if (U_FAILURE(setTextStatus)) { |
| LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); |
| return nullptr; |
| } |
| |
| utext_close(text); |
| } else { |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| ubrk_setText(&iterator, string.characters16(), string.length(), &setTextStatus); |
| if (U_FAILURE(setTextStatus)) |
| return nullptr; |
| } |
| |
| return &iterator; |
| } |
| |
| static UBreakIterator* setContextAwareTextForIterator(UBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength) |
| { |
| if (string.is8Bit()) { |
| UTextWithBuffer textLocal; |
| textLocal.text = UTEXT_INITIALIZER; |
| textLocal.text.extraSize = sizeof(textLocal.buffer); |
| textLocal.text.pExtra = textLocal.buffer; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus); |
| if (U_FAILURE(openStatus)) { |
| LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus); |
| return nullptr; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| ubrk_setUText(&iterator, text, &setTextStatus); |
| if (U_FAILURE(setTextStatus)) { |
| LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); |
| return nullptr; |
| } |
| |
| utext_close(text); |
| } else { |
| UText textLocal = UTEXT_INITIALIZER; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus); |
| if (U_FAILURE(openStatus)) { |
| LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus); |
| return 0; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| ubrk_setUText(&iterator, text, &setTextStatus); |
| if (U_FAILURE(setTextStatus)) { |
| LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); |
| return nullptr; |
| } |
| |
| utext_close(text); |
| } |
| |
| return &iterator; |
| } |
| |
| |
| // Static iterators |
| |
| UBreakIterator* wordBreakIterator(StringView string) |
| { |
| static UBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD); |
| if (!staticWordBreakIterator) |
| return nullptr; |
| |
| return setTextForIterator(*staticWordBreakIterator, string); |
| } |
| |
| UBreakIterator* sentenceBreakIterator(StringView string) |
| { |
| static UBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE); |
| if (!staticSentenceBreakIterator) |
| return nullptr; |
| |
| return setTextForIterator(*staticSentenceBreakIterator, string); |
| } |
| |
| UBreakIterator* acquireLineBreakIterator(StringView string, const AtomString& locale, const UChar* priorContext, unsigned priorContextLength, LineBreakIteratorMode mode) |
| { |
| UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode); |
| if (!iterator) |
| return nullptr; |
| |
| return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength); |
| } |
| |
| void releaseLineBreakIterator(UBreakIterator* iterator) |
| { |
| ASSERT_ARG(iterator, iterator); |
| |
| LineBreakIteratorPool::sharedPool().put(iterator); |
| } |
| |
| UBreakIterator* openLineBreakIterator(const AtomString& locale) |
| { |
| bool localeIsEmpty = locale.isEmpty(); |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UBreakIterator* ubrkIter = ubrk_open(UBRK_LINE, localeIsEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus); |
| // locale comes from a web page and it can be invalid, leading ICU |
| // to fail, in which case we fall back to the default locale. |
| if (!localeIsEmpty && U_FAILURE(openStatus)) { |
| openStatus = U_ZERO_ERROR; |
| ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus); |
| } |
| |
| if (U_FAILURE(openStatus)) { |
| LOG_ERROR("ubrk_open failed with status %d", openStatus); |
| return nullptr; |
| } |
| |
| return ubrkIter; |
| } |
| |
| void closeLineBreakIterator(UBreakIterator*& iterator) |
| { |
| UBreakIterator* ubrkIter = iterator; |
| ASSERT(ubrkIter); |
| ubrk_close(ubrkIter); |
| iterator = nullptr; |
| } |
| |
| static std::atomic<UBreakIterator*> nonSharedCharacterBreakIterator = ATOMIC_VAR_INIT(nullptr); |
| |
| static inline UBreakIterator* getNonSharedCharacterBreakIterator() |
| { |
| if (auto *res = nonSharedCharacterBreakIterator.exchange(nullptr, std::memory_order_acquire)) |
| return res; |
| return initializeIterator(UBRK_CHARACTER); |
| } |
| |
| static inline void cacheNonSharedCharacterBreakIterator(UBreakIterator* cacheMe) |
| { |
| if (auto *old = nonSharedCharacterBreakIterator.exchange(cacheMe, std::memory_order_release)) |
| ubrk_close(old); |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string) |
| { |
| if ((m_iterator = getNonSharedCharacterBreakIterator())) |
| m_iterator = setTextForIterator(*m_iterator, string); |
| } |
| |
| NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() |
| { |
| if (m_iterator) |
| cacheNonSharedCharacterBreakIterator(m_iterator); |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(NonSharedCharacterBreakIterator&& other) |
| : m_iterator(nullptr) |
| { |
| std::swap(m_iterator, other.m_iterator); |
| } |
| |
| // Iterator implementation. |
| |
| bool isWordTextBreak(UBreakIterator* iterator) |
| { |
| int ruleStatus = ubrk_getRuleStatus(iterator); |
| return ruleStatus != UBRK_WORD_NONE; |
| } |
| |
| unsigned numGraphemeClusters(StringView string) |
| { |
| unsigned stringLength = string.length(); |
| |
| if (!stringLength) |
| return 0; |
| |
| // The only Latin-1 Extended Grapheme Cluster is CRLF. |
| if (string.is8Bit()) { |
| auto* characters = string.characters8(); |
| unsigned numCRLF = 0; |
| for (unsigned i = 1; i < stringLength; ++i) |
| numCRLF += characters[i - 1] == '\r' && characters[i] == '\n'; |
| return stringLength - numCRLF; |
| } |
| |
| NonSharedCharacterBreakIterator iterator { string }; |
| if (!iterator) { |
| ASSERT_NOT_REACHED(); |
| return stringLength; |
| } |
| |
| unsigned numGraphemeClusters = 0; |
| while (ubrk_next(iterator) != UBRK_DONE) |
| ++numGraphemeClusters; |
| return numGraphemeClusters; |
| } |
| |
| unsigned numCodeUnitsInGraphemeClusters(StringView string, unsigned numGraphemeClusters) |
| { |
| unsigned stringLength = string.length(); |
| |
| if (stringLength <= numGraphemeClusters) |
| return stringLength; |
| |
| // The only Latin-1 Extended Grapheme Cluster is CRLF. |
| if (string.is8Bit()) { |
| auto* characters = string.characters8(); |
| unsigned i, j; |
| for (i = 0, j = 0; i < numGraphemeClusters && j + 1 < stringLength; ++i, ++j) |
| j += characters[j] == '\r' && characters[j + 1] == '\n'; |
| return j + (i < numGraphemeClusters); |
| } |
| |
| NonSharedCharacterBreakIterator iterator { string }; |
| if (!iterator) { |
| ASSERT_NOT_REACHED(); |
| return stringLength; |
| } |
| |
| for (unsigned i = 0; i < numGraphemeClusters; ++i) { |
| if (ubrk_next(iterator) == UBRK_DONE) |
| return stringLength; |
| } |
| return ubrk_current(iterator); |
| } |
| |
| } // namespace WTF |