2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "TextIterator.h"
32 #include "HTMLElement.h"
33 #include "HTMLTextFormControlElement.h"
34 #include "HTMLNames.h"
35 #include "htmlediting.h"
36 #include "InlineTextBox.h"
38 #include "RenderTableCell.h"
39 #include "RenderTableRow.h"
40 #include "RenderTextControl.h"
41 #include "RenderTextFragment.h"
42 #include "TextBoundaries.h"
43 #include "TextBreakIterator.h"
44 #include "VisiblePosition.h"
45 #include "visible_units.h"
46 #include <wtf/unicode/CharacterNames.h>
48 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
49 #include "TextBreakIteratorInternalICU.h"
50 #include <unicode/usearch.h>
53 using namespace WTF::Unicode;
58 using namespace HTMLNames;
60 // Buffer that knows how to compare with a search target.
61 // Keeps enough of the previous text to be able to search in the future, but no more.
62 // Non-breaking spaces are always equal to normal spaces.
63 // Case folding is also done if the CaseInsensitive option is specified.
64 // Matches are further filtered if the AtWordStarts option is specified, although some
65 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
67 WTF_MAKE_NONCOPYABLE(SearchBuffer);
69 SearchBuffer(const String& target, FindOptions);
72 // Returns number of characters appended; guaranteed to be in the range [1, length].
73 size_t append(const UChar*, size_t length);
74 bool needsMoreContext() const;
75 void prependContext(const UChar*, size_t length);
78 // Result is the size in characters of what was found.
79 // And <startOffset> is the number of characters back to the start of what was found.
80 size_t search(size_t& startOffset);
83 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
86 bool isBadMatch(const UChar*, size_t length) const;
87 bool isWordStartMatch(size_t start, size_t length) const;
90 FindOptions m_options;
92 Vector<UChar> m_buffer;
94 size_t m_prefixLength;
96 bool m_needsMoreContext;
98 bool m_targetRequiresKanaWorkaround;
99 Vector<UChar> m_normalizedTarget;
100 mutable Vector<UChar> m_normalizedMatch;
105 void append(UChar, bool isCharacterStart);
106 size_t length() const;
109 FindOptions m_options;
111 Vector<UChar> m_buffer;
112 Vector<bool> m_isCharacterStartBuffer;
121 static const unsigned bitsInWord = sizeof(unsigned) * 8;
122 static const unsigned bitInWordMask = bitsInWord - 1;
129 BitStack::~BitStack()
133 void BitStack::push(bool bit)
135 unsigned index = m_size / bitsInWord;
136 unsigned shift = m_size & bitInWordMask;
137 if (!shift && index == m_words.size()) {
138 m_words.grow(index + 1);
141 unsigned& word = m_words[index];
142 unsigned mask = 1U << shift;
156 bool BitStack::top() const
160 unsigned shift = (m_size - 1) & bitInWordMask;
161 return m_words.last() & (1U << shift);
164 unsigned BitStack::size() const
173 static unsigned depthCrossingShadowBoundaries(Node* node)
176 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
183 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
184 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
186 if (!rangeEndContainer)
188 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
189 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
192 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
193 if (Node* next = node->nextSibling())
201 static inline bool fullyClipsContents(Node* node)
203 RenderObject* renderer = node->renderer();
204 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
206 return toRenderBox(renderer)->size().isEmpty();
209 static inline bool ignoresContainerClip(Node* node)
211 RenderObject* renderer = node->renderer();
212 if (!renderer || renderer->isText())
214 EPosition position = renderer->style()->position();
215 return position == AbsolutePosition || position == FixedPosition;
218 static void pushFullyClippedState(BitStack& stack, Node* node)
220 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
222 // Push true if this node full clips its contents, or if a parent already has fully
223 // clipped and this is not a node that ignores its container's clip.
224 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
227 static void setUpFullyClippedStack(BitStack& stack, Node* node)
229 // Put the nodes in a vector so we can iterate in reverse order.
230 Vector<Node*, 100> ancestry;
231 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
232 ancestry.append(parent);
234 // Call pushFullyClippedState on each node starting with the earliest ancestor.
235 size_t size = ancestry.size();
236 for (size_t i = 0; i < size; ++i)
237 pushFullyClippedState(stack, ancestry[size - i - 1]);
238 pushFullyClippedState(stack, node);
240 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
245 TextIterator::TextIterator()
246 : m_startContainer(0)
251 , m_textCharacters(0)
253 , m_remainingTextBox(0)
254 , m_firstLetterText(0)
256 , m_sortedTextBoxesPosition(0)
257 , m_emitsCharactersBetweenAllVisiblePositions(false)
258 , m_entersTextControls(false)
259 , m_emitsTextWithoutTranscoding(false)
260 , m_handledFirstLetter(false)
261 , m_ignoresStyleVisibility(false)
262 , m_emitsObjectReplacementCharacters(false)
266 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
267 : m_startContainer(0)
272 , m_textCharacters(0)
274 , m_remainingTextBox(0)
275 , m_firstLetterText(0)
276 , m_sortedTextBoxesPosition(0)
277 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
278 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
279 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
280 , m_handledFirstLetter(false)
281 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
282 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
287 // get and validate the range endpoints
288 Node* startContainer = r->startContainer();
291 int startOffset = r->startOffset();
292 Node* endContainer = r->endContainer();
293 int endOffset = r->endOffset();
295 // Callers should be handing us well-formed ranges. If we discover that this isn't
296 // the case, we could consider changing this assertion to an early return.
297 ASSERT(r->boundaryPointsValid());
299 // remember range - this does not change
300 m_startContainer = startContainer;
301 m_startOffset = startOffset;
302 m_endContainer = endContainer;
303 m_endOffset = endOffset;
305 // set up the current node for processing
306 m_node = r->firstNode();
309 setUpFullyClippedStack(m_fullyClippedStack, m_node);
310 m_offset = m_node == m_startContainer ? m_startOffset : 0;
311 m_handledNode = false;
312 m_handledChildren = false;
314 // calculate first out of bounds node
315 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
317 // initialize node processing state
318 m_needsAnotherNewline = false;
321 // initialize record of previous node processing
322 m_hasEmitted = false;
324 m_lastTextNodeEndedWithCollapsedSpace = false;
328 // need this just because of the assert in advance()
329 m_positionNode = m_node;
332 // identify the first run
336 TextIterator::~TextIterator()
340 void TextIterator::advance()
342 // reset the run information
346 // handle remembered node that needed a newline after the text node's newline
347 if (m_needsAnotherNewline) {
348 // Emit the extra newline, and position it *inside* m_node, after m_node's
349 // contents, in case it's a block, in the same way that we position the first
350 // newline. The range for the emitted newline should start where the line
352 // FIXME: It would be cleaner if we emitted two newlines during the last
353 // iteration, instead of using m_needsAnotherNewline.
354 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
355 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
356 m_needsAnotherNewline = false;
360 if (!m_textBox && m_remainingTextBox) {
361 m_textBox = m_remainingTextBox;
362 m_remainingTextBox = 0;
363 m_firstLetterText = 0;
366 // handle remembered text box
373 while (m_node && m_node != m_pastEndNode) {
374 // if the range ends at offset 0 of an element, represent the
375 // position, but not the content, of that element e.g. if the
376 // node is a blockflow element, emit a newline that
377 // precedes the element
378 if (m_node == m_endContainer && m_endOffset == 0) {
379 representNodeOffsetZero();
384 RenderObject* renderer = m_node->renderer();
386 m_handledNode = true;
387 m_handledChildren = true;
389 // handle current node according to its type
390 if (!m_handledNode) {
391 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
392 m_handledNode = handleTextNode();
393 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
394 (renderer->node() && renderer->node()->isElementNode() &&
395 static_cast<Element*>(renderer->node())->isFormControlElement())))
396 m_handledNode = handleReplacedElement();
398 m_handledNode = handleNonTextNode();
404 // find a new current node to handle in depth-first manner,
405 // calling exitNode() as we come back thru a parent node
406 Node* next = m_handledChildren ? 0 : m_node->firstChild();
409 next = m_node->nextSibling();
411 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
412 Node* parentNode = m_node->parentOrHostNode();
413 while (!next && parentNode) {
414 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
416 bool haveRenderer = m_node->renderer();
418 m_fullyClippedStack.pop();
419 parentNode = m_node->parentOrHostNode();
422 if (m_positionNode) {
423 m_handledNode = true;
424 m_handledChildren = true;
427 next = m_node->nextSibling();
430 m_fullyClippedStack.pop();
433 // set the new current node
436 pushFullyClippedState(m_fullyClippedStack, m_node);
437 m_handledNode = false;
438 m_handledChildren = false;
439 m_handledFirstLetter = false;
440 m_firstLetterText = 0;
442 // how would this ever be?
448 bool TextIterator::handleTextNode()
450 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
453 RenderText* renderer = toRenderText(m_node->renderer());
455 m_lastTextNode = m_node;
456 String str = renderer->text();
458 // handle pre-formatted text
459 if (!renderer->style()->collapseWhiteSpace()) {
460 int runStart = m_offset;
461 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
462 emitCharacter(' ', m_node, 0, runStart, runStart);
465 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) {
466 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
467 if (m_firstLetterText) {
468 String firstLetter = m_firstLetterText->text();
469 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
470 m_firstLetterText = 0;
475 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
477 int strLength = str.length();
478 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
479 int runEnd = min(strLength, end);
481 if (runStart >= runEnd)
484 emitText(m_node, runStart, runEnd);
488 if (!renderer->firstTextBox() && str.length() > 0) {
489 if (!m_handledFirstLetter && renderer->isTextFragment()) {
490 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
491 if (m_firstLetterText) {
496 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
498 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
503 m_textBox = renderer->firstTextBox();
504 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
505 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
507 if (m_firstLetterText)
508 renderer = m_firstLetterText;
510 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
511 if (renderer->containsReversedText()) {
512 m_sortedTextBoxes.clear();
513 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
514 m_sortedTextBoxes.append(textBox);
516 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
517 m_sortedTextBoxesPosition = 0;
518 m_textBox = m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0];
525 void TextIterator::handleTextBox()
527 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
528 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
532 String str = renderer->text();
533 unsigned start = m_offset;
534 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
536 unsigned textBoxStart = m_textBox->start();
537 unsigned runStart = max(textBoxStart, start);
539 // Check for collapsed space at the start of this run.
540 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
541 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
542 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
543 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
544 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
545 unsigned spaceRunStart = runStart - 1;
546 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
548 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
550 emitCharacter(' ', m_node, 0, runStart, runStart);
553 unsigned textBoxEnd = textBoxStart + m_textBox->len();
554 unsigned runEnd = min(textBoxEnd, end);
556 // Determine what the next text box will be, but don't advance yet
557 InlineTextBox* nextTextBox = 0;
558 if (renderer->containsReversedText()) {
559 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
560 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
562 nextTextBox = m_textBox->nextTextBox();
564 if (runStart < runEnd) {
565 // Handle either a single newline character (which becomes a space),
566 // or a run of characters that does not include a newline.
567 // This effectively translates newlines to spaces without copying the text.
568 if (str[runStart] == '\n') {
569 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
570 m_offset = runStart + 1;
572 size_t subrunEnd = str.find('\n', runStart);
573 if (subrunEnd == notFound || subrunEnd > runEnd)
576 m_offset = subrunEnd;
577 emitText(m_node, renderer, runStart, subrunEnd);
580 // If we are doing a subrun that doesn't go to the end of the text box,
581 // come back again to finish handling this text box; don't advance to the next one.
582 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
585 // Advance and return
586 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
587 if (nextRunStart > runEnd)
588 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
589 m_textBox = nextTextBox;
590 if (renderer->containsReversedText())
591 ++m_sortedTextBoxesPosition;
594 // Advance and continue
595 m_textBox = nextTextBox;
596 if (renderer->containsReversedText())
597 ++m_sortedTextBoxesPosition;
599 if (!m_textBox && m_remainingTextBox) {
600 m_textBox = m_remainingTextBox;
601 m_remainingTextBox = 0;
602 m_firstLetterText = 0;
608 static inline RenderText* firstRenderTextInFirstLetter(RenderObject* firstLetter)
613 // FIXME: Should this check descendent objects?
614 for (RenderObject* current = firstLetter->firstChild(); current; current = current->nextSibling()) {
615 if (current->isText())
616 return toRenderText(current);
621 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
623 if (renderer->firstLetter()) {
624 RenderObject* r = renderer->firstLetter();
625 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
627 if (RenderText* firstLetter = firstRenderTextInFirstLetter(r)) {
628 m_handledFirstLetter = true;
629 m_remainingTextBox = m_textBox;
630 m_textBox = firstLetter->firstTextBox();
631 m_firstLetterText = firstLetter;
634 m_handledFirstLetter = true;
637 bool TextIterator::handleReplacedElement()
639 if (m_fullyClippedStack.top())
642 RenderObject* renderer = m_node->renderer();
643 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
646 if (m_lastTextNodeEndedWithCollapsedSpace) {
647 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
651 if (m_entersTextControls && renderer->isTextControl()) {
652 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->textFormControlElement()->innerTextElement()) {
653 m_node = innerTextElement->shadowTreeRootNode();
654 pushFullyClippedState(m_fullyClippedStack, m_node);
662 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
663 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
667 if (m_emitsCharactersBetweenAllVisiblePositions) {
668 // We want replaced elements to behave like punctuation for boundary
669 // finding, and to simply take up space for the selection preservation
670 // code in moveParagraphs, so we use a comma.
671 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
675 m_positionNode = m_node->parentNode();
676 m_positionOffsetBaseNode = m_node;
677 m_positionStartOffset = 0;
678 m_positionEndOffset = 1;
680 m_textCharacters = 0;
688 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
690 if (renderer->style()->visibility() == VISIBLE)
692 if (renderer->isTextFragment()) {
693 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
694 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
700 static bool shouldEmitTabBeforeNode(Node* node)
702 RenderObject* r = node->renderer();
704 // Table cells are delimited by tabs.
705 if (!r || !isTableCell(node))
708 // Want a tab before every cell other than the first one
709 RenderTableCell* rc = toRenderTableCell(r);
710 RenderTable* t = rc->table();
711 return t && (t->cellBefore(rc) || t->cellAbove(rc));
714 static bool shouldEmitNewlineForNode(Node* node)
716 // br elements are represented by a single newline.
717 RenderObject* r = node->renderer();
719 return node->hasTagName(brTag);
724 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
726 // Block flow (versus inline flow) is represented by having
727 // a newline both before and after the element.
728 RenderObject* r = node->renderer();
730 return (node->hasTagName(blockquoteTag)
731 || node->hasTagName(ddTag)
732 || node->hasTagName(divTag)
733 || node->hasTagName(dlTag)
734 || node->hasTagName(dtTag)
735 || node->hasTagName(h1Tag)
736 || node->hasTagName(h2Tag)
737 || node->hasTagName(h3Tag)
738 || node->hasTagName(h4Tag)
739 || node->hasTagName(h5Tag)
740 || node->hasTagName(h6Tag)
741 || node->hasTagName(hrTag)
742 || node->hasTagName(liTag)
743 || node->hasTagName(listingTag)
744 || node->hasTagName(olTag)
745 || node->hasTagName(pTag)
746 || node->hasTagName(preTag)
747 || node->hasTagName(trTag)
748 || node->hasTagName(ulTag));
751 // Need to make an exception for table cells, because they are blocks, but we
752 // want them tab-delimited rather than having newlines before and after.
753 if (isTableCell(node))
756 // Need to make an exception for table row elements, because they are neither
757 // "inline" or "RenderBlock", but we want newlines for them.
758 if (r->isTableRow()) {
759 RenderTable* t = toRenderTableRow(r)->table();
760 if (t && !t->isInline())
764 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
767 static bool shouldEmitNewlineAfterNode(Node* node)
769 // FIXME: It should be better but slower to create a VisiblePosition here.
770 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
772 // Check if this is the very last renderer in the document.
773 // If so, then we should not emit a newline.
774 while ((node = node->traverseNextSibling()))
775 if (node->renderer())
780 static bool shouldEmitNewlineBeforeNode(Node* node)
782 return shouldEmitNewlinesBeforeAndAfterNode(node);
785 static bool shouldEmitExtraNewlineForNode(Node* node)
787 // When there is a significant collapsed bottom margin, emit an extra
788 // newline for a more realistic result. We end up getting the right
789 // result even without margin collapsing. For example: <div><p>text</p></div>
790 // will work right even if both the <div> and the <p> have bottom margins.
791 RenderObject* r = node->renderer();
792 if (!r || !r->isBox())
795 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
796 // not to do this at all
797 if (node->hasTagName(h1Tag)
798 || node->hasTagName(h2Tag)
799 || node->hasTagName(h3Tag)
800 || node->hasTagName(h4Tag)
801 || node->hasTagName(h5Tag)
802 || node->hasTagName(h6Tag)
803 || node->hasTagName(pTag)) {
804 RenderStyle* style = r->style();
806 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
807 int fontSize = style->fontDescription().computedPixelSize();
808 if (bottomMargin * 2 >= fontSize)
816 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
818 const UChar* characters = renderer->text()->characters();
819 int length = renderer->text()->length();
820 for (int i = textEnd; i < length; ++i) {
821 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
825 return length - textEnd;
828 static int maxOffsetIncludingCollapsedSpaces(Node* node)
830 int offset = caretMaxOffset(node);
832 if (node->renderer() && node->renderer()->isText())
833 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
838 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
839 bool TextIterator::shouldRepresentNodeOffsetZero()
841 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
844 // Leave element positioned flush with start of a paragraph
845 // (e.g. do not insert tab before a table cell at the start of a paragraph)
846 if (m_lastCharacter == '\n')
849 // Otherwise, show the position if we have emitted any characters
853 // We've not emitted anything yet. Generally, there is no need for any positioning then.
854 // The only exception is when the element is visually not in the same line as
855 // the start of the range (e.g. the range starts at the end of the previous paragraph).
856 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
857 // make quicker checks to possibly avoid that. Another check that we could make is
858 // is whether the inline vs block flow changed since the previous visible element.
859 // I think we're already in a special enough case that that won't be needed, tho.
861 // No character needed if this is the first node in the range.
862 if (m_node == m_startContainer)
865 // If we are outside the start container's subtree, assume we need to emit.
866 // FIXME: m_startContainer could be an inline block
867 if (!m_node->isDescendantOf(m_startContainer))
870 // If we started as m_startContainer offset 0 and the current node is a descendant of
871 // the start container, we already had enough context to correctly decide whether to
872 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
873 // so don't second guess that now.
874 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
875 // immaterial since we likely would have already emitted something by now.
876 if (m_startOffset == 0)
879 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
880 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
881 // we would create VisiblePositions on every call to this function without this check.
882 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
885 // The startPos.isNotNull() check is needed because the start could be before the body,
886 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
887 // The currPos.isNotNull() check is needed because positions in non-HTML content
888 // (like SVG) do not have visible positions, and we don't want to emit for them either.
889 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
890 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
891 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
894 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
896 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
899 void TextIterator::representNodeOffsetZero()
901 // Emit a character to show the positioning of m_node.
903 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
904 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
905 // on m_node to see if it necessitates emitting a character first and will early return
906 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
907 if (shouldEmitTabBeforeNode(m_node)) {
908 if (shouldRepresentNodeOffsetZero())
909 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
910 } else if (shouldEmitNewlineBeforeNode(m_node)) {
911 if (shouldRepresentNodeOffsetZero())
912 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
913 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
914 if (shouldRepresentNodeOffsetZero())
915 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
919 bool TextIterator::handleNonTextNode()
921 if (shouldEmitNewlineForNode(m_node))
922 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
923 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
924 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
926 representNodeOffsetZero();
931 void TextIterator::exitNode()
933 // prevent emitting a newline when exiting a collapsed block at beginning of the range
934 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
935 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
936 // therefore look like a blank line.
940 // Emit with a position *inside* m_node, after m_node's contents, in
941 // case it is a block, because the run should start where the
942 // emitted character is positioned visually.
943 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
944 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
945 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
946 // TextIterator in _web_attributedStringFromRange.
947 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
948 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
949 // use extra newline to represent margin bottom, as needed
950 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
952 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
953 // contain a VisiblePosition when doing selection preservation.
954 if (m_lastCharacter != '\n') {
955 // insert a newline with a position following this block's contents.
956 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
957 // remember whether to later add a newline for the current node
958 ASSERT(!m_needsAnotherNewline);
959 m_needsAnotherNewline = addNewline;
960 } else if (addNewline)
961 // insert a newline with a position following this block's contents.
962 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
965 // If nothing was emitted, see if we need to emit a space.
966 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
967 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
970 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
974 // remember information with which to construct the TextIterator::range()
975 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
976 m_positionNode = textNode;
977 m_positionOffsetBaseNode = offsetBaseNode;
978 m_positionStartOffset = textStartOffset;
979 m_positionEndOffset = textEndOffset;
981 // remember information with which to construct the TextIterator::characters() and length()
982 m_singleCharacterBuffer = c;
983 m_textCharacters = &m_singleCharacterBuffer;
986 // remember some iteration state
987 m_lastTextNodeEndedWithCollapsedSpace = false;
991 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
993 RenderText* renderer = toRenderText(renderObject);
994 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
995 ASSERT(m_text.characters());
996 ASSERT(0 <= textStartOffset && textStartOffset < static_cast<int>(m_text.length()));
997 ASSERT(0 <= textEndOffset && textEndOffset <= static_cast<int>(m_text.length()));
998 ASSERT(textStartOffset <= textEndOffset);
1000 m_positionNode = textNode;
1001 m_positionOffsetBaseNode = 0;
1002 m_positionStartOffset = textStartOffset;
1003 m_positionEndOffset = textEndOffset;
1004 m_textCharacters = m_text.characters() + textStartOffset;
1005 m_textLength = textEndOffset - textStartOffset;
1006 m_lastCharacter = m_text[textEndOffset - 1];
1008 m_lastTextNodeEndedWithCollapsedSpace = false;
1009 m_hasEmitted = true;
1012 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1014 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1017 PassRefPtr<Range> TextIterator::range() const
1019 // use the current run information, if we have it
1020 if (m_positionNode) {
1021 if (m_positionOffsetBaseNode) {
1022 int index = m_positionOffsetBaseNode->nodeIndex();
1023 m_positionStartOffset += index;
1024 m_positionEndOffset += index;
1025 m_positionOffsetBaseNode = 0;
1027 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1030 // otherwise, return the end of the overall range we were given
1032 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1037 Node* TextIterator::node() const
1039 RefPtr<Range> textRange = range();
1043 Node* node = textRange->startContainer();
1046 if (node->offsetInCharacters())
1049 return node->childNode(textRange->startOffset());
1054 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1055 : m_behavior(TextIteratorDefaultBehavior)
1058 , m_handledNode(false)
1059 , m_handledChildren(false)
1065 , m_positionStartOffset(0)
1066 , m_positionEndOffset(0)
1067 , m_textCharacters(0)
1070 , m_lastCharacter(0)
1071 , m_singleCharacterBuffer(0)
1072 , m_havePassedStartNode(false)
1073 , m_shouldHandleFirstLetter(false)
1077 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1078 : m_behavior(behavior)
1081 , m_handledNode(false)
1082 , m_handledChildren(false)
1088 , m_positionStartOffset(0)
1089 , m_positionEndOffset(0)
1090 , m_textCharacters(0)
1093 , m_lastCharacter(0)
1094 , m_singleCharacterBuffer(0)
1095 , m_havePassedStartNode(false)
1096 , m_shouldHandleFirstLetter(false)
1098 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1103 Node* startNode = r->startContainer();
1106 Node* endNode = r->endContainer();
1107 int startOffset = r->startOffset();
1108 int endOffset = r->endOffset();
1110 if (!startNode->offsetInCharacters()) {
1111 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1112 startNode = startNode->childNode(startOffset);
1116 if (!endNode->offsetInCharacters()) {
1117 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1118 endNode = endNode->childNode(endOffset - 1);
1119 endOffset = lastOffsetInNode(endNode);
1124 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1125 m_offset = endOffset;
1126 m_handledNode = false;
1127 m_handledChildren = endOffset == 0;
1129 m_startNode = startNode;
1130 m_startOffset = startOffset;
1131 m_endNode = endNode;
1132 m_endOffset = endOffset;
1135 // Need this just because of the assert.
1136 m_positionNode = endNode;
1140 m_lastCharacter = '\n';
1142 m_havePassedStartNode = false;
1147 void SimplifiedBackwardsTextIterator::advance()
1149 ASSERT(m_positionNode);
1154 while (m_node && !m_havePassedStartNode) {
1155 // Don't handle node if we start iterating at [node, 0].
1156 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1157 RenderObject* renderer = m_node->renderer();
1158 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1159 // FIXME: What about CDATA_SECTION_NODE?
1160 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1161 m_handledNode = handleTextNode();
1162 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1163 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1164 m_handledNode = handleReplacedElement();
1166 m_handledNode = handleNonTextNode();
1171 if (!m_handledChildren && m_node->hasChildNodes()) {
1172 m_node = m_node->lastChild();
1173 pushFullyClippedState(m_fullyClippedStack, m_node);
1175 // Exit empty containers as we pass over them or containers
1176 // where [container, 0] is where we started iterating.
1178 && canHaveChildrenForEditing(m_node)
1179 && m_node->parentNode()
1180 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1182 if (m_positionNode) {
1183 m_handledNode = true;
1184 m_handledChildren = true;
1189 // Exit all other containers.
1190 while (!m_node->previousSibling()) {
1191 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1193 m_fullyClippedStack.pop();
1195 if (m_positionNode) {
1196 m_handledNode = true;
1197 m_handledChildren = true;
1202 m_fullyClippedStack.pop();
1203 if (advanceRespectingRange(m_node->previousSibling()))
1204 pushFullyClippedState(m_fullyClippedStack, m_node);
1209 // For the purpose of word boundary detection,
1210 // we should iterate all visible text and trailing (collapsed) whitespaces.
1211 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1212 m_handledNode = false;
1213 m_handledChildren = false;
1220 bool SimplifiedBackwardsTextIterator::handleTextNode()
1222 m_lastTextNode = m_node;
1226 RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1230 String text = renderer->text();
1231 if (!renderer->firstTextBox() && text.length() > 0)
1234 m_positionEndOffset = m_offset;
1235 m_offset = startOffset + offsetInNode;
1236 m_positionNode = m_node;
1237 m_positionStartOffset = m_offset;
1239 ASSERT(0 <= m_positionStartOffset - offsetInNode && m_positionStartOffset - offsetInNode <= static_cast<int>(text.length()));
1240 ASSERT(1 <= m_positionEndOffset - offsetInNode && m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1241 ASSERT(m_positionStartOffset <= m_positionEndOffset);
1243 m_textLength = m_positionEndOffset - m_positionStartOffset;
1244 m_textCharacters = text.characters() + (m_positionStartOffset - offsetInNode);
1245 ASSERT(m_textCharacters >= text.characters());
1246 ASSERT(m_textCharacters + m_textLength <= text.characters() + static_cast<int>(text.length()));
1248 m_lastCharacter = text[m_positionEndOffset - 1];
1250 return !m_shouldHandleFirstLetter;
1253 RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1255 RenderText* renderer = toRenderText(m_node->renderer());
1256 startOffset = (m_node == m_startNode) ? m_startOffset : 0;
1258 if (!renderer->isTextFragment()) {
1263 RenderTextFragment* fragment = toRenderTextFragment(renderer);
1264 int offsetAfterFirstLetter = fragment->start();
1265 if (startOffset >= offsetAfterFirstLetter) {
1266 ASSERT(!m_shouldHandleFirstLetter);
1267 offsetInNode = offsetAfterFirstLetter;
1271 if (!m_shouldHandleFirstLetter && offsetAfterFirstLetter < m_offset) {
1272 m_shouldHandleFirstLetter = true;
1273 offsetInNode = offsetAfterFirstLetter;
1277 m_shouldHandleFirstLetter = false;
1279 return firstRenderTextInFirstLetter(fragment->firstLetter());
1282 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1284 unsigned index = m_node->nodeIndex();
1285 // We want replaced elements to behave like punctuation for boundary
1286 // finding, and to simply take up space for the selection preservation
1287 // code in moveParagraphs, so we use a comma. Unconditionally emit
1288 // here because this iterator is only used for boundary finding.
1289 emitCharacter(',', m_node->parentNode(), index, index + 1);
1293 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1295 // We can use a linefeed in place of a tab because this simple iterator is only used to
1296 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1297 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1298 unsigned index = m_node->nodeIndex();
1299 // The start of this emitted range is wrong. Ensuring correctness would require
1300 // VisiblePositions and so would be slow. previousBoundary expects this.
1301 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1306 void SimplifiedBackwardsTextIterator::exitNode()
1308 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1309 // The start of this emitted range is wrong. Ensuring correctness would require
1310 // VisiblePositions and so would be slow. previousBoundary expects this.
1311 emitCharacter('\n', m_node, 0, 0);
1315 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1317 m_singleCharacterBuffer = c;
1318 m_positionNode = node;
1319 m_positionStartOffset = startOffset;
1320 m_positionEndOffset = endOffset;
1321 m_textCharacters = &m_singleCharacterBuffer;
1323 m_lastCharacter = c;
1326 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1330 m_havePassedStartNode |= m_node == m_startNode;
1331 if (m_havePassedStartNode)
1337 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1340 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1342 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1347 CharacterIterator::CharacterIterator()
1354 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1358 , m_textIterator(r, behavior)
1360 while (!atEnd() && m_textIterator.length() == 0)
1361 m_textIterator.advance();
1364 PassRefPtr<Range> CharacterIterator::range() const
1366 RefPtr<Range> r = m_textIterator.range();
1367 if (!m_textIterator.atEnd()) {
1368 if (m_textIterator.length() <= 1) {
1369 ASSERT(m_runOffset == 0);
1371 Node* n = r->startContainer();
1372 ASSERT(n == r->endContainer());
1373 int offset = r->startOffset() + m_runOffset;
1374 ExceptionCode ec = 0;
1375 r->setStart(n, offset, ec);
1376 r->setEnd(n, offset + 1, ec);
1383 void CharacterIterator::advance(int count)
1392 // easy if there is enough left in the current m_textIterator run
1393 int remaining = m_textIterator.length() - m_runOffset;
1394 if (count < remaining) {
1395 m_runOffset += count;
1400 // exhaust the current m_textIterator run
1402 m_offset += remaining;
1404 // move to a subsequent m_textIterator run
1405 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1406 int runLength = m_textIterator.length();
1410 // see whether this is m_textIterator to use
1411 if (count < runLength) {
1412 m_runOffset = count;
1417 // exhaust this m_textIterator run
1419 m_offset += runLength;
1423 // ran to the end of the m_textIterator... no more runs left
1428 String CharacterIterator::string(int numChars)
1430 Vector<UChar> result;
1431 result.reserveInitialCapacity(numChars);
1432 while (numChars > 0 && !atEnd()) {
1433 int runSize = min(numChars, length());
1434 result.append(characters(), runSize);
1435 numChars -= runSize;
1438 return String::adopt(result);
1441 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1444 RefPtr<Range> start = it.range();
1447 it.advance(length - 1);
1448 RefPtr<Range> end = it.range();
1450 return Range::create(start->startContainer()->document(),
1451 start->startContainer(), start->startOffset(),
1452 end->endContainer(), end->endOffset());
1455 BackwardsCharacterIterator::BackwardsCharacterIterator()
1462 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1466 , m_textIterator(range, behavior)
1468 while (!atEnd() && !m_textIterator.length())
1469 m_textIterator.advance();
1472 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1474 RefPtr<Range> r = m_textIterator.range();
1475 if (!m_textIterator.atEnd()) {
1476 if (m_textIterator.length() <= 1)
1477 ASSERT(m_runOffset == 0);
1479 Node* n = r->startContainer();
1480 ASSERT(n == r->endContainer());
1481 int offset = r->endOffset() - m_runOffset;
1482 ExceptionCode ec = 0;
1483 r->setStart(n, offset - 1, ec);
1484 r->setEnd(n, offset, ec);
1491 void BackwardsCharacterIterator::advance(int count)
1500 int remaining = m_textIterator.length() - m_runOffset;
1501 if (count < remaining) {
1502 m_runOffset += count;
1508 m_offset += remaining;
1510 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1511 int runLength = m_textIterator.length();
1515 if (count < runLength) {
1516 m_runOffset = count;
1522 m_offset += runLength;
1532 WordAwareIterator::WordAwareIterator()
1534 , m_didLookAhead(false)
1538 WordAwareIterator::WordAwareIterator(const Range* r)
1540 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1543 advance(); // get in position over the first chunk of text
1546 WordAwareIterator::~WordAwareIterator()
1550 // We're always in one of these modes:
1551 // - The current chunk in the text iterator is our current chunk
1552 // (typically its a piece of whitespace, or text that ended with whitespace)
1553 // - The previous chunk in the text iterator is our current chunk
1554 // (we looked ahead to the next chunk and found a word boundary)
1555 // - We built up our own chunk of text from many chunks from the text iterator
1557 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1559 void WordAwareIterator::advance()
1562 m_buffer.clear(); // toss any old buffer we built up
1564 // If last time we did a look-ahead, start with that looked-ahead chunk now
1565 if (!m_didLookAhead) {
1566 ASSERT(!m_textIterator.atEnd());
1567 m_textIterator.advance();
1569 m_didLookAhead = false;
1571 // Go to next non-empty chunk
1572 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1573 m_textIterator.advance();
1574 m_range = m_textIterator.range();
1576 if (m_textIterator.atEnd())
1580 // If this chunk ends in whitespace we can just use it as our chunk.
1581 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1584 // If this is the first chunk that failed, save it in previousText before look ahead
1585 if (m_buffer.isEmpty()) {
1586 m_previousText = m_textIterator.characters();
1587 m_previousLength = m_textIterator.length();
1590 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1591 m_textIterator.advance();
1592 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1593 m_didLookAhead = true;
1597 if (m_buffer.isEmpty()) {
1598 // Start gobbling chunks until we get to a suitable stopping point
1599 m_buffer.append(m_previousText, m_previousLength);
1602 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1604 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1608 int WordAwareIterator::length() const
1610 if (!m_buffer.isEmpty())
1611 return m_buffer.size();
1613 return m_previousLength;
1614 return m_textIterator.length();
1617 const UChar* WordAwareIterator::characters() const
1619 if (!m_buffer.isEmpty())
1620 return m_buffer.data();
1622 return m_previousText;
1623 return m_textIterator.characters();
1628 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1631 case hebrewPunctuationGershayim:
1632 case leftDoubleQuotationMark:
1633 case rightDoubleQuotationMark:
1635 case hebrewPunctuationGeresh:
1636 case leftSingleQuotationMark:
1637 case rightSingleQuotationMark:
1640 // Replace soft hyphen with an ignorable character so that their presence or absence will
1641 // not affect string comparison.
1648 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1650 s.replace(hebrewPunctuationGeresh, '\'');
1651 s.replace(hebrewPunctuationGershayim, '"');
1652 s.replace(leftDoubleQuotationMark, '"');
1653 s.replace(leftSingleQuotationMark, '\'');
1654 s.replace(rightDoubleQuotationMark, '"');
1655 s.replace(rightSingleQuotationMark, '\'');
1656 // Replace soft hyphen with an ignorable character so that their presence or absence will
1657 // not affect string comparison.
1658 s.replace(softHyphen, 0);
1661 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1663 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1665 for (size_t i = 0; i < length; ++i)
1666 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1669 static const size_t minimumSearchBufferSize = 8192;
1672 static bool searcherInUse;
1675 static UStringSearch* createSearcher()
1677 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1678 // but it doesn't matter exactly what it is, since we don't perform any searches
1679 // without setting both the pattern and the text.
1680 UErrorCode status = U_ZERO_ERROR;
1681 String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search");
1682 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status);
1683 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1687 static UStringSearch* searcher()
1689 static UStringSearch* searcher = createSearcher();
1693 static inline void lockSearcher()
1696 ASSERT(!searcherInUse);
1697 searcherInUse = true;
1701 static inline void unlockSearcher()
1704 ASSERT(searcherInUse);
1705 searcherInUse = false;
1709 // ICU's search ignores the distinction between small kana letters and ones
1710 // that are not small, and also characters that differ only in the voicing
1711 // marks when considering only primary collation strength diffrences.
1712 // This is not helpful for end users, since these differences make words
1713 // distinct, so for our purposes we need these to be considered.
1714 // The Unicode folks do not think the collation algorithm should be
1715 // changed. To work around this, we would like to tailor the ICU searcher,
1716 // but we can't get that to work yet. So instead, we check for cases where
1717 // these differences occur, and skip those matches.
1719 // We refer to the above technique as the "kana workaround". The next few
1720 // functions are helper functinos for the kana workaround.
1722 static inline bool isKanaLetter(UChar character)
1724 // Hiragana letters.
1725 if (character >= 0x3041 && character <= 0x3096)
1728 // Katakana letters.
1729 if (character >= 0x30A1 && character <= 0x30FA)
1731 if (character >= 0x31F0 && character <= 0x31FF)
1734 // Halfwidth katakana letters.
1735 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1741 static inline bool isSmallKanaLetter(UChar character)
1743 ASSERT(isKanaLetter(character));
1745 switch (character) {
1746 case 0x3041: // HIRAGANA LETTER SMALL A
1747 case 0x3043: // HIRAGANA LETTER SMALL I
1748 case 0x3045: // HIRAGANA LETTER SMALL U
1749 case 0x3047: // HIRAGANA LETTER SMALL E
1750 case 0x3049: // HIRAGANA LETTER SMALL O
1751 case 0x3063: // HIRAGANA LETTER SMALL TU
1752 case 0x3083: // HIRAGANA LETTER SMALL YA
1753 case 0x3085: // HIRAGANA LETTER SMALL YU
1754 case 0x3087: // HIRAGANA LETTER SMALL YO
1755 case 0x308E: // HIRAGANA LETTER SMALL WA
1756 case 0x3095: // HIRAGANA LETTER SMALL KA
1757 case 0x3096: // HIRAGANA LETTER SMALL KE
1758 case 0x30A1: // KATAKANA LETTER SMALL A
1759 case 0x30A3: // KATAKANA LETTER SMALL I
1760 case 0x30A5: // KATAKANA LETTER SMALL U
1761 case 0x30A7: // KATAKANA LETTER SMALL E
1762 case 0x30A9: // KATAKANA LETTER SMALL O
1763 case 0x30C3: // KATAKANA LETTER SMALL TU
1764 case 0x30E3: // KATAKANA LETTER SMALL YA
1765 case 0x30E5: // KATAKANA LETTER SMALL YU
1766 case 0x30E7: // KATAKANA LETTER SMALL YO
1767 case 0x30EE: // KATAKANA LETTER SMALL WA
1768 case 0x30F5: // KATAKANA LETTER SMALL KA
1769 case 0x30F6: // KATAKANA LETTER SMALL KE
1770 case 0x31F0: // KATAKANA LETTER SMALL KU
1771 case 0x31F1: // KATAKANA LETTER SMALL SI
1772 case 0x31F2: // KATAKANA LETTER SMALL SU
1773 case 0x31F3: // KATAKANA LETTER SMALL TO
1774 case 0x31F4: // KATAKANA LETTER SMALL NU
1775 case 0x31F5: // KATAKANA LETTER SMALL HA
1776 case 0x31F6: // KATAKANA LETTER SMALL HI
1777 case 0x31F7: // KATAKANA LETTER SMALL HU
1778 case 0x31F8: // KATAKANA LETTER SMALL HE
1779 case 0x31F9: // KATAKANA LETTER SMALL HO
1780 case 0x31FA: // KATAKANA LETTER SMALL MU
1781 case 0x31FB: // KATAKANA LETTER SMALL RA
1782 case 0x31FC: // KATAKANA LETTER SMALL RI
1783 case 0x31FD: // KATAKANA LETTER SMALL RU
1784 case 0x31FE: // KATAKANA LETTER SMALL RE
1785 case 0x31FF: // KATAKANA LETTER SMALL RO
1786 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1787 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1788 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1789 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1790 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1791 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1792 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1793 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1794 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1800 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1802 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1804 ASSERT(isKanaLetter(character));
1806 switch (character) {
1807 case 0x304C: // HIRAGANA LETTER GA
1808 case 0x304E: // HIRAGANA LETTER GI
1809 case 0x3050: // HIRAGANA LETTER GU
1810 case 0x3052: // HIRAGANA LETTER GE
1811 case 0x3054: // HIRAGANA LETTER GO
1812 case 0x3056: // HIRAGANA LETTER ZA
1813 case 0x3058: // HIRAGANA LETTER ZI
1814 case 0x305A: // HIRAGANA LETTER ZU
1815 case 0x305C: // HIRAGANA LETTER ZE
1816 case 0x305E: // HIRAGANA LETTER ZO
1817 case 0x3060: // HIRAGANA LETTER DA
1818 case 0x3062: // HIRAGANA LETTER DI
1819 case 0x3065: // HIRAGANA LETTER DU
1820 case 0x3067: // HIRAGANA LETTER DE
1821 case 0x3069: // HIRAGANA LETTER DO
1822 case 0x3070: // HIRAGANA LETTER BA
1823 case 0x3073: // HIRAGANA LETTER BI
1824 case 0x3076: // HIRAGANA LETTER BU
1825 case 0x3079: // HIRAGANA LETTER BE
1826 case 0x307C: // HIRAGANA LETTER BO
1827 case 0x3094: // HIRAGANA LETTER VU
1828 case 0x30AC: // KATAKANA LETTER GA
1829 case 0x30AE: // KATAKANA LETTER GI
1830 case 0x30B0: // KATAKANA LETTER GU
1831 case 0x30B2: // KATAKANA LETTER GE
1832 case 0x30B4: // KATAKANA LETTER GO
1833 case 0x30B6: // KATAKANA LETTER ZA
1834 case 0x30B8: // KATAKANA LETTER ZI
1835 case 0x30BA: // KATAKANA LETTER ZU
1836 case 0x30BC: // KATAKANA LETTER ZE
1837 case 0x30BE: // KATAKANA LETTER ZO
1838 case 0x30C0: // KATAKANA LETTER DA
1839 case 0x30C2: // KATAKANA LETTER DI
1840 case 0x30C5: // KATAKANA LETTER DU
1841 case 0x30C7: // KATAKANA LETTER DE
1842 case 0x30C9: // KATAKANA LETTER DO
1843 case 0x30D0: // KATAKANA LETTER BA
1844 case 0x30D3: // KATAKANA LETTER BI
1845 case 0x30D6: // KATAKANA LETTER BU
1846 case 0x30D9: // KATAKANA LETTER BE
1847 case 0x30DC: // KATAKANA LETTER BO
1848 case 0x30F4: // KATAKANA LETTER VU
1849 case 0x30F7: // KATAKANA LETTER VA
1850 case 0x30F8: // KATAKANA LETTER VI
1851 case 0x30F9: // KATAKANA LETTER VE
1852 case 0x30FA: // KATAKANA LETTER VO
1853 return VoicedSoundMark;
1854 case 0x3071: // HIRAGANA LETTER PA
1855 case 0x3074: // HIRAGANA LETTER PI
1856 case 0x3077: // HIRAGANA LETTER PU
1857 case 0x307A: // HIRAGANA LETTER PE
1858 case 0x307D: // HIRAGANA LETTER PO
1859 case 0x30D1: // KATAKANA LETTER PA
1860 case 0x30D4: // KATAKANA LETTER PI
1861 case 0x30D7: // KATAKANA LETTER PU
1862 case 0x30DA: // KATAKANA LETTER PE
1863 case 0x30DD: // KATAKANA LETTER PO
1864 return SemiVoicedSoundMark;
1866 return NoVoicedSoundMark;
1869 static inline bool isCombiningVoicedSoundMark(UChar character)
1871 switch (character) {
1872 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1873 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1879 static inline bool containsKanaLetters(const String& pattern)
1881 const UChar* characters = pattern.characters();
1882 unsigned length = pattern.length();
1883 for (unsigned i = 0; i < length; ++i) {
1884 if (isKanaLetter(characters[i]))
1890 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1894 buffer.resize(length);
1896 UErrorCode status = U_ZERO_ERROR;
1897 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1898 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1901 buffer.resize(bufferSize);
1903 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1906 status = U_ZERO_ERROR;
1907 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1908 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1911 static bool isNonLatin1Separator(UChar32 character)
1913 ASSERT_ARG(character, character >= 256);
1915 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1918 static inline bool isSeparator(UChar32 character)
1920 static const bool latin1SeparatorTable[256] = {
1921 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1922 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1924 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1925 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1926 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1927 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1928 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1929 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1930 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1931 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1932 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1934 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1935 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1936 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1939 if (character < 256)
1940 return latin1SeparatorTable[character];
1942 return isNonLatin1Separator(character);
1945 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1947 , m_options(options)
1950 , m_needsMoreContext(options & AtWordStarts)
1951 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1953 ASSERT(!m_target.isEmpty());
1955 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1956 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1957 // to add tailoring on top of the locale-specific tailoring as of this writing.
1958 foldQuoteMarksAndSoftHyphens(m_target);
1960 size_t targetLength = m_target.length();
1961 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1962 m_overlap = m_buffer.capacity() / 4;
1964 if ((m_options & AtWordStarts) && targetLength) {
1965 UChar32 targetFirstCharacter;
1966 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1967 // Characters in the separator category never really occur at the beginning of a word,
1968 // so if the target begins with such a character, we just ignore the AtWordStart option.
1969 if (isSeparator(targetFirstCharacter)) {
1970 m_options &= ~AtWordStarts;
1971 m_needsMoreContext = false;
1975 // Grab the single global searcher.
1976 // If we ever have a reason to do more than once search buffer at once, we'll have
1977 // to move to multiple searchers.
1980 UStringSearch* searcher = WebCore::searcher();
1981 UCollator* collator = usearch_getCollator(searcher);
1983 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1984 if (ucol_getStrength(collator) != strength) {
1985 ucol_setStrength(collator, strength);
1986 usearch_reset(searcher);
1989 UErrorCode status = U_ZERO_ERROR;
1990 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1991 ASSERT(status == U_ZERO_ERROR);
1993 // The kana workaround requires a normalized copy of the target string.
1994 if (m_targetRequiresKanaWorkaround)
1995 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1998 inline SearchBuffer::~SearchBuffer()
2000 // Leave the static object pointing to a valid string.
2001 UErrorCode status = U_ZERO_ERROR;
2002 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
2003 ASSERT(status == U_ZERO_ERROR);
2008 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2016 } else if (m_buffer.size() == m_buffer.capacity()) {
2017 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2018 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2019 m_buffer.shrink(m_overlap);
2022 size_t oldLength = m_buffer.size();
2023 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2024 ASSERT(usableLength);
2025 m_buffer.append(characters, usableLength);
2026 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2027 return usableLength;
2030 inline bool SearchBuffer::needsMoreContext() const
2032 return m_needsMoreContext;
2035 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2037 ASSERT(m_needsMoreContext);
2038 ASSERT(m_prefixLength == m_buffer.size());
2045 size_t wordBoundaryContextStart = length;
2046 if (wordBoundaryContextStart) {
2047 U16_BACK_1(characters, 0, wordBoundaryContextStart);
2048 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2051 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2052 m_buffer.prepend(characters + length - usableLength, usableLength);
2053 m_prefixLength += usableLength;
2055 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2056 m_needsMoreContext = false;
2059 inline bool SearchBuffer::atBreak() const
2064 inline void SearchBuffer::reachedBreak()
2069 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2071 // This function implements the kana workaround. If usearch treats
2072 // it as a match, but we do not want to, then it's a "bad match".
2073 if (!m_targetRequiresKanaWorkaround)
2076 // Normalize into a match buffer. We reuse a single buffer rather than
2077 // creating a new one each time.
2078 normalizeCharacters(match, matchLength, m_normalizedMatch);
2080 const UChar* a = m_normalizedTarget.begin();
2081 const UChar* aEnd = m_normalizedTarget.end();
2083 const UChar* b = m_normalizedMatch.begin();
2084 const UChar* bEnd = m_normalizedMatch.end();
2087 // Skip runs of non-kana-letter characters. This is necessary so we can
2088 // correctly handle strings where the target and match have different-length
2089 // runs of characters that match, while still double checking the correctness
2090 // of matches of kana letters with other kana letters.
2091 while (a != aEnd && !isKanaLetter(*a))
2093 while (b != bEnd && !isKanaLetter(*b))
2096 // If we reached the end of either the target or the match, we should have
2097 // reached the end of both; both should have the same number of kana letters.
2098 if (a == aEnd || b == bEnd) {
2104 // Check for differences in the kana letter character itself.
2105 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2107 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2112 // Check for differences in combining voiced sound marks found after the letter.
2114 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2115 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2119 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2129 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2131 ASSERT(m_options & AtWordStarts);
2136 int size = m_buffer.size();
2138 UChar32 firstCharacter;
2139 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2141 if (m_options & TreatMedialCapitalAsWordStart) {
2142 UChar32 previousCharacter;
2143 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2145 if (isSeparator(firstCharacter)) {
2146 // The start of a separator run is a word start (".org" in "webkit.org").
2147 if (!isSeparator(previousCharacter))
2149 } else if (isASCIIUpper(firstCharacter)) {
2150 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2151 if (!isASCIIUpper(previousCharacter))
2153 // The last character of an uppercase run followed by a non-separator, non-digit
2154 // is a word start ("Request" in "XMLHTTPRequest").
2156 U16_FWD_1(m_buffer.data(), offset, size);
2157 UChar32 nextCharacter = 0;
2159 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2160 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2162 } else if (isASCIIDigit(firstCharacter)) {
2163 // The start of a digit run is a word start ("2" in "WebKit2").
2164 if (!isASCIIDigit(previousCharacter))
2166 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2167 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2168 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2173 // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
2174 // a word, so treat the position before any CJK character as a word start.
2175 if (Font::isCJKIdeographOrSymbol(firstCharacter))
2178 size_t wordBreakSearchStart = start + length;
2179 while (wordBreakSearchStart > start)
2180 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2181 return wordBreakSearchStart == start;
2184 inline size_t SearchBuffer::search(size_t& start)
2186 size_t size = m_buffer.size();
2191 if (size != m_buffer.capacity())
2195 UStringSearch* searcher = WebCore::searcher();
2197 UErrorCode status = U_ZERO_ERROR;
2198 usearch_setText(searcher, m_buffer.data(), size, &status);
2199 ASSERT(status == U_ZERO_ERROR);
2201 usearch_setOffset(searcher, m_prefixLength, &status);
2202 ASSERT(status == U_ZERO_ERROR);
2204 int matchStart = usearch_next(searcher, &status);
2205 ASSERT(status == U_ZERO_ERROR);
2208 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2209 ASSERT(matchStart == USEARCH_DONE);
2213 // Matches that start in the overlap area are only tentative.
2214 // The same match may appear later, matching more characters,
2215 // possibly including a combining character that's not yet in the buffer.
2216 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2217 size_t overlap = m_overlap;
2218 if (m_options & AtWordStarts) {
2219 // Ensure that there is sufficient context before matchStart the next time around for
2220 // determining if it is at a word boundary.
2221 int wordBoundaryContextStart = matchStart;
2222 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2223 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2224 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2226 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2227 m_prefixLength -= min(m_prefixLength, size - overlap);
2228 m_buffer.shrink(overlap);
2232 size_t matchedLength = usearch_getMatchedLength(searcher);
2233 ASSERT(matchStart + matchedLength <= size);
2235 // If this match is "bad", move on to the next match.
2236 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2237 matchStart = usearch_next(searcher, &status);
2238 ASSERT(status == U_ZERO_ERROR);
2242 size_t newSize = size - (matchStart + 1);
2243 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2244 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2245 m_buffer.shrink(newSize);
2247 start = size - matchStart;
2248 return matchedLength;
2251 #else // !ICU_UNICODE
2253 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2254 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2255 , m_options(options)
2256 , m_buffer(m_target.length())
2257 , m_isCharacterStartBuffer(m_target.length())
2258 , m_isBufferFull(false)
2261 ASSERT(!m_target.isEmpty());
2262 m_target.replace(noBreakSpace, ' ');
2263 foldQuoteMarksAndSoftHyphens(m_target);
2266 inline SearchBuffer::~SearchBuffer()
2270 inline void SearchBuffer::reachedBreak()
2273 m_isBufferFull = false;
2276 inline bool SearchBuffer::atBreak() const
2278 return !m_cursor && !m_isBufferFull;
2281 inline void SearchBuffer::append(UChar c, bool isStart)
2283 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2284 m_isCharacterStartBuffer[m_cursor] = isStart;
2285 if (++m_cursor == m_target.length()) {
2287 m_isBufferFull = true;
2291 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2294 if (!(m_options & CaseInsensitive)) {
2295 append(characters[0], true);
2298 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2299 UChar foldedCharacters[maxFoldedCharacters];
2301 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2303 ASSERT(numFoldedCharacters);
2304 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2305 if (!error && numFoldedCharacters) {
2306 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2307 append(foldedCharacters[0], true);
2308 for (int i = 1; i < numFoldedCharacters; ++i)
2309 append(foldedCharacters[i], false);
2314 inline bool SearchBuffer::needsMoreContext() const
2319 void SearchBuffer::prependContext(const UChar*, size_t)
2321 ASSERT_NOT_REACHED();
2324 inline size_t SearchBuffer::search(size_t& start)
2326 if (!m_isBufferFull)
2328 if (!m_isCharacterStartBuffer[m_cursor])
2331 size_t tailSpace = m_target.length() - m_cursor;
2332 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2334 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2339 // Now that we've found a match once, we don't want to find it again, because those
2340 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2341 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2342 // we want to get rid of that in the future we could track this with a separate boolean
2343 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2344 m_isCharacterStartBuffer[m_cursor] = false;
2349 // Returns the number of characters that were appended to the buffer (what we are searching in).
2350 // That's not necessarily the same length as the passed-in target string, because case folding
2351 // can make two strings match even though they're not the same length.
2352 size_t SearchBuffer::length() const
2354 size_t bufferSize = m_target.length();
2356 for (size_t i = 0; i < bufferSize; ++i)
2357 length += m_isCharacterStartBuffer[i];
2361 #endif // !ICU_UNICODE
2365 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2368 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2369 length += it.length();
2374 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2376 CharacterIterator entireRangeIterator(entireRange);
2377 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2380 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2382 RefPtr<Range> resultRange = scope->document()->createRange();
2384 int docTextPosition = 0;
2385 int rangeEnd = rangeLocation + rangeLength;
2386 bool startRangeFound = false;
2388 RefPtr<Range> textRunRange;
2390 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2392 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2393 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2394 textRunRange = it.range();
2396 ExceptionCode ec = 0;
2397 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2399 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2402 return resultRange.release();
2405 for (; !it.atEnd(); it.advance()) {
2406 int len = it.length();
2407 textRunRange = it.range();
2409 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2410 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2412 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2413 // in those cases that textRunRange is used.
2415 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2416 // position for emitted '\n's.
2417 if (len == 1 && it.characters()[0] == '\n') {
2418 scope->document()->updateLayoutIgnorePendingStylesheets();
2421 RefPtr<Range> range = it.range();
2422 ExceptionCode ec = 0;
2423 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2426 Position runStart = textRunRange->startPosition();
2427 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2428 if (runEnd.isNotNull()) {
2429 ExceptionCode ec = 0;
2430 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2438 startRangeFound = true;
2440 if (textRunRange->startContainer()->isTextNode()) {
2441 int offset = rangeLocation - docTextPosition;
2442 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2444 if (rangeLocation == docTextPosition)
2445 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2447 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2453 if (textRunRange->startContainer()->isTextNode()) {
2454 int offset = rangeEnd - docTextPosition;
2455 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2457 if (rangeEnd == docTextPosition)
2458 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2460 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2462 docTextPosition += len;
2465 docTextPosition += len;
2468 if (!startRangeFound)
2471 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2473 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2476 return resultRange.release();
2479 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2481 location = notFound;
2484 if (!range->startContainer())
2487 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2488 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2490 // The critical assumption is that this only gets called with ranges that
2491 // concentrate on a given area containing the selection root. This is done
2492 // because of text fields and textareas. The DOM for those is not
2493 // directly in the document DOM, so ensure that the range does not cross a
2494 // boundary of one of those.
2495 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2497 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2500 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2501 ASSERT(testRange->startContainer() == scope);
2502 location = TextIterator::rangeLength(testRange.get());
2505 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2506 ASSERT(testRange->startContainer() == scope);
2507 length = TextIterator::rangeLength(testRange.get()) - location;
2513 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2517 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2518 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2519 static const unsigned cMaxSegmentSize = 1 << 16;
2521 typedef pair<UChar*, unsigned> TextSegment;
2522 OwnPtr<Vector<TextSegment> > textSegments;
2523 Vector<UChar> textBuffer;
2524 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2525 TextIteratorBehavior behavior = defaultBehavior;
2526 if (!isDisplayString)
2527 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2529 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2530 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2531 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2532 if (!newSegmentBuffer)
2534 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2536 textSegments = adoptPtr(new Vector<TextSegment>);
2537 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2540 textBuffer.append(it.characters(), it.length());
2541 bufferLength += it.length();
2547 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2548 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2553 UChar* resultPos = result;
2555 unsigned size = textSegments->size();
2556 for (unsigned i = 0; i < size; ++i) {
2557 const TextSegment& segment = textSegments->at(i);
2558 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2559 resultPos += segment.second;
2562 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2567 unsigned size = textSegments->size();
2568 for (unsigned i = 0; i < size; ++i)
2569 free(textSegments->at(i).first);
2572 if (isDisplayString && r->ownerDocument())
2573 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2578 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2581 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2584 String result(buf, length);
2589 static inline bool isAllCollapsibleWhitespace(const String& string)
2591 const UChar* characters = string.characters();
2592 unsigned length = string.length();
2593 for (unsigned i = 0; i < length; ++i) {
2594 if (!isCollapsibleWhitespace(characters[i]))
2600 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2602 ExceptionCode ec = 0;
2603 RefPtr<Range> result = range->cloneRange(ec);
2605 result->collapse(!forward, ec);
2607 return result.release();
2610 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2613 size_t matchLength = 0;
2615 SearchBuffer buffer(target, options);
2617 if (buffer.needsMoreContext()) {
2618 RefPtr<Range> startRange = it.range();
2619 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2620 ExceptionCode ec = 0;
2621 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2622 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2623 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2624 if (!buffer.needsMoreContext())
2629 while (!it.atEnd()) {
2630 it.advance(buffer.append(it.characters(), it.length()));
2632 size_t matchStartOffset;
2633 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2634 // Note that we found a match, and where we found it.
2635 size_t lastCharacterInBufferOffset = it.characterOffset();
2636 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2637 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2638 matchLength = newMatchLength;
2639 // If searching forward, stop on the first match.
2640 // If searching backward, don't stop, so we end up with the last match.
2641 if (!(options & Backwards))
2645 if (it.atBreak() && !buffer.atBreak()) {
2646 buffer.reachedBreak();
2654 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2656 // CharacterIterator requires renderers to be up-to-date
2657 range->ownerDocument()->updateLayout();
2659 // First, find the text.
2663 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2664 matchLength = findPlainText(findIterator, target, options, matchStart);
2666 return collapsedToBoundary(range, !(options & Backwards));
2669 // Then, find the document position of the start and the end of the text.
2670 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2671 return characterSubrange(computeRangeIterator, matchStart, matchLength);