code.vuplus.com Git - vuplus_webkit/blob - Source/WebCore/editing/TextIterator.cpp

   1 /*
   2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
   3  * Copyright (C) 2005 Alexey Proskuryakov.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "TextIterator.h"
  29
  30 #include "Document.h"
  31 #include "Frame.h"
  32 #include "HTMLElement.h"
  33 #include "HTMLTextFormControlElement.h"
  34 #include "HTMLNames.h"
  35 #include "htmlediting.h"
  36 #include "InlineTextBox.h"
  37 #include "Range.h"
  38 #include "RenderTableCell.h"
  39 #include "RenderTableRow.h"
  40 #include "RenderTextControl.h"
  41 #include "RenderTextFragment.h"
  42 #include "TextBoundaries.h"
  43 #include "TextBreakIterator.h"
  44 #include "VisiblePosition.h"
  45 #include "visible_units.h"
  46 #include <wtf/unicode/CharacterNames.h>
  47
  48 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
  49 #include "TextBreakIteratorInternalICU.h"
  50 #include <unicode/usearch.h>
  51 #endif
  52
  53 using namespace WTF::Unicode;
  54 using namespace std;
  55
  56 namespace WebCore {
  57
  58 using namespace HTMLNames;
  59
  60 // Buffer that knows how to compare with a search target.
  61 // Keeps enough of the previous text to be able to search in the future, but no more.
  62 // Non-breaking spaces are always equal to normal spaces.
  63 // Case folding is also done if the CaseInsensitive option is specified.
  64 // Matches are further filtered if the AtWordStarts option is specified, although some
  65 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
  66 class SearchBuffer {
  67     WTF_MAKE_NONCOPYABLE(SearchBuffer);
  68 public:
  69     SearchBuffer(const String& target, FindOptions);
  70     ~SearchBuffer();
  71
  72     // Returns number of characters appended; guaranteed to be in the range [1, length].
  73     size_t append(const UChar*, size_t length);
  74     bool needsMoreContext() const;
  75     void prependContext(const UChar*, size_t length);
  76     void reachedBreak();
  77
  78     // Result is the size in characters of what was found.
  79     // And <startOffset> is the number of characters back to the start of what was found.
  80     size_t search(size_t& startOffset);
  81     bool atBreak() const;
  82
  83 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
  84
  85 private:
  86     bool isBadMatch(const UChar*, size_t length) const;
  87     bool isWordStartMatch(size_t start, size_t length) const;
  88
  89     String m_target;
  90     FindOptions m_options;
  91
  92     Vector<UChar> m_buffer;
  93     size_t m_overlap;
  94     size_t m_prefixLength;
  95     bool m_atBreak;
  96     bool m_needsMoreContext;
  97
  98     bool m_targetRequiresKanaWorkaround;
  99     Vector<UChar> m_normalizedTarget;
 100     mutable Vector<UChar> m_normalizedMatch;
 101
 102 #else
 103
 104 private:
 105     void append(UChar, bool isCharacterStart);
 106     size_t length() const;
 107
 108     String m_target;
 109     FindOptions m_options;
 110
 111     Vector<UChar> m_buffer;
 112     Vector<bool> m_isCharacterStartBuffer;
 113     bool m_isBufferFull;
 114     size_t m_cursor;
 115
 116 #endif
 117 };
 118
 119 // --------
 120
 121 static const unsigned bitsInWord = sizeof(unsigned) * 8;
 122 static const unsigned bitInWordMask = bitsInWord - 1;
 123
 124 BitStack::BitStack()
 125     : m_size(0)
 126 {
 127 }
 128
 129 BitStack::~BitStack()
 130 {
 131 }
 132
 133 void BitStack::push(bool bit)
 134 {
 135     unsigned index = m_size / bitsInWord;
 136     unsigned shift = m_size & bitInWordMask;
 137     if (!shift && index == m_words.size()) {
 138         m_words.grow(index + 1);
 139         m_words[index] = 0;
 140     }
 141     unsigned& word = m_words[index];
 142     unsigned mask = 1U << shift;
 143     if (bit)
 144         word |= mask;
 145     else
 146         word &= ~mask;
 147     ++m_size;
 148 }
 149
 150 void BitStack::pop()
 151 {
 152     if (m_size)
 153         --m_size;
 154 }
 155
 156 bool BitStack::top() const
 157 {
 158     if (!m_size)
 159         return false;
 160     unsigned shift = (m_size - 1) & bitInWordMask;
 161     return m_words.last() & (1U << shift);
 162 }
 163
 164 unsigned BitStack::size() const
 165 {
 166     return m_size;
 167 }
 168
 169 // --------
 170
 171 #if !ASSERT_DISABLED
 172
 173 static unsigned depthCrossingShadowBoundaries(Node* node)
 174 {
 175     unsigned depth = 0;
 176     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
 177         ++depth;
 178     return depth;
 179 }
 180
 181 #endif
 182
 183 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
 184 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
 185 {
 186     if (!rangeEndContainer)
 187         return 0;
 188     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
 189         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
 190             return next;
 191     }
 192     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
 193         if (Node* next = node->nextSibling())
 194             return next;
 195     }
 196     return 0;
 197 }
 198
 199 // --------
 200
 201 static inline bool fullyClipsContents(Node* node)
 202 {
 203     RenderObject* renderer = node->renderer();
 204     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
 205         return false;
 206     return toRenderBox(renderer)->size().isEmpty();
 207 }
 208
 209 static inline bool ignoresContainerClip(Node* node)
 210 {
 211     RenderObject* renderer = node->renderer();
 212     if (!renderer || renderer->isText())
 213         return false;
 214     EPosition position = renderer->style()->position();
 215     return position == AbsolutePosition || position == FixedPosition;
 216 }
 217
 218 static void pushFullyClippedState(BitStack& stack, Node* node)
 219 {
 220     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
 221
 222     // Push true if this node full clips its contents, or if a parent already has fully
 223     // clipped and this is not a node that ignores its container's clip.
 224     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
 225 }
 226
 227 static void setUpFullyClippedStack(BitStack& stack, Node* node)
 228 {
 229     // Put the nodes in a vector so we can iterate in reverse order.
 230     Vector<Node*, 100> ancestry;
 231     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
 232         ancestry.append(parent);
 233
 234     // Call pushFullyClippedState on each node starting with the earliest ancestor.
 235     size_t size = ancestry.size();
 236     for (size_t i = 0; i < size; ++i)
 237         pushFullyClippedState(stack, ancestry[size - i - 1]);
 238     pushFullyClippedState(stack, node);
 239
 240     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
 241 }
 242
 243 // --------
 244
 245 TextIterator::TextIterator()
 246     : m_startContainer(0)
 247     , m_startOffset(0)
 248     , m_endContainer(0)
 249     , m_endOffset(0)
 250     , m_positionNode(0)
 251     , m_textCharacters(0)
 252     , m_textLength(0)
 253     , m_remainingTextBox(0)
 254     , m_firstLetterText(0)
 255     , m_lastCharacter(0)
 256     , m_sortedTextBoxesPosition(0)
 257     , m_emitsCharactersBetweenAllVisiblePositions(false)
 258     , m_entersTextControls(false)
 259     , m_emitsTextWithoutTranscoding(false)
 260     , m_handledFirstLetter(false)
 261     , m_ignoresStyleVisibility(false)
 262     , m_emitsObjectReplacementCharacters(false)
 263 {
 264 }
 265
 266 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
 267     : m_startContainer(0)
 268     , m_startOffset(0)
 269     , m_endContainer(0)
 270     , m_endOffset(0)
 271     , m_positionNode(0)
 272     , m_textCharacters(0)
 273     , m_textLength(0)
 274     , m_remainingTextBox(0)
 275     , m_firstLetterText(0)
 276     , m_sortedTextBoxesPosition(0)
 277     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
 278     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
 279     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
 280     , m_handledFirstLetter(false)
 281     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
 282     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
 283 {
 284     if (!r)
 285         return;
 286
 287     // get and validate the range endpoints
 288     Node* startContainer = r->startContainer();
 289     if (!startContainer)
 290         return;
 291     int startOffset = r->startOffset();
 292     Node* endContainer = r->endContainer();
 293     int endOffset = r->endOffset();
 294
 295     // Callers should be handing us well-formed ranges. If we discover that this isn't
 296     // the case, we could consider changing this assertion to an early return.
 297     ASSERT(r->boundaryPointsValid());
 298
 299     // remember range - this does not change
 300     m_startContainer = startContainer;
 301     m_startOffset = startOffset;
 302     m_endContainer = endContainer;
 303     m_endOffset = endOffset;
 304
 305     // set up the current node for processing
 306     m_node = r->firstNode();
 307     if (!m_node)
 308         return;
 309     setUpFullyClippedStack(m_fullyClippedStack, m_node);
 310     m_offset = m_node == m_startContainer ? m_startOffset : 0;
 311     m_handledNode = false;
 312     m_handledChildren = false;
 313
 314     // calculate first out of bounds node
 315     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
 316
 317     // initialize node processing state
 318     m_needsAnotherNewline = false;
 319     m_textBox = 0;
 320
 321     // initialize record of previous node processing
 322     m_hasEmitted = false;
 323     m_lastTextNode = 0;
 324     m_lastTextNodeEndedWithCollapsedSpace = false;
 325     m_lastCharacter = 0;
 326
 327 #ifndef NDEBUG
 328     // need this just because of the assert in advance()
 329     m_positionNode = m_node;
 330 #endif
 331
 332     // identify the first run
 333     advance();
 334 }
 335
 336 TextIterator::~TextIterator()
 337 {
 338 }
 339
 340 void TextIterator::advance()
 341 {
 342     // reset the run information
 343     m_positionNode = 0;
 344     m_textLength = 0;
 345
 346     // handle remembered node that needed a newline after the text node's newline
 347     if (m_needsAnotherNewline) {
 348         // Emit the extra newline, and position it *inside* m_node, after m_node's
 349         // contents, in case it's a block, in the same way that we position the first
 350         // newline.  The range for the emitted newline should start where the line
 351         // break begins.
 352         // FIXME: It would be cleaner if we emitted two newlines during the last
 353         // iteration, instead of using m_needsAnotherNewline.
 354         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
 355         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
 356         m_needsAnotherNewline = false;
 357         return;
 358     }
 359
 360     if (!m_textBox && m_remainingTextBox) {
 361         m_textBox = m_remainingTextBox;
 362         m_remainingTextBox = 0;
 363         m_firstLetterText = 0;
 364         m_offset = 0;
 365     }
 366     // handle remembered text box
 367     if (m_textBox) {
 368         handleTextBox();
 369         if (m_positionNode)
 370             return;
 371     }
 372
 373     while (m_node && m_node != m_pastEndNode) {
 374         // if the range ends at offset 0 of an element, represent the
 375         // position, but not the content, of that element e.g. if the
 376         // node is a blockflow element, emit a newline that
 377         // precedes the element
 378         if (m_node == m_endContainer && m_endOffset == 0) {
 379             representNodeOffsetZero();
 380             m_node = 0;
 381             return;
 382         }
 383
 384         RenderObject* renderer = m_node->renderer();
 385         if (!renderer) {
 386             m_handledNode = true;
 387             m_handledChildren = true;
 388         } else {
 389             // handle current node according to its type
 390             if (!m_handledNode) {
 391                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
 392                     m_handledNode = handleTextNode();
 393                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
 394                          (renderer->node() && renderer->node()->isElementNode() &&
 395                           static_cast<Element*>(renderer->node())->isFormControlElement())))
 396                     m_handledNode = handleReplacedElement();
 397                 else
 398                     m_handledNode = handleNonTextNode();
 399                 if (m_positionNode)
 400                     return;
 401             }
 402         }
 403
 404         // find a new current node to handle in depth-first manner,
 405         // calling exitNode() as we come back thru a parent node
 406         Node* next = m_handledChildren ? 0 : m_node->firstChild();
 407         m_offset = 0;
 408         if (!next) {
 409             next = m_node->nextSibling();
 410             if (!next) {
 411                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
 412                 Node* parentNode = m_node->parentOrHostNode();
 413                 while (!next && parentNode) {
 414                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
 415                         return;
 416                     bool haveRenderer = m_node->renderer();
 417                     m_node = parentNode;
 418                     m_fullyClippedStack.pop();
 419                     parentNode = m_node->parentOrHostNode();
 420                     if (haveRenderer)
 421                         exitNode();
 422                     if (m_positionNode) {
 423                         m_handledNode = true;
 424                         m_handledChildren = true;
 425                         return;
 426                     }
 427                     next = m_node->nextSibling();
 428                 }
 429             }
 430             m_fullyClippedStack.pop();
 431         }
 432
 433         // set the new current node
 434         m_node = next;
 435         if (m_node)
 436             pushFullyClippedState(m_fullyClippedStack, m_node);
 437         m_handledNode = false;
 438         m_handledChildren = false;
 439         m_handledFirstLetter = false;
 440         m_firstLetterText = 0;
 441
 442         // how would this ever be?
 443         if (m_positionNode)
 444             return;
 445     }
 446 }
 447
 448 bool TextIterator::handleTextNode()
 449 {
 450     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
 451         return false;
 452
 453     RenderText* renderer = toRenderText(m_node->renderer());
 454
 455     m_lastTextNode = m_node;
 456     String str = renderer->text();
 457
 458     // handle pre-formatted text
 459     if (!renderer->style()->collapseWhiteSpace()) {
 460         int runStart = m_offset;
 461         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
 462             emitCharacter(' ', m_node, 0, runStart, runStart);
 463             return false;
 464         }
 465         if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) {
 466             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
 467             if (m_firstLetterText) {
 468                 String firstLetter = m_firstLetterText->text();
 469                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
 470                 m_firstLetterText = 0;
 471                 m_textBox = 0;
 472                 return false;
 473             }
 474         }
 475         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
 476             return false;
 477         int strLength = str.length();
 478         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
 479         int runEnd = min(strLength, end);
 480
 481         if (runStart >= runEnd)
 482             return true;
 483
 484         emitText(m_node, runStart, runEnd);
 485         return true;
 486     }
 487
 488     if (!renderer->firstTextBox() && str.length() > 0) {
 489         if (!m_handledFirstLetter && renderer->isTextFragment()) {
 490             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
 491             if (m_firstLetterText) {
 492                 handleTextBox();
 493                 return false;
 494             }
 495         }
 496         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
 497             return false;
 498         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
 499         return true;
 500     }
 501
 502
 503     m_textBox = renderer->firstTextBox();
 504     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
 505         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
 506
 507     if (m_firstLetterText)
 508         renderer = m_firstLetterText;
 509
 510     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
 511     if (renderer->containsReversedText()) {
 512         m_sortedTextBoxes.clear();
 513         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
 514             m_sortedTextBoxes.append(textBox);
 515         }
 516         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
 517         m_sortedTextBoxesPosition = 0;
 518         m_textBox = m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0];
 519     }
 520
 521     handleTextBox();
 522     return true;
 523 }
 524
 525 void TextIterator::handleTextBox()
 526 {
 527     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
 528     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
 529         m_textBox = 0;
 530         return;
 531     }
 532     String str = renderer->text();
 533     unsigned start = m_offset;
 534     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
 535     while (m_textBox) {
 536         unsigned textBoxStart = m_textBox->start();
 537         unsigned runStart = max(textBoxStart, start);
 538
 539         // Check for collapsed space at the start of this run.
 540         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
 541         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
 542             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
 543         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
 544             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
 545                 unsigned spaceRunStart = runStart - 1;
 546                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
 547                     --spaceRunStart;
 548                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
 549             } else
 550                 emitCharacter(' ', m_node, 0, runStart, runStart);
 551             return;
 552         }
 553         unsigned textBoxEnd = textBoxStart + m_textBox->len();
 554         unsigned runEnd = min(textBoxEnd, end);
 555
 556         // Determine what the next text box will be, but don't advance yet
 557         InlineTextBox* nextTextBox = 0;
 558         if (renderer->containsReversedText()) {
 559             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
 560                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
 561         } else
 562             nextTextBox = m_textBox->nextTextBox();
 563
 564         if (runStart < runEnd) {
 565             // Handle either a single newline character (which becomes a space),
 566             // or a run of characters that does not include a newline.
 567             // This effectively translates newlines to spaces without copying the text.
 568             if (str[runStart] == '\n') {
 569                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
 570                 m_offset = runStart + 1;
 571             } else {
 572                 size_t subrunEnd = str.find('\n', runStart);
 573                 if (subrunEnd == notFound || subrunEnd > runEnd)
 574                     subrunEnd = runEnd;
 575
 576                 m_offset = subrunEnd;
 577                 emitText(m_node, renderer, runStart, subrunEnd);
 578             }
 579
 580             // If we are doing a subrun that doesn't go to the end of the text box,
 581             // come back again to finish handling this text box; don't advance to the next one.
 582             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
 583                 return;
 584
 585             // Advance and return
 586             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
 587             if (nextRunStart > runEnd)
 588                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
 589             m_textBox = nextTextBox;
 590             if (renderer->containsReversedText())
 591                 ++m_sortedTextBoxesPosition;
 592             return;
 593         }
 594         // Advance and continue
 595         m_textBox = nextTextBox;
 596         if (renderer->containsReversedText())
 597             ++m_sortedTextBoxesPosition;
 598     }
 599     if (!m_textBox && m_remainingTextBox) {
 600         m_textBox = m_remainingTextBox;
 601         m_remainingTextBox = 0;
 602         m_firstLetterText = 0;
 603         m_offset = 0;
 604         handleTextBox();
 605     }
 606 }
 607
 608 static inline RenderText* firstRenderTextInFirstLetter(RenderObject* firstLetter)
 609 {
 610     if (!firstLetter)
 611         return 0;
 612
 613     // FIXME: Should this check descendent objects?
 614     for (RenderObject* current = firstLetter->firstChild(); current; current = current->nextSibling()) {
 615         if (current->isText())
 616             return toRenderText(current);
 617     }
 618     return 0;
 619 }
 620
 621 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
 622 {
 623     if (renderer->firstLetter()) {
 624         RenderObject* r = renderer->firstLetter();
 625         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
 626             return;
 627         if (RenderText* firstLetter = firstRenderTextInFirstLetter(r)) {
 628             m_handledFirstLetter = true;
 629             m_remainingTextBox = m_textBox;
 630             m_textBox = firstLetter->firstTextBox();
 631             m_firstLetterText = firstLetter;
 632         }
 633     }
 634     m_handledFirstLetter = true;
 635 }
 636
 637 bool TextIterator::handleReplacedElement()
 638 {
 639     if (m_fullyClippedStack.top())
 640         return false;
 641
 642     RenderObject* renderer = m_node->renderer();
 643     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
 644         return false;
 645
 646     if (m_lastTextNodeEndedWithCollapsedSpace) {
 647         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
 648         return false;
 649     }
 650
 651     if (m_entersTextControls && renderer->isTextControl()) {
 652         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->textFormControlElement()->innerTextElement()) {
 653             m_node = innerTextElement->shadowTreeRootNode();
 654             pushFullyClippedState(m_fullyClippedStack, m_node);
 655             m_offset = 0;
 656             return false;
 657         }
 658     }
 659
 660     m_hasEmitted = true;
 661
 662     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
 663         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
 664         return true;
 665     }
 666
 667     if (m_emitsCharactersBetweenAllVisiblePositions) {
 668         // We want replaced elements to behave like punctuation for boundary
 669         // finding, and to simply take up space for the selection preservation
 670         // code in moveParagraphs, so we use a comma.
 671         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
 672         return true;
 673     }
 674
 675     m_positionNode = m_node->parentNode();
 676     m_positionOffsetBaseNode = m_node;
 677     m_positionStartOffset = 0;
 678     m_positionEndOffset = 1;
 679
 680     m_textCharacters = 0;
 681     m_textLength = 0;
 682
 683     m_lastCharacter = 0;
 684
 685     return true;
 686 }
 687
 688 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
 689 {
 690     if (renderer->style()->visibility() == VISIBLE)
 691         return true;
 692     if (renderer->isTextFragment()) {
 693         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
 694         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
 695             return true;
 696     }
 697     return false;
 698 }
 699
 700 static bool shouldEmitTabBeforeNode(Node* node)
 701 {
 702     RenderObject* r = node->renderer();
 703
 704     // Table cells are delimited by tabs.
 705     if (!r || !isTableCell(node))
 706         return false;
 707
 708     // Want a tab before every cell other than the first one
 709     RenderTableCell* rc = toRenderTableCell(r);
 710     RenderTable* t = rc->table();
 711     return t && (t->cellBefore(rc) || t->cellAbove(rc));
 712 }
 713
 714 static bool shouldEmitNewlineForNode(Node* node)
 715 {
 716     // br elements are represented by a single newline.
 717     RenderObject* r = node->renderer();
 718     if (!r)
 719         return node->hasTagName(brTag);
 720
 721     return r->isBR();
 722 }
 723
 724 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
 725 {
 726     // Block flow (versus inline flow) is represented by having
 727     // a newline both before and after the element.
 728     RenderObject* r = node->renderer();
 729     if (!r) {
 730         return (node->hasTagName(blockquoteTag)
 731                 || node->hasTagName(ddTag)
 732                 || node->hasTagName(divTag)
 733                 || node->hasTagName(dlTag)
 734                 || node->hasTagName(dtTag)
 735                 || node->hasTagName(h1Tag)
 736                 || node->hasTagName(h2Tag)
 737                 || node->hasTagName(h3Tag)
 738                 || node->hasTagName(h4Tag)
 739                 || node->hasTagName(h5Tag)
 740                 || node->hasTagName(h6Tag)
 741                 || node->hasTagName(hrTag)
 742                 || node->hasTagName(liTag)
 743                 || node->hasTagName(listingTag)
 744                 || node->hasTagName(olTag)
 745                 || node->hasTagName(pTag)
 746                 || node->hasTagName(preTag)
 747                 || node->hasTagName(trTag)
 748                 || node->hasTagName(ulTag));
 749     }
 750
 751     // Need to make an exception for table cells, because they are blocks, but we
 752     // want them tab-delimited rather than having newlines before and after.
 753     if (isTableCell(node))
 754         return false;
 755
 756     // Need to make an exception for table row elements, because they are neither
 757     // "inline" or "RenderBlock", but we want newlines for them.
 758     if (r->isTableRow()) {
 759         RenderTable* t = toRenderTableRow(r)->table();
 760         if (t && !t->isInline())
 761             return true;
 762     }
 763
 764     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
 765 }
 766
 767 static bool shouldEmitNewlineAfterNode(Node* node)
 768 {
 769     // FIXME: It should be better but slower to create a VisiblePosition here.
 770     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
 771         return false;
 772     // Check if this is the very last renderer in the document.
 773     // If so, then we should not emit a newline.
 774     while ((node = node->traverseNextSibling()))
 775         if (node->renderer())
 776             return true;
 777     return false;
 778 }
 779
 780 static bool shouldEmitNewlineBeforeNode(Node* node)
 781 {
 782     return shouldEmitNewlinesBeforeAndAfterNode(node);
 783 }
 784
 785 static bool shouldEmitExtraNewlineForNode(Node* node)
 786 {
 787     // When there is a significant collapsed bottom margin, emit an extra
 788     // newline for a more realistic result.  We end up getting the right
 789     // result even without margin collapsing. For example: <div><p>text</p></div>
 790     // will work right even if both the <div> and the <p> have bottom margins.
 791     RenderObject* r = node->renderer();
 792     if (!r || !r->isBox())
 793         return false;
 794
 795     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
 796     // not to do this at all
 797     if (node->hasTagName(h1Tag)
 798         || node->hasTagName(h2Tag)
 799         || node->hasTagName(h3Tag)
 800         || node->hasTagName(h4Tag)
 801         || node->hasTagName(h5Tag)
 802         || node->hasTagName(h6Tag)
 803         || node->hasTagName(pTag)) {
 804         RenderStyle* style = r->style();
 805         if (style) {
 806             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
 807             int fontSize = style->fontDescription().computedPixelSize();
 808             if (bottomMargin * 2 >= fontSize)
 809                 return true;
 810         }
 811     }
 812
 813     return false;
 814 }
 815
 816 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
 817 {
 818     const UChar* characters = renderer->text()->characters();
 819     int length = renderer->text()->length();
 820     for (int i = textEnd; i < length; ++i) {
 821         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
 822             return i - textEnd;
 823     }
 824
 825     return length - textEnd;
 826 }
 827
 828 static int maxOffsetIncludingCollapsedSpaces(Node* node)
 829 {
 830     int offset = caretMaxOffset(node);
 831
 832     if (node->renderer() && node->renderer()->isText())
 833         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
 834
 835     return offset;
 836 }
 837
 838 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
 839 bool TextIterator::shouldRepresentNodeOffsetZero()
 840 {
 841     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
 842         return true;
 843
 844     // Leave element positioned flush with start of a paragraph
 845     // (e.g. do not insert tab before a table cell at the start of a paragraph)
 846     if (m_lastCharacter == '\n')
 847         return false;
 848
 849     // Otherwise, show the position if we have emitted any characters
 850     if (m_hasEmitted)
 851         return true;
 852
 853     // We've not emitted anything yet. Generally, there is no need for any positioning then.
 854     // The only exception is when the element is visually not in the same line as
 855     // the start of the range (e.g. the range starts at the end of the previous paragraph).
 856     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
 857     // make quicker checks to possibly avoid that. Another check that we could make is
 858     // is whether the inline vs block flow changed since the previous visible element.
 859     // I think we're already in a special enough case that that won't be needed, tho.
 860
 861     // No character needed if this is the first node in the range.
 862     if (m_node == m_startContainer)
 863         return false;
 864
 865     // If we are outside the start container's subtree, assume we need to emit.
 866     // FIXME: m_startContainer could be an inline block
 867     if (!m_node->isDescendantOf(m_startContainer))
 868         return true;
 869
 870     // If we started as m_startContainer offset 0 and the current node is a descendant of
 871     // the start container, we already had enough context to correctly decide whether to
 872     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
 873     // so don't second guess that now.
 874     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
 875     // immaterial since we likely would have already emitted something by now.
 876     if (m_startOffset == 0)
 877         return false;
 878
 879     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
 880     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
 881     // we would create VisiblePositions on every call to this function without this check.
 882     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
 883         return false;
 884
 885     // The startPos.isNotNull() check is needed because the start could be before the body,
 886     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
 887     // The currPos.isNotNull() check is needed because positions in non-HTML content
 888     // (like SVG) do not have visible positions, and we don't want to emit for them either.
 889     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
 890     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
 891     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
 892 }
 893
 894 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
 895 {
 896     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
 897 }
 898
 899 void TextIterator::representNodeOffsetZero()
 900 {
 901     // Emit a character to show the positioning of m_node.
 902
 903     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
 904     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
 905     // on m_node to see if it necessitates emitting a character first and will early return
 906     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
 907     if (shouldEmitTabBeforeNode(m_node)) {
 908         if (shouldRepresentNodeOffsetZero())
 909             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
 910     } else if (shouldEmitNewlineBeforeNode(m_node)) {
 911         if (shouldRepresentNodeOffsetZero())
 912             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
 913     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
 914         if (shouldRepresentNodeOffsetZero())
 915             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
 916     }
 917 }
 918
 919 bool TextIterator::handleNonTextNode()
 920 {
 921     if (shouldEmitNewlineForNode(m_node))
 922         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
 923     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
 924         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
 925     else
 926         representNodeOffsetZero();
 927
 928     return true;
 929 }
 930
 931 void TextIterator::exitNode()
 932 {
 933     // prevent emitting a newline when exiting a collapsed block at beginning of the range
 934     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
 935     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
 936     // therefore look like a blank line.
 937     if (!m_hasEmitted)
 938         return;
 939
 940     // Emit with a position *inside* m_node, after m_node's contents, in
 941     // case it is a block, because the run should start where the
 942     // emitted character is positioned visually.
 943     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
 944     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
 945     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
 946     // TextIterator in _web_attributedStringFromRange.
 947     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
 948     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
 949         // use extra newline to represent margin bottom, as needed
 950         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
 951
 952         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
 953         // contain a VisiblePosition when doing selection preservation.
 954         if (m_lastCharacter != '\n') {
 955             // insert a newline with a position following this block's contents.
 956             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
 957             // remember whether to later add a newline for the current node
 958             ASSERT(!m_needsAnotherNewline);
 959             m_needsAnotherNewline = addNewline;
 960         } else if (addNewline)
 961             // insert a newline with a position following this block's contents.
 962             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
 963     }
 964
 965     // If nothing was emitted, see if we need to emit a space.
 966     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
 967         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
 968 }
 969
 970 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
 971 {
 972     m_hasEmitted = true;
 973
 974     // remember information with which to construct the TextIterator::range()
 975     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
 976     m_positionNode = textNode;
 977     m_positionOffsetBaseNode = offsetBaseNode;
 978     m_positionStartOffset = textStartOffset;
 979     m_positionEndOffset = textEndOffset;
 980
 981     // remember information with which to construct the TextIterator::characters() and length()
 982     m_singleCharacterBuffer = c;
 983     m_textCharacters = &m_singleCharacterBuffer;
 984     m_textLength = 1;
 985
 986     // remember some iteration state
 987     m_lastTextNodeEndedWithCollapsedSpace = false;
 988     m_lastCharacter = c;
 989 }
 990
 991 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
 992 {
 993     RenderText* renderer = toRenderText(renderObject);
 994     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
 995     ASSERT(m_text.characters());
 996     ASSERT(0 <= textStartOffset && textStartOffset < static_cast<int>(m_text.length()));
 997     ASSERT(0 <= textEndOffset && textEndOffset <= static_cast<int>(m_text.length()));
 998     ASSERT(textStartOffset <= textEndOffset);
 999
1000     m_positionNode = textNode;
1001     m_positionOffsetBaseNode = 0;
1002     m_positionStartOffset = textStartOffset;
1003     m_positionEndOffset = textEndOffset;
1004     m_textCharacters = m_text.characters() + textStartOffset;
1005     m_textLength = textEndOffset - textStartOffset;
1006     m_lastCharacter = m_text[textEndOffset - 1];
1007
1008     m_lastTextNodeEndedWithCollapsedSpace = false;
1009     m_hasEmitted = true;
1010 }
1011
1012 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1013 {
1014     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1015 }
1016
1017 PassRefPtr<Range> TextIterator::range() const
1018 {
1019     // use the current run information, if we have it
1020     if (m_positionNode) {
1021         if (m_positionOffsetBaseNode) {
1022             int index = m_positionOffsetBaseNode->nodeIndex();
1023             m_positionStartOffset += index;
1024             m_positionEndOffset += index;
1025             m_positionOffsetBaseNode = 0;
1026         }
1027         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1028     }
1029
1030     // otherwise, return the end of the overall range we were given
1031     if (m_endContainer)
1032         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1033
1034     return 0;
1035 }
1036
1037 Node* TextIterator::node() const
1038 {
1039     RefPtr<Range> textRange = range();
1040     if (!textRange)
1041         return 0;
1042
1043     Node* node = textRange->startContainer();
1044     if (!node)
1045         return 0;
1046     if (node->offsetInCharacters())
1047         return node;
1048
1049     return node->childNode(textRange->startOffset());
1050 }
1051
1052 // --------
1053
1054 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1055     : m_behavior(TextIteratorDefaultBehavior)
1056     , m_node(0)
1057     , m_offset(0)
1058     , m_handledNode(false)
1059     , m_handledChildren(false)
1060     , m_startNode(0)
1061     , m_startOffset(0)
1062     , m_endNode(0)
1063     , m_endOffset(0)
1064     , m_positionNode(0)
1065     , m_positionStartOffset(0)
1066     , m_positionEndOffset(0)
1067     , m_textCharacters(0)
1068     , m_textLength(0)
1069     , m_lastTextNode(0)
1070     , m_lastCharacter(0)
1071     , m_singleCharacterBuffer(0)
1072     , m_havePassedStartNode(false)
1073     , m_shouldHandleFirstLetter(false)
1074 {
1075 }
1076
1077 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1078     : m_behavior(behavior)
1079     , m_node(0)
1080     , m_offset(0)
1081     , m_handledNode(false)
1082     , m_handledChildren(false)
1083     , m_startNode(0)
1084     , m_startOffset(0)
1085     , m_endNode(0)
1086     , m_endOffset(0)
1087     , m_positionNode(0)
1088     , m_positionStartOffset(0)
1089     , m_positionEndOffset(0)
1090     , m_textCharacters(0)
1091     , m_textLength(0)
1092     , m_lastTextNode(0)
1093     , m_lastCharacter(0)
1094     , m_singleCharacterBuffer(0)
1095     , m_havePassedStartNode(false)
1096     , m_shouldHandleFirstLetter(false)
1097 {
1098     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1099
1100     if (!r)
1101         return;
1102
1103     Node* startNode = r->startContainer();
1104     if (!startNode)
1105         return;
1106     Node* endNode = r->endContainer();
1107     int startOffset = r->startOffset();
1108     int endOffset = r->endOffset();
1109
1110     if (!startNode->offsetInCharacters()) {
1111         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1112             startNode = startNode->childNode(startOffset);
1113             startOffset = 0;
1114         }
1115     }
1116     if (!endNode->offsetInCharacters()) {
1117         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1118             endNode = endNode->childNode(endOffset - 1);
1119             endOffset = lastOffsetInNode(endNode);
1120         }
1121     }
1122
1123     m_node = endNode;
1124     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1125     m_offset = endOffset;
1126     m_handledNode = false;
1127     m_handledChildren = endOffset == 0;
1128
1129     m_startNode = startNode;
1130     m_startOffset = startOffset;
1131     m_endNode = endNode;
1132     m_endOffset = endOffset;
1133
1134 #ifndef NDEBUG
1135     // Need this just because of the assert.
1136     m_positionNode = endNode;
1137 #endif
1138
1139     m_lastTextNode = 0;
1140     m_lastCharacter = '\n';
1141
1142     m_havePassedStartNode = false;
1143
1144     advance();
1145 }
1146
1147 void SimplifiedBackwardsTextIterator::advance()
1148 {
1149     ASSERT(m_positionNode);
1150
1151     m_positionNode = 0;
1152     m_textLength = 0;
1153
1154     while (m_node && !m_havePassedStartNode) {
1155         // Don't handle node if we start iterating at [node, 0].
1156         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1157             RenderObject* renderer = m_node->renderer();
1158             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1159                 // FIXME: What about CDATA_SECTION_NODE?
1160                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1161                     m_handledNode = handleTextNode();
1162             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1163                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1164                     m_handledNode = handleReplacedElement();
1165             } else
1166                 m_handledNode = handleNonTextNode();
1167             if (m_positionNode)
1168                 return;
1169         }
1170
1171         if (!m_handledChildren && m_node->hasChildNodes()) {
1172             m_node = m_node->lastChild();
1173             pushFullyClippedState(m_fullyClippedStack, m_node);
1174         } else {
1175             // Exit empty containers as we pass over them or containers
1176             // where [container, 0] is where we started iterating.
1177             if (!m_handledNode
1178                     && canHaveChildrenForEditing(m_node)
1179                     && m_node->parentNode()
1180                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1181                 exitNode();
1182                 if (m_positionNode) {
1183                     m_handledNode = true;
1184                     m_handledChildren = true;
1185                     return;
1186                 }
1187             }
1188
1189             // Exit all other containers.
1190             while (!m_node->previousSibling()) {
1191                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1192                     break;
1193                 m_fullyClippedStack.pop();
1194                 exitNode();
1195                 if (m_positionNode) {
1196                     m_handledNode = true;
1197                     m_handledChildren = true;
1198                     return;
1199                 }
1200             }
1201
1202             m_fullyClippedStack.pop();
1203             if (advanceRespectingRange(m_node->previousSibling()))
1204                 pushFullyClippedState(m_fullyClippedStack, m_node);
1205             else
1206                 m_node = 0;
1207         }
1208
1209         // For the purpose of word boundary detection,
1210         // we should iterate all visible text and trailing (collapsed) whitespaces.
1211         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1212         m_handledNode = false;
1213         m_handledChildren = false;
1214
1215         if (m_positionNode)
1216             return;
1217     }
1218 }
1219
1220 bool SimplifiedBackwardsTextIterator::handleTextNode()
1221 {
1222     m_lastTextNode = m_node;
1223
1224     int startOffset;
1225     int offsetInNode;
1226     RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1227     if (!renderer)
1228         return true;
1229
1230     String text = renderer->text();
1231     if (!renderer->firstTextBox() && text.length() > 0)
1232         return true;
1233
1234     m_positionEndOffset = m_offset;
1235     m_offset = startOffset + offsetInNode;
1236     m_positionNode = m_node;
1237     m_positionStartOffset = m_offset;
1238
1239     ASSERT(0 <= m_positionStartOffset - offsetInNode && m_positionStartOffset - offsetInNode <= static_cast<int>(text.length()));
1240     ASSERT(1 <= m_positionEndOffset - offsetInNode && m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1241     ASSERT(m_positionStartOffset <= m_positionEndOffset);
1242
1243     m_textLength = m_positionEndOffset - m_positionStartOffset;
1244     m_textCharacters = text.characters() + (m_positionStartOffset - offsetInNode);
1245     ASSERT(m_textCharacters >= text.characters());
1246     ASSERT(m_textCharacters + m_textLength <= text.characters() + static_cast<int>(text.length()));
1247
1248     m_lastCharacter = text[m_positionEndOffset - 1];
1249
1250     return !m_shouldHandleFirstLetter;
1251 }
1252
1253 RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1254 {
1255     RenderText* renderer = toRenderText(m_node->renderer());
1256     startOffset = (m_node == m_startNode) ? m_startOffset : 0;
1257
1258     if (!renderer->isTextFragment()) {
1259         offsetInNode = 0;
1260         return renderer;
1261     }
1262
1263     RenderTextFragment* fragment = toRenderTextFragment(renderer);
1264     int offsetAfterFirstLetter = fragment->start();
1265     if (startOffset >= offsetAfterFirstLetter) {
1266         ASSERT(!m_shouldHandleFirstLetter);
1267         offsetInNode = offsetAfterFirstLetter;
1268         return renderer;
1269     }
1270
1271     if (!m_shouldHandleFirstLetter && offsetAfterFirstLetter < m_offset) {
1272         m_shouldHandleFirstLetter = true;
1273         offsetInNode = offsetAfterFirstLetter;
1274         return renderer;
1275     }
1276
1277     m_shouldHandleFirstLetter = false;
1278     offsetInNode = 0;
1279     return firstRenderTextInFirstLetter(fragment->firstLetter());
1280 }
1281
1282 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1283 {
1284     unsigned index = m_node->nodeIndex();
1285     // We want replaced elements to behave like punctuation for boundary
1286     // finding, and to simply take up space for the selection preservation
1287     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1288     // here because this iterator is only used for boundary finding.
1289     emitCharacter(',', m_node->parentNode(), index, index + 1);
1290     return true;
1291 }
1292
1293 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1294 {
1295     // We can use a linefeed in place of a tab because this simple iterator is only used to
1296     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1297     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1298         unsigned index = m_node->nodeIndex();
1299         // The start of this emitted range is wrong. Ensuring correctness would require
1300         // VisiblePositions and so would be slow. previousBoundary expects this.
1301         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1302     }
1303     return true;
1304 }
1305
1306 void SimplifiedBackwardsTextIterator::exitNode()
1307 {
1308     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1309         // The start of this emitted range is wrong. Ensuring correctness would require
1310         // VisiblePositions and so would be slow. previousBoundary expects this.
1311         emitCharacter('\n', m_node, 0, 0);
1312     }
1313 }
1314
1315 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1316 {
1317     m_singleCharacterBuffer = c;
1318     m_positionNode = node;
1319     m_positionStartOffset = startOffset;
1320     m_positionEndOffset = endOffset;
1321     m_textCharacters = &m_singleCharacterBuffer;
1322     m_textLength = 1;
1323     m_lastCharacter = c;
1324 }
1325
1326 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1327 {
1328     if (!next)
1329         return false;
1330     m_havePassedStartNode |= m_node == m_startNode;
1331     if (m_havePassedStartNode)
1332         return false;
1333     m_node = next;
1334     return true;
1335 }
1336
1337 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1338 {
1339     if (m_positionNode)
1340         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1341
1342     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1343 }
1344
1345 // --------
1346
1347 CharacterIterator::CharacterIterator()
1348     : m_offset(0)
1349     , m_runOffset(0)
1350     , m_atBreak(true)
1351 {
1352 }
1353
1354 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1355     : m_offset(0)
1356     , m_runOffset(0)
1357     , m_atBreak(true)
1358     , m_textIterator(r, behavior)
1359 {
1360     while (!atEnd() && m_textIterator.length() == 0)
1361         m_textIterator.advance();
1362 }
1363
1364 PassRefPtr<Range> CharacterIterator::range() const
1365 {
1366     RefPtr<Range> r = m_textIterator.range();
1367     if (!m_textIterator.atEnd()) {
1368         if (m_textIterator.length() <= 1) {
1369             ASSERT(m_runOffset == 0);
1370         } else {
1371             Node* n = r->startContainer();
1372             ASSERT(n == r->endContainer());
1373             int offset = r->startOffset() + m_runOffset;
1374             ExceptionCode ec = 0;
1375             r->setStart(n, offset, ec);
1376             r->setEnd(n, offset + 1, ec);
1377             ASSERT(!ec);
1378         }
1379     }
1380     return r.release();
1381 }
1382
1383 void CharacterIterator::advance(int count)
1384 {
1385     if (count <= 0) {
1386         ASSERT(count == 0);
1387         return;
1388     }
1389
1390     m_atBreak = false;
1391
1392     // easy if there is enough left in the current m_textIterator run
1393     int remaining = m_textIterator.length() - m_runOffset;
1394     if (count < remaining) {
1395         m_runOffset += count;
1396         m_offset += count;
1397         return;
1398     }
1399
1400     // exhaust the current m_textIterator run
1401     count -= remaining;
1402     m_offset += remaining;
1403
1404     // move to a subsequent m_textIterator run
1405     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1406         int runLength = m_textIterator.length();
1407         if (runLength == 0)
1408             m_atBreak = true;
1409         else {
1410             // see whether this is m_textIterator to use
1411             if (count < runLength) {
1412                 m_runOffset = count;
1413                 m_offset += count;
1414                 return;
1415             }
1416
1417             // exhaust this m_textIterator run
1418             count -= runLength;
1419             m_offset += runLength;
1420         }
1421     }
1422
1423     // ran to the end of the m_textIterator... no more runs left
1424     m_atBreak = true;
1425     m_runOffset = 0;
1426 }
1427
1428 String CharacterIterator::string(int numChars)
1429 {
1430     Vector<UChar> result;
1431     result.reserveInitialCapacity(numChars);
1432     while (numChars > 0 && !atEnd()) {
1433         int runSize = min(numChars, length());
1434         result.append(characters(), runSize);
1435         numChars -= runSize;
1436         advance(runSize);
1437     }
1438     return String::adopt(result);
1439 }
1440
1441 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1442 {
1443     it.advance(offset);
1444     RefPtr<Range> start = it.range();
1445
1446     if (length > 1)
1447         it.advance(length - 1);
1448     RefPtr<Range> end = it.range();
1449
1450     return Range::create(start->startContainer()->document(),
1451         start->startContainer(), start->startOffset(),
1452         end->endContainer(), end->endOffset());
1453 }
1454
1455 BackwardsCharacterIterator::BackwardsCharacterIterator()
1456     : m_offset(0)
1457     , m_runOffset(0)
1458     , m_atBreak(true)
1459 {
1460 }
1461
1462 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1463     : m_offset(0)
1464     , m_runOffset(0)
1465     , m_atBreak(true)
1466     , m_textIterator(range, behavior)
1467 {
1468     while (!atEnd() && !m_textIterator.length())
1469         m_textIterator.advance();
1470 }
1471
1472 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1473 {
1474     RefPtr<Range> r = m_textIterator.range();
1475     if (!m_textIterator.atEnd()) {
1476         if (m_textIterator.length() <= 1)
1477             ASSERT(m_runOffset == 0);
1478         else {
1479             Node* n = r->startContainer();
1480             ASSERT(n == r->endContainer());
1481             int offset = r->endOffset() - m_runOffset;
1482             ExceptionCode ec = 0;
1483             r->setStart(n, offset - 1, ec);
1484             r->setEnd(n, offset, ec);
1485             ASSERT(!ec);
1486         }
1487     }
1488     return r.release();
1489 }
1490
1491 void BackwardsCharacterIterator::advance(int count)
1492 {
1493     if (count <= 0) {
1494         ASSERT(!count);
1495         return;
1496     }
1497
1498     m_atBreak = false;
1499
1500     int remaining = m_textIterator.length() - m_runOffset;
1501     if (count < remaining) {
1502         m_runOffset += count;
1503         m_offset += count;
1504         return;
1505     }
1506
1507     count -= remaining;
1508     m_offset += remaining;
1509
1510     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1511         int runLength = m_textIterator.length();
1512         if (runLength == 0)
1513             m_atBreak = true;
1514         else {
1515             if (count < runLength) {
1516                 m_runOffset = count;
1517                 m_offset += count;
1518                 return;
1519             }
1520
1521             count -= runLength;
1522             m_offset += runLength;
1523         }
1524     }
1525
1526     m_atBreak = true;
1527     m_runOffset = 0;
1528 }
1529
1530 // --------
1531
1532 WordAwareIterator::WordAwareIterator()
1533     : m_previousText(0)
1534     , m_didLookAhead(false)
1535 {
1536 }
1537
1538 WordAwareIterator::WordAwareIterator(const Range* r)
1539     : m_previousText(0)
1540     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1541     , m_textIterator(r)
1542 {
1543     advance(); // get in position over the first chunk of text
1544 }
1545
1546 WordAwareIterator::~WordAwareIterator()
1547 {
1548 }
1549
1550 // We're always in one of these modes:
1551 // - The current chunk in the text iterator is our current chunk
1552 //      (typically its a piece of whitespace, or text that ended with whitespace)
1553 // - The previous chunk in the text iterator is our current chunk
1554 //      (we looked ahead to the next chunk and found a word boundary)
1555 // - We built up our own chunk of text from many chunks from the text iterator
1556
1557 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1558
1559 void WordAwareIterator::advance()
1560 {
1561     m_previousText = 0;
1562     m_buffer.clear();      // toss any old buffer we built up
1563
1564     // If last time we did a look-ahead, start with that looked-ahead chunk now
1565     if (!m_didLookAhead) {
1566         ASSERT(!m_textIterator.atEnd());
1567         m_textIterator.advance();
1568     }
1569     m_didLookAhead = false;
1570
1571     // Go to next non-empty chunk
1572     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1573         m_textIterator.advance();
1574     m_range = m_textIterator.range();
1575
1576     if (m_textIterator.atEnd())
1577         return;
1578
1579     while (1) {
1580         // If this chunk ends in whitespace we can just use it as our chunk.
1581         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1582             return;
1583
1584         // If this is the first chunk that failed, save it in previousText before look ahead
1585         if (m_buffer.isEmpty()) {
1586             m_previousText = m_textIterator.characters();
1587             m_previousLength = m_textIterator.length();
1588         }
1589
1590         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1591         m_textIterator.advance();
1592         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1593             m_didLookAhead = true;
1594             return;
1595         }
1596
1597         if (m_buffer.isEmpty()) {
1598             // Start gobbling chunks until we get to a suitable stopping point
1599             m_buffer.append(m_previousText, m_previousLength);
1600             m_previousText = 0;
1601         }
1602         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1603         int exception = 0;
1604         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1605     }
1606 }
1607
1608 int WordAwareIterator::length() const
1609 {
1610     if (!m_buffer.isEmpty())
1611         return m_buffer.size();
1612     if (m_previousText)
1613         return m_previousLength;
1614     return m_textIterator.length();
1615 }
1616
1617 const UChar* WordAwareIterator::characters() const
1618 {
1619     if (!m_buffer.isEmpty())
1620         return m_buffer.data();
1621     if (m_previousText)
1622         return m_previousText;
1623     return m_textIterator.characters();
1624 }
1625
1626 // --------
1627
1628 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1629 {
1630     switch (c) {
1631         case hebrewPunctuationGershayim:
1632         case leftDoubleQuotationMark:
1633         case rightDoubleQuotationMark:
1634             return '"';
1635         case hebrewPunctuationGeresh:
1636         case leftSingleQuotationMark:
1637         case rightSingleQuotationMark:
1638             return '\'';
1639         case softHyphen:
1640             // Replace soft hyphen with an ignorable character so that their presence or absence will
1641             // not affect string comparison.
1642             return 0;
1643         default:
1644             return c;
1645     }
1646 }
1647
1648 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1649 {
1650     s.replace(hebrewPunctuationGeresh, '\'');
1651     s.replace(hebrewPunctuationGershayim, '"');
1652     s.replace(leftDoubleQuotationMark, '"');
1653     s.replace(leftSingleQuotationMark, '\'');
1654     s.replace(rightDoubleQuotationMark, '"');
1655     s.replace(rightSingleQuotationMark, '\'');
1656     // Replace soft hyphen with an ignorable character so that their presence or absence will
1657     // not affect string comparison.
1658     s.replace(softHyphen, 0);
1659 }
1660
1661 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1662
1663 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1664 {
1665     for (size_t i = 0; i < length; ++i)
1666         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1667 }
1668
1669 static const size_t minimumSearchBufferSize = 8192;
1670
1671 #ifndef NDEBUG
1672 static bool searcherInUse;
1673 #endif
1674
1675 static UStringSearch* createSearcher()
1676 {
1677     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1678     // but it doesn't matter exactly what it is, since we don't perform any searches
1679     // without setting both the pattern and the text.
1680     UErrorCode status = U_ZERO_ERROR;
1681     String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search");
1682     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status);
1683     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1684     return searcher;
1685 }
1686
1687 static UStringSearch* searcher()
1688 {
1689     static UStringSearch* searcher = createSearcher();
1690     return searcher;
1691 }
1692
1693 static inline void lockSearcher()
1694 {
1695 #ifndef NDEBUG
1696     ASSERT(!searcherInUse);
1697     searcherInUse = true;
1698 #endif
1699 }
1700
1701 static inline void unlockSearcher()
1702 {
1703 #ifndef NDEBUG
1704     ASSERT(searcherInUse);
1705     searcherInUse = false;
1706 #endif
1707 }
1708
1709 // ICU's search ignores the distinction between small kana letters and ones
1710 // that are not small, and also characters that differ only in the voicing
1711 // marks when considering only primary collation strength diffrences.
1712 // This is not helpful for end users, since these differences make words
1713 // distinct, so for our purposes we need these to be considered.
1714 // The Unicode folks do not think the collation algorithm should be
1715 // changed. To work around this, we would like to tailor the ICU searcher,
1716 // but we can't get that to work yet. So instead, we check for cases where
1717 // these differences occur, and skip those matches.
1718
1719 // We refer to the above technique as the "kana workaround". The next few
1720 // functions are helper functinos for the kana workaround.
1721
1722 static inline bool isKanaLetter(UChar character)
1723 {
1724     // Hiragana letters.
1725     if (character >= 0x3041 && character <= 0x3096)
1726         return true;
1727
1728     // Katakana letters.
1729     if (character >= 0x30A1 && character <= 0x30FA)
1730         return true;
1731     if (character >= 0x31F0 && character <= 0x31FF)
1732         return true;
1733
1734     // Halfwidth katakana letters.
1735     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1736         return true;
1737
1738     return false;
1739 }
1740
1741 static inline bool isSmallKanaLetter(UChar character)
1742 {
1743     ASSERT(isKanaLetter(character));
1744
1745     switch (character) {
1746     case 0x3041: // HIRAGANA LETTER SMALL A
1747     case 0x3043: // HIRAGANA LETTER SMALL I
1748     case 0x3045: // HIRAGANA LETTER SMALL U
1749     case 0x3047: // HIRAGANA LETTER SMALL E
1750     case 0x3049: // HIRAGANA LETTER SMALL O
1751     case 0x3063: // HIRAGANA LETTER SMALL TU
1752     case 0x3083: // HIRAGANA LETTER SMALL YA
1753     case 0x3085: // HIRAGANA LETTER SMALL YU
1754     case 0x3087: // HIRAGANA LETTER SMALL YO
1755     case 0x308E: // HIRAGANA LETTER SMALL WA
1756     case 0x3095: // HIRAGANA LETTER SMALL KA
1757     case 0x3096: // HIRAGANA LETTER SMALL KE
1758     case 0x30A1: // KATAKANA LETTER SMALL A
1759     case 0x30A3: // KATAKANA LETTER SMALL I
1760     case 0x30A5: // KATAKANA LETTER SMALL U
1761     case 0x30A7: // KATAKANA LETTER SMALL E
1762     case 0x30A9: // KATAKANA LETTER SMALL O
1763     case 0x30C3: // KATAKANA LETTER SMALL TU
1764     case 0x30E3: // KATAKANA LETTER SMALL YA
1765     case 0x30E5: // KATAKANA LETTER SMALL YU
1766     case 0x30E7: // KATAKANA LETTER SMALL YO
1767     case 0x30EE: // KATAKANA LETTER SMALL WA
1768     case 0x30F5: // KATAKANA LETTER SMALL KA
1769     case 0x30F6: // KATAKANA LETTER SMALL KE
1770     case 0x31F0: // KATAKANA LETTER SMALL KU
1771     case 0x31F1: // KATAKANA LETTER SMALL SI
1772     case 0x31F2: // KATAKANA LETTER SMALL SU
1773     case 0x31F3: // KATAKANA LETTER SMALL TO
1774     case 0x31F4: // KATAKANA LETTER SMALL NU
1775     case 0x31F5: // KATAKANA LETTER SMALL HA
1776     case 0x31F6: // KATAKANA LETTER SMALL HI
1777     case 0x31F7: // KATAKANA LETTER SMALL HU
1778     case 0x31F8: // KATAKANA LETTER SMALL HE
1779     case 0x31F9: // KATAKANA LETTER SMALL HO
1780     case 0x31FA: // KATAKANA LETTER SMALL MU
1781     case 0x31FB: // KATAKANA LETTER SMALL RA
1782     case 0x31FC: // KATAKANA LETTER SMALL RI
1783     case 0x31FD: // KATAKANA LETTER SMALL RU
1784     case 0x31FE: // KATAKANA LETTER SMALL RE
1785     case 0x31FF: // KATAKANA LETTER SMALL RO
1786     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1787     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1788     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1789     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1790     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1791     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1792     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1793     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1794     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1795         return true;
1796     }
1797     return false;
1798 }
1799
1800 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1801
1802 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1803 {
1804     ASSERT(isKanaLetter(character));
1805
1806     switch (character) {
1807     case 0x304C: // HIRAGANA LETTER GA
1808     case 0x304E: // HIRAGANA LETTER GI
1809     case 0x3050: // HIRAGANA LETTER GU
1810     case 0x3052: // HIRAGANA LETTER GE
1811     case 0x3054: // HIRAGANA LETTER GO
1812     case 0x3056: // HIRAGANA LETTER ZA
1813     case 0x3058: // HIRAGANA LETTER ZI
1814     case 0x305A: // HIRAGANA LETTER ZU
1815     case 0x305C: // HIRAGANA LETTER ZE
1816     case 0x305E: // HIRAGANA LETTER ZO
1817     case 0x3060: // HIRAGANA LETTER DA
1818     case 0x3062: // HIRAGANA LETTER DI
1819     case 0x3065: // HIRAGANA LETTER DU
1820     case 0x3067: // HIRAGANA LETTER DE
1821     case 0x3069: // HIRAGANA LETTER DO
1822     case 0x3070: // HIRAGANA LETTER BA
1823     case 0x3073: // HIRAGANA LETTER BI
1824     case 0x3076: // HIRAGANA LETTER BU
1825     case 0x3079: // HIRAGANA LETTER BE
1826     case 0x307C: // HIRAGANA LETTER BO
1827     case 0x3094: // HIRAGANA LETTER VU
1828     case 0x30AC: // KATAKANA LETTER GA
1829     case 0x30AE: // KATAKANA LETTER GI
1830     case 0x30B0: // KATAKANA LETTER GU
1831     case 0x30B2: // KATAKANA LETTER GE
1832     case 0x30B4: // KATAKANA LETTER GO
1833     case 0x30B6: // KATAKANA LETTER ZA
1834     case 0x30B8: // KATAKANA LETTER ZI
1835     case 0x30BA: // KATAKANA LETTER ZU
1836     case 0x30BC: // KATAKANA LETTER ZE
1837     case 0x30BE: // KATAKANA LETTER ZO
1838     case 0x30C0: // KATAKANA LETTER DA
1839     case 0x30C2: // KATAKANA LETTER DI
1840     case 0x30C5: // KATAKANA LETTER DU
1841     case 0x30C7: // KATAKANA LETTER DE
1842     case 0x30C9: // KATAKANA LETTER DO
1843     case 0x30D0: // KATAKANA LETTER BA
1844     case 0x30D3: // KATAKANA LETTER BI
1845     case 0x30D6: // KATAKANA LETTER BU
1846     case 0x30D9: // KATAKANA LETTER BE
1847     case 0x30DC: // KATAKANA LETTER BO
1848     case 0x30F4: // KATAKANA LETTER VU
1849     case 0x30F7: // KATAKANA LETTER VA
1850     case 0x30F8: // KATAKANA LETTER VI
1851     case 0x30F9: // KATAKANA LETTER VE
1852     case 0x30FA: // KATAKANA LETTER VO
1853         return VoicedSoundMark;
1854     case 0x3071: // HIRAGANA LETTER PA
1855     case 0x3074: // HIRAGANA LETTER PI
1856     case 0x3077: // HIRAGANA LETTER PU
1857     case 0x307A: // HIRAGANA LETTER PE
1858     case 0x307D: // HIRAGANA LETTER PO
1859     case 0x30D1: // KATAKANA LETTER PA
1860     case 0x30D4: // KATAKANA LETTER PI
1861     case 0x30D7: // KATAKANA LETTER PU
1862     case 0x30DA: // KATAKANA LETTER PE
1863     case 0x30DD: // KATAKANA LETTER PO
1864         return SemiVoicedSoundMark;
1865     }
1866     return NoVoicedSoundMark;
1867 }
1868
1869 static inline bool isCombiningVoicedSoundMark(UChar character)
1870 {
1871     switch (character) {
1872     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1873     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1874         return true;
1875     }
1876     return false;
1877 }
1878
1879 static inline bool containsKanaLetters(const String& pattern)
1880 {
1881     const UChar* characters = pattern.characters();
1882     unsigned length = pattern.length();
1883     for (unsigned i = 0; i < length; ++i) {
1884         if (isKanaLetter(characters[i]))
1885             return true;
1886     }
1887     return false;
1888 }
1889
1890 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1891 {
1892     ASSERT(length);
1893
1894     buffer.resize(length);
1895
1896     UErrorCode status = U_ZERO_ERROR;
1897     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1898     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1899     ASSERT(bufferSize);
1900
1901     buffer.resize(bufferSize);
1902
1903     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1904         return;
1905
1906     status = U_ZERO_ERROR;
1907     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1908     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1909 }
1910
1911 static bool isNonLatin1Separator(UChar32 character)
1912 {
1913     ASSERT_ARG(character, character >= 256);
1914
1915     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1916 }
1917
1918 static inline bool isSeparator(UChar32 character)
1919 {
1920     static const bool latin1SeparatorTable[256] = {
1921         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1922         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1923         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1924         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1925         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1926         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1927         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1928         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1929         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1930         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1931         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1932         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1933         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1934         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1935         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1936         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1937     };
1938
1939     if (character < 256)
1940         return latin1SeparatorTable[character];
1941
1942     return isNonLatin1Separator(character);
1943 }
1944
1945 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1946     : m_target(target)
1947     , m_options(options)
1948     , m_prefixLength(0)
1949     , m_atBreak(true)
1950     , m_needsMoreContext(options & AtWordStarts)
1951     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1952 {
1953     ASSERT(!m_target.isEmpty());
1954
1955     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1956     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1957     // to add tailoring on top of the locale-specific tailoring as of this writing.
1958     foldQuoteMarksAndSoftHyphens(m_target);
1959
1960     size_t targetLength = m_target.length();
1961     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1962     m_overlap = m_buffer.capacity() / 4;
1963
1964     if ((m_options & AtWordStarts) && targetLength) {
1965         UChar32 targetFirstCharacter;
1966         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1967         // Characters in the separator category never really occur at the beginning of a word,
1968         // so if the target begins with such a character, we just ignore the AtWordStart option.
1969         if (isSeparator(targetFirstCharacter)) {
1970             m_options &= ~AtWordStarts;
1971             m_needsMoreContext = false;
1972         }
1973     }
1974
1975     // Grab the single global searcher.
1976     // If we ever have a reason to do more than once search buffer at once, we'll have
1977     // to move to multiple searchers.
1978     lockSearcher();
1979
1980     UStringSearch* searcher = WebCore::searcher();
1981     UCollator* collator = usearch_getCollator(searcher);
1982
1983     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1984     if (ucol_getStrength(collator) != strength) {
1985         ucol_setStrength(collator, strength);
1986         usearch_reset(searcher);
1987     }
1988
1989     UErrorCode status = U_ZERO_ERROR;
1990     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1991     ASSERT(status == U_ZERO_ERROR);
1992
1993     // The kana workaround requires a normalized copy of the target string.
1994     if (m_targetRequiresKanaWorkaround)
1995         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1996 }
1997
1998 inline SearchBuffer::~SearchBuffer()
1999 {
2000     // Leave the static object pointing to a valid string.
2001     UErrorCode status = U_ZERO_ERROR;
2002     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
2003     ASSERT(status == U_ZERO_ERROR);
2004
2005     unlockSearcher();
2006 }
2007
2008 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2009 {
2010     ASSERT(length);
2011
2012     if (m_atBreak) {
2013         m_buffer.shrink(0);
2014         m_prefixLength = 0;
2015         m_atBreak = false;
2016     } else if (m_buffer.size() == m_buffer.capacity()) {
2017         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2018         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2019         m_buffer.shrink(m_overlap);
2020     }
2021
2022     size_t oldLength = m_buffer.size();
2023     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2024     ASSERT(usableLength);
2025     m_buffer.append(characters, usableLength);
2026     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2027     return usableLength;
2028 }
2029
2030 inline bool SearchBuffer::needsMoreContext() const
2031 {
2032     return m_needsMoreContext;
2033 }
2034
2035 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2036 {
2037     ASSERT(m_needsMoreContext);
2038     ASSERT(m_prefixLength == m_buffer.size());
2039
2040     if (!length)
2041         return;
2042
2043     m_atBreak = false;
2044
2045     size_t wordBoundaryContextStart = length;
2046     if (wordBoundaryContextStart) {
2047         U16_BACK_1(characters, 0, wordBoundaryContextStart);
2048         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2049     }
2050
2051     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2052     m_buffer.prepend(characters + length - usableLength, usableLength);
2053     m_prefixLength += usableLength;
2054
2055     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2056         m_needsMoreContext = false;
2057 }
2058
2059 inline bool SearchBuffer::atBreak() const
2060 {
2061     return m_atBreak;
2062 }
2063
2064 inline void SearchBuffer::reachedBreak()
2065 {
2066     m_atBreak = true;
2067 }
2068
2069 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2070 {
2071     // This function implements the kana workaround. If usearch treats
2072     // it as a match, but we do not want to, then it's a "bad match".
2073     if (!m_targetRequiresKanaWorkaround)
2074         return false;
2075
2076     // Normalize into a match buffer. We reuse a single buffer rather than
2077     // creating a new one each time.
2078     normalizeCharacters(match, matchLength, m_normalizedMatch);
2079
2080     const UChar* a = m_normalizedTarget.begin();
2081     const UChar* aEnd = m_normalizedTarget.end();
2082
2083     const UChar* b = m_normalizedMatch.begin();
2084     const UChar* bEnd = m_normalizedMatch.end();
2085
2086     while (true) {
2087         // Skip runs of non-kana-letter characters. This is necessary so we can
2088         // correctly handle strings where the target and match have different-length
2089         // runs of characters that match, while still double checking the correctness
2090         // of matches of kana letters with other kana letters.
2091         while (a != aEnd && !isKanaLetter(*a))
2092             ++a;
2093         while (b != bEnd && !isKanaLetter(*b))
2094             ++b;
2095
2096         // If we reached the end of either the target or the match, we should have
2097         // reached the end of both; both should have the same number of kana letters.
2098         if (a == aEnd || b == bEnd) {
2099             ASSERT(a == aEnd);
2100             ASSERT(b == bEnd);
2101             return false;
2102         }
2103
2104         // Check for differences in the kana letter character itself.
2105         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2106             return true;
2107         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2108             return true;
2109         ++a;
2110         ++b;
2111
2112         // Check for differences in combining voiced sound marks found after the letter.
2113         while (1) {
2114             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2115                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2116                     return true;
2117                 break;
2118             }
2119             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2120                 return true;
2121             if (*a != *b)
2122                 return true;
2123             ++a;
2124             ++b;
2125         }
2126     }
2127 }
2128
2129 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2130 {
2131     ASSERT(m_options & AtWordStarts);
2132
2133     if (!start)
2134         return true;
2135
2136     int size = m_buffer.size();
2137     int offset = start;
2138     UChar32 firstCharacter;
2139     U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2140
2141     if (m_options & TreatMedialCapitalAsWordStart) {
2142         UChar32 previousCharacter;
2143         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2144
2145         if (isSeparator(firstCharacter)) {
2146             // The start of a separator run is a word start (".org" in "webkit.org").
2147             if (!isSeparator(previousCharacter))
2148                 return true;
2149         } else if (isASCIIUpper(firstCharacter)) {
2150             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2151             if (!isASCIIUpper(previousCharacter))
2152                 return true;
2153             // The last character of an uppercase run followed by a non-separator, non-digit
2154             // is a word start ("Request" in "XMLHTTPRequest").
2155             offset = start;
2156             U16_FWD_1(m_buffer.data(), offset, size);
2157             UChar32 nextCharacter = 0;
2158             if (offset < size)
2159                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2160             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2161                 return true;
2162         } else if (isASCIIDigit(firstCharacter)) {
2163             // The start of a digit run is a word start ("2" in "WebKit2").
2164             if (!isASCIIDigit(previousCharacter))
2165                 return true;
2166         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2167             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2168             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2169             return true;
2170         }
2171     }
2172
2173     // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
2174     // a word, so treat the position before any CJK character as a word start.
2175     if (Font::isCJKIdeographOrSymbol(firstCharacter))
2176         return true;
2177
2178     size_t wordBreakSearchStart = start + length;
2179     while (wordBreakSearchStart > start)
2180         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2181     return wordBreakSearchStart == start;
2182 }
2183
2184 inline size_t SearchBuffer::search(size_t& start)
2185 {
2186     size_t size = m_buffer.size();
2187     if (m_atBreak) {
2188         if (!size)
2189             return 0;
2190     } else {
2191         if (size != m_buffer.capacity())
2192             return 0;
2193     }
2194
2195     UStringSearch* searcher = WebCore::searcher();
2196
2197     UErrorCode status = U_ZERO_ERROR;
2198     usearch_setText(searcher, m_buffer.data(), size, &status);
2199     ASSERT(status == U_ZERO_ERROR);
2200
2201     usearch_setOffset(searcher, m_prefixLength, &status);
2202     ASSERT(status == U_ZERO_ERROR);
2203
2204     int matchStart = usearch_next(searcher, &status);
2205     ASSERT(status == U_ZERO_ERROR);
2206
2207 nextMatch:
2208     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2209         ASSERT(matchStart == USEARCH_DONE);
2210         return 0;
2211     }
2212
2213     // Matches that start in the overlap area are only tentative.
2214     // The same match may appear later, matching more characters,
2215     // possibly including a combining character that's not yet in the buffer.
2216     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2217         size_t overlap = m_overlap;
2218         if (m_options & AtWordStarts) {
2219             // Ensure that there is sufficient context before matchStart the next time around for
2220             // determining if it is at a word boundary.
2221             int wordBoundaryContextStart = matchStart;
2222             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2223             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2224             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2225         }
2226         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2227         m_prefixLength -= min(m_prefixLength, size - overlap);
2228         m_buffer.shrink(overlap);
2229         return 0;
2230     }
2231
2232     size_t matchedLength = usearch_getMatchedLength(searcher);
2233     ASSERT(matchStart + matchedLength <= size);
2234
2235     // If this match is "bad", move on to the next match.
2236     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2237         matchStart = usearch_next(searcher, &status);
2238         ASSERT(status == U_ZERO_ERROR);
2239         goto nextMatch;
2240     }
2241
2242     size_t newSize = size - (matchStart + 1);
2243     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2244     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2245     m_buffer.shrink(newSize);
2246
2247     start = size - matchStart;
2248     return matchedLength;
2249 }
2250
2251 #else // !ICU_UNICODE
2252
2253 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2254     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2255     , m_options(options)
2256     , m_buffer(m_target.length())
2257     , m_isCharacterStartBuffer(m_target.length())
2258     , m_isBufferFull(false)
2259     , m_cursor(0)
2260 {
2261     ASSERT(!m_target.isEmpty());
2262     m_target.replace(noBreakSpace, ' ');
2263     foldQuoteMarksAndSoftHyphens(m_target);
2264 }
2265
2266 inline SearchBuffer::~SearchBuffer()
2267 {
2268 }
2269
2270 inline void SearchBuffer::reachedBreak()
2271 {
2272     m_cursor = 0;
2273     m_isBufferFull = false;
2274 }
2275
2276 inline bool SearchBuffer::atBreak() const
2277 {
2278     return !m_cursor && !m_isBufferFull;
2279 }
2280
2281 inline void SearchBuffer::append(UChar c, bool isStart)
2282 {
2283     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2284     m_isCharacterStartBuffer[m_cursor] = isStart;
2285     if (++m_cursor == m_target.length()) {
2286         m_cursor = 0;
2287         m_isBufferFull = true;
2288     }
2289 }
2290
2291 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2292 {
2293     ASSERT(length);
2294     if (!(m_options & CaseInsensitive)) {
2295         append(characters[0], true);
2296         return 1;
2297     }
2298     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2299     UChar foldedCharacters[maxFoldedCharacters];
2300     bool error;
2301     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2302     ASSERT(!error);
2303     ASSERT(numFoldedCharacters);
2304     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2305     if (!error && numFoldedCharacters) {
2306         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2307         append(foldedCharacters[0], true);
2308         for (int i = 1; i < numFoldedCharacters; ++i)
2309             append(foldedCharacters[i], false);
2310     }
2311     return 1;
2312 }
2313
2314 inline bool SearchBuffer::needsMoreContext() const
2315 {
2316     return false;
2317 }
2318
2319 void SearchBuffer::prependContext(const UChar*, size_t)
2320 {
2321     ASSERT_NOT_REACHED();
2322 }
2323
2324 inline size_t SearchBuffer::search(size_t& start)
2325 {
2326     if (!m_isBufferFull)
2327         return 0;
2328     if (!m_isCharacterStartBuffer[m_cursor])
2329         return 0;
2330
2331     size_t tailSpace = m_target.length() - m_cursor;
2332     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2333         return 0;
2334     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2335         return 0;
2336
2337     start = length();
2338
2339     // Now that we've found a match once, we don't want to find it again, because those
2340     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2341     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2342     // we want to get rid of that in the future we could track this with a separate boolean
2343     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2344     m_isCharacterStartBuffer[m_cursor] = false;
2345
2346     return start;
2347 }
2348
2349 // Returns the number of characters that were appended to the buffer (what we are searching in).
2350 // That's not necessarily the same length as the passed-in target string, because case folding
2351 // can make two strings match even though they're not the same length.
2352 size_t SearchBuffer::length() const
2353 {
2354     size_t bufferSize = m_target.length();
2355     size_t length = 0;
2356     for (size_t i = 0; i < bufferSize; ++i)
2357         length += m_isCharacterStartBuffer[i];
2358     return length;
2359 }
2360
2361 #endif // !ICU_UNICODE
2362
2363 // --------
2364
2365 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2366 {
2367     int length = 0;
2368     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2369         length += it.length();
2370
2371     return length;
2372 }
2373
2374 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2375 {
2376     CharacterIterator entireRangeIterator(entireRange);
2377     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2378 }
2379
2380 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2381 {
2382     RefPtr<Range> resultRange = scope->document()->createRange();
2383
2384     int docTextPosition = 0;
2385     int rangeEnd = rangeLocation + rangeLength;
2386     bool startRangeFound = false;
2387
2388     RefPtr<Range> textRunRange;
2389
2390     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2391
2392     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2393     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2394         textRunRange = it.range();
2395
2396         ExceptionCode ec = 0;
2397         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2398         ASSERT(!ec);
2399         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2400         ASSERT(!ec);
2401
2402         return resultRange.release();
2403     }
2404
2405     for (; !it.atEnd(); it.advance()) {
2406         int len = it.length();
2407         textRunRange = it.range();
2408
2409         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2410         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2411
2412         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2413         // in those cases that textRunRange is used.
2414         if (foundEnd) {
2415             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2416             // position for emitted '\n's.
2417             if (len == 1 && it.characters()[0] == '\n') {
2418                 scope->document()->updateLayoutIgnorePendingStylesheets();
2419                 it.advance();
2420                 if (!it.atEnd()) {
2421                     RefPtr<Range> range = it.range();
2422                     ExceptionCode ec = 0;
2423                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2424                     ASSERT(!ec);
2425                 } else {
2426                     Position runStart = textRunRange->startPosition();
2427                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2428                     if (runEnd.isNotNull()) {
2429                         ExceptionCode ec = 0;
2430                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2431                         ASSERT(!ec);
2432                     }
2433                 }
2434             }
2435         }
2436
2437         if (foundStart) {
2438             startRangeFound = true;
2439             int exception = 0;
2440             if (textRunRange->startContainer()->isTextNode()) {
2441                 int offset = rangeLocation - docTextPosition;
2442                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2443             } else {
2444                 if (rangeLocation == docTextPosition)
2445                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2446                 else
2447                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2448             }
2449         }
2450
2451         if (foundEnd) {
2452             int exception = 0;
2453             if (textRunRange->startContainer()->isTextNode()) {
2454                 int offset = rangeEnd - docTextPosition;
2455                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2456             } else {
2457                 if (rangeEnd == docTextPosition)
2458                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2459                 else
2460                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2461             }
2462             docTextPosition += len;
2463             break;
2464         }
2465         docTextPosition += len;
2466     }
2467
2468     if (!startRangeFound)
2469         return 0;
2470
2471     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2472         int exception = 0;
2473         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2474     }
2475
2476     return resultRange.release();
2477 }
2478
2479 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2480 {
2481     location = notFound;
2482     length = 0;
2483
2484     if (!range->startContainer())
2485         return false;
2486
2487     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2488     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2489
2490     // The critical assumption is that this only gets called with ranges that
2491     // concentrate on a given area containing the selection root. This is done
2492     // because of text fields and textareas. The DOM for those is not
2493     // directly in the document DOM, so ensure that the range does not cross a
2494     // boundary of one of those.
2495     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2496         return false;
2497     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2498         return false;
2499
2500     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2501     ASSERT(testRange->startContainer() == scope);
2502     location = TextIterator::rangeLength(testRange.get());
2503
2504     ExceptionCode ec;
2505     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2506     ASSERT(testRange->startContainer() == scope);
2507     length = TextIterator::rangeLength(testRange.get()) - location;
2508     return true;
2509 }
2510
2511 // --------
2512
2513 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2514 {
2515     UChar* result = 0;
2516
2517     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2518     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2519     static const unsigned cMaxSegmentSize = 1 << 16;
2520     bufferLength = 0;
2521     typedef pair<UChar*, unsigned> TextSegment;
2522     OwnPtr<Vector<TextSegment> > textSegments;
2523     Vector<UChar> textBuffer;
2524     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2525     TextIteratorBehavior behavior = defaultBehavior;
2526     if (!isDisplayString)
2527         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2528
2529     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2530         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2531             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2532             if (!newSegmentBuffer)
2533                 goto exit;
2534             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2535             if (!textSegments)
2536                 textSegments = adoptPtr(new Vector<TextSegment>);
2537             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2538             textBuffer.clear();
2539         }
2540         textBuffer.append(it.characters(), it.length());
2541         bufferLength += it.length();
2542     }
2543
2544     if (!bufferLength)
2545         return 0;
2546
2547     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2548     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2549     if (!result)
2550         goto exit;
2551
2552     {
2553         UChar* resultPos = result;
2554         if (textSegments) {
2555             unsigned size = textSegments->size();
2556             for (unsigned i = 0; i < size; ++i) {
2557                 const TextSegment& segment = textSegments->at(i);
2558                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2559                 resultPos += segment.second;
2560             }
2561         }
2562         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2563     }
2564
2565 exit:
2566     if (textSegments) {
2567         unsigned size = textSegments->size();
2568         for (unsigned i = 0; i < size; ++i)
2569             free(textSegments->at(i).first);
2570     }
2571
2572     if (isDisplayString && r->ownerDocument())
2573         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2574
2575     return result;
2576 }
2577
2578 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2579 {
2580     unsigned length;
2581     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2582     if (!buf)
2583         return "";
2584     String result(buf, length);
2585     free(buf);
2586     return result;
2587 }
2588
2589 static inline bool isAllCollapsibleWhitespace(const String& string)
2590 {
2591     const UChar* characters = string.characters();
2592     unsigned length = string.length();
2593     for (unsigned i = 0; i < length; ++i) {
2594         if (!isCollapsibleWhitespace(characters[i]))
2595             return false;
2596     }
2597     return true;
2598 }
2599
2600 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2601 {
2602     ExceptionCode ec = 0;
2603     RefPtr<Range> result = range->cloneRange(ec);
2604     ASSERT(!ec);
2605     result->collapse(!forward, ec);
2606     ASSERT(!ec);
2607     return result.release();
2608 }
2609
2610 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2611 {
2612     matchStart = 0;
2613     size_t matchLength = 0;
2614
2615     SearchBuffer buffer(target, options);
2616
2617     if (buffer.needsMoreContext()) {
2618         RefPtr<Range> startRange = it.range();
2619         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2620         ExceptionCode ec = 0;
2621         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2622         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2623             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2624             if (!buffer.needsMoreContext())
2625                 break;
2626         }
2627     }
2628
2629     while (!it.atEnd()) {
2630         it.advance(buffer.append(it.characters(), it.length()));
2631 tryAgain:
2632         size_t matchStartOffset;
2633         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2634             // Note that we found a match, and where we found it.
2635             size_t lastCharacterInBufferOffset = it.characterOffset();
2636             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2637             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2638             matchLength = newMatchLength;
2639             // If searching forward, stop on the first match.
2640             // If searching backward, don't stop, so we end up with the last match.
2641             if (!(options & Backwards))
2642                 break;
2643             goto tryAgain;
2644         }
2645         if (it.atBreak() && !buffer.atBreak()) {
2646             buffer.reachedBreak();
2647             goto tryAgain;
2648         }
2649     }
2650
2651     return matchLength;
2652 }
2653
2654 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2655 {
2656     // CharacterIterator requires renderers to be up-to-date
2657     range->ownerDocument()->updateLayout();
2658
2659     // First, find the text.
2660     size_t matchStart;
2661     size_t matchLength;
2662     {
2663         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2664         matchLength = findPlainText(findIterator, target, options, matchStart);
2665         if (!matchLength)
2666             return collapsedToBoundary(range, !(options & Backwards));
2667     }
2668
2669     // Then, find the document position of the start and the end of the text.
2670     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2671     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2672 }
2673
2674 }