empresalibre.net/source/vendor/antoligy/dom-string-iterators/src/DOMWordsIterator.php

118 lines
3.4 KiB
PHP

<?php
/**
* Iterates individual words of DOM text and CDATA nodes
* while keeping track of their position in the document.
*
* Example:
*
* $doc = new DOMDocument();
* $doc->load('example.xml');
* foreach(new DOMWordsIterator($doc) as $word) echo $word;
*
* @author pjgalbraith http://www.pjgalbraith.com
* @author porneL http://pornel.net (based on DOMLettersIterator available at http://pornel.net/source/domlettersiterator.php)
* @license Public Domain
*
*/
final class DOMWordsIterator implements Iterator
{
    /** @var DOMElement|null Root of the subtree being iterated (null for a document without a root element). */
    private $start;

    /** @var DOMNode|null Node the iterator is positioned on; null before rewind() and once iteration has finished. */
    private $current;

    /** @var int Index of the current word inside $words, or -1 while no text node has been split yet. */
    private $offset = -1;

    /** @var int Running count of the words yielded since the last rewind(); the first word has key 1. */
    private $key = 0;

    /** @var array Words of the current text node as array(word, byte offset) pairs. */
    private $words = array();

    /**
     * Expects DOMElement or DOMDocument (see DOMDocument::load and DOMDocument::loadHTML).
     *
     * For a DOMDocument the iteration covers its document element; for a
     * DOMElement it covers that element's subtree only.
     *
     * @param DOMNode $el
     * @throws InvalidArgumentException when $el is neither a DOMDocument nor a DOMElement
     */
    public function __construct(DOMNode $el)
    {
        if ($el instanceof DOMDocument) {
            $this->start = $el->documentElement;
        } elseif ($el instanceof DOMElement) {
            $this->start = $el;
        } else {
            throw new InvalidArgumentException("Invalid arguments, expected DOMElement or DOMDocument");
        }
    }

    /**
     * Returns the position of the current word inside the text node being read.
     *
     * The result is array($node, $index, $words):
     *  - $node  is the text/CDATA node holding the current word (NULL if the iterator has finished),
     *  - $index is the zero-based index of the current word within that node,
     *  - $words is the list produced by preg_split() for that node; every entry is
     *    array($word, $byteOffset). The offset is a BYTE offset into the node's
     *    textContent, so use substr()-style functions (not mb_substr()) with it.
     *
     * @return array
     */
    public function currentWordPosition()
    {
        return array($this->current, $this->offset, $this->words);
    }

    /**
     * Returns the element that contains the text node currently being iterated,
     * or NULL if the iterator has finished.
     *
     * @return DOMElement|null
     */
    public function currentElement()
    {
        return $this->current ? $this->current->parentNode : null;
    }

    // Implementation of Iterator interface.
    // The #[\ReturnTypeWillChange] attributes silence the PHP 8.1+ deprecation
    // notices for untyped Iterator methods; older PHP versions read them as comments.

    /**
     * Returns the number of the current word, counted from 1 since the last rewind().
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->key;
    }

    /**
     * Advances to the next word in document order.
     *
     * Text and CDATA nodes are split on runs of space, tab, CR and LF. Nodes
     * that contain no words are skipped. Once the start subtree is exhausted
     * the iterator becomes invalid.
     *
     * @return void
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        // A loop (rather than recursion) keeps the call stack flat no matter
        // how many wordless nodes lie between two words.
        while ($this->current) {
            if ($this->isTextNode($this->current)) {
                if ($this->offset == -1) {
                    // First visit of this node: split it once and cache the words.
                    $this->words = preg_split("/[\n\r\t ]+/", $this->current->textContent, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);
                }
                $this->offset++;
                if ($this->offset < count($this->words)) {
                    $this->key++;
                    return;
                }
                // Node exhausted (or it had no words at all): move on.
                $this->offset = -1;
            }

            // Descend to the deepest first child; stop as soon as a text node is reached.
            while ($this->current->nodeType === XML_ELEMENT_NODE && $this->current->firstChild) {
                $this->current = $this->current->firstChild;
                if ($this->isTextNode($this->current)) {
                    continue 2;
                }
            }

            // Nothing (more) below this node: go to the following node, if any.
            $this->current = $this->followingNode($this->current);
        }
    }

    /**
     * Returns the current word, or NULL if the iterator has finished.
     *
     * @return string|null
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        if ($this->current) {
            return $this->words[$this->offset][0];
        }
        return null;
    }

    /**
     * Tells whether the iterator is positioned on a word.
     *
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return (bool) $this->current;
    }

    /**
     * Restarts the iteration at the first word of the start subtree.
     *
     * @return void
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->offset = -1;
        $this->words = array();
        // Reset the counter so that every pass numbers its words identically.
        $this->key = 0;
        $this->current = $this->start;
        $this->next();
    }

    /**
     * Tells whether the node is a text or CDATA section node.
     *
     * @param DOMNode $node
     * @return bool
     */
    private function isTextNode(DOMNode $node)
    {
        return $node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE;
    }

    /**
     * Finds the node that follows $node in document order (its next sibling, or
     * the next sibling of its closest ancestor that has one) without ever
     * leaving the start subtree.
     *
     * @param DOMNode $node
     * @return DOMNode|null NULL when the start subtree has been fully traversed
     */
    private function followingNode(DOMNode $node)
    {
        // The start node is checked BEFORE looking at siblings so that a childless
        // start element cannot leak into its own siblings or its ancestors' siblings.
        while ($node !== $this->start) {
            if ($node->nextSibling) {
                return $node->nextSibling;
            }
            $node = $node->parentNode;
            if (!$node) {
                return null;
            }
        }
        return null;
    }
}