diff options
Diffstat (limited to 'engine/classes/ElggAutoP.php')
| -rw-r--r-- | engine/classes/ElggAutoP.php | 309 | 
1 files changed, 309 insertions, 0 deletions
| diff --git a/engine/classes/ElggAutoP.php b/engine/classes/ElggAutoP.php new file mode 100644 index 000000000..89d77e583 --- /dev/null +++ b/engine/classes/ElggAutoP.php @@ -0,0 +1,309 @@ +<?php + +/** + * Create wrapper P and BR elements in HTML depending on newlines. Useful when + * users use newlines to signal line and paragraph breaks. In all cases output + * should be well-formed markup. + * + * In DIV elements, Ps are only added when there would be at + * least two of them. + */ +class ElggAutoP { + +	public $encoding = 'UTF-8'; + +	/** +	 * @var DOMDocument +	 */ +	protected $_doc = null; + +	/** +	 * @var DOMXPath +	 */ +	protected $_xpath = null; + +	protected $_blocks = 'address article area aside blockquote caption col colgroup dd  +		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header  +		hr hgroup legend map math menu nav noscript p pre section select style summary +		table tbody td tfoot th thead tr ul ol option li'; + +	/** +	 * @var array +	 */ +	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist +		del dfn em embed i iframe img input ins kbd keygen label map mark meter object +		output progress q rp rt ruby s samp script select small source span strong style +		sub sup textarea time var video wbr'; + +	/** +	 * Descend into these elements to add Ps +	 * +	 * @var array +	 */ +	protected $_descendList = 'article aside blockquote body details div footer form +		header section'; + +	/** +	 * Add Ps inside these elements +	 * +	 * @var array +	 */ +	protected $_alterList = 'article aside blockquote body details div footer header +		section'; + +	protected $_unique = ''; + +	public function __construct() { +		$this->_blocks = preg_split('@\\s+@', $this->_blocks); +		$this->_descendList = preg_split('@\\s+@', $this->_descendList); +		$this->_alterList = preg_split('@\\s+@', $this->_alterList); +		$this->_inlines = preg_split('@\\s+@', $this->_inlines); +		$this->_unique = md5(__FILE__); +	} + +	/** +	 * Intance of class for singleton pattern. +	 * @var ElggAutoP +	 */ +	private static $instance; +	 +	/** +	 * Singleton pattern. +	 * @return ElggAutoP +	 */ +	public static function getInstance() { +		$className = __CLASS__; +		if (!(self::$instance instanceof $className)) { +			self::$instance = new $className(); +		} +		return self::$instance; +	} +	 +	/** +	 * Create wrapper P and BR elements in HTML depending on newlines. Useful when +	 * users use newlines to signal line and paragraph breaks. In all cases output +	 * should be well-formed markup. +	 * +	 * In DIV, LI, TD, and TH elements, Ps are only added when their would be at +	 * least two of them. +	 * +	 * @param string $html snippet +	 * @return string|false output or false if parse error occurred +	 */ +	public function process($html) { +		// normalize whitespace +		$html = str_replace(array("\r\n", "\r"), "\n", $html); + +		// allows preserving entities untouched +		$html = str_replace('&', $this->_unique . 'AMP', $html); + +		$this->_doc = new DOMDocument(); +	    +		// parse to DOM, suppressing loadHTML warnings +		// http://www.php.net/manual/en/domdocument.loadhtml.php#95463 +		libxml_use_internal_errors(true); + +		if (!$this->_doc->loadHTML("<html><meta http-equiv='content-type' "  +				. "content='text/html; charset={$this->encoding}'><body>{$html}</body>" +				. "</html>")) { +			return false; +		} + +		$this->_xpath = new DOMXPath($this->_doc); +		// start processing recursively at the BODY element +		$nodeList = $this->_xpath->query('//body[1]'); +		$this->_addParagraphs($nodeList->item(0)); + +		// serialize back to HTML +		$html = $this->_doc->saveHTML(); + +		// split AUTOPs into multiples at /\n\n+/ +		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html); +		$html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),  +				'<br />', +				$html); +		$html = str_replace('<br /></autop>', '</autop>', $html); + +		// re-parse so we can handle new AUTOP elements + +		if (!$this->_doc->loadHTML($html)) { +			return false; +		} +		// must re-create XPath object after DOM load +		$this->_xpath = new DOMXPath($this->_doc); + +		// strip AUTOPs that only have comments/whitespace +		foreach ($this->_xpath->query('//autop') as $autop) { +			$hasContent = false; +			if (trim($autop->textContent) !== '') { +				$hasContent = true; +			} else { +				foreach ($autop->childNodes as $node) { +					if ($node->nodeType === XML_ELEMENT_NODE) { +						$hasContent = true; +						break; +					} +				} +			} +			if (!$hasContent) { +				// strip w/ preg_replace later (faster than moving nodes out) +				$autop->setAttribute("r", "1"); +			} +		} + +		// remove a single AUTOP inside certain elements +		foreach ($this->_xpath->query('//div') as $el) { +			$autops = $this->_xpath->query('./autop', $el); +			if ($autops->length === 1) { +				// strip w/ preg_replace later (faster than moving nodes out) +				$autops->item(0)->setAttribute("r", "1"); +			} +		} + +		$html = $this->_doc->saveHTML(); + +		// trim to the contents of BODY +		$bodyStart = strpos($html, '<body>'); +		$bodyEnd = strpos($html, '</body>', $bodyStart + 6); +		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6); +		 +		// strip AUTOPs that should be removed +		$html = preg_replace('@<autop r="1">(.*?)</autop>@', '\\1', $html); + +		// commit to converting AUTOPs to Ps +		$html = str_replace('<autop>', "\n<p>", $html); +		$html = str_replace('</autop>', "</p>\n", $html); +		 +		$html = str_replace('<br>', '<br />', $html); +		$html = str_replace($this->_unique . 'AMP', '&', $html); +		return $html; +	} + +	/** +	 * Add P and BR elements as necessary +	 * +	 * @param DOMElement $el +	 */ +	protected function _addParagraphs(DOMElement $el) { +		// no need to recurse, just queue up +		$elsToProcess = array($el); +		$inlinesToProcess = array(); +		while ($el = array_shift($elsToProcess)) { +			// if true, we can alter all child nodes, if not, we'll just call +			// _addParagraphs on each element in the descendInto list +			$alterInline = in_array($el->nodeName, $this->_alterList); + +			// inside affected elements, we want to trim leading whitespace from +			// the first text node +			$ltrimFirstTextNode = true; + +			// should we open a new AUTOP element to move inline elements into? +			$openP = true; +			$autop = null; + +			// after BR, ignore a newline +			$isFollowingBr = false; + +			$node = $el->firstChild; +			while (null !== $node) { +				if ($alterInline) { +					if ($openP) { +						$openP = false; +						// create a P to move inline content into (this may be removed later) +						$autop = $el->insertBefore($this->_doc->createElement('autop'), $node); +					} +				} + +				$isElement = ($node->nodeType === XML_ELEMENT_NODE); +				if ($isElement) { +					$elName = $node->nodeName; +				} +				$isBlock = ($isElement && in_array($elName, $this->_blocks)); + +				if ($alterInline) { +					$isInline = $isElement && ! $isBlock; +					$isText = ($node->nodeType === XML_TEXT_NODE); +					$isLastInline = (! $node->nextSibling +								   || ($node->nextSibling->nodeType === XML_ELEMENT_NODE +									   && in_array($node->nextSibling->nodeName, $this->_blocks))); +					if ($isElement) { +						$isFollowingBr = ($node->nodeName === 'br'); +					} + +					if ($isText) { +						$nodeText = $node->nodeValue; +						if ($ltrimFirstTextNode) { +							$nodeText = ltrim($nodeText); +							$ltrimFirstTextNode = false; +						} +						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) { +							// if a user ends a line with <br>, don't add a second BR +							$nodeText = substr($nodeText, strlen($m[0])); +						} +						if ($isLastInline) { +							$nodeText = rtrim($nodeText); +						} +						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText); +						$tmpNode = $node; +						$node = $node->nextSibling; // move loop to next node + +						// alter node in place, then move into AUTOP +						$tmpNode->nodeValue = $nodeText; +						$autop->appendChild($tmpNode); + +						continue; +					} +				} +				if ($isBlock || ! $node->nextSibling) { +					if ($isBlock) { +						if (in_array($node->nodeName, $this->_descendList)) { +							$elsToProcess[] = $node; +							//$this->_addParagraphs($node); +						} +					} +					$openP = true; +					$ltrimFirstTextNode = true; +				} +				if ($alterInline) { +					if (! $isBlock) { +						$tmpNode = $node; +						if ($isElement && false !== strpos($tmpNode->textContent, "\n")) { +							$inlinesToProcess[] = $tmpNode; +						} +						$node = $node->nextSibling; +						$autop->appendChild($tmpNode); +						continue; +					} +				} + +				$node = $node->nextSibling; +			} +		} + +		// handle inline nodes +		// no need to recurse, just queue up +		while ($el = array_shift($inlinesToProcess)) { +			$ignoreLeadingNewline = false; +			foreach ($el->childNodes as $node) { +				if ($node->nodeType === XML_ELEMENT_NODE) { +					if ($node->nodeValue === 'BR') { +						$ignoreLeadingNewline = true; +					} else { +						$ignoreLeadingNewline = false; +						if (false !== strpos($node->textContent, "\n")) { +							$inlinesToProcess[] = $node; +						} +					} +					continue; +				} elseif ($node->nodeType === XML_TEXT_NODE) { +					$text = $node->nodeValue; +					if ($text[0] === "\n" && $ignoreLeadingNewline) { +						$text = substr($text, 1); +						$ignoreLeadingNewline = false; +					} +					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text); +				} +			} +		} +	} +} | 
