PhutilHTMLParser.php
No OneTemporary
Actions

Size

11 KB

Referenced Files

None

Subscribers

None

PhutilHTMLParser.php
View Options

	<?php

	final class PhutilHTMLParser extends Phobject {

	private $cursor;

	public function parseDocument($corpus) {
	// Divide the block into sequences of "tag" and "non-tag" content. Tag
	// content is anything between angle brackets ("<" and ">"). Non-tag
	// content is anything else.

	$segment_pos = 0;
	$segments = array();
	$in_tag = false;

	for ($ii = 0; $ii < strlen($corpus); $ii++) {
	$c = $corpus[$ii];

	if ($in_tag && ($c === '>')) {
	if ($segment_pos !== null) {
	$segments[] = array(
	'tag' => $in_tag,
	'pos' => $segment_pos,
	'end' => $ii + 1,
	);
	}

	$segment_pos = $ii + 1;
	$in_tag = false;
	continue;
	}

	// When we encounter a "<", we start a new tag whether we're already in
	// a tag or not. We want to parse "<x>1 < 2</x>" as a single tag with
	// the content "1 < 2".

	if ($c === '<') {
	$segments[] = array(
	'tag' => false,
	'pos' => $segment_pos,
	'end' => $ii,
	);

	$segment_pos = $ii;
	$in_tag = true;
	continue;
	}
	}

	// Add whatever content was left at the end of the string. If we were in
	// a tag but did not find a closing ">", we treat this as normal content.
	$segments[] = array(
	'tag' => false,
	'pos' => $segment_pos,
	'end' => $ii,
	);

	// Slice the marked segments out of the raw corpus so we get a list of
	// "tag" strings and a list of "non-tag" strings.

	$parts = array();
	$corpus_length = strlen($corpus);
	foreach ($segments as $segment) {
	$tag = $segment['tag'];
	$pos = $segment['pos'];
	$len = $segment['end'] - $pos;

	// If this is a tag, we'll drop the "<" at the beginning and the ">"
	// at the end here.
	if ($tag) {
	$slice_pos = $pos + 1;
	$slice_len = $len - 2;
	} else {
	$slice_pos = $pos;
	$slice_len = $len;
	}

	if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
	$content = substr($corpus, $slice_pos, $slice_len);
	} else {
	$content = '';
	}

	$parts[] = array(
	'tag' => $tag,
	'pos' => $pos,
	'len' => $len,
	'content' => $content,
	);
	}

	$root = new PhutilDOMNode();
	$this->setCursor($root);

	foreach ($parts as $part) {
	$tag = $this->newTagDOMNode($part);

	if ($tag !== null) {
	continue;
	}

	$content = $part['content'];

	// If this part is a tag, restore the angle brackets.
	if ($part['tag']) {
	$content = '<'.$content.'>';
	}

	$node = id(new PhutilDOMNode())
	->setContent($content)
	->setRawHead($content);

	$this->getCursor()->appendChild($node);
	}

	$root->mergeContent();

	return $root;
	}

	private function newTagDOMNode(array $part) {
	if (!$part['tag']) {
	return null;
	}

	$raw_content = $part['content'];
	$content = $raw_content;

	$content = trim($content);
	$content_len = strlen($content);

	// If the tag content begins with "/", like "</td>", strip the slash
	// off and mark this as a closing tag.
	$is_close = false;
	if ($content_len > 0 && $content[0] === '/') {
	$is_close = true;
	$content = substr($content, 1);
	$content = trim($content);
	$content_len = strlen($content);
	}

	// If the tag content ends with "/", like "<td />", strip the slash off
	// and mark this as self-closing.
	$self_close = false;
	if ($content_len > 0 && $content[$content_len - 1] === '/') {
	$self_close = true;
	$content = substr($content, 0, $content_len - 1);
	$content = trim($content);
	$content_len = strlen($content);
	}

	// If this tag is both a closing tag and a self-closing tag, it is
	// not formatted correctly. Treat it as content.
	if ($self_close && $is_close) {
	return null;
	}

	// Now, split the rest of the tag into the tag name and tag attributes.
	$pieces = preg_split('/\s+/', $content, 2);
	$tag_name = $pieces[0];

	if (count($pieces) > 1) {
	$attributes = $pieces[1];
	} else {
	$attributes = '';
	}

	// If there's no tag name, this tag is not valid. Treat it as content.
	if (!strlen($tag_name)) {
	return null;
	}

	// If this is a closing tag with attributes, it's not valid. Treat it
	// as content.
	if ($is_close && strlen($attributes)) {
	return null;
	}

	$tag_name = phutil_utf8_strtolower($tag_name);

	// If we find a valid closing tag, try to find a matching tag on the stack.
	// If we find a matching tag, close it.
	// If we do not find a matching tag, treat the closing tag as content.
	if ($is_close) {
	$cursor = $this->getCursor();

	while ($cursor) {
	if ($cursor->getTagName() === $tag_name) {
	// Add this raw content to the raw content of the tag we're closing.
	$cursor->setRawTail('<'.$raw_content.'>');

	$parent = $cursor->getParentNode();
	$this->setCursor($parent);

	return true;
	}
	$cursor = $cursor->getParentNode();
	}

	return null;
	}

	if (strlen($attributes)) {
	$attribute_map = $this->parseAttributes($attributes);
	// If the attributes can't be parsed, treat the tag as content.
	if ($attribute_map === null) {
	return null;
	}
	} else {
	$attribute_map = array();
	}

	$node = id(new PhutilDOMNode())
	->setTagName($tag_name)
	->setAttributes($attribute_map)
	->setRawHead('<'.$raw_content.'>');

	$cursor = $this->getCursor();
	$cursor->appendChild($node);

	if (!$self_close) {
	$this->setCursor($node);
	}

	return $node;
	}

	private function setCursor(PhutilDOMNode $cursor) {
	$this->cursor = $cursor;
	return $this;
	}

	private function getCursor() {
	return $this->cursor;
	}

	private function parseAttributes($attributes) {
	$state = 'key';

	$whitespace = array(
	' ' => true,
	"\n" => true,
	"\t" => true,
	"\r" => true,
	);

	$map = array();
	$len = strlen($attributes);
	$key_pos = null;
	for ($ii = 0; $ii < $len; $ii++) {
	$c = $attributes[$ii];
	$is_space = isset($whitespace[$c]);

	switch ($state) {
	case 'key':
	// We're looking for the start of an attribute name.

	// Skip over any whitespace.
	if ($is_space) {
	break;
	}

	// If we see "<tag =...", that isn't valid. Treat this tag as
	// content.
	if ($c === '=') {
	return null;
	}

	// If we see a quotation mark with no attribute name, that isn't
	// valid. Treat this tag as content.
	if ($c === '"') {
	return null;
	}

	// Any other character marks the beginning of an attribute name.
	// Switch the parser state to "name" to parse the name.
	$name_pos = $ii;
	$state = 'name';
	break;
	case 'name':
	// We're looking for the end of an attribute name.

	// Finding a "=" or a space character ends the attribute name.
	// Save it, then figure out what to do with the parser state.
	if ($c === '=' \|\| $is_space) {
	$name_value = substr($attributes, $name_pos, $ii - $name_pos);
	$name_value = phutil_utf8_strtolower($name_value);

	// If this attribute already exists, the tag is invalid. This means
	// the input is something like "<tag a=1 a=2>".
	if (isset($map[$name_value])) {
	return null;
	}
	}

	// If we find an "=", that's the end of the name. Next, we're going
	// to parse a value.
	if ($c === '=') {
	$state = 'value';
	break;
	}

	// If we find whitespace, that's the end of the name. We're going
	// to look for an "=".
	if ($is_space) {
	$state = 'equals';
	break;
	}

	break;
	case 'equals':
	// We've parsed the name of an attribute and are looking for an
	// "=" character.

	// Skip over any whitespace.
	if ($is_space) {
	break;
	}

	// This is the "=" we're looking for, so we're good to go.
	if ($c === '=') {
	$state = 'value';
	break;
	}

	// If this is anything else, this is an attribute name with no
	// value. Treat it as "true" and move on. This corresponds to an
	// input like "<input disabled>".
	$map[$name_value] = true;
	$name_pos = $ii;
	$state = 'name';
	break;
	case 'value':
	// We've parsed an "=" and are looking for the start of a value.

	// Skip over any whitespace.
	if ($is_space) {
	break;
	}

	// Don't accept "<tag a==" to mean that key "a" has a value of
	// "=", since this is silly. To specify a value beginning with "=",
	// you have to quote it.
	if ($c === '=') {
	return null;
	}

	// Anything else is a value.
	$value_pos = $ii;

	// This is a quotation mark, so parse a quoted value.
	if ($c === '"') {
	$value_pos = $value_pos + 1;
	$state = 'quoted';
	} else {
	$state = 'unquoted';
	}
	break;
	case 'quoted':
	// We've started parsing a quoted value, so look for the closing
	// quote.

	// We found the closing quote, so pull out the actual value.
	if ($c === '"') {
	$attr_value = substr($attributes, $value_pos, $ii - $value_pos);

	$map[$name_value] = $attr_value;
	$state = 'key';
	break;
	}

	// Anything else is more text in the quoted value.
	break;
	case 'unquoted':
	// We've started parsing an unquoted value, so look for terminating
	// whitespace.

	// We've found some whitespace, so pull out the actual value.
	if ($is_space) {
	$attr_value = substr($attributes, $value_pos, $ii - $value_pos);

	$map[$name_value] = $attr_value;
	$state = 'key';
	break;
	}

	// Anything else is more text in the unquoted value.
	break;
	}
	}

	switch ($state) {
	case 'key':
	// We were looking for the start of an attribute name, so there's
	// nothing to clean up.
	break;
	case 'name':
	// We were looking for the end of an attribute name. Treat whatever
	// we found as a name.
	$name_value = substr($attributes, $name_pos, $len - $name_pos);

	if (isset($map[$name_value])) {
	return null;
	}

	$map[$name_value] = true;
	break;
	case 'equals':
	case 'value':
	// We found an attribute name followed by whitespace or an "=". Treat
	// whatever we found as a valid attribute name with no value.

	if (isset($map[$name_value])) {
	return null;
	}

	$map[$name_value] = true;
	break;
	case 'quoted':
	case 'unquoted':
	// We were parsing a value but ran out of characters before we found
	// the delimiter or closing quote. Treat whatever we found as a quoted
	// value.

	$attr_value = substr($attributes, $value_pos, $len - $name_pos);

	$map[$name_value] = $attr_value;
	break;
	}

	return $map;
	}

	}

File Metadata

Mime Type: text/x-php
Expires: Mon, Mar 24, 04:05 (3 w, 2 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1117754
Default Alt Text: PhutilHTMLParser.php (11 KB)

PhutilHTMLParser.phpNo OneTemporaryActions

PhutilHTMLParser.phpView Options

File Metadata

Event Timeline

PhutilHTMLParser.php
No OneTemporary
Actions

PhutilHTMLParser.php
View Options