W3cubDocs

/WordPress

WP_HTML_Processor::step_in_body(): bool

Parses next element in the ‘in body’ insertion mode.

Description

This internal function performs the ‘in body’ insertion mode logic for the generalized WP_HTML_Processor::step() function.

See also

https://html.spec.whatwg.org/#parsing-main-inbody
WP_HTML_Processor::step

Return

bool Whether an element was found.

Source

				'#text' === $token_name
			)
		) ||
		(
			'math' === $adjusted_current_node->namespace &&
			'ANNOTATION-XML' === $adjusted_current_node->node_name &&
			$is_start_tag && 'SVG' === $token_name
		) ||
		(
			'html' === $adjusted_current_node->integration_node_type &&
			( $is_start_tag || '#text' === $token_name )
		)
	);

	try {
		if ( ! $parse_in_current_insertion_mode ) {
			return $this->step_in_foreign_content();
		}

		switch ( $this->state->insertion_mode ) {
			case WP_HTML_Processor_State::INSERTION_MODE_INITIAL:
				return $this->step_initial();

			case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML:
				return $this->step_before_html();

			case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD:
				return $this->step_before_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD:
				return $this->step_in_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT:
				return $this->step_in_head_noscript();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD:
				return $this->step_after_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
				return $this->step_in_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE:
				return $this->step_in_table();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT:
				return $this->step_in_table_text();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION:
				return $this->step_in_caption();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP:
				return $this->step_in_column_group();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY:
				return $this->step_in_table_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW:
				return $this->step_in_row();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL:
				return $this->step_in_cell();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT:
				return $this->step_in_select();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE:
				return $this->step_in_select_in_table();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE:
				return $this->step_in_template();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY:
				return $this->step_after_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET:
				return $this->step_in_frameset();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET:
				return $this->step_after_frameset();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY:
				return $this->step_after_after_body();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET:
				return $this->step_after_after_frameset();

			// This should be unreachable but PHP doesn't have total type checking on switch.
			default:
				$this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." );
		}
	} catch ( WP_HTML_Unsupported_Exception $e ) {
		/*
		 * Exceptions are used in this class to escape deep call stacks that
		 * otherwise might involve messier calling and return conventions.
		 */
		return false;
	}
}

/**
 * Reports the chain of open elements that leads to the currently-matched node.
 *
 * The list is ordered from the outermost ancestor inward: its first entry is the
 * root HTML node and its last entry is the matched node itself.
 *
 * Example
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
 *     $processor->next_tag( 'IMG' );
 *     $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
 *
 * @todo A generator-based variant could avoid copying the array on every tag
 *       iteration. Such a variant would probably be most useful if it yielded
 *       from the matched node upward rather than from the root downward.
 *
 * @since 6.4.0
 *
 * @return string[]|null Tag names forming the path to the matched node, or NULL when nothing is matched.
 */
public function get_breadcrumbs(): ?array {
	$path_to_match = $this->breadcrumbs;

	return $path_to_match;
}

/**
 * Reports how deeply nested the current location in the document is.
 *
 * The depth is the number of entries in the processor's breadcrumbs.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div><p></p></div>' );
 *     // A fragment parser begins inside BODY, so the depth already counts HTML > BODY.
 *     2 === $processor->get_current_depth();
 *
 *     // Entering the DIV adds one level.
 *     $processor->next_token();
 *     3 === $processor->get_current_depth();
 *
 *     // Entering the P adds another level.
 *     $processor->next_token();
 *     4 === $processor->get_current_depth();
 *
 *     // The P is closed while advancing, so the depth drops back again.
 *     $processor->next_token();
 *     3 === $processor->get_current_depth();
 *
 * @since 6.6.0
 *
 * @return int Number of levels of nesting at the current location in the document.
 */
public function get_current_depth(): int {
	$open_path = $this->breadcrumbs;

	return count( $open_path );
}

/**
 * Normalizes an HTML fragment by serializing it.
 *
 * This method assumes that the given HTML snippet is found in BODY context.
 * For normalizing full documents or fragments found in other contexts, create
 * a new processor using WP_HTML_Processor::create_fragment or
 * WP_HTML_Processor::create_full_parser and call WP_HTML_Processor::serialize
 * on the created instances.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @param string $html Input HTML to normalize.
 *
 * @return string|null Normalized output, or `null` if unable to normalize.
 */
public static function normalize( string $html ): ?string {
	$processor = static::create_fragment( $html );

	/*
	 * The factory is resolved through late static binding and is documented
	 * as nullable, so a missing processor is reported as the `null` this
	 * method already promises instead of a fatal call on a non-object.
	 */
	if ( null === $processor ) {
		return null;
	}

	return $processor->serialize();
}

/**
 * Produces normalized HTML for the document held by this processor.
 *
 * Unlike WP_HTML_Processor::normalize, this works on an existing HTML Processor.
 * That processor _must_ not have started scanning yet: it has to be in the initial
 * ready state, and it will have reached the completed state by the time
 * serialization finishes.
 *
 * Normalization may change the input in several ways:
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize();
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized HTML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize(): ?string {
	// Only a processor that has not yet begun scanning can be serialized.
	$is_untouched = WP_HTML_Tag_Processor::STATE_READY === $this->parser_state;
	if ( ! $is_untouched ) {
		wp_trigger_error(
			__METHOD__,
			'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
			E_USER_WARNING
		);
		return null;
	}

	// Gather the serialization of each token in document order.
	$pieces = array();
	while ( $this->next_token() ) {
		$pieces[] = $this->serialize_token();
	}

	// Without a recorded error the collected pieces form the result.
	if ( null === $this->get_last_error() ) {
		return implode( '', $pieces );
	}

	wp_trigger_error(
		__METHOD__,
		"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
		E_USER_WARNING
	);
	return null;
}

/**
 * Serializes the currently-matched token.
 *
 * This method produces a fully-normative HTML string for the currently-matched token,
 * if able. If not matched at any token or if the token doesn't correspond to any HTML
 * it will return an empty string (for example, presumptuous end tags are ignored).
 *
 * DOCTYPE declarations are rebuilt from their parsed name and public/system
 * identifiers so that the serialized document keeps the same declaration.
 *
 * @see static::serialize()
 *
 * @since 6.7.0
 *
 * @return string Serialization of token, or empty string if no serialization exists.
 */
protected function serialize_token(): string {
	$html       = '';
	$token_type = $this->get_token_type();

	switch ( $token_type ) {
		case '#text':
			$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
			break;

		// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
		case '#presumptuous-tag':
			break;

		case '#funky-comment':
		case '#comment':
			$html .= "<!--{$this->get_full_comment_text()}-->";
			break;

		case '#cdata-section':
			$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
			break;

		/*
		 * The Tag Processor documents `#doctype` as the token type of a DOCTYPE
		 * declaration, while `html` is only its token name; matching on `html`
		 * alone left DOCTYPE tokens without any serialization. The `html` label
		 * is retained so that nothing previously matched is lost.
		 *
		 * NOTE(review): confirm against WP_HTML_Tag_Processor::get_token_type().
		 */
		case '#doctype':
		case 'html':
			$doctype = $this->get_doctype_info();
			if ( null === $doctype ) {
				break;
			}

			$html .= '<!DOCTYPE';

			if ( null !== $doctype->name && '' !== $doctype->name ) {
				$html .= " {$doctype->name}";
			}

			// An identifier containing a double quote has to be wrapped in single quotes.
			if ( null !== $doctype->public_identifier ) {
				$quote = false !== strpos( $doctype->public_identifier, '"' ) ? "'" : '"';
				$html .= " PUBLIC {$quote}{$doctype->public_identifier}{$quote}";
			}

			if ( null !== $doctype->system_identifier ) {
				// The SYSTEM keyword only appears when no PUBLIC identifier precedes it.
				if ( null === $doctype->public_identifier ) {
					$html .= ' SYSTEM';
				}
				$quote = false !== strpos( $doctype->system_identifier, '"' ) ? "'" : '"';
				$html .= " {$quote}{$doctype->system_identifier}{$quote}";
			}

			$html .= '>';
			break;
	}

	// Everything below only applies to tags.
	if ( '#tag' !== $token_type ) {
		return $html;
	}

	$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
	$in_html        = 'html' === $this->get_namespace();
	$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

	if ( $this->is_tag_closer() ) {
		$html .= "</{$qualified_name}>";
		return $html;
	}

	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( ! isset( $attribute_names ) ) {
		$html .= "<{$qualified_name}>";
		return $html;
	}

	$html .= "<{$qualified_name}";
	foreach ( $attribute_names as $attribute_name ) {
		$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
		$value = $this->get_attribute( $attribute_name );

		// Only string values are written out; other values leave the bare attribute name.
		if ( is_string( $value ) ) {
			// The charset is given explicitly, matching the other escaping calls in this method.
			$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
		}

		// Replace any null bytes in the markup accumulated so far with U+FFFD.
		$html = str_replace( "\x00", "\u{FFFD}", $html );
	}

	// The self-closing flag is only written for elements outside the HTML namespace.
	if ( ! $in_html && $this->has_self_closing_flag() ) {
		$html .= ' /';
	}

	$html .= '>';

	// Flush out self-contained elements.
	if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
		$text = $this->get_modifiable_text();

		switch ( $tag_name ) {
			// The contents of these elements are dropped.
			case 'IFRAME':
			case 'NOEMBED':
			case 'NOFRAMES':
				$text = '';
				break;

			// The contents of these elements are emitted as-is.
			case 'SCRIPT':
			case 'STYLE':
				break;

			// The remaining elements (TEXTAREA, TITLE, XMP) have their contents escaped.
			default:
				$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
		}

		$html .= "{$text}</{$qualified_name}>";
	}

	return $html;
}

/**
 * Handles the current token according to the 'initial' insertion mode.
 *
 * This is an internal helper carrying out the 'initial' insertion mode
 * rules on behalf of the generalized WP_HTML_Processor::step() function.
 *
 * @since 6.7.0
 *
 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
 *
 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 * @see WP_HTML_Processor::step
 *
 * @return bool Whether an element was found.
 */
private function step_initial(): bool {
	$token_name = $this->get_token_name();
	$is_tag     = '#tag' === $this->get_token_type();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens by name alone.
	$op = ( $is_tag ? ( parent::is_tag_closer() ? '-' : '+' ) : '' ) . $token_name;

	/*
	 * > A character token that is one of U+0009 CHARACTER TABULATION,
	 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
	 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
	 *
	 * Parse error: ignore the token.
	 *
	 * Any other text is handled by the "anything else" rules at the end.
	 */
	if ( '#text' === $op && parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
		return $this->step();
	}

	/*
	 * > A comment token
	 */
	if ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		$this->insert_html_element( $this->state->current_token );
		return true;
	}

	/*
	 * > A DOCTYPE token
	 */
	if ( 'html' === $op ) {
		$doctype_info = $this->get_doctype_info();
		$wants_quirks = null !== $doctype_info && 'quirks' === $doctype_info->indicated_compatability_mode;
		if ( $wants_quirks ) {
			$this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
		}

		/*
		 * > Then, switch the insertion mode to "before html".
		 */
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
		$this->insert_html_element( $this->state->current_token );
		return true;
	}

	/*
	 * > Anything else
	 *
	 * Quirks mode is set, the insertion mode becomes "before html",
	 * and the same token is processed again.
	 */
	$this->compat_mode           = WP_HTML_Tag_Processor::QUIRKS_MODE;
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}

/**
 * Handles the current token according to the 'before html' insertion mode.
 *
 * This is an internal helper carrying out the 'before html' insertion mode
 * rules on behalf of the generalized WP_HTML_Processor::step() function.
 *
 * @since 6.7.0
 *
 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
 *
 * @see https://html.spec.whatwg.org/#the-before-html-insertion-mode
 * @see WP_HTML_Processor::step
 *
 * @return bool Whether an element was found.
 */
private function step_before_html(): bool {
	$token_name = $this->get_token_name();
	$is_tag     = '#tag' === $this->get_token_type();
	$is_closer  = parent::is_tag_closer();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens by name alone.
	$op = ( $is_tag ? ( $is_closer ? '-' : '+' ) : '' ) . $token_name;

	/*
	 * > A DOCTYPE token
	 *
	 * Parse error: ignore the token.
	 */
	if ( 'html' === $op ) {
		return $this->step();
	}

	/*
	 * > A comment token
	 */
	if ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		$this->insert_html_element( $this->state->current_token );
		return true;
	}

	/*
	 * > A character token that is one of U+0009 CHARACTER TABULATION,
	 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
	 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
	 *
	 * Parse error: ignore the token.
	 */
	$is_text = '#text' === $op;
	if ( $is_text && parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
		return $this->step();
	}

	/*
	 * > A start tag whose tag name is "html"
	 */
	if ( '+HTML' === $op ) {
		$this->insert_html_element( $this->state->current_token );
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
		return true;
	}

	/*
	 * > An end tag whose tag name is one of: "head", "body", "html", "br"
	 * > Act as described in the "anything else" entry below.
	 *
	 * Closing BR tags are always reported by the Tag Processor as opening tags.
	 *
	 * Remaining text tokens likewise go straight to the "anything else" rules.
	 */
	$acts_as_anything_else = $is_text || in_array( $op, array( '-HEAD', '-BODY', '-HTML' ), true );

	/*
	 * > Any other end tag
	 *
	 * Parse error: ignore the token.
	 */
	if ( ! $acts_as_anything_else && $is_closer ) {
		return $this->step();
	}

	/*
	 * > Anything else.
	 *
	 * > Create an html element whose node document is the Document object.
	 * > Append it to the Document object. Put this element in the stack of open elements.
	 * > Switch the insertion mode to "before head", then reprocess the token.
	 */
	$this->insert_virtual_node( 'HTML' );
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}

/**
 * Parses next element in the 'before head' insertion mode.
 *
 * This internal function performs the 'before head' insertion mode
 * logic for the generalized WP_HTML_Processor::step() function.
 *
 * @since 6.7.0 Stub implementation.
 *
 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
 *
 * @see https://html.spec.whatwg.org/#the-before-head-insertion-mode
 * @see WP_HTML_Processor::step
 *
 * @return bool Whether an element was found.
 */
private function step_before_head(): bool {
	$token_name = $this->get_token_name();
	$token_type = $this->get_token_type();
	$is_closer  = parent::is_tag_closer();
	$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$token_name}";

	switch ( $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION,
		 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
		 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 *
		 * Parse error: ignore the token.
		 */
		case '#text':
			if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
				return $this->step();
			}
			goto before_head_anything_else;
			break;

		/*

Changelog

Version Description
6.4.0 Introduced.

© 2003–2024 WordPress Foundation
Licensed under the GNU GPLv2+ License.
https://developer.wordpress.org/reference/classes/wp_html_processor/step_in_body