FileMaster
Search
Toggle Dark Mode
Home
/
.
/
wp-content
/
plugins
/
woocommerce
/
packages
/
email-editor
/
src
/
Engine
/
Renderer
Edit File: class-html2text.php
<?php
/**
 * HTML to Text Converter class
 *
 * This file was extracted from the `soundasleep/html2text` package.
 * Copyright (c) 2019 Jevon Wright
 * MIT License
 *
 * @package Automattic\WooCommerce\EmailEditor
 */

declare( strict_types = 1 );
namespace Automattic\WooCommerce\EmailEditor\Engine\Renderer;

/**
 * Converts HTML into plain text format suitable for email display
 *
 * Features:
 * - Maintains links with href copied over
 * - Information in the <head> is lost
 * - Handles various HTML elements appropriately for text conversion
 */
class Html2Text {
	/**
	 * Default options for HTML to text conversion
	 *
	 * The keys returned here are also the complete list of options accepted by convert().
	 *
	 * @return array<string, bool|string> Default options array.
	 */
	public static function default_options(): array {
		return array(
			'ignore_errors' => false,
			'drop_links'    => false,
			'char_set'      => 'auto',
		);
	}

	/**
	 * Converts HTML into plain text format
	 *
	 * @param string                             $html    The input HTML.
	 * @param boolean|array<string, bool|string> $options Conversion options. A bare boolean is
	 *                                                    treated as the legacy `ignore_errors` flag.
	 * @return string The HTML converted to text.
	 * @throws Html2Text_Exception|\InvalidArgumentException If the HTML could not be loaded or invalid options are provided.
	 */
	public static function convert( string $html, $options = array() ): string {
		if ( false === $options || true === $options ) {
			// Using old style (< 1.0) of passing in options.
			$options = array( 'ignore_errors' => $options );
		}

		$defaults = static::default_options();
		$options  = array_merge( $defaults, $options );

		// Check all options are valid.
		foreach ( array_keys( $options ) as $key ) {
			if ( ! array_key_exists( $key, $defaults ) ) {
				// Log invalid option for debugging purposes without exposing in exception.
				// phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log -- Security: Logging sensitive data separately from user-facing exception messages.
				error_log( 'Html2Text: Invalid option provided: ' . htmlspecialchars( (string) $key, ENT_QUOTES, 'UTF-8' ) . '. Valid options are: ' . htmlspecialchars( implode( ',', array_keys( $defaults ) ), ENT_QUOTES, 'UTF-8' ) );

				// Throw generic error message to avoid exposing user input.
				throw new \InvalidArgumentException( 'Invalid option provided for html2text conversion.' );
			}
		}

		$is_office_document = self::is_office_document( $html );

		if ( $is_office_document ) {
			// Remove office namespace.
			$html = str_replace( array( '<o:p>', '</o:p>' ), '', $html );
		}

		$html = self::fix_newlines( $html );

		// Use mb_convert_encoding for legacy versions of php (below 8.1); newer versions
		// declare the character set through a document header in get_document() instead.
		if ( PHP_VERSION_ID < 80100 && mb_detect_encoding( $html, 'UTF-8', true ) ) {
			$converted = mb_convert_encoding( $html, 'HTML-ENTITIES', 'UTF-8' );
			$html      = false !== $converted ? $converted : $html;
		}

		// Ensure $html is always a string before passing to get_document.
		if ( ! is_string( $html ) ) {
			$html = (string) $html;
		}

		$doc = self::get_document( $html, $options );

		$output = self::iterate_over_node( $doc, null, false, $is_office_document, $options );

		// Process output for whitespace/newlines.
		return self::process_whitespace_newlines( $output );
	}

	/**
	 * Unify newlines
	 *
	 * Converts \r\n to \n, and \r to \n. This means that all newlines
	 * (Unix, Windows, Mac) all become \ns.
	 *
	 * @param string $text Text with any number of \r, \r\n and \n combinations.
	 * @return string The fixed text.
	 */
	public static function fix_newlines( string $text ): string {
		// Replace \r\n with \n first so the pair is not turned into two newlines below.
		$text = str_replace( "\r\n", "\n", $text );

		// Convert any remaining lone \r to \n.
		$text = str_replace( "\r", "\n", $text );

		return $text;
	}

	/**
	 * Get non-breaking space character codes
	 *
	 * Only the UTF-8 byte sequence of U+00A0 is listed. PHP does not interpret "\u00a0"
	 * as an escape (only the "\u{00a0}" form is), so such an entry would match the literal
	 * six characters `\u00a0` and corrupt visible text that happens to contain them.
	 *
	 * @return array<string> Array of nbsp codes.
	 */
	public static function nbsp_codes(): array {
		return array(
			"\xc2\xa0",
		);
	}

	/**
	 * Get zero-width non-joiner character codes
	 *
	 * Only the UTF-8 byte sequence of U+200C is listed; see nbsp_codes() in this class
	 * for why a "\u200c" string literal must not be used.
	 *
	 * @return array<string> Array of zwnj codes.
	 */
	public static function zwnj_codes(): array {
		return array(
			"\xe2\x80\x8c",
		);
	}

	/**
	 * Remove leading or trailing spaces and excess empty lines from provided multiline text
	 *
	 * Each preg_replace() result is checked for null so that a regex failure leaves the
	 * text unchanged instead of discarding it.
	 *
	 * @param string $text Multiline text with any number of leading or trailing spaces or excess lines.
	 * @return string The fixed text.
	 */
	public static function process_whitespace_newlines( string $text ): string {
		// Remove excess spaces around tabs.
		$result = preg_replace( '/ *\t */im', "\t", $text );
		$text   = null !== $result ? $result : $text;

		// Remove leading whitespace.
		$text = ltrim( $text );

		// Remove leading spaces on each line.
		$result = preg_replace( "/\n[ \t]*/im", "\n", $text );
		$text   = null !== $result ? $result : $text;

		// Convert non-breaking spaces to regular spaces to prevent output issues,
		// do it here so they do NOT get removed with other leading spaces, as they
		// are sometimes used for indentation.
		$text = self::render_text( $text );

		// Remove trailing whitespace.
		$text = rtrim( $text );

		// Remove trailing spaces on each line.
		$result = preg_replace( "/[ \t]*\n/im", "\n", $text );
		$text   = null !== $result ? $result : $text;

		// Unarmor pre blocks: newlines inside <pre> were stored as \r by iterate_over_node().
		$text = self::fix_newlines( $text );

		// Remove unnecessary empty lines.
		$result = preg_replace( "/\n\n\n*/im", "\n\n", $text );

		return null !== $result ? $result : $text;
	}

	/**
	 * Can we guess that this HTML is generated by Microsoft Office?
	 *
	 * @param string $html The HTML content.
	 * @return bool True if this appears to be an Office document.
	 */
	public static function is_office_document( string $html ): bool {
		return false !== strpos( $html, 'urn:schemas-microsoft-com:office' );
	}

	/**
	 * Check if text is whitespace
	 *
	 * Non-breaking spaces and zero-width non-joiners count as whitespace here because
	 * the text is passed through render_text() first.
	 *
	 * @param string $text The text to check.
	 * @return bool True if the text is whitespace.
	 */
	public static function is_whitespace( string $text ): bool {
		return '' === trim( self::render_text( $text ), "\n\r\t " );
	}

	/**
	 * Parse HTML into a DOMDocument
	 *
	 * @param string                     $html    The input HTML.
	 * @param array<string, bool|string> $options Parsing options.
	 * @return \DOMDocument The parsed document tree.
	 * @throws Html2Text_Exception If the HTML could not be loaded.
	 */
	private static function get_document( string $html, array $options ): \DOMDocument {
		$doc = new \DOMDocument();

		$html = trim( $html );

		// Compare against '' explicitly: a truthiness check would also discard the input "0".
		if ( '' === $html ) {
			// DOMDocument doesn't support empty value and throws an error.
			// Return empty document instead.
			return $doc;
		}

		if ( '<' !== $html[0] ) {
			// If HTML does not begin with a tag, we put a body tag around it.
			// If we do not do this, PHP will insert a paragraph tag around
			// the first block of text for some reason which can mess up
			// the newlines. See pre.html test for an example.
			$html = '<body>' . $html . '</body>';
		}

		$header = '';

		// Use char sets for modern versions of php.
		if ( PHP_VERSION_ID >= 80100 ) {
			// Use specified char_set, or auto detect if not set.
			$char_set = ! empty( $options['char_set'] ) && is_string( $options['char_set'] ) ? $options['char_set'] : 'auto';

			if ( 'auto' === $char_set ) {
				$detected = mb_detect_encoding( $html );
				$char_set = false !== $detected ? $detected : 'UTF-8';
			} elseif ( false !== strpos( $char_set, ',' ) ) {
				// A comma-separated value is a list of candidate encodings to detect from.
				$encoding_list = array_map( 'trim', explode( ',', $char_set ) );
				$encoding_list = array_filter(
					$encoding_list,
					function ( $encoding ) {
						return ! empty( $encoding );
					}
				);
				// Ensure we have a proper list with consecutive integer keys.
				$encoding_list = array_values( $encoding_list );

				if ( ! empty( $encoding_list ) ) {
					// Pass the candidates directly instead of calling mb_detect_order(), which
					// would change the process-wide detection order for all later callers.
					$detected = mb_detect_encoding( $html, $encoding_list );
					$char_set = false !== $detected ? $detected : 'UTF-8';
				} else {
					// Only separators were given; do not emit them as an encoding name.
					$char_set = 'UTF-8';
				}
			}

			// Turn off error detection for Windows-1252 legacy html.
			if ( false !== strpos( $char_set, '1252' ) ) {
				$options['ignore_errors'] = true;
			}

			$header = '<?xml version="1.0" encoding="' . $char_set . '">';
		}

		if ( ! empty( $options['ignore_errors'] ) ) {
			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$doc->strictErrorChecking = false;
			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$doc->recover = true;
			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$doc->xmlStandalone = true;

			// Collect libxml errors internally while loading, then restore the caller's setting.
			$old_internal_errors = libxml_use_internal_errors( true );
			$load_result         = $doc->loadHTML( $header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE );
			libxml_use_internal_errors( $old_internal_errors );
		} else {
			$load_result = $doc->loadHTML( $header . $html );
		}

		if ( ! $load_result ) {
			// Log truncated HTML content for debugging purposes (limit to 500 chars to prevent log bloat).
			$html_preview = strlen( $html ) > 500 ? substr( $html, 0, 500 ) . '...[truncated]' : $html;
			// phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log -- Security: Logging sensitive data separately from user-facing exception messages.
			error_log( 'Html2Text: Failed to load HTML content: ' . htmlspecialchars( $html_preview, ENT_QUOTES, 'UTF-8' ) );

			// Throw a generic error message to avoid exposing sensitive data.
			throw new Html2Text_Exception( 'Could not load HTML - the content may be malformed.' );
		}

		return $doc;
	}

	/**
	 * Replace any special characters with simple text versions
	 *
	 * This prevents output issues:
	 * - Convert non-breaking spaces to regular spaces; and
	 * - Convert zero-width non-joiners to '' (nothing).
	 *
	 * This is to match our goal of rendering documents as they would be rendered
	 * by a browser.
	 *
	 * @param string $text The text to process.
	 * @return string The processed text.
	 */
	private static function render_text( string $text ): string {
		$text = str_replace( self::nbsp_codes(), ' ', $text );
		$text = str_replace( self::zwnj_codes(), '', $text );

		return $text;
	}

	/**
	 * Get the next child name
	 *
	 * Walks the following siblings of the node, skipping whitespace-only text nodes and
	 * any node that is neither an element nor text, and returns the lower-cased node name
	 * of the first element or non-whitespace text node found.
	 *
	 * @param \DOMNode|null $node The node to check.
	 * @return string|null The next child name, or null when there is none.
	 */
	private static function next_child_name( ?\DOMNode $node ): ?string {
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		if ( null === $node || null === $node->nextSibling ) {
			return null;
		}

		// Get the next child.
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$next_node = $node->nextSibling;

		while ( null !== $next_node ) {
			if ( $next_node instanceof \DOMText ) {
				// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
				if ( ! self::is_whitespace( $next_node->wholeText ) ) {
					break;
				}
			}

			if ( $next_node instanceof \DOMElement ) {
				break;
			}

			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$next_node = $next_node->nextSibling;
		}

		$next_name = null;

		if ( $next_node instanceof \DOMElement || $next_node instanceof \DOMText ) {
			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$next_name = strtolower( $next_node->nodeName );
		}

		return $next_name;
	}

	/**
	 * Iterate over a DOM node and convert to text
	 *
	 * Child nodes are removed from the tree as they are converted, so the passed
	 * document is consumed by this call.
	 *
	 * @param \DOMNode                   $node               The DOM node.
	 * @param string|null                $prev_name          Previous node name.
	 * @param bool                       $in_pre             Whether we're in a pre block.
	 * @param bool                       $is_office_document Whether this is an Office document.
	 * @param array<string, bool|string> $options            Conversion options.
	 * @return string The converted text.
	 */
	private static function iterate_over_node( \DOMNode $node, ?string $prev_name, bool $in_pre, bool $is_office_document, array $options ): string {
		if ( $node instanceof \DOMText ) {
			// Replace whitespace characters with a space (equivalent to \s).
			if ( $in_pre ) {
				// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
				$text = "\n" . trim( self::render_text( $node->wholeText ), "\n\r\t " ) . "\n";

				// Remove trailing whitespace only.
				$result = preg_replace( "/[ \t]*\n/im", "\n", $text );
				$text   = null !== $result ? $result : $text;

				// Armor newlines with \r so later whitespace clean-up does not collapse them;
				// process_whitespace_newlines() turns them back into \n.
				return str_replace( "\n", "\r", $text );
			}

			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$text   = self::render_text( $node->wholeText );
			$result = preg_replace( "/[\\t\\n\\f\\r ]+/im", ' ', $text );
			$text   = null !== $result ? $result : $text;

			if ( ! self::is_whitespace( $text ) && ( 'p' === $prev_name || 'div' === $prev_name ) ) {
				return "\n" . $text;
			}

			return $text;
		}

		if ( $node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction ) {
			// Ignore.
			return '';
		}

		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$name      = strtolower( $node->nodeName );
		$next_name = self::next_child_name( $node );

		// Start whitespace.
		switch ( $name ) {
			case 'hr':
				$prefix = '';
				if ( null !== $prev_name ) {
					$prefix = "\n";
				}
				return $prefix . "---------------------------------------------------------------\n";

			case 'style':
			case 'head':
			case 'title':
			case 'meta':
			case 'script':
				// Ignore these tags.
				return '';

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'ol':
			case 'ul':
			case 'pre':
				// Add two newlines.
				$output = "\n\n";
				break;

			case 'td':
			case 'th':
				// Add tab char to separate table fields.
				$output = "\t";
				break;

			case 'p':
				// Microsoft exchange emails often include HTML which, when passed through
				// html2text, results in lots of double line returns everywhere.
				//
				// To fix this, for any p element with a className of `MsoNormal` (the standard
				// classname in any Microsoft export or outlook for a paragraph that behaves
				// like a line return) we skip the first line returns and set the name to br.
				if ( $is_office_document && $node instanceof \DOMElement && 'MsoNormal' === $node->getAttribute( 'class' ) ) {
					$output = '';
					$name   = 'br';
					break;
				}

				// Add two lines.
				$output = "\n\n";
				break;

			case 'tr':
				// Add one line.
				$output = "\n";
				break;

			case 'div':
				$output = '';
				if ( null !== $prev_name ) {
					// Add one line.
					$output .= "\n";
				}
				break;

			case 'li':
				$output = '- ';
				break;

			default:
				// Print out contents of unknown tags.
				$output = '';
				break;
		}

		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		if ( $node->childNodes->length > 0 ) {
			// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
			$n                      = $node->childNodes->item( 0 );
			$previous_sibling_names = array();
			$previous_sibling_name  = null;

			$parts               = array();
			$trailing_whitespace = 0;

			while ( null !== $n ) {
				$text = self::iterate_over_node( $n, $previous_sibling_name, $in_pre || 'pre' === $name, $is_office_document, $options );

				// Pass current node name to next child, as previousSibling does not appear to get populated.
				if ( $n instanceof \DOMDocumentType
					|| $n instanceof \DOMProcessingInstruction
					|| ( $n instanceof \DOMText && self::is_whitespace( $text ) ) ) {
					// Keep current previousSiblingName, these are invisible.
					++$trailing_whitespace;
				} else {
					// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
					$previous_sibling_name    = strtolower( $n->nodeName );
					$previous_sibling_names[] = $previous_sibling_name;
					$trailing_whitespace      = 0;
				}

				// The processed child is detached, so the next child is always at index 0.
				$node->removeChild( $n );
				// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
				$n = $node->childNodes->item( 0 );

				$parts[] = $text;
			}

			// Remove trailing whitespace, important for the br check below.
			while ( $trailing_whitespace-- > 0 ) {
				array_pop( $parts );
			}

			// Suppress last br tag inside a node list if follows text.
			$last_name = array_pop( $previous_sibling_names );
			if ( 'br' === $last_name ) {
				$last_name = array_pop( $previous_sibling_names );
				if ( '#text' === $last_name ) {
					array_pop( $parts );
				}
			}

			$output .= implode( '', $parts );
		}

		// End whitespace.
		switch ( $name ) {
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'pre':
			case 'p':
				// Add two lines.
				$output .= "\n\n";
				break;

			case 'br':
				// Add one line.
				$output .= "\n";
				break;

			case 'div':
				break;

			case 'a':
				// Links are returned in [text](link) format.
				// Attribute values are compared against '' rather than tested for truthiness,
				// so that values such as "0" are treated as present.
				$href  = $node instanceof \DOMElement ? $node->getAttribute( 'href' ) : '';
				$title = $node instanceof \DOMElement ? $node->getAttribute( 'title' ) : '';

				$output = trim( $output );

				// Remove double [[ ]] s from linking images.
				if ( '[' === substr( $output, 0, 1 ) && ']' === substr( $output, -1 ) ) {
					$output = substr( $output, 1, strlen( $output ) - 2 );

					// For linking images, the title of the <a> overrides the title of the <img>.
					if ( '' !== $title ) {
						$output = $title;
					}
				}

				// If there is no link text, but a title attr.
				if ( '' === $output && '' !== $title ) {
					$output = $title;
				}

				if ( '' === $href ) {
					// It doesn't link anywhere: named anchors are bracketed unless links are dropped.
					if ( $node instanceof \DOMElement && '' !== $node->getAttribute( 'name' ) && ! $options['drop_links'] ) {
						$output = "[$output]";
					}
				} elseif ( $href === $output || "mailto:$output" === $href || "http://$output" === $href || "https://$output" === $href ) {
					// Link to the same address: just use link.
					$output = $output;
				} elseif ( '' !== $output ) {
					// Replace it, unless links are dropped and only the text is kept.
					if ( ! $options['drop_links'] ) {
						$output = "[$output]($href)";
					}
				} else {
					// Empty string.
					$output = $href;
				}

				// Does the next node require additional whitespace?
				switch ( $next_name ) {
					case 'h1':
					case 'h2':
					case 'h3':
					case 'h4':
					case 'h5':
					case 'h6':
						$output .= "\n";
						break;
				}
				break;

			case 'img':
				// Prefer the title, fall back to the alt text, otherwise emit nothing.
				if ( $node instanceof \DOMElement && '' !== $node->getAttribute( 'title' ) ) {
					$output = '[' . $node->getAttribute( 'title' ) . ']';
				} elseif ( $node instanceof \DOMElement && '' !== $node->getAttribute( 'alt' ) ) {
					$output = '[' . $node->getAttribute( 'alt' ) . ']';
				} else {
					$output = '';
				}
				break;

			case 'li':
				$output .= "\n";
				break;

			case 'blockquote':
				// Process quoted text for whitespace/newlines.
				$output = self::process_whitespace_newlines( $output );

				// Add leading newline.
				$output = "\n" . $output;

				// Prepend '> ' at the beginning of all lines.
				$result = preg_replace( "/\n/im", "\n> ", $output );
				$output = null !== $result ? $result : $output;

				// Replace leading '> >' with '>>'.
				$result = preg_replace( "/\n> >/im", "\n>>", $output );
				$output = null !== $result ? $result : $output;

				// Add another leading newline and trailing newlines.
				$output = "\n" . $output . "\n\n";
				break;

			default:
				// Do nothing.
		}

		return $output;
	}
}
Save
Back