newdesigngestionnaire/vendor/spipu/html2pdf/src/Parsing/HtmlLexer.php

82 lines
2.3 KiB
PHP
Executable File

<?php
/**
* Html2Pdf Library - parsing Html class
*
* HTML => PDF converter
* distributed under the OSL-3.0 License
*
* @package Html2pdf
* @author Laurent MINGUET <webmaster@html2pdf.fr>
* @copyright 2025 Laurent MINGUET
*/
namespace Spipu\Html2Pdf\Parsing;
/**
* Class HtmlLexer
*/
class HtmlLexer
{
/**
* Tokenize the HTML code
*
* @param string $html HTML code to tokenize
*
* @return Token[]
*/
public function tokenize($html)
{
// initialise the array
$tokens = array();
// regexp to separate the tags from the texts
$reg = '/(<\/?\w[^<>]*>)|([^<]+|<)/is';
$commentRegex = '/(<!--.*-->)/isU';
// last match found
$str = '';
$offset = 0;
$line = 1;
$length = strlen($html);
// As it finds a match
while ($offset < $length) {
if (strpos($html, '<!--', $offset) === $offset
&& preg_match($commentRegex, $html, $match, PREG_OFFSET_CAPTURE, $offset)
) {
$line += substr_count($match[1][0], "\n");
$offset = $match[0][1] + strlen($match[0][0]);
continue;
}
preg_match($reg, $html, $parse, PREG_OFFSET_CAPTURE, $offset);
// if it is a tag
if ($parse[1][0]) {
// save the previous text if it exists
if ($str !== '') {
$tokens[] = new Token('txt', $str);
}
// save the tag, with the offset
$tokens[] = new Token('code', trim($parse[1][0]), $line);
$line += substr_count($parse[1][0], "\n");
// init the current text
$str = '';
} else { // else (if it is a text)
// add the new text to the current text
$str .= $parse[2][0];
$line += substr_count($parse[2][0], "\n");
}
// Update offset to the end of the match
$offset = $parse[0][1] + strlen($parse[0][0]);
unset($parse);
}
// if a text is present in the end, we save it
if ($str !== '') {
$tokens[] = new Token('txt', $str);
}
return $tokens;
}
}