474 lines
14 KiB
PHP
Executable File
474 lines
14 KiB
PHP
Executable File
<?php
|
|
/**
 * Html2Pdf Library
 *
 * HTML => PDF converter
 * distributed under the OSL-3.0 License
 *
 * @package   Html2pdf
 * @author    Laurent MINGUET <webmaster@html2pdf.fr>
 * @copyright 2025 Laurent MINGUET
 */
|
|
namespace Spipu\Html2Pdf\Parsing;
|
|
|
|
use Spipu\Html2Pdf\Exception\HtmlParsingException;
|
|
|
|
/**
|
|
* Class Html
|
|
*/
|
|
class Html
|
|
{
|
|
/**
 * Text substituted for every tab character found in preformatted content.
 *
 * NOTE(review): whitespace in this literal may have been collapsed when the
 * listing was captured - confirm the intended width against the repository.
 */
const HTML_TAB = ' ';

/**
 * Parser used to analyze the raw data of tag tokens.
 *
 * @var TagParser
 */
protected $tagParser;

/**
 * Parser used to prepare the text of text tokens.
 *
 * @var TextParser
 */
protected $textParser;

/**
 * Flag telling whether the text currently being parsed is preformatted.
 *
 * @var boolean
 */
protected $tagPreIn = false;

/**
 * Result of the last parsing: the ordered list of actions.
 *
 * @var Node[]
 */
public $code = array();
|
|
|
|
/**
 * Build the HTML parser around the given text parser.
 *
 * The tag parser is created from the same TextParser instance, and the list
 * of parsed actions starts out empty.
 *
 * @param TextParser $textParser parser used for the text content
 */
public function __construct(TextParser $textParser)
{
    $this->code       = array();
    $this->textParser = $textParser;
    $this->tagParser  = new TagParser($textParser);
}
|
|
|
|
/**
 * Return a copy of the parsed actions in which every entry is a clone.
 *
 * The keys of the internal list are kept as they are; only the objects are
 * duplicated, so the caller can alter them without touching $this->code.
 *
 * @return Node[]
 */
public function getCloneCodes()
{
    // array_map keeps the keys when it receives exactly one array
    return array_map(
        function ($node) {
            return clone $node;
        },
        $this->code
    );
}
|
|
|
|
/**
 * Parse the HTML tokens and store the resulting actions in $this->code.
 *
 * Steps, in order:
 *  - every 'code' token is converted with getTagAction(), every 'txt' token
 *    with getTextAction(); tokens of any other type are ignored
 *  - the text of each 'write' action is trimmed depending on the actions
 *    that surround it, and 'write' actions left with an empty text are removed
 *  - an exception is thrown if some tags are still opened at the end
 *  - thead and tfoot are checked to contain at least one tr
 *  - the remaining actions are re-indexed and saved
 *
 * @param Token[] $tokens A list of tokens to parse
 *
 * @throws HtmlParsingException
 */
public function parse($tokens)
{
    // names of the tags that are currently opened (filled by getTagAction)
    $parents = array();

    // flag : are we in a <pre> Tag ?
    $this->tagPreIn = false;

    /**
     * all the actions to do
     * @var Node[] $actions
     */
    $actions = array();

    // get the actions from the html tokens
    foreach ($tokens as $token) {
        if ($token->getType() === 'code') {
            $actions = array_merge($actions, $this->getTagAction($token, $parents));
        } elseif ($token->getType() === 'txt') {
            $actions = array_merge($actions, $this->getTextAction($token));
        }
    }

    // tags around which the adjacent text is trimmed
    $tagsToClean = array(
        'page', 'page_header', 'page_footer', 'form',
        'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
        'div', 'hr', 'p', 'ul', 'ol', 'li',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'bookmark', 'fieldset', 'legend',
        'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
        'option'
    );

    // tags for which a trailing space of the previous text is moved to the next text
    // NOTE(review): 'cite' used to be listed twice here; the duplicate was dropped,
    // which changes nothing. Confirm whether another tag name was intended instead.
    $tagsToSpace = array(
        'span', 'font', 'label',
        'strong', 'b',
        'address', 'cite', 'em', 'i', 'samp',
        's',
        'ins', 'u',
        'big', 'small', 'sub', 'sup'
    );

    $nb = count($actions);
    for ($k = 0; $k < $nb; $k++) {
        // only the text actions have to be cleaned
        if ($actions[$k]->getName() !== 'write') {
            continue;
        }

        // if the tag before the text is a tag to clean => ltrim on the text.
        // The previous entry may have been removed by the unset() below
        // (empty text), so its existence must be checked before using it.
        if ($k > 0
            && isset($actions[$k - 1])
            && in_array($actions[$k - 1]->getName(), $tagsToClean, true)
        ) {
            $actions[$k]->setParam('txt', ltrim($actions[$k]->getParam('txt')));
        }

        // entries after $k are never removed before being visited, so $k + 1 is safe
        if ($k < $nb - 1) {
            $nextName = $actions[$k + 1]->getName();

            // if the tag after the text is a tag to clean => rtrim on the text
            if (in_array($nextName, $tagsToClean, true)) {
                $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
            }

            // if the tag after the text is a tag with space to move
            // => move the trailing space to the beginning of the next text
            if (in_array($nextName, $tagsToSpace, true)
                && substr($actions[$k]->getParam('txt'), -1) === ' '
            ) {
                $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
                for ($subK = $k + 2; $subK < $nb; $subK++) {
                    if ($actions[$subK]->getName() === 'write') {
                        $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                        break;
                    }
                }
            }
        }

        // if the text is empty => remove the action
        if (!strlen((string) $actions[$k]->getParam('txt'))) {
            unset($actions[$k]);
        }
    }

    // if we are not on the level 0 => HTML validator ERROR
    if (count($parents)) {
        if (count($parents) > 1) {
            $errorMsg = 'The following tags have not been closed:';
        } else {
            $errorMsg = 'The following tag has not been closed:';
        }

        $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
        $e->setInvalidTag($parents[0]);
        throw $e;
    }

    $this->verifyMustContain($actions, 'thead', 'tr');
    $this->verifyMustContain($actions, 'tfoot', 'tr');

    // save the actions to do, re-indexed because of the removed entries
    $this->code = array_values($actions);
}
|
|
|
|
/**
 * Check that every closed $mainTag contains at least one $mustTag.
 *
 * The actions are read in order. Each opening of $mainTag resets the
 * "found" state, any $mustTag met while a $mainTag is opened sets it, and
 * each closing of $mainTag fails if the state is still not set.
 *
 * @param Node[] $actions list of actions to check (only read here)
 * @param string $mainTag name of the containing tag
 * @param string $mustTag name of the tag that must be found inside
 *
 * @return bool always true when no exception is thrown
 * @throws HtmlParsingException
 */
protected function verifyMustContain(&$actions, $mainTag, $mustTag)
{
    $depth = 0;
    $seen  = false;

    foreach ($actions as $node) {
        $name   = $node->getName();
        $isMain = ($name == $mainTag);

        // opening of the main tag: one level deeper, nothing found yet
        if ($isMain && !$node->isClose()) {
            $depth++;
            $seen = false;
        }

        // the required tag only counts when it is inside a main tag
        if ($depth > 0 && $name == $mustTag) {
            $seen = true;
        }

        // everything below concerns only the closing of the main tag
        if (!$isMain || !$node->isClose()) {
            continue;
        }

        if (!$seen) {
            $exception = new HtmlParsingException(
                "The tag [$mainTag] must contain at least one tag [$mustTag]"
            );
            $exception->setInvalidTag($name);
            $exception->setHtmlLine($node->getLine());
            throw $exception;
        }

        $depth--;
    }

    return true;
}
|
|
|
|
/**
 * Convert a tag token into the list of actions it produces.
 *
 * The tag is analyzed by the tag parser and receives the line of the token.
 * For the tags that must be closed, the $parents stack is used to validate
 * the nesting: an opening tag is pushed, a closing tag must match the last
 * pushed name and pops it. An auto-closed tag produces two actions: the
 * opening one, and a clone without params flagged as closing.
 *
 * The "preformatted" flag is updated when a <pre> or <code> tag is opened or
 * closed. On closing, the flag stays raised while another <pre> or <code>
 * is still opened, so nested tags do not end the preformatted mode too early.
 *
 * TODO remove the reference on the $parents variable
 *
 * @param Token $token   the tag token to convert
 * @param array $parents names of the currently opened tags, updated in place
 *
 * @return array the actions for this tag (1 entry, or 2 for an auto-closed tag)
 * @throws HtmlParsingException if a closing tag has no matching opening tag
 */
protected function getTagAction(Token $token, &$parents)
{
    // tag that can be not closed
    $tagsNotClosed = array(
        'br', 'hr', 'img', 'col',
        'input', 'link', 'option',
        'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
    );

    // analyze the HTML code
    $node = $this->tagParser->analyzeTag($token->getData());

    // save the current position in the HTML code
    $node->setLine($token->getLine());

    $actions = array();

    // if the tag must be closed
    if (!in_array($node->getName(), $tagsNotClosed, true)) {
        if ($node->isClose()) {
            // HTML validation of the closing tag
            if (count($parents) < 1) {
                $e = new HtmlParsingException('Too many tag closures found for ['.$node->getName().']');
                $e->setInvalidTag($node->getName());
                $e->setHtmlLine($token->getLine());
                throw $e;
            }

            if (end($parents) !== $node->getName()) {
                $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$node->getName().']');
                $e->setInvalidTag($node->getName());
                $e->setHtmlLine($token->getLine());
                throw $e;
            }

            array_pop($parents);
        } elseif ($node->isAutoClose()) {
            // save the opened tag
            $actions[] = $node;

            // prepare the closed tag
            $node = clone $node;
            $node->setParams(array());
            $node->setClose(true);
        } else {
            // add a child for validation
            $parents[] = $node->getName();
        }

        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
        if (($node->getName() === 'pre' || $node->getName() === 'code') && !$node->isAutoClose()) {
            // $parents has already been updated above: when closing, it tells
            // whether an enclosing <pre> or <code> is still opened
            $this->tagPreIn = !$node->isClose()
                || in_array('pre', $parents, true)
                || in_array('code', $parents, true);
        }
    }

    // save the actions to convert
    $actions[] = $node;

    return $actions;
}
|
|
|
|
/**
 * Convert a text token into the list of actions it produces.
 *
 * Outside of a preformatted section, the whole text gives a single 'write'
 * action. Inside one, carriage returns are dropped and the text is split on
 * line feeds: each line gives a 'write' action, and a 'br' action is
 * inserted before every line except the first one.
 *
 * @param Token $token the text token to convert
 *
 * @return array the list of actions
 */
protected function getTextAction(Token $token)
{
    // regular text => only one action
    if (!$this->tagPreIn) {
        return array(
            new Node('write', array('txt' => $this->textParser->prepareTxt($token->getData())), false),
        );
    }

    // model of the line break used between the lines of a preformatted text
    $lineBreak = new Node('br', array('style' => array(), 'num' => 0), false);

    $rawLines = explode("\n", str_replace("\r", '', $token->getData()));

    $actions = array();
    foreach ($rawLines as $index => $line) {
        // a break line separates this line from the previous one
        if ($index > 0) {
            $actions[] = clone $lineBreak;
        }

        // transform the line
        $line = str_replace("\t", self::HTML_TAB, $line);
        // NOTE(review): as displayed, this replaces a space with a space; the
        // replacement may have been altered when the listing was captured - confirm.
        $line = str_replace(' ', ' ', $line);

        $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($line, false)), false);
    }

    return $actions;
}
|
|
|
|
/**
 * Extract the actions located between the tag at position $k and its
 * corresponding closing tag.
 *
 * - if there is no action at position $k, an empty array is returned
 * - if the action at position $k is a text, it is returned alone
 * - else the following actions are collected, nested tags of the same name
 *   included, until the depth comes back to zero or the list ends; the tag
 *   at position $k and the tag that brings the depth back to zero are not
 *   part of the result
 *
 * @param integer $k position of the starting action in $this->code
 * @return array actions
 */
public function getLevel($k)
{
    // nothing at this position
    if (!isset($this->code[$k])) {
        return array();
    }

    // the tag to detect
    $detect = $this->code[$k]->getName();

    // a text is a level by itself
    if ($detect === 'write') {
        return array($this->code[$k]);
    }

    $depth   = 0;
    $extract = array();

    // read the actions one by one, as long as there is something to read
    for ($i = $k; isset($this->code[$i]); $i++) {
        /** @var Node $node */
        $node = $this->code[$i];
        $name = $node->getName();

        if ($name !== 'write' && $name == $detect) {
            $wasAtRoot = ($depth == 0);
            $depth += ($node->isClose() ? -1 : 1);

            // back to the root level => end of the search, the tag is not kept
            if ($depth == 0) {
                break;
            }

            // tag met at the root level => it is not kept
            if ($wasAtRoot) {
                continue;
            }
        }

        $extract[] = $node;
    }

    return $extract;
}
|
|
|
|
/**
 * Prepare the HTML before it is parsed.
 *
 * If the HTML contains a body tag (in any letter case), it is first
 * converted with getHtmlFromRealPage(). Then the date placeholders
 * [[date_y]], [[date_m]], [[date_d]], [[date_h]], [[date_i]] and [[date_s]]
 * are replaced by the matching part of the current date and time.
 *
 * @param string $html
 *
 * @return string
 */
public function prepareHtml($html)
{
    // if it is a real html page, we have to convert it
    if (preg_match('/<body/isU', $html)) {
        $html = $this->getHtmlFromRealPage($html);
    }

    // a single timestamp is used for all the placeholders, so that the
    // replaced values always describe the same instant
    $now = time();
    $placeholders = array(
        '[[date_y]]' => date('Y', $now),
        '[[date_m]]' => date('m', $now),
        '[[date_d]]' => date('d', $now),
        '[[date_h]]' => date('H', $now),
        '[[date_i]]' => date('i', $now),
        '[[date_s]]' => date('s', $now),
    );

    return str_replace(array_keys($placeholders), array_values($placeholders), $html);
}
|
|
|
|
/**
 * Convert the HTML of a real page to a code adapted to Html2Pdf.
 *
 * The part located between the opening and the closing body tags becomes a
 * <page> tag, the attributes of the body tag being kept on it. The link tags
 * and the style tags found anywhere in the original HTML are added before
 * this <page> tag.
 *
 * The body tags are searched without taking the letter case into account.
 * If no opening body tag is found, the HTML is returned unchanged.
 *
 * @param string $html HTML code of a real page
 * @return string HTML adapted to Html2Pdf
 */
protected function getHtmlFromRealPage($html)
{
    // split on the body tag, whatever its letter case. If no body tag => end
    $parts = preg_split('/<body/i', $html);
    if (!is_array($parts) || !isset($parts[1])) {
        return $html;
    }

    // the html content is between body tag opening and closing
    $content = '<page'.$parts[1];
    $beforeClosing = preg_split('/<\/body/i', $content);
    if (is_array($beforeClosing)) {
        $content = $beforeClosing[0];
    }
    $content .= '</page>';

    // extract the link tags from the original html
    // and add them before the content
    // NOTE(review): each tag is prepended, so the tags end up in the reverse
    // of their original order (same for the style tags below) - confirm it is intended.
    preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match);
    foreach ($match[1] as $src) {
        $content = '<link '.$src.'/>'.$content;
    }

    // extract the css style tags from the original html
    // and add them before the content
    preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match);
    foreach ($match[0] as $src) {
        $content = $src.$content;
    }

    return $content;
}
|
|
}
|