Skip to content

Enhancements to addHTML parser #1902

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 195 additions & 16 deletions src/PhpWord/Shared/Html.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ public static function addHtml($element, $html, $fullHTML = false, $preserveWhit
// Preprocess: remove all line ends, decode HTML entity,
// fix ampersand and angle brackets and add body tag for HTML fragments
$html = str_replace(array("\n", "\r"), '', $html);
$html = str_replace(array('<', '>', '&'), array('_lt_', '_gt_', '_amp_'), $html);
$html = str_replace(array('<', '>', '&', '"'), array('_lt_', '_gt_', '_amp_', '_quot_'), $html);
$html = html_entity_decode($html, ENT_QUOTES, 'UTF-8');
$html = str_replace('&', '&', $html);
$html = str_replace(array('_lt_', '_gt_', '_amp_'), array('<', '>', '&'), $html);
$html = str_replace(array('_lt_', '_gt_', '_amp_', '_quot_'), array('<', '>', '&', '"'), $html);

if (false === $fullHTML) {
$html = '<body>' . $html . '</body>';
Expand Down Expand Up @@ -96,15 +96,43 @@ protected static function parseInlineStyle($node, $styles = array())
$attributes = $node->attributes; // get all the attributes(eg: id, class)

foreach ($attributes as $attribute) {
switch ($attribute->name) {
$val = $attribute->value;
switch (strtolower($attribute->name)) {
case 'style':
$styles = self::parseStyle($attribute, $styles);
break;
case 'align':
$styles['alignment'] = self::mapAlign($attribute->value);
$styles['alignment'] = self::mapAlign(trim($val));
break;
case 'lang':
$styles['lang'] = $attribute->value;
$styles['lang'] = $val;
break;
case 'width':
// tables, cells
if (false !== strpos($val, '%')) {
// e.g. <table width="100%"> or <td width="50%">
$styles['width'] = intval($val) * 50;
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::PERCENT;
} else {
// e.g. <table width="250"> where "250" = 250px (always pixels)
$styles['width'] = Converter::pixelToTwip($val);
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::TWIP;
}
break;
case 'cellspacing':
// tables e.g. <table cellspacing="2">, where "2" = 2px (always pixels)
$val = intval($val).'px';
$styles['cellSpacing'] = Converter::cssToTwip($val);
break;
case 'bgcolor':
// tables, rows, cells e.g. <tr bgColor="#FF0000">
$styles['bgColor'] = trim($val, '# ');
break;
case 'valign':
// cells e.g. <td valign="middle">
if (preg_match('#(?:top|bottom|middle|baseline)#i', $val, $matches)) {
$styles['valign'] = self::mapAlignVertical($matches[0]);
}
break;
}
}
Expand Down Expand Up @@ -161,6 +189,7 @@ protected static function parseNode($node, $element, $styles = array(), $data =
'img' => array('Image', $node, $element, $styles, null, null, null),
'br' => array('LineBreak', null, $element, $styles, null, null, null),
'a' => array('Link', $node, $element, $styles, null, null, null),
'hr' => array('HorizRule', $node, $element, $styles, null, null, null),
);

$newElement = null;
Expand Down Expand Up @@ -361,7 +390,11 @@ protected static function parseCell($node, $element, &$styles)
if (!empty($colspan)) {
$cellStyles['gridSpan'] = $colspan - 0;
}
$cell = $element->addCell(null, $cellStyles);

// set cell width to control column widths
$width = isset($cellStyles['width']) ? $cellStyles['width'] : null;
unset($cellStyles['width']); // would not apply
$cell = $element->addCell($width, $cellStyles);

if (self::shouldAddTextRun($node)) {
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
Expand Down Expand Up @@ -420,7 +453,32 @@ protected static function parseList($node, $element, &$styles, &$data)
} else {
$data['listdepth'] = 0;
$styles['list'] = 'listStyle_' . self::$listIndex++;
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
$style = $element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));

// extract attributes start & type e.g. <ol type="A" start="3">
$start = 0;
$type = '';
foreach ($node->attributes as $attribute) {
switch ($attribute->name) {
case 'start':
$start = (int) $attribute->value;
break;
case 'type':
$type = $attribute->value;
break;
}
}

$levels = $style->getLevels();
/** @var \PhpOffice\PhpWord\Style\NumberingLevel */
$level = $levels[0];
if ($start > 0) {
$level->setStart($start);
}
$type = $type ? self::mapListType($type) : null;
if ($type) {
$level->setFormat($type);
}
}
if ($node->parentNode->nodeName === 'li') {
return $element->getParent();
Expand Down Expand Up @@ -502,7 +560,8 @@ protected static function parseStyle($attribute, $styles)
foreach ($properties as $property) {
list($cKey, $cValue) = array_pad(explode(':', $property, 2), 2, null);
$cValue = trim($cValue);
switch (trim($cKey)) {
$cKey = strtolower(trim($cKey));
switch ($cKey) {
case 'text-decoration':
switch ($cValue) {
case 'underline':
Expand Down Expand Up @@ -575,11 +634,18 @@ protected static function parseStyle($attribute, $styles)
}
$styles['italic'] = $tValue;
break;
case 'margin':
$cValue = Converter::cssToTwip($cValue);
$styles['spaceBefore'] = $cValue;
$styles['spaceAfter'] = $cValue;
break;
case 'margin-top':
$styles['spaceBefore'] = Converter::cssToPoint($cValue);
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
$styles['spaceBefore'] = Converter::cssToTwip($cValue);
break;
case 'margin-bottom':
$styles['spaceAfter'] = Converter::cssToPoint($cValue);
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
$styles['spaceAfter'] = Converter::cssToTwip($cValue);
break;
case 'border-color':
self::mapBorderColor($styles, $cValue);
Expand All @@ -603,10 +669,37 @@ protected static function parseStyle($attribute, $styles)
}
break;
case 'border':
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+)\s+([a-z]+)/', $cValue, $matches)) {
$styles['borderSize'] = Converter::cssToPoint($matches[1]);
$styles['borderColor'] = trim($matches[2], '#');
$styles['borderStyle'] = self::mapBorderStyle($matches[3]);
case 'border-top':
case 'border-bottom':
case 'border-right':
case 'border-left':
// must have exact order [width color style], e.g. "1px #0011CC solid" or "2pt green solid"
// Word does not accept shortened hex colors e.g. #CCC, only full e.g. #CCCCCC
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+|[a-zA-Z]+)\s+([a-z]+)/', $cValue, $matches)) {
if (false !== strpos($cKey, '-')) {
$which = explode('-', $cKey)[1];
$which = ucfirst($which); // e.g. bottom -> Bottom
} else {
$which = '';
}
// Note - border width normalization:
// Width of border in Word is calculated differently than HTML borders, usually showing up too bold.
// Smallest 1px (or 1pt) appears in Word like 2-3px/pt in HTML once converted to twips.
// Therefore we normalize the converted twip value to approximately 1/2 of its value.
// This may be adjusted, if better ratio or formula found.
// BC change: up to ver. 0.17.0 was $size converted to points - Converter::cssToPoint($size)
$size = Converter::cssToTwip($matches[1]);
$size = intval($size / 2);
// valid variants may be e.g. borderSize, borderTopSize, borderLeftColor, etc ..
$styles["border{$which}Size"] = $size; // twips
$styles["border{$which}Color"] = trim($matches[2], '#');
$styles["border{$which}Style"] = self::mapBorderStyle($matches[3]);
}
break;
case 'vertical-align':
// https://developer.mozilla.org/en-US/docs/Web/CSS/vertical-align
if (preg_match('#(?:top|bottom|middle|sub|baseline)#i', $cValue, $matches)) {
$styles['valign'] = self::mapAlignVertical($matches[0]);
}
break;
}
Expand Down Expand Up @@ -651,14 +744,14 @@ protected static function parseImage($node, $element)
case 'float':
if (trim($v) == 'right') {
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_RIGHT;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
$style['overlap'] = true;
}
if (trim($v) == 'left') {
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_LEFT;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
$style['overlap'] = true;
Expand Down Expand Up @@ -773,6 +866,58 @@ protected static function mapAlign($cssAlignment)
}
}

/**
 * Transforms a HTML/CSS vertical alignment keyword into a Word vertical alignment value.
 *
 * Word only understands a small set of vertical alignments (top, center, bottom),
 * so CSS keywords without a direct counterpart are mapped to the closest one:
 * "middle" becomes "center", "sub" becomes "bottom", "text-top" and "baseline" become "top".
 * The lookup is case-insensitive and ignores surrounding whitespace.
 *
 * @param string $alignment HTML valign attribute or CSS vertical-align keyword
 * @return string mapped alignment, or an empty string when the keyword is not recognised
 */
protected static function mapAlignVertical($alignment)
{
    $alignment = strtolower(trim($alignment));
    switch ($alignment) {
        case 'top':
        case 'bottom':
            // identical keyword on both sides
            return $alignment;
        case 'middle':
            return 'center';
        case 'sub':
            return 'bottom';
        case 'text-top':
        case 'baseline':
            // "baseline" used to be listed twice; the first occurrence returned it
            // unchanged, which is not a vertical alignment Word knows about
            return 'top';
        default:
            // @discuss - which one should apply:
            // - Word uses default vert. alignment: top
            // - all browsers use default vert. alignment: middle
            // Returning empty string means the attribute won't be set, so Word's default (top) is used.
            return '';
    }
}

/**
 * Resolve the numbering format of an ordered list from its HTML "type" attribute.
 *
 * Letter case matters for string input: "a" / "i" select the lower-case variants,
 * "A" / "I" the upper-case ones. "1" and every value that is not recognised
 * fall back to decimal numbering.
 *
 * @param string $cssListType value of the type attribute, e.g. "a", "A", "i", "I" or "1"
 * @return mixed the matching NumberFormat constant
 */
protected static function mapListType($cssListType)
{
    // Loose comparison is kept on purpose: it mirrors what a switch statement does.
    if ($cssListType == 'a') {
        return NumberFormat::LOWER_LETTER; // a, b, c, ..
    }
    if ($cssListType == 'A') {
        return NumberFormat::UPPER_LETTER; // A, B, C, ..
    }
    if ($cssListType == 'i') {
        return NumberFormat::LOWER_ROMAN; // i, ii, iii, iv, ..
    }
    if ($cssListType == 'I') {
        return NumberFormat::UPPER_ROMAN; // I, II, III, IV, ..
    }

    // "1" and anything else
    return NumberFormat::DECIMAL; // 1, 2, 3, ..
}

/**
* Parse line break
*
Expand Down Expand Up @@ -808,4 +953,38 @@ protected static function parseLink($node, $element, &$styles)

return $element->addLink($target, $node->textContent, $styles['font'], $styles['paragraph']);
}

/**
 * Render a horizontal rule (<hr>).
 *
 * The rule is emitted as an empty paragraph with a bottom border, spanning the full
 * width of the section. It is not equivalent to HTML's <hr>: width (and therefore
 * alignment) is not supported. Some properties may be controlled through inline styles,
 * e.g. <hr style="border-bottom: 3px #DDDDDD solid; margin-bottom: 0;">.
 *
 * Alternatives that were ruled out for <hr/>:
 * - table - throws error "cannot be inside textruns", e.g. lists
 * - line - that is a shape, has different behaviour
 * - repeated text, e.g. underline "_", because of unpredictable line wrapping
 *
 * @param \DOMNode $node
 * @param \PhpOffice\PhpWord\Element\AbstractContainer $element
 */
protected static function parseHorizRule($node, $element)
{
    $inline = self::parseInlineStyle($node);

    // Border defaults derived from the inline styles, falling back to a thin black line
    $ruleSize = empty($inline['line-height']) ? 1 : $inline['line-height'];
    $ruleColor = empty($inline['color']) ? '000000' : $inline['color'];

    $fontDefaults = ['size' => 3];
    $paragraphDefaults = [
        'lineHeight' => 0.25, // multiply default line height - e.g. 1, 1.5 etc
        'spacing' => 0, // twip
        'spaceBefore' => 120, // twip, 240/2 (default line height)
        'spaceAfter' => 120, // twip
        'borderBottomSize' => $ruleSize,
        'borderBottomColor' => $ruleColor,
        'borderBottomStyle' => 'single', // same as "solid"
    ];

    // Array union: anything supplied inline wins over the defaults above
    $fontStyle = $inline + $fontDefaults;
    $paragraphStyle = $inline + $paragraphDefaults;

    $element->addText('', $fontStyle, $paragraphStyle);
}
}
Loading