Skip to content

Commit 9e322dd

Browse files
authored
Merge pull request #1902 from lubosdz/develop
Enhancements to addHTML parser
2 parents bf8f2ac + 69632b3 commit 9e322dd

File tree

2 files changed

+465
-16
lines changed

2 files changed

+465
-16
lines changed

src/PhpWord/Shared/Html.php

Lines changed: 195 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,10 @@ public static function addHtml($element, $html, $fullHTML = false, $preserveWhit
6262
// Preprocess: remove all line ends, decode HTML entity,
6363
// fix ampersand and angle brackets and add body tag for HTML fragments
6464
$html = str_replace(array("\n", "\r"), '', $html);
65-
$html = str_replace(array('<', '>', '&'), array('_lt_', '_gt_', '_amp_'), $html);
65+
$html = str_replace(array('<', '>', '&', '"'), array('_lt_', '_gt_', '_amp_', '_quot_'), $html);
6666
$html = html_entity_decode($html, ENT_QUOTES, 'UTF-8');
6767
$html = str_replace('&', '&amp;', $html);
68-
$html = str_replace(array('_lt_', '_gt_', '_amp_'), array('<', '>', '&'), $html);
68+
$html = str_replace(array('_lt_', '_gt_', '_amp_', '_quot_'), array('<', '>', '&', '"'), $html);
6969

7070
if (false === $fullHTML) {
7171
$html = '<body>' . $html . '</body>';
@@ -100,15 +100,43 @@ protected static function parseInlineStyle($node, $styles = array())
100100
$attributes = $node->attributes; // get all the attributes(eg: id, class)
101101

102102
foreach ($attributes as $attribute) {
103-
switch ($attribute->name) {
103+
$val = $attribute->value;
104+
switch (strtolower($attribute->name)) {
104105
case 'style':
105106
$styles = self::parseStyle($attribute, $styles);
106107
break;
107108
case 'align':
108-
$styles['alignment'] = self::mapAlign($attribute->value);
109+
$styles['alignment'] = self::mapAlign(trim($val));
109110
break;
110111
case 'lang':
111-
$styles['lang'] = $attribute->value;
112+
$styles['lang'] = $val;
113+
break;
114+
case 'width':
115+
// tables, cells
116+
if (false !== strpos($val, '%')) {
117+
// e.g. <table width="100%"> or <td width="50%">
118+
$styles['width'] = intval($val) * 50;
119+
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::PERCENT;
120+
} else {
121+
// e.g. <table width="250"> where "250" = 250px (always pixels)
122+
$styles['width'] = Converter::pixelToTwip($val);
123+
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::TWIP;
124+
}
125+
break;
126+
case 'cellspacing':
127+
// tables e.g. <table cellspacing="2">, where "2" = 2px (always pixels)
128+
$val = intval($val).'px';
129+
$styles['cellSpacing'] = Converter::cssToTwip($val);
130+
break;
131+
case 'bgcolor':
132+
// tables, rows, cells e.g. <tr bgColor="#FF0000">
133+
$styles['bgColor'] = trim($val, '# ');
134+
break;
135+
case 'valign':
136+
// cells e.g. <td valign="middle">
137+
if (preg_match('#(?:top|bottom|middle|baseline)#i', $val, $matches)) {
138+
$styles['valign'] = self::mapAlignVertical($matches[0]);
139+
}
112140
break;
113141
}
114142
}
@@ -165,6 +193,7 @@ protected static function parseNode($node, $element, $styles = array(), $data =
165193
'img' => array('Image', $node, $element, $styles, null, null, null),
166194
'br' => array('LineBreak', null, $element, $styles, null, null, null),
167195
'a' => array('Link', $node, $element, $styles, null, null, null),
196+
'hr' => array('HorizRule', $node, $element, $styles, null, null, null),
168197
);
169198

170199
$newElement = null;
@@ -365,7 +394,11 @@ protected static function parseCell($node, $element, &$styles)
365394
if (!empty($colspan)) {
366395
$cellStyles['gridSpan'] = $colspan - 0;
367396
}
368-
$cell = $element->addCell(null, $cellStyles);
397+
398+
// set cell width to control column widths
399+
$width = isset($cellStyles['width']) ? $cellStyles['width'] : null;
400+
unset($cellStyles['width']); // would not apply
401+
$cell = $element->addCell($width, $cellStyles);
369402

370403
if (self::shouldAddTextRun($node)) {
371404
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
@@ -424,7 +457,32 @@ protected static function parseList($node, $element, &$styles, &$data)
424457
} else {
425458
$data['listdepth'] = 0;
426459
$styles['list'] = 'listStyle_' . self::$listIndex++;
427-
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
460+
$style = $element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
461+
462+
// extract attributes start & type e.g. <ol type="A" start="3">
463+
$start = 0;
464+
$type = '';
465+
foreach ($node->attributes as $attribute) {
466+
switch ($attribute->name) {
467+
case 'start':
468+
$start = (int) $attribute->value;
469+
break;
470+
case 'type':
471+
$type = $attribute->value;
472+
break;
473+
}
474+
}
475+
476+
$levels = $style->getLevels();
477+
/** @var \PhpOffice\PhpWord\Style\NumberingLevel $level */
478+
$level = $levels[0];
479+
if ($start > 0) {
480+
$level->setStart($start);
481+
}
482+
$type = $type ? self::mapListType($type) : null;
483+
if ($type) {
484+
$level->setFormat($type);
485+
}
428486
}
429487
if ($node->parentNode->nodeName === 'li') {
430488
return $element->getParent();
@@ -506,7 +564,8 @@ protected static function parseStyle($attribute, $styles)
506564
foreach ($properties as $property) {
507565
list($cKey, $cValue) = array_pad(explode(':', $property, 2), 2, null);
508566
$cValue = trim($cValue);
509-
switch (trim($cKey)) {
567+
$cKey = strtolower(trim($cKey));
568+
switch ($cKey) {
510569
case 'text-decoration':
511570
switch ($cValue) {
512571
case 'underline':
@@ -579,11 +638,18 @@ protected static function parseStyle($attribute, $styles)
579638
}
580639
$styles['italic'] = $tValue;
581640
break;
641+
case 'margin':
642+
$cValue = Converter::cssToTwip($cValue);
643+
$styles['spaceBefore'] = $cValue;
644+
$styles['spaceAfter'] = $cValue;
645+
break;
582646
case 'margin-top':
583-
$styles['spaceBefore'] = Converter::cssToPoint($cValue);
647+
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
648+
$styles['spaceBefore'] = Converter::cssToTwip($cValue);
584649
break;
585650
case 'margin-bottom':
586-
$styles['spaceAfter'] = Converter::cssToPoint($cValue);
651+
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
652+
$styles['spaceAfter'] = Converter::cssToTwip($cValue);
587653
break;
588654
case 'border-color':
589655
self::mapBorderColor($styles, $cValue);
@@ -607,10 +673,37 @@ protected static function parseStyle($attribute, $styles)
607673
}
608674
break;
609675
case 'border':
610-
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+)\s+([a-z]+)/', $cValue, $matches)) {
611-
$styles['borderSize'] = Converter::cssToPoint($matches[1]);
612-
$styles['borderColor'] = trim($matches[2], '#');
613-
$styles['borderStyle'] = self::mapBorderStyle($matches[3]);
676+
case 'border-top':
677+
case 'border-bottom':
678+
case 'border-right':
679+
case 'border-left':
680+
// must have exact order [width color style], e.g. "1px #0011CC solid" or "2pt green solid"
681+
// Word does not accept shortened hex colors e.g. #CCC, only full e.g. #CCCCCC
682+
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+|[a-zA-Z]+)\s+([a-z]+)/', $cValue, $matches)) {
683+
if (false !== strpos($cKey, '-')) {
684+
$which = explode('-', $cKey)[1];
685+
$which = ucfirst($which); // e.g. bottom -> Bottom
686+
} else {
687+
$which = '';
688+
}
689+
// Note - border width normalization:
690+
// Width of border in Word is calculated differently than HTML borders, usually showing up too bold.
691+
// Smallest 1px (or 1pt) appears in Word like 2-3px/pt in HTML once converted to twips.
692+
// Therefore we normalize the converted twip value to approximately half of its value.
693+
// This may be adjusted, if better ratio or formula found.
694+
// BC change: up to ver. 0.17.0 $size was converted to points - Converter::cssToPoint($size)
695+
$size = Converter::cssToTwip($matches[1]);
696+
$size = intval($size / 2);
697+
// valid variants may be e.g. borderSize, borderTopSize, borderLeftColor, etc ..
698+
$styles["border{$which}Size"] = $size; // twips
699+
$styles["border{$which}Color"] = trim($matches[2], '#');
700+
$styles["border{$which}Style"] = self::mapBorderStyle($matches[3]);
701+
}
702+
break;
703+
case 'vertical-align':
704+
// https://developer.mozilla.org/en-US/docs/Web/CSS/vertical-align
705+
if (preg_match('#(?:top|bottom|middle|sub|baseline)#i', $cValue, $matches)) {
706+
$styles['valign'] = self::mapAlignVertical($matches[0]);
614707
}
615708
break;
616709
}
@@ -655,14 +748,14 @@ protected static function parseImage($node, $element)
655748
case 'float':
656749
if (trim($v) == 'right') {
657750
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_RIGHT;
658-
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
751+
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
659752
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
660753
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
661754
$style['overlap'] = true;
662755
}
663756
if (trim($v) == 'left') {
664757
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_LEFT;
665-
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
758+
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
666759
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
667760
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
668761
$style['overlap'] = true;
@@ -777,6 +870,58 @@ protected static function mapAlign($cssAlignment)
777870
}
778871
}
779872

873+
/**
 * Map an HTML "valign" attribute / CSS "vertical-align" keyword to a Word vertical alignment.
 *
 * Matching is case-insensitive. Word's vertical alignment (OOXML ST_VerticalJc) accepts
 * "top", "center", "both" and "bottom" only, so CSS keywords that have no direct
 * counterpart are mapped onto the nearest supported value.
 *
 * @param string $alignment Keyword such as "top", "middle", "bottom", "baseline", "sub" or "text-top"
 * @return string "top", "center" or "bottom"; empty string when the keyword is not recognised
 */
protected static function mapAlignVertical($alignment)
{
    switch (strtolower($alignment)) {
        case 'top':
        case 'text-top':
        case 'baseline':
            // "baseline" and "text-top" have no Word equivalent; top is the closest match.
            // (Previously "baseline" was returned verbatim, which is not a valid Word value,
            // and a second, unreachable "baseline" case mapped it to "top".)
            return 'top';
        case 'middle':
            return 'center';
        case 'bottom':
        case 'sub':
            return 'bottom';
        default:
            // @discuss - which one should apply:
            // - Word uses default vert. alignment: top
            // - all browsers use default vert. alignment: middle
            // Returning an empty string means the attribute won't be set, so Word's default (top) is used.
            return '';
    }
}
902+
903+
/**
 * Translate the "type" attribute of an ordered list into a numbering format.
 *
 * @param string $cssListType Value of the HTML attribute, e.g. "a", "A", "i", "I" or "1"
 * @return string One of the NumberFormat constants; decimal for "1" and for any unknown value
 */
protected static function mapListType($cssListType)
{
    // Checked in this order; the comparison is intentionally loose (==).
    $formatsByMarker = array(
        'a' => NumberFormat::LOWER_LETTER, // a, b, c, ..
        'A' => NumberFormat::UPPER_LETTER, // A, B, C, ..
        'i' => NumberFormat::LOWER_ROMAN,  // i, ii, iii, iv, ..
        'I' => NumberFormat::UPPER_ROMAN,  // I, II, III, IV, ..
    );

    foreach ($formatsByMarker as $marker => $format) {
        if ($cssListType == $marker) {
            return $format;
        }
    }

    // "1" and everything else: 1, 2, 3, ..
    return NumberFormat::DECIMAL;
}
924+
780925
/**
781926
* Parse line break
782927
*
@@ -812,4 +957,38 @@ protected static function parseLink($node, $element, &$styles)
812957

813958
return $element->addLink($target, $node->textContent, $styles['font'], $styles['paragraph']);
814959
}
960+
961+
/**
 * Render a horizontal rule (<hr>).
 *
 * The rule is emitted as an empty text paragraph with a bottom border.
 * Note: this is not the same as HTML's <hr>: it does not support width and thus neither alignment.
 * Styles parsed from the node take precedence over the defaults below, so some properties can be
 * controlled, e.g. <hr style="border-bottom: 3px #DDDDDD solid; margin-bottom: 0;">
 *
 * Alternatives that were ruled out for <hr/>:
 * - table - throws error "cannot be inside textruns", e.g. lists
 * - line - that is a shape, has different behaviour
 * - repeated text, e.g. underline "_", because of unpredictable line wrapping
 *
 * @param \DOMNode $node
 * @param \PhpOffice\PhpWord\Element\AbstractContainer $element
 */
protected static function parseHorizRule($node, $element)
{
    $inlineStyles = self::parseInlineStyle($node);

    // Border thickness and colour fall back to 1 / black when the node does not supply them.
    $ruleSize = empty($inlineStyles['line-height']) ? 1 : $inlineStyles['line-height'];
    $ruleColor = empty($inlineStyles['color']) ? '000000' : $inlineStyles['color'];

    $paragraphDefaults = [
        'lineHeight' => 0.25, // multiply default line height - e.g. 1, 1.5 etc
        'spacing' => 0, // twip
        'spaceBefore' => 120, // twip, 240/2 (default line height)
        'spaceAfter' => 120, // twip
        'borderBottomSize' => $ruleSize,
        'borderBottomColor' => $ruleColor,
        'borderBottomStyle' => 'single', // same as "solid"
    ];
    $fontDefaults = ['size' => 3];

    // Array union: keys already present in the parsed styles win over the defaults.
    $fontStyle = $inlineStyles + $fontDefaults;
    $paragraphStyle = $inlineStyles + $paragraphDefaults;

    $element->addText("", $fontStyle, $paragraphStyle);
}
815994
}

0 commit comments

Comments
 (0)