Skip to content

Commit 276f618

Browse files
committed
feat: Enhancements to addHTML parser
pr PHPOffice#1902 on original repo
1 parent ec1b3d3 commit 276f618

File tree

2 files changed

+465
-16
lines changed

2 files changed

+465
-16
lines changed

src/PhpWord/Shared/Html.php

Lines changed: 195 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,10 @@ public static function addHtml($element, $html, $fullHTML = false, $preserveWhit
6262
// Preprocess: remove all line ends, decode HTML entity,
6363
// fix ampersand and angle brackets and add body tag for HTML fragments
6464
$html = str_replace(array("\n", "\r"), '', $html);
65-
$html = str_replace(array('<', '>', '&'), array('_lt_', '_gt_', '_amp_'), $html);
65+
$html = str_replace(array('<', '>', '&', '"'), array('_lt_', '_gt_', '_amp_', '_quot_'), $html);
6666
$html = html_entity_decode($html, ENT_QUOTES, 'UTF-8');
6767
$html = str_replace('&', '&amp;', $html);
68-
$html = str_replace(array('_lt_', '_gt_', '_amp_'), array('<', '>', '&'), $html);
68+
$html = str_replace(array('_lt_', '_gt_', '_amp_', '_quot_'), array('<', '>', '&', '"'), $html);
6969

7070
if (false === $fullHTML) {
7171
$html = '<body>' . $html . '</body>';
@@ -96,15 +96,43 @@ protected static function parseInlineStyle($node, $styles = array())
9696
$attributes = $node->attributes; // get all the attributes(eg: id, class)
9797

9898
foreach ($attributes as $attribute) {
99-
switch ($attribute->name) {
99+
$val = $attribute->value;
100+
switch (strtolower($attribute->name)) {
100101
case 'style':
101102
$styles = self::parseStyle($attribute, $styles);
102103
break;
103104
case 'align':
104-
$styles['alignment'] = self::mapAlign($attribute->value);
105+
$styles['alignment'] = self::mapAlign(trim($val));
105106
break;
106107
case 'lang':
107-
$styles['lang'] = $attribute->value;
108+
$styles['lang'] = $val;
109+
break;
110+
case 'width':
111+
// tables, cells
112+
if (false !== strpos($val, '%')) {
113+
// e.g. <table width="100%"> or <td width="50%">
114+
$styles['width'] = intval($val) * 50;
115+
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::PERCENT;
116+
} else {
117+
// e.g. <table width="250"> where "250" = 250px (always pixels)
118+
$styles['width'] = Converter::pixelToTwip($val);
119+
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::TWIP;
120+
}
121+
break;
122+
case 'cellspacing':
123+
// tables e.g. <table cellspacing="2">, where "2" = 2px (always pixels)
124+
$val = intval($val).'px';
125+
$styles['cellSpacing'] = Converter::cssToTwip($val);
126+
break;
127+
case 'bgcolor':
128+
// tables, rows, cells e.g. <tr bgColor="#FF0000">
129+
$styles['bgColor'] = trim($val, '# ');
130+
break;
131+
case 'valign':
132+
// cells e.g. <td valign="middle">
133+
if (preg_match('#(?:top|bottom|middle|baseline)#i', $val, $matches)) {
134+
$styles['valign'] = self::mapAlignVertical($matches[0]);
135+
}
108136
break;
109137
}
110138
}
@@ -161,6 +189,7 @@ protected static function parseNode($node, $element, $styles = array(), $data =
161189
'img' => array('Image', $node, $element, $styles, null, null, null),
162190
'br' => array('LineBreak', null, $element, $styles, null, null, null),
163191
'a' => array('Link', $node, $element, $styles, null, null, null),
192+
'hr' => array('HorizRule', $node, $element, $styles, null, null, null),
164193
);
165194

166195
$newElement = null;
@@ -361,7 +390,11 @@ protected static function parseCell($node, $element, &$styles)
361390
if (!empty($colspan)) {
362391
$cellStyles['gridSpan'] = $colspan - 0;
363392
}
364-
$cell = $element->addCell(null, $cellStyles);
393+
394+
// set cell width to control column widths
395+
$width = isset($cellStyles['width']) ? $cellStyles['width'] : null;
396+
unset($cellStyles['width']); // would not apply
397+
$cell = $element->addCell($width, $cellStyles);
365398

366399
if (self::shouldAddTextRun($node)) {
367400
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
@@ -420,7 +453,32 @@ protected static function parseList($node, $element, &$styles, &$data)
420453
} else {
421454
$data['listdepth'] = 0;
422455
$styles['list'] = 'listStyle_' . self::$listIndex++;
423-
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
456+
$style = $element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
457+
458+
// extract attributes start & type e.g. <ol type="A" start="3">
459+
$start = 0;
460+
$type = '';
461+
foreach ($node->attributes as $attribute) {
462+
switch ($attribute->name) {
463+
case 'start':
464+
$start = (int) $attribute->value;
465+
break;
466+
case 'type':
467+
$type = $attribute->value;
468+
break;
469+
}
470+
}
471+
472+
$levels = $style->getLevels();
473+
/** @var \PhpOffice\PhpWord\Style\NumberingLevel */
474+
$level = $levels[0];
475+
if ($start > 0) {
476+
$level->setStart($start);
477+
}
478+
$type = $type ? self::mapListType($type) : null;
479+
if ($type) {
480+
$level->setFormat($type);
481+
}
424482
}
425483
if ($node->parentNode->nodeName === 'li') {
426484
return $element->getParent();
@@ -502,7 +560,8 @@ protected static function parseStyle($attribute, $styles)
502560
foreach ($properties as $property) {
503561
list($cKey, $cValue) = array_pad(explode(':', $property, 2), 2, null);
504562
$cValue = trim($cValue);
505-
switch (trim($cKey)) {
563+
$cKey = strtolower(trim($cKey));
564+
switch ($cKey) {
506565
case 'text-decoration':
507566
switch ($cValue) {
508567
case 'underline':
@@ -575,11 +634,18 @@ protected static function parseStyle($attribute, $styles)
575634
}
576635
$styles['italic'] = $tValue;
577636
break;
637+
case 'margin':
638+
$cValue = Converter::cssToTwip($cValue);
639+
$styles['spaceBefore'] = $cValue;
640+
$styles['spaceAfter'] = $cValue;
641+
break;
578642
case 'margin-top':
579-
$styles['spaceBefore'] = Converter::cssToPoint($cValue);
643+
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
644+
$styles['spaceBefore'] = Converter::cssToTwip($cValue);
580645
break;
581646
case 'margin-bottom':
582-
$styles['spaceAfter'] = Converter::cssToPoint($cValue);
647+
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
648+
$styles['spaceAfter'] = Converter::cssToTwip($cValue);
583649
break;
584650
case 'border-color':
585651
self::mapBorderColor($styles, $cValue);
@@ -603,10 +669,37 @@ protected static function parseStyle($attribute, $styles)
603669
}
604670
break;
605671
case 'border':
606-
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+)\s+([a-z]+)/', $cValue, $matches)) {
607-
$styles['borderSize'] = Converter::cssToPoint($matches[1]);
608-
$styles['borderColor'] = trim($matches[2], '#');
609-
$styles['borderStyle'] = self::mapBorderStyle($matches[3]);
672+
case 'border-top':
673+
case 'border-bottom':
674+
case 'border-right':
675+
case 'border-left':
676+
// must have exact order [width color style], e.g. "1px #0011CC solid" or "2pt green solid"
677+
// Word does not accept shortened hex colors e.g. #CCC, only full e.g. #CCCCCC
678+
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+|[a-zA-Z]+)\s+([a-z]+)/', $cValue, $matches)) {
679+
if (false !== strpos($cKey, '-')) {
680+
$which = explode('-', $cKey)[1];
681+
$which = ucfirst($which); // e.g. bottom -> Bottom
682+
} else {
683+
$which = '';
684+
}
685+
// Note - border width normalization:
686+
// Width of border in Word is calculated differently than HTML borders, usually showing up too bold.
687+
// Smallest 1px (or 1pt) appears in Word like 2-3px/pt in HTML once converted to twips.
688+
// Therefore we need to normalize the converted twip value to approximately 1/2 of its value.
689+
// This may be adjusted, if better ratio or formula found.
690+
// BC change: up to ver. 0.17.0 was $size converted to points - Converter::cssToPoint($size)
691+
$size = Converter::cssToTwip($matches[1]);
692+
$size = intval($size / 2);
693+
// valid variants may be e.g. borderSize, borderTopSize, borderLeftColor, etc ..
694+
$styles["border{$which}Size"] = $size; // twips
695+
$styles["border{$which}Color"] = trim($matches[2], '#');
696+
$styles["border{$which}Style"] = self::mapBorderStyle($matches[3]);
697+
}
698+
break;
699+
case 'vertical-align':
700+
// https://developer.mozilla.org/en-US/docs/Web/CSS/vertical-align
701+
if (preg_match('#(?:top|bottom|middle|sub|baseline)#i', $cValue, $matches)) {
702+
$styles['valign'] = self::mapAlignVertical($matches[0]);
610703
}
611704
break;
612705
}
@@ -651,14 +744,14 @@ protected static function parseImage($node, $element)
651744
case 'float':
652745
if (trim($v) == 'right') {
653746
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_RIGHT;
654-
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
747+
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
655748
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
656749
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
657750
$style['overlap'] = true;
658751
}
659752
if (trim($v) == 'left') {
660753
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_LEFT;
661-
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
754+
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
662755
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
663756
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
664757
$style['overlap'] = true;
@@ -773,6 +866,58 @@ protected static function mapAlign($cssAlignment)
773866
}
774867
}
775868

869+
/**
 * Transforms an HTML/CSS vertical alignment keyword into a Word vertical alignment value.
 *
 * Used for the HTML "valign" attribute and the CSS "vertical-align" property.
 * Matching is case-insensitive.
 *
 * @param string $alignment HTML/CSS keyword, e.g. top, middle, bottom, baseline, sub, text-top
 * @return string 'top', 'center' or 'bottom'; an empty string for unsupported keywords
 */
protected static function mapAlignVertical($alignment)
{
    switch (strtolower($alignment)) {
        case 'top':
        case 'text-top':
        case 'baseline':
            // "baseline" and "text-top" have no direct Word counterpart; the closest match is top.
            // "baseline" used to be listed twice, so it was returned verbatim and the
            // mapping to 'top' was never reached.
            return 'top';
        case 'middle':
            return 'center';
        case 'bottom':
        case 'sub':
            return 'bottom';
        default:
            // @discuss - which one should apply:
            // - Word uses default vert. alignment: top
            // - all browsers use default vert. alignment: middle
            // Returning an empty string means the attribute won't be set, so Word's default (top) is used.
            return '';
    }
}
898+
899+
/**
 * Resolve the numbering format of an ordered list from its HTML "type" attribute.
 *
 * The lookup is case-sensitive: "a" and "A" (likewise "i" and "I") select different formats.
 *
 * @param string $cssListType value of the type attribute: a, A, i, I or 1
 * @return mixed a NumberFormat constant; NumberFormat::DECIMAL for "1" and any unrecognised value
 */
protected static function mapListType($cssListType)
{
    $formats = array(
        'a' => NumberFormat::LOWER_LETTER, // a, b, c, ..
        'A' => NumberFormat::UPPER_LETTER, // A, B, C, ..
        'i' => NumberFormat::LOWER_ROMAN,  // i, ii, iii, iv, ..
        'I' => NumberFormat::UPPER_ROMAN,  // I, II, III, IV, ..
    );

    if (isset($formats[$cssListType])) {
        return $formats[$cssListType];
    }

    // "1" and every other value fall back to plain numbers: 1, 2, 3, ..
    return NumberFormat::DECIMAL;
}
920+
776921
/**
777922
* Parse line break
778923
*
@@ -808,4 +953,38 @@ protected static function parseLink($node, $element, &$styles)
808953

809954
return $element->addLink($target, $node->textContent, $styles['font'], $styles['paragraph']);
810955
}
956+
957+
/**
 * Render a horizontal rule (<hr>).
 *
 * The rule is written as an empty paragraph with a bottom border, so it spans
 * 100% of the section. Word's rule is not the same as HTML's <hr>: it supports
 * neither width nor, as a consequence, alignment.
 *
 * Some properties can be controlled through inline styles, e.g.
 * <hr style="border-bottom: 3px #DDDDDD solid; margin-bottom: 0;">
 *
 * @param \DOMNode $node
 * @param \PhpOffice\PhpWord\Element\AbstractContainer $element
 */
protected static function parseHorizRule($node, $element)
{
    // Styles declared on the <hr> itself; they take precedence over every default below.
    $inline = self::parseInlineStyle($node);

    // NOTE(review): no code visible here stores a 'line-height' key - confirm this fallback is ever overridden.
    $borderSize = 1;
    if (!empty($inline['line-height'])) {
        $borderSize = $inline['line-height'];
    }

    $borderColor = '000000';
    if (!empty($inline['color'])) {
        $borderColor = $inline['color'];
    }

    $paragraphDefaults = [
        'lineHeight'        => 0.25, // multiply default line height - e.g. 1, 1.5 etc
        'spacing'           => 0, // twip
        'spaceBefore'       => 120, // twip, 240/2 (default line height)
        'spaceAfter'        => 120, // twip
        'borderBottomSize'  => $borderSize,
        'borderBottomColor' => $borderColor,
        'borderBottomStyle' => 'single', // same as "solid"
    ];

    // The array union keeps the inline value whenever a key exists on both sides.
    $fontStyle = $inline + ['size' => 3];
    $paragraphStyle = $inline + $paragraphDefaults;

    $element->addText('', $fontStyle, $paragraphStyle);

    // Notes: <hr/> cannot be:
    // - table - throws error "cannot be inside textruns", e.g. lists
    // - line - that is a shape, has different behaviour
    // - repeated text, e.g. underline "_", because of unpredictable line wrapping
}
811990
}

0 commit comments

Comments
 (0)