From 049d847fa9f4303430b97a4fe73d643d1ace76be Mon Sep 17 00:00:00 2001 From: Joby Elliott Date: Thu, 15 Dec 2022 21:15:47 -0700 Subject: [PATCH] parsing working --- src/Containers/ContainerGroup.php | 13 +- src/Containers/DocumentTags/HtmlTag.php | 2 +- src/Containers/DocumentTags/TitleTag.php | 25 +-- .../DocumentTags/TitleTagInterface.php | 6 +- src/Helpers/Classes.php | 8 + src/Helpers/Styles.php | 8 + src/Html5/Tags/DivTag.php | 12 ++ src/Html5/Tags/PTag.php | 12 ++ src/Nodes/CData.php | 40 ++++ src/Nodes/CDataInterface.php | 13 ++ src/Parser.php | 188 ++++++++++++++++++ .../Containers/DocumentTags/TitleTagTest.php | 8 +- tests/Containers/GenericHtmlDocumentTest.php | 2 +- tests/Nodes/CDataTest.php | 24 +++ tests/ParserTest.php | 61 ++++++ 15 files changed, 384 insertions(+), 38 deletions(-) create mode 100644 src/Html5/Tags/DivTag.php create mode 100644 src/Html5/Tags/PTag.php create mode 100644 src/Nodes/CData.php create mode 100644 src/Nodes/CDataInterface.php create mode 100644 src/Parser.php create mode 100644 tests/Nodes/CDataTest.php create mode 100644 tests/ParserTest.php diff --git a/src/Containers/ContainerGroup.php b/src/Containers/ContainerGroup.php index 64c6c87..024175a 100644 --- a/src/Containers/ContainerGroup.php +++ b/src/Containers/ContainerGroup.php @@ -79,7 +79,7 @@ class ContainerGroup implements ContainerInterface, NodeInterface public function addChild(NodeInterface|Stringable|string $child, bool $prepend = false, bool $skip_sanitize = false): static { - if ($this->willAccept($child, false)) { + if ($this->willAccept($child)) { $this->doAddChild($child, $prepend, $skip_sanitize); $this->enforceChildLimit($prepend); } @@ -88,7 +88,7 @@ class ContainerGroup implements ContainerInterface, NodeInterface public function addChildAfter(NodeInterface|Stringable|string $new_child, NodeInterface|Stringable|string $after_child, bool $skip_sanitize = false): static { - if ($this->willAccept($new_child, false)) { + if ($this->willAccept($new_child)) { $this->doAddChildAfter($new_child, $after_child, $skip_sanitize); $this->enforceChildLimit(false); } @@ -97,7 +97,7 @@ class ContainerGroup implements ContainerInterface, NodeInterface public function addChildBefore(NodeInterface|Stringable|string $new_child, NodeInterface|Stringable|string $before_child, bool $skip_sanitize = false): static { - if ($this->willAccept($new_child, false)) { + if ($this->willAccept($new_child)) { $this->doAddChildBefore($new_child, $before_child, $skip_sanitize); $this->enforceChildLimit(true); } @@ -118,13 +118,8 @@ class ContainerGroup implements ContainerInterface, NodeInterface } } - public function willAccept(NodeInterface|Stringable|string $child, bool $check_limit = true): bool + public function willAccept(NodeInterface|Stringable|string $child): bool { - if ($check_limit && $this->limit > 0) { - if (count($this->children()) >= $this->limit) { - return false; - } - } if ($child instanceof NodeInterface) { $child = $child->detachCopy(); } diff --git a/src/Containers/DocumentTags/HtmlTag.php b/src/Containers/DocumentTags/HtmlTag.php index 9f8cfea..dcc2fa9 100644 --- a/src/Containers/DocumentTags/HtmlTag.php +++ b/src/Containers/DocumentTags/HtmlTag.php @@ -20,7 +20,7 @@ class HtmlTag extends AbstractGroupedTag implements HtmlTagInterface public function __construct() { parent::__construct(); - $this->head = ContainerGroup::ofClass(HeadTagInterface::class); + $this->head = ContainerGroup::ofClass(HeadTagInterface::class, 1); $this->body = ContainerGroup::ofClass(BodyTagInterface::class, 1); $this->addGroup($this->head); $this->addGroup($this->body); diff --git a/src/Containers/DocumentTags/TitleTag.php b/src/Containers/DocumentTags/TitleTag.php index cb470dc..76b30eb 100644 --- a/src/Containers/DocumentTags/TitleTag.php +++ b/src/Containers/DocumentTags/TitleTag.php @@ -2,31 +2,18 @@ namespace ByJoby\HTML\Containers\DocumentTags; -use ByJoby\HTML\Traits\NodeTrait; -use Exception; +use ByJoby\HTML\Tags\AbstractContentTag; +use Stringable; -class TitleTag implements TitleTagInterface +class TitleTag extends AbstractContentTag implements TitleTagInterface { const TAG = 'title'; - - use NodeTrait; - /** @var string */ - protected $title = 'Untitled'; + protected $content = 'Untitled'; - public function setTitle(string $title): static + public function setContent(string|Stringable $content): static { - $this->title = trim(strip_tags($title)); + parent::setContent(trim(strip_tags($content))); return $this; } - - public function title(): string - { - return $this->title; - } - - public function __toString(): string - { - return '' . $this->title() . ''; - } } diff --git a/src/Containers/DocumentTags/TitleTagInterface.php b/src/Containers/DocumentTags/TitleTagInterface.php index fa0e945..a3d477b 100644 --- a/src/Containers/DocumentTags/TitleTagInterface.php +++ b/src/Containers/DocumentTags/TitleTagInterface.php @@ -2,10 +2,8 @@ namespace ByJoby\HTML\Containers\DocumentTags; -use ByJoby\HTML\NodeInterface; +use ByJoby\HTML\Tags\ContentTagInterface; -interface TitleTagInterface extends NodeInterface +interface TitleTagInterface extends ContentTagInterface { - public function title(): string; - public function setTitle(string $title): static; } diff --git a/src/Helpers/Classes.php b/src/Helpers/Classes.php index b16f435..3283579 100644 --- a/src/Helpers/Classes.php +++ b/src/Helpers/Classes.php @@ -30,6 +30,14 @@ class Classes implements Countable } } + public function parse(string $class_string): void + { + foreach (explode(' ', $class_string) as $class) { + $class = trim($class); + if ($class) $this->add($class); + } + } + public function count(): int { return count($this->classes); diff --git a/src/Helpers/Styles.php b/src/Helpers/Styles.php index 763cece..18affe4 100644 --- a/src/Helpers/Styles.php +++ b/src/Helpers/Styles.php @@ -35,6 +35,14 @@ class Styles implements Countable, ArrayAccess, Stringable } } + public function parse(string $css_string): void + { + foreach (explode(';', $css_string) as $rule) { + $rule = explode(':', trim($rule)); + if (count($rule) == 2) $this[$rule[0]] = $rule[1]; + } + } + public function count(): int { return count($this->styles); diff --git a/src/Html5/Tags/DivTag.php b/src/Html5/Tags/DivTag.php new file mode 100644 index 0000000..4c09210 --- /dev/null +++ b/src/Html5/Tags/DivTag.php @@ -0,0 +1,12 @@ +value; + } + + public function setValue(string|Stringable $value): static + { + $this->value = $value; + return $this; + } + + public function __toString(): string + { + return sprintf( + '', + str_replace( + // ending sequence cannot be replaced + ']]>', + // we can split here and render as two cdata sections though -- messy, but the only option + ']]]]>', + $this->value() + ) + ); + } +} diff --git a/src/Nodes/CDataInterface.php b/src/Nodes/CDataInterface.php new file mode 100644 index 0000000..f2485c7 --- /dev/null +++ b/src/Nodes/CDataInterface.php @@ -0,0 +1,13 @@ + */ + protected $tag_namespaces = [ + '\\ByJoby\\HTML\\Html5\\Tags\\', + '\\ByJoby\\HTML\\Containers\\DocumentTags\\' + ]; + + /** @var array> */ + protected $tag_classes = []; + + /** @var class-string */ + protected $comment_class = Comment::class; + + /** @var class-string */ + protected $text_class = Text::class; + + /** @var class-string */ + protected $cdata_class = CData::class; + + /** @var class-string */ + protected $document_class = GenericHtmlDocument::class; + + /** @var class-string */ + protected $fragment_class = Fragment::class; + + public function parseFragment(string $html): FragmentInterface + { + $fragment = new ($this->fragment_class); + $dom = new DOMDocument(); + $dom->loadHTML( + '
' . $html . '
', // wrap in DIV otherwise it will wrap root-level text in P tags + LIBXML_BIGLINES + | LIBXML_COMPACT + | LIBXML_HTML_NOIMPLIED + | LIBXML_HTML_NODEFDTD + | LIBXML_PARSEHUGE + | LIBXML_NOERROR + ); + $this->walkDom($dom->childNodes[0], $fragment); + return $fragment; + } + + public function parseDocument(string $html): HtmlDocumentInterface + { + /** @var HtmlDocumentInterface */ + $document = new ($this->document_class); + $dom = new DOMDocument(); + $dom->loadHTML( + $html, + LIBXML_BIGLINES + | LIBXML_COMPACT + | LIBXML_HTML_NODEFDTD + | LIBXML_PARSEHUGE + | LIBXML_NOERROR + ); + $this->walkDom($dom, $document); + return $document; + } + + protected function walkDom(DOMNode $node, ContainerInterface $parent): void + { + foreach ($node->childNodes as $child) { + if ($converted_child = $this->convertNode($child)) { + // append converted child to parent + $parent->addChild($converted_child); + // walk DOM for child if it is a container + if ($converted_child instanceof ContainerInterface) { + $this->walkDom($child, $converted_child); + } + } + } + } + + protected function convertNode(DOMNode $node): null|NodeInterface + { + if ($node instanceof DOMElement) { + return $this->convertNodeToTag($node); + } elseif ($node instanceof DOMComment) { + return new ($this->comment_class)($node->textContent); + } elseif ($node instanceof DOMText) { + return new ($this->text_class)($node->textContent); + } + // This line shouldn't be reached, but if it is it's philosophically + // consistent to simply ignore unknown node types + return null; // @codeCoverageIgnore + } + + protected function convertNodeToTag(DOMElement $node): null|NodeInterface + { + // build object + $class = $this->tagClass($node->tagName); + if (!$class) return null; + $tag = new $class; + // tool for settin gup content tags + if ($tag instanceof ContentTagInterface) { + $tag->setContent($node->textContent); + } + // external helper methods to stay tidy + if ($tag instanceof TagInterface) { + $this->processAttributes($node, $tag); + } + return $tag; + } + + protected function processAttributes(DOMElement $node, TagInterface $tag): void + { + if (!$node->attributes) return; + /** @var array */ + $attributes = []; + // absorb attributes + /** @var DOMNode $attribute */ + foreach ($node->attributes as $attribute) { + if ($attribute->nodeValue) { + $attributes[$attribute->nodeName] = $attribute->nodeValue; + } else { + $attributes[$attribute->nodeName] = true; + } + } + // set attributes + foreach ($attributes as $k => $v) { + if ($k == 'id' && is_string($v)) { + $tag->setID($v); + } elseif ($k == 'class' && is_string($v)) { + $tag->classes()->parse($v); + } elseif ($k == 'style' && is_string($v)) { + $tag->styles()->parse($v); + } else { + // make an effort to set ID + try { + $tag->attributes()["$k"] = $v; + } + // it is correct to ignore attributes that are unsettable + catch (\Throwable $th) { // @codeCoverageIgnore + // does nothing + } + } + } + } + + /** + * @param string $tag + * @return class-string|null + */ + protected function tagClass(string $tag): string|null + { + // look for an explicitly-set class + if (isset($this->tag_classes[$tag])) { + return $this->tag_classes[$tag]; + } + // otherwise loop through namespaces and try to find a tag + foreach ($this->tag_namespaces as $namespace) { + $class = $namespace . ucfirst($tag) . 'Tag'; + if (class_exists($class)) { + $implements = class_implements($class); + $implements = $implements ? $implements : []; + if (in_array(NodeInterface::class, $implements)) { + $this->tag_classes[$tag] = $class; // @phpstan-ignore-line + return $class; // @phpstan-ignore-line + } + } + } + // return null if nothing found + return null; + } +} diff --git a/tests/Containers/DocumentTags/TitleTagTest.php b/tests/Containers/DocumentTags/TitleTagTest.php index 5e86f3a..710deb9 100644 --- a/tests/Containers/DocumentTags/TitleTagTest.php +++ b/tests/Containers/DocumentTags/TitleTagTest.php @@ -9,9 +9,9 @@ class TitleTagTest extends TestCase public function testGetAndSet() { $title = new TitleTag; - $this->assertEquals('Untitled', $title->title()); - $title->setTitle('Titled'); - $this->assertEquals('Titled', $title->title()); - $this->assertEquals('Titled', $title->__toString()); + $this->assertEquals('Untitled', $title->content()); + $title->setContent('Titled'); + $this->assertEquals('Titled', $title->content()); + $this->assertEquals('' . PHP_EOL . 'Titled' . PHP_EOL . '', $title->__toString()); } } diff --git a/tests/Containers/GenericHtmlDocumentTest.php b/tests/Containers/GenericHtmlDocumentTest.php index 39f08ab..e7f8b82 100644 --- a/tests/Containers/GenericHtmlDocumentTest.php +++ b/tests/Containers/GenericHtmlDocumentTest.php @@ -34,7 +34,7 @@ class GenericHtmlDocumentTest extends TestCase '', '', '', - 'Untitled', + '', 'Untitled', '', '', '', '' diff --git a/tests/Nodes/CDataTest.php b/tests/Nodes/CDataTest.php new file mode 100644 index 0000000..d5b095c --- /dev/null +++ b/tests/Nodes/CDataTest.php @@ -0,0 +1,24 @@ +assertEquals('', new CData('')); + $this->assertEquals('', new CData('foo')); + $this->assertEquals('', new CData('foo-bar')); + $this->assertEquals('bar]]>', (new CData('foo]]>bar'))->__toString()); + } + + public function testModification(): void + { + $cdata = new CData('foo'); + $this->assertEquals('foo', $cdata->value()); + $cdata->setValue('bar'); + $this->assertEquals('bar', $cdata->value()); + } +} diff --git a/tests/ParserTest.php b/tests/ParserTest.php new file mode 100644 index 0000000..12aa71a --- /dev/null +++ b/tests/ParserTest.php @@ -0,0 +1,61 @@ +parseFragment('foobar'); + $this->assertInstanceOf(TextInterface::class, $fragment->children()[0]); + $fragment = $parser->parseFragment('foobar
fizzbuzz
'); + $this->assertInstanceOf(TextInterface::class, $fragment->children()[0]); + $this->assertInstanceOf(DivTag::class, $fragment->children()[1]); + } + + public function testAttributes() + { + $parser = new Parser(); + $fragment = $parser->parseFragment('
'); + $this->assertEquals('foo', $fragment->children()[0]->id()); + $this->assertEquals('b', $fragment->children()[0]->attributes()['a']); + $this->assertEquals('d', $fragment->children()[0]->attributes()['c']); + } + + public function testStylesAndClasses() + { + $parser = new Parser(); + $fragment = $parser->parseFragment('
'); + $this->assertEquals(['bar', 'foo'], $fragment->children()[0]->classes()->getArray()); + $this->assertEquals(['background-color' => 'blue', 'color' => 'red'], $fragment->children()[0]->styles()->getArray()); + } + + public function testNesting() + { + $parser = new Parser(); + $fragment = $parser->parseFragment('

foobar

foo

'); + $this->assertInstanceOf(DivTag::class, $fragment->children()[0]); + $this->assertCount(2, $fragment->children()[0]->children()); + $this->assertCount(3, $fragment->children()[0]->children()[0]->children()); + } + + public function testUnknownTags() + { + $parser = new Parser(); + $fragment = $parser->parseFragment('
'); + $this->assertCount(1, $fragment->children()); + } + + public function testParseDocument() + { + $parser = new Parser(); + $document = $parser->parseDocument('Title
foo
'); + $this->assertEquals('Title', $document->html()->head()->title()->content()); + $this->assertEquals('
' . PHP_EOL . 'foo' . PHP_EOL . '
', $document->body()->children()[0]->__toString()); + } +}