Skip to content

Commit 98c0d38

Browse files
committed
merged branch jakzal/domcrawler-namespace-autodiscovery (PR symfony#6650)
This PR was merged into the master branch. Discussion ---------- [DomCrawler] Added auto-discovery and explicit registration of namespaces in filter() and filterByXPath() | Q | A | --- | --- |Bug fix: | no |Feature addition: |yes |Backwards compatibility break: | yes, default namespace is no longer removed in the `addContent` method |Symfony2 tests pass: | yes| |Fixes the following tickets: | symfony#4845 |Todo: | - |License of the code:| MIT |Documentation PR: | symfony/symfony-docs#2979 * added support for automatic discovery and explicit registration of document namespaces for `Crawler::filterXPath()` and `Crawler::filter()` * improved content type guessing in `Crawler::addContent()` * [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document namespace As I mentioned in symfony#4845, it would probably be possible to use [DOMNode::lookupNamespaceURI()](http://www.php.net/manual/en/domnode.lookupnamespaceuri.php) to find a namespace URI by a given prefix. Unfortunately, we cannot use it here since we'd have to call it on a node in the namespace we're looking for. The current implementation makes the following query to find a namespace: ```php $domxpath->query('(//namespace::*[name()="media"])[last()]') ``` Commits ------- 77e2fa5 [DomCrawler] Removed checks if CssSelector is present. 9110468 [DomCrawler] Enabled manual namespace registration. be1e4e6 [DomCrawler] Enabled default namespace prefix overloading. 943d446 [DomCrawler] Updated the CHANGELOG with namespace auto-registration details. c6fbb13 [DomCrawler] Added support for an automatic default namespace registration. 587e2dd [DomCrawler] Made that default namespace is no longer removed when loading documents with addXmlContent(). c905bba [DomCrawler] Added more tests for namespaced filtering. 6e717a3 [DomCrawler] Made sure only the default namespace is removed when loading an XML content. e5b8abb [DomCrawler] Added auto-discovery of namespaces in Crawler::filter() and Crawler::filterByXPath().
2 parents 6a28718 + 77e2fa5 commit 98c0d38

File tree

3 files changed

+190
-5
lines changed

3 files changed

+190
-5
lines changed

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
CHANGELOG
22
=========
33

4+
2.4.0
5+
-----
6+
7+
* added support for automatic discovery and explicit registration of document
8+
namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
9+
* improved content type guessing in `Crawler::addContent()`
10+
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document
11+
namespace
12+
413
2.3.0
514
-----
615

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ class Crawler extends \SplObjectStorage
2727
*/
2828
protected $uri;
2929

30+
/**
31+
* @var string The default namespace prefix to be used with XPath and CSS expressions
32+
*/
33+
private $defaultNamespacePrefix = 'default';
34+
35+
/**
36+
* @var array A map of manually registered namespaces
37+
*/
38+
private $namespaces = array();
39+
3040
/**
3141
* Constructor.
3242
*
@@ -92,7 +102,7 @@ public function add($node)
92102
public function addContent($content, $type = null)
93103
{
94104
if (empty($type)) {
95-
$type = 'text/html';
105+
$type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
96106
}
97107

98108
// DOM only for HTML/XML content
@@ -195,9 +205,7 @@ public function addXmlContent($content, $charset = 'UTF-8')
195205

196206
$dom = new \DOMDocument('1.0', $charset);
197207
$dom->validateOnParse = true;
198-
199-
// remove the default namespace to make XPath expressions simpler
200-
@$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
208+
@$dom->loadXML($content, LIBXML_NONET);
201209

202210
libxml_use_internal_errors($current);
203211
libxml_disable_entity_loader($disableEntities);
@@ -579,7 +587,8 @@ public function filterXPath($xpath)
579587
$root->appendChild($document->importNode($node, true));
580588
}
581589

582-
$domxpath = new \DOMXPath($document);
590+
$prefixes = $this->findNamespacePrefixes($xpath);
591+
$domxpath = $this->createDOMXPath($document, $prefixes);
583592

584593
return new static($domxpath->query($xpath), $this->uri);
585594
}
@@ -709,6 +718,25 @@ public function form(array $values = null, $method = null)
709718
return $form;
710719
}
711720

721+
/**
 * Sets the prefix that XPath and CSS expressions use to refer to the
 * document's default (unprefixed) namespace.
 *
 * @param string $prefix The prefix standing in for the default namespace
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
730+
731+
/**
 * Registers a namespace under the given prefix.
 *
 * A prefix registered here is used as-is when an expression is evaluated,
 * instead of being looked up in the document.
 *
 * @param string $prefix    The prefix to use in expressions
 * @param string $namespace The namespace the prefix maps to
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
739+
712740
/**
713741
* Converts string for XPath expressions.
714742
*
@@ -792,4 +820,62 @@ protected function sibling($node, $siblingDir = 'nextSibling')
792820

793821
return $nodes;
794822
}
823+
824+
/**
 * Creates a DOMXPath for the document with every given prefix registered.
 *
 * @param \DOMDocument $document The document to query
 * @param array        $prefixes The namespace prefixes to resolve and register
 *
 * @return \DOMXPath
 *
 * @throws \InvalidArgumentException When a prefix cannot be resolved to a namespace
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $name) {
        // the namespace is resolved first, then bound to its prefix
        $xpathEngine->registerNamespace($name, $this->discoverNamespace($xpathEngine, $name));
    }

    return $xpathEngine;
}
843+
844+
/**
 * Resolves a prefix to a namespace.
 *
 * Manually registered namespaces take precedence; otherwise the document
 * behind the given DOMXPath is searched for a matching namespace node.
 * The default namespace prefix is looked up as the unnamed namespace.
 *
 * @param \DOMXPath $domxpath The XPath instance used to search the document
 * @param string    $prefix   The prefix to resolve
 *
 * @return string The namespace bound to the prefix
 *
 * @throws \InvalidArgumentException When no namespace is found for the prefix
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    $lookupName = $prefix === $this->defaultNamespacePrefix ? '' : $prefix;

    // [last()] limits the result to a single namespace node; without it
    // the query would return one item for each node in the document
    $expression = sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName);
    $found = $domxpath->query($expression)->item(0);

    if (!$found) {
        throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix));
    }

    return $found->nodeValue;
}
867+
868+
/**
 * Extracts the namespace prefixes used by an XPath expression.
 *
 * String literals are blanked out before scanning, so that quoted text such
 * as "http://example.com/" or 'foo:bar' is not mistaken for a prefixed name.
 * Such a false positive would make namespace discovery fail for an
 * otherwise valid expression.
 *
 * @param string $xpath An XPath expression
 *
 * @return array The distinct prefixes found, in order of first appearance
 */
private function findNamespacePrefixes($xpath)
{
    // XPath 1.0 literals are delimited by a pair of identical quotes and
    // have no escape sequences, so a simple alternation covers them
    $withoutLiterals = preg_replace('/"[^"]*"|\'[^\']*\'/', '""', $xpath);

    if (null === $withoutLiterals) {
        // PCRE failure: fall back to scanning the raw expression
        $withoutLiterals = $xpath;
    }

    // "[^:]" after the colon skips axis specifiers such as "child::"
    if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^:]/', $withoutLiterals, $matches)) {
        return array_unique($matches['prefix']);
    }

    return array();
}
795881
}

src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
namespace Symfony\Component\DomCrawler\Tests;
1313

14+
use Symfony\Component\CssSelector\CssSelector;
1415
use Symfony\Component\DomCrawler\Crawler;
1516

1617
class CrawlerTest extends \PHPUnit_Framework_TestCase
@@ -370,6 +371,55 @@ public function testFilterXPath()
370371
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
371372
}
372373

374+
public function testFilterXPathWithDefaultNamespace()
{
    // no prefix is configured here, so the expression relies on the crawler's "default" prefix
    $document = $this->createTestXmlCrawler();
    $ids = $document->filterXPath('//default:entry/default:id');

    $this->assertCount(1, $ids, '->filterXPath() automatically registers a namespace');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}
380+
381+
public function testFilterXPathWithCustomDefaultNamespace()
{
    $document = $this->createTestXmlCrawler();
    $document->setDefaultNamespacePrefix('x');

    // "x" now stands in for the document's default namespace
    $ids = $document->filterXPath('//x:entry/x:id');

    $this->assertCount(1, $ids, '->filterXPath() lets to override the default namespace prefix');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}
390+
391+
public function testFilterXPathWithNamespace()
{
    $document = $this->createTestXmlCrawler();

    // the fixture contains two <yt:accessControl> elements
    $controls = $document->filterXPath('//yt:accessControl');

    $this->assertCount(2, $controls, '->filterXPath() automatically registers a namespace');
}
396+
397+
public function testFilterXPathWithMultipleNamespaces()
{
    $document = $this->createTestXmlCrawler();

    // the expression mixes two different prefixes
    $ratio = $document->filterXPath('//media:group/yt:aspectRatio');

    $this->assertCount(1, $ratio, '->filterXPath() automatically registers multiple namespaces');
    $this->assertSame('widescreen', $ratio->text());
}
403+
404+
/**
 * @expectedException \InvalidArgumentException
 * @expectedExceptionMessage Could not find a namespace for the prefix: "foo"
 */
public function testFilterXPathWithAnInvalidNamespace()
{
    $document = $this->createTestXmlCrawler();

    // the fixture does not declare a "foo" prefix
    $document->filterXPath('//media:group/foo:aspectRatio');
}
412+
413+
public function testFilterXPathWithManuallyRegisteredNamespace()
{
    $document = $this->createTestXmlCrawler();

    // "m" is not declared in the fixture; it is mapped by hand to the URI the fixture binds to "media"
    $document->registerNamespace('m', 'http://search.yahoo.com/mrss/');
    $ratio = $document->filterXPath('//m:group/yt:aspectRatio');

    $this->assertCount(1, $ratio, '->filterXPath() uses manually registered namespace');
    $this->assertSame('widescreen', $ratio->text());
}
422+
373423
/**
374424
* @covers Symfony\Component\DomCrawler\Crawler::filter
375425
*/
@@ -384,6 +434,30 @@ public function testFilter()
384434
$this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector');
385435
}
386436

437+
public function testFilterWithDefaultNamespace()
{
    $document = $this->createTestXmlCrawler();

    // CSS selectors use "prefix|element" for namespaced elements
    $ids = $document->filter('default|entry default|id');

    $this->assertCount(1, $ids, '->filter() automatically registers namespaces');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $ids->text());
}
443+
444+
public function testFilterWithNamespace()
{
    // NOTE(review): presumably needed so the mixed-case element name is matched verbatim — confirm
    CssSelector::disableHtmlExtension();

    $document = $this->createTestXmlCrawler();
    $controls = $document->filter('yt|accessControl');

    $this->assertCount(2, $controls, '->filter() automatically registers namespaces');
}
451+
452+
public function testFilterWithMultipleNamespaces()
{
    // NOTE(review): presumably needed so the mixed-case element name is matched verbatim — confirm
    CssSelector::disableHtmlExtension();

    $document = $this->createTestXmlCrawler();
    $ratio = $document->filter('media|group yt|aspectRatio');

    $this->assertCount(1, $ratio, '->filter() automatically registers namespaces');
    $this->assertSame('widescreen', $ratio->text());
}
460+
387461
public function testSelectLink()
388462
{
389463
$crawler = $this->createTestCrawler();
@@ -656,6 +730,22 @@ public function createTestCrawler($uri = null)
656730
return new Crawler($dom, $uri);
657731
}
658732

733+
/**
 * Builds a Crawler from a small XML fixture: an <entry> element that
 * declares a default namespace plus the "media" and "yt" prefixes.
 *
 * @param string|null $uri Passed as the second argument to the Crawler constructor
 *
 * @return Crawler
 */
protected function createTestXmlCrawler($uri = null)
734+
{
735+
$xml = '<?xml version="1.0" encoding="UTF-8"?>
736+
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xmlns:yt="http://gdata.youtube.com/schemas/2007">
737+
<id>tag:youtube.com,2008:video:kgZRZmEc9j4</id>
738+
<yt:accessControl action="comment" permission="allowed"/>
739+
<yt:accessControl action="videoRespond" permission="moderated"/>
740+
<media:group>
741+
<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>
742+
<yt:aspectRatio>widescreen</yt:aspectRatio>
743+
</media:group>
744+
</entry>';
745+
746+
return new Crawler($xml, $uri);
747+
}
748+
659749
protected function createDomDocument()
660750
{
661751
$dom = new \DOMDocument();

0 commit comments

Comments
 (0)