|
| 1 | +.. index:: |
| 2 | + single: DomCrawler |
| 3 | + |
| 4 | +The DomCrawler Component |
| 5 | +======================== |
| 6 | + |
| 7 | + The DomCrawler Component eases DOM navigation for HTML and XML documents. |
| 8 | + |
| 9 | +Installation |
| 10 | +------------ |
| 11 | + |
| 12 | +You can install the component in many different ways: |
| 13 | + |
| 14 | +* Use the official Git repository (https://github.com/symfony/DomCrawler); |
| 15 | +* Install it via PEAR (`pear.symfony.com/DomCrawler`); |
| 16 | +* Install it via Composer (`symfony/dom-crawler` on Packagist). |
| 17 | + |
| 18 | +Usage |
| 19 | +----- |
| 20 | + |
| 21 | +The :class:`Symfony\\Component\\DomCrawler\\Crawler` class provides methods |
| 22 | +to query and manipulate HTML and XML documents. |
| 23 | + |
| 24 | +An instance of the Crawler represents a set (:phpclass:`SplObjectStorage`) |
| 25 | +of :phpclass:`DOMElement` objects: |
| 26 | + |
| 27 | +.. code-block:: php |
| 28 | +
|
| 29 | + use Symfony\Component\DomCrawler\Crawler; |
| 30 | +
|
| 31 | + $html = <<<'HTML' |
| 32 | + <html> |
| 33 | + <body> |
| 34 | + <p class="message">Hello World!</p> |
| 35 | + <p>Hello Crawler!</p> |
| 36 | + </body> |
| 37 | + </html> |
| 38 | + HTML; |
| 39 | +
|
| 40 | + $crawler = new Crawler($html); |
| 41 | +
|
| 42 | + foreach ($crawler as $domElement) { |
| 43 | + print $domElement->nodeName; |
| 44 | + } |
| 45 | +
|
| 46 | +More specialized :class:`Symfony\\Component\\DomCrawler\\Link` and |
| 47 | +:class:`Symfony\\Component\\DomCrawler\\Form` classes are useful for |
| 48 | +interacting with HTML links and forms. |
| 49 | + |
| 50 | +Node Filtering |
| 51 | +~~~~~~~~~~~~~~ |
| 52 | + |
| 53 | +Using XPath expressions is really simple: |
| 54 | + |
| 55 | +.. code-block:: php |
| 56 | +
|
| 57 | + $crawler = $crawler->filterXPath('descendant-or-self::body/p'); |
| 58 | +
|
| 59 | +.. tip:: |
| 60 | + |
| 61 | + :phpmethod:`DOMXPath::query` is used internally to actually perform |
| 62 | + an XPath query. |
| 63 | + |
| 64 | +Filtering is even easier if you have the CssSelector Component installed: |
| 65 | + |
| 66 | +.. code-block:: php |
| 67 | +
|
| 68 | + $crawler = $crawler->filter('body > p'); |
| 69 | +
|
| 70 | +An anonymous function can be used to filter with more complex criteria: |
| 71 | + |
| 72 | +.. code-block:: php |
| 73 | +
|
| 74 | + $crawler = $crawler->filter('body > p')->reduce(function ($node, $i) { |
| 75 | + // keep only the nodes at even positions |
| 76 | + return ($i % 2) == 0; |
| 77 | + }); |
| 78 | +
|
| 79 | +To remove a node, the anonymous function must return false. |
| 80 | + |
| 81 | +.. note:: |
| 82 | + |
| 83 | + All filter methods return a new :class:`Symfony\\Component\\DomCrawler\\Crawler` |
| 84 | + instance with filtered content. |
| 85 | + |
| 86 | +Node Traversing |
| 87 | +~~~~~~~~~~~~~~~ |
| 88 | + |
| 89 | +Access a node by its position in the list: |
| 90 | + |
| 91 | +.. code-block:: php |
| 92 | +
|
| 93 | + $crawler->filter('body > p')->eq(0); |
| 94 | +
|
| 95 | +Get the first or last node of the current selection: |
| 96 | + |
| 97 | +.. code-block:: php |
| 98 | +
|
| 99 | + $crawler->filter('body > p')->first(); |
| 100 | + $crawler->filter('body > p')->last(); |
| 101 | +
|
| 102 | +Get the nodes of the same level as the current selection: |
| 103 | + |
| 104 | +.. code-block:: php |
| 105 | +
|
| 106 | + $crawler->filter('body > p')->siblings(); |
| 107 | +
|
| 108 | +Get the same level nodes after or before the current selection: |
| 109 | + |
| 110 | +.. code-block:: php |
| 111 | +
|
| 112 | + $crawler->filter('body > p')->nextAll(); |
| 113 | + $crawler->filter('body > p')->previousAll(); |
| 114 | +
|
| 115 | +Get all the child or parent nodes: |
| 116 | + |
| 117 | +.. code-block:: php |
| 118 | +
|
| 119 | + $crawler->filter('body')->children(); |
| 120 | + $crawler->filter('body > p')->parents(); |
| 121 | +
|
| 122 | +.. note:: |
| 123 | + |
| 124 | + All the traversal methods return a new :class:`Symfony\\Component\\DomCrawler\\Crawler` |
| 125 | + instance. |
| 126 | + |
| 127 | +Accessing Node Values |
| 128 | +~~~~~~~~~~~~~~~~~~~~~ |
| 129 | + |
| 130 | +Access the value of the first node of the current selection: |
| 131 | + |
| 132 | +.. code-block:: php |
| 133 | +
|
| 134 | + $message = $crawler->filterXPath('//body/p')->text(); |
| 135 | +
|
| 136 | +Access the attribute value of the first node of the current selection: |
| 137 | + |
| 138 | +.. code-block:: php |
| 139 | +
|
| 140 | + $class = $crawler->filterXPath('//body/p')->attr('class'); |
| 141 | +
|
| 142 | +Extract attribute and/or node values from the list of nodes: |
| 143 | + |
| 144 | +.. code-block:: php |
| 145 | +
|
| 146 | + $attributes = $crawler->filterXpath('//body/p')->extract(array('_text', 'class')); |
| 147 | +
|
| 148 | +.. note:: The special attribute ``_text`` represents a node value. |
| 149 | + |
| 150 | +Call an anonymous function on each node of the list: |
| 151 | + |
| 152 | +.. code-block:: php |
| 153 | +
|
| 154 | + $nodeValues = $crawler->filter('p')->each(function ($node, $i) { |
| 155 | + return $node->nodeValue; |
| 156 | + }); |
| 157 | +
|
| 158 | +The anonymous function receives the node and its position as arguments. |
| 159 | +The result is an array of the values returned by the anonymous function calls. |
| 160 | + |
| 161 | +Adding the Content |
| 162 | +~~~~~~~~~~~~~~~~~~ |
| 163 | + |
| 164 | +The Crawler supports multiple ways of adding content: |
| 165 | + |
| 166 | +.. code-block:: php |
| 167 | +
|
| 168 | + $crawler = new Crawler('<html><body /></html>'); |
| 169 | +
|
| 170 | + $crawler->addHtmlContent('<html><body /></html>'); |
| 171 | + $crawler->addXmlContent('<root><node /></root>'); |
| 172 | +
|
| 173 | + $crawler->addContent('<html><body /></html>'); |
| 174 | + $crawler->addContent('<root><node /></root>', 'text/xml'); |
| 175 | +
|
| 176 | + $crawler->add('<html><body /></html>'); |
| 177 | + $crawler->add('<root><node /></root>'); |
| 178 | +
|
| 179 | +As the Crawler's implementation is based on the DOM extension, it is also able |
| 180 | +to interact with native :phpclass:`DOMDocument`, :phpclass:`DOMNodeList` |
| 181 | +and :phpclass:`DOMNode` objects: |
| 182 | + |
| 183 | +.. code-block:: php |
| 184 | +
|
| 185 | + $document = new \DOMDocument(); |
| 186 | + $document->loadXml('<root><node /><node /></root>'); |
| 187 | + $nodeList = $document->getElementsByTagName('node'); |
| 188 | + $node = $document->getElementsByTagName('node')->item(0); |
| 189 | +
|
| 190 | + $crawler->addDocument($document); |
| 191 | + $crawler->addNodeList($nodeList); |
| 192 | + $crawler->addNodes(array($node)); |
| 193 | + $crawler->addNode($node); |
| 194 | + $crawler->add($document); |
| 195 | +
|
| 196 | +Form and Link support |
| 197 | +~~~~~~~~~~~~~~~~~~~~~ |
| 198 | + |
| 199 | +todo: |
| 200 | + |
| 201 | +* selectLink() |
| 202 | +* selectButton() |
| 203 | +* link() |
| 204 | +* links() |
| 205 | +* form() |
0 commit comments