1 <?php
2
3 4 5 6 7 8 9 10
11
12 namespace Patron;
13
/**
 * Splits an HTML string into a tree made of text fragments and markup nodes.
 *
 * Only markups whose name starts with the namespace given to {@link parse()} are turned into
 * nodes; everything else is kept as plain text. A markup node is an array with the keys `name`
 * (string), `args` (attribute name => decoded value) and, unless the markup is self-closing,
 * `children` (the nodes found between the opening and the closing markup).
 */
class HTMLParser
{
    /**
     * Key of the constructor option holding the error handler callable.
     */
    const T_ERROR_HANDLER = 'error-handler';

    /**
     * Character encoding used to decode entities found in attribute values.
     *
     * @var string
     */
    private $encoding;

    /**
     * Pieces produced by `preg_split()` that still have to be consumed by {@link buildTree()}.
     *
     * @var array
     */
    private $matches;

    /**
     * Whether comments or processing instructions were escaped during the current parse.
     *
     * @var bool
     */
    private $escaped;

    /**
     * Stack of the names of the markups that are currently opened.
     *
     * @var array
     */
    private $opened = array();

    /**
     * Whether {@link error()} was called during the last parse.
     *
     * Declared explicitly (the property used to be created dynamically by {@link error()}, which
     * is deprecated since PHP 8.2) and kept public, as a dynamic property would have been.
     *
     * @var bool
     */
    public $malformed = false;

    /**
     * Callable invoked with a message format and its arguments when a malformed markup is found.
     *
     * @var callable
     */
    protected $error_handler;

    /**
     * Namespace (markup name prefix) used by the current parse.
     *
     * @var string|null
     */
    protected $namespace;

    /**
     * @param array $tags Options. The only option read is {@link T_ERROR_HANDLER}: a callable
     * receiving the message format and an array of replacements. When it is missing (or null),
     * the error is raised with `trigger_error()` after formatting with `\ICanBoogie\format()`.
     */
    public function __construct(array $tags=array())
    {
        if (isset($tags[self::T_ERROR_HANDLER]))
        {
            $this->error_handler = $tags[self::T_ERROR_HANDLER];

            return;
        }

        $this->error_handler = function($str, $args) {

            trigger_error(\ICanBoogie\format($str, $args));

        };
    }

    /**
     * Parses an HTML string into a tree of text fragments and markup nodes.
     *
     * @param string $html The source to parse.
     * @param string|null $namespace Literal prefix of the markups to collect, e.g. `p:`.
     * @param string $encoding Encoding used to decode entities in attribute values.
     *
     * @return array The nodes found at the top level of the source.
     */
    public function parse($html, $namespace=null, $encoding='utf-8')
    {
        $this->encoding = $encoding;
        $this->namespace = $namespace;

        // Start from a clean state so that a previous (possibly malformed) parse cannot leak
        // opened markups or flags into this one.
        $this->escaped = false;
        $this->opened = array();
        $this->malformed = false;

        // Comments and processing instructions are neutralised first so that markups they
        // contain are not collected.
        $html = $this->escapeSpecials($html);

        // The namespace is a literal prefix: quote it so that characters such as `.` are not
        // interpreted by the regex engine.
        $pattern = '#<(/?)' . preg_quote((string) $namespace, '#') . '([^>]*)>#';

        // With PREG_SPLIT_DELIM_CAPTURE the result is a repeating sequence of three pieces:
        // text, closing flag ("/" or ""), markup content.
        $pieces = preg_split($pattern, $html, -1, PREG_SPLIT_DELIM_CAPTURE);

        // If the split fails, treat the whole source as a single text fragment.
        $this->matches = ($pieces === false) ? array($html) : $pieces;

        $tree = $this->buildTree();

        // A closing markup with no matching opening one makes buildTree() return early. The
        // error has been reported; keep parsing so the remaining content is not lost.
        while ($this->matches)
        {
            $tree = array_merge($tree, $this->buildTree());
        }

        if ($this->escaped)
        {
            $tree = $this->unescapeSpecials($tree);
        }

        return $tree;
    }

    /**
     * Escapes the angle brackets of comments and processing instructions.
     *
     * @param string $html
     *
     * @return string The source with `<` and `>` replaced inside the matched spans.
     */
    protected function escapeSpecials($html)
    {
        $patterns = array
        (
            '#<\!--.+-->#sU', // comments
            '#<\?.+\?>#sU'    // processing instructions
        );

        foreach ($patterns as $pattern)
        {
            $replaced = preg_replace_callback($pattern, array($this, 'escapeSpecials_callback'), $html);

            // preg_replace_callback() returns null on failure: keep the source as it was rather
            // than losing it.
            if ($replaced !== null)
            {
                $html = $replaced;
            }
        }

        return $html;
    }

    /**
     * Replaces `<` and `>` with the control characters `\x01` and `\x02` in a matched span and
     * records that an unescape pass is required.
     *
     * @param array $m Matches provided by `preg_replace_callback()`.
     *
     * @return string
     */
    protected function escapeSpecials_callback($m)
    {
        $this->escaped = true;

        return str_replace(array('<', '>'), array("\x01", "\x02"), $m[0]);
    }

    /**
     * Restores the angle brackets replaced by {@link escapeSpecials_callback()}.
     *
     * Arrays are walked recursively, keys are preserved.
     *
     * @param array|string $tree
     *
     * @return array|string
     */
    protected function unescapeSpecials($tree)
    {
        if (is_array($tree))
        {
            return array_map(array($this, 'unescapeSpecials'), $tree);
        }

        return str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
    }

    /**
     * Consumes the pending pieces and builds the nodes of one nesting level.
     *
     * The method calls itself for the content of each opened markup and returns when the
     * pieces are exhausted or when a closing markup is met.
     *
     * @return array
     */
    protected function buildTree()
    {
        $nodes = array();
        $closing = false;
        $i = 0;

        while (($value = array_shift($this->matches)) !== null)
        {
            switch ($i++ % 3)
            {
                // text between markups
                case 0:
                {
                    // Compare with the empty string: a truthiness test would drop the text "0".
                    if (trim($value) !== '')
                    {
                        $nodes[] = $value;
                    }
                }
                break;

                // closing flag: "/" for a closing markup, empty otherwise
                case 1:
                {
                    $closing = ($value === '/');
                }
                break;

                // markup content: name and attributes
                case 2:
                {
                    if (substr($value, -1) === '/')
                    {
                        // self-closing markup: no children
                        $nodes[] = $this->parseMarkup(substr($value, 0, -1));
                    }
                    else if ($closing)
                    {
                        $name = trim($value);
                        $open = array_pop($this->opened);

                        // Strict comparison: `$open` is null when nothing is opened, and names
                        // must not be compared as numbers.
                        if ($name !== $open)
                        {
                            $this->error($name, $open);
                        }

                        return $nodes;
                    }
                    else
                    {
                        $node = $this->parseMarkup($value);

                        $this->opened[] = $node['name'];

                        // The recursive call consumes the pieces up to the closing markup.
                        $node['children'] = $this->buildTree();

                        $nodes[] = $node;
                    }
                }
                break;
            }
        }

        return $nodes;
    }

    /**
     * Extracts the name and the attributes of a markup.
     *
     * @param string $markup The content of the markup, without the angle brackets, the
     * namespace and the trailing slash.
     *
     * @return array An array with the keys `name` (empty string when the markup has no name)
     * and `args` (attribute name => value with HTML entities decoded). Only attributes written
     * as `name="value"` are collected.
     */
    protected function parseMarkup($markup)
    {
        $markup = (string) $markup;

        // The name is the leading run of non-whitespace characters.
        $name = preg_match('#^[^\s]+#', $markup, $matches) ? $matches[0] : '';

        // Attribute names stop at whitespace or "=", so `foo = "bar"` yields the key `foo`;
        // values may be empty.
        preg_match_all('#\s+([^\s=]+)\s*=\s*"([^"]*)"#', $markup, $matches, PREG_SET_ORDER);

        $args = array();

        foreach ($matches as $m)
        {
            $args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);
        }

        return array('name' => $name, 'args' => $args);
    }

    /**
     * Flags the source as malformed and reports an unexpected closing markup to the error handler.
     *
     * @param string $markup Name of the closing markup that was found.
     * @param string|null $expected Name of the markup that was opened, or null when none was.
     */
    protected function error($markup, $expected)
    {
        $this->malformed = true;

        // Test against null/empty rather than truthiness so that a markup named "0" still
        // gets the "should be" message.
        $has_expected = ($expected !== null && $expected !== '');

        $format = $has_expected
            ? 'unexpected closing markup %markup, should be %expected'
            : 'unexpected closing markup %markup, when none was opened';

        call_user_func
        (
            $this->error_handler, $format, array
            (
                '%markup' => $this->namespace . $markup, '%expected' => $expected
            )
        );
    }

    /**
     * Collects, at any depth, the markup nodes with the specified name.
     *
     * @param array $nodes Nodes as returned by {@link parse()}.
     * @param string $markup Name of the markups to collect.
     *
     * @return array The matching nodes, parents before their descendants.
     */
    static public function collectMarkup($nodes, $markup)
    {
        $collected = array();
        $wanted = (string) $markup;

        foreach ($nodes as $node)
        {
            // text fragments are strings, only markup nodes are arrays
            if (!is_array($node))
            {
                continue;
            }

            if ((string) $node['name'] === $wanted)
            {
                $collected[] = $node;
            }

            if (isset($node['children']))
            {
                $collected = array_merge($collected, self::collectMarkup($node['children'], $markup));
            }
        }

        return $collected;
    }
}