add files
|
1 |
package Mojo::DOM::HTML; |
2 |
use Mojo::Base -base; |
|
3 | ||
4 |
use Mojo::Util qw(html_unescape xml_escape); |
|
5 |
use Scalar::Util 'weaken'; |
|
6 | ||
7 |
has 'xml'; |
|
8 |
has tree => sub { ['root'] }; |
|
9 | ||
10 |
my $ATTR_RE = qr/ |
|
11 |
([^<>=\s]+) # Key |
|
12 |
(?: |
|
13 |
\s*=\s* |
|
14 |
(?: |
|
15 |
"([^"]*?)" # Quotation marks |
|
16 |
| |
|
17 |
'([^']*?)' # Apostrophes |
|
18 |
| |
|
19 |
([^>\s]*) # Unquoted |
|
20 |
) |
|
21 |
)? |
|
22 |
\s* |
|
23 |
/x; |
|
24 |
my $END_RE = qr!^\s*/\s*(.+)\s*!; |
|
25 |
my $TOKEN_RE = qr/ |
|
26 |
([^<]*) # Text |
|
27 |
(?: |
|
28 |
<\?(.*?)\?> # Processing Instruction |
|
29 |
| |
|
30 |
<!--(.*?)--\s*> # Comment |
|
31 |
| |
|
32 |
<!\[CDATA\[(.*?)\]\]> # CDATA |
|
33 |
| |
|
34 |
<!DOCTYPE( |
|
35 |
\s+\w+ |
|
36 |
(?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID |
|
37 |
(?:\s+\[.+?\])? # Int Subset |
|
38 |
\s* |
|
39 |
)> |
|
40 |
| |
|
41 |
<( |
|
42 |
\s* |
|
43 |
[^<>\s]+ # Tag |
|
44 |
\s* |
|
45 |
(?:$ATTR_RE)* # Attributes |
|
46 |
)> |
|
47 |
| |
|
48 |
(<) # Runaway "<" |
|
49 |
)?? |
|
50 |
/xis; |
|
51 | ||
52 |
# HTML elements that break paragraphs |
|
53 |
my %PARAGRAPH = map { $_ => 1 } ( |
|
54 |
qw(address article aside blockquote dir div dl fieldset footer form h1 h2), |
|
55 |
qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul) |
|
56 |
); |
|
57 | ||
58 |
# HTML table elements with optional end tags |
|
59 |
my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); |
|
60 | ||
61 |
# HTML elements without end tags |
|
62 |
my %VOID = map { $_ => 1 } ( |
|
63 |
qw(area base br col embed hr img input keygen link menuitem meta param), |
|
64 |
qw(source track wbr) |
|
65 |
); |
|
66 | ||
67 |
# HTML elements categorized as phrasing content (and obsolete inline elements) |
|
68 |
my @PHRASING = ( |
|
69 |
qw(a abbr area audio b bdi bdo br button canvas cite code data datalist), |
|
70 |
qw(del dfn em embed i iframe img input ins kbd keygen label link map mark), |
|
71 |
qw(math meta meter noscript object output progress q ruby s samp script), |
|
72 |
qw(select small span strong sub sup svg template textarea time u var video), |
|
73 |
qw(wbr) |
|
74 |
); |
|
75 |
my @OBSOLETE = qw(acronym applet basefont big font strike tt); |
|
76 |
my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; |
|
77 | ||
78 |
sub parse { |
|
79 |
my ($self, $html) = @_; |
|
80 | ||
81 |
my $current = my $tree = ['root']; |
|
82 |
while ($html =~ m/\G$TOKEN_RE/gcs) { |
|
83 |
my ($text, $pi, $comment, $cdata, $doctype, $tag, $runaway) |
|
84 |
= ($1, $2, $3, $4, $5, $6, $11); |
|
85 | ||
86 |
# Text (and runaway "<") |
|
87 |
$text .= '<' if defined $runaway; |
|
88 |
push @$current, ['text', html_unescape $text] if length $text; |
|
89 | ||
90 |
# DOCTYPE |
|
91 |
if ($doctype) { push @$current, ['doctype', $doctype] } |
|
92 | ||
93 |
# Comment |
|
94 |
elsif ($comment) { push @$current, ['comment', $comment] } |
|
95 | ||
96 |
# CDATA |
|
97 |
elsif ($cdata) { push @$current, ['cdata', $cdata] } |
|
98 | ||
99 |
# Processing instruction (try to detect XML) |
|
100 |
elsif ($pi) { |
|
101 |
$self->xml(1) if !defined $self->xml && $pi =~ /xml/i; |
|
102 |
push @$current, ['pi', $pi]; |
|
103 |
} |
|
104 | ||
105 |
# End |
|
106 |
next unless $tag; |
|
107 |
my $cs = $self->xml; |
|
108 |
if ($tag =~ $END_RE) { $self->_end($cs ? $1 : lc($1), \$current) } |
|
109 | ||
110 |
# Start |
|
111 |
elsif ($tag =~ m!([^\s/]+)([\s\S]*)!) { |
|
112 |
my ($start, $attr) = ($cs ? $1 : lc($1), $2); |
|
113 | ||
114 |
# Attributes |
|
115 |
my %attrs; |
|
116 |
while ($attr =~ /$ATTR_RE/g) { |
|
117 |
my $key = $cs ? $1 : lc($1); |
|
118 |
my $value = $2 // $3 // $4; |
|
119 | ||
120 |
# Empty tag |
|
121 |
next if $key eq '/'; |
|
122 | ||
123 |
$attrs{$key} = defined $value ? html_unescape($value) : $value; |
|
124 |
} |
|
125 | ||
126 |
# Tag |
|
127 |
$self->_start($start, \%attrs, \$current); |
|
128 | ||
129 |
# Element without end tag |
|
130 |
$self->_end($start, \$current) |
|
131 |
if (!$self->xml && $VOID{$start}) || $attr =~ m!/\s*$!; |
|
132 | ||
133 |
# Relaxed "script" or "style" |
|
134 |
if ($start eq 'script' || $start eq 'style') { |
|
135 |
if ($html =~ m!\G(.*?)<\s*/\s*$start\s*>!gcsi) { |
|
136 |
push @$current, ['raw', $1]; |
|
137 |
$self->_end($start, \$current); |
|
138 |
} |
|
139 |
} |
|
140 |
} |
|
141 |
} |
|
142 | ||
143 |
return $self->tree($tree); |
|
144 |
} |
|
145 | ||
146 |
sub render { $_[0]->_render($_[0]->tree) } |
|
147 | ||
148 |
sub _close { |
|
149 |
my ($self, $current, $allowed, $scope) = @_; |
|
150 | ||
151 |
# Close allowed parent elements in scope |
|
152 |
my $parent = $$current; |
|
153 |
while ($parent->[0] ne 'root' && $parent->[1] ne $scope) { |
|
154 |
$self->_end($parent->[1], $current) if $allowed->{$parent->[1]}; |
|
155 |
$parent = $parent->[3]; |
|
156 |
} |
|
157 |
} |
|
158 | ||
159 |
sub _end { |
|
160 |
my ($self, $end, $current) = @_; |
|
161 | ||
162 |
# Search stack for start tag |
|
163 |
my $found = 0; |
|
164 |
my $next = $$current; |
|
165 |
while ($next->[0] ne 'root') { |
|
166 | ||
167 |
# Right tag |
|
168 |
++$found and last if $next->[1] eq $end; |
|
169 | ||
170 |
# Phrasing content can only cross phrasing content |
|
171 |
return if !$self->xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; |
|
172 | ||
173 |
$next = $next->[3]; |
|
174 |
} |
|
175 | ||
176 |
# Ignore useless end tag |
|
177 |
return unless $found; |
|
178 | ||
179 |
# Walk backwards |
|
180 |
$next = $$current; |
|
181 |
while (($$current = $next) && $$current->[0] ne 'root') { |
|
182 |
$next = $$current->[3]; |
|
183 | ||
184 |
# Match |
|
185 |
if ($end eq $$current->[1]) { return $$current = $$current->[3] } |
|
186 | ||
187 |
# Table |
|
188 |
elsif ($end eq 'table') { $self->_close($current, \%TABLE, $end) } |
|
189 | ||
190 |
# Missing end tag |
|
191 |
$self->_end($$current->[1], $current); |
|
192 |
} |
|
193 |
} |
|
194 | ||
195 |
sub _render { |
|
196 |
my ($self, $tree) = @_; |
|
197 | ||
198 |
# Text (escaped) |
|
199 |
my $e = $tree->[0]; |
|
200 |
return xml_escape $tree->[1] if $e eq 'text'; |
|
201 | ||
202 |
# Raw text |
|
203 |
return $tree->[1] if $e eq 'raw'; |
|
204 | ||
205 |
# DOCTYPE |
|
206 |
return '<!DOCTYPE' . $tree->[1] . '>' if $e eq 'doctype'; |
|
207 | ||
208 |
# Comment |
|
209 |
return '<!--' . $tree->[1] . '-->' if $e eq 'comment'; |
|
210 | ||
211 |
# CDATA |
|
212 |
return '<![CDATA[' . $tree->[1] . ']]>' if $e eq 'cdata'; |
|
213 | ||
214 |
# Processing instruction |
|
215 |
return '<?' . $tree->[1] . '?>' if $e eq 'pi'; |
|
216 | ||
217 |
# Start tag |
|
218 |
my $start = 1; |
|
219 |
my $content = ''; |
|
220 |
if ($e eq 'tag') { |
|
221 |
$start = 4; |
|
222 | ||
223 |
# Open tag |
|
224 |
my $tag = $tree->[1]; |
|
225 |
$content .= "<$tag"; |
|
226 | ||
227 |
# Attributes |
|
228 |
my @attrs; |
|
229 |
for my $key (sort keys %{$tree->[2]}) { |
|
230 |
my $value = $tree->[2]{$key}; |
|
231 | ||
232 |
# No value |
|
233 |
push @attrs, $key and next unless defined $value; |
|
234 | ||
235 |
# Key and value |
|
236 |
push @attrs, qq{$key="} . xml_escape($value) . '"'; |
|
237 |
} |
|
238 |
my $attrs = join ' ', @attrs; |
|
239 |
$content .= " $attrs" if $attrs; |
|
240 | ||
241 |
# Element without end tag |
|
242 |
return $self->xml || $VOID{$tag} ? "$content />" : "$content></$tag>" |
|
243 |
unless $tree->[4]; |
|
244 | ||
245 |
# Close tag |
|
246 |
$content .= '>'; |
|
247 |
} |
|
248 | ||
249 |
# Render whole tree |
|
250 |
$content .= $self->_render($tree->[$_]) for $start .. $#$tree; |
|
251 | ||
252 |
# End tag |
|
253 |
$content .= '</' . $tree->[1] . '>' if $e eq 'tag'; |
|
254 | ||
255 |
return $content; |
|
256 |
} |
|
257 | ||
258 |
sub _start { |
|
259 |
my ($self, $start, $attrs, $current) = @_; |
|
260 | ||
261 |
# Autoclose optional HTML elements |
|
262 |
if (!$self->xml && $$current->[0] ne 'root') { |
|
263 | ||
264 |
# "li" |
|
265 |
if ($start eq 'li') { $self->_close($current, {li => 1}, 'ul') } |
|
266 | ||
267 |
# "p" |
|
268 |
elsif ($PARAGRAPH{$start}) { $self->_end('p', $current) } |
|
269 | ||
270 |
# "head" |
|
271 |
elsif ($start eq 'body') { $self->_end('head', $current) } |
|
272 | ||
273 |
# "optgroup" |
|
274 |
elsif ($start eq 'optgroup') { $self->_end('optgroup', $current) } |
|
275 | ||
276 |
# "option" |
|
277 |
elsif ($start eq 'option') { $self->_end('option', $current) } |
|
278 | ||
279 |
# "colgroup", "thead", "tbody" and "tfoot" |
|
280 |
elsif (grep { $_ eq $start } qw(colgroup thead tbody tfoot)) { |
|
281 |
$self->_close($current, \%TABLE, 'table'); |
|
282 |
} |
|
283 | ||
284 |
# "tr" |
|
285 |
elsif ($start eq 'tr') { $self->_close($current, {tr => 1}, 'table') } |
|
286 | ||
287 |
# "th" and "td" |
|
288 |
elsif ($start eq 'th' || $start eq 'td') { |
|
289 |
$self->_close($current, {$_ => 1}, 'table') for qw(th td); |
|
290 |
} |
|
291 | ||
292 |
# "dt" and "dd" |
|
293 |
elsif ($start eq 'dt' || $start eq 'dd') { |
|
294 |
$self->_end($_, $current) for qw(dt dd); |
|
295 |
} |
|
296 | ||
297 |
# "rt" and "rp" |
|
298 |
elsif ($start eq 'rt' || $start eq 'rp') { |
|
299 |
$self->_end($_, $current) for qw(rt rp); |
|
300 |
} |
|
301 |
} |
|
302 | ||
303 |
# New tag |
|
304 |
my $new = ['tag', $start, $attrs, $$current]; |
|
305 |
weaken $new->[3]; |
|
306 |
push @$$current, $new; |
|
307 |
$$current = $new; |
|
308 |
} |
|
309 | ||
310 |
1; |
|
311 | ||
312 |
=encoding utf8 |
|
313 | ||
314 |
=head1 NAME |
|
315 | ||
316 |
Mojo::DOM::HTML - HTML/XML engine |
|
317 | ||
318 |
=head1 SYNOPSIS |
|
319 | ||
320 |
use Mojo::DOM::HTML; |
|
321 | ||
322 |
# Turn HTML into DOM tree |
|
323 |
my $html = Mojo::DOM::HTML->new; |
|
324 |
$html->parse('<div><p id="a">A</p><p id="b">B</p></div>'); |
|
325 |
my $tree = $html->tree; |
|
326 | ||
327 |
=head1 DESCRIPTION |
|
328 | ||
329 |
L<Mojo::DOM::HTML> is the HTML/XML engine used by L<Mojo::DOM>. |
|
330 | ||
331 |
=head1 ATTRIBUTES |
|
332 | ||
333 |
L<Mojo::DOM::HTML> implements the following attributes. |
|
334 | ||
335 |
=head2 tree |
|
336 | ||
337 |
my $tree = $html->tree; |
|
338 |
$html = $html->tree(['root', ['text', 'foo']]); |
|
339 | ||
340 |
Document Object Model. Note that this structure should only be used very |
|
341 |
carefully since it is very dynamic. |
|
342 | ||
343 |
=head2 xml |
|
344 | ||
345 |
my $bool = $html->xml; |
|
346 |
$html = $html->xml($bool); |
|
347 | ||
348 |
Disable HTML semantics in parser and activate case sensitivity, defaults to |
|
349 |
auto detection based on processing instructions. |
|
350 | ||
351 |
=head1 METHODS |
|
352 | ||
353 |
L<Mojo::DOM::HTML> inherits all methods from L<Mojo::Base> and implements the |
|
354 |
following new ones. |
|
355 | ||
356 |
=head2 parse |
|
357 | ||
358 |
$html = $html->parse('<foo bar="baz">test</foo>'); |
|
359 | ||
360 |
Parse HTML/XML fragment. |
|
361 | ||
362 |
=head2 render |
|
363 | ||
364 |
my $xml = $html->render; |
|
365 | ||
366 |
Render DOM to XML. |
|
367 | ||
368 |
=head1 SEE ALSO |
|
369 | ||
370 |
L<Mojolicious>, L<Mojolicious::Guides>, L<http://mojolicio.us>. |
|
371 | ||
372 |
=cut |