Newer Older
372 lines | 8.853kb
add files
Yuki Kimoto authored on 2014-03-26
1
package Mojo::DOM::HTML;
2
use Mojo::Base -base;
3

            
4
use Mojo::Util qw(html_unescape xml_escape);
5
use Scalar::Util 'weaken';
6

            
7
has 'xml';
8
has tree => sub { ['root'] };
9

            
10
my $ATTR_RE = qr/
11
  ([^<>=\s]+)      # Key
12
  (?:
13
    \s*=\s*
14
    (?:
15
      "([^"]*?)"   # Quotation marks
16
    |
17
      '([^']*?)'   # Apostrophes
18
    |
19
      ([^>\s]*)    # Unquoted
20
    )
21
  )?
22
  \s*
23
/x;
24
my $END_RE   = qr!^\s*/\s*(.+)\s*!;
25
my $TOKEN_RE = qr/
26
  ([^<]*)                                           # Text
27
  (?:
28
    <\?(.*?)\?>                                     # Processing Instruction
29
  |
30
    <!--(.*?)--\s*>                                 # Comment
31
  |
32
    <!\[CDATA\[(.*?)\]\]>                           # CDATA
33
  |
34
    <!DOCTYPE(
35
      \s+\w+
36
      (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)?   # External ID
37
      (?:\s+\[.+?\])?                               # Int Subset
38
      \s*
39
    )>
40
  |
41
    <(
42
      \s*
43
      [^<>\s]+                                      # Tag
44
      \s*
45
      (?:$ATTR_RE)*                                 # Attributes
46
    )>
47
  |
48
    (<)                                             # Runaway "<"
49
  )??
50
/xis;
51

            
52
# HTML elements that break paragraphs
53
my %PARAGRAPH = map { $_ => 1 } (
54
  qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
55
  qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
56
);
57

            
58
# HTML table elements with optional end tags
59
my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
60

            
61
# HTML elements without end tags
62
my %VOID = map { $_ => 1 } (
63
  qw(area base br col embed hr img input keygen link menuitem meta param),
64
  qw(source track wbr)
65
);
66

            
67
# HTML elements categorized as phrasing content (and obsolete inline elements)
68
my @PHRASING = (
69
  qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
70
  qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
71
  qw(math meta meter noscript object output progress q ruby s samp script),
72
  qw(select small span strong sub sup svg template textarea time u var video),
73
  qw(wbr)
74
);
75
my @OBSOLETE = qw(acronym applet basefont big font strike tt);
76
my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
77

            
78
sub parse {
79
  my ($self, $html) = @_;
80

            
81
  my $current = my $tree = ['root'];
82
  while ($html =~ m/\G$TOKEN_RE/gcs) {
83
    my ($text, $pi, $comment, $cdata, $doctype, $tag, $runaway)
84
      = ($1, $2, $3, $4, $5, $6, $11);
85

            
86
    # Text (and runaway "<")
87
    $text .= '<' if defined $runaway;
88
    push @$current, ['text', html_unescape $text] if length $text;
89

            
90
    # DOCTYPE
91
    if ($doctype) { push @$current, ['doctype', $doctype] }
92

            
93
    # Comment
94
    elsif ($comment) { push @$current, ['comment', $comment] }
95

            
96
    # CDATA
97
    elsif ($cdata) { push @$current, ['cdata', $cdata] }
98

            
99
    # Processing instruction (try to detect XML)
100
    elsif ($pi) {
101
      $self->xml(1) if !defined $self->xml && $pi =~ /xml/i;
102
      push @$current, ['pi', $pi];
103
    }
104

            
105
    # End
106
    next unless $tag;
107
    my $cs = $self->xml;
108
    if ($tag =~ $END_RE) { $self->_end($cs ? $1 : lc($1), \$current) }
109

            
110
    # Start
111
    elsif ($tag =~ m!([^\s/]+)([\s\S]*)!) {
112
      my ($start, $attr) = ($cs ? $1 : lc($1), $2);
113

            
114
      # Attributes
115
      my %attrs;
116
      while ($attr =~ /$ATTR_RE/g) {
117
        my $key = $cs ? $1 : lc($1);
118
        my $value = $2 // $3 // $4;
119

            
120
        # Empty tag
121
        next if $key eq '/';
122

            
123
        $attrs{$key} = defined $value ? html_unescape($value) : $value;
124
      }
125

            
126
      # Tag
127
      $self->_start($start, \%attrs, \$current);
128

            
129
      # Element without end tag
130
      $self->_end($start, \$current)
131
        if (!$self->xml && $VOID{$start}) || $attr =~ m!/\s*$!;
132

            
133
      # Relaxed "script" or "style"
134
      if ($start eq 'script' || $start eq 'style') {
135
        if ($html =~ m!\G(.*?)<\s*/\s*$start\s*>!gcsi) {
136
          push @$current, ['raw', $1];
137
          $self->_end($start, \$current);
138
        }
139
      }
140
    }
141
  }
142

            
143
  return $self->tree($tree);
144
}
145

            
146
sub render { $_[0]->_render($_[0]->tree) }
147

            
148
sub _close {
149
  my ($self, $current, $allowed, $scope) = @_;
150

            
151
  # Close allowed parent elements in scope
152
  my $parent = $$current;
153
  while ($parent->[0] ne 'root' && $parent->[1] ne $scope) {
154
    $self->_end($parent->[1], $current) if $allowed->{$parent->[1]};
155
    $parent = $parent->[3];
156
  }
157
}
158

            
159
sub _end {
160
  my ($self, $end, $current) = @_;
161

            
162
  # Search stack for start tag
163
  my $found = 0;
164
  my $next  = $$current;
165
  while ($next->[0] ne 'root') {
166

            
167
    # Right tag
168
    ++$found and last if $next->[1] eq $end;
169

            
170
    # Phrasing content can only cross phrasing content
171
    return if !$self->xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
172

            
173
    $next = $next->[3];
174
  }
175

            
176
  # Ignore useless end tag
177
  return unless $found;
178

            
179
  # Walk backwards
180
  $next = $$current;
181
  while (($$current = $next) && $$current->[0] ne 'root') {
182
    $next = $$current->[3];
183

            
184
    # Match
185
    if ($end eq $$current->[1]) { return $$current = $$current->[3] }
186

            
187
    # Table
188
    elsif ($end eq 'table') { $self->_close($current, \%TABLE, $end) }
189

            
190
    # Missing end tag
191
    $self->_end($$current->[1], $current);
192
  }
193
}
194

            
195
sub _render {
196
  my ($self, $tree) = @_;
197

            
198
  # Text (escaped)
199
  my $e = $tree->[0];
200
  return xml_escape $tree->[1] if $e eq 'text';
201

            
202
  # Raw text
203
  return $tree->[1] if $e eq 'raw';
204

            
205
  # DOCTYPE
206
  return '<!DOCTYPE' . $tree->[1] . '>' if $e eq 'doctype';
207

            
208
  # Comment
209
  return '<!--' . $tree->[1] . '-->' if $e eq 'comment';
210

            
211
  # CDATA
212
  return '<![CDATA[' . $tree->[1] . ']]>' if $e eq 'cdata';
213

            
214
  # Processing instruction
215
  return '<?' . $tree->[1] . '?>' if $e eq 'pi';
216

            
217
  # Start tag
218
  my $start   = 1;
219
  my $content = '';
220
  if ($e eq 'tag') {
221
    $start = 4;
222

            
223
    # Open tag
224
    my $tag = $tree->[1];
225
    $content .= "<$tag";
226

            
227
    # Attributes
228
    my @attrs;
229
    for my $key (sort keys %{$tree->[2]}) {
230
      my $value = $tree->[2]{$key};
231

            
232
      # No value
233
      push @attrs, $key and next unless defined $value;
234

            
235
      # Key and value
236
      push @attrs, qq{$key="} . xml_escape($value) . '"';
237
    }
238
    my $attrs = join ' ', @attrs;
239
    $content .= " $attrs" if $attrs;
240

            
241
    # Element without end tag
242
    return $self->xml || $VOID{$tag} ? "$content />" : "$content></$tag>"
243
      unless $tree->[4];
244

            
245
    # Close tag
246
    $content .= '>';
247
  }
248

            
249
  # Render whole tree
250
  $content .= $self->_render($tree->[$_]) for $start .. $#$tree;
251

            
252
  # End tag
253
  $content .= '</' . $tree->[1] . '>' if $e eq 'tag';
254

            
255
  return $content;
256
}
257

            
258
sub _start {
259
  my ($self, $start, $attrs, $current) = @_;
260

            
261
  # Autoclose optional HTML elements
262
  if (!$self->xml && $$current->[0] ne 'root') {
263

            
264
    # "li"
265
    if ($start eq 'li') { $self->_close($current, {li => 1}, 'ul') }
266

            
267
    # "p"
268
    elsif ($PARAGRAPH{$start}) { $self->_end('p', $current) }
269

            
270
    # "head"
271
    elsif ($start eq 'body') { $self->_end('head', $current) }
272

            
273
    # "optgroup"
274
    elsif ($start eq 'optgroup') { $self->_end('optgroup', $current) }
275

            
276
    # "option"
277
    elsif ($start eq 'option') { $self->_end('option', $current) }
278

            
279
    # "colgroup", "thead", "tbody" and "tfoot"
280
    elsif (grep { $_ eq $start } qw(colgroup thead tbody tfoot)) {
281
      $self->_close($current, \%TABLE, 'table');
282
    }
283

            
284
    # "tr"
285
    elsif ($start eq 'tr') { $self->_close($current, {tr => 1}, 'table') }
286

            
287
    # "th" and "td"
288
    elsif ($start eq 'th' || $start eq 'td') {
289
      $self->_close($current, {$_ => 1}, 'table') for qw(th td);
290
    }
291

            
292
    # "dt" and "dd"
293
    elsif ($start eq 'dt' || $start eq 'dd') {
294
      $self->_end($_, $current) for qw(dt dd);
295
    }
296

            
297
    # "rt" and "rp"
298
    elsif ($start eq 'rt' || $start eq 'rp') {
299
      $self->_end($_, $current) for qw(rt rp);
300
    }
301
  }
302

            
303
  # New tag
304
  my $new = ['tag', $start, $attrs, $$current];
305
  weaken $new->[3];
306
  push @$$current, $new;
307
  $$current = $new;
308
}
309

            
310
1;
311

            
312
=encoding utf8
313

            
314
=head1 NAME
315

            
316
Mojo::DOM::HTML - HTML/XML engine
317

            
318
=head1 SYNOPSIS
319

            
320
  use Mojo::DOM::HTML;
321

            
322
  # Turn HTML into DOM tree
323
  my $html = Mojo::DOM::HTML->new;
324
  $html->parse('<div><p id="a">A</p><p id="b">B</p></div>');
325
  my $tree = $html->tree;
326

            
327
=head1 DESCRIPTION
328

            
329
L<Mojo::DOM::HTML> is the HTML/XML engine used by L<Mojo::DOM>.
330

            
331
=head1 ATTRIBUTES
332

            
333
L<Mojo::DOM::HTML> implements the following attributes.
334

            
335
=head2 tree
336

            
337
  my $tree = $html->tree;
338
  $html    = $html->tree(['root', ['text', 'foo']]);
339

            
340
Document Object Model. Note that this structure should only be used very
341
carefully since it is very dynamic.
342

            
343
=head2 xml
344

            
345
  my $bool = $html->xml;
346
  $html    = $html->xml($bool);
347

            
348
Disable HTML semantics in parser and activate case sensitivity, defaults to
349
auto detection based on processing instructions.
350

            
351
=head1 METHODS
352

            
353
L<Mojo::DOM::HTML> inherits all methods from L<Mojo::Base> and implements the
354
following new ones.
355

            
356
=head2 parse
357

            
358
  $html = $html->parse('<foo bar="baz">test</foo>');
359

            
360
Parse HTML/XML fragment.
361

            
362
=head2 render
363

            
364
  my $xml = $html->render;
365

            
366
Render DOM to XML.
367

            
368
=head1 SEE ALSO
369

            
370
L<Mojolicious>, L<Mojolicious::Guides>, L<http://mojolicio.us>.
371

            
372
=cut