Newer Older
630 lines | 14.413kb
add files
Yuki Kimoto authored on 2014-03-26
1
package Mojo::DOM::CSS;
2
use Mojo::Base -base;
3

            
4
has 'tree';
5

            
6
my $ESCAPE_RE = qr/\\[^0-9a-fA-F]|\\[0-9a-fA-F]{1,6}/;
7
my $ATTR_RE   = qr/
8
  \[
9
  ((?:$ESCAPE_RE|[\w\-])+)           # Key
10
  (?:
11
    (\W)?                            # Operator
12
    =
13
    (?:"((?:\\"|[^"])*)"|([^\]]+))   # Value
14
  )?
15
  \]
16
/x;
17
my $CLASS_ID_RE = qr/
18
  (?:
19
    (?:\.((?:\\\.|[^\#.])+))   # Class
20
  |
21
    (?:\#((?:\\\#|[^.\#])+))   # ID
22
  )
23
/x;
24
my $PSEUDO_CLASS_RE = qr/(?::([\w\-]+)(?:\(((?:\([^)]+\)|[^)])+)\))?)/;
25
my $TOKEN_RE        = qr/
26
  (\s*,\s*)?                         # Separator
27
  ((?:[^[\\:\s,]|$ESCAPE_RE\s?)+)?   # Element
28
  ($PSEUDO_CLASS_RE*)?               # Pseudoclass
29
  ((?:$ATTR_RE)*)?                   # Attributes
30
  (?:
31
    \s*
32
    ([>+~])                          # Combinator
33
  )?
34
/x;
35

            
36
sub match {
37
  my $self = shift;
38
  my $tree = $self->tree;
39
  return undef if $tree->[0] eq 'root';
40
  return $self->_match($self->_compile(shift), $tree, $tree);
41
}
42

            
43
sub select {
44
  my $self = shift;
45

            
46
  my @results;
47
  my $pattern = $self->_compile(shift);
48
  my $tree    = $self->tree;
49
  my @queue   = ($tree);
50
  while (my $current = shift @queue) {
51
    my $type = $current->[0];
52

            
53
    # Tag
54
    if ($type eq 'tag') {
55
      unshift @queue, @$current[4 .. $#$current];
56
      push @results, $current if $self->_match($pattern, $current, $tree);
57
    }
58

            
59
    # Root
60
    elsif ($type eq 'root') { unshift @queue, @$current[1 .. $#$current] }
61
  }
62

            
63
  return \@results;
64
}
65

            
66
sub _ancestor {
67
  my ($self, $selectors, $current, $tree) = @_;
68
  while ($current = $current->[3]) {
69
    return undef if $current->[0] eq 'root' || $current eq $tree;
70
    return 1 if $self->_combinator($selectors, $current, $tree);
71
  }
72
  return undef;
73
}
74

            
75
sub _attr {
76
  my ($self, $key, $regex, $current) = @_;
77

            
78
  # Ignore namespace prefix
79
  my $attrs = $current->[2];
80
  for my $name (keys %$attrs) {
81
    next unless $name =~ /(?:^|:)$key$/;
82
    return 1 unless defined $attrs->{$name} && defined $regex;
83
    return 1 if $attrs->{$name} =~ $regex;
84
  }
85

            
86
  return undef;
87
}
88

            
89
sub _combinator {
90
  my ($self, $selectors, $current, $tree) = @_;
91

            
92
  # Selector
93
  my @s = @$selectors;
94
  return undef unless my $combinator = shift @s;
95
  if ($combinator->[0] ne 'combinator') {
96
    return undef unless $self->_selector($combinator, $current);
97
    return 1 unless $combinator = shift @s;
98
  }
99

            
100
  # " " (ancestor)
101
  my $c = $combinator->[1];
102
  if ($c eq ' ') { return undef unless $self->_ancestor(\@s, $current, $tree) }
103

            
104
  # ">" (parent only)
105
  elsif ($c eq '>') {
106
    return undef unless $self->_parent(\@s, $current, $tree);
107
  }
108

            
109
  # "~" (preceding siblings)
110
  elsif ($c eq '~') {
111
    return undef unless $self->_sibling(\@s, $current, $tree, 0);
112
  }
113

            
114
  # "+" (immediately preceding siblings)
115
  elsif ($c eq '+') {
116
    return undef unless $self->_sibling(\@s, $current, $tree, 1);
117
  }
118

            
119
  return 1;
120
}
121

            
122
sub _compile {
123
  my ($self, $css) = @_;
124

            
125
  my $pattern = [[]];
126
  while ($css =~ /$TOKEN_RE/g) {
127
    my ($separator, $element, $pc, $attrs, $combinator)
128
      = ($1, $2 // '', $3, $6, $11);
129

            
130
    next unless $separator || $element || $pc || $attrs || $combinator;
131

            
132
    # New selector
133
    push @$pattern, [] if $separator;
134
    my $part = $pattern->[-1];
135

            
136
    # Empty combinator
137
    push @$part, [combinator => ' ']
138
      if $part->[-1] && $part->[-1][0] ne 'combinator';
139

            
140
    # Selector
141
    push @$part, ['element'];
142
    my $selector = $part->[-1];
143

            
144
    # Element
145
    my $tag = '*';
146
    $element =~ s/^((?:\\\.|\\\#|[^.#])+)// and $tag = $self->_unescape($1);
147

            
148
    # Tag
149
    push @$selector, ['tag', $tag];
150

            
151
    # Class or ID
152
    while ($element =~ /$CLASS_ID_RE/g) {
153
      push @$selector, ['attr', 'class', $self->_regex('~', $1)] if defined $1;
154
      push @$selector, ['attr', 'id',    $self->_regex('',  $2)] if defined $2;
155
    }
156

            
157
    # Pseudo classes
158
    while ($pc =~ /$PSEUDO_CLASS_RE/g) {
159

            
160
      # "not"
161
      if ($1 eq 'not') {
162
        my $subpattern = $self->_compile($2)->[-1][-1];
163
        push @$selector, ['pc', 'not', $subpattern];
164
      }
165

            
166
      # Everything else
167
      else { push @$selector, ['pc', $1, $2] }
168
    }
169

            
170
    # Attributes
171
    while ($attrs =~ /$ATTR_RE/g) {
172
      my ($key, $op, $value) = ($self->_unescape($1), $2 // '', $3 // $4);
173
      push @$selector, ['attr', $key, $self->_regex($op, $value)];
174
    }
175

            
176
    # Combinator
177
    push @$part, [combinator => $combinator] if $combinator;
178
  }
179

            
180
  return $pattern;
181
}
182

            
183
sub _equation {
184
  my ($self, $equation) = @_;
185

            
186
  # "even"
187
  return [2, 2] if $equation =~ /^even$/i;
188

            
189
  # "odd"
190
  return [2, 1] if $equation =~ /^odd$/i;
191

            
192
  # Equation
193
  my $num = [1, 1];
194
  return $num if $equation !~ /(?:(-?(?:\d+)?)?(n))?\s*\+?\s*(-?\s*\d+)?\s*$/i;
195
  $num->[0] = defined($1) && length($1) ? $1 : $2 ? 1 : 0;
196
  $num->[0] = -1 if $num->[0] eq '-';
197
  $num->[1] = $3 // 0;
198
  $num->[1] =~ s/\s+//g;
199
  return $num;
200
}
201

            
202
sub _match {
203
  my ($self, $pattern, $current, $tree) = @_;
204
  $self->_combinator([reverse @$_], $current, $tree) and return 1
205
    for @$pattern;
206
  return undef;
207
}
208

            
209
sub _parent {
210
  my ($self, $selectors, $current, $tree) = @_;
211
  return undef unless my $parent = $current->[3];
212
  return undef if $parent->[0] eq 'root';
213
  return $self->_combinator($selectors, $parent, $tree);
214
}
215

            
216
sub _pc {
217
  my ($self, $class, $args, $current) = @_;
218

            
219
  # ":first-*"
220
  if ($class =~ /^first-(?:(child)|of-type)$/) {
221
    $class = defined $1 ? 'nth-child' : 'nth-of-type';
222
    $args = 1;
223
  }
224

            
225
  # ":last-*"
226
  elsif ($class =~ /^last-(?:(child)|of-type)$/) {
227
    $class = defined $1 ? 'nth-last-child' : 'nth-last-of-type';
228
    $args = '-n+1';
229
  }
230

            
231
  # ":checked"
232
  if ($class eq 'checked') {
233
    my $attrs = $current->[2];
234
    return 1 if exists $attrs->{checked} || exists $attrs->{selected};
235
  }
236

            
237
  # ":empty"
238
  elsif ($class eq 'empty') { return 1 unless defined $current->[4] }
239

            
240
  # ":root"
241
  elsif ($class eq 'root') {
242
    if (my $parent = $current->[3]) { return 1 if $parent->[0] eq 'root' }
243
  }
244

            
245
  # ":not"
246
  elsif ($class eq 'not') { return 1 if !$self->_selector($args, $current) }
247

            
248
  # ":nth-*"
249
  elsif ($class =~ /^nth-/) {
250

            
251
    # Numbers
252
    $args = $self->_equation($args) unless ref $args;
253

            
254
    # Siblings
255
    my $parent = $current->[3];
256
    my @siblings;
257
    my $type = $class =~ /of-type$/ ? $current->[1] : undef;
258
    for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) {
259
      my $sibling = $parent->[$i];
260
      next unless $sibling->[0] eq 'tag';
261
      next if defined $type && $type ne $sibling->[1];
262
      push @siblings, $sibling;
263
    }
264

            
265
    # Reverse
266
    @siblings = reverse @siblings if $class =~ /^nth-last/;
267

            
268
    # Find
269
    for my $i (0 .. $#siblings) {
270
      my $result = $args->[0] * $i + $args->[1];
271
      next if $result < 1;
272
      last unless my $sibling = $siblings[$result - 1];
273
      return 1 if $sibling eq $current;
274
    }
275
  }
276

            
277
  # ":only-*"
278
  elsif ($class =~ /^only-(?:child|(of-type))$/) {
279
    my $type = $1 ? $current->[1] : undef;
280

            
281
    # Siblings
282
    my $parent = $current->[3];
283
    for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) {
284
      my $sibling = $parent->[$i];
285
      next if $sibling->[0] ne 'tag' || $sibling eq $current;
286
      return undef unless defined $type && $sibling->[1] ne $type;
287
    }
288

            
289
    # No siblings
290
    return 1;
291
  }
292

            
293
  return undef;
294
}
295

            
296
sub _regex {
297
  my ($self, $op, $value) = @_;
298
  return undef unless defined $value;
299
  $value = quotemeta $self->_unescape($value);
300

            
301
  # "~=" (word)
302
  return qr/(?:^|.*\s+)$value(?:\s+.*|$)/ if $op eq '~';
303

            
304
  # "*=" (contains)
305
  return qr/$value/ if $op eq '*';
306

            
307
  # "^=" (begins with)
308
  return qr/^$value/ if $op eq '^';
309

            
310
  # "$=" (ends with)
311
  return qr/$value$/ if $op eq '$';
312

            
313
  # Everything else
314
  return qr/^$value$/;
315
}
316

            
317
sub _selector {
318
  my ($self, $selector, $current) = @_;
319

            
320
  for my $s (@$selector[1 .. $#$selector]) {
321
    my $type = $s->[0];
322

            
323
    # Tag (ignore namespace prefix)
324
    if ($type eq 'tag') {
325
      my $tag = $s->[1];
326
      return undef unless $tag eq '*' || $current->[1] =~ /(?:^|:)$tag$/;
327
    }
328

            
329
    # Attribute
330
    elsif ($type eq 'attr') {
331
      return undef unless $self->_attr(@$s[1, 2], $current);
332
    }
333

            
334
    # Pseudo class
335
    elsif ($type eq 'pc') {
336
      return undef unless $self->_pc(lc $s->[1], $s->[2], $current);
337
    }
338
  }
339

            
340
  return 1;
341
}
342

            
343
sub _sibling {
344
  my ($self, $selectors, $current, $tree, $immediate) = @_;
345

            
346
  my $parent = $current->[3];
347
  my $found;
348
  for my $e (@$parent[($parent->[0] eq 'root' ? 1 : 4) .. $#$parent]) {
349
    return $found if $e eq $current;
350
    next unless $e->[0] eq 'tag';
351

            
352
    # "+" (immediately preceding sibling)
353
    if ($immediate) { $found = $self->_combinator($selectors, $e, $tree) }
354

            
355
    # "~" (preceding sibling)
356
    else { return 1 if $self->_combinator($selectors, $e, $tree) }
357
  }
358

            
359
  return undef;
360
}
361

            
362
sub _unescape {
363
  my ($self, $value) = @_;
364

            
365
  # Remove escaped newlines
366
  $value =~ s/\\\n//g;
367

            
368
  # Unescape Unicode characters
369
  $value =~ s/\\([0-9a-fA-F]{1,6})\s?/pack('U', hex $1)/ge;
370

            
371
  # Remove backslash
372
  $value =~ s/\\//g;
373

            
374
  return $value;
375
}
376

            
377
1;
378

            
379
=encoding utf8
380

            
381
=head1 NAME
382

            
383
Mojo::DOM::CSS - CSS selector engine
384

            
385
=head1 SYNOPSIS
386

            
387
  use Mojo::DOM::CSS;
388

            
389
  # Select elements from DOM tree
390
  my $css = Mojo::DOM::CSS->new(tree => $tree);
391
  my $elements = $css->select('h1, h2, h3');
392

            
393
=head1 DESCRIPTION
394

            
395
L<Mojo::DOM::CSS> is the CSS selector engine used by L<Mojo::DOM>.
396

            
397
=head1 SELECTORS
398

            
399
All CSS selectors that make sense for a standalone parser are supported.
400

            
401
=head2 *
402

            
403
Any element.
404

            
405
  my $all = $css->select('*');
406

            
407
=head2 E
408

            
409
An element of type C<E>.
410

            
411
  my $title = $css->select('title');
412

            
413
=head2 E[foo]
414

            
415
An C<E> element with a C<foo> attribute.
416

            
417
  my $links = $css->select('a[href]');
418

            
419
=head2 E[foo="bar"]
420

            
421
An C<E> element whose C<foo> attribute value is exactly equal to C<bar>.
422

            
423
  my $fields = $css->select('input[name="foo"]');
424

            
425
=head2 E[foo~="bar"]
426

            
427
An C<E> element whose C<foo> attribute value is a list of
428
whitespace-separated values, one of which is exactly equal to C<bar>.
429

            
430
  my $fields = $css->select('input[name~="foo"]');
431

            
432
=head2 E[foo^="bar"]
433

            
434
An C<E> element whose C<foo> attribute value begins exactly with the string
435
C<bar>.
436

            
437
  my $fields = $css->select('input[name^="f"]');
438

            
439
=head2 E[foo$="bar"]
440

            
441
An C<E> element whose C<foo> attribute value ends exactly with the string
442
C<bar>.
443

            
444
  my $fields = $css->select('input[name$="o"]');
445

            
446
=head2 E[foo*="bar"]
447

            
448
An C<E> element whose C<foo> attribute value contains the substring C<bar>.
449

            
450
  my $fields = $css->select('input[name*="fo"]');
451

            
452
=head2 E:root
453

            
454
An C<E> element, root of the document.
455

            
456
  my $root = $css->select(':root');
457

            
458
=head2 E:checked
459

            
460
A user interface element C<E> which is checked (for instance a radio-button or
461
checkbox).
462

            
463
  my $input = $css->select(':checked');
464

            
465
=head2 E:empty
466

            
467
An C<E> element that has no children (including text nodes).
468

            
469
  my $empty = $css->select(':empty');
470

            
471
=head2 E:nth-child(n)
472

            
473
An C<E> element, the C<n-th> child of its parent.
474

            
475
  my $third = $css->select('div:nth-child(3)');
476
  my $odd   = $css->select('div:nth-child(odd)');
477
  my $even  = $css->select('div:nth-child(even)');
478
  my $top3  = $css->select('div:nth-child(-n+3)');
479

            
480
=head2 E:nth-last-child(n)
481

            
482
An C<E> element, the C<n-th> child of its parent, counting from the last one.
483

            
484
  my $third    = $css->select('div:nth-last-child(3)');
485
  my $odd      = $css->select('div:nth-last-child(odd)');
486
  my $even     = $css->select('div:nth-last-child(even)');
487
  my $bottom3  = $css->select('div:nth-last-child(-n+3)');
488

            
489
=head2 E:nth-of-type(n)
490

            
491
An C<E> element, the C<n-th> sibling of its type.
492

            
493
  my $third = $css->select('div:nth-of-type(3)');
494
  my $odd   = $css->select('div:nth-of-type(odd)');
495
  my $even  = $css->select('div:nth-of-type(even)');
496
  my $top3  = $css->select('div:nth-of-type(-n+3)');
497

            
498
=head2 E:nth-last-of-type(n)
499

            
500
An C<E> element, the C<n-th> sibling of its type, counting from the last one.
501

            
502
  my $third    = $css->select('div:nth-last-of-type(3)');
503
  my $odd      = $css->select('div:nth-last-of-type(odd)');
504
  my $even     = $css->select('div:nth-last-of-type(even)');
505
  my $bottom3  = $css->select('div:nth-last-of-type(-n+3)');
506

            
507
=head2 E:first-child
508

            
509
An C<E> element, first child of its parent.
510

            
511
  my $first = $css->select('div p:first-child');
512

            
513
=head2 E:last-child
514

            
515
An C<E> element, last child of its parent.
516

            
517
  my $last = $css->select('div p:last-child');
518

            
519
=head2 E:first-of-type
520

            
521
An C<E> element, first sibling of its type.
522

            
523
  my $first = $css->select('div p:first-of-type');
524

            
525
=head2 E:last-of-type
526

            
527
An C<E> element, last sibling of its type.
528

            
529
  my $last = $css->select('div p:last-of-type');
530

            
531
=head2 E:only-child
532

            
533
An C<E> element, only child of its parent.
534

            
535
  my $lonely = $css->select('div p:only-child');
536

            
537
=head2 E:only-of-type
538

            
539
An C<E> element, only sibling of its type.
540

            
541
  my $lonely = $css->select('div p:only-of-type');
542

            
543
=head2 E.warning
544

            
545
  my $warning = $css->select('div.warning');
546

            
547
An C<E> element whose class is "warning".
548

            
549
=head2 E#myid
550

            
551
  my $foo = $css->select('div#foo');
552

            
553
An C<E> element with C<ID> equal to "myid".
554

            
555
=head2 E:not(s)
556

            
557
An C<E> element that does not match simple selector C<s>.
558

            
559
  my $others = $css->select('div p:not(:first-child)');
560

            
561
=head2 E F
562

            
563
An C<F> element descendant of an C<E> element.
564

            
565
  my $headlines = $css->select('div h1');
566

            
567
=head2 E E<gt> F
568

            
569
An C<F> element child of an C<E> element.
570

            
571
  my $headlines = $css->select('html > body > div > h1');
572

            
573
=head2 E + F
574

            
575
An C<F> element immediately preceded by an C<E> element.
576

            
577
  my $second = $css->select('h1 + h2');
578

            
579
=head2 E ~ F
580

            
581
An C<F> element preceded by an C<E> element.
582

            
583
  my $second = $css->select('h1 ~ h2');
584

            
585
=head2 E, F, G
586

            
587
Elements of type C<E>, C<F> and C<G>.
588

            
589
  my $headlines = $css->select('h1, h2, h3');
590

            
591
=head2 E[foo=bar][bar=baz]
592

            
593
An C<E> element whose attributes match all following attribute selectors.
594

            
595
  my $links = $css->select('a[foo^=b][foo$=ar]');
596

            
597
=head1 ATTRIBUTES
598

            
599
L<Mojo::DOM::CSS> implements the following attributes.
600

            
601
=head2 tree
602

            
603
  my $tree = $css->tree;
604
  $css     = $css->tree(['root', ['text', 'foo']]);
605

            
606
Document Object Model. Note that this structure should only be used very
607
carefully since it is very dynamic.
608

            
609
=head1 METHODS
610

            
611
L<Mojo::DOM::CSS> inherits all methods from L<Mojo::Base> and implements the
612
following new ones.
613

            
614
=head2 match
615

            
616
  my $bool = $css->match('head > title');
617

            
618
Match CSS selector against first node in L</"tree">.
619

            
620
=head2 select
621

            
622
  my $results = $css->select('head > title');
623

            
624
Run CSS selector against L</"tree">.
625

            
626
=head1 SEE ALSO
627

            
628
L<Mojolicious>, L<Mojolicious::Guides>, L<http://mojolicio.us>.
629

            
630
=cut