Yuki Kimoto add files
aa0f2e9 10 years ago
1 contributor
630 lines | 14.413kb
package Mojo::DOM::CSS;
use Mojo::Base -base;

has 'tree';

my $ESCAPE_RE = qr/\\[^0-9a-fA-F]|\\[0-9a-fA-F]{1,6}/;
my $ATTR_RE   = qr/
  \[
  ((?:$ESCAPE_RE|[\w\-])+)           # Key
  (?:
    (\W)?                            # Operator
    =
    (?:"((?:\\"|[^"])*)"|([^\]]+))   # Value
  )?
  \]
/x;
my $CLASS_ID_RE = qr/
  (?:
    (?:\.((?:\\\.|[^\#.])+))   # Class
  |
    (?:\#((?:\\\#|[^.\#])+))   # ID
  )
/x;
my $PSEUDO_CLASS_RE = qr/(?::([\w\-]+)(?:\(((?:\([^)]+\)|[^)])+)\))?)/;
my $TOKEN_RE        = qr/
  (\s*,\s*)?                         # Separator
  ((?:[^[\\:\s,]|$ESCAPE_RE\s?)+)?   # Element
  ($PSEUDO_CLASS_RE*)?               # Pseudoclass
  ((?:$ATTR_RE)*)?                   # Attributes
  (?:
    \s*
    ([>+~])                          # Combinator
  )?
/x;

sub match {
  my $self = shift;
  my $tree = $self->tree;
  return undef if $tree->[0] eq 'root';
  return $self->_match($self->_compile(shift), $tree, $tree);
}

sub select {
  my $self = shift;

  my @results;
  my $pattern = $self->_compile(shift);
  my $tree    = $self->tree;
  my @queue   = ($tree);
  while (my $current = shift @queue) {
    my $type = $current->[0];

    # Tag
    if ($type eq 'tag') {
      unshift @queue, @$current[4 .. $#$current];
      push @results, $current if $self->_match($pattern, $current, $tree);
    }

    # Root
    elsif ($type eq 'root') { unshift @queue, @$current[1 .. $#$current] }
  }

  return \@results;
}

sub _ancestor {
  my ($self, $selectors, $current, $tree) = @_;
  while ($current = $current->[3]) {
    return undef if $current->[0] eq 'root' || $current eq $tree;
    return 1 if $self->_combinator($selectors, $current, $tree);
  }
  return undef;
}

sub _attr {
  my ($self, $key, $regex, $current) = @_;

  # Ignore namespace prefix
  my $attrs = $current->[2];
  for my $name (keys %$attrs) {
    next unless $name =~ /(?:^|:)$key$/;
    return 1 unless defined $attrs->{$name} && defined $regex;
    return 1 if $attrs->{$name} =~ $regex;
  }

  return undef;
}

sub _combinator {
  my ($self, $selectors, $current, $tree) = @_;

  # Selector
  my @s = @$selectors;
  return undef unless my $combinator = shift @s;
  if ($combinator->[0] ne 'combinator') {
    return undef unless $self->_selector($combinator, $current);
    return 1 unless $combinator = shift @s;
  }

  # " " (ancestor)
  my $c = $combinator->[1];
  if ($c eq ' ') { return undef unless $self->_ancestor(\@s, $current, $tree) }

  # ">" (parent only)
  elsif ($c eq '>') {
    return undef unless $self->_parent(\@s, $current, $tree);
  }

  # "~" (preceding siblings)
  elsif ($c eq '~') {
    return undef unless $self->_sibling(\@s, $current, $tree, 0);
  }

  # "+" (immediately preceding siblings)
  elsif ($c eq '+') {
    return undef unless $self->_sibling(\@s, $current, $tree, 1);
  }

  return 1;
}

sub _compile {
  my ($self, $css) = @_;

  my $pattern = [[]];
  while ($css =~ /$TOKEN_RE/g) {
    my ($separator, $element, $pc, $attrs, $combinator)
      = ($1, $2 // '', $3, $6, $11);

    next unless $separator || $element || $pc || $attrs || $combinator;

    # New selector
    push @$pattern, [] if $separator;
    my $part = $pattern->[-1];

    # Empty combinator
    push @$part, [combinator => ' ']
      if $part->[-1] && $part->[-1][0] ne 'combinator';

    # Selector
    push @$part, ['element'];
    my $selector = $part->[-1];

    # Element
    my $tag = '*';
    $element =~ s/^((?:\\\.|\\\#|[^.#])+)// and $tag = $self->_unescape($1);

    # Tag
    push @$selector, ['tag', $tag];

    # Class or ID
    while ($element =~ /$CLASS_ID_RE/g) {
      push @$selector, ['attr', 'class', $self->_regex('~', $1)] if defined $1;
      push @$selector, ['attr', 'id',    $self->_regex('',  $2)] if defined $2;
    }

    # Pseudo classes
    while ($pc =~ /$PSEUDO_CLASS_RE/g) {

      # "not"
      if ($1 eq 'not') {
        my $subpattern = $self->_compile($2)->[-1][-1];
        push @$selector, ['pc', 'not', $subpattern];
      }

      # Everything else
      else { push @$selector, ['pc', $1, $2] }
    }

    # Attributes
    while ($attrs =~ /$ATTR_RE/g) {
      my ($key, $op, $value) = ($self->_unescape($1), $2 // '', $3 // $4);
      push @$selector, ['attr', $key, $self->_regex($op, $value)];
    }

    # Combinator
    push @$part, [combinator => $combinator] if $combinator;
  }

  return $pattern;
}

sub _equation {
  my ($self, $equation) = @_;

  # "even"
  return [2, 2] if $equation =~ /^even$/i;

  # "odd"
  return [2, 1] if $equation =~ /^odd$/i;

  # Equation
  my $num = [1, 1];
  return $num if $equation !~ /(?:(-?(?:\d+)?)?(n))?\s*\+?\s*(-?\s*\d+)?\s*$/i;
  $num->[0] = defined($1) && length($1) ? $1 : $2 ? 1 : 0;
  $num->[0] = -1 if $num->[0] eq '-';
  $num->[1] = $3 // 0;
  $num->[1] =~ s/\s+//g;
  return $num;
}

sub _match {
  my ($self, $pattern, $current, $tree) = @_;
  $self->_combinator([reverse @$_], $current, $tree) and return 1
    for @$pattern;
  return undef;
}

sub _parent {
  my ($self, $selectors, $current, $tree) = @_;
  return undef unless my $parent = $current->[3];
  return undef if $parent->[0] eq 'root';
  return $self->_combinator($selectors, $parent, $tree);
}

sub _pc {
  my ($self, $class, $args, $current) = @_;

  # ":first-*"
  if ($class =~ /^first-(?:(child)|of-type)$/) {
    $class = defined $1 ? 'nth-child' : 'nth-of-type';
    $args = 1;
  }

  # ":last-*"
  elsif ($class =~ /^last-(?:(child)|of-type)$/) {
    $class = defined $1 ? 'nth-last-child' : 'nth-last-of-type';
    $args = '-n+1';
  }

  # ":checked"
  if ($class eq 'checked') {
    my $attrs = $current->[2];
    return 1 if exists $attrs->{checked} || exists $attrs->{selected};
  }

  # ":empty"
  elsif ($class eq 'empty') { return 1 unless defined $current->[4] }

  # ":root"
  elsif ($class eq 'root') {
    if (my $parent = $current->[3]) { return 1 if $parent->[0] eq 'root' }
  }

  # ":not"
  elsif ($class eq 'not') { return 1 if !$self->_selector($args, $current) }

  # ":nth-*"
  elsif ($class =~ /^nth-/) {

    # Numbers
    $args = $self->_equation($args) unless ref $args;

    # Siblings
    my $parent = $current->[3];
    my @siblings;
    my $type = $class =~ /of-type$/ ? $current->[1] : undef;
    for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) {
      my $sibling = $parent->[$i];
      next unless $sibling->[0] eq 'tag';
      next if defined $type && $type ne $sibling->[1];
      push @siblings, $sibling;
    }

    # Reverse
    @siblings = reverse @siblings if $class =~ /^nth-last/;

    # Find
    for my $i (0 .. $#siblings) {
      my $result = $args->[0] * $i + $args->[1];
      next if $result < 1;
      last unless my $sibling = $siblings[$result - 1];
      return 1 if $sibling eq $current;
    }
  }

  # ":only-*"
  elsif ($class =~ /^only-(?:child|(of-type))$/) {
    my $type = $1 ? $current->[1] : undef;

    # Siblings
    my $parent = $current->[3];
    for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) {
      my $sibling = $parent->[$i];
      next if $sibling->[0] ne 'tag' || $sibling eq $current;
      return undef unless defined $type && $sibling->[1] ne $type;
    }

    # No siblings
    return 1;
  }

  return undef;
}

sub _regex {
  my ($self, $op, $value) = @_;
  return undef unless defined $value;
  $value = quotemeta $self->_unescape($value);

  # "~=" (word)
  return qr/(?:^|.*\s+)$value(?:\s+.*|$)/ if $op eq '~';

  # "*=" (contains)
  return qr/$value/ if $op eq '*';

  # "^=" (begins with)
  return qr/^$value/ if $op eq '^';

  # "$=" (ends with)
  return qr/$value$/ if $op eq '$';

  # Everything else
  return qr/^$value$/;
}

sub _selector {
  my ($self, $selector, $current) = @_;

  for my $s (@$selector[1 .. $#$selector]) {
    my $type = $s->[0];

    # Tag (ignore namespace prefix)
    if ($type eq 'tag') {
      my $tag = $s->[1];
      return undef unless $tag eq '*' || $current->[1] =~ /(?:^|:)$tag$/;
    }

    # Attribute
    elsif ($type eq 'attr') {
      return undef unless $self->_attr(@$s[1, 2], $current);
    }

    # Pseudo class
    elsif ($type eq 'pc') {
      return undef unless $self->_pc(lc $s->[1], $s->[2], $current);
    }
  }

  return 1;
}

sub _sibling {
  my ($self, $selectors, $current, $tree, $immediate) = @_;

  my $parent = $current->[3];
  my $found;
  for my $e (@$parent[($parent->[0] eq 'root' ? 1 : 4) .. $#$parent]) {
    return $found if $e eq $current;
    next unless $e->[0] eq 'tag';

    # "+" (immediately preceding sibling)
    if ($immediate) { $found = $self->_combinator($selectors, $e, $tree) }

    # "~" (preceding sibling)
    else { return 1 if $self->_combinator($selectors, $e, $tree) }
  }

  return undef;
}

sub _unescape {
  my ($self, $value) = @_;

  # Remove escaped newlines
  $value =~ s/\\\n//g;

  # Unescape Unicode characters
  $value =~ s/\\([0-9a-fA-F]{1,6})\s?/pack('U', hex $1)/ge;

  # Remove backslash
  $value =~ s/\\//g;

  return $value;
}

1;

=encoding utf8

=head1 NAME

Mojo::DOM::CSS - CSS selector engine

=head1 SYNOPSIS

  use Mojo::DOM::CSS;

  # Select elements from DOM tree
  my $css = Mojo::DOM::CSS->new(tree => $tree);
  my $elements = $css->select('h1, h2, h3');

=head1 DESCRIPTION

L<Mojo::DOM::CSS> is the CSS selector engine used by L<Mojo::DOM>.

=head1 SELECTORS

All CSS selectors that make sense for a standalone parser are supported.

=head2 *

Any element.

  my $all = $css->select('*');

=head2 E

An element of type C<E>.

  my $title = $css->select('title');

=head2 E[foo]

An C<E> element with a C<foo> attribute.

  my $links = $css->select('a[href]');

=head2 E[foo="bar"]

An C<E> element whose C<foo> attribute value is exactly equal to C<bar>.

  my $fields = $css->select('input[name="foo"]');

=head2 E[foo~="bar"]

An C<E> element whose C<foo> attribute value is a list of
whitespace-separated values, one of which is exactly equal to C<bar>.

  my $fields = $css->select('input[name~="foo"]');

=head2 E[foo^="bar"]

An C<E> element whose C<foo> attribute value begins exactly with the string
C<bar>.

  my $fields = $css->select('input[name^="f"]');

=head2 E[foo$="bar"]

An C<E> element whose C<foo> attribute value ends exactly with the string
C<bar>.

  my $fields = $css->select('input[name$="o"]');

=head2 E[foo*="bar"]

An C<E> element whose C<foo> attribute value contains the substring C<bar>.

  my $fields = $css->select('input[name*="fo"]');

=head2 E:root

An C<E> element, root of the document.

  my $root = $css->select(':root');

=head2 E:checked

A user interface element C<E> which is checked (for instance a radio-button or
checkbox).

  my $input = $css->select(':checked');

=head2 E:empty

An C<E> element that has no children (including text nodes).

  my $empty = $css->select(':empty');

=head2 E:nth-child(n)

An C<E> element, the C<n-th> child of its parent.

  my $third = $css->select('div:nth-child(3)');
  my $odd   = $css->select('div:nth-child(odd)');
  my $even  = $css->select('div:nth-child(even)');
  my $top3  = $css->select('div:nth-child(-n+3)');

=head2 E:nth-last-child(n)

An C<E> element, the C<n-th> child of its parent, counting from the last one.

  my $third    = $css->select('div:nth-last-child(3)');
  my $odd      = $css->select('div:nth-last-child(odd)');
  my $even     = $css->select('div:nth-last-child(even)');
  my $bottom3  = $css->select('div:nth-last-child(-n+3)');

=head2 E:nth-of-type(n)

An C<E> element, the C<n-th> sibling of its type.

  my $third = $css->select('div:nth-of-type(3)');
  my $odd   = $css->select('div:nth-of-type(odd)');
  my $even  = $css->select('div:nth-of-type(even)');
  my $top3  = $css->select('div:nth-of-type(-n+3)');

=head2 E:nth-last-of-type(n)

An C<E> element, the C<n-th> sibling of its type, counting from the last one.

  my $third    = $css->select('div:nth-last-of-type(3)');
  my $odd      = $css->select('div:nth-last-of-type(odd)');
  my $even     = $css->select('div:nth-last-of-type(even)');
  my $bottom3  = $css->select('div:nth-last-of-type(-n+3)');

=head2 E:first-child

An C<E> element, first child of its parent.

  my $first = $css->select('div p:first-child');

=head2 E:last-child

An C<E> element, last child of its parent.

  my $last = $css->select('div p:last-child');

=head2 E:first-of-type

An C<E> element, first sibling of its type.

  my $first = $css->select('div p:first-of-type');

=head2 E:last-of-type

An C<E> element, last sibling of its type.

  my $last = $css->select('div p:last-of-type');

=head2 E:only-child

An C<E> element, only child of its parent.

  my $lonely = $css->select('div p:only-child');

=head2 E:only-of-type

An C<E> element, only sibling of its type.

  my $lonely = $css->select('div p:only-of-type');

=head2 E.warning

  my $warning = $css->select('div.warning');

An C<E> element whose class is "warning".

=head2 E#myid

  my $foo = $css->select('div#foo');

An C<E> element with C<ID> equal to "myid".

=head2 E:not(s)

An C<E> element that does not match simple selector C<s>.

  my $others = $css->select('div p:not(:first-child)');

=head2 E F

An C<F> element descendant of an C<E> element.

  my $headlines = $css->select('div h1');

=head2 E E<gt> F

An C<F> element child of an C<E> element.

  my $headlines = $css->select('html > body > div > h1');

=head2 E + F

An C<F> element immediately preceded by an C<E> element.

  my $second = $css->select('h1 + h2');

=head2 E ~ F

An C<F> element preceded by an C<E> element.

  my $second = $css->select('h1 ~ h2');

=head2 E, F, G

Elements of type C<E>, C<F> and C<G>.

  my $headlines = $css->select('h1, h2, h3');

=head2 E[foo=bar][bar=baz]

An C<E> element whose attributes match all following attribute selectors.

  my $links = $css->select('a[foo^=b][foo$=ar]');

=head1 ATTRIBUTES

L<Mojo::DOM::CSS> implements the following attributes.

=head2 tree

  my $tree = $css->tree;
  $css     = $css->tree(['root', ['text', 'foo']]);

Document Object Model. Note that this structure should only be used very
carefully since it is very dynamic.

=head1 METHODS

L<Mojo::DOM::CSS> inherits all methods from L<Mojo::Base> and implements the
following new ones.

=head2 match

  my $bool = $css->match('head > title');

Match CSS selector against first node in L</"tree">.

=head2 select

  my $results = $css->select('head > title');

Run CSS selector against L</"tree">.

=head1 SEE ALSO

L<Mojolicious>, L<Mojolicious::Guides>, L<http://mojolicio.us>.

=cut