package Mojo::DOM::CSS; use Mojo::Base -base; has 'tree'; my $ESCAPE_RE = qr/\\[^0-9a-fA-F]|\\[0-9a-fA-F]{1,6}/; my $ATTR_RE = qr/ \[ ((?:$ESCAPE_RE|[\w\-])+) # Key (?: (\W)? # Operator = (?:"((?:\\"|[^"])*)"|([^\]]+)) # Value )? \] /x; my $CLASS_ID_RE = qr/ (?: (?:\.((?:\\\.|[^\#.])+)) # Class | (?:\#((?:\\\#|[^.\#])+)) # ID ) /x; my $PSEUDO_CLASS_RE = qr/(?::([\w\-]+)(?:\(((?:\([^)]+\)|[^)])+)\))?)/; my $TOKEN_RE = qr/ (\s*,\s*)? # Separator ((?:[^[\\:\s,]|$ESCAPE_RE\s?)+)? # Element ($PSEUDO_CLASS_RE*)? # Pseudoclass ((?:$ATTR_RE)*)? # Attributes (?: \s* ([>+~]) # Combinator )? /x; sub match { my $self = shift; my $tree = $self->tree; return undef if $tree->[0] eq 'root'; return $self->_match($self->_compile(shift), $tree, $tree); } sub select { my $self = shift; my @results; my $pattern = $self->_compile(shift); my $tree = $self->tree; my @queue = ($tree); while (my $current = shift @queue) { my $type = $current->[0]; # Tag if ($type eq 'tag') { unshift @queue, @$current[4 .. $#$current]; push @results, $current if $self->_match($pattern, $current, $tree); } # Root elsif ($type eq 'root') { unshift @queue, @$current[1 .. $#$current] } } return \@results; } sub _ancestor { my ($self, $selectors, $current, $tree) = @_; while ($current = $current->[3]) { return undef if $current->[0] eq 'root' || $current eq $tree; return 1 if $self->_combinator($selectors, $current, $tree); } return undef; } sub _attr { my ($self, $key, $regex, $current) = @_; # Ignore namespace prefix my $attrs = $current->[2]; for my $name (keys %$attrs) { next unless $name =~ /(?:^|:)$key$/; return 1 unless defined $attrs->{$name} && defined $regex; return 1 if $attrs->{$name} =~ $regex; } return undef; } sub _combinator { my ($self, $selectors, $current, $tree) = @_; # Selector my @s = @$selectors; return undef unless my $combinator = shift @s; if ($combinator->[0] ne 'combinator') { return undef unless $self->_selector($combinator, $current); return 1 unless $combinator = shift @s; } # " " (ancestor) my $c = $combinator->[1]; if ($c eq ' ') { return undef unless $self->_ancestor(\@s, $current, $tree) } # ">" (parent only) elsif ($c eq '>') { return undef unless $self->_parent(\@s, $current, $tree); } # "~" (preceding siblings) elsif ($c eq '~') { return undef unless $self->_sibling(\@s, $current, $tree, 0); } # "+" (immediately preceding siblings) elsif ($c eq '+') { return undef unless $self->_sibling(\@s, $current, $tree, 1); } return 1; } sub _compile { my ($self, $css) = @_; my $pattern = [[]]; while ($css =~ /$TOKEN_RE/g) { my ($separator, $element, $pc, $attrs, $combinator) = ($1, $2 // '', $3, $6, $11); next unless $separator || $element || $pc || $attrs || $combinator; # New selector push @$pattern, [] if $separator; my $part = $pattern->[-1]; # Empty combinator push @$part, [combinator => ' '] if $part->[-1] && $part->[-1][0] ne 'combinator'; # Selector push @$part, ['element']; my $selector = $part->[-1]; # Element my $tag = '*'; $element =~ s/^((?:\\\.|\\\#|[^.#])+)// and $tag = $self->_unescape($1); # Tag push @$selector, ['tag', $tag]; # Class or ID while ($element =~ /$CLASS_ID_RE/g) { push @$selector, ['attr', 'class', $self->_regex('~', $1)] if defined $1; push @$selector, ['attr', 'id', $self->_regex('', $2)] if defined $2; } # Pseudo classes while ($pc =~ /$PSEUDO_CLASS_RE/g) { # "not" if ($1 eq 'not') { my $subpattern = $self->_compile($2)->[-1][-1]; push @$selector, ['pc', 'not', $subpattern]; } # Everything else else { push @$selector, ['pc', $1, $2] } } # Attributes while ($attrs =~ /$ATTR_RE/g) { my ($key, $op, $value) = ($self->_unescape($1), $2 // '', $3 // $4); push @$selector, ['attr', $key, $self->_regex($op, $value)]; } # Combinator push @$part, [combinator => $combinator] if $combinator; } return $pattern; } sub _equation { my ($self, $equation) = @_; # "even" return [2, 2] if $equation =~ /^even$/i; # "odd" return [2, 1] if $equation =~ /^odd$/i; # Equation my $num = [1, 1]; return $num if $equation !~ /(?:(-?(?:\d+)?)?(n))?\s*\+?\s*(-?\s*\d+)?\s*$/i; $num->[0] = defined($1) && length($1) ? $1 : $2 ? 1 : 0; $num->[0] = -1 if $num->[0] eq '-'; $num->[1] = $3 // 0; $num->[1] =~ s/\s+//g; return $num; } sub _match { my ($self, $pattern, $current, $tree) = @_; $self->_combinator([reverse @$_], $current, $tree) and return 1 for @$pattern; return undef; } sub _parent { my ($self, $selectors, $current, $tree) = @_; return undef unless my $parent = $current->[3]; return undef if $parent->[0] eq 'root'; return $self->_combinator($selectors, $parent, $tree); } sub _pc { my ($self, $class, $args, $current) = @_; # ":first-*" if ($class =~ /^first-(?:(child)|of-type)$/) { $class = defined $1 ? 'nth-child' : 'nth-of-type'; $args = 1; } # ":last-*" elsif ($class =~ /^last-(?:(child)|of-type)$/) { $class = defined $1 ? 'nth-last-child' : 'nth-last-of-type'; $args = '-n+1'; } # ":checked" if ($class eq 'checked') { my $attrs = $current->[2]; return 1 if exists $attrs->{checked} || exists $attrs->{selected}; } # ":empty" elsif ($class eq 'empty') { return 1 unless defined $current->[4] } # ":root" elsif ($class eq 'root') { if (my $parent = $current->[3]) { return 1 if $parent->[0] eq 'root' } } # ":not" elsif ($class eq 'not') { return 1 if !$self->_selector($args, $current) } # ":nth-*" elsif ($class =~ /^nth-/) { # Numbers $args = $self->_equation($args) unless ref $args; # Siblings my $parent = $current->[3]; my @siblings; my $type = $class =~ /of-type$/ ? $current->[1] : undef; for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) { my $sibling = $parent->[$i]; next unless $sibling->[0] eq 'tag'; next if defined $type && $type ne $sibling->[1]; push @siblings, $sibling; } # Reverse @siblings = reverse @siblings if $class =~ /^nth-last/; # Find for my $i (0 .. $#siblings) { my $result = $args->[0] * $i + $args->[1]; next if $result < 1; last unless my $sibling = $siblings[$result - 1]; return 1 if $sibling eq $current; } } # ":only-*" elsif ($class =~ /^only-(?:child|(of-type))$/) { my $type = $1 ? $current->[1] : undef; # Siblings my $parent = $current->[3]; for my $i (($parent->[0] eq 'root' ? 1 : 4) .. $#$parent) { my $sibling = $parent->[$i]; next if $sibling->[0] ne 'tag' || $sibling eq $current; return undef unless defined $type && $sibling->[1] ne $type; } # No siblings return 1; } return undef; } sub _regex { my ($self, $op, $value) = @_; return undef unless defined $value; $value = quotemeta $self->_unescape($value); # "~=" (word) return qr/(?:^|.*\s+)$value(?:\s+.*|$)/ if $op eq '~'; # "*=" (contains) return qr/$value/ if $op eq '*'; # "^=" (begins with) return qr/^$value/ if $op eq '^'; # "$=" (ends with) return qr/$value$/ if $op eq '$'; # Everything else return qr/^$value$/; } sub _selector { my ($self, $selector, $current) = @_; for my $s (@$selector[1 .. $#$selector]) { my $type = $s->[0]; # Tag (ignore namespace prefix) if ($type eq 'tag') { my $tag = $s->[1]; return undef unless $tag eq '*' || $current->[1] =~ /(?:^|:)$tag$/; } # Attribute elsif ($type eq 'attr') { return undef unless $self->_attr(@$s[1, 2], $current); } # Pseudo class elsif ($type eq 'pc') { return undef unless $self->_pc(lc $s->[1], $s->[2], $current); } } return 1; } sub _sibling { my ($self, $selectors, $current, $tree, $immediate) = @_; my $parent = $current->[3]; my $found; for my $e (@$parent[($parent->[0] eq 'root' ? 1 : 4) .. $#$parent]) { return $found if $e eq $current; next unless $e->[0] eq 'tag'; # "+" (immediately preceding sibling) if ($immediate) { $found = $self->_combinator($selectors, $e, $tree) } # "~" (preceding sibling) else { return 1 if $self->_combinator($selectors, $e, $tree) } } return undef; } sub _unescape { my ($self, $value) = @_; # Remove escaped newlines $value =~ s/\\\n//g; # Unescape Unicode characters $value =~ s/\\([0-9a-fA-F]{1,6})\s?/pack('U', hex $1)/ge; # Remove backslash $value =~ s/\\//g; return $value; } 1; =encoding utf8 =head1 NAME Mojo::DOM::CSS - CSS selector engine =head1 SYNOPSIS use Mojo::DOM::CSS; # Select elements from DOM tree my $css = Mojo::DOM::CSS->new(tree => $tree); my $elements = $css->select('h1, h2, h3'); =head1 DESCRIPTION L is the CSS selector engine used by L. =head1 SELECTORS All CSS selectors that make sense for a standalone parser are supported. =head2 * Any element. my $all = $css->select('*'); =head2 E An element of type C. my $title = $css->select('title'); =head2 E[foo] An C element with a C attribute. my $links = $css->select('a[href]'); =head2 E[foo="bar"] An C element whose C attribute value is exactly equal to C. my $fields = $css->select('input[name="foo"]'); =head2 E[foo~="bar"] An C element whose C attribute value is a list of whitespace-separated values, one of which is exactly equal to C. my $fields = $css->select('input[name~="foo"]'); =head2 E[foo^="bar"] An C element whose C attribute value begins exactly with the string C. my $fields = $css->select('input[name^="f"]'); =head2 E[foo$="bar"] An C element whose C attribute value ends exactly with the string C. my $fields = $css->select('input[name$="o"]'); =head2 E[foo*="bar"] An C element whose C attribute value contains the substring C. my $fields = $css->select('input[name*="fo"]'); =head2 E:root An C element, root of the document. my $root = $css->select(':root'); =head2 E:checked A user interface element C which is checked (for instance a radio-button or checkbox). my $input = $css->select(':checked'); =head2 E:empty An C element that has no children (including text nodes). my $empty = $css->select(':empty'); =head2 E:nth-child(n) An C element, the C child of its parent. my $third = $css->select('div:nth-child(3)'); my $odd = $css->select('div:nth-child(odd)'); my $even = $css->select('div:nth-child(even)'); my $top3 = $css->select('div:nth-child(-n+3)'); =head2 E:nth-last-child(n) An C element, the C child of its parent, counting from the last one. my $third = $css->select('div:nth-last-child(3)'); my $odd = $css->select('div:nth-last-child(odd)'); my $even = $css->select('div:nth-last-child(even)'); my $bottom3 = $css->select('div:nth-last-child(-n+3)'); =head2 E:nth-of-type(n) An C element, the C sibling of its type. my $third = $css->select('div:nth-of-type(3)'); my $odd = $css->select('div:nth-of-type(odd)'); my $even = $css->select('div:nth-of-type(even)'); my $top3 = $css->select('div:nth-of-type(-n+3)'); =head2 E:nth-last-of-type(n) An C element, the C sibling of its type, counting from the last one. my $third = $css->select('div:nth-last-of-type(3)'); my $odd = $css->select('div:nth-last-of-type(odd)'); my $even = $css->select('div:nth-last-of-type(even)'); my $bottom3 = $css->select('div:nth-last-of-type(-n+3)'); =head2 E:first-child An C element, first child of its parent. my $first = $css->select('div p:first-child'); =head2 E:last-child An C element, last child of its parent. my $last = $css->select('div p:last-child'); =head2 E:first-of-type An C element, first sibling of its type. my $first = $css->select('div p:first-of-type'); =head2 E:last-of-type An C element, last sibling of its type. my $last = $css->select('div p:last-of-type'); =head2 E:only-child An C element, only child of its parent. my $lonely = $css->select('div p:only-child'); =head2 E:only-of-type An C element, only sibling of its type. my $lonely = $css->select('div p:only-of-type'); =head2 E.warning my $warning = $css->select('div.warning'); An C element whose class is "warning". =head2 E#myid my $foo = $css->select('div#foo'); An C element with C equal to "myid". =head2 E:not(s) An C element that does not match simple selector C. my $others = $css->select('div p:not(:first-child)'); =head2 E F An C element descendant of an C element. my $headlines = $css->select('div h1'); =head2 E E F An C element child of an C element. my $headlines = $css->select('html > body > div > h1'); =head2 E + F An C element immediately preceded by an C element. my $second = $css->select('h1 + h2'); =head2 E ~ F An C element preceded by an C element. my $second = $css->select('h1 ~ h2'); =head2 E, F, G Elements of type C, C and C. my $headlines = $css->select('h1, h2, h3'); =head2 E[foo=bar][bar=baz] An C element whose attributes match all following attribute selectors. my $links = $css->select('a[foo^=b][foo$=ar]'); =head1 ATTRIBUTES L implements the following attributes. =head2 tree my $tree = $css->tree; $css = $css->tree(['root', ['text', 'foo']]); Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. =head1 METHODS L inherits all methods from L and implements the following new ones. =head2 match my $bool = $css->match('head > title'); Match CSS selector against first node in L. =head2 select my $results = $css->select('head > title'); Run CSS selector against L. =head1 SEE ALSO L, L, L. =cut