package Mojo::DOM; use Mojo::Base -strict; use overload '%{}' => sub { shift->attr }, bool => sub {1}, '""' => sub { shift->to_xml }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Carp 'croak'; use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Mojo::Util 'squish'; use Scalar::Util qw(blessed weaken); sub AUTOLOAD { my $self = shift; my ($package, $method) = our $AUTOLOAD =~ /^([\w:]+)::(\w+)$/; croak "Undefined subroutine &${package}::$method called" unless blessed $self && $self->isa(__PACKAGE__); # Search children of current element my $children = $self->children($method); return @$children > 1 ? $children : $children->[0] if @$children; croak qq{Can't locate object method "$method" via package "$package"}; } sub DESTROY { } sub new { my $class = shift; my $self = bless [Mojo::DOM::HTML->new], ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub all_text { shift->_content(1, @_) } sub ancestors { _select($_[0]->_collect(_ancestors($_[0]->tree)), $_[1]) } sub append { shift->_add(1, @_) } sub append_content { my ($self, $new) = @_; my $tree = $self->tree; push @$tree, _link($self->_parse("$new"), $tree); return $self; } sub at { shift->find(@_)->[0] } sub attr { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] eq 'root' ? {} : $tree->[2]; return $attrs unless @_; # Get return defined $attrs->{$_[0]} ? $attrs->{$_[0]} : '' unless @_ > 1 || ref $_[0]; # Set %$attrs = (%$attrs, %{ref $_[0] ? $_[0] : {@_}}); return $self; } sub children { my $self = shift; return _select( $self->_collect(grep { $_->[0] eq 'tag' } _nodes($self->tree)), @_); } sub content_xml { my $self = shift; my $xml = $self->xml; return join '', map { _render($_, $xml) } _nodes($self->tree); } sub find { my $self = shift; my $results = Mojo::DOM::CSS->new(tree => $self->tree)->select(@_); return $self->_collect(@$results); } sub match { my $self = shift; return undef unless Mojo::DOM::CSS->new(tree => $self->tree)->match(@_); return $self; } sub namespace { my $self = shift; return '' if (my $current = $self->tree)->[0] eq 'root'; # Extract namespace prefix and search parents my $ns = $current->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; while ($current->[0] ne 'root') { # Namespace for prefix my $attrs = $current->[2]; if ($ns) { /^\Q$ns\E$/ and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } $current = $current->[3]; } return ''; } sub next { shift->_siblings->[1][0] } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->new->tree($tree->[3])->xml($self->xml); } sub parse { shift->_delegate(parse => shift) } sub prepend { shift->_add(0, @_) } sub prepend_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, _offset($tree), 0, _link($self->_parse("$new"), $tree); return $self; } sub previous { shift->_siblings->[0][-1] } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; my $tree = $self->tree; return $self->xml(undef)->parse($new) if $tree->[0] eq 'root'; return $self->_replace($tree, $self->_parse("$new")); } sub replace_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, _offset($tree), $#$tree, _link($self->_parse("$new"), $tree); return $self; } sub root { my $self = shift; return $self unless my $tree = _ancestors($self->tree, 1); return $self->new->tree($tree)->xml($self->xml); } sub siblings { _select(Mojo::Collection->new(@{_siblings($_[0], 1)}), $_[1]) } sub strip { my $self = shift; my $tree = $self->tree; return $self if $tree->[0] eq 'root'; return $self->_replace($tree, ['root', _nodes($tree)]); } sub tap { shift->Mojo::Base::tap(@_) } sub text { shift->_content(0, @_) } sub text_after { my ($self, $trim) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; my (@nodes, $started); for my $n (_nodes($tree->[3])) { ++$started and next if $n eq $tree; next unless $started; last if $n->[0] eq 'tag'; push @nodes, $n; } return _text(\@nodes, 0, _trim($tree->[3], $trim)); } sub text_before { my ($self, $trim) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; my @nodes; for my $n (_nodes($tree->[3])) { last if $n eq $tree; push @nodes, $n; @nodes = () if $n->[0] eq 'tag'; } return _text(\@nodes, 0, _trim($tree->[3], $trim)); } sub to_xml { shift->[0]->render } sub tree { shift->_delegate(tree => @_) } sub type { my ($self, $type) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; return $tree->[1] unless $type; $tree->[1] = $type; return $self; } sub xml { shift->_delegate(xml => @_) } sub _add { my ($self, $offset, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root'; my $parent = $tree->[3]; splice @$parent, _parent($parent, $tree) + $offset, 0, _link($self->_parse("$new"), $parent); return $self; } sub _ancestors { my ($tree, $root) = @_; my @ancestors; push @ancestors, $tree while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); return $root ? $ancestors[-1] : @ancestors[0 .. $#ancestors - 1]; } sub _collect { my $self = shift; my $xml = $self->xml; return Mojo::Collection->new(@_) ->map(sub { $self->new->tree($_)->xml($xml) }); } sub _content { my $tree = shift->tree; return _text([_nodes($tree)], shift, _trim($tree, @_)); } sub _delegate { my ($self, $method) = (shift, shift); return $self->[0]->$method unless @_; $self->[0]->$method(@_); return $self; } sub _link { my ($children, $parent) = @_; # Link parent to children my @new; for my $n (@$children[1 .. $#$children]) { push @new, $n; next unless $n->[0] eq 'tag'; $n->[3] = $parent; weaken $n->[3]; } return @new; } sub _nodes { return unless my $n = shift; return @$n[_offset($n) .. $#$n]; } sub _offset { $_[0][0] eq 'root' ? 1 : 4 } sub _parent { my ($parent, $child) = @_; # Find parent offset for child my $i = _offset($parent); for my $n (@$parent[$i .. $#$parent]) { last if $n == $child; $i++; } return $i; } sub _parse { Mojo::DOM::HTML->new(xml => shift->xml)->parse(shift)->tree } sub _render { Mojo::DOM::HTML->new(tree => shift, xml => shift)->render } sub _replace { my ($self, $tree, $new) = @_; my $parent = $tree->[3]; splice @$parent, _parent($parent, $tree), 1, _link($new, $parent); return $self->parent; } sub _select { my ($self, $selector) = @_; return defined $selector ? $self->grep(sub { $_->match($selector) }) : $self; } sub _siblings { my ($self, $merge) = @_; return $merge ? [] : [[], []] unless my $parent = $self->parent; my $tree = $self->tree; my (@before, @after, $match); for my $child ($parent->children->each) { ++$match and next if $child->tree eq $tree; $match ? push @after, $child : push @before, $child; } return $merge ? [@before, @after] : [\@before, \@after]; } sub _text { my ($nodes, $recurse, $trim) = @_; # Merge successive text nodes my $i = 0; while (my $next = $nodes->[$i + 1]) { ++$i and next unless $nodes->[$i][0] eq 'text' && $next->[0] eq 'text'; splice @$nodes, $i, 2, ['text', $nodes->[$i][1] . $next->[1]]; } my $text = ''; for my $n (@$nodes) { my $type = $n->[0]; # Nested tag my $content = ''; if ($type eq 'tag' && $recurse) { $content = _text([_nodes($n)], 1, _trim($n, $trim)); } # Text elsif ($type eq 'text') { $content = $trim ? squish($n->[1]) : $n->[1] } # CDATA or raw text elsif ($type eq 'cdata' || $type eq 'raw') { $content = $n->[1] } # Add leading whitespace if punctuation allows it $content = " $content" if $text =~ /\S\z/ && $content =~ /^[^.!?,;:\s]+/; # Trim whitespace blocks $text .= $content if $content =~ /\S+/ || !$trim; } return $text; } sub _trim { my ($e, $trim) = @_; # Disabled return 0 unless $e && ($trim = defined $trim ? $trim : 1); # Detect "pre" tag while ($e->[0] eq 'tag') { return 0 if $e->[1] eq 'pre'; last unless $e = $e->[3]; } return 1; } 1; =encoding utf8 =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('

A

B

'); # Find say $dom->at('#b')->text; say $dom->find('p')->text; say $dom->find('[id]')->attr('id'); # Walk say $dom->div->p->[0]->text; say $dom->div->children('p')->first->{id}; # Iterate $dom->find('p[id]')->each(sub { say shift->{id} }); # Loop for my $e ($dom->find('p[id]')->each) { say $e->text; } # Modify $dom->div->p->[1]->append('

C

'); $dom->find(':not(p)')->strip; # Render say "$dom"; =head1 DESCRIPTION L is a minimalistic and relaxed HTML/XML DOM parser with CSS selector support. It will even try to interpret broken XML, so you should not use it for validation. =head1 CASE SENSITIVITY L defaults to HTML semantics, that means all tags and attributes are lowercased and selectors need to be lowercase as well. my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('p')->text; say $dom->p->{id}; If XML processing instructions are found, the parser will automatically switch into XML mode and everything becomes case sensitive. my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('P')->text; say $dom->P->{ID}; XML detection can also be disabled with the L method. # Force XML semantics $dom->xml(1); # Force HTML semantics $dom->xml(0); =head1 METHODS L implements the following methods. =head2 new my $dom = Mojo::DOM->new; my $dom = Mojo::DOM->new('test'); Construct a new array-based L object and L HTML/XML fragment if necessary. =head2 all_text my $trimmed = $dom->all_text; my $untrimmed = $dom->all_text(0); Extract all text content from DOM structure, smart whitespace trimming is enabled by default. # "foo bar baz" $dom->parse("
foo\n

bar

baz\n
")->div->all_text; # "foo\nbarbaz\n" $dom->parse("
foo\n

bar

baz\n
")->div->all_text(0); =head2 ancestors my $collection = $dom->ancestors; my $collection = $dom->ancestors('div'); Find all ancestors of this element matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # List types of ancestor elements say $dom->ancestors->type; =head2 append $dom = $dom->append('

Hi!

'); Append HTML/XML fragment to element. # "

A

B

" $dom->parse('

A

')->at('h1')->append('

B

')->root; =head2 append_content $dom = $dom->append_content('

Hi!

'); Append HTML/XML fragment to element content. # "

AB

" $dom->parse('

A

')->at('h1')->append_content('B')->root; =head2 at my $result = $dom->at('html title'); Find first element matching the CSS selector and return it as a L object or return C if none could be found. All selectors from L are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; =head2 attr my $attrs = $dom->attr; my $foo = $dom->attr('foo'); $dom = $dom->attr({foo => 'bar'}); $dom = $dom->attr(foo => 'bar'); Element attributes. # List id attributes say $dom->find('*')->attr('id')->compact; =head2 children my $collection = $dom->children; my $collection = $dom->children('div'); Find all children of this element matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # Show type of random child element say $dom->children->shuffle->first->type; =head2 content_xml my $xml = $dom->content_xml; Render content of this element to XML. # "test" $dom->parse('
test
')->div->content_xml; =head2 find my $collection = $dom->find('html title'); Find all elements matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; # Extract information from multiple elements my @headers = $dom->find('h1, h2, h3')->text->each; my @links = $dom->find('a[href]')->attr('href')->each; =head2 match my $result = $dom->match('html title'); Match the CSS selector against this element and return it as a L object or return C if it didn't match. All selectors from L are supported. =head2 namespace my $namespace = $dom->namespace; Find element namespace. # Find namespace for an element with namespace prefix my $namespace = $dom->at('svg > svg\:circle')->namespace; # Find namespace for an element that may or may not have a namespace prefix my $namespace = $dom->at('svg > circle')->namespace; =head2 next my $sibling = $dom->next; Return L object for next sibling of element or C if there are no more siblings. # "

B

" $dom->parse('

A

B

')->at('h1')->next; =head2 parent my $parent = $dom->parent; Return L object for parent of element or C if this element has no parent. =head2 parse $dom = $dom->parse('test'); Parse HTML/XML fragment with L. # Parse XML my $dom = Mojo::DOM->new->xml(1)->parse($xml); =head2 prepend $dom = $dom->prepend('

Hi!

'); Prepend HTML/XML fragment to element. # "

A

B

" $dom->parse('

B

')->at('h2')->prepend('

A

')->root; =head2 prepend_content $dom = $dom->prepend_content('

Hi!

'); Prepend HTML/XML fragment to element content. # "

AB

" $dom->parse('

B

')->at('h2')->prepend_content('A')->root; =head2 previous my $sibling = $dom->previous; Return L object for previous sibling of element or C if there are no more siblings. # "

A

" $dom->parse('

A

B

')->at('h2')->previous; =head2 remove my $parent = $dom->remove; Remove element and return L object for parent of element. # "
" $dom->parse('

A

')->at('h1')->remove; =head2 replace my $parent = $dom->replace('
test
'); Replace element with HTML/XML fragment and return L object for parent of element. # "

B

" $dom->parse('

A

')->at('h1')->replace('

B

'); # "
" $dom->parse('

A

')->at('h1')->replace(''); =head2 replace_content $dom = $dom->replace_content('

test

'); Replace element content with HTML/XML fragment. # "

B

" $dom->parse('

A

')->at('h1')->replace_content('B')->root; # "

" $dom->parse('

A

')->at('h1')->replace_content('')->root; =head2 root my $root = $dom->root; Return L object for root node. =head2 siblings my $collection = $dom->siblings; my $collection = $dom->siblings('div'); Find all siblings of this element matching the CSS selector and return a L object containing these elements as L objects. All selectors from L are supported. # List types of sibling elements say $dom->siblings->type; =head2 strip my $parent = $dom->strip; Remove element while preserving its content and return L object for parent of element. # "
A
" $dom->parse('

A

')->at('h1')->strip; =head2 tap $dom = $dom->tap(sub {...}); Alias for L. =head2 text my $trimmed = $dom->text; my $untrimmed = $dom->text(0); Extract text content from element only (not including child elements), smart whitespace trimming is enabled by default. # "foo baz" $dom->parse("
foo\n

bar

baz\n
")->div->text; # "foo\nbaz\n" $dom->parse("
foo\n

bar

baz\n
")->div->text(0); =head2 text_after my $trimmed = $dom->text_after; my $untrimmed = $dom->text_after(0); Extract text content immediately following element, smart whitespace trimming is enabled by default. # "baz" $dom->parse("
foo\n

bar

baz\n
")->div->p->text_after; # "baz\n" $dom->parse("
foo\n

bar

baz\n
")->div->p->text_after(0); =head2 text_before my $trimmed = $dom->text_before; my $untrimmed = $dom->text_before(0); Extract text content immediately preceding element, smart whitespace trimming is enabled by default. # "foo" $dom->parse("
foo\n

bar

baz\n
")->div->p->text_before; # "foo\n" $dom->parse("
foo\n

bar

baz\n
")->div->p->text_before(0); =head2 to_xml my $xml = $dom->to_xml; my $xml = "$dom"; Render this element and its content to XML. # "test" $dom->parse('
test
')->div->b->to_xml; =head2 tree my $tree = $dom->tree; $dom = $dom->tree(['root', ['text', 'foo']]); Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. =head2 type my $type = $dom->type; $dom = $dom->type('div'); Element type. # List types of child elements say $dom->children->type; =head2 xml my $bool = $dom->xml; $dom = $dom->xml($bool); Disable HTML semantics in parser and activate case sensitivity, defaults to auto detection based on processing instructions. =head1 CHILD ELEMENTS In addition to the methods above, many child elements are also automatically available as object methods, which return a L or L object, depending on number of children. say $dom->p->text; say $dom->div->[23]->text; say $dom->div->text; =head1 ELEMENT ATTRIBUTES Direct hash reference access to element attributes is also possible. say $dom->{foo}; say $dom->div->{id}; =head1 SEE ALSO L, L, L. =cut