package Mojo::DOM; use Mojo::Base -strict; use overload '%{}' => sub { shift->attr }, bool => sub {1}, '""' => sub { shift->to_xml }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Carp 'croak'; use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Mojo::Util 'squish'; use Scalar::Util qw(blessed weaken); sub AUTOLOAD { my $self = shift; my ($package, $method) = our $AUTOLOAD =~ /^([\w:]+)::(\w+)$/; croak "Undefined subroutine &${package}::$method called" unless blessed $self && $self->isa(__PACKAGE__); # Search children of current element my $children = $self->children($method); return @$children > 1 ? $children : $children->[0] if @$children; croak qq{Can't locate object method "$method" via package "$package"}; } sub DESTROY { } sub new { my $class = shift; my $self = bless [Mojo::DOM::HTML->new], ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub all_text { shift->_content(1, @_) } sub ancestors { _select($_[0]->_collect(_ancestors($_[0]->tree)), $_[1]) } sub append { shift->_add(1, @_) } sub append_content { my ($self, $new) = @_; my $tree = $self->tree; push @$tree, _link($self->_parse("$new"), $tree); return $self; } sub at { shift->find(@_)->[0] } sub attr { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] eq 'root' ? {} : $tree->[2]; return $attrs unless @_; # Get return $attrs->{$_[0]} // '' unless @_ > 1 || ref $_[0]; # Set %$attrs = (%$attrs, %{ref $_[0] ? $_[0] : {@_}}); return $self; } sub children { my $self = shift; return _select( $self->_collect(grep { $_->[0] eq 'tag' } _nodes($self->tree)), @_); } sub content_xml { my $self = shift; my $xml = $self->xml; return join '', map { _render($_, $xml) } _nodes($self->tree); } sub find { my $self = shift; my $results = Mojo::DOM::CSS->new(tree => $self->tree)->select(@_); return $self->_collect(@$results); } sub match { my $self = shift; return undef unless Mojo::DOM::CSS->new(tree => $self->tree)->match(@_); return $self; } sub namespace { my $self = shift; return '' if (my $current = $self->tree)->[0] eq 'root'; # Extract namespace prefix and search parents my $ns = $current->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; while ($current->[0] ne 'root') { # Namespace for prefix my $attrs = $current->[2]; if ($ns) { /^\Q$ns\E$/ and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } $current = $current->[3]; } return ''; } sub next { shift->_siblings->[1][0] } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->new->tree($tree->[3])->xml($self->xml); } sub parse { shift->_delegate(parse => shift) } sub prepend { shift->_add(0, @_) } sub prepend_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, _offset($tree), 0, _link($self->_parse("$new"), $tree); return $self; } sub previous { shift->_siblings->[0][-1] } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; my $tree = $self->tree; return $self->xml(undef)->parse($new) if $tree->[0] eq 'root'; return $self->_replace($tree, $self->_parse("$new")); } sub replace_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, _offset($tree), $#$tree, _link($self->_parse("$new"), $tree); return $self; } sub root { my $self = shift; return $self unless my $tree = _ancestors($self->tree, 1); return $self->new->tree($tree)->xml($self->xml); } sub siblings { _select(Mojo::Collection->new(@{_siblings($_[0], 1)}), $_[1]) } sub strip { my $self = shift; my $tree = $self->tree; return $self if $tree->[0] eq 'root'; return $self->_replace($tree, ['root', _nodes($tree)]); } sub tap { shift->Mojo::Base::tap(@_) } sub text { shift->_content(0, @_) } sub text_after { my ($self, $trim) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; my (@nodes, $started); for my $n (_nodes($tree->[3])) { ++$started and next if $n eq $tree; next unless $started; last if $n->[0] eq 'tag'; push @nodes, $n; } return _text(\@nodes, 0, _trim($tree->[3], $trim)); } sub text_before { my ($self, $trim) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; my @nodes; for my $n (_nodes($tree->[3])) { last if $n eq $tree; push @nodes, $n; @nodes = () if $n->[0] eq 'tag'; } return _text(\@nodes, 0, _trim($tree->[3], $trim)); } sub to_xml { shift->[0]->render } sub tree { shift->_delegate(tree => @_) } sub type { my ($self, $type) = @_; return '' if (my $tree = $self->tree)->[0] eq 'root'; return $tree->[1] unless $type; $tree->[1] = $type; return $self; } sub xml { shift->_delegate(xml => @_) } sub _add { my ($self, $offset, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root'; my $parent = $tree->[3]; splice @$parent, _parent($parent, $tree) + $offset, 0, _link($self->_parse("$new"), $parent); return $self; } sub _ancestors { my ($tree, $root) = @_; my @ancestors; push @ancestors, $tree while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); return $root ? $ancestors[-1] : @ancestors[0 .. $#ancestors - 1]; } sub _collect { my $self = shift; my $xml = $self->xml; return Mojo::Collection->new(@_) ->map(sub { $self->new->tree($_)->xml($xml) }); } sub _content { my $tree = shift->tree; return _text([_nodes($tree)], shift, _trim($tree, @_)); } sub _delegate { my ($self, $method) = (shift, shift); return $self->[0]->$method unless @_; $self->[0]->$method(@_); return $self; } sub _link { my ($children, $parent) = @_; # Link parent to children my @new; for my $n (@$children[1 .. $#$children]) { push @new, $n; next unless $n->[0] eq 'tag'; $n->[3] = $parent; weaken $n->[3]; } return @new; } sub _nodes { return unless my $n = shift; return @$n[_offset($n) .. $#$n]; } sub _offset { $_[0][0] eq 'root' ? 1 : 4 } sub _parent { my ($parent, $child) = @_; # Find parent offset for child my $i = _offset($parent); for my $n (@$parent[$i .. $#$parent]) { last if $n == $child; $i++; } return $i; } sub _parse { Mojo::DOM::HTML->new(xml => shift->xml)->parse(shift)->tree } sub _render { Mojo::DOM::HTML->new(tree => shift, xml => shift)->render } sub _replace { my ($self, $tree, $new) = @_; my $parent = $tree->[3]; splice @$parent, _parent($parent, $tree), 1, _link($new, $parent); return $self->parent; } sub _select { my ($self, $selector) = @_; return defined $selector ? $self->grep(sub { $_->match($selector) }) : $self; } sub _siblings { my ($self, $merge) = @_; return $merge ? [] : [[], []] unless my $parent = $self->parent; my $tree = $self->tree; my (@before, @after, $match); for my $child ($parent->children->each) { ++$match and next if $child->tree eq $tree; $match ? push @after, $child : push @before, $child; } return $merge ? [@before, @after] : [\@before, \@after]; } sub _text { my ($nodes, $recurse, $trim) = @_; # Merge successive text nodes my $i = 0; while (my $next = $nodes->[$i + 1]) { ++$i and next unless $nodes->[$i][0] eq 'text' && $next->[0] eq 'text'; splice @$nodes, $i, 2, ['text', $nodes->[$i][1] . $next->[1]]; } my $text = ''; for my $n (@$nodes) { my $type = $n->[0]; # Nested tag my $content = ''; if ($type eq 'tag' && $recurse) { $content = _text([_nodes($n)], 1, _trim($n, $trim)); } # Text elsif ($type eq 'text') { $content = $trim ? squish($n->[1]) : $n->[1] } # CDATA or raw text elsif ($type eq 'cdata' || $type eq 'raw') { $content = $n->[1] } # Add leading whitespace if punctuation allows it $content = " $content" if $text =~ /\S\z/ && $content =~ /^[^.!?,;:\s]+/; # Trim whitespace blocks $text .= $content if $content =~ /\S+/ || !$trim; } return $text; } sub _trim { my ($e, $trim) = @_; # Disabled return 0 unless $e && ($trim = defined $trim ? $trim : 1); # Detect "pre" tag while ($e->[0] eq 'tag') { return 0 if $e->[1] eq 'pre'; last unless $e = $e->[3]; } return 1; } 1; =encoding utf8 =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('
A
B
C
'); $dom->find(':not(p)')->strip; # Render say "$dom"; =head1 DESCRIPTION LHi!
'); say $dom->at('p')->text; say $dom->p->{id}; If XML processing instructions are found, the parser will automatically switch into XML mode and everything becomes case sensitive. my $dom = Mojo::DOM->new('Hi!
'); say $dom->at('P')->text; say $dom->P->{ID}; XML detection can also be disabled with the L"xml"> method. # Force XML semantics $dom->xml(1); # Force HTML semantics $dom->xml(0); =head1 METHODS Lbar
baz\nbar
baz\nHi!
'); Append HTML/XML fragment to element. # "Hi!
'); Append HTML/XML fragment to element content. # "Hi!
'); Prepend HTML/XML fragment to element. # "Hi!
'); Prepend HTML/XML fragment to element content. # "test
'); Replace element content with HTML/XML fragment. # "bar
baz\nbar
baz\nbar
baz\nbar
baz\nbar
baz\nbar
baz\n