package Mojo::DOM; use Mojo::Base -base; use overload '%{}' => sub { shift->attrs }, 'bool' => sub {1}, '""' => sub { shift->to_xml }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Carp 'croak'; use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Mojo::Util 'squish'; use Scalar::Util qw(blessed weaken); sub AUTOLOAD { my $self = shift; # Method my ($package, $method) = our $AUTOLOAD =~ /^([\w:]+)::(\w+)$/; croak "Undefined subroutine &${package}::$method called" unless blessed $self && $self->isa(__PACKAGE__); # Search children my $children = $self->children($method); return @$children > 1 ? $children : $children->[0] if @$children; croak qq{Can't locate object method "$method" via package "$package"}; } sub DESTROY { } sub new { my $class = shift; my $self = bless [Mojo::DOM::HTML->new], ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub all_text { my ($self, $trim) = @_; my $tree = $self->tree; return _text(_elements($tree), 1, _trim($tree, $trim)); } sub append { shift->_add(1, @_) } sub append_content { my ($self, $new) = @_; my $tree = $self->tree; push @$tree, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub at { shift->find(@_)->[0] } sub attrs { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] eq 'root' ? {} : $tree->[2]; return $attrs unless @_; # Get return defined $attrs->{$_[0]} ? $attrs->{$_[0]} : '' unless @_ > 1 || ref $_[0]; # Set %$attrs = (%$attrs, %{ref $_[0] ? $_[0] : {@_}}); return $self; } sub charset { shift->_html(charset => @_) } sub children { my ($self, $type) = @_; my @children; my $charset = $self->charset; my $xml = $self->xml; my $tree = $self->tree; for my $e (@$tree[($tree->[0] eq 'root' ? 1 : 4) .. $#$tree]) { # Make sure child is the right type next unless $e->[0] eq 'tag'; next if defined $type && $e->[1] ne $type; push @children, $self->new->charset($charset)->tree($e)->xml($xml); } return Mojo::Collection->new(@children); } sub content_xml { my $self = shift; # Render children my $tree = $self->tree; my $charset = $self->charset; my $xml = $self->xml; return join '', map { Mojo::DOM::HTML->new(charset => $charset, tree => $_, xml => $xml)->render } @$tree[($tree->[0] eq 'root' ? 1 : 4) .. $#$tree]; } sub find { my ($self, $selector) = @_; my $charset = $self->charset; my $xml = $self->xml; return Mojo::Collection->new( map { $self->new->charset($charset)->tree($_)->xml($xml) } @{Mojo::DOM::CSS->new(tree => $self->tree)->select($selector)}); } sub namespace { my $self = shift; # Extract namespace prefix and search parents return '' if (my $current = $self->tree)->[0] eq 'root'; my $ns = $current->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; while ($current) { last if $current->[0] eq 'root'; # Namespace for prefix my $attrs = $current->[2]; if ($ns) { /^\Q$ns\E$/ and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } # Parent $current = $current->[3]; } return ''; } sub next { shift->_sibling(1) } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->new->charset($self->charset)->tree($tree->[3]) ->xml($self->xml); } sub parse { my $self = shift; $self->[0]->parse(@_); return $self; } sub prepend { shift->_add(0, @_) } sub prepend_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, $tree->[0] eq 'root' ? 1 : 4, 0, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub previous { shift->_sibling(0) } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; # Parse my $tree = $self->tree; if ($tree->[0] eq 'root') { return $self->xml(undef)->parse($new) } else { $new = $self->_parse("$new") } # Find and replace my $parent = $tree->[3]; my $i = $parent->[0] eq 'root' ? 1 : 4; for my $e (@$parent[$i .. $#$parent]) { last if $e == $tree; $i++; } splice @$parent, $i, 1, @{_parent($new, $parent)}; return $self; } sub replace_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, $tree->[0] eq 'root' ? 1 : 4, $#$tree, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub root { my $self = shift; my $root = $self->tree; while ($root->[0] eq 'tag') { last unless my $parent = $root->[3]; $root = $parent; } return $self->new->charset($self->charset)->tree($root)->xml($self->xml); } sub text { my ($self, $trim) = @_; my $tree = $self->tree; return _text(_elements($tree), 0, _trim($tree, $trim)); } sub text_after { my ($self, $trim) = @_; # Find following text elements return '' if (my $tree = $self->tree)->[0] eq 'root'; my (@elements, $started); for my $e (@{_elements($tree->[3])}) { ++$started and next if $e eq $tree; next unless $started; last if $e->[0] eq 'tag'; push @elements, $e; } return _text(\@elements, 0, _trim($tree->[3], $trim)); } sub text_before { my ($self, $trim) = @_; # Find preceding text elements return '' if (my $tree = $self->tree)->[0] eq 'root'; my @elements; for my $e (@{_elements($tree->[3])}) { last if $e eq $tree; push @elements, $e; @elements = () if $e->[0] eq 'tag'; } return _text(\@elements, 0, _trim($tree->[3], $trim)); } sub to_xml { shift->[0]->render } sub tree { shift->_html(tree => @_) } sub type { my ($self, $type) = @_; # Get return '' if (my $tree = $self->tree)->[0] eq 'root'; return $tree->[1] unless $type; # Set $tree->[1] = $type; return $self; } sub xml { shift->_html(xml => @_) } sub _add { my ($self, $offset, $new) = @_; # Not a tag return $self if (my $tree = $self->tree)->[0] eq 'root'; # Find parent my $parent = $tree->[3]; my $i = $parent->[0] eq 'root' ? 1 : 4; for my $e (@$parent[$i .. $#$parent]) { last if $e == $tree; $i++; } # Add children splice @$parent, $i + $offset, 0, @{_parent($self->_parse("$new"), $parent)}; return $self; } sub _elements { return [] unless my $e = shift; return [@$e[($e->[0] eq 'root' ? 1 : 4) .. $#$e]]; } sub _html { my ($self, $method) = (shift, shift); return $self->[0]->$method unless @_; $self->[0]->$method(@_); return $self; } sub _parent { my ($children, $parent) = @_; # Link parent to children my @new; for my $e (@$children[1 .. $#$children]) { if ($e->[0] eq 'tag') { $e->[3] = $parent; weaken $e->[3]; } push @new, $e; } return \@new; } sub _parse { my $self = shift; Mojo::DOM::HTML->new(charset => $self->charset, xml => $self->xml) ->parse(shift)->tree; } sub _sibling { my ($self, $next) = @_; # Make sure we have a parent return undef unless my $parent = $self->parent; # Find previous or next sibling my ($previous, $current); for my $child ($parent->children->each) { ++$current and next if $child->tree eq $self->tree; return $next ? $child : $previous if $current; $previous = $child; } # No siblings return undef; } sub _text { my ($elements, $recurse, $trim) = @_; my $text = ''; for my $e (@$elements) { my $type = $e->[0]; # Nested tag my $content = ''; if ($type eq 'tag' && $recurse) { $content = _text(_elements($e), 1, _trim($e, $trim)); } # Text elsif ($type eq 'text') { $content = $trim ? squish($e->[1]) : $e->[1] } # CDATA or raw text elsif ($type eq 'cdata' || $type eq 'raw') { $content = $e->[1] } # Add leading whitespace if punctuation allows it $content = " $content" if $text =~ /\S\z/ && $content =~ /^[^.!?,;:\s]+/; # Trim whitespace blocks $text .= $content if $content =~ /\S+/ || !$trim; } return $text; } sub _trim { my ($e, $trim) = @_; # Disabled return 0 unless $e && ($trim = defined $trim ? $trim : 1); # Detect "pre" tag while ($e->[0] eq 'tag') { return 0 if $e->[1] eq 'pre'; last unless $e = $e->[3]; } return 1; } 1; =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('
A
B
C
'); # Render say $dom; =head1 DESCRIPTION LHi!
'); say $dom->at('p')->text; say $dom->p->{id}; If XML processing instructions are found, the parser will automatically switch into XML mode and everything becomes case sensitive. my $dom = Mojo::DOM->new('Hi!
'); say $dom->at('P')->text; say $dom->P->{ID}; XML detection can also be disabled with the Cbar
baz\nbar
baz\nHi!
'); Append to element. # "Hi!
'); Append to element content. # "Hi!
'); Prepend to element. # "Hi!
'); Prepend to element content. # "bar
baz\nbar
baz\nbar
baz\nbar
baz\nbar
baz\nbar
baz\n