From b00c6c9640453bf1407c4e880ef0c171388197c7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 16 Nov 2010 16:57:50 -0400 Subject: [PATCH] inline: Improve RSS url munging to use a proper html parser and support all elements that HTML::Tagset knows about. (Which doesn't include html5 just yet, but then the old version didn't either.) Bonus: 4 times faster than old regexp method. --- IkiWiki/Plugin/inline.pm | 63 ++++++++++++++++++++++++++++++---------- debian/changelog | 4 +++ 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/IkiWiki/Plugin/inline.pm b/IkiWiki/Plugin/inline.pm index 3b98bf8dd..1fe40a5ea 100644 --- a/IkiWiki/Plugin/inline.pm +++ b/IkiWiki/Plugin/inline.pm @@ -506,26 +506,59 @@ sub date_822 ($) { } sub absolute_urls ($$) { - # sucky sub because rss sucks - my $content=shift; + # needed because rss sucks + my $html=shift; my $baseurl=shift; my $url=$baseurl; $url=~s/[^\/]+$//; + my $urltop; # calculated if needed - # what is the non path part of the url? - my $top_uri = URI->new($url); - $top_uri->path_query(""); # reset the path - my $urltop = $top_uri->as_string; - - $content=~s/(new(api_version => 3); + $p->handler(default => sub { $ret.=join("", @_) }, "text"); + $p->handler(start => sub { + my ($tagname, $pos, $text) = @_; + if (ref $HTML::Tagset::linkElements{$tagname}) { + while (4 <= @$pos) { + # use attribute sets from right to left + # to avoid invalidating the offsets + # when replacing the values + my ($k_offset, $k_len, $v_offset, $v_len) = + splice(@$pos, -4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}}; + next unless $v_offset; # 0 v_offset means no value + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + if ($v=~/^#/) { + $v=$baseurl.$v; # anchor + } + elsif ($v=~/^(?!\w+:)[^\/]/) { + $v=$url.$v; # relative url + } + elsif ($v=~/^\//) { + if (! defined $urltop) { + # what is the non path part of the url? 
+ my $top_uri = URI->new($url); + $top_uri->path_query(""); # reset the path + $urltop = $top_uri->as_string; + } + $v=$urltop.$v; # url relative to top of site + } + $v =~ s/\"/&quot;/g; # since we quote with "" + substr($text, $v_offset, $v_len) = qq("$v"); + } + } + $ret.=$text; + }, "tagname, tokenpos, text"); + $p->parse($html); + $p->eof; + + return $ret; } sub genfeed ($$$$$@) { diff --git a/debian/changelog b/debian/changelog index 00c32e95d..0edb78004 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,10 @@ ikiwiki (3.20101113) UNRELEASED; urgency=low * more: Add pages parameter to limit where the more is displayed. (thanks, dark) * Fix escaping of filenames in historyurl. (Thanks, aj) + * inline: Improve RSS url munging to use a proper html parser, + and support all elements that HTML::Tagset knows about. + (Which doesn't include html5 just yet, but then the old version + didn't either.) Bonus: 4 times faster than old regexp method. -- Joey Hess Tue, 16 Nov 2010 14:23:47 -0400 -- 2.26.2