inline: Improve RSS url munging to use a proper html parser
author Joey Hess <joey@kitenet.net>
Tue, 16 Nov 2010 20:57:50 +0000 (16:57 -0400)
committer Joey Hess <joey@kitenet.net>
Tue, 16 Nov 2010 20:57:50 +0000 (16:57 -0400)
and support all elements that HTML::Tagset knows about.

(Which doesn't include html5 just yet, but then the old version didn't either.)

Bonus: 4 times faster than old regexp method.

IkiWiki/Plugin/inline.pm
debian/changelog

index 3b98bf8dd67b116fa9524043dbdb1eb269f9af95..1fe40a5eab4235f82c46cbd0499032c84103e091 100644 (file)
@@ -506,26 +506,59 @@ sub date_822 ($) {
 }
 
 sub absolute_urls ($$) {
-       # sucky sub because rss sucks
-       my $content=shift;
+       # needed because rss sucks
+       my $html=shift;
        my $baseurl=shift;
 
        my $url=$baseurl;
        $url=~s/[^\/]+$//;
+       my $urltop; # calculated if needed
 
-       # what is the non path part of the url?
-       my $top_uri = URI->new($url);
-       $top_uri->path_query(""); # reset the path
-       my $urltop = $top_uri->as_string;
-
-       $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(#[^"]+)"/$1 href="$baseurl$2"/mig;
-       # relative to another wiki page
-       $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)([^\/][^"]*)"/$1 href="$url$2"/mig;
-       $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)([^\/][^"]*)"/$1 src="$url$2"/mig;
-       # relative to the top of the site
-       $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)(\/[^"]*)"/$1 href="$urltop$2"/mig;
-       $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)(\/[^"]*)"/$1 src="$urltop$2"/mig;
-       return $content;
+       my $ret="";
+
+       eval q{use HTML::Parser; use HTML::Tagset};
+       die $@ if $@;
+       my $p = HTML::Parser->new(api_version => 3);
+       $p->handler(default => sub { $ret.=join("", @_) }, "text");
+       $p->handler(start => sub {
+               my ($tagname, $pos, $text) = @_;
+               if (ref $HTML::Tagset::linkElements{$tagname}) {
+                       while (4 <= @$pos) {
+                               # use attribute sets from right to left
+                               # to avoid invalidating the offsets
+                               # when replacing the values
+                               my ($k_offset, $k_len, $v_offset, $v_len) =
+                                       splice(@$pos, -4);
+                               my $attrname = lc(substr($text, $k_offset, $k_len));
+                               next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
+                               next unless $v_offset; # 0 v_offset means no value
+                               my $v = substr($text, $v_offset, $v_len);
+                               $v =~ s/^([\'\"])(.*)\1$/$2/;
+                               if ($v=~/^#/) {
+                                       $v=$baseurl.$v; # anchor
+                               }
+                               elsif ($v=~/^(?!\w+:)[^\/]/) {
+                                       $v=$url.$v; # relative url
+                               }
+                               elsif ($v=~/^\//) {
+                                       if (! defined $urltop) {
+                                               # what is the non path part of the url?
+                                               my $top_uri = URI->new($url);
+                                               $top_uri->path_query(""); # reset the path
+                                               $urltop = $top_uri->as_string;
+                                       }
+                                       $v=$urltop.$v; # url relative to top of site
+                               }
+                               $v =~ s/\"/&quot;/g; # since we quote with ""
+                               substr($text, $v_offset, $v_len) = qq("$v");
+                       }
+               }
+               $ret.=$text;
+       }, "tagname, tokenpos, text");
+       $p->parse($html);
+       $p->eof;
+
+       return $ret;
 }
 
 sub genfeed ($$$$$@) {
index 00c32e95d109857f6604fe1185319e6a4cb93409..0edb780046983fdca7ac9a5ed9ee4a5b3c187bef 100644 (file)
@@ -4,6 +4,10 @@ ikiwiki (3.20101113) UNRELEASED; urgency=low
   * more: Add pages parameter to limit where the more is displayed.
     (thanks, dark)
   * Fix escaping of filenames in historyurl. (Thanks, aj)
+  * inline: Improve RSS url munging to use a proper html parser,
+    and support all elements that HTML::Tagset knows about. 
+    (Which doesn't include html5 just yet, but then the old version
+    didn't either.) Bonus: 4 times faster than old regexp method.
 
  -- Joey Hess <joeyh@debian.org>  Tue, 16 Nov 2010 14:23:47 -0400