From e8b39b094116e8b50cf12fe56b9c6a04f05683e5 Mon Sep 17 00:00:00 2001 From: joey Date: Thu, 3 Aug 2006 21:50:47 +0000 Subject: [PATCH] * Try to handle relative links in aggregated feeds. However, the current support is a crude hack due to limitations of XML::Feed: xml:base is not supported, neither is Content-Location. And of course, relative links in RSS feeds are ill-undefined.. --- IkiWiki/Plugin/aggregate.pm | 63 ++++++++++++++++++++++++++++++++++--- debian/changelog | 9 ++++++ doc/todo/aggregation.mdwn | 3 -- 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/IkiWiki/Plugin/aggregate.pm b/IkiWiki/Plugin/aggregate.pm index 16db42a8f..d6592fef3 100644 --- a/IkiWiki/Plugin/aggregate.pm +++ b/IkiWiki/Plugin/aggregate.pm @@ -6,6 +6,9 @@ use warnings; use strict; use IkiWiki; use HTML::Entities; +use HTML::Parser; +use HTML::Tagset; +use URI; my %feeds; my %guids; @@ -283,15 +286,13 @@ sub add_page (@) { #{{{ # Create the page. my $template=IkiWiki::template("aggregatepost.tmpl", blind_cache => 1); - my $content=$params{content}; - $params{content}=~s/(?<!\\)\[\[/\\\[\[/g; - $template->param(title => $params{title}) if defined $params{title} && length($params{title}); - $template->param(content => $params{content}); + $template->param(content => htmlescape(htmlabs($params{content}, $feed->{feedurl}))); $template->param(url => $feed->{url}); $template->param(name => $feed->{name}); - $template->param(link => $params{link}) if defined $params{link}; + $template->param(link => urlabs($params{link}, $feed->{feedurl})) + if defined $params{link}; if (ref $feed->{tags}) { $template->param(tags => [map { tag => $_ }, @{$feed->{tags}}]); } @@ -303,6 +304,58 @@ sub add_page (@) { #{{{ utime $mtime, $mtime, pagefile($guid->{page}) if defined $mtime; } #}}} +sub htmlescape ($) { #{{{ + # escape accidental wikilinks and preprocessor stuff + my $html=shift; + $html=~s/(?<!\\)\[\[/\\\[\[/g; + return $html; +} #}}} + +sub urlabs ($$) { #{{{ + my $url=shift; + my $urlbase=shift; + + URI->new_abs($url, $urlbase)->as_string; +} #}}} + +sub htmlabs ($$) { #{{{ + # Convert links in html from relative to absolute.
+ # Note that this is a heuristic, which is not specified by the rss + # spec and may not be right for all feeds. Also, see Debian + # bug #XXXX TODO: get bug. + my $html=shift; + my $urlbase=shift; + + my $ret=""; + my $p = HTML::Parser->new(api_version => 3); + $p->handler(default => sub { $ret.=join("", @_) }, "text"); + $p->handler(start => sub { + my ($tagname, $pos, $text) = @_; + if (ref $HTML::Tagset::linkElements{$tagname}) { + while (4 <= @$pos) { + # use attribute sets from right to left + # to avoid invalidating the offsets + # when replacing the values + my($k_offset, $k_len, $v_offset, $v_len) = + splice(@$pos, -4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}}; + next unless $v_offset; # 0 v_offset means no value + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + my $new_v=urlabs($v, $urlbase); + $new_v =~ s/\"/&quot;/g; # since we quote with "" + substr($text, $v_offset, $v_len) = qq("$new_v"); + } + } + $ret.=$text; + }, "tagname, tokenpos, text"); + $p->parse($html); + $p->eof; + + return $ret; +} #}}} + sub remove_feeds () { #{{{ my $page=shift; diff --git a/debian/changelog b/debian/changelog index aacdbe52f..ab053e496 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,12 @@ +ikiwiki (1.16) UNRELEASED; urgency=low + + * Try to handle relative links in aggregated feeds. However, + the current support is a crude hack due to limitations of XML::Feed: + xml:base is not supported, neither is Content-Location. And of course, + relative links in RSS feeds are ill-undefined.. + + -- Joey Hess Thu, 3 Aug 2006 17:29:51 -0400 + ikiwiki (1.15) unstable; urgency=low * Remove CDPATH and other env vars perl taint checking doesn't like.
diff --git a/doc/todo/aggregation.mdwn b/doc/todo/aggregation.mdwn index 5abb6a53e..dec242ea6 100644 --- a/doc/todo/aggregation.mdwn +++ b/doc/todo/aggregation.mdwn @@ -2,6 +2,3 @@ * Need to store page author metadata and include it in the rss feed. Permalink to? Also, that stuff could be presented better in the html blog view, also using the metadata. -* Some rss feeds contain relative links or relative urls to inline images, - which break when aggregated. Do I need to parse the html and make them - all absolute? -- 2.26.2