htmlbalance: new plugin that balances tags by parsing and re-serializing
author Simon McVittie <smcv@ http://smcv.pseudorandom.co.uk/>
Sun, 16 Nov 2008 18:11:39 +0000 (18:11 +0000)
committer Simon McVittie <smcv@ http://smcv.pseudorandom.co.uk/>
Mon, 17 Nov 2008 10:46:21 +0000 (10:46 +0000)
IkiWiki/Plugin/htmlbalance.pm [new file with mode: 0644]
doc/plugins/aggregate.mdwn
doc/plugins/htmlbalance.mdwn [new file with mode: 0644]
doc/plugins/htmltidy.mdwn
t/htmlbalance.t [new file with mode: 0755]

diff --git a/IkiWiki/Plugin/htmlbalance.pm b/IkiWiki/Plugin/htmlbalance.pm
new file mode 100644 (file)
index 0000000..667d73b
--- /dev/null
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+package IkiWiki::Plugin::htmlbalance;
+
+# htmlbalance: Parse and re-serialize HTML to ensure balanced tags
+#
+# Copyright 2008 Simon McVittie <http://smcv.pseudorandom.co.uk/>
+# Licensed under the GNU GPL, version 2, or any later version published by the
+# Free Software Foundation
+
+use warnings;
+use strict;
+use IkiWiki 2.00;
+
+sub import { #{{{
+       hook(type => "getsetup", id => "htmlbalance", call => \&getsetup); # getsetup() returns the plugin's setup entry
+       hook(type => "sanitize", id => "htmlbalance", call => \&sanitize); # sanitize() re-serializes page HTML
+} # }}}
+
+sub getsetup () { #{{{
+       # Returns a single "plugin" entry whose hashref sets safe to 1 and
+       # leaves rebuild undefined; a fresh hash is built on every call.
+       my %about = (safe => 1, rebuild => undef);
+
+       return (plugin => \%about);
+} #}}}
+
+sub sanitize (@) { #{{{
+       # Sanitize hook: returns $params{content} re-serialized with balanced tags.
+       my %params=@_;
+       my $ret = '';
+       # "use" in an eval BLOCK runs at compile time, so a missing module
+       # was never trapped; a string eval defers loading and does set $@.
+       eval q{
+               use HTML::TreeBuilder;
+               use XML::Atom::Util qw(encode_xml);
+       };
+       if ($@) {
+               error($@);
+               return $params{content}; # only reached if error() returns
+       }
+       my $tree = HTML::TreeBuilder->new_from_content($params{content});
+       my @nodes = $tree->disembowel(); # top-level content, detached from the tree
+       foreach my $node (@nodes) {
+               if (ref $node) {
+                       $ret .= $node->as_XML();
+                       chomp $ret; # strip the trailing newline left by as_XML()
+                       $node->delete();
+               }
+               else {
+                       $ret .= encode_xml($node); # plain text node: escape it
+               }
+       }
+       $tree->delete();
+       return $ret;
+} # }}}
+
+1
index c40a6dc22e977dc98a83836fa85ade23e03ec5c3..6fc87853b44141fe33725bad2f881cc0395e856f 100644 (file)
@@ -9,9 +9,9 @@ New users of aggregate should enable the `aggregateinternal => 1` option in the
 .setup file. If you don't do so, you will need to enable the [[html]] plugin
 as well as aggregate itself, since feed entries will be stored as HTML.
 
-The [[meta]] and [[tag]] plugins are also recommended. The
-[[htmltidy]] plugin is suggested, since feeds can easily contain html
-problems, some of which tidy can fix.
+The [[meta]] and [[tag]] plugins are also recommended. Either the
+[[htmltidy]] or [[htmlbalance]] plugin is suggested, since feeds can easily
+contain html problems, some of which these plugins can fix.
 
 You will need to run ikiwiki periodically from a cron job, passing it the
 --aggregate parameter, to make it check for new posts. Here's an example
diff --git a/doc/plugins/htmlbalance.mdwn b/doc/plugins/htmlbalance.mdwn
new file mode 100644 (file)
index 0000000..7cdb1f9
--- /dev/null
@@ -0,0 +1,9 @@
+[[!template id=plugin name=htmlbalance author="Simon McVittie"]]
+[[!tag type/html]]
+
+This plugin ensures that the HTML emitted by ikiwiki contains well-balanced
+HTML tags, by parsing it with HTML::TreeBuilder and re-serializing it. This
+acts as a lighter-weight alternative to [[plugins/htmltidy]]; it doesn't
+ensure validity, but it does at least ensure that formatting from a
+blog post pulled in by \[[![[ikiwiki/directive/inline]]]] doesn't
+leak into the rest of the page.
index f675a01aed5d0a482622077f374d0251d602dbb7..580e56f596d690e42ce10467329f05e1b07354d5 100644 (file)
@@ -7,4 +7,5 @@ emitted by ikiwiki. Besides being nicely formatted, this helps ensure that
 even if users enter suboptimal html, your wiki generates valid html.
 
 Note that since tidy is an external program, that is run each time a page
-is built, this plugin will slow ikiwiki down somewhat.
+is built, this plugin will slow ikiwiki down somewhat. [[plugins/htmlbalance]]
+might provide a faster alternative.
diff --git a/t/htmlbalance.t b/t/htmlbalance.t
new file mode 100755 (executable)
index 0000000..cd124e4
--- /dev/null
@@ -0,0 +1,13 @@
+#!/usr/bin/perl
+use warnings;
+use strict;
+use Test::More tests => 7;
+
+BEGIN { use_ok("IkiWiki::Plugin::htmlbalance"); }
+# Each case feeds unbalanced or malformed HTML through the sanitize hook.
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "<br></br>"), "<br />", "redundant </br> collapses into one empty element");
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "<div><p b=\"c\">hello world</div>"), "<div><p b=\"c\">hello world</p></div>", "unclosed <p> is closed before </div>");
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "<a></a></a>"), "<a></a>", "stray closing tag is dropped");
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "<b>foo <a</b>"), "<b>foo </b>", "malformed open tag is dropped and <b> is closed");
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "<b> foo <a</a></b>"), "<b> foo </b>", "malformed tag pair is dropped, leading space kept");
+is(IkiWiki::Plugin::htmlbalance::sanitize(content => "a>"), "a&gt;", "bare > in text is escaped");