From 6b7d90d88a723296817de735905965c1a41184bc Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Tue, 3 Jun 2008 23:52:56 -0400
Subject: [PATCH] fixed most of the xapian todos

---
 IkiWiki/Plugin/search.pm | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/IkiWiki/Plugin/search.pm b/IkiWiki/Plugin/search.pm
index fc77a7d2f..0d4a8f552 100644
--- a/IkiWiki/Plugin/search.pm
+++ b/IkiWiki/Plugin/search.pm
@@ -61,6 +61,7 @@ sub needsbuild ($) { #{{{
 	%toindex = map { pagename($_) => 1 } @{shift()};
 } #}}}
 
+my $scrubber;
 sub filter (@) { #{{{
 	my %params=@_;
 
@@ -77,10 +78,36 @@ sub filter (@) { #{{{
 			$title=IkiWiki::pagetitle($params{page});
 		}
 
+		# Remove any html from text to be indexed.
+		# TODO: This removes html that is in eg, a markdown pre,
+		# which should not be removed.
+		if (! defined $scrubber) {
+			eval q{use HTML::Scrubber};
+			error($@) if $@;
+			$scrubber=HTML::Scrubber->new(allow => []);
+		}
+		my $toindex=$scrubber->scrub($params{content});
+
+		# Take 512 characters for a sample, then extend it out
+		# if it stopped in the middle of a word.
+		# The extension is bounded by the length of the text, so
+		# a page that is short or ends mid-word cannot loop forever.
+		my $size=512;
+		my ($sample)=substr($toindex, 0, $size);
+		if (length($sample) == $size) {
+			my $max=length($toindex);
+			my $next;
+			while ($size < $max &&
+			       ($next=substr($toindex, $size++, 1)) !~ /\s/) {
+				$sample.=$next;
+			}
+		}
+		$sample=~s/\n/ /g;
+
 		# data used by omega
 		$doc->set_data(
 			"url=".urlto($params{page}, "")."\n".
-			"sample=\n". # TODO
+			"sample=$sample\n".
 			"caption=$title\n".
 			"modtime=$IkiWiki::pagemtime{$params{page}}\n".
 			"size=".length($params{content})."\n"
@@ -91,7 +118,7 @@ sub filter (@) { #{{{
 		$tg->set_document($doc);
 		$tg->index_text($params{page}, 2);
 		$tg->index_text($title, 2);
-		$tg->index_text($params{content}); # TODO html strip; preprocessor too
+		$tg->index_text($toindex);
 
 		my $pageterm=pageterm($params{page});
 		$doc->add_term($pageterm);
-- 
2.26.2