my @links;
foreach my $p (keys %links) {
next if bestlink($page, $p) eq $page;
+
if (grep { length $_ && bestlink($p, $_) eq $page } @{$links{$p}}) {
my $href=abs2rel(htmlpage($p), dirname($page));
return (stat($file))[9];
} #}}}
-sub findlinks ($$) { #{{{
- my $page=shift;
- my $content=shift;
+sub scan ($) { #{{{
+ my $file=shift;
- my @links;
- while ($content =~ /(?<!\\)$config{wiki_link_regexp}/g) {
- push @links, titlepage($2);
- }
- if ($config{discussion}) {
- # Discussion links are a special case since they're not in the
- # text of the page, but on its template.
- return @links, "$page/discussion";
- }
- else {
- return @links;
+ my $type=pagetype($file);
+ if (defined $type) {
+ my $srcfile=srcfile($file);
+ my $content=readfile($srcfile);
+ my $page=pagename($file);
+
+ my @links;
+ while ($content =~ /(?<!\\)$config{wiki_link_regexp}/g) {
+ push @links, titlepage($2);
+ }
+ if ($config{discussion}) {
+ # Discussion links are a special case since they're not in the
+ # text of the page, but on its template.
+ push @links, "$page/discussion";
+ }
+ $links{$page}=\@links;
}
} #}}}
will_render($page, htmlpage($page), 1);
$content=filter($page, $content);
-
- $links{$page}=[findlinks($page, $content)];
-
$content=preprocess($page, $page, $content);
$content=linkify($page, $page, $content);
$content=htmlize($page, $type, $content);
}
else {
my $content=readfile($srcfile, 1);
- $links{$file}=[];
delete $depends{$file};
will_render($file, $file, 1);
writefile($file, $config{destdir}, $content, 1);
foreach my $file (@files) {
my $page=pagename($file);
if (! $oldpagemtime{$page}) {
- debug("new page $page") unless exists $pagectime{$page};
push @add, $file;
- $links{$page}=[];
+ scan($file);
$pagecase{lc $page}=$page;
$pagesources{$page}=$file;
if ($config{getctime} && -e "$config{srcdir}/$file") {
if (! $exists{$page}) {
debug("removing old page $page");
push @del, $pagesources{$page};
+ $links{$page}=[];
$renderedfiles{$page}=[];
$oldpagemtime{$page}=0;
prune($config{destdir}."/".$_)
}
}
+ # scan updated files to update info about them
+ foreach my $file (@files) {
+ my $page=pagename($file);
+
+ if (! exists $oldpagemtime{$page} ||
+ mtime(srcfile($file)) > $oldpagemtime{$page} ||
+ $forcerebuild{$page}) {
+ debug("scanning $file");
+ scan($file);
+ }
+ }
+
# render any updated files
foreach my $file (@files) {
my $page=pagename($file);
}
# if any files were added or removed, check to see if each page
- # needs an update due to linking to them or inlining them.
- # TODO: inefficient; pages may get rendered above and again here;
- # problem is the bestlink may have changed and we won't know until
- # now
+ # needs an update due to linking to them or inlining them
if (@add || @del) {
FILE: foreach my $file (@files) {
+ next if $rendered{$file};
my $page=pagename($file);
foreach my $f (@add, @del) {
my $p=pagename($f);
# Handle backlinks; if a page has added/removed links, update the
	# pages it links to. Also handles rebuilding dependent pages.
- # TODO: inefficient; pages may get rendered above and again here;
- # problem is the backlinks could be wrong in the first pass render
- # above
if (%rendered || @del) {
foreach my $f (@files) {
+ next if $rendered{$f};
my $p=pagename($f);
if (exists $depends{$p}) {
foreach my $file (keys %rendered, @del) {
foreach my $link (keys %linkchanged) {
my $linkfile=$pagesources{$link};
if (defined $linkfile) {
+ next if $rendered{$linkfile};
debug("rendering $linkfile, to update its backlinks");
render($linkfile);
$rendered{$linkfile}=1;
-* Render each changed page only once. Currently pages are rendered up to 4
- times in worst case (8 times if there's an rss feed).
-
- The issue is that rendering a page is used to gather info like the links
- on the page (and other stuff) that can effect rendering other pages. So it
- needs a multi-pass system. But rendering the whole page in each pass is
- rather obscene.
-
- It would be better to have the first pass be a data gathering pass. Such
- a pass would still need to load and parse the page contents etc, but
- wouldn't need to generate html or write anything to disk.
-
- One problem with this idea is that it could turn into 2x the work in
- cases where ikiwiki currently efficiently renders a page just once. And
- caching between the passes to avoid that wouldn't do good things to the
- memory footprint.
-
- Might be best to just do a partial first pass, getting eg, the page links
- up-to-date, and then multiple, but generally fewer, rendering passes.
-
* Don't render blog archive pages unless a page is added/removed. Just
changing a page doesn't affect the archives as they show only the title.
* Look at splitting up CGI.pm. But note that too much splitting can slow
perl down.
+
+* The backlinks code turns out to scale badly to wikis with thousands of
+ pages. The code is O(N^2)! It's called for each page, and it loops
+ through all the pages to find backlinks.
+
+  Need to find a way to calculate and cache all the backlinks in one
+  pass (a rough sketch is below),
+ which could be done in at worst O(N), and possibly less (if they're
+ stored in the index, it could be constant time). But to do this, there
+ would need to be a way to invalidate or update the cache in these
+ situations:
+
+ - A page is added. Note that this can change a backlink to point to
+ the new page instead of the page it pointed to before.
+ - A page is deleted. This can also change backlinks that pointed to that
+ page.
+ - A page is modified. Links added/removed.
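+
+  A rough sketch of such a one-pass calculation, using the existing
+  %links hash and bestlink(). The %backlinks hash and the
+  calculate_backlinks() name are made up here just to illustrate the
+  idea; nothing like this exists in the code yet.
+
+      my %backlinks;
+      sub calculate_backlinks () { #{{{
+          %backlinks=();
+          # One walk over all the links in the wiki: resolve each link
+          # with bestlink() and record the linking page under the page
+          # it points to.
+          foreach my $page (keys %links) {
+              foreach my $link (@{$links{$page}}) {
+                  my $bestlink=bestlink($page, $link);
+                  if (length $bestlink && $bestlink ne $page) {
+                      $backlinks{$bestlink}{$page}=1;
+                  }
+              }
+          }
+      } #}}}
+
+  The loop in backlinks() could then iterate over keys
+  %{$backlinks{$page}} instead of scanning every page, and the cache
+  would be recalculated (or updated) whenever %links changes in the
+  situations listed above.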