Return-Path: X-Original-To: notmuch@notmuchmail.org Delivered-To: notmuch@notmuchmail.org Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id 46F99431FB6 for ; Wed, 20 Feb 2013 17:29:44 -0800 (PST) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: 0 X-Spam-Level: X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id eid1EeS5z6Da for ; Wed, 20 Feb 2013 17:29:43 -0800 (PST) Received: from tesseract.cs.unb.ca (tesseract.cs.unb.ca [131.202.240.238]) (using TLSv1 with cipher DHE-RSA-AES128-SHA (128/128 bits)) (No client certificate requested) by olra.theworths.org (Postfix) with ESMTPS id 346DE431FAE for ; Wed, 20 Feb 2013 17:29:43 -0800 (PST) Received: from fctnnbsc30w-156034082078.dhcp-dynamic.fibreop.nb.bellaliant.net ([156.34.82.78] helo=zancas.localnet) by tesseract.cs.unb.ca with esmtpsa (TLS1.2:DHE_RSA_AES_128_CBC_SHA1:128) (Exim 4.80) (envelope-from ) id 1U8Kyg-0000v5-74; Wed, 20 Feb 2013 21:29:38 -0400 Received: from bremner by zancas.localnet with local (Exim 4.80) (envelope-from ) id 1U8KyZ-0007UO-1d; Wed, 20 Feb 2013 21:29:31 -0400 From: David Bremner To: notmuch mailing list Subject: Re: On disk tag storage format In-Reply-To: <874nk8v9zw.fsf@zancas.localnet> References: <874nk8v9zw.fsf@zancas.localnet> User-Agent: Notmuch/0.15.2+32~g16aa65b (http://notmuchmail.org) Emacs/24.2.1 (x86_64-pc-linux-gnu) Date: Wed, 20 Feb 2013 21:29:30 -0400 Message-ID: <87vc9mtpxh.fsf@zancas.localnet> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="=-=-=" X-Spam_bar: - X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." 
List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 21 Feb 2013 01:29:44 -0000 --=-=-= Content-Type: text/plain David Bremner writes: > Austin outlined on IRC a way of representing tags on disk as hardlinks > to messages. In order to make the discussion more concrete, I wrote a > prototype in python to dump the notmuch database to this format. On my > 250k messages, this creates 40k new hardlinks, and uses about 5M of > diskspace. The dump process takes about 20s on > my core i7 machine. With symbolic links, the same database takes about > 150M of disk space; this isn't great but it isn't unbearable either. > I've been playing a bit with this script and it seems more or less usable as a way of mirroring the notmuch tag database to a link farm. It's a bit faster than my current dump/restore based approach, although if you want to keep the results in a git repository then it takes up more space. Of course the bonus with this approach is that it creates "virtual" maildirs for each tag that can be browsed with the maildir client of choice. The current default is to use some mix of hard and symbolic links to try to balance the space consumed in a git repo versus the inode consumption/performance issues of using too many symlinks. It's still a prototype, and there is not much error checking, and there are certain issues not dealt with at all (the ones I thought about are commented). --=-=-= Content-Type: text/x-python Content-Disposition: inline; filename=linksync.py # Copyright 2013, David Bremner # Licensed under the same terms as notmuch. 
"""Mirror the notmuch tag database to or from a farm of maildir links.

Every tag becomes a maildir ``<tagroot>/<encoded tag>/{cur,new,tmp}`` and
every tagged message a hard or symbolic link in ``cur`` whose name is the
encoded message id followed by the maildir flags of the original file.
"""

import argparse
import binascii
import errno
import os
import re
import sys
from collections import defaultdict

# Automatic and maildir-flag tags; these are never mirrored to the link farm.
# ("attachment" is the spelling notmuch uses for its automatic tag.)
skiptags = re.compile(r"^(attachment|signed|encrypted|draft|flagged|passed|replied|unread)$")


def mkdir_p(path):
    """Create *path* and missing parents; an existing directory is not an error."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already exists and is a directory" is benign.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


# Characters stored verbatim in file and directory names.  Everything else,
# including '%' itself, '/' and ':', is percent-encoded.
CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-'

encode_re = re.compile('([^{0}])'.format(CHARSET))
# A maximal run of %xx escapes; decoded as one unit so that multi-byte
# UTF-8 sequences are reassembled.
decode_re = re.compile('((?:%[0-9A-Fa-f]{2})+)')


def encode_one_char(match):
    """Return the %xx escapes for the UTF-8 bytes of the matched character."""
    data = bytearray(match.group(1).encode('utf-8'))
    return ''.join('%{0:02x}'.format(byte) for byte in data)


def encode_for_fs(name):
    """Percent-encode every character of *name* that is not in CHARSET."""
    return encode_re.sub(encode_one_char, name)


def decode_one_char(match):
    """Decode one matched run of %xx escapes back to text."""
    hexdigits = match.group(1).replace('%', '')
    raw = binascii.unhexlify(hexdigits.encode('ascii'))
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Names written by earlier versions of this script stored the code
        # point of a single character (e.g. %e9), not its UTF-8 bytes.
        return raw.decode('latin-1')


def decode_from_fs(name):
    """Inverse of encode_for_fs()."""
    return decode_re.sub(decode_one_char, name)


def mk_tag_dir(tagdir):
    """Create the cur/new/tmp maildir skeleton below *tagdir*."""
    for subdir in ('cur', 'new', 'tmp'):
        mkdir_p(os.path.join(tagdir, subdir))


flagpart = '(:2,[^:]*)'
flagre = re.compile(flagpart + '$')


def path_for_msg(tagdir, msg):
    """Return the link path for *msg* inside the maildir *tagdir*.

    The maildir flag suffix (":2,...") of the message's own filename, if it
    has one, is carried over to the link name.
    """
    flagsmatch = flagre.search(msg.get_filename())
    flags = '' if flagsmatch is None else flagsmatch.group(1)
    return os.path.join(tagdir, 'cur',
                        encode_for_fs(msg.get_message_id()) + flags)


def link_name_re(message_id):
    """Return a regex matching link names for *message_id*, with any flags.

    The encoded id is escaped because CHARSET contains regex
    metacharacters ('+' and '.') that are common in message ids.
    """
    return re.compile(re.escape(encode_for_fs(message_id)) + flagpart + '?$')


def unlink_message(tagdir, msg):
    """Remove every link to *msg* from the 'cur' directory of *tagdir*."""
    curdir = os.path.join(tagdir, 'cur')
    name_re = link_name_re(msg.get_message_id())
    for entry in os.listdir(curdir):
        if name_re.match(entry):
            os.unlink(os.path.join(curdir, entry))


# Root of the link farm; set by main() from the command line.
tagroot = None


def dir_for_tag(tag):
    """Return the maildir that holds the links for *tag* below tagroot."""
    return os.path.join(tagroot, encode_for_fs(tag))


# message id -> set of tags found on disk
disk_tags = defaultdict(set)
# message ids found on disk that have not (yet) been seen in the database
disk_ids = set()

MAILDIR_SUBDIRS = ('cur', 'new', 'tmp')


def iter_link_files(rootdir):
    """Yield (tag directory name, file path) for each link below *rootdir*.

    Only non-directory entries located at <rootdir>/<tag>/<cur|new|tmp>/ are
    reported, so unrelated content kept under the root (for instance a
    .git directory) is neither read as tags nor removed as stale.
    A missing *rootdir* yields nothing.
    """
    if not os.path.isdir(rootdir):
        return
    for tag_dirname in os.listdir(rootdir):
        for subdir in MAILDIR_SUBDIRS:
            dirpath = os.path.join(rootdir, tag_dirname, subdir)
            if not os.path.isdir(dirpath):
                continue
            for filename in os.listdir(dirpath):
                path = os.path.join(dirpath, filename)
                if not os.path.isdir(path):
                    yield tag_dirname, path


def read_tags_from_disk(rootdir):
    """Fill disk_ids and disk_tags from the link farm below *rootdir*."""
    for tag_dirname, path in iter_link_files(rootdir):
        # ':' is always percent-encoded in ids, so the first ':' starts the
        # maildir flag suffix.
        msg_id = os.path.basename(path).split(':')[0]
        decoded_id = decode_from_fs(msg_id)
        disk_ids.add(decoded_id)
        disk_tags[decoded_id].add(decode_from_fs(tag_dirname))


def remove_stale_links(rootdir):
    """Unlink every link whose message id is still listed in disk_ids."""
    for _tag_dirname, path in iter_link_files(rootdir):
        msg_id = os.path.basename(path).split(':')[0]
        if decode_from_fs(msg_id) in disk_ids:
            os.unlink(path)


def build_parser():
    """Return the command line parser."""
    parser = argparse.ArgumentParser(
        description='Sync notmuch tag database to/from link farm')
    parser.add_argument('-l', '--link-style',
                        choices=['hard', 'symbolic', 'adaptive'],
                        default='adaptive', dest='link_style')
    parser.add_argument('-d', '--destination', choices=['disk', 'notmuch'],
                        default='disk', dest='destination')
    # Plain int: the former "50000L" literal is a syntax error on Python 3.
    parser.add_argument('-t', '--threshold', default=50000, type=int,
                        dest='threshold')
    parser.add_argument('tagroot')
    return parser


def sync_message(msg, opts, sync_from_links):
    """Reconcile the tags of one database message with the link farm.

    With *sync_from_links* the links are authoritative and the database
    message is retagged; otherwise links are created and removed to match
    the database.
    """
    # silently ignore empty tags
    db_tags = set(tag for tag in msg.get_tags()
                  if tag != '' and not skiptags.match(tag))

    message_id = msg.get_message_id()
    disk_ids.discard(message_id)

    on_disk = disk_tags[message_id]
    missing_on_disk = db_tags - on_disk
    missing_in_db = on_disk - db_tags

    if sync_from_links:
        msg.freeze()
        try:
            for tag in missing_on_disk:
                msg.remove_tag(tag, sync_maildir_flags=False)
            for tag in missing_in_db:
                msg.add_tag(tag, sync_maildir_flags=False)
        finally:
            msg.thaw()
        return

    if missing_on_disk:
        filename = msg.get_filename()
        if opts.link_style == 'adaptive':
            # Files larger than the threshold get a symlink, others a hardlink.
            symlink = os.stat(filename).st_size > opts.threshold
        else:
            symlink = (opts.link_style == 'symbolic')
        make_link = os.symlink if symlink else os.link

        for tag in missing_on_disk:
            tagdir = dir_for_tag(tag)
            mk_tag_dir(tagdir)
            make_link(filename, path_for_msg(tagdir, msg))

    for tag in missing_in_db:
        unlink_message(dir_for_tag(tag), msg)


def main(argv=None):
    """Run the sync described by the command line *argv* (default sys.argv)."""
    global tagroot

    # Imported here so the helpers above stay usable without the bindings.
    import notmuch

    opts = build_parser().parse_args(argv)
    tagroot = opts.tagroot
    sync_from_links = (opts.destination == 'notmuch')

    read_tags_from_disk(tagroot)

    if sync_from_links:
        mode = notmuch.Database.MODE.READ_WRITE
    else:
        mode = notmuch.Database.MODE.READ_ONLY
    db = notmuch.Database(mode=mode)

    try:
        dbtags = [tag for tag in db.get_all_tags() if not skiptags.match(tag)]
        querystr = ' OR '.join('tag:' + tag for tag in dbtags)

        query = notmuch.Query(db, querystr)
        query.set_sort(notmuch.Query.SORT.UNSORTED)
        for msg in query.search_messages():
            sync_message(msg, opts, sync_from_links)

        # everything remaining in disk_ids is a deleted message
        # unless we are syncing back to the database, in which case
        # it just might not currently have any non maildir tags.
        # NOTE(review): in that second case the message is not visited by the
        # query above, so its on-disk tags are not added to the database.
        if not sync_from_links:
            remove_stale_links(tagroot)
    finally:
        db.close()

    # currently empty directories are not pruned.


if __name__ == '__main__':
    sys.exit(main())