Return-Path: X-Original-To: notmuch@notmuchmail.org Delivered-To: notmuch@notmuchmail.org Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id 4A161431FBD for ; Fri, 4 Oct 2013 18:28:21 -0700 (PDT) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: -0.799 X-Spam-Level: X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5 tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id rdQfzwQdBW4y for ; Fri, 4 Oct 2013 18:28:16 -0700 (PDT) Received: from mail-qa0-f44.google.com (mail-qa0-f44.google.com [209.85.216.44]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) (No client certificate requested) by olra.theworths.org (Postfix) with ESMTPS id 52FBE431FAF for ; Fri, 4 Oct 2013 18:28:16 -0700 (PDT) Received: by mail-qa0-f44.google.com with SMTP id j7so1607348qaq.10 for ; Fri, 04 Oct 2013 18:28:13 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=from:to:subject:in-reply-to:references:user-agent:date:message-id :mime-version:content-type; bh=yHoDnFNlw3o30xsgKwGZhEWp0L6xhXNl363yAv1oSYQ=; b=OtDmvQM83JHSad6l3VssCmOuHRQMma4zgX03PQVIRmOFkGx2P5rCMcAeY3GVV2GfR3 hGFIn1bi7YksArkz/KNiU7S0QunkgTriKcsepHDsnzgenZjYRoU2EFV7O4osxnLVMKRB UQ7Ymc79Re8AvsID33U2439zey2I7iNV8o3pKwlOFGoPQlRqvQzkFAE+dOQzRwpMg26q 4vqlfjsfjTCPie5mIp+rgpeB0MoQkYXw0Qy5g2Ofxl5RK0+pxcAKISCSqyJxR9m0h9KP 3hSmfq5lCvgVgpi4OBHhjUvca82TjLibGXWc9r0cNfb/wcWU1+17L0+QwPlqzSm6PfBB zeVw== X-Received: by 10.224.130.72 with SMTP id r8mr22741562qas.32.1380936493626; Fri, 04 Oct 2013 18:28:13 -0700 (PDT) Received: from smtp.gmail.com ([66.114.71.21]) by mx.google.com with ESMTPSA id x8sm35331978qam.2.1969.12.31.16.00.00 (version=TLSv1.2 cipher=RC4-SHA bits=128/128); Fri, 04 Oct 2013 18:28:12 -0700 (PDT) 
From: Ethan Glasser-Camp To: David Bremner , notmuch mailing list Subject: Re: On disk tag storage format In-Reply-To: <87vc9mtpxh.fsf@zancas.localnet> References: <874nk8v9zw.fsf@zancas.localnet> <87vc9mtpxh.fsf@zancas.localnet> User-Agent: Notmuch/0.16+80~g81ee785 (http://notmuchmail.org) Emacs/24.2.1 (x86_64-pc-linux-gnu) Date: Fri, 04 Oct 2013 21:28:10 -0400 Message-ID: <87fvsgh5g5.fsf@betacantrips.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="=-=-=" X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 05 Oct 2013 01:28:21 -0000 --=-=-= Content-Type: text/plain David Bremner writes: > It's still a prototype, and there is not much error checking, and there > are certain issues not dealt with at all (the ones I thought about are > commented). Hi everyone, I'm very interested in running notmuch on all my laptops and having my mail and its tags be synchronized for me, so at Bremner's direction on IRC, I played around with this script a little. At first it wouldn't run on my computer; the script uses message IDs as filenames, which can be quite long, whereas I keep my mail in my $HOME, which is on an ecryptfs filesystem, and has a filename limit of 143 characters. I've modified the script so that it would run by mangling filenames, which is irreversible (the original tried to encode/decode filenames reversibly). Then I got a little carried away, adding --verbose and --dry-run options as well as removing a couple trailing semicolons. Here's my version, in case it should interest anyone else. --=-=-= Content-Type: text/x-python Content-Disposition: inline; filename=linksync.py Content-Description: linksync.py # Copyright 2013, David Bremner # Licensed under the same terms as notmuch. 
import argparse
import errno
import hashlib
import os
import re
import sys
from collections import defaultdict

# NOTE: ``import notmuch`` is deferred to main() so that the pure helper
# functions below can be imported (and tested) without the notmuch bindings.

# skip automatic and maildir tags
skiptags = re.compile(
    r"^(attachment|signed|encrypted|draft|flagged|passed|replied|unread)$")

# Sub-directories of a tag directory that may contain message links.
MAILDIR_SUBDIRS = ('cur', 'new', 'tmp')

# Run-time configuration; filled in by main().
VERBOSE = False
opts = None
tagroot = None


# some random person on stack overflow suggests:
def mkdir_p(path):
    """Create *path* and any missing parents; an existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def log(msg):
    """Print *msg* when --verbose is in effect."""
    if VERBOSE:
        print(msg)


CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-'
encode_re = '([^{0}])'.format(CHARSET)
# Any two-digit hex escape.  Bytes >= 0x80 must be accepted too, otherwise
# names containing non-ASCII characters cannot be decoded again.
decode_re = '[%]([0-9A-Fa-f]{2})'


def _to_bytes(text):
    """Return *text* as UTF-8 bytes (byte strings are passed through)."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8', 'replace')


def encode_one_char(match):
    """Percent-encode the single character captured by *match*.

    The character is encoded byte-wise from its UTF-8 representation, so
    every escape is exactly ``%xx``.  ASCII characters yield the same
    result as a plain ``ord()`` based encoding.
    """
    return ''.join('%{:02x}'.format(byte)
                   for byte in bytearray(_to_bytes(match.group(1))))


def encode_for_fs(str):
    """Percent-encode every character of *str* that is not in CHARSET."""
    return re.sub(encode_re, encode_one_char, str, 0)


def mangle_message_id(msg_id):
    """
    Return a mangled version of the message id, suitable for use as a filename.

    Ids whose encoded form is short enough are returned encoded but
    otherwise intact.  Longer ones are cut down to a prefix, '...', a
    suffix and the first hex digits of the SHA-256 of the encoded id, so
    the mapping is deterministic but not reversible.
    """
    MAX_LENGTH = 143    # filename limit mentioned for ecryptfs
    FLAGS_LENGTH = 8    # :2,S...??
    # NOTE(review): ":2," plus all six standard maildir flags is 9
    # characters; kept at 8 so existing link names stay valid.
    encoded = encode_for_fs(msg_id)
    if len(encoded) < MAX_LENGTH - FLAGS_LENGTH:
        return encoded

    SHA_LENGTH = 8
    TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH
    PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2
    prefix = encoded[:PREFIX_LENGTH]
    suffix = encoded[-SUFFIX_LENGTH:]
    sha = hashlib.sha256()
    # hashlib wants bytes; the encoded id is pure ASCII so this is lossless.
    sha.update(_to_bytes(encoded))
    return prefix + '...' + suffix + sha.hexdigest()[:SHA_LENGTH]


def decode_one_char(match):
    """Return the character for one ``%xx`` escape (single-byte view).

    Kept for backward compatibility; decode_from_fs() decodes byte-wise so
    that multi-byte UTF-8 sequences are reassembled correctly.
    """
    return chr(int(match.group(1), 16))


def decode_from_fs(str):
    """Reverse encode_for_fs(): turn ``%xx`` escapes back into text."""
    raw = bytearray()
    pos = 0
    for match in re.finditer(decode_re, str):
        raw.extend(_to_bytes(str[pos:match.start()]))
        raw.append(int(match.group(1), 16))
        pos = match.end()
    raw.extend(_to_bytes(str[pos:]))
    return raw.decode('utf-8', 'replace')


def mk_tag_dir(tagdir):
    """Create the cur/new/tmp sub-directories of *tagdir*."""
    for sub in ('cur', 'new', 'tmp'):
        mkdir_p(os.path.join(tagdir, sub))


flagpart = '(:2,[^:]*)'
flagre = re.compile(flagpart + '$')


def path_for_msg(dir, msg):
    """Return the link path for *msg* below tag directory *dir*.

    The link is named after the mangled message id and carries the same
    ':2,...' flag suffix as the message's own filename, if it has one.
    """
    flagsmatch = flagre.search(msg.get_filename())
    flags = '' if flagsmatch is None else flagsmatch.group(1)
    return os.path.join(dir, 'cur',
                        mangle_message_id(msg.get_message_id()) + flags)


def unlink_message(dir, msg, dry_run=None):
    """Remove the link(s) for *msg* from the 'cur' folder of tag dir *dir*.

    *dry_run* defaults to the --dry-run command line option; when true the
    removal is only logged.
    """
    if dry_run is None:
        dry_run = bool(getattr(opts, 'dry_run', False))
    dir = os.path.join(dir, 'cur')
    if not os.path.isdir(dir):
        return
    # The mangled id may contain regex metacharacters ('.', '+'), so it has
    # to be escaped before being spliced into the pattern.
    filepattern = re.escape(mangle_message_id(msg.get_message_id())) \
        + flagpart + '?$'
    filere = re.compile(filepattern)
    for file in os.listdir(dir):
        if filere.match(file):
            path = os.path.join(dir, file)
            log("Unlinking {}".format(path))
            if not dry_run:
                os.unlink(path)


def dir_for_tag(tag):
    """Return the directory below the tag root that represents *tag*."""
    enc_tag = encode_for_fs(tag)
    return os.path.join(tagroot, enc_tag)


disk_tags = defaultdict(set)
disk_ids = set()


def _iter_links(rootdir):
    """Yield (tag_dir_name, directory, filename) for every message link.

    Only files located exactly at <rootdir>/<tag>/<cur|new|tmp>/ count;
    anything else under rootdir (stray files, VCS metadata, ...) is left
    alone instead of being mistaken for a link.
    """
    for root, _subfolders, files in os.walk(rootdir):
        parts = os.path.relpath(root, rootdir).split(os.sep)
        if len(parts) != 2 or parts[1] not in MAILDIR_SUBDIRS:
            continue
        for filename in files:
            yield parts[0], root, filename


def read_tags_from_disk(rootdir):
    """Populate disk_ids and disk_tags from the link farm below *rootdir*."""
    for tag_name, _directory, filename in _iter_links(rootdir):
        mangled_id = filename.split(':')[0]
        disk_ids.add(mangled_id)
        disk_tags[mangled_id].add(decode_from_fs(tag_name))


def _link_message(tag, msg, filename, symlink):
    """Create the link for *msg* inside the directory of *tag*."""
    tagdir = dir_for_tag(tag)
    if not opts.dry_run:
        mk_tag_dir(tagdir)
    newlink = path_for_msg(tagdir, msg)
    log("Linking {} to {}".format(filename, newlink))
    if not opts.dry_run:
        if symlink:
            os.symlink(filename, newlink)
        else:
            os.link(filename, newlink)


def _sync_message(msg, sync_from_links):
    """Reconcile the tags of one message between the database and disk."""
    # silently ignore empty tags
    db_tags = set(tag for tag in msg.get_tags()
                  if tag != '' and not skiptags.match(tag))
    message_id = msg.get_message_id()
    mangled_id = mangle_message_id(message_id)
    disk_ids.discard(mangled_id)

    on_disk = disk_tags[mangled_id]
    missing_on_disk = db_tags.difference(on_disk)
    missing_in_db = on_disk.difference(db_tags)
    filename = msg.get_filename()

    # The link style only matters when links are actually created.
    symlink = False
    if missing_on_disk and not sync_from_links:
        if opts.link_style == 'adaptive':
            symlink = os.stat(filename).st_size > opts.threshold
        else:
            symlink = opts.link_style == 'symbolic'

    if sync_from_links:
        msg.freeze()
    try:
        for tag in missing_on_disk:
            if sync_from_links:
                log("Removing tag {} from {}".format(tag, message_id))
                if not opts.dry_run:
                    msg.remove_tag(tag, sync_maildir_flags=False)
            else:
                _link_message(tag, msg, filename, symlink)

        for tag in missing_in_db:
            if sync_from_links:
                log("Adding {} to message {}".format(tag, message_id))
                if not opts.dry_run:
                    msg.add_tag(tag, sync_maildir_flags=False)
            else:
                unlink_message(dir_for_tag(tag), msg)
    finally:
        if sync_from_links:
            msg.thaw()


def main(argv=None):
    """Parse the command line and sync tags between notmuch and the link farm."""
    global VERBOSE, opts, tagroot
    import notmuch

    parser = argparse.ArgumentParser(
        description='Sync notmuch tag database to/from link farm')
    parser.add_argument('-l', '--link-style',
                        choices=['hard', 'symbolic', 'adaptive'],
                        default='adaptive')
    parser.add_argument('-d', '--destination', choices=['disk', 'notmuch'],
                        default='disk')
    parser.add_argument('-t', '--threshold', default=50000, type=int)
    parser.add_argument('-n', '--dry-run', default=False, action='store_true')
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('tagroot')

    opts = parser.parse_args(argv)
    VERBOSE = opts.verbose
    tagroot = opts.tagroot

    sync_from_links = (opts.destination == 'notmuch')

    read_tags_from_disk(tagroot)

    if sync_from_links:
        db = notmuch.Database(mode=notmuch.Database.MODE.READ_WRITE)
    else:
        db = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY)

    try:
        dbtags = [tag for tag in db.get_all_tags() if not skiptags.match(tag)]
        querystr = ' OR '.join('tag:' + tag for tag in dbtags)

        q_new = notmuch.Query(db, querystr)
        q_new.set_sort(notmuch.Query.SORT.UNSORTED)
        for msg in q_new.search_messages():
            _sync_message(msg, sync_from_links)

        # everything remaining in disk_ids is a deleted message
        # unless we are syncing back to the database, in which case
        # it just might not currently have any non maildir tags.
        if not sync_from_links:
            for _tag, directory, filename in _iter_links(tagroot):
                mangled_id = filename.split(':')[0]
                if mangled_id in disk_ids:
                    path = os.path.join(directory, filename)
                    log("Unlinking {}".format(path))
                    # honour --dry-run here as well
                    if not opts.dry_run:
                        os.unlink(path)
    finally:
        db.close()

    # currently empty directories are not pruned.


if __name__ == '__main__':
    main()

# End of the inline attachment; closing MIME boundary of the e-mail: --=-=-=--