From ec56122b40d91b7a0ad04fcd54767f34a30f5bcc Mon Sep 17 00:00:00 2001 From: Ethan Glasser-Camp Date: Sat, 5 Oct 2013 21:28:10 +2000 Subject: [PATCH] Re: On disk tag storage format --- 78/2bff22f207be4876c5fbe411113309452787ad | 326 ++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 78/2bff22f207be4876c5fbe411113309452787ad diff --git a/78/2bff22f207be4876c5fbe411113309452787ad b/78/2bff22f207be4876c5fbe411113309452787ad new file mode 100644 index 000000000..e8a3108cf --- /dev/null +++ b/78/2bff22f207be4876c5fbe411113309452787ad @@ -0,0 +1,326 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by olra.theworths.org (Postfix) with ESMTP id 4A161431FBD + for ; Fri, 4 Oct 2013 18:28:21 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at olra.theworths.org +X-Spam-Flag: NO +X-Spam-Score: -0.799 +X-Spam-Level: +X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5 + tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, + FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled +Received: from olra.theworths.org ([127.0.0.1]) + by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id rdQfzwQdBW4y for ; + Fri, 4 Oct 2013 18:28:16 -0700 (PDT) +Received: from mail-qa0-f44.google.com (mail-qa0-f44.google.com + [209.85.216.44]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) + (No client certificate requested) + by olra.theworths.org (Postfix) with ESMTPS id 52FBE431FAF + for ; Fri, 4 Oct 2013 18:28:16 -0700 (PDT) +Received: by mail-qa0-f44.google.com with SMTP id j7so1607348qaq.10 + for ; Fri, 04 Oct 2013 18:28:13 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; + h=from:to:subject:in-reply-to:references:user-agent:date:message-id + :mime-version:content-type; + bh=yHoDnFNlw3o30xsgKwGZhEWp0L6xhXNl363yAv1oSYQ=; + 
b=OtDmvQM83JHSad6l3VssCmOuHRQMma4zgX03PQVIRmOFkGx2P5rCMcAeY3GVV2GfR3 + hGFIn1bi7YksArkz/KNiU7S0QunkgTriKcsepHDsnzgenZjYRoU2EFV7O4osxnLVMKRB + UQ7Ymc79Re8AvsID33U2439zey2I7iNV8o3pKwlOFGoPQlRqvQzkFAE+dOQzRwpMg26q + 4vqlfjsfjTCPie5mIp+rgpeB0MoQkYXw0Qy5g2Ofxl5RK0+pxcAKISCSqyJxR9m0h9KP + 3hSmfq5lCvgVgpi4OBHhjUvca82TjLibGXWc9r0cNfb/wcWU1+17L0+QwPlqzSm6PfBB + zeVw== +X-Received: by 10.224.130.72 with SMTP id r8mr22741562qas.32.1380936493626; + Fri, 04 Oct 2013 18:28:13 -0700 (PDT) +Received: from smtp.gmail.com ([66.114.71.21]) + by mx.google.com with ESMTPSA id x8sm35331978qam.2.1969.12.31.16.00.00 + (version=TLSv1.2 cipher=RC4-SHA bits=128/128); + Fri, 04 Oct 2013 18:28:12 -0700 (PDT) +From: Ethan Glasser-Camp +To: David Bremner , + notmuch mailing list +Subject: Re: On disk tag storage format +In-Reply-To: <87vc9mtpxh.fsf@zancas.localnet> +References: <874nk8v9zw.fsf@zancas.localnet> <87vc9mtpxh.fsf@zancas.localnet> +User-Agent: Notmuch/0.16+80~g81ee785 (http://notmuchmail.org) Emacs/24.2.1 + (x86_64-pc-linux-gnu) +Date: Fri, 04 Oct 2013 21:28:10 -0400 +Message-ID: <87fvsgh5g5.fsf@betacantrips.com> +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="=-=-=" +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.13 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Sat, 05 Oct 2013 01:28:21 -0000 + +--=-=-= +Content-Type: text/plain + +David Bremner writes: + +> It's still a prototype, and there is not much error checking, and there +> are certain issues not dealt with at all (the ones I thought about are +> commented). + +Hi everyone, + +I'm very interested in running notmuch on all my laptops and having my +mail and its tags be synchronized for me, so at Bremner's direction on +IRC, I played around with this script a little. 
# Copyright 2013, David Bremner

# Licensed under the same terms as notmuch.

"""Sync the notmuch tag database to/from a link farm of maildirs.

Each tag is represented on disk by a maildir-shaped directory
``<tagroot>/<encoded tag>/{cur,new,tmp}``; a message carries a tag when a
(hard or symbolic) link to it exists in that tag's ``cur`` directory.  Link
names are derived from the message id, mangled so that they fit within a
143-character filename limit (the limit mentioned for ecryptfs).

With ``--destination disk`` (the default) the link farm is updated from the
notmuch database; with ``--destination notmuch`` the database tags are
updated from the link farm.

The code is written to run unchanged on Python 2.7 and Python 3.
"""

import argparse
import errno
import hashlib
import os
import re
import sys
from collections import defaultdict

try:
    import notmuch
except ImportError:
    # Allow the pure helper functions below to be imported (and tested)
    # without the notmuch bindings; main() refuses to run without them.
    notmuch = None

# skip automatic and maildir tags

skiptags = re.compile(r"^(attachment|signed|encrypted|draft|flagged|passed|replied|unread)$")


def mkdir_p(path):
    """Create *path* and any missing parents; succeed if it already exists.

    Raises OSError for any failure other than "directory already exists".
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


VERBOSE = False


def log(msg):
    """Print *msg* when the module-level VERBOSE flag is set."""
    if VERBOSE:
        print(msg)


# Characters that are stored literally in file and directory names.
CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-'

encode_re = '([^{0}])'.format(CHARSET)

# NOTE(review): only %00-%7f are decoded, while encode_one_char emits the
# full code point in hex, so non-ASCII tag names do not round-trip.
decode_re = '[%]([0-7][0-9A-Fa-f])'


def encode_one_char(match):
    """Return the ``%xx`` escape for the single character captured by *match*."""
    return '%{:02x}'.format(ord(match.group(1)))


def encode_for_fs(text):
    """Escape every character of *text* outside CHARSET as ``%xx``."""
    return re.sub(encode_re, encode_one_char, text, 0)


def mangle_message_id(msg_id):
    """
    Return a mangled version of the message id, suitable for use as a filename.

    Ids whose encoded form is short enough are returned encoded but otherwise
    intact.  Longer ids are cut down to ``prefix + '...' + suffix`` followed
    by the first 8 hex digits of the SHA-256 of the encoded id.  The mangling
    is therefore not reversible.
    """
    MAX_LENGTH = 143
    FLAGS_LENGTH = 8    # :2,S...??
    encoded = encode_for_fs(msg_id)
    if len(encoded) < MAX_LENGTH - FLAGS_LENGTH:
        return encoded

    SHA_LENGTH = 8
    TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH
    PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2
    prefix = encoded[:PREFIX_LENGTH]
    suffix = encoded[-SUFFIX_LENGTH:]
    # encode_for_fs() leaves only ASCII characters, and hashlib wants bytes
    # on Python 3, so encode explicitly.
    digest = hashlib.sha256(encoded.encode('ascii')).hexdigest()
    return prefix + '...' + suffix + digest[:SHA_LENGTH]


def decode_one_char(match):
    """Return the character for the hex escape captured by *match*."""
    return chr(int(match.group(1), 16))


def decode_from_fs(text):
    """Undo encode_for_fs() for the escapes matched by decode_re."""
    return re.sub(decode_re, decode_one_char, text, 0)


def mk_tag_dir(tagdir):
    """Create the ``cur``, ``new`` and ``tmp`` subdirectories of *tagdir*."""
    for sub in ('cur', 'new', 'tmp'):
        mkdir_p(os.path.join(tagdir, sub))


# Maildir flag suffix, e.g. ":2,RS".
flagpart = '(:2,[^:]*)'
flagre = re.compile(flagpart + '$')


def path_for_msg(tagdir, msg):
    """Return the link path for *msg* inside the ``cur`` directory of *tagdir*.

    The maildir flag suffix of the message's own filename, if any, is
    appended to the mangled message id.
    """
    flagsmatch = flagre.search(msg.get_filename())
    flags = '' if flagsmatch is None else flagsmatch.group(1)
    return os.path.join(tagdir, 'cur',
                        mangle_message_id(msg.get_message_id()) + flags)


def unlink_message(tagdir, msg, dry_run=False):
    """Remove the link(s) for *msg* from the ``cur`` directory of *tagdir*.

    A link matches when its name is the mangled message id, optionally
    followed by a maildir flag suffix.  With *dry_run* the removals are only
    logged.
    """
    curdir = os.path.join(tagdir, 'cur')
    if not os.path.isdir(curdir):
        return

    # The mangled id may contain regex metacharacters ('+', '.'), so it has
    # to be escaped before being embedded in the pattern.
    filere = re.compile(
        re.escape(mangle_message_id(msg.get_message_id())) + flagpart + '?$')

    for name in os.listdir(curdir):
        if filere.match(name):
            path = os.path.join(curdir, name)
            log("Unlinking {}".format(path))
            if not dry_run:
                os.unlink(path)


def dir_for_tag(tag, root):
    """Return the directory under *root* that represents *tag*."""
    return os.path.join(root, encode_for_fs(tag))


# State filled in by read_tags_from_disk(): mangled id -> set of tags, and
# the set of every mangled id seen on disk.
disk_tags = defaultdict(set)
disk_ids = set()


def read_tags_from_disk(rootdir):
    """Record in disk_tags / disk_ids every link found under *rootdir*.

    Only files located at ``<rootdir>/<tag>/<subdir>/<file>`` are considered;
    anything at another depth is ignored rather than being attributed to a
    bogus tag.
    """
    for root, _subfolders, files in os.walk(rootdir):
        parts = os.path.relpath(root, rootdir).split(os.sep)
        if len(parts) != 2:
            continue
        tag = decode_from_fs(parts[0])
        for filename in files:
            mangled_id = filename.split(':')[0]
            disk_ids.add(mangled_id)
            disk_tags[mangled_id].add(tag)


def sync_message(msg, tagroot, opts, sync_from_links):
    """Bring the tags of one message in line between database and disk.

    *opts* supplies ``dry_run``, ``link_style`` and ``threshold``.  When
    *sync_from_links* is true the database is changed to match the link
    farm; otherwise links are created and removed to match the database.
    """
    # silently ignore empty tags
    db_tags = set(tag for tag in msg.get_tags()
                  if tag != '' and not skiptags.match(tag))

    message_id = msg.get_message_id()
    mangled_id = mangle_message_id(message_id)

    # Whatever remains in disk_ids afterwards was not returned by the query.
    disk_ids.discard(mangled_id)

    on_disk = disk_tags[mangled_id]
    missing_on_disk = db_tags.difference(on_disk)
    missing_in_db = on_disk.difference(db_tags)

    if sync_from_links:
        msg.freeze()
        for tag in missing_on_disk:
            log("Removing tag {} from {}".format(tag, message_id))
            if not opts.dry_run:
                msg.remove_tag(tag, sync_maildir_flags=False)
        for tag in missing_in_db:
            log("Adding {} to message {}".format(tag, message_id))
            if not opts.dry_run:
                msg.add_tag(tag, sync_maildir_flags=False)
        msg.thaw()
        return

    if missing_on_disk:
        filename = msg.get_filename()
        if opts.link_style == 'adaptive':
            # Files larger than the threshold get a symbolic link.
            symlink = os.stat(filename).st_size > opts.threshold
        else:
            symlink = opts.link_style == 'symbolic'

        for tag in missing_on_disk:
            tagdir = dir_for_tag(tag, tagroot)
            if not opts.dry_run:
                mk_tag_dir(tagdir)

            newlink = path_for_msg(tagdir, msg)
            log("Linking {} to {}".format(filename, newlink))
            if not opts.dry_run:
                if symlink:
                    os.symlink(filename, newlink)
                else:
                    os.link(filename, newlink)

    for tag in missing_in_db:
        unlink_message(dir_for_tag(tag, tagroot), msg, opts.dry_run)


def prune_deleted(tagroot, dry_run=False):
    """Remove every link under *tagroot* whose id is still in disk_ids.

    With *dry_run* the removals are only logged.
    """
    for root, _subfolders, files in os.walk(tagroot):
        for filename in files:
            if filename.split(':')[0] in disk_ids:
                path = os.path.join(root, filename)
                log("Unlinking {}".format(path))
                if not dry_run:
                    os.unlink(path)


def parse_args(argv=None):
    """Parse the command line (*argv* defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description='Sync notmuch tag database to/from link farm')
    parser.add_argument('-l', '--link-style',
                        choices=['hard', 'symbolic', 'adaptive'],
                        default='adaptive')
    parser.add_argument('-d', '--destination', choices=['disk', 'notmuch'],
                        default='disk')
    parser.add_argument('-t', '--threshold', default=50000, type=int)
    parser.add_argument('-n', '--dry-run', default=False, action='store_true')
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('tagroot')
    return parser.parse_args(argv)


def main(argv=None):
    """Main program: synchronise the database and the link farm."""
    global VERBOSE

    opts = parse_args(argv)

    if notmuch is None:
        sys.exit('linksync: the notmuch Python bindings are required')

    VERBOSE = opts.verbose
    tagroot = opts.tagroot
    sync_from_links = (opts.destination == 'notmuch')

    read_tags_from_disk(tagroot)

    if sync_from_links:
        db = notmuch.Database(mode=notmuch.Database.MODE.READ_WRITE)
    else:
        db = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY)

    try:
        dbtags = [tag for tag in db.get_all_tags() if not skiptags.match(tag)]

        # NOTE(review): tag names are not quoted here, so a tag containing
        # whitespace presumably yields a wrong query -- confirm.
        querystr = ' OR '.join('tag:' + tag for tag in dbtags)

        q_new = notmuch.Query(db, querystr)
        q_new.set_sort(notmuch.Query.SORT.UNSORTED)
        for msg in q_new.search_messages():
            sync_message(msg, tagroot, opts, sync_from_links)

        # everything remaining in disk_ids is a deleted message
        # unless we are syncing back to the database, in which case
        # it just might not currently have any non maildir tags.
        if not sync_from_links:
            prune_deleted(tagroot, opts.dry_run)
    finally:
        db.close()

    # currently empty directories are not pruned.


if __name__ == '__main__':
    main()