From: Ethan Glasser-Camp Date: Mon, 7 Oct 2013 04:49:39 +0000 (+2000) Subject: Re: On disk tag storage format X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=bf660a0e46d2eb8268e0d584db70a075bab18319;p=notmuch-archives.git Re: On disk tag storage format --- diff --git a/c4/f91b52d6bd92dd01e43ac5a10a3b5f89f4dbbf b/c4/f91b52d6bd92dd01e43ac5a10a3b5f89f4dbbf new file mode 100644 index 000000000..0b27211a1 --- /dev/null +++ b/c4/f91b52d6bd92dd01e43ac5a10a3b5f89f4dbbf @@ -0,0 +1,995 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by olra.theworths.org (Postfix) with ESMTP id 22889431FAF + for ; Sun, 6 Oct 2013 21:49:52 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at olra.theworths.org +X-Spam-Flag: NO +X-Spam-Score: -0.799 +X-Spam-Level: +X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5 + tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, + FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled +Received: from olra.theworths.org ([127.0.0.1]) + by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id l0dk1rHivO6h for ; + Sun, 6 Oct 2013 21:49:44 -0700 (PDT) +Received: from mail-qe0-f53.google.com (mail-qe0-f53.google.com + [209.85.128.53]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) + (No client certificate requested) + by olra.theworths.org (Postfix) with ESMTPS id 83F48431FAE + for ; Sun, 6 Oct 2013 21:49:44 -0700 (PDT) +Received: by mail-qe0-f53.google.com with SMTP id cy11so956017qeb.26 + for ; Sun, 06 Oct 2013 21:49:42 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; + h=from:to:subject:in-reply-to:references:user-agent:date:message-id + :mime-version:content-type; + bh=djCxR7kr3DGkKxSNNOoNrBe3lo2uI3tzo52A5puB6eI=; + b=W9YJGZc3BuQI5wlBCt6FgoIy59LdhOiQsMaW1yejCmq8RkflhNV3kWbL6q0NPI7P6/ + 8LBCDceH4hfL6x/wTr7q2bYrWlBgRkkyrBBVhax7MpiY0aGL1+pzuM5tpN01d0P65R9J + /6DXChty02CS4Z/fenIyWN1nWlkkZhuBYJpPjgNzpMpwNC3vZu11w56WD5u/xx7gKT5J + L3imy4r2lwNHTjWSxDD0yPDYuxQkxYWfgWCDQtBIjZtsKD7PGA3Sq9HT2a+BuOZ1tQIt + oHD28zD4wPfjyYD+f9Q3v4FH+Rhx0RQEGRKdijahPqJ4tsv1VBDVikKUw6j2wAWPIuug + 3UmQ== +X-Received: by 10.224.11.133 with SMTP id t5mr35139503qat.34.1381121382839; + Sun, 06 Oct 2013 21:49:42 -0700 (PDT) +Received: from smtp.gmail.com ([66.114.71.21]) + by mx.google.com with ESMTPSA id g2sm58448024qaf.12.1969.12.31.16.00.00 + (version=TLSv1.2 cipher=RC4-SHA bits=128/128); + Sun, 06 Oct 2013 21:49:41 -0700 (PDT) +From: Ethan Glasser-Camp +To: David Bremner , + notmuch mailing list +Subject: Re: On disk tag storage format +In-Reply-To: <87fvsgh5g5.fsf@betacantrips.com> +References: <874nk8v9zw.fsf@zancas.localnet> <87vc9mtpxh.fsf@zancas.localnet> + <87fvsgh5g5.fsf@betacantrips.com> +User-Agent: Notmuch/0.16+80~g81ee785 (http://notmuchmail.org) Emacs/24.2.1 + (x86_64-pc-linux-gnu) +Date: Mon, 07 Oct 2013 00:49:39 -0400 +Message-ID: <87bo31heho.fsf@betacantrips.com> +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="=-=-=" +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.13 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Mon, 07 Oct 2013 04:49:52 -0000 + +--=-=-= +Content-Type: text/plain + +Ethan Glasser-Camp writes: + +> I've modified the script so that it would run by mangling filenames, +> which is irreversible (the original tried to encode/decode filenames +> reversibly). Then I got a little carried away, adding --verbose and +> --dry-run options as well as removing a couple trailing +> semicolons. Here's my version, in case it should interest anyone else. + +Hi guys, + +There was a bug in the previous version I sent. It didn't handle +unlinking tags correctly. Also, I spotted a bug in syncing to untagged +messages. Maybe I should stop using emails as version control. + +---- 8< ---- + + + +--=-=-= +Content-Type: text/x-python +Content-Disposition: inline; filename=linksync.py +Content-Description: slightly more tested this time + +# Copyright 2013, David Bremner + +# Licensed under the same terms as notmuch. + +import notmuch +import re +import os, errno +import sys +from collections import defaultdict +import argparse +import hashlib + +# skip automatic and maildir tags + +skiptags = re.compile(r"^(attachement|signed|encrypted|draft|flagged|passed|replied|unread)$") + +# some random person on stack overflow suggests: + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: raise + +VERBOSE = False + +def log(msg): + if VERBOSE: + print(msg) + +CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-' + +encode_re = '([^{0}])'.format(CHARSET) + +decode_re = '[%]([0-7][0-9A-Fa-f])' + +def encode_one_char(match): + return('%{:02x}'.format(ord(match.group(1)))) + +def encode_for_fs(str): + return re.sub(encode_re,encode_one_char, str,0) + +def mangle_message_id(msg_id): + """ + Return a mangled version of the message id, suitable for use as a filename. + """ + MAX_LENGTH = 143 + FLAGS_LENGTH = 8 # :2,S...?? + encoded = encode_for_fs(msg_id) + if len(encoded) < MAX_LENGTH - FLAGS_LENGTH: + return encoded + + SHA_LENGTH = 8 + TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH + PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2 + prefix = encoded[:PREFIX_LENGTH] + suffix = encoded[-SUFFIX_LENGTH:] + sha = hashlib.sha256() + sha.update(encoded) + return prefix + '...' + suffix + sha.hexdigest()[:SHA_LENGTH] + +def decode_one_char(match): + return chr(int(match.group(1),16)) + +def decode_from_fs(str): + return re.sub(decode_re,decode_one_char, str, 0) + +def mk_tag_dir(tagdir): + + mkdir_p (os.path.join(tagdir, 'cur')) + mkdir_p (os.path.join(tagdir, 'new')) + mkdir_p (os.path.join(tagdir, 'tmp')) + + +flagpart = '(:2,[^:]*)' +flagre = re.compile(flagpart + '$'); + +def path_for_msg (dir, msg): + filename = msg.get_filename() + flagsmatch = flagre.search(filename) + if flagsmatch == None: + flags = '' + else: + flags = flagsmatch.group(1) + + return os.path.join(dir, 'cur', mangle_message_id(msg.get_message_id()) + flags) + + +def unlink_message(dir, msg): + + dir = os.path.join(dir, 'cur') + + mangled_id = mangle_message_id(msg.get_message_id()) + filepattern = '^' + re.escape(mangled_id) + flagpart +'?$' + + filere = re.compile(filepattern) + + for file in os.listdir(dir): + if filere.match(file): + log("Unlinking {}".format(os.path.join(dir, file))) + if not opts.dry_run: + os.unlink(os.path.join(dir, file)) + +def dir_for_tag(tag): + enc_tag = encode_for_fs (tag) + return os.path.join(tagroot, enc_tag) + +disk_tags = defaultdict(set) +disk_ids = set() + +def read_tags_from_disk(rootdir): + + for root, subFolders, files in os.walk(rootdir): + for filename in files: + mangled_id = filename.split(':')[0] + tag = root.split('/')[-2] + disk_ids.add(mangled_id) + disk_tags[mangled_id].add(decode_from_fs(tag)) + +# Main program + +parser = argparse.ArgumentParser(description='Sync notmuch tag database to/from link farm') +parser.add_argument('-l','--link-style',choices=['hard','symbolic', 'adaptive'], + default='adaptive') +parser.add_argument('-d','--destination',choices=['disk','notmuch'], default='disk') +parser.add_argument('-t','--threshold', default=50000L, type=int) +parser.add_argument('-n','--dry-run', default=False, action='store_true') +parser.add_argument('-v','--verbose', default=False, action='store_true') + +parser.add_argument('tagroot') + +opts=parser.parse_args() +VERBOSE = opts.verbose + +tagroot=opts.tagroot + +sync_from_links = (opts.destination == 'notmuch') + +read_tags_from_disk(tagroot) + +if sync_from_links: + db = notmuch.Database(mode=notmuch.Database.MODE.READ_WRITE) +else: + db = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY) + +dbtags = filter (lambda tag: not skiptags.match(tag), db.get_all_tags()) + +if sync_from_links: + # have to iterate over even untagged messages + querystr = '*' +else: + querystr = ' OR '.join(map (lambda tag: 'tag:'+tag, dbtags)) + +q_new = notmuch.Query(db, querystr) +q_new.set_sort(notmuch.Query.SORT.UNSORTED) +for msg in q_new.search_messages(): + + # silently ignore empty tags + db_tags = set(filter (lambda tag: tag != '' and not skiptags.match(tag), + msg.get_tags())) + + message_id = msg.get_message_id() + + mangled_id = mangle_message_id(message_id) + + disk_ids.discard(mangled_id) + + missing_on_disk = db_tags.difference(disk_tags[mangled_id]) + missing_in_db = disk_tags[mangled_id].difference(db_tags) + + if sync_from_links: + msg.freeze() + + filename = msg.get_filename() + + if len(missing_on_disk) > 0: + if opts.link_style == 'adaptive': + statinfo = os.stat (filename) + symlink = (statinfo.st_size > opts.threshold) + else: + symlink = opts.link_style == 'symbolic' + + for tag in missing_on_disk: + + if sync_from_links: + log("Removing tag {} from {}".format(tag, message_id)) + if not opts.dry_run: + msg.remove_tag(tag,sync_maildir_flags=False) + else: + tagdir = dir_for_tag (tag) + + if not opts.dry_run: + mk_tag_dir (tagdir) + + newlink = path_for_msg (tagdir, msg) + + log("Linking {} to {}".format(filename, newlink)) + if not opts.dry_run: + if symlink: + os.symlink(filename, newlink) + else: + os.link(filename, newlink) + + + for tag in missing_in_db: + if sync_from_links: + log("Adding {} to message {}".format(tag, message_id)) + if not opts.dry_run: + msg.add_tag(tag,sync_maildir_flags=False) + else: + tagdir = dir_for_tag (tag) + unlink_message(tagdir,msg) + + if sync_from_links: + msg.thaw() + +# everything remaining in disk_ids is a deleted message +# unless we are syncing back to the database, in which case +# it just might not currently have any non maildir tags. + +if not sync_from_links: + for root, subFolders, files in os.walk(tagroot): + for filename in files: + mangled_id = filename.split(':')[0] + if mangled_id in disk_ids: + os.unlink(os.path.join(root, filename)) + + +db.close() + +# currently empty directories are not pruned. + +--=-=-= +Content-Type: text/plain + + +---- 8< ---- + +Of course, the next step is to sync using this mechanism. Rsync doesn't +really have a concept of history, which basically makes it unusable for +this purpose [1]. Unison doesn't really understand renames, so it gets +confused when you mark a message as read (which might move it from new +to cur, and definitely changes its tags). Bremner suggested +syncmaildir. Syncmaildir doesn't understand links at all. Bremner +suggested that we could use some parts of syncmaildir to implement the +tag syncing we need. + +I didn't have anything else going on this weekend so I tried to +prototype the approach. It turns out to be possible to leverage some +parts of syncmaildir. I translated a bunch of smd-client into a new +program, tagsync-client, that links to messages in an existing notmuch +DB. It seems like it's possible to use it in place of the existing +smd-client by putting lines like this in your config: + +SMDCLIENT=~/src/tagsync.git/tagsync-client.py +REMOTESMDCLIENT=~/src/tagsync.git/tagsync-client.py + +The sequence of commands I ran: + +- linksync.py to dump tags to ~/Mail/.notmuch/exported-tags +- smd-pull mail to sync ~/Mail but excluding .notmuch +- notmuch new +- smd-pull tagsync (using the above client) to sync ~/Mail/.notmuch/exported-tags +- linksync.py to pull tags from ~/Mail/.notmuch/exported-tags + +syncmaildir doesn't cope well with drafts, so it might choke on that, +and it doesn't like symlinks (it thinks they're always to directories), +so be sure to run linksync with -l hard. + +Here's the script. It's a work in progress; I have only tested it once in one direction. + +---- 8< ---- + + +--=-=-= +Content-Type: text/x-python +Content-Disposition: inline; filename=tagsync-client.py +Content-Description: client script + +#! /usr/bin/env python +import sys +from sys import stdout, stdin, stderr +import stat +import urllib +import hashlib +import re +import os.path +import argparse +import subprocess +import traceback +import notmuch +import time +PROTOCOL_VERSION = "1.1" + +# Not reproducing the autoconf logic +XDELTA = 'xdelta' +MDDIFF = 'mddiff' + +VERBOSE = False + +def log(msg): + if VERBOSE: + stderr.write("INFO: "+msg+"\n") + +def __error(msg): + raise ValueError(msg) + +def log_tags_and_fail(msg, *args): + log_tags(*args) + __error(msg) + +def log_internal_error_and_fail(msg, *args): + log_internal_error_tags(msg, *args) + __error(msg) + +def log_error(msg): + return stderr.write("ERROR: {}\n".format(msg)) + +def log_tag(tag): + return stderr.write("TAGS: {}\n".format(tag)) + +def log_progress(msg): + pass + +def log_tags(context='unknown', cause='unknown', human=False, *args): + if human: + human = "necessary" + else: + human = "avoidable" + + suggestions = {} + suggestions_string = "" + if len(args): + suggestions_string = ' suggested-actions({})'.format(' '.join(args)) + + return log_tag("error::context({}) probable-cause({}) human-intervention({})".format( + context, cause, human) + suggestions_string) + +def mkdir_p(filename): + """Maildir-aware mkdir. + + Creates a directory and all parent directories. + + Moreover, if the last component is 'tmp', 'cur' or 'new', the + others are created too.""" + + # The Lua function throws away the last path component if it + # doesn't end in /. This allows you to just call mkdir_p on any + # file and a directory for it to live will be created. + if not filename.endswith('/'): + filename, _ = os.path.split(filename) + + if not filename.startswith('/'): + # This path is relative to HOME, and needs to be translated + # too. + filename = translate(filename) + filename = os.path.expanduser('~/'+filename) + + dirname, basename = os.path.split(filename) + try: + os.makedirs(filename) + except OSError: + pass # probably "File exists" + MAILDIR_SUBDIRS = ['tmp', 'cur', 'new'] + if basename in MAILDIR_SUBDIRS: + for subdir in MAILDIR_SUBDIRS: + to_create = os.path.join(dirname, subdir) + if not os.path.exists(to_create): + os.mkdir(to_create) + + +class FakeSubprocess(object): + def __init__(self, init_function): + self.init_function = init_function + self.input = None + self.output = None + self.pipe_name = None + self.removed = None + self.did_write = None + self.filter = {} + + def readline(self): + if not self.input: + log_internal_error_and_fail("read called before write", + "make_slave_filter_process") + + + if not self.removed and self.did_write: + self.removed = True + rc = self.input.readline() + os.unlink(self.pipe_name) + return rc + else: + return self.input.readline() + + def write(self, *args): + if not self.output: + self.init_function(self.filter) + self.input = self.filter['inf'] + self.output = self.filter['outf'] + self.pipe_name = self.filter['pipe'] + + self.did_write = True + self.output.write(*args) + + def flush(self): + self.output.flush() + + def lines(self): + return self.input.readlines() + +def make_slave_filter_process(cmd, seed="no seed"): + def init(filter): + if 'inf' not in filter: + home = os.getenv('HOME') + user = os.getenv('USER') or 'nobody' + mangled_name = re.compile('[ %./]').sub('-', seed) + attempt = 0 + if home: + base_dir = home + '/.smd/fifo/' + else: + base_dir = '/tmp/' + + rc = subprocess.call([MDDIFF, '--mkdir-p', base_dir]) + if rc != 0: + log_internal_error_and_fail('unable to create directory', + 'make_slave_filter_process') + + while True: + pipe_name = ''.join([base_dir, 'smd-', user, str(int(time.time())), + mangled_name, str(attempt)]) + attempt += 1 + rc = subprocess.call([MDDIFF, '--mkfifo', pipe_name]) + if rc == 0 or attempt > 10: + break + if rc != 0: + log_internal_error_and_fail('unable to create fifo', + "make_slave_filter_process") + + inferior = cmd(pipe_name) + filter['inf'] = inferior.stdout + filter['outf'] = file(pipe_name, 'w') + filter['pipe'] = pipe_name + + return FakeSubprocess(init) + +_translator = None +def set_translator(p): + global _translator + translator_filter = make_slave_filter_process( + lambda pipe: subprocess.Popen(p, stdin=file(pipe), stdout=subprocess.PIPE), + "translate") + if p == 'cat': + _translator = lambda x: x + else: + def translator_fn(x): + translator_filter.write(x + '\n') + translator_filter.flush() + line = translator_filter.readline() + if not line or line.strip() == 'ERROR': + log_error("Translator {} on input {} gave an error".format( + p, x)) + for l in translator_filter.readlines(): + log_error(l) + log_tags_and_fail("Unable to translate mailbox", + 'translate', 'bad-translator', True) + if '..' in line: + log_error("Translator {} on input {} returned a path containing ..".format( + p, x)) + log_tags_and_fail('Translator returned a path containing ..', + 'translate', 'bad-translator', True) + + return line + + _translator = translator_fn + +def translate(x): + if _translator: + return _translator(x) + return x + + +mddiff_sha_handler = make_slave_filter_process( + lambda pipe: subprocess.Popen([MDDIFF, pipe], stdout=subprocess.PIPE), + "sha_file") + +def sha_file(name): + mddiff_sha_handler.write(name+'\n') + mddiff_sha_handler.flush() + + data = mddiff_sha_handler.readline() + if data.startswith('ERROR'): + log_tags_and_fail("Failed to sha1 message: " + (name or "nil"), + 'sha_file', 'modify-while-update', False, 'retry') + + hsha, bsha = data.split() + if not hsha or not bsha: + log_internal_error_and_fail('mddiff incorrect behavior', 'mddiff') + + return hsha, bsha + +def exists_and_sha(name): + if os.path.exists(name): + h, b = sha_file(name) + return True, h, b + + return False, False, False + + +def touch(f): + try: + file(f, 'r') + except IOError: + try: + file(f, 'w') + except IOError: + log_error('Unable to touch ' + quote(f)) + log_tags("touch", "bad-permissions", True, + "display-permissions(" + quote(f) + ")") + error("Unable to touch a file") + +def quote(s): + return repr(s) + + +def assert_exists(name): + assert os.exists(name), "Not found: "+repr(name) + +def url_quote(txt): + return urllib.quote(txt, safe='') + +def url_decode(s): + return urllib.unquote(s) + +def log_internal_error_tags(msg, ctx): + log_tags('internal-error', ctx, True) + # Blob of "run gnome-open" junk not copied + +def receive(inf, outfile): + try: + outf = file(outfile, 'w') + except IOError: + log_error("Unable to open " + outfile + " for writing.") + log_error('It may be caused by bad directory permissions, '+ + 'please check.') + log_tags("receive", "non-writeable-file", True, + "display-permissions(" + quote(outfile) +")") + error("Unable to write incoming data") + + line = inf.readline() + if not line or line.strip() == "ABORT": + log_error("Data transmission failed.") + log_error("This problem is transient, please retry.") + log_tags_and_fail('server sent ABORT or connection died', + "receive", "network", False, "retry") + + # In the Lua version, this is called "len", but that's a builtin + # in Python + chunk_len = int(re.compile(r'^chunk (\d+)').match(line).group(1)) + total = chunk_len + while chunk_len: + next_chunk = 16384 + if chunk_len < next_chunk: + next_chunk = chunk_len + data = inf.read(next_chunk) + chunk_len -= len(data) + outf.write(data) + + # Probably not strictly speaking necessary in Python + outf.close() + + return total + +def handshake(dbfile): + stdout.write("protocol {}\n".format(PROTOCOL_VERSION)) + touch(dbfile) + sha_output = subprocess.check_output([MDDIFF, '--sha1sum', dbfile]) + db_sha = sha_output.split()[0] + err_msg = sha_output[sha_output.index(' ')+1:] + + if db_sha == 'ERROR': + log_internal_error_and_fail('unreadable db file: '+quote(dbfile), 'handshake') + + stdout.write("dbfile {}\n".format(db_sha)) + stdout.flush() + + line = stdin.readline() + if not line: + log_error("Network error.") + log_error("Unable to get any data from the other endpoint.") + log_error("This problem may be transient, please retry.") + log_error("Hint: did you correctly setup the SERVERNAME variable") + log_error("on your client? Did you add an entry for it in your ssh") + log_error("configuration file?") + log_tags_and_fail('Network error', "handshake", "network", False, "retry") + + protocol = re.compile('^protocol (.+)$').match(line) + if not protocol or protocol.group(1) != PROTOCOL_VERSION: + log_error('Wrong protocol version.') + log_error('The same version of syncmaildir must be used on '+ + 'both endpoints') + log_tags_and_fail('Protocol version mismatch', "handshake", "protocol-mismatch", True) + + line = stdin.readline() + if not line: + log_error("The client disconnected during handshake") + log_tags_and_fail('Network error', "handshake", "network", False, "retry") + + sha = re.compile(r'^dbfile (\S+)$').match(line) + if not sha or sha.group(1) != db_sha: + log_error('Local dbfile and remote db file differ.') + log_error('Remove both files and push/pull again.') + log_tags_and_fail('Database mismatch', "handshake", "db-mismatch", True, "run(rm "+ + quote(dbfile)+")") + +def dbfile_name(endpoint, mailboxes): + endpoint = endpoint.rstrip('/') + mailboxes = mailboxes.rstrip('/') + subprocess.check_call([MDDIFF, '--mkdir-p', os.path.expanduser('~/.smd/')]) + return os.path.expanduser('~/.smd/{}__{}.db.txt'.format( + endpoint.replace('/', '_'), + mailboxes.replace('/', '_').replace('%', '_') + )) + +def receive_delta(inf): + cmds = [] + while True: + line = inf.readline() + if line and line.strip() != "END": + cmds.append(line) + + if not line or line.strip() == "END": + break + + if line.strip() != "END": + log_error('Unable to receive a complete diff') + log_tags("receive-delta", "network", False, "retry") + error("network error while receiving delta") + + return cmds + +def homefy(filename): + return os.path.expanduser("~/"+filename) + +def execute_add(name, hsha, bsha): + dir, basename = os.path.split(name) + # The real smd creates symlinks from workarea to the target + # directory, I dunno why. + dest = homefy(name) + ex, hsha_1, bsha_1 = exists_and_sha(dest) + if ex: + if hsha_1 != hsha or bsha_1 != bsha: + log_error("Failed to add {} since a file with the same name".format( + dest)) + log_error('exists but its content is different.') + log_error("Current hash {}/{}, requested hash {}/{}".format( + hsha_1, bsha_1, hsha, bsha)) + log_error('To fix this problem you should rename '+dest) + log_error('Executing `cd; mv -n '+quote(name)+' '+ + 'FIXME: tmp_for' +'` should work.') + log_tags("mail-addition", "concurrent-mailbox-edit", True, + ) + #mk_act("mv", name)) + return False + + return True # already there + if ':2,' in basename: + basename = basename[:basename.index(':2,')] + filenames = original_message_filenames(basename) + for filename in filenames: + orig_exists, hsha_orig, bsha_orig = exists_and_sha(filename) + assert orig_exists + if hsha_orig == hsha or bsha_orig == bsha: + os.link(filename, dest) + return True + + + log_error("Something seriously wrong here: we tried to link {}".format( + filename)) + log_error("to {} but the hashes were wrong. We wanted {}/{}".format( + dest, hsha, bsha)) + log_error("but we didn't see that in {}".format(filenames)) + log_tags_and_fail('Mail corpus wrong') + # FIXME: How do we decide whether to use symlinks or not? + # Seems like syncmaildir can't cope with symlinks, so let's just + # always use hard links + return False + +def execute_delete(name, hsha, bsha): + name = homefy(name) + ex, hsha_1, bsha_1 = exists_and_sha(name) + assert ex + assert hsha_1 == hsha + assert bsha_1 == bsha + + os.unlink(name) + return True + +def execute_copy(name_src, hsha, bsha, name_tgt): + name_src = homefy(name_src) + name_tgt = homefy(name_tgt) + ex_src, hsha_src, bsha_src = exists_and_sha(name_src) + ex_tgt, hsha_tgt, bsha_tgt = exists_and_sha(name_tgt) + + # Not reproducing all logic + assert ex_src + assert not ex_tgt + + assert hsha == hsha_src + assert bsha == bsha_src + if stat.S_ISLNK(os.stat(name_src).st_mode): + link_tgt = os.readlink(name_src) + os.symlink(link_tgt, name_tgt) + else: + os.link(name_src, name_tgt) + return True + +def execute_error(msg): + log_error('mddiff failed: '+msg) + if msg.startswith("Unable to open directory"): + log_tags("mddiff", "directory-disappeared", false) + else: + log_tags("mddiff", "unknown", true) + + # return (trace(false)) + return False + + +def execute(cmd): + """The main switch, dispatching actions.""" + opcode = cmd.split()[0] + + if opcode == "ADD": + name, hsha, bsha = re.compile(r'^ADD (\S+) (\S+) (\S+)$').match(cmd).groups() + name = url_decode(name) + mkdir_p(name) + return execute_add(name, hsha, bsha) + + elif opcode == "DELETE": + name, hsha, bsha = re.compile(r'^DELETE (\S+) (\S+) (\S+)$').match(cmd).groups() + name = url_decode(name) + mkdir_p(name) + return execute_delete(name, hsha, bsha) + + elif opcode == "COPY": + name_src, hsha, bsha, name_tgt = re.compile( + r'COPY (\S+) (\S+) (\S+) TO (\S+)$').match(cmd).groups() + name_src = url_decode(name_src) + name_tgt = url_decode(name_tgt) + mkdir_p(name_src) + mkdir_p(name_tgt) + return execute_copy(name_src, hsha, bsha, name_tgt) + + elif opcode in ['REPLACEHEADER', 'COPYBODY', 'REPLACE']: + log_internal_error_and_fail(opcode + ' was called: ' + cmd) + return False + + elif opcode == "ERROR": + msg = cmd[cmd.index(' ')+1:] + return execute_error(msg) + + else: + error("Unknown opcode " + opcode) + return False + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument('-v', '--verbose', action='store_true', default=False) + parser.add_argument('-d', '--dry-run', action='store_true', default=False) + parser.add_argument('-t', '--translator', type=str, default='cat') + parser.add_argument('endpoint') + parser.add_argument('mailboxes') + + opts = parser.parse_args() + + set_translator(opts.translator) + read_message_ids() + + dbfile = dbfile_name(opts.endpoint, opts.mailboxes) + xdelta = dbfile + '.xdelta' + newdb = dbfile + '.new' + + if opts.mailboxes[0] == '/': + log_error("Absolute paths are not supported: " + opts.mailboxes) + log_tags_and_fail("Absolute path detected", "main", "mailbox-has--absolute-path", True) + + handshake(dbfile) + commands = receive_delta(stdin) + for cmd in commands: + try: + rc = execute(cmd) + # Just wrap the whole thing in try-except to abort "cleanly" + except Exception as e: + log_error("Got an exception when processing {}: {}".format(cmd.strip(), str(e))) + log_error(traceback.format_exc()) + rc = False + if not rc: + stdout.write('ABORT\n') + stdout.flush() + sys.exit(3) + + # if len(get_full_email_queue) > queue_max_len: + # process_pending_queue() + + # some commands may still be in the queue, we fire them now + # process_pending_queue() + + # we commit and update the dbfile + stdout.write('COMMIT\n') + stdout.flush() + receive(stdin, xdelta) + + rc = subprocess.call([XDELTA, 'patch', xdelta, dbfile, newdb]) + if rc != 0 and rc != 256: + log_error('Unable to apply delta to dbfile.') + stdout.write('ABORT\n') + stdout.flush() + sys.exit(4) + + try: + os.rename(newdb, dbfile) + except OSError: + log_error('Unable to rename ' + newdb + ' to ' + dbfile) + stdout.write('ABORT\n') + stdout.flush() + sys.exit(5) + + os.unlink(xdelta) + stdout.write('DONE\n') + stdout.flush() + + #log_tag('stats::new-mails(' + statistics.added + + #'), del-mails(' + statistics.removed + ')') + +CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-' + +encode_re = '([^{0}])'.format(CHARSET) + +def encode_one_char(match): + return('%{:02x}'.format(ord(match.group(1)))) + +def encode_for_fs(str): + return re.sub(encode_re,encode_one_char, str,0) + +def mangle_message_id(msg_id): + """ + Return a mangled version of the message id, suitable for use as a filename. + """ + MAX_LENGTH = 143 + FLAGS_LENGTH = 8 # :2,S...?? + encoded = encode_for_fs(msg_id) + if len(encoded) < MAX_LENGTH - FLAGS_LENGTH: + return encoded + + SHA_LENGTH = 8 + TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH + PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2 + prefix = encoded[:PREFIX_LENGTH] + suffix = encoded[-SUFFIX_LENGTH:] + sha = hashlib.sha256() + sha.update(encoded) + return prefix + '...' + suffix + sha.hexdigest()[:SHA_LENGTH] + +MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES = {} +DB = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY) +def read_message_ids(): + # We can't base this on tags at all because tags aren't applied yet + querystr = '*' + + q_new = notmuch.Query(DB, querystr) + q_new.set_sort(notmuch.Query.SORT.UNSORTED) + for msg in q_new.search_messages(): + mangled_id = mangle_message_id(msg.get_message_id()) + fiter = msg.get_filenames() + # list(fiter) gives me a NotInitializedException???? + filenames = [] + while True: + try: + filename = next(fiter) + filenames.append(filename) + except StopIteration: + break + + MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES[mangled_id] = filenames + +def original_message_filenames(mangled_filename): + if mangled_filename not in MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES: + log_error("{} not in notmuch. Trying to tag nonexistant message?".format( + mangled_filename)) + return MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES[mangled_filename] + +if __name__ == '__main__': + try: + main() + except Exception as e: + log_error(str(e)) + log_error(traceback.format_exc()) + sys.exit(6) + +--=-=-=--