Return-Path: X-Original-To: notmuch@notmuchmail.org Delivered-To: notmuch@notmuchmail.org Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id 22889431FAF for ; Sun, 6 Oct 2013 21:49:52 -0700 (PDT) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: -0.799 X-Spam-Level: X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5 tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id l0dk1rHivO6h for ; Sun, 6 Oct 2013 21:49:44 -0700 (PDT) Received: from mail-qe0-f53.google.com (mail-qe0-f53.google.com [209.85.128.53]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) (No client certificate requested) by olra.theworths.org (Postfix) with ESMTPS id 83F48431FAE for ; Sun, 6 Oct 2013 21:49:44 -0700 (PDT) Received: by mail-qe0-f53.google.com with SMTP id cy11so956017qeb.26 for ; Sun, 06 Oct 2013 21:49:42 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=from:to:subject:in-reply-to:references:user-agent:date:message-id :mime-version:content-type; bh=djCxR7kr3DGkKxSNNOoNrBe3lo2uI3tzo52A5puB6eI=; b=W9YJGZc3BuQI5wlBCt6FgoIy59LdhOiQsMaW1yejCmq8RkflhNV3kWbL6q0NPI7P6/ 8LBCDceH4hfL6x/wTr7q2bYrWlBgRkkyrBBVhax7MpiY0aGL1+pzuM5tpN01d0P65R9J /6DXChty02CS4Z/fenIyWN1nWlkkZhuBYJpPjgNzpMpwNC3vZu11w56WD5u/xx7gKT5J L3imy4r2lwNHTjWSxDD0yPDYuxQkxYWfgWCDQtBIjZtsKD7PGA3Sq9HT2a+BuOZ1tQIt oHD28zD4wPfjyYD+f9Q3v4FH+Rhx0RQEGRKdijahPqJ4tsv1VBDVikKUw6j2wAWPIuug 3UmQ== X-Received: by 10.224.11.133 with SMTP id t5mr35139503qat.34.1381121382839; Sun, 06 Oct 2013 21:49:42 -0700 (PDT) Received: from smtp.gmail.com ([66.114.71.21]) by mx.google.com with ESMTPSA id g2sm58448024qaf.12.1969.12.31.16.00.00 (version=TLSv1.2 cipher=RC4-SHA bits=128/128); Sun, 06 Oct 2013 21:49:41 -0700 (PDT) 
From: Ethan Glasser-Camp To: David Bremner , notmuch mailing list Subject: Re: On disk tag storage format In-Reply-To: <87fvsgh5g5.fsf@betacantrips.com> References: <874nk8v9zw.fsf@zancas.localnet> <87vc9mtpxh.fsf@zancas.localnet> <87fvsgh5g5.fsf@betacantrips.com> User-Agent: Notmuch/0.16+80~g81ee785 (http://notmuchmail.org) Emacs/24.2.1 (x86_64-pc-linux-gnu) Date: Mon, 07 Oct 2013 00:49:39 -0400 Message-ID: <87bo31heho.fsf@betacantrips.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="=-=-=" X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 07 Oct 2013 04:49:52 -0000 --=-=-= Content-Type: text/plain Ethan Glasser-Camp writes: > I've modified the script so that it would run by mangling filenames, > which is irreversible (the original tried to encode/decode filenames > reversibly). Then I got a little carried away, adding --verbose and > --dry-run options as well as removing a couple trailing > semicolons. Here's my version, in case it should interest anyone else. Hi guys, There was a bug in the previous version I sent. It didn't handle unlinking tags correctly. Also, I spotted a bug in syncing to untagged messages. Maybe I should stop using emails as version control. ---- 8< ---- --=-=-= Content-Type: text/x-python Content-Disposition: inline; filename=linksync.py Content-Description: slightly more tested this time # Copyright 2013, David Bremner # Licensed under the same terms as notmuch. 
import notmuch
import re
import os
import errno
import sys
from collections import defaultdict
import argparse
import hashlib

# Skip automatic and maildir tags.  The automatic tag is spelled
# "attachment"; the previous pattern misspelled it and so never skipped it.
skiptags = re.compile(
    r"^(attachment|signed|encrypted|draft|flagged|passed|replied|unread)$")


def mkdir_p(path):
    """Create ``path`` and any missing parents.

    An already existing directory is not an error; every other OSError
    (including "exists but is not a directory") is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


VERBOSE = False


def log(msg):
    """Print ``msg`` when --verbose was given."""
    if VERBOSE:
        print(msg)


CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-'
encode_re = '([^{0}])'.format(CHARSET)
# NOTE(review): only %00-%7f are decoded here, while encode_one_char() can
# emit larger code points, so non-ASCII tag names do not round-trip.
decode_re = '[%]([0-7][0-9A-Fa-f])'


def encode_one_char(match):
    """Return the %xx escape for the single character captured by ``match``."""
    return '%{:02x}'.format(ord(match.group(1)))


def encode_for_fs(str):
    """Percent-escape every character of ``str`` that is not in CHARSET."""
    return re.sub(encode_re, encode_one_char, str, 0)


def mangle_message_id(msg_id):
    """
    Return a mangled version of the message id, suitable for use as a
    filename.

    Short ids are simply escaped.  Long ids are cut down to a prefix, a
    literal '...', a suffix and the first hex digits of the SHA-256 of the
    escaped id, so the mangling is not reversible.
    """
    MAX_LENGTH = 143
    FLAGS_LENGTH = 8    # :2,S...??
    encoded = encode_for_fs(msg_id)
    if len(encoded) < MAX_LENGTH - FLAGS_LENGTH:
        return encoded

    SHA_LENGTH = 8
    TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH
    PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2
    prefix = encoded[:PREFIX_LENGTH]
    suffix = encoded[-SUFFIX_LENGTH:]
    sha = hashlib.sha256()
    # The escaped id is pure ASCII, so encoding it yields the same digest as
    # before while also satisfying hashlib where it insists on bytes.
    sha.update(encoded.encode('utf-8'))
    return prefix + '...' + suffix + sha.hexdigest()[:SHA_LENGTH]


def decode_one_char(match):
    """Return the character for the hex digits captured by ``match``."""
    return chr(int(match.group(1), 16))


def decode_from_fs(str):
    """Undo encode_for_fs() for the escapes matched by ``decode_re``."""
    return re.sub(decode_re, decode_one_char, str, 0)


def mk_tag_dir(tagdir):
    """Create the cur/new/tmp maildir layout below ``tagdir``."""
    for subdir in ('cur', 'new', 'tmp'):
        mkdir_p(os.path.join(tagdir, subdir))


flagpart = '(:2,[^:]*)'
flagre = re.compile(flagpart + '$')


def path_for_msg(dir, msg):
    """Return the link path for ``msg`` inside the tag directory ``dir``.

    The name is the mangled message id followed by the ':2,...' flag suffix
    of the message's own filename, when it has one.
    """
    flagsmatch = flagre.search(msg.get_filename())
    flags = '' if flagsmatch is None else flagsmatch.group(1)
    return os.path.join(dir, 'cur',
                        mangle_message_id(msg.get_message_id()) + flags)


def unlink_message(dir, msg):
    """Remove every link for ``msg`` (whatever its flags) from ``dir``/cur.

    Reads the module-level ``opts`` to honour --dry-run.
    """
    dir = os.path.join(dir, 'cur')
    mangled_id = mangle_message_id(msg.get_message_id())
    filere = re.compile('^' + re.escape(mangled_id) + flagpart + '?$')

    for file in os.listdir(dir):
        if filere.match(file):
            target = os.path.join(dir, file)
            log("Unlinking {}".format(target))
            if not opts.dry_run:
                os.unlink(target)


def dir_for_tag(tag):
    """Return the directory below the module-level ``tagroot`` for ``tag``."""
    return os.path.join(tagroot, encode_for_fs(tag))


disk_tags = defaultdict(set)
disk_ids = set()


def read_tags_from_disk(rootdir):
    """Fill ``disk_tags`` and ``disk_ids`` from the link farm at ``rootdir``.

    Only files located at ``rootdir/<tag>/<subdir>/<file>`` are considered;
    anything at another depth is ignored instead of raising IndexError or
    producing a bogus tag name.
    """
    for root, subFolders, files in os.walk(rootdir):
        if not files:
            continue
        parts = os.path.relpath(root, rootdir).split(os.sep)
        if len(parts) != 2:
            continue
        tag = decode_from_fs(parts[0])
        for filename in files:
            mangled_id = filename.split(':')[0]
            disk_ids.add(mangled_id)
            disk_tags[mangled_id].add(tag)


# Main program

parser = argparse.ArgumentParser(
    description='Sync notmuch tag database to/from link farm')
parser.add_argument('-l', '--link-style',
                    choices=['hard', 'symbolic', 'adaptive'],
                    default='adaptive')
parser.add_argument('-d', '--destination', choices=['disk', 'notmuch'],
                    default='disk')
# A plain int literal: the former 50000L only parses on Python 2.
parser.add_argument('-t', '--threshold', default=50000, type=int)
parser.add_argument('-n', '--dry-run', default=False, action='store_true')
parser.add_argument('-v', '--verbose', default=False, action='store_true')
parser.add_argument('tagroot')

opts = parser.parse_args()
VERBOSE = opts.verbose
tagroot = opts.tagroot

sync_from_links = (opts.destination == 'notmuch')

read_tags_from_disk(tagroot)

if sync_from_links:
    db = notmuch.Database(mode=notmuch.Database.MODE.READ_WRITE)
else:
    db = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY)

# Empty tags are dropped here as well so they never produce a bare "tag:".
dbtags = [tag for tag in db.get_all_tags()
          if tag != '' and not skiptags.match(tag)]

if sync_from_links:
    # have to iterate over even untagged messages
    querystr = '*'
else:
    querystr = ' OR '.join('tag:' + tag for tag in dbtags)

q_new = notmuch.Query(db, querystr)
q_new.set_sort(notmuch.Query.SORT.UNSORTED)
for msg in q_new.search_messages():

    # silently ignore empty tags
    db_tags = set(tag for tag in msg.get_tags()
                  if tag != '' and not skiptags.match(tag))

    message_id = msg.get_message_id()
    mangled_id = mangle_message_id(message_id)

    # Whatever is left in disk_ids after the loop has no message in this
    # query result.
    disk_ids.discard(mangled_id)

    missing_on_disk = db_tags.difference(disk_tags[mangled_id])
    missing_in_db = disk_tags[mangled_id].difference(db_tags)

    if sync_from_links:
        msg.freeze()

    filename = msg.get_filename()

    # The link style only matters when links are about to be created, so the
    # message file is not stat()ed when syncing towards notmuch.
    if missing_on_disk and not sync_from_links:
        if opts.link_style == 'adaptive':
            symlink = os.stat(filename).st_size > opts.threshold
        else:
            symlink = opts.link_style == 'symbolic'

    for tag in missing_on_disk:
        if sync_from_links:
            log("Removing tag {} from {}".format(tag, message_id))
            if not opts.dry_run:
                msg.remove_tag(tag, sync_maildir_flags=False)
        else:
            tagdir = dir_for_tag(tag)

            if not opts.dry_run:
                mk_tag_dir(tagdir)

            newlink = path_for_msg(tagdir, msg)

            log("Linking {} to {}".format(filename, newlink))
            if not opts.dry_run:
                if symlink:
                    os.symlink(filename, newlink)
                else:
                    os.link(filename, newlink)

    for tag in missing_in_db:
        if sync_from_links:
            log("Adding {} to message {}".format(tag, message_id))
            if not opts.dry_run:
                msg.add_tag(tag, sync_maildir_flags=False)
        else:
            tagdir = dir_for_tag(tag)
            unlink_message(tagdir, msg)

    if sync_from_links:
        msg.thaw()

# everything remaining in disk_ids is a deleted message
# unless we are syncing back to the database, in which case
# it just might not currently have any non maildir tags.
if not sync_from_links: for root, subFolders, files in os.walk(tagroot): for filename in files: mangled_id = filename.split(':')[0] if mangled_id in disk_ids: os.unlink(os.path.join(root, filename)) db.close() # currently empty directories are not pruned. --=-=-= Content-Type: text/plain ---- 8< ---- Of course, the next step is to sync using this mechanism. Rsync doesn't really have a concept of history, which basically makes it unusable for this purpose [1]. Unison doesn't really understand renames, so it gets confused when you mark a message as read (which might move it from new to cur, and definitely changes its tags). Bremner suggested syncmaildir. Syncmaildir doesn't understand links at all. Bremner suggested that we could use some parts of syncmaildir to implement the tag syncing we need. I didn't have anything else going on this weekend so I tried to prototype the approach. It turns out to be possible to leverage some parts of syncmaildir. I translated a bunch of smd-client into a new program, tagsync-client, that links to messages in an existing notmuch DB. It seems like it's possible to use it in place of the existing smd-client by putting lines like this in your config: SMDCLIENT=~/src/tagsync.git/tagsync-client.py REMOTESMDCLIENT=~/src/tagsync.git/tagsync-client.py The sequence of commands I ran: - linksync.py to dump tags to ~/Mail/.notmuch/exported-tags - smd-pull mail to sync ~/Mail but excluding .notmuch - notmuch new - smd-pull tagsync (using the above client) to sync ~/Mail/.notmuch/exported-tags - linksync.py to pull tags from ~/Mail/.notmuch/exported-tags syncmaildir doesn't cope well with drafts, so it might choke on that, and it doesn't like symlinks (it thinks they're always to directories), so be sure to run linksync with -l hard. Here's the script. It's a work in progress; I have only tested it once in one direction. 
---- 8< ---- --=-=-= Content-Type: text/x-python Content-Disposition: inline; filename=tagsync-client.py Content-Description: client script #! /usr/bin/env python import sys from sys import stdout, stdin, stderr import stat import urllib import hashlib import re import os.path import argparse import subprocess import traceback import notmuch import time PROTOCOL_VERSION = "1.1" # Not reproducing the autoconf logic XDELTA = 'xdelta' MDDIFF = 'mddiff' VERBOSE = False def log(msg): if VERBOSE: stderr.write("INFO: "+msg+"\n") def __error(msg): raise ValueError(msg) def log_tags_and_fail(msg, *args): log_tags(*args) __error(msg) def log_internal_error_and_fail(msg, *args): log_internal_error_tags(msg, *args) __error(msg) def log_error(msg): return stderr.write("ERROR: {}\n".format(msg)) def log_tag(tag): return stderr.write("TAGS: {}\n".format(tag)) def log_progress(msg): pass def log_tags(context='unknown', cause='unknown', human=False, *args): if human: human = "necessary" else: human = "avoidable" suggestions = {} suggestions_string = "" if len(args): suggestions_string = ' suggested-actions({})'.format(' '.join(args)) return log_tag("error::context({}) probable-cause({}) human-intervention({})".format( context, cause, human) + suggestions_string) def mkdir_p(filename): """Maildir-aware mkdir. Creates a directory and all parent directories. Moreover, if the last component is 'tmp', 'cur' or 'new', the others are created too.""" # The Lua function throws away the last path component if it # doesn't end in /. This allows you to just call mkdir_p on any # file and a directory for it to live will be created. if not filename.endswith('/'): filename, _ = os.path.split(filename) if not filename.startswith('/'): # This path is relative to HOME, and needs to be translated # too. 
filename = translate(filename) filename = os.path.expanduser('~/'+filename) dirname, basename = os.path.split(filename) try: os.makedirs(filename) except OSError: pass # probably "File exists" MAILDIR_SUBDIRS = ['tmp', 'cur', 'new'] if basename in MAILDIR_SUBDIRS: for subdir in MAILDIR_SUBDIRS: to_create = os.path.join(dirname, subdir) if not os.path.exists(to_create): os.mkdir(to_create) class FakeSubprocess(object): def __init__(self, init_function): self.init_function = init_function self.input = None self.output = None self.pipe_name = None self.removed = None self.did_write = None self.filter = {} def readline(self): if not self.input: log_internal_error_and_fail("read called before write", "make_slave_filter_process") if not self.removed and self.did_write: self.removed = True rc = self.input.readline() os.unlink(self.pipe_name) return rc else: return self.input.readline() def write(self, *args): if not self.output: self.init_function(self.filter) self.input = self.filter['inf'] self.output = self.filter['outf'] self.pipe_name = self.filter['pipe'] self.did_write = True self.output.write(*args) def flush(self): self.output.flush() def lines(self): return self.input.readlines() def make_slave_filter_process(cmd, seed="no seed"): def init(filter): if 'inf' not in filter: home = os.getenv('HOME') user = os.getenv('USER') or 'nobody' mangled_name = re.compile('[ %./]').sub('-', seed) attempt = 0 if home: base_dir = home + '/.smd/fifo/' else: base_dir = '/tmp/' rc = subprocess.call([MDDIFF, '--mkdir-p', base_dir]) if rc != 0: log_internal_error_and_fail('unable to create directory', 'make_slave_filter_process') while True: pipe_name = ''.join([base_dir, 'smd-', user, str(int(time.time())), mangled_name, str(attempt)]) attempt += 1 rc = subprocess.call([MDDIFF, '--mkfifo', pipe_name]) if rc == 0 or attempt > 10: break if rc != 0: log_internal_error_and_fail('unable to create fifo', "make_slave_filter_process") inferior = cmd(pipe_name) filter['inf'] = 
inferior.stdout filter['outf'] = file(pipe_name, 'w') filter['pipe'] = pipe_name return FakeSubprocess(init) _translator = None def set_translator(p): global _translator translator_filter = make_slave_filter_process( lambda pipe: subprocess.Popen(p, stdin=file(pipe), stdout=subprocess.PIPE), "translate") if p == 'cat': _translator = lambda x: x else: def translator_fn(x): translator_filter.write(x + '\n') translator_filter.flush() line = translator_filter.readline() if not line or line.strip() == 'ERROR': log_error("Translator {} on input {} gave an error".format( p, x)) for l in translator_filter.readlines(): log_error(l) log_tags_and_fail("Unable to translate mailbox", 'translate', 'bad-translator', True) if '..' in line: log_error("Translator {} on input {} returned a path containing ..".format( p, x)) log_tags_and_fail('Translator returned a path containing ..', 'translate', 'bad-translator', True) return line _translator = translator_fn def translate(x): if _translator: return _translator(x) return x mddiff_sha_handler = make_slave_filter_process( lambda pipe: subprocess.Popen([MDDIFF, pipe], stdout=subprocess.PIPE), "sha_file") def sha_file(name): mddiff_sha_handler.write(name+'\n') mddiff_sha_handler.flush() data = mddiff_sha_handler.readline() if data.startswith('ERROR'): log_tags_and_fail("Failed to sha1 message: " + (name or "nil"), 'sha_file', 'modify-while-update', False, 'retry') hsha, bsha = data.split() if not hsha or not bsha: log_internal_error_and_fail('mddiff incorrect behavior', 'mddiff') return hsha, bsha def exists_and_sha(name): if os.path.exists(name): h, b = sha_file(name) return True, h, b return False, False, False def touch(f): try: file(f, 'r') except IOError: try: file(f, 'w') except IOError: log_error('Unable to touch ' + quote(f)) log_tags("touch", "bad-permissions", True, "display-permissions(" + quote(f) + ")") error("Unable to touch a file") def quote(s): return repr(s) def assert_exists(name): assert os.exists(name), "Not found: 
"+repr(name) def url_quote(txt): return urllib.quote(txt, safe='') def url_decode(s): return urllib.unquote(s) def log_internal_error_tags(msg, ctx): log_tags('internal-error', ctx, True) # Blob of "run gnome-open" junk not copied def receive(inf, outfile): try: outf = file(outfile, 'w') except IOError: log_error("Unable to open " + outfile + " for writing.") log_error('It may be caused by bad directory permissions, '+ 'please check.') log_tags("receive", "non-writeable-file", True, "display-permissions(" + quote(outfile) +")") error("Unable to write incoming data") line = inf.readline() if not line or line.strip() == "ABORT": log_error("Data transmission failed.") log_error("This problem is transient, please retry.") log_tags_and_fail('server sent ABORT or connection died', "receive", "network", False, "retry") # In the Lua version, this is called "len", but that's a builtin # in Python chunk_len = int(re.compile(r'^chunk (\d+)').match(line).group(1)) total = chunk_len while chunk_len: next_chunk = 16384 if chunk_len < next_chunk: next_chunk = chunk_len data = inf.read(next_chunk) chunk_len -= len(data) outf.write(data) # Probably not strictly speaking necessary in Python outf.close() return total def handshake(dbfile): stdout.write("protocol {}\n".format(PROTOCOL_VERSION)) touch(dbfile) sha_output = subprocess.check_output([MDDIFF, '--sha1sum', dbfile]) db_sha = sha_output.split()[0] err_msg = sha_output[sha_output.index(' ')+1:] if db_sha == 'ERROR': log_internal_error_and_fail('unreadable db file: '+quote(dbfile), 'handshake') stdout.write("dbfile {}\n".format(db_sha)) stdout.flush() line = stdin.readline() if not line: log_error("Network error.") log_error("Unable to get any data from the other endpoint.") log_error("This problem may be transient, please retry.") log_error("Hint: did you correctly setup the SERVERNAME variable") log_error("on your client? 
Did you add an entry for it in your ssh") log_error("configuration file?") log_tags_and_fail('Network error', "handshake", "network", False, "retry") protocol = re.compile('^protocol (.+)$').match(line) if not protocol or protocol.group(1) != PROTOCOL_VERSION: log_error('Wrong protocol version.') log_error('The same version of syncmaildir must be used on '+ 'both endpoints') log_tags_and_fail('Protocol version mismatch', "handshake", "protocol-mismatch", True) line = stdin.readline() if not line: log_error("The client disconnected during handshake") log_tags_and_fail('Network error', "handshake", "network", False, "retry") sha = re.compile(r'^dbfile (\S+)$').match(line) if not sha or sha.group(1) != db_sha: log_error('Local dbfile and remote db file differ.') log_error('Remove both files and push/pull again.') log_tags_and_fail('Database mismatch', "handshake", "db-mismatch", True, "run(rm "+ quote(dbfile)+")") def dbfile_name(endpoint, mailboxes): endpoint = endpoint.rstrip('/') mailboxes = mailboxes.rstrip('/') subprocess.check_call([MDDIFF, '--mkdir-p', os.path.expanduser('~/.smd/')]) return os.path.expanduser('~/.smd/{}__{}.db.txt'.format( endpoint.replace('/', '_'), mailboxes.replace('/', '_').replace('%', '_') )) def receive_delta(inf): cmds = [] while True: line = inf.readline() if line and line.strip() != "END": cmds.append(line) if not line or line.strip() == "END": break if line.strip() != "END": log_error('Unable to receive a complete diff') log_tags("receive-delta", "network", False, "retry") error("network error while receiving delta") return cmds def homefy(filename): return os.path.expanduser("~/"+filename) def execute_add(name, hsha, bsha): dir, basename = os.path.split(name) # The real smd creates symlinks from workarea to the target # directory, I dunno why. 
dest = homefy(name) ex, hsha_1, bsha_1 = exists_and_sha(dest) if ex: if hsha_1 != hsha or bsha_1 != bsha: log_error("Failed to add {} since a file with the same name".format( dest)) log_error('exists but its content is different.') log_error("Current hash {}/{}, requested hash {}/{}".format( hsha_1, bsha_1, hsha, bsha)) log_error('To fix this problem you should rename '+dest) log_error('Executing `cd; mv -n '+quote(name)+' '+ 'FIXME: tmp_for' +'` should work.') log_tags("mail-addition", "concurrent-mailbox-edit", True, ) #mk_act("mv", name)) return False return True # already there if ':2,' in basename: basename = basename[:basename.index(':2,')] filenames = original_message_filenames(basename) for filename in filenames: orig_exists, hsha_orig, bsha_orig = exists_and_sha(filename) assert orig_exists if hsha_orig == hsha or bsha_orig == bsha: os.link(filename, dest) return True log_error("Something seriously wrong here: we tried to link {}".format( filename)) log_error("to {} but the hashes were wrong. We wanted {}/{}".format( dest, hsha, bsha)) log_error("but we didn't see that in {}".format(filenames)) log_tags_and_fail('Mail corpus wrong') # FIXME: How do we decide whether to use symlinks or not? 
# Seems like syncmaildir can't cope with symlinks, so let's just # always use hard links return False def execute_delete(name, hsha, bsha): name = homefy(name) ex, hsha_1, bsha_1 = exists_and_sha(name) assert ex assert hsha_1 == hsha assert bsha_1 == bsha os.unlink(name) return True def execute_copy(name_src, hsha, bsha, name_tgt): name_src = homefy(name_src) name_tgt = homefy(name_tgt) ex_src, hsha_src, bsha_src = exists_and_sha(name_src) ex_tgt, hsha_tgt, bsha_tgt = exists_and_sha(name_tgt) # Not reproducing all logic assert ex_src assert not ex_tgt assert hsha == hsha_src assert bsha == bsha_src if stat.S_ISLNK(os.stat(name_src).st_mode): link_tgt = os.readlink(name_src) os.symlink(link_tgt, name_tgt) else: os.link(name_src, name_tgt) return True def execute_error(msg): log_error('mddiff failed: '+msg) if msg.startswith("Unable to open directory"): log_tags("mddiff", "directory-disappeared", false) else: log_tags("mddiff", "unknown", true) # return (trace(false)) return False def execute(cmd): """The main switch, dispatching actions.""" opcode = cmd.split()[0] if opcode == "ADD": name, hsha, bsha = re.compile(r'^ADD (\S+) (\S+) (\S+)$').match(cmd).groups() name = url_decode(name) mkdir_p(name) return execute_add(name, hsha, bsha) elif opcode == "DELETE": name, hsha, bsha = re.compile(r'^DELETE (\S+) (\S+) (\S+)$').match(cmd).groups() name = url_decode(name) mkdir_p(name) return execute_delete(name, hsha, bsha) elif opcode == "COPY": name_src, hsha, bsha, name_tgt = re.compile( r'COPY (\S+) (\S+) (\S+) TO (\S+)$').match(cmd).groups() name_src = url_decode(name_src) name_tgt = url_decode(name_tgt) mkdir_p(name_src) mkdir_p(name_tgt) return execute_copy(name_src, hsha, bsha, name_tgt) elif opcode in ['REPLACEHEADER', 'COPYBODY', 'REPLACE']: log_internal_error_and_fail(opcode + ' was called: ' + cmd) return False elif opcode == "ERROR": msg = cmd[cmd.index(' ')+1:] return execute_error(msg) else: error("Unknown opcode " + opcode) return False def main(): parser = 
argparse.ArgumentParser(description="") parser.add_argument('-v', '--verbose', action='store_true', default=False) parser.add_argument('-d', '--dry-run', action='store_true', default=False) parser.add_argument('-t', '--translator', type=str, default='cat') parser.add_argument('endpoint') parser.add_argument('mailboxes') opts = parser.parse_args() set_translator(opts.translator) read_message_ids() dbfile = dbfile_name(opts.endpoint, opts.mailboxes) xdelta = dbfile + '.xdelta' newdb = dbfile + '.new' if opts.mailboxes[0] == '/': log_error("Absolute paths are not supported: " + opts.mailboxes) log_tags_and_fail("Absolute path detected", "main", "mailbox-has--absolute-path", True) handshake(dbfile) commands = receive_delta(stdin) for cmd in commands: try: rc = execute(cmd) # Just wrap the whole thing in try-except to abort "cleanly" except Exception as e: log_error("Got an exception when processing {}: {}".format(cmd.strip(), str(e))) log_error(traceback.format_exc()) rc = False if not rc: stdout.write('ABORT\n') stdout.flush() sys.exit(3) # if len(get_full_email_queue) > queue_max_len: # process_pending_queue() # some commands may still be in the queue, we fire them now # process_pending_queue() # we commit and update the dbfile stdout.write('COMMIT\n') stdout.flush() receive(stdin, xdelta) rc = subprocess.call([XDELTA, 'patch', xdelta, dbfile, newdb]) if rc != 0 and rc != 256: log_error('Unable to apply delta to dbfile.') stdout.write('ABORT\n') stdout.flush() sys.exit(4) try: os.rename(newdb, dbfile) except OSError: log_error('Unable to rename ' + newdb + ' to ' + dbfile) stdout.write('ABORT\n') stdout.flush() sys.exit(5) os.unlink(xdelta) stdout.write('DONE\n') stdout.flush() #log_tag('stats::new-mails(' + statistics.added + #'), del-mails(' + statistics.removed + ')') CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-' encode_re = '([^{0}])'.format(CHARSET) def encode_one_char(match): return('%{:02x}'.format(ord(match.group(1)))) def 
encode_for_fs(str): return re.sub(encode_re,encode_one_char, str,0) def mangle_message_id(msg_id): """ Return a mangled version of the message id, suitable for use as a filename. """ MAX_LENGTH = 143 FLAGS_LENGTH = 8 # :2,S...?? encoded = encode_for_fs(msg_id) if len(encoded) < MAX_LENGTH - FLAGS_LENGTH: return encoded SHA_LENGTH = 8 TRUNCATED_ID_LENGTH = MAX_LENGTH - SHA_LENGTH - FLAGS_LENGTH PREFIX_LENGTH = SUFFIX_LENGTH = (TRUNCATED_ID_LENGTH - 3) // 2 prefix = encoded[:PREFIX_LENGTH] suffix = encoded[-SUFFIX_LENGTH:] sha = hashlib.sha256() sha.update(encoded) return prefix + '...' + suffix + sha.hexdigest()[:SHA_LENGTH] MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES = {} DB = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY) def read_message_ids(): # We can't base this on tags at all because tags aren't applied yet querystr = '*' q_new = notmuch.Query(DB, querystr) q_new.set_sort(notmuch.Query.SORT.UNSORTED) for msg in q_new.search_messages(): mangled_id = mangle_message_id(msg.get_message_id()) fiter = msg.get_filenames() # list(fiter) gives me a NotInitializedException???? filenames = [] while True: try: filename = next(fiter) filenames.append(filename) except StopIteration: break MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES[mangled_id] = filenames def original_message_filenames(mangled_filename): if mangled_filename not in MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES: log_error("{} not in notmuch. Trying to tag nonexistant message?".format( mangled_filename)) return MESSAGE_MANGLED_FILENAMES_TO_ORIGINAL_FILENAMES[mangled_filename] if __name__ == '__main__': try: main() except Exception as e: log_error(str(e)) log_error(traceback.format_exc()) sys.exit(6) --=-=-=--