From 93c2b7629442076fa9473968816d2b5ab9a77759 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 27 Oct 2014 21:43:14 -0700 Subject: [PATCH] WIP: ssoma-mda: Translate to Python3.3+, using pygit2 (~0.21.3+) I'll test with older versions of pygit2 to figure out where the cutoff is. And without pygit2 in Debian (even Debian testing), it gets a bit harder to get this going. You should be able to use: $ apt-get install libgit2-dev $ pip install cffi $ pip install pygit2 but I haven't fired up a Debian image to test that yet. --- Documentation/ssoma-mda.txt | 50 ------ INSTALL | 31 +--- MANIFEST | 29 ---- README | 32 +--- lib/Ssoma/Git.pm | 292 ---------------------------------- lib/Ssoma/GitIndexInfo.pm | 49 ------ lib/Ssoma/MDA.pm | 138 ---------------- ssoma-mda | 308 +++++++++++++++++++++++++++++++++--- t/mda-badheaders.t | 57 ------- t/mda-conflict.t | 107 ------------- t/mda-missing-mid.t | 26 --- 11 files changed, 301 insertions(+), 818 deletions(-) delete mode 100644 Documentation/ssoma-mda.txt delete mode 100644 MANIFEST delete mode 100644 lib/Ssoma/Git.pm delete mode 100644 lib/Ssoma/GitIndexInfo.pm delete mode 100644 lib/Ssoma/MDA.pm delete mode 100644 t/mda-badheaders.t delete mode 100644 t/mda-conflict.t delete mode 100644 t/mda-missing-mid.t diff --git a/Documentation/ssoma-mda.txt b/Documentation/ssoma-mda.txt deleted file mode 100644 index 3b6757b..0000000 --- a/Documentation/ssoma-mda.txt +++ /dev/null @@ -1,50 +0,0 @@ -% ssoma-mda(1) ssoma user manual - -# NAME - -ssoma-mda - mail delivery agent for ssoma - -# SYNOPSIS - -ssoma-mda [-1] /path/to/ssoma/repository.git < message - -# DESCRIPTION - -ssoma-mda delivers messages to a git repository as described by -ssoma_repository(5). It reads messages from STDIN and takes no -command-line arguments. It may be invoked by the MTA (mail transport -agent, e.g. postfix or exim) or as part of another MDA (e.g. procmail or -maildrop) - -ssoma-mda takes no command-line options and does not alter its own -permissions. This must be done by the MTA or MDA which invokes -ssoma-mda. - -# OPTIONS - --1 -: Only allow a Message-ID to appear once in the database. - Future messages with an identical Message-ID will not be allowed. - -# FILES - -See ssoma_repository(5) for details. - -# ENVIRONMENT - -ssoma-mda depends on no environment variables - -# CONTACT - -All feedback welcome via plain-text mail to \ -The mail archives are hosted at git://public-inbox.org/meta -See ssoma(1) for instructions on how to subscribe. - -# COPYRIGHT - -Copyright 2013, Eric Wong and all contributors.\ -License: AGPLv3 or later - -# SEE ALSO - -git(1), ssoma(1), ssoma_repository(5) diff --git a/INSTALL b/INSTALL index 691b122..d8755a9 100644 --- a/INSTALL +++ b/INSTALL @@ -1,8 +1,6 @@ -ssoma (client-side) installation --------------------------------- -This is for users who wish to follow archives hosted on public-inbox. -We only use commonly-available Perl modules available on Debian-based -distributions (and not necessarily modern ones!) +ssoma-mda installation +---------------------- +This is for users who wish to add email to archives. If you have problems or comments on installation, please send a plain-text email to: meta@public-inbox.org @@ -18,28 +16,15 @@ You may also clone using git: git clone git://80x24.org/ssoma -standard MakeMaker installation (Perl) --------------------------------------- +standard distutils installation +------------------------------- - perl Makefile.PL - make - make test - make install # root permissions may be needed + TODO Requirements ------------ -All packages should be easily available in Debian GNU/Linux and derived -distros. Debian package names for Perl modules are are listed for -convenience. - -* git (https://git-scm.com/) -* any MUA capable of reading/importing IMAP, mbox(5) or Maildir -* Perl and several modules: (Debian package name (7.0)) - - Digest::SHA perl - - Email::LocalDelivery libemail-localdelivery-perl - - Email::Simple libemail-simple-perl - - File::Path::Expand libfile-path-expand-perl - - Net::IMAP::Simple libnet-imap-simple-perl +It requires Python 3.3 or later (currently only packaged for Debian +Sid) and pygit2 (tested on version 0.21.3). Copyright --------- diff --git a/MANIFEST b/MANIFEST deleted file mode 100644 index 27283c9..0000000 --- a/MANIFEST +++ /dev/null @@ -1,29 +0,0 @@ -.gitignore -COPYING -Documentation/include.mk -Documentation/ssoma-mda.txt -Documentation/ssoma-rm.txt -Documentation/ssoma.txt -Documentation/ssoma_repository.txt -Documentation/txt2pre -INSTALL -MANIFEST -Makefile.PL -README -lib/Ssoma/Extractor.pm -lib/Ssoma/Git.pm -lib/Ssoma/GitIndexInfo.pm -lib/Ssoma/IMAP.pm -lib/Ssoma/MDA.pm -lib/Ssoma/Remover.pm -ssoma -ssoma-mda -ssoma-rm -t/all.t -t/extractor.t -t/git.t -t/imap.t -t/mda-badheaders.t -t/mda-conflict.t -t/mda-missing-mid.t -t/remover.t diff --git a/README b/README index 006bf61..3b5eed8 100644 --- a/README +++ b/README @@ -14,19 +14,18 @@ See http://public-inbox.org/ for more information on how ssoma is used. Features -------- * stores email in git, so readers have a full history of the mailing list -* mail user-agent (MUA) users may choose from IMAP, mbox(5), and Maildir * uses only well-documented and easy-to-implement data formats Install ------- Installation should be easy and require only a few, commonly-available -packages. See http://ssoma.public-inbox.org/INSTALL for details. +packages. See http://ssoma-mda.public-inbox.org/INSTALL for details. Hacking ------- Source code is available via git: - git clone git://80x24.org/ssoma + git clone git://80x24.org/ssoma-mda See below for contact info. @@ -44,38 +43,13 @@ subscription). This also makes it easier to rope in folks of tangentially related projects we depend on (e.g. git developers on git@vger.kernel.org). -You can subscribe via ssoma, LISTNAME is a name of your choosing: - - URL=git://public-inbox.org/meta - LISTNAME=public-inbox - - # to initialize a maildir (this may be a new or existing maildir, - # ssoma will not touch existing messages) - # If you prefer mbox, use mbox:/path/to/mbox as the last argument - ssoma add $LISTNAME $URL maildir:/path/to/maildir - - # read with your favorite MUA (only using mutt as an example) - mutt -f /path/to/maildir # (or /path/to/mbox) - - # to keep your mbox or maildir up-to-date, periodically run the following: - ssoma sync $LISTNAME - - # your MUA may modify and delete messages from the maildir or mbox, - # this does not affect ssoma functionality at all - - # to sync all your ssoma subscriptions - ssoma sync - - # You may wish to sync in your cronjob - ssoma sync --cron - Mail repository format ---------------------- If you are uncomfortable running code in ssoma for any reason and would rather read directly from the git repository, the following document describes it: - http://ssoma.public-inbox.org/ssoma_repository.txt + http://ssoma-mda.public-inbox.org/ssoma_repository.txt Copyright --------- diff --git a/lib/Ssoma/Git.pm b/lib/Ssoma/Git.pm deleted file mode 100644 index e8d4cf6..0000000 --- a/lib/Ssoma/Git.pm +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (C) 2013, Eric Wong and all contributors -# License: GPLv2 or later (https://www.gnu.org/licenses/gpl-2.0.txt) -# -# Note: some trivial code here stolen from git-svn + Perl modules -# distributed with git. This remains GPLv2+ so improvements may flow -# back into git. Note: git-svn has always been GPLv2+, unlike most -# of the rest of git being GPLv2-only. - -package Ssoma::Git; -use strict; -use warnings; -use File::Path qw/mkpath/; -use Fcntl qw/:DEFAULT :flock SEEK_END/; -use IO::Handle; -use Email::Simple; -use Digest::SHA qw/sha1_hex/; - -# Future versions of Ssoma will always be able to handle this version, at least -our $REPO_VERSION = 1; - -sub new { - my ($class, $git_dir) = @_; - bless { - git_dir => $git_dir, - index => "$git_dir/ssoma.index", - }, $class; -} - -# initialize a git repository -sub init_db { - my ($self, @opts) = @_; - - my @cmd = (qw(git init --bare), @opts); - push @cmd, $self->{git_dir}; - - system(@cmd) == 0 or die join(' ', @cmd)." failed: $?\n"; - - $self->tmp_git_do(sub { - @cmd = (qw(git config ssoma.repoversion), $REPO_VERSION); - system(@cmd) == 0 or die "command: ". join(' ', @cmd) . ": $?\n"; - }); -} - -sub lockfile { $_[0]->{git_dir} . "/ssoma.lock" } - -sub sync_do { - my ($self, $sub) = @_; - - my $path = $self->lockfile; - my $lock; - - # we must not race here because this is concurrent: - sysopen($lock, $path, O_WRONLY) or - sysopen($lock, $path, O_CREAT|O_EXCL|O_WRONLY) or - sysopen($lock, $path, O_WRONLY) or - die "failed to open lock $path: $!\n"; - - # wait for other processes to be done - flock($lock, LOCK_EX) or die "lock failed: $!\n"; - - # run the sub! - my @ret = eval { &$sub }; - my $err = $@; - - # these would happen anyways, but be explicit so we can detect errors - flock($lock, LOCK_UN) or die "unlock failed: $!\n"; - close $lock or die "close lockfile($path) failed: $!\n"; - - die $err if $err; - - wantarray ? @ret : $ret[0]; -} - -# perform sub with the given GIT_DIR -sub tmp_git_do { - my ($self, $sub) = @_; - local $ENV{GIT_DIR} = $self->{git_dir}; - &$sub; -} - -# perform sub with a temporary index -sub tmp_index_do { - my ($self, $sub) = @_; - local $ENV{GIT_INDEX_FILE} = $self->{index}; - - my ($dir, $base) = ($self->{index} =~ m#^(.*?)/?([^/]+)$#); - mkpath([$dir]) unless -d $dir; - -d $dir or die "$dir creation failed $!\n"; - &$sub; -} - -# bidirectional pipe, output would be SHA-1 hexdigest -sub bidi_sha1 { - my ($self, @cmd) = @_; - my $sub = pop @cmd; - my $cmd = join(' ', @cmd); - my ($in_0, $in_1, $out_0, $out_1); - - pipe($in_0, $in_1) or die "pipe failed: $!\n"; - pipe($out_0, $out_1) or die "pipe failed: $!\n"; - - my $pid = fork; - defined $pid or die "fork failed: $!\n"; - - if ($pid == 0) { - open STDIN, '<&', $in_0 or die "redirect stdin failed: $!\n"; - open STDOUT, '>&', $out_1 or die "redirect stdout failed: $!\n"; - exec @cmd; - die "exec($cmd) failed: $!\n"; - } - - close $in_0 or die "close in_0 failed: $!\n"; - close $out_1 or die "close out_1 failed: $!\n"; - $sub->($in_1); - close $in_1 or die "close in_1 failed: $!\n"; - my $sha1 = <$out_0>; - close $out_0 or die "close out_0 failed: $!\n"; - waitpid($pid, 0) or die "waitpid $pid failed: $!\n"; - $? == 0 or die "$cmd failed: $?\n"; - chomp $sha1; - $sha1 =~ /\A[a-f0-9]{40}\z/i or die "not a SHA-1: $sha1\n"; - $sha1; -} - -# run a command described by str and return the SHA-1 hexdigest output -sub qx_sha1 { - my ($self, $str) = @_; - my $sha1 = `$str`; - - die "$str failed: $?\n" if $?; - chomp $sha1; - $sha1 =~ /\A[a-f0-9]{40}\z/i or - die "not a SHA-1 hexdigest from: $str\n"; - $sha1; -} - -# returns a blob identifier the new message -sub simple_to_blob { - my ($self, $simple) = @_; - $self->bidi_sha1(qw/git hash-object -w --stdin/, sub { - my ($io) = @_; - print $io $simple->as_string or die "print failed: $!\n"; - }); -} - -# converts the given object name to an Email::Simple object -sub blob_to_simple { - my ($self, $obj) = @_; - Email::Simple->new($self->cat_blob($obj)); -} - -# returns key-value pairs of config directives in a hash -sub config_list { - my ($self, $file) = @_; - - local $ENV{GIT_CONFIG} = $file; - - my @cfg = `git config -l`; - $? == 0 or die "git config -l failed: $?\n"; - chomp @cfg; - my %rv = map { split(/=/, $_, 2) } @cfg; - \%rv; -} - -# used to hash the relevant portions of a message when there are conflicts -sub hash_simple2 { - my ($self, $simple) = @_; - my $dig = Digest::SHA->new("SHA-1"); - $dig->add($simple->header("Subject")); - $dig->add($simple->body); - $dig->hexdigest; -} - -# we currently only compare messages for equality based on -# Message-ID, Subject: header and body, nothing else. -# both args are Email::Simple objects -sub simple_eq { - my ($self, $cur, $new) = @_; - - (($cur->header("Subject") eq $new->header("Subject")) && - ($cur->body eq $new->body)); -} - -# kills leading/trailing space in-place -sub stripws { - $_[0] =~ s/\A\s*//; - $_[0] =~ s/\s*\z//; -} - -sub mid2path { - my ($self, $message_id) = @_; - stripws($message_id); - $message_id =~ s/\A\z//; - my $hex = sha1_hex($message_id); - $hex =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/i or - die "BUG: not a SHA-1 hex: $hex"; - "$1/$2"; -} - -sub cat_blob { - my ($self, $blob_id, $git_pm) = @_; - my $str; - if ($git_pm) { - open my $fh, '>', \$str or - die "failed to setup string handle: $!\n"; - binmode $fh; - my $bytes = $git_pm->cat_blob($blob_id, $fh); - close $fh or die "failed to close string handle: $!\n"; - die "$blob_id invalid\n" if $bytes <= 0; - } else { - my $cmd = "git cat-file blob $blob_id"; - $str = `$cmd`; - die "$cmd failed: $?\n" if $?; - } - $str; -} - -sub type { - my ($self, $obj) = @_; - my $cmd = "git cat-file -t $obj"; - my $str = `$cmd`; - die "$cmd failed: $?\n" if $?; - chomp $str; - $str; -} - -# only used for conflict resolution -sub each_in_tree { - my ($self, $obj, $sub) = @_; - my $cmd = "git ls-tree $obj"; - my @tree = `$cmd`; - $? == 0 or die "$cmd failed: $!\n"; - my $x40 = '[a-f0-9]{40}'; - foreach my $line (@tree) { - if ($line =~ m!\A100644 blob ($x40)\t($x40)$!o) { - my ($blob_id, $path) = ($1, $2); - $sub->($blob_id, $path); - } else { - warn "unexpected: bad line from $cmd:\n$line"; - } - } -} - -sub commit_index { - my ($self, $gii, $need_parent, $ref, $message) = @_; - - # this is basically what git commit(1) does, - # but we use git plumbing, not porcelain - $gii->done; - my $tree = $self->qx_sha1("git write-tree"); - - # can't rely on qx_sha1 since we initial commit may not have a parent - my $cmd = "git rev-parse $ref^0"; - my $parent; - if ($need_parent) { - $parent = $self->qx_sha1($cmd); - } else { - $parent = eval { $self->qx_sha1("$cmd 2>/dev/null") }; - if (defined $parent && $parent !~ /\A[a-f0-9]{40}\z/) { - die "$cmd returned bad SHA-1: $parent\n"; - } - } - - # make the commit - my @cmd = qw/git commit-tree/; - push @cmd, $tree; - push @cmd, '-p', $parent if $parent; - push @cmd, '-m', "'$message'"; - - my $commit = $self->qx_sha1(join(' ', @cmd)); - - # update the ref - @cmd = (qw/git update-ref/, $ref, $commit); - push @cmd, $parent if $parent; # verification - system(@cmd) == 0 or die "command: ". join(' ', @cmd) . ": $?\n"; - - # gc if needed - @cmd = qw/git gc --auto/; - system(@cmd) == 0 or die "command: ". join(' ', @cmd) . ": $?\n"; -} - -# keep Git.pm optional, not all installations of git have it -sub try_git_pm { - my ($self) = @_; - eval { - require Git; - Git->repository(Directory => $self->{git_dir}); - }; -} - -1; diff --git a/lib/Ssoma/GitIndexInfo.pm b/lib/Ssoma/GitIndexInfo.pm deleted file mode 100644 index 7f20297..0000000 --- a/lib/Ssoma/GitIndexInfo.pm +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (C) 2013, Eric Wong and all contributors -# License: GPLv2 or later (https://www.gnu.org/licenses/gpl-2.0.txt) -# -# Note: some trivial code here stolen from git-svn + Perl modules -# distributed with git. This remains GPLv2+ so improvements may flow -# back into git. Note: git-svn has always been GPLv2+, unlike most -# of the rest of git being GPLv2-only. -# -# Not using Git.pm and friends directly because some git installations may use -# a different Perl than this (and I might end up rewriting this entirely -# in another language). Git::IndexInfo is also somewhat recent, so folks -# on LTS distros may not have it, yet. - -package Ssoma::GitIndexInfo; -use strict; -use warnings; - -sub new { - my ($class) = @_; - my $pid = open my $gui, '|-'; - defined $pid or die "failed to pipe + fork: $!\n"; - if ($pid == 0) { - exec(qw/git update-index -z --index-info/); - die "exec failed: $!\n"; - } - bless { gui => $gui, pid => $pid, nr => 0}, $class; -} - -sub remove { - my ($self, $path) = @_; - print { $self->{gui} } '0 ', 0 x 40, "\t", $path, "\0" or - die "failed to print to git update-index pipe: $!\n"; - ++$self->{nr}; -} - -sub update { - my ($self, $mode, $hash, $path) = @_; - print { $self->{gui} } $mode, ' ', $hash, "\t", $path, "\0" or - die "failed to print to git update-index pipe: $!\n"; - ++$self->{nr}; -} - -sub done { - my ($self) = @_; - close $self->{gui} or die "close pipe: $!\n"; - $? == 0 or die "git update-index failed: $?\n"; -} - -1; diff --git a/lib/Ssoma/MDA.pm b/lib/Ssoma/MDA.pm deleted file mode 100644 index 6b58b43..0000000 --- a/lib/Ssoma/MDA.pm +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (C) 2013, Eric Wong and all contributors -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Mail Delivery Agent module, delivers mail into a ssoma git repo -package Ssoma::MDA; -use strict; -use warnings; -use Ssoma::GitIndexInfo; - -sub new { - my ($class, $git) = @_; - bless { git => $git, ref => "refs/heads/master" }, $class; -} - -# may convert existing blob to a tree -# returns false if message already exists -# returns true on successful delivery -sub blob_upgrade { - my ($self, $gii, $new, $path) = @_; - - my $git = $self->{git}; - my $obj = "$self->{ref}^0:$path"; - my $cur = $git->blob_to_simple($obj); - - # do nothing if the messages match: - return 0 if $git->simple_eq($cur, $new); - - # kill the old blob - $gii->remove($path); - - # implicitly create a new tree via index with two messages - foreach my $simple ($cur, $new) { - my $id = $git->simple_to_blob($simple); - my $path2 = $git->hash_simple2($simple); - $gii->update("100644", $id, "$path/$path2"); - } - 1; -} - -# used to update existing trees, which only happen when we have Message-ID -# conflicts -sub tree_update { - my ($self, $gii, $new, $path) = @_; - my $git = $self->{git}; - my $obj = "$self->{ref}^0:$path"; - my $cmd = "git ls-tree $obj"; - my @tree = `$cmd`; - $? == 0 or die "$cmd failed: $!\n"; - chomp @tree; - - my $id = $git->simple_to_blob($new); - my $path2 = $git->hash_simple2($new); - - # go through the existing tree and look for duplicates - foreach my $line (@tree) { - $line =~ m!\A100644 blob ([a-f0-9]{40})\t(([a-f0-9]{40}))\z! or - die "corrupt repo: bad line from $cmd: $line\n"; - my ($xid, $xpath2) = ($1, $2); - - # do nothing if most of the message matches - return 0 if $path2 eq $xpath2 || $id eq $xid; - } - - # no duplicates found, add to the index - $gii->update("100644", $id, "$path/$path2"); -} - -# this appends the given message-id to the git repo, requires locking -# (Ssoma::Git::sync_do) -sub append { - my ($self, $path, $simple, $once) = @_; - - my $git = $self->{git}; - my $ref = $self->{ref}; - - # $path is a path name we generated, so it's sanitized - my $gii = Ssoma::GitIndexInfo->new; - - my $obj = "$ref^0:$path"; - my $cmd = "git cat-file -t $obj"; - my $type = `$cmd 2>/dev/null`; - - if ($? == 0) { # rare, object already exists - chomp $type; - if ($once) { - my $mid = $simple->header("Message-ID"); - die "CONFLICT: Message-ID: $mid exists ($path)\n"; - } - - # we return undef here if the message already exists - if ($type eq "blob") { - # this may upgrade the existing blob to a tree - $self->blob_upgrade($gii, $simple, $path) or return; - } elsif ($type eq "tree") { - # possibly add object to an existing tree - $self->tree_update($gii, $simple, $path) or return; - } else { - # we're screwed if a commit/tag has the same SHA-1 - die "CONFLICT: `$cmd' returned: $type\n"; - } - } else { # new message, just create a blob, common - my $id = $git->simple_to_blob($simple); - $gii->update('100644', $id, $path); - } - my $subject = $simple->header("Subject"); - $git->commit_index($gii, 0, $ref, $subject); -} - -# the main entry point takes an Email::Simple object -sub deliver { - my ($self, $simple, $once) = @_; - my $git = $self->{git}; - - # convert the Message-ID into a path - my $mid = $simple->header("Message-ID"); - - # if there's no Message-ID, generate one to avoid too many conflicts - # leading to trees - if (!defined $mid || $mid =~ /\A\s*\z/) { - $mid = '<' . $git->hash_simple2($simple) . '@localhost>'; - $simple->header_set("Message-ID", $mid); - } - my $path = $git->mid2path($mid); - - # kill potentially confusing/misleading headers - foreach my $d (qw(lines content-length)) { - $simple->header_set($d); - } - - my $sub = sub { - $git->tmp_index_do(sub { - $self->append($path, $simple, $once); - }); - }; - $git->sync_do(sub { $git->tmp_git_do($sub) }); -} - -1; diff --git a/ssoma-mda b/ssoma-mda index d763224..c5aedf4 100755 --- a/ssoma-mda +++ b/ssoma-mda @@ -1,22 +1,294 @@ -#!/usr/bin/perl -w +#!/usr/bin/env python # Copyright (C) 2013, Eric Wong and all contributors # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# This is the command-line mail delivery agent for servers. # Try to keep this small as it may be invoked frequently for each message # delivered. -my $usage = "ssoma-mda [-1] /path/to/git/repo < /path/to/rfc2822_message"; -use strict; -use warnings; -use Ssoma::MDA; -use Ssoma::Git; -use Email::Simple; -my $once = $ARGV[0] eq "-1"; -my $repo = pop @ARGV or die "Usage: $usage\n"; -my $git = Ssoma::Git->new($repo); -my $mda = Ssoma::MDA->new($git); -my $simple; -{ - local $/; - $simple = Email::Simple->new(); -} -$mda->deliver($simple, $once); + +"""Mail delivery agent for adding mail to a ssoma Git repository. + +ssoma-mda reads a message from standard input and delivers it to a Git +repository as described by ssoma_repository(5). It may be invoked by +the MTA (mail transport agent, e.g. Postfix or Exim) or as part of +another MDA (e.g. procmail or maildrop). + +ssoma-mda does not alter its own permissions. This must be done by +the MTA or MDA which invokes ssoma-mda. + +# FILES + +See ssoma_repository(5) for details. + +# ENVIRONMENT + +ssoma-mda depends on no environment variables directly, but it uses +your PATH to find your local Git. + +# CONTACT + +All feedback welcome via plain-text mail to . +The mail archives are hosted at git://public-inbox.org/meta See +ssoma(1) for instructions on how to subscribe. + +# COPYRIGHT + +Copyright 2013, Eric Wong and all contributors. +License: AGPLv3 or later + +# SEE ALSO + +git(1), ssoma(1), ssoma_repository(5) +""" + +from __future__ import print_function +from __future__ import unicode_literals + +import argparse as _argparse +import email as _email +import email.message as _email_message +import email.policy as _email_policy +import email.utils as _email_utils +import hashlib as _hashlib +import logging as _logging +import os.path as _os_path +import sys as _sys +import pygit2 as _pygit2 + + +__version__ = '0.2.0' + +_LOG = _logging.getLogger('ssoma-mda') +_LOG.setLevel(_logging.ERROR) +_LOG.addHandler(_logging.StreamHandler()) + +_COMMIT_MESSAGE_ENCODING = 'UTF-8' + + +class DirtyIndex(RuntimeError): + def __init__(self, repository, diff): + self.repository = repository + self.diff = diff + status = '\n'.join( + ' {} {}'.format(patch.status, patch.old_file_path) + for patch in diff) + super(DirtyIndex, self).__init__('dirty index:\n{}'.format(status)) + + +class MessagePathConflict(RuntimeError): + """Different messages with the same target path.""" + def __init__(self, repository, path, message=None): + self.repository = repository + self.path = path + self.message = message + super(MessagePathConflict, self).__init__( + 'duplicate message for {}'.format(path)) + + +def _add_message(repository, index, path, message_bytes): + """Add a message to the repository. + + For messages without a Message-ID hash conflict (most messages). + """ + oid = repository.write(_pygit2.GIT_OBJ_BLOB, message_bytes) + _LOG.debug('add message at {} ({})'.format(path, oid.hex[:8])) + entry = _pygit2.IndexEntry(path, oid, _pygit2.GIT_FILEMODE_BLOB) + index.add(entry) + index.write() + + +def _upgrade_blob(repository, index, path, obj, message_bytes, once=False): + """Possibly upgrade an existing blob to a tree. + + To handle conflicting Message-ID hashes. + """ + old_message_bytes = obj.read_raw() + if message_bytes == old_message_bytes: + _LOG.info('skipping byte-duplicate message for {}'.format(path)) + return + if once: + raise MessagePathConflict( + repository=repository, path=path, message=message_bytes) + _LOG.debug('upgrade {} to a directory'.format(path)) + index.remove(path) + for action, bytes in [ + ('upgrade', old_message_bytes), + ('add', message_bytes) + ]: + oid = repository.write(_pygit2.GIT_OBJ_BLOB, bytes) + p = _os_path.join(path, oid.hex) + _LOG.debug('{} message to tree at {}'.format(action, p)) + entry = _pygit2.IndexEntry(p, oid, _pygit2.GIT_FILEMODE_BLOB) + index.add(entry) + index.write() + + +def _update_tree(repository, index, path, obj, message_bytes, once=False): + """Possibly update an existing tree. + + To handle conflicting Message-ID hashes. + """ + if once: # we shouldn't have this tree at all + raise MessagePathConflict( + repository=repository, path=path, message=message_bytes) + oid = repository.write(_pygit2.GIT_OBJ_BLOB, message_bytes) + p = _os_path.join(path, oid.hex) + _LOG.debug('add message to tree at {}'.format(p)) + entry = _pygit2.IndexEntry(p, oid, _pygit2.GIT_FILEMODE_BLOB) + index.add(entry) + index.write() + + +def append(repository, path, message_bytes, commit_message, + author=None, **kwargs): + """Append the given message to the Git repo at 'path' (or a subpath). + + Additional keyword arguments are passed through to index-updating + function. + """ + index = repository.index + index.read() + reference_name = 'HEAD' + try: + reference = repository.head + except _pygit2.GitError as e: + if 'not found' in str(e): # no HEAD commit (so this will be the first) + reference = None + commit = None + _add_message( + repository=repository, index=index, path=path, + message_bytes=message_bytes) + else: # we have a HEAD commit to build on + commit = reference.get_object() + diff = index.diff_to_tree(commit.tree) + if len(diff): + raise DirtyIndex(repository=repository, diff=diff) + try: + entry = commit.tree[path] + except KeyError: # new message, just create a blob + _add_message( + repository=repository, index=index, path=path, + message_bytes=message_bytes) + else: # object already exists + obj = repository.get(entry.oid) + kwargs = kwargs.copy() + kwargs.update({ + 'repository': repository, + 'index': index, + 'path': path, + 'obj': obj, + 'message_bytes': message_bytes, + }) + if obj.type == _pygit2.GIT_OBJ_BLOB: + _upgrade_blob(**kwargs) + elif obj.type == _pygit2.GIT_OBJ_TREE: + _update_tree(**kwargs) + else: + raise NotImplementedError( + 'tree entry for {} has type {}'.format( + path, type(obj).__name__.lower())) + tree = index.write_tree() + if commit is None or tree != commit.tree.oid: # we've changed something + committer = repository.default_signature + if author is None: + author = committer + if commit is None: + parents = [] + else: + parents = [commit.oid] + _LOG.debug('create a new commit for tree {}: {}'.format( + tree.hex[:8], commit_message)) + new_commit = repository.create_commit( + reference_name, author, committer, commit_message, tree, + parents, _COMMIT_MESSAGE_ENCODING) + _LOG.debug('new commit {} advances {}'.format( + new_commit.hex[:8], reference_name)) + else: + _LOG.info('no changes to commit') + + +def message_id_path(message_id): + """Calculate the default path from a Message-ID + + >>> message_id_path('<20131106023245.GA20224@dcvr.yhbt.net>') + 'f2/8c6cfd2b0a65f994c3e1be266105413b3d3f63' + """ + message_id = message_id.lstrip('<').rstrip('>') + hash = _hashlib.sha1(message_id.encode('UTF-8')).hexdigest() + return _os_path.join(hash[:2], hash[2:]) + + +def get_author(message): + """Create a pygit2.Signature for the message author.""" + author_name, author_email = _email_utils.parseaddr( + message['From']) + date = message['Date'] + datetime = _email_utils.parsedate_to_datetime(date) + time = int(datetime.timestamp()) + offset = datetime.utcoffset().seconds // 60 + return _pygit2.Signature( + name=author_name, + email=author_email, + time=time, + offset=offset) + + +def deliver(message=None, message_bytes=None, **kwargs): + """Deliver a message to a ssoma repository. + + The input message can be an email.message.Message instance (use + 'message'), the raw SMTP byte stream (use 'message_bytes'), or + both (in which case 'message' is used to extract the message data, + and 'message_bytes' is written to the repository. + + Additional keyword arguments are passed through to append(). + """ + if message is None: + if message_bytes is None: + raise ValueError('no message arguments') + message = _email.message_from_bytes( + message_bytes, policy=_email_policy.SMTP) + elif message_bytes is None: + message_bytes = message.as_bytes(policy=_email_policy.SMTP) + + message_id = message.get('Message-ID', '') + path = message_id_path(message_id=message_id) + _LOG.info('deliver {} to {}'.format(message_id, path)) + commit_message = message.get('Subject', '') + author = get_author(message=message) + repository = _pygit2.Repository(_os_path.curdir) + append( + repository=repository, path=path, message_bytes=message_bytes, + commit_message=commit_message, author=author, **kwargs) + + +def main(stream=_sys.stdin.buffer): + """Command-line entry point.""" + parser = _argparse.ArgumentParser( + description=__doc__.strip(), + formatter_class=_argparse.RawDescriptionHelpFormatter) + parser.add_argument( + '-v', '--version', action='version', + version='%(prog)s {}'.format(__version__)) + parser.add_argument( + '-l', '--log-level', + choices=['critical', 'error', 'warning', 'info', 'debug'], + help='Log verbosity. Defaults to {!r}.'.format( + _logging.getLevelName(_LOG.level).lower())) + parser.add_argument( + '-1', '--once', action='store_true', + help='Die if the incoming Message-ID is already in the repository.') + + args = parser.parse_args() + + if args.log_level: + level = getattr(_logging, args.log_level.upper()) + _LOG.setLevel(level) + + try: + deliver(message_bytes=stream.read(), once=args.once) + except (DirtyIndex, MessagePathConflict) as e: + _LOG.error(e) + raise SystemExit(1) + + +if __name__ == '__main__': + main() diff --git a/t/mda-badheaders.t b/t/mda-badheaders.t deleted file mode 100644 index b161574..0000000 --- a/t/mda-badheaders.t +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w -# Copyright (C) 2013, Eric Wong and all contributors -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -use strict; -use warnings; -use Test::More; -use Ssoma::MDA; -use Ssoma::Git; -use Email::Simple; -use Digest::SHA qw/sha1_hex/; -use File::Temp qw/tempdir/; - -my $tmpdir = tempdir(CLEANUP => 1); -my $git = Ssoma::Git->new("$tmpdir/gittest"); -$git->init_db; -my $mda = Ssoma::MDA->new($git); - -my $email = Email::Simple->new("From: U \n\nHIHI\n"); -my %headers = ( - "To" => "Me ", - "From" => "You ", - "Message-ID" => "<666\@example.com>", - "Subject" => ":o", - "Lines" => "666", - "Content-Length" => "666", -); - -my %discard = map { $_ => 1 } qw(Lines Content-Length); - -while (my ($key, $val) = each %headers) { - $email->header_set($key, $val); -} - -$mda->deliver($email); - -local $ENV{GIT_DIR} = "$tmpdir/gittest"; - -my $blob_id = sha1_hex("666\@example.com"); -my ($dir, $base) = ($blob_id =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!); -ok(defined $dir && defined $base, "bad sha1: $blob_id"); - -my $raw = `git cat-file blob HEAD:$dir/$base`; -is(0, $?, "git cat-file returned: $?"); - -my $delivered = Email::Simple->new($raw); -is("HIHI\n", $delivered->body, "body matches"); - -while (my ($key, $val) = each %headers) { - if ($discard{$key}) { - is($delivered->header($key), undef, "header $key discarded"); - } else { - is($delivered->header($key), $val, "header $key not discarded"); - } -} - -done_testing(); - diff --git a/t/mda-conflict.t b/t/mda-conflict.t deleted file mode 100644 index 54b3541..0000000 --- a/t/mda-conflict.t +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/perl -w -# Copyright (C) 2013, Eric Wong and all contributors -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -use strict; -use warnings; -use Test::More; -use Ssoma::MDA; -use Ssoma::Git; -use Email::Simple; -use Digest::SHA qw/sha1_hex/; -use File::Temp qw/tempdir/; - -my $tmpdir = tempdir(CLEANUP => 1); -my $git = Ssoma::Git->new("$tmpdir/gittest"); -$git->init_db; -my $mda = Ssoma::MDA->new($git); - -my $email = Email::Simple->new("From: U \n\nHIHI\n"); -$email->header_set("To", "Me "); -$email->header_set("Subject", ":o"); -$email->header_set("Message-ID", "<12345\@example.com>"); - -$mda->deliver($email); - -local $ENV{GIT_DIR} = "$tmpdir/gittest"; -my @orig = `git rev-list HEAD`; -is(1, scalar @orig, "one revision exists"); - -# deliver a second message -$email->header_set("message-ID", "<666\@example.com>"); -$email->body_set("BYEBYE\nBYEYBE\n"); - -$mda->deliver($email); - -# validate delivery results and history -my @two = ` git rev-list HEAD`; -is(2, scalar @two, "two revisions exist"); -is($orig[0], $two[1], "history is correct"); - -my @tree = `git ls-tree -r HEAD`; -is(0, $?, "git ls-tree -r HEAD succeeded"); -chomp @tree; -is(2, scalar @tree, "two entries in tree"); - -# ensure path Message-ID -> path mapping works -foreach my $line (@tree) { - my ($mode, $type, $blob, $path) = split(/\s+/, $line);; - my $raw = `git cat-file blob $blob`; - my $simple = Email::Simple->new($raw); - my $mid = $simple->header("message-id"); - my $path_sha1 = $path; - $path_sha1 =~ tr!/!!d; - $mid =~ tr/<>//d; - is($path_sha1, sha1_hex($mid), "path mapping works $mid"); -} - -# delivery again with identical Message-ID -$mda->deliver($email); - -# duplicate detected -chomp(my @curr = `git ls-tree -r HEAD`); -is_deeply(\@tree, \@curr, "duplicate not stored"); - -# repeat message-ID but different content -$email->body_set("different\n"); -$mda->deliver($email); - -my @prev = @curr; -my @prev_blobs = map { (split(/\s+/, $_))[2] } @prev; - -chomp(@curr = `git ls-tree -r HEAD`); -my %curr_blobs = map { (split(/\s+/, $_))[2] => 1 } @curr; -is(3, scalar @curr, "mismatch stored with identical Message-ID"); - -foreach my $prev (@prev_blobs) { - ok(delete $curr_blobs{$prev}, "prev=$prev blob exists"); -} - -my @only = keys %curr_blobs; -is(1, scalar @only, "one new blob stored"); - -my $body_3 = "3rd message with identical Message-ID, ridiculous\n"; -$email->body_set($body_3); -$mda->deliver($email); - -@prev = @curr; -@prev_blobs = map { (split(/\s+/, $_))[2] } @prev; -chomp(@curr = `git ls-tree -r HEAD`); -%curr_blobs = map { (split(/\s+/, $_))[2] => 1 } @curr; -is(4, scalar @curr, "another stored with identical Message-ID"); - -foreach my $prev (@prev_blobs) { - ok(delete $curr_blobs{$prev}, "prev=$prev blob exists"); -} -@only = keys %curr_blobs; -is(1, scalar @only, "one new blob stored"); - -my $want = sha1_hex($email->header("Subject") . $email->body); -my @want = grep(m!/\Q$want\E!, @curr); -is(1, scalar @want, "wanted message is unique"); -my $blob = (split(/\s+/, $want[0]))[2]; -my $s = `git cat-file blob $blob`; -$s = Email::Simple->new($s); -is("<666\@example.com>", $s->header("message-id"), "MID matches"); -is($body_3, $s->body, "body matches"); - -done_testing(); diff --git a/t/mda-missing-mid.t b/t/mda-missing-mid.t deleted file mode 100644 index d375ae9..0000000 --- a/t/mda-missing-mid.t +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w -# Copyright (C) 2013, Eric Wong and all contributors -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -use strict; -use warnings; -use Test::More; -use Ssoma::MDA; -use Ssoma::Git; -use Email::Simple; -use File::Temp qw/tempdir/; -my $tmpdir = tempdir(CLEANUP => 1); -my $git = Ssoma::Git->new("$tmpdir/gittest"); -$git->init_db; -my $mda = Ssoma::MDA->new($git); -my $email = Email::Simple->new("From: U \n\nHIHI\n"); -$mda->deliver($email); - -local $ENV{GIT_DIR} = "$tmpdir/gittest"; -my @tree = `git ls-tree -r HEAD`; -is(scalar @tree, 1, "one item in tree"); -my @line = split(/\s+/, $tree[0]); -my $msg = Email::Simple->new($git->cat_blob($line[2])); -like($msg->header("message-id"), qr/\A<[a-f0-9]{40}\@localhost>\z/, - "message-id generated for message missing it"); - -done_testing(); -- 2.26.2