Date: Fri, 6 Jun 2014 12:40:18 +0200
From: Vladimir Marek <Vladimir.Marek@oracle.com>
To: David Edmondson <david.edmondson@oracle.com>
Subject: Re: Deduplication ?
Message-ID: <20140606104018.GJ2154@virt.cz.oracle.com>
References: <20140602123212.GA12639@virt.cz.oracle.com>
	<87d2ers9mi.fsf@qmul.ac.uk> <m2ppirs8ea.fsf@guru.guru-group.fi>
	<87ppirqtfa.fsf@qmul.ac.uk> <87y4xfz1fi.fsf@nikula.org>
	<cunegz71aw9.fsf@gargravarr.hh.sledj.net>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="mJm6k4Vb/yFcL9ZU"
Content-Disposition: inline
In-Reply-To: <cunegz71aw9.fsf@gargravarr.hh.sledj.net>
User-Agent: Mutt/1.5.22.1-rc1 (2013-10-16)
Cc: Tomi Ollila <tomi.ollila@iki.fi>, notmuch@notmuchmail.org
Precedence: list


--mJm6k4Vb/yFcL9ZU
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline

Hi,


So I wrote some code which works for me well. I have erased ~40k
messages out of 500k. It does not try to be complete solution, it only
detects and removes the obvious cases. The idea is to help me control
the number of duplicates when I import big mail archives which surely
contain many duplicates into my mail database.

> Thinking about this a bit...

> The headers are likely to be different, so you could remove them (get
> rid of everything up to the first empty line).

Yes, that's what I ended up doing. And I delete the files which have
less 'Received:' headers.


> Various mailing lists add footers, so you would need to remove them (a
> regular expression based approach would catch most of them easily).

I defined a list of known footers. Then I take the two mails with the
same message-id, create diff between them and  compare it to the list of
footers.


> The remaining content should be the same for identical messages, so a
> sensible hash (md5) could be used to compare.
> 
> Although, some MTAs modify the body of the message when manipulating
> encoding. I don't know how to address this.

I'm attaching my perl script if anyone is interested. It's in no way
complete solution. It is supposed to be used as

notmuch search --output=files --duplicate=2 '*' > dups
./dedup # It opens the file 'dups'

The attached version does not remove anyting (the 'unlink' command is
commented out).


Interestingly this does not work (it seems to return all messages):
notmuch search --output=messages --duplicate=2 '*'

Also I have found that if I run 'notmuch search' and 'notmuch new' at
the same time, the notmuch search crashes sometimes. That's why I don't
use

notmuch search ... | ./dedup

Use with care :)

Thank you for your help
-- 
	Vlad

--mJm6k4Vb/yFcL9ZU
Content-Type: text/plain; charset=utf-8
Content-Disposition: attachment; filename=dedup

#!/usr/bin/perl

use Data::Dumper;
use List::Util;


@TO_IGNORE= (

<<'EOT'
> _______________________________________________
> notmuch mailing list
> notmuch@notmuchmail.org
> http://notmuchmail.org/mailman/listinfo/notmuch
EOT

,

<<'EOT'
> _______________________________________________
> Userland-perl mailing list
> Userland-perl@userland.us.oracle.com
> http://userland.us.oracle.com/mailman/listinfo/userland-perl
EOT

,

<<'EOT'
> _______________________________________________
> Mercurial mailing list
> Mercurial@selenic.com
> http://selenic.com/mailman/listinfo/mercurial
EOT

,

<<'EOT'
> --    
> To unsubscribe from this list go to the following URL and read the
> instructions:  https://lists.samba.org/mailman/options/samba
EOT

,

<<'EOT'
> 
EOT

);

sub rm($$) {
	my ($file, $comment) = @_;
	print "-> $file\n";
	print $comment;
	# unlink $file;
}

sub check_mail_id($) {
	$ID = $_[0];

	unless (open ID, "-|", "./notmuch", "search", "--output=files", "id:$ID") {
		warn "Can not fork: $!";
		return;
	}
	chomp(@FILES = <ID>);
	close ID;

	if (scalar @FILES <= 1) {
		warn "Not enough files for ID:$ID\n";
		return;
	}

	my ($F1, $F2) = @FILES;

	unless (-r $F1) {
		warn "Can not read $F1 in ID:$ID\n";
		return;
	}
	unless (-r $F2) {
		warn "Can not read $F2 in ID:$ID\n";
		return;
	}
	if ($F1 eq $F2) {
		warn "Same filename $F1\n in ID:$ID\n";
		return;
	}

	unless (open DIFF_WHOLE, "-|", $diff, $F1, $F2) {
		warn "Can not fork $diff: $!\n";
		return;
	}
	$DIFF_WHOLE = join "", <DIFF_WHOLE>;
	close DIFF_WHOLE;

	if ( length($DIFF_WHOLE) == 0 ) {
		rm $F2, "deleting_1\nID:$ID\n\n";
		return;
	}

	# 35a36
	# > Content-Length: 893
	if (
		$DIFF_WHOLE =~ /^\d+a\d+\n> Content-Length: \d+$/
		or
		$DIFF_WHOLE =~ /^\d+d\d+\n< Content-Length: \d+$/
	) {
		rm $F2, "deleting_2\nID:$ID\n\n";
		return;
	}



	# $r="[a-zA-Z0-9 ()[\]\.\+:/=;,\t-]+";
	# if (
	# 	$DIFF_WHOLE =~ /1,7d0\n< Received:$r\n< \t$r\n< \t$r\n< Received:$r\n< \t$r\n< \t$r\n< \t$r\n\d+a\d+,\d+\n> Content-Length:$r\n> Lines:$r/
	# ) {
	# 	printf "deleting_3\nID:$ID\n$DIFF_WHOLE\n\n";
	# 	return;
	# }

	unless (open DIFF_BODY, "-|", "bash", "-c", "$diff <(sed -e 1,/^\$/d \"\$1\" ) <(sed -e 1,/^\$/d \"\$2\" )", "", $F1, $F2) {
		warn "Can not fork $diff (2): $!\n";
		return;
	}
	$DIFF_BODY = join "", <DIFF_BODY>;
	close DIFF_BODY;

	if ( length($DIFF_BODY) == 0 ) {
		# The bodies are the same - let's find which one has less
		# Received: headers and delete that
		unless (open F, $F1) 
		{
			warn "Can't open F1 '$F1': $!";
			return;
		}
		my $count1 = grep { /^Received: / } <F>;
		close F;
		unless (open F, $F2) 
		{
			warn "Can't open F2 '$F2': $!";
			return;
		}
		my $count2 = grep { /^Received: / } <F>;
		close F;

		if ($count1 > $count2) {
			rm $F2, "deleting_4a\nID:$ID\n\n";
		} else {
			rm $F1, "deleting_4b\nID:$ID\n\n";
		}
		return;
	}


	for (@TO_IGNORE) {
		next unless $DIFF_BODY =~ $_;
		# Remove the first one as the second is adding lines
		rm $F1, "deleting_5\nID:$ID\n\n";
		return;
	}

	for (@TO_IGNORE_REVERSE) {
		next unless $DIFF_BODY =~ $_;
		# Remove the second as it is removing some lines
		rm $F2, "deleting_6\nID:$ID\n\n";
		return;
	}

	#--------------------------------------------------
	# '2c2
	# < --Boundary_(ID_DK6KMxNlhttcScVv/QSi8A)
	# ---
	# > --Boundary_(ID_dlReFj9tgdNWy+1SUxwTeQ)
	# 39c39
	# < --Boundary_(ID_DK6KMxNlhttcScVv/QSi8A)
	# ---
	# > --Boundary_(ID_dlReFj9tgdNWy+1SUxwTeQ)
	# 55c55
	# < --Boundary_(ID_DK6KMxNlhttcScVv/QSi8A)--
	# ---
	# > --Boundary_(ID_dlReFj9tgdNWy+1SUxwTeQ)--
	#-------------------------------------------------- 
	$re = qr/(\d+)c\1\n< --Boundary_\(\S+\)(?:--)?\n---\n> --Boundary_\(\S+\)(?:--)?\n/;
	if ( $DIFF_BODY =~ m/^(?:$re+)$/ ) {
		# Change in boundary strings
		rm $F2, "deleting_7\nID:$ID\n\n";
		return;
	}

	print "DIFF_BODY (ID: $ID):\n'$DIFF_BODY'\n\n" if length $DIFF_BODY < 300;
}

$diff = 'diff';
$diff = 'gdiff' if -x '/usr/bin/gdiff'; # Solaris

# First create reverse regexps (removing lines from the mail) so that we don't
# overwrite the original @TO_IGNORE
@TO_IGNORE_REVERSE = map {
	$x = $_;                       # Make sure we don't change the @TO_IGNORE array
	$x =~ s/^>/</mg;               # Make sure all the lines are adding a text
	qr/^(?:\d+,)?\d+d\d+\n\Q$x\E$/ # 1,2d3 or 2d3
} @TO_IGNORE;

# Now map the positive regexp (adding lines to the mail)
@TO_IGNORE = map {
	s/^</>/mg;                      # Make sure all the lines are removing text
	qr/^\d+a\d+?(?:,\d+)?\n\Q$_\E$/ # 115a116,119 or 114a116
} @TO_IGNORE;

# File 'dups' is created via
# notmuch search --output=files --duplicate=2 '*' > dups

open INPUT, "dups" or die "Can't open dups: $!\n";
while (<INPUT>) {
	chomp;
	if (open FILE, $_) {
		$id =  List::Util::first { s/^message-id:.*<(.*)>\n$/\1/i } <FILE>;
		close FILE;
		check_mail_id $id if defined $id;
	} else {
		print "Can't find '$_\n'";
	}
}
close INPUT;

--mJm6k4Vb/yFcL9ZU--