#!/usr/bin/perl # # Analyze an mbox directory tree (e.g. that of Thunderbird) and # Report received/sent statistics # Number of messages per folder # Number of incoming / outgoing messages per day, day of week, calendar month # # Copyright (c) 2010, Diomidis Spinellis # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
#
# $Id: email-analytics.pl,v 1.4 2010/05/16 07:35:11 dds Exp dds $
#

use strict;
use warnings;

use Mail::Box::Mbox;
use File::Find;
use HTML::AsSubs;

###############################
# Configurable items start here

# Mail directory
my $maildir = '/dds/mail';

# Directory to analyze
my $analyze_folder = "$maildir/ggps.sbd";
#my $analyze_folder = "$maildir/next.sbd";

# Folder of sent messages
my $sentbox = "$maildir/sent";

# Your email address
my $me = 'dds';

# Set to 1 to produce a report with all folders and addresses
my $full = 1;

# Set to true in order to redact the names in the results
my $redact = 0;

# Set to true for verbose processing info
my $verbose = 0;

# Map email aliases to a common address.
# Takes one address string and returns its lower-cased (ASCII) form, or the
# canonical alias when one of the rules below matches.
sub addressMap {
    my ($email) = @_;

    # An address without a usable mailbox part is counted under the empty key
    $email = '' unless defined $email;

    # Note: no brackets here; in tr/// they would be literal characters
    $email =~ tr/A-Z/a-z/;

    # Add here rules for mapping email address aliases
    return 'dds' if ($email =~ m/^dds@/);
    return $email;
}

# Configurable items end here
#############################

# Counters for messages whose From address maps to $me (sent) ...
my %sendFolderCount;
my %sendHourCount;
my %sendMonthCount;
my %sendWeekDayCount;
my %sendDayCount;

# ... and for all other messages (received)
my %receiveFolderCount;
my %receiveHourCount;
my %receiveMonthCount;
my %receiveWeekDayCount;
my %receiveDayCount;

# Occurrences of each address in To and From headers respectively
my %recipientCount;
my %senderCount;

my $messageCount = 0;
my $folderCount;

my @dayOfWeekName = qw(Sun Mon Tue Wed Thu Fri Sat);
my @monthName = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

Mail::Reporter->defaultTrace('ERRORS');

# Process the message folders
find(\&process, $analyze_folder);
{
    # process() reads the folder name from $_, exactly as File::Find supplies it
    local $_ = $sentbox;
    process();
}

# Generate the HTML report
my @body;
my @toc;
my $hcount = 0;

# Use keys() for the element counts: a bare %hash in numeric context only
# yields the number of keys on Perl 5.26 and later.
my $recipientTotal = scalar(keys %recipientCount);
my $senderTotal = scalar(keys %senderCount);
my $activeDayTotal = scalar(keys %receiveDayCount);
my $activeMonthTotal = scalar(keys %receiveMonthCount);
my $activeFolderTotal = scalar(keys %receiveFolderCount);

section('Summary');
push(@body, table({border=>1},
    summaryRow('Number of messages', $messageCount),
    summaryRow('Number of recipients', $recipientTotal),
    summaryRow('Number of senders', $senderTotal),
    summaryRow('Number of active days', $activeDayTotal),
    summaryRow('Average messages per day',
        average($messageCount, $activeDayTotal)),
    summaryRow('Average messages per month',
        average($messageCount, $activeMonthTotal)),
    summaryRow('Average messages per folder',
        average($messageCount, $activeFolderTotal)),
    summaryRow('Average messages per recipient',
        average($messageCount, $recipientTotal)),
    summaryRow('Average messages per sender',
        average($messageCount, $senderTotal))
));

report('Emails by Month', 'Month',
    \%sendMonthCount, \%receiveMonthCount, \&yearMonthMap);
report('Emails by Day of Week', 'Week day',
    \%sendWeekDayCount, \%receiveWeekDayCount, \&weekDayMap);
report('Emails by Hour', 'Hour',
    \%sendHourCount, \%receiveHourCount, \&scalarIdentity);
baseReport('Top 10 Folders', 'Folder',
    \%sendFolderCount, \%receiveFolderCount, \&redact, \&sortByTotal, 1);
baseReport('Top 10 Email Addresses', 'Address',
    \%recipientCount, \%senderCount, \&redact, \&sortByTotal, 1);

if ($full) {
    report('Emails by Folder', 'Folder',
        \%sendFolderCount, \%receiveFolderCount, \&redact);
    report('Emails by Address', 'Address',
        \%recipientCount, \%senderCount, \&redact);
}

print body(
    # The anchor is named 'top' so that the href '#top' links resolve to it
    a({name=>'top'}, h1({align=>'center'}, "Email Analytics for $me")),
    h2("Table of contents"),
    ul(@toc),
    @body
)->as_HTML();

# Process a single mail folder.
# The folder's file name is taken from $_ (this is a File::Find callback).
# Anything that is not a plain file, or that ends in .msf or .dat, is skipped.
# Updates the global message, address, folder and time-based counters.
sub process {
    return unless (-f $_);
    return if (/\.(?:msf|dat)$/);

    my $folderName = $_;
    print STDERR "Processing folder $folderName\n";

    my $folder = Mail::Box::Mbox->new(folder => $folderName, lock_type => 'NONE');
    unless (defined($folder)) {
        print STDERR "Unable to open folder $folderName, skipping it\n";
        return;
    }

    foreach my $m ($folder->messages) {
        $messageCount++;

        my $t = $m->timestamp();
        my ($min, $hour, $mon, $year, $wday, $yday) =
            (localtime($t))[1, 2, 4, 5, 6, 7];

        # Round hour
        $hour++ if ($min > 30);
        $hour = 0 if ($hour == 24);
        $hour = sprintf("%02d", $hour);

        $year += 1900;
        # localtime months are zero-based; store 01..12 (see yearMonthMap)
        my $ym = sprintf("%d-%02d", $year, $mon + 1);
        my $day = "$year-$yday";

        my $to = $m->head->get('to');
        if (defined($to)) {
            foreach my $addr ($to->addresses) {
                my $email = addressMap($addr->address);
                $recipientCount{$email}++;
                print STDERR "Recipient: $email\n" if ($verbose);
            }
        }

        my $from = $m->head->get('From');
        if (defined($from)) {
            foreach my $addr ($from->addresses) {
                my $email = addressMap($addr->address);
                $senderCount{$email}++;
                # Per-folder counts are keyed on the folder's file name
                # rather than on the stringified folder object.
                if ($email eq $me) {
                    $sendHourCount{$hour}++;
                    $sendMonthCount{$ym}++;
                    $sendWeekDayCount{$wday}++;
                    $sendDayCount{$day}++;
                    $sendFolderCount{$folderName}++;
                } else {
                    $receiveHourCount{$hour}++;
                    $receiveMonthCount{$ym}++;
                    $receiveWeekDayCount{$wday}++;
                    $receiveDayCount{$day}++;
                    $receiveFolderCount{$folderName}++;
                }
                print STDERR "Sender: $email\n" if ($verbose);
            }
        }
    }

    # Release the folder through its public interface; nothing is written back
    $folder->close(write => 'NEVER');
}

# Append a report section with a Sent / Received / Total table to @body.
#   $title     - section heading
#   $keyName   - heading of the first column
#   $send      - reference to the hash of sent counts
#   $receive   - reference to the hash of received counts
#   $map       - code reference turning a key into its displayed form
#   $sort      - code reference that receives and returns the list of rows
#   $summarize - when true, only the first 10 rows are shown
sub baseReport {
    my ($title, $keyName, $send, $receive, $map, $sort, $summarize) = @_;

    my @table = (
        {border=>1},
        &tr(
            th($keyName),
            th('Sent'),
            th('Received'),
            th('Total'),
        )
    );

    my @rows;
    foreach my $key (unique(sort (keys %$send, keys %$receive))) {
        # A key may be present in only one of the two hashes
        my $sent = $send->{$key} || 0;
        my $received = $receive->{$key} || 0;
        push(@rows, &tr(
            td($map->($key)),
            td({align=>'right'}, $sent),
            td({align=>'right'}, $received),
            td({align=>'right'}, $sent + $received)
        ));
    }

    my @ordered = $sort->(@rows);
    # Truncate rather than slice: a [0..9] list slice pads short lists with undef
    splice(@ordered, 10) if ($summarize && @ordered > 10);

    section($title);
    push(@body, table(@table, @ordered));
}

# Report by key name and by volume
sub report {
    my ($title, $keyName, $send, $receive, $map) = @_;

    baseReport($title, $keyName, $send, $receive, $map, \&arrayIdentity);
    baseReport($title . " Ordered by Volume", $keyName, $send, $receive,
        $map, \&sortByTotal);
}

# Sort an array using the byTotal function
sub sortByTotal {
    return sort byTotal @_;
}

# Sort comparator: order table row HTML elements by the value of their
# fourth column (Total), largest first
sub byTotal {
    return rowTotal($b) <=> rowTotal($a);
}

# Return the content of the fourth cell (Total) of a table row HTML element
sub rowTotal {
    my ($row) = @_;
    return $row->content->[3]->content->[0];
}

# Remove duplicate elements from a sorted array
sub unique {
    my (@in) = @_;

    my @out;
    my $prev;
    foreach my $item (@in) {
        next if (defined($prev) && $item eq $prev);
        push(@out, $item);
        $prev = $item;
    }
    return @out;
}

# Map a day of week number into its name
sub weekDayMap {
    my ($num) = @_;
    return $dayOfWeekName[$num];
}

# Map 2009-01 into Jan 2009
sub yearMonthMap {
    my ($ym) = @_;
    my ($y, $m) = ($ym =~ m/(\d+)\-(\d+)/);
    # Month keys are 01..12; @monthName is zero-based
    return $monthName[$m - 1] . ' ' . $y;
}

# Redact alphabetic characters to x if $redact is true.
# The configured own address ($me) is never redacted.
sub redact {
    my ($name) = @_;

    return $name unless ($redact);
    return $name if ($name eq $me);
    $name =~ s/[a-z]/x/g;
    $name =~ s/[A-Z]/X/g;
    return $name;
}

# Return the first argument unchanged
sub scalarIdentity {
    return $_[0];
}

# Return the argument list unchanged
sub arrayIdentity {
    return @_;
}

# Start a new HTML section: adds a back-to-top link and a numbered heading
# anchor to @body, and a matching entry to the table of contents
sub section {
    my ($title) = @_;

    push(@body, a({href=>'#top'}, p('Back to top')));
    push(@body, a({name=>"h" . ++$hcount}, h2($title)));
    push(@toc, li(a({href=>"#h$hcount"}, $title)));
}

# Round a non-negative number to the nearest integer
sub round {
    my ($number) = shift;
    return int($number + .5);
}

# Build one row of the summary table: left-aligned label, right-aligned value
sub summaryRow {
    my ($label, $value) = @_;
    return &tr(th({align=>'left'}, $label), td({align=>'right'}, $value));
}

# Rounded $total / $count, or 0 when there is nothing to divide by
sub average {
    my ($total, $count) = @_;
    return 0 unless ($count);
    return round($total / $count);
}