#!/usr/bin/perl -w
#
# mk-roc-graphs -- create Receiver Operating Characteristic graphs
#
# Creates ROC curve graphs from a pair of SpamAssassin logs and the current
# score-set.  See <http://home.comcast.net/~tom.fawcett/public_html/ROCCH/>
# for details on ROC curves.
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use Getopt::Long;
our ($opt_cffile, $opt_spam, $opt_ham, $opt_count, $opt_lambda);

GetOptions("cffile=s", "count", "threshold=f", "spam=s",
	   "ham=s", "scoreset=i", "lambda=f");

$opt_cffile ||= "../rules";
$opt_spam ||= 'spam.log';
$opt_ham ||= 'ham.log';
$opt_count ||= 0;

$opt_scoreset = 0 if (!defined $opt_scoreset);

my $msgline;

our (%rules, %allrules);

my %spam_points = ();
my %ham_points = ();
my ($num_tests, $num_spam, $num_ham);

my $nybias = 10;

# lambda value for TCR equation, representing the cost of of an FP vs. the
# cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
# to sender asking for token, 999 = blocking or deleting a message.
#
# We roughly aim for a value representing "moved to infrequently-read folder".

my $lambda = 50;
if ($opt_lambda) { $lambda = $opt_lambda; }


readscores();
readlogs();

if ($opt_count) {
  $nybias = $nybias*($num_spam / $num_ham);
  evaluate_reports();
}
else {
  evaluate_scores();
}
exit 0;


# arguments are $isspam, $count, \@tests
sub log_line_count {
  my $score = 0;
  $score += $scores{$_} for @{$_[2]};

  $score = (int($score * 10) / 10);
  if ($_[0]) {
    $num_spam++;
    $spam_points{$score}++;
  }
  else {
    $num_ham++;
    $ham_points{$score}++;
  }
}

sub readlogs {
  my $count = 0;
  $num_spam = $num_ham = 0;

  # set handler for log lines
  my $log_line = \&log_line_count;

  foreach my $file ($opt_spam, $opt_ham) {
    open (IN, "<$file") || die "Could not open file '$file': $!";

    my $isspam = ($file eq $opt_spam);
    my $caught;			# 1st parameter of log line
    my $rules;			# 4th parameter of log line

    while (defined($msgline = <IN>)) {
      ($caught, undef, undef, $rules) = split(' ', $msgline);

      # only take lines starting with Y or .
      next unless ($caught eq 'Y' || $caught eq '.') && $rules;

      # get tests, but ignore unknown tests and subrules
      my @tests;
      foreach my $r (split(/,/, $rules)) {
        my $hits = 1;
        # Support compacted RULE(hitcount) format
        if ($r =~ s/\((\d+)\)$//) {
          $hits = $1;
        }
        next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
        push @tests, $r for (1 .. $hits);
      }

      # run handler
      $log_line->($isspam, $count, \@tests);

      # increment line
      $count++;
    }
    close IN;
  }
  $num_tests = $count;
}

sub readscores {
  warn "Reading scores from \"$opt_cffile\"...\n";
  system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
  require "./tmp/rules.pl";
  %allrules = %rules;           # ensure it stays global
}

sub _evaluate_at_all_thresholds {
  my $callback = shift;

  my %u=();
  my @scores = grep {defined} map {
    if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
  } sort { $a <=> $b } (keys %ham_points, keys %spam_points);

  my $tot_h = $num_ham || 0.00001;
  my $tot_s = $num_spam || 0.00001;

  foreach my $threshold (@scores) {
    my $fp = 0;     # false positives; ham marked as spam
    my $fn = 0;     # false negatives; spam marked as ham
    my $tp = 0;     # true positives; spam marked as spam
    my $tn = 0;     # true negatives; ham marked as ham
    foreach my $scr (@scores) {
      my $nh = $ham_points{$scr} || 0;
      my $ns = $spam_points{$scr} || 0;
      if ($threshold > $scr) {
        $tn += $nh;
        $fn += $ns;
      } else {
        $fp += $nh;
        $tp += $ns;
      }
    }
    $callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp);
  }
}

sub evaluate_reports {
  _evaluate_at_all_thresholds (sub {
    my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
    my $nh = $ham_points{$threshold} || 0;
    my $ns = $spam_points{$threshold} || 0;
    # $nh /= $tot_h;
    # $ns /= $tot_s;
    # $tn /= $tot_h;
    # $tp /= $tot_s;
    # $fp /= $tot_h;
    # $fn /= $tot_s;

    printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
    printf "# Correctly non-spam: %6d  %4.2f%%\n",
        $tn, ($tn /  $tot_h) * 100.0;
    printf "# Correctly spam:     %6d  %4.2f%%\n",
        $tp, ($tp /  $tot_s) * 100.0;
    printf "# False positives:    %6d  %4.2f%%\n",
        $fp, ($fp /  $tot_h) * 100.0;
    printf "# False negatives:    %6d  %4.2f%%\n",
        $fn, ($fn /  $tot_s) * 100.0;

    # convert to the TCR metrics used in the published lit
    my $nspamspam = $tp;
    my $nspamlegit = $fn;
    my $nlegitspam = $fp;
    my $nlegitlegit = $tn;
    my $nlegit = $tot_h;
    my $nspam = $tot_s;

    my $werr = ($lambda * $nlegitspam + $nspamlegit)
                    / ($lambda * $nlegit + $nspam);

    my $werr_base = $nspam
                    / ($lambda * $nlegit + $nspam);

    $werr ||= 0.000001;     # avoid / by 0
    my $tcr = $werr_base / $werr;

    my $sr = ($nspamspam / $nspam) * 100.0;
    my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
    printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
      $lambda, $tcr, $sr, $sp;
  });
}

sub evaluate_scores {
  _evaluate_at_all_thresholds (sub {
    my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
    my $nh = $ham_points{$threshold} || 0;
    my $ns = $spam_points{$threshold} || 0;
    $nh /= $tot_h;
    $ns /= $tot_s;
    $tn /= $tot_h;
    $tp /= $tot_s;
    $fp /= $tot_h;
    $fn /= $tot_s;
    printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n",
        $threshold, $nh, $ns, $tn, $tp, $fp, $fn;
  });
}

__DATA__

# Suggested commands to plot data from this script:

# ham score distribution:
./mk-roc-graphs --scoreset=3  > dat
gnuplot
set grid
plot [-5:40] [] "dat" using 1:2 with boxes

# spam score distribution:
./mk-roc-graphs --scoreset=3  > dat
gnuplot
set grid
plot [-5:40] [] "dat" using 1:3 with boxes

# plot a ROC curve:
./mk-roc-graphs --scoreset=3  > dat
gnuplot
set grid
plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp

# For a ROC curve, as shown in the literature
./mk-roc-graphs --scoreset=3 > dat
set grid
set xlabel "False Positives (1.0 = 100%)"
set ylabel "True Positives (1.0 = 100%)"
plot "dat" using 6:5 with linesp

# same, but tighter:
./mk-roc-graphs --scoreset=3  > dat
gnuplot
set grid
plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp

# and the piece de resistance:
./mk-roc-graphs --scoreset=0  > set0
./mk-roc-graphs --scoreset=1  > set1
./mk-roc-graphs --scoreset=2  > set2
./mk-roc-graphs --scoreset=3  > set3
gnuplot
set terminal png
set output "out.png"
set grid
set xlabel "False Positives (1.0 = 100% of ham marked as spam)"
set ylabel "False Negatives (1.0 = 100% of spam marked as ham)"
plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \
    "set1" using 6:7 with linesp, \
    "set2" using 6:7 with linesp, \
    "set3" using 6:7 with linesp


