#!/usr/bin/perl -w
use utf8;


#########################################################################
#### Author: Eckhard Bick 2003,2008; contact: eckhard.bick@mail.dk ######
#########################################################################
#
# use: eval_cg file1 file2
#
# compares two cg files, either a gold file with a test file, or simply two different runs on the same input.
#
# (1) Input format can either be niceline or cohort format
#   (a) niceline: word     [base] <...> ... POS MORF ... @FUNC ...
#       optional extra tags on the same line: either §SEM .. or %ERROR ...
#   (b) cohort: "<word>"
#                    "base" <...> ... POS MORF ... @FUNC
#                    "base" <...> ... POS MORF ... @FUNC
#                    ...
# However, only for syntatic tags (@) and extra tags (% or §) is ambiguity allowed - cohort format will be truncated to one morphological line per cohort (the first). Output format will be "niceline".
#
# (2) rewrites testfile with difference markers (3a-d), followed by file1 line number and file1 tag (in parentheses). For syntax, the parenthesis will also contain a hit-out-of count, e.g. 1/2.
#
# (3) At the end of the rewritten file2, eval_cg will output file difference as an evaluation metrics, providing recall, precision and F-score for
#      (a) base form *B
#      (b) part of speech *P
#      (c) morphology *M
#      (d) syntax *S
#      (e) extra *E (could e.g. be §SEM, or % markers for spell/grammarchecking or komma
#      (f) extra_exist *X (i.e. whether or not there is a %/§-tag at all - useful for e.g. komma-evaluation) ... on the line, *Xover marks a precision error, *Xunder a recall error
#
# (4) The program has no special alignment needs - it will tolerate some tokenisation difference (e.g. regarding polylexicals). Tokenisation mismatches will be marked in the rewritten file2 as *T_missing, *T_extra and *T_mismatch followed by the corresponding line number in file1, in the case of many to many mismatches also with an unti-n/m indicator, where n and m are the respective line numbers of the point where alignment was reestablished
#
# Set $no_meta to 1 to provide for meta-markup mismatches (<..> lines ignored)
#
# Set $match_only to 1 to evaluate pos and morf only for matching words
#
# Set $sta_adj to 1 if you want to evaluate STA participles as adjectives
#
#########################################################################

$no_meta =0;
$match_only =0;
$sta_adj =1;

$goldfile =$ARGV[0];
$testfile =$ARGV[1];

$i=0;
open(FH, "<$goldfile");
while (<FH>) {
  $j++;
  s/\t.* PU .*\@PU[^%§\n]*/ /; # punctuation stripped of its tags
  s/^\$([^ \t]+)[ \t]([%§\n])/\$$1\t \[$1\] PU \@PU $2/;
  s/^\$([^ \t]+)\n/\$$1\t \[$1\] PU \@PU \n/;
#  if (/,/) {print "--gold: $_\n";}
  if (/^\"<(.*)>\"/) {
    $token =$1;
#    if ($i>0 && $goldline[$i-1] && $goldline[$i-1] =~ /\[/) {$goldline[$i-1] .= "\n";}
    $i++; $jgold[$i]=$j;
    $token_gold[$i] =$token;
    $goldline[$i] =$token;
  }
  elsif (/^[ \t]+\"(.*?)\"(.*)/) {
    $goldline[$i].= "\t\[$1\]$2";
    $words++;
  }
  elsif ($no_meta && /^</) {}
  elsif (/./) {
    $i++; $jgold[$i]=$j;
    s/(\@.*?)[ \t]+[\"\[].*/$1/; # only one morphological reading;
    s/=([\t ])/$1/; # normalise parts of polylexicals
    $goldline[$i] =$_;
    ($token) = $_ =~ /^([^ \t]+)/;
    $token_gold[$i] =$token;
#    print "i=$i goldline=$goldline[$i]";
    if (/\[.*\].* [A-Z]+ .*\@/) {$words++;}
  }
}
if (! ($goldline[$i] =~ /\n$/)) {$goldline[$i] .= "\n";}

foreach (@goldline) {
#  $n++; print "--$n -- $_\n";
  if ($_) {
    if (/\[/ && ! /\n/) {s/ *$/ \n/;}
    if (/<\*>/) {s/^(.)/\u$1/;} 
    s/(\@.*?)[ \t]+[\"\[].*/$1/; # only one morphological reading;
    s/ (\@[^ ]+)( .*| )\1([ \n])/$2$1$3/; # erase identical syntax readings;
    s/(\@[^ ]+ )\1/$1/;
    #  print "$_";
  }
}

$i=0;
open(FH, "<$testfile");
while (<FH>) {
  s/\t.* PU .*\@PU[^%§\n]*/ /; # punctuation stripped of its tags
  s/^\$([^ \t]+)[ \t]+([%§\n])/\$$1\t \[$1\] PU \@PU $2/;
  s/^\$([^ \t]+)\n/\$$1\t \[$1\] PU \@PU \n/;
#  if (/,/) {print "--$_\n";}
  if (/^\"<(.*)>\"/) {
    $token =$1;
#    if ($i>0 && $testline[$i-1] && $testline[$i-1] =~ /\[/) {$testline[$i-1] .= "\n";}
    $i++;
    $token_test[$i] =$token;
    $testline[$i] =$token;
  }
  elsif (/^[ \t]+\"(.*?)\"(.*)/) {
    $testline[$i].= "\t\[$1\]$2";
    if (! /\@/) {$testline[$i] .= " \@X";} # add dummy syntax where missing
  }
  elsif ($no_meta && /^</) {}
  elsif (/./) {
    $i++;
    s/=([\t ])/$1/; # normalise parts of polylexicals
    
    $testline[$i] =$_;
    ($token) = $_ =~ /^([^ \t]+)/;
    $token_test[$i] =$token;
  }
#  print "-$i $testline[$i]\n";
}

if (! ($testline[$i] =~ /\n$/)) {$testline[$i] .= "\n";}

foreach (@testline) {
#  $m++; print "--$m $_\n";
  if ($_) {
    if (/\[/ && ! /\n/) {s/ *$/ \n/;}
    if (/<\*>/) {s/^(.)/\u$1/;} 
    s/(\@.*?)[ \t]+[\"\[].*/$1/; # only one morphological reading;
    s/ (\@[^ ]+)( .*| )\1([ \n])/$2$1$3/; # erase identical syntax readings;
    s/(\@[^ ]+ )\1/$1/;
#    print "--$_";
  }
}



$num =$i;
$delta_gold =0;
$delta_test =0;

$eyeold =0;
for ($i=1; $i <=$num - $delta_test; $i++) {
  match:
  $sem_gold =""; $sem_test ="";
  $goldeye = $jgold[$i+$delta_gold];
 if ($goldline[$i+$delta_gold] && $testline[$i+$delta_test] && $goldline[$i+$delta_gold] =~ /\[.*\].* [A-Z]+ .*\@/ && $testline[$i+$delta_test] =~ /\[.*\].* [A-Z]+ .*\@/) {
 #   print "---i=" . ($i+$delta_gold) . " goldline=" . $goldline[$i+$delta_gold];
 #   print "---i=" . ($i+$delta_test) . " testline=" . $testline[$i+$delta_test];
    ($word_gold,$base_gold,$pos_gold,$morf_gold,$syn_gold) = $goldline[$i+$delta_gold] =~ /^(.*?)[\t ]+[\"\[](.*?)[\"\]].*? ([A-Z]+)( [A-Za-z0-9\/\- ]*).*?(\@[\w\-<>\@ ]+?) *[\#\n%&£]/;
#    print "--$word_gold,$base_gold,$pos_gold,$morf_gold,$syn_gold\n";
    ($word_test,$base_test,$pos_test,$morf_test,$syn_test) = $testline[$i+$delta_test] =~ /^(.*?)[\t ]+[\"\[](.*?)[\"\]].*? ([A-Z]+)( [A-Za-z0-9\/\- ]*).*?(\@[\w\-<>\@ ]+?) *([\#\n%&£]|\*[A-Z])/; # cave: there might already be a *T added to the line - if gold had an extra token here
#    if (! $word_test ) {print "problem: formal error in line " . ($i+$delta_test) . " " .  $testline[$i+$delta_test] . "\n";}
#    print "---$word_gold $word_test ---, $pos_gold $pos_test\n";
    $syn_gold =~ s/ $//;
    $syn_test =~ s/ $//;
    if ($goldline[$i+$delta_gold] =~ / ([%§][\w\-<>%§ ]+)/) {
      $sem_gold = $1;
      $sem_gold =~ s/ $//;
#      print "--sem_gold=$sem_gold\n";
    }
    if ($testline[$i+$delta_test] =~ / ([%§][\w\-<>%§ ]+)/) {
      $sem_test = $1;
      $sem_test =~ s/ $//;
#      print "--sem_test=$sem_test ### $testline[$i+$delta_test]\n";
    }
#    if (! $syn_test) {print "--no syn for " . ($i + $delta_test) . " -- " . $testline[$i+$delta_test] . "\n";}

    if ($word_gold && $word_test && ($word_gold eq $word_test || $pos_gold eq $pos_test)) { # move here from higher up, to allow counting of pos and morf (for the current word even in mismatches, i.e. for instance the first part of token-mismatched MWE, but also simple one-word typo mismatches. Syn counts were already unaffected before the change, because they had their own gold count (due to allowed ambiguity)
      $word_match++;
      $morf_gold =~ s/^ +//;
      $morf_gold =~ s/ +$//;
      if ($morf_test) {
	$morf_test =~ s/^ +//;
	$morf_test =~ s/ +$//;
      }
      else {
#	print "no morf_test in line $i\n";
      }
#      print "$word_test, $pos_test / $pos_gold, $morf_test / $morf_gold, $syn_test\n";
      if ($pos_gold eq $pos_test) {$pos_match++;}
      elsif ($sta_adj && ($pos_gold eq "ADJ" && $testline[$i+$delta_test] =~ /(STA|PCP1.*\@>N)/ || $pos_test eq "ADJ" && $goldline[$i+$delta_gold] =~ /(STA|PCP1.*\@>N)/)) {$pos_match++;}
      elsif ($pos_gold eq "KP" && $testline[$i+$delta_test] =~ /<kp>/) {$pos_match++;}
      else {
	$testline[$i+$delta_test] =~ s/$/ *P$goldeye($pos_gold)/;
      }
      if ($base_gold eq $base_test) {$base_match++;}
      else {
	$testline[$i+$delta_test] =~ s/$/ *B$goldeye($base_gold)/;
      }
      if ($morf_gold eq $morf_test) {$morf_match++;}
      else {
	$testline[$i+$delta_test] =~ s/$/ *M$goldeye($morf_gold)/;
      }

### comparing syns

      @syns = split / /, $syn_test;
      $synerror =0; $synyes =0;
      @syngolds = split / /, $syn_gold;
      $syn_ct_gold += @syngolds;

      foreach (@syns) {
	$syn_ct++;
	$syn_statitem = $_;
#	print "---------$syn_statitem -- $syn_gold --------\n";
	if ($syn_gold =~ /$syn_statitem( .*)?$/) {
	  $syn_match++;
	  $synyes++;
	}
	else {
	  $synerror++;
	}
      }
      if ($synerror) {
	$testline[$i+$delta_test] =~ s/$/ *S$goldeye($synyes\/$synerror:$syn_gold)/;
      }

### comparing sems, if any (*E extras)

      if ($sem_gold) {
	@semgolds = split / /, $sem_gold;
	$sem_ct_gold += @semgolds;
	$sem_exist_ct_gold++;
	if (! $sem_test) {
	  $testline[$i+$delta_test] =~ s/$/ *E$goldeye(0\/0:$sem_gold) *Xunder/;
	}
      }
      if ($sem_test) {
	@sems = split / /, $sem_test;
	$semerror =0; $semyes =0;

	foreach (@sems) {
	  $sem_ct++;
	  $sem_statitem = $_;
	  #	print "---------$sem_statitem -- $sem_gold --------\n";
	  if ($sem_gold && $sem_gold =~ /$sem_statitem( .*)?$/) {
	    $sem_match++;
	    $semyes++;
	  }
	  else {
	    $semerror++;
	  }
	}
	if ($semerror) {
	  if (! $sem_gold) {$sem_gold = "NONE";}
	  $testline[$i+$delta_test] =~ s/$/ *E$goldeye($semyes\/$semerror:$sem_gold)/;
	}
	$sem_exist_ct++;
	if (! $sem_gold || $sem_gold =~ /NONE/) {
#	  $sem_exist_error++;
	  $testline[$i+$delta_test] =~ s/$/ *Xover/;
	}
	else {
	  $sem_exist_match++;
#	  print "--sem_exist_match=$sem_exist_match --- sem_exist_ct_gold=$sem_exist_ct_gold ---sem_exist_error=$sem_exist_error\n";
	}
      }
    }
    elsif ($goldline[$i+$delta_gold] =~ /\[.*\].* [A-Z]+ .*\@/ && $testline[$i+$delta_test+1] =~ /\[.*\].* [A-Z]+ .*\@/ && $token_gold[$i+$delta_gold] eq $token_test[$i+$delta_test+1]) {
#      print "problem: extra word in test at line " . ($i+$delta_test) . "\n";
      $testline[$i+$delta_test] =~ s/$/ *T_extra$goldeye/;
      $delta_test++;
      goto match;
    }
    elsif ($goldline[$i+$delta_gold+1] =~ /\[.*\].* [A-Z]+ .*\@/ && $testline[$i+$delta_test] =~ /\[.*\].* [A-Z]+ .*\@/ && $token_gold[$i+$delta_gold+1] eq $token_test[$i+$delta_test]) {
#      print "problem: extra word in gold at line " . ($i+$delta_gold) . "\n";
      $testline[$i+$delta_test] =~ s/$/ *T_missing$goldeye/;
      $delta_gold++;
      goto match;
    }
    elsif ($goldline[$i+$delta_gold+1] =~ /\[.*\].* [A-Z]+ .*\@/ && $testline[$i+$delta_test+1] =~ /\[.*\].* [A-Z]+ .*\@/ && $token_gold[$i+$delta_gold+1] eq $token_test[$i+$delta_test+1]) {
 #     print "problem: non-matching words at line " . ($i+$delta_test) . "/" . ($i+$delta_gold) . "\n";
      $testline[$i+$delta_test] =~ s/$/ *T_mismatch$goldeye/;
      $delta_test++; $delta_gold++;
      goto match;
    }
    elsif ($token_test[$i+$delta_test+1] eq $token_gold[$i+$delta_gold+1]) {
 #     print "problem: non-matching word in test at line " . ($i+$delta_gold) . " / " . ($i+$delta_test) . "\n";
      $testline[$i+$delta_test] =~ s/$/ *T_mismatch$goldeye/;
      $delta_gold++; $delta_test++;
      next;
    }
    else {
      for ($z=0; $z <=10; $z++) {
	for ($n=2; $n <=2+$z; $n++) {
	  for ($y=0; $y <=$z; $y++) {
	    for ($dir=-1; $dir <=1; $dir =$dir+2) {
	      $m =$n + $y*$dir;
	      if ($m >=2 && $m <= 2+$z) {
		if ($token_test[$i+$delta_test+$n] eq $token_gold[$i+$delta_gold+$m] && $token_test[$i+$delta_test+$n+1] eq $token_gold[$i+$delta_gold+$m+1]) {
		  $testline[$i+$delta_test] =~ s/$/ *T_mismatch$goldeye\_until_$n\/$m/;
		  $delta_test += $n;
		  $delta_gold += $m;
#	    print "problem: non-matching word in test at line " . ($i+$delta_gold) . " / " . ($i+$delta_test) . " - rematching at +$n/+$m\n";
		  goto match;
		}
	      }
	    }
	  }
	}
      }

      print "--problem: serious multi-line mismatch at line " . ($i+$delta_test) . " - exiting\n";
#      next;
      return;
    }

  }
  elsif (! $goldline[$i+$delta_gold]) {
    $goldshort++;
    print "-golddata shorter at thest $i + $delta_gold\n";
  }
  elsif (! $testline[$i+$delta_test]) {
    $testshort++;
#    print "-testdata shorter\n";
  }
#  print "--$eyeold $testline[$i+$delta_test]\n";
  for ($eye=$eyeold+1; $eye<=$i+$delta_test; $eye++) {
    if ($testline[$eye]) {print $testline[$eye];}
  }
  $eyeold =$eye-1;
}

if ($goldshort) {print "--golddata short by $goldshort lines\n";}
if ($testshort) {print "--testdata short by $testshort lines\n";}

if ($syn_match) {
  $recall_syn = int (1000 * ($syn_match/$syn_ct_gold) + 0.5) / 10;
  $precision_syn = int (1000 * ($syn_match/$syn_ct)+ 0.5) / 10;

  $f_syn = int (10 *  (2 * $recall_syn * $precision_syn) / ($recall_syn + $precision_syn + 0.1)+ 0.5) / 10;

  if ($match_only) {$pos_ct =$word_match; $pos_words =$word_match;}
  else {$pos_ct =$words; $pos_words =$words;}
  $recall_pos = int (1000 * ($pos_match/$pos_words)+ 0.5) / 10;
  $precision_pos = int (1000 * ($pos_match/$pos_ct)+ 0.5) / 10;
  $f_pos = int (10 *  (2 * $recall_pos * $precision_pos) / ($recall_pos + $precision_pos)+ 0.5) / 10;

  if ($match_only) {$morf_ct =$word_match; $morf_words =$word_match;}
  else {$morf_ct =$words; $morf_words =$words;}
  $recall_morf = int (1000 * ($morf_match/$morf_words)+ 0.5) / 10;
  $precision_morf = int (1000 * ($morf_match/$morf_ct)+ 0.5) / 10;
  $f_morf = int (10 *  (2 * $recall_morf * $precision_morf) / ($recall_morf + $precision_morf)+ 0.5) / 10;

  $base_ct =$words;
  $recall_base = int (1000 * ($base_match/$words)+ 0.5) / 10;
  $precision_base = int (1000 * ($base_match/$base_ct)+ 0.5) / 10;
  $f_base = int (10 *  (2 * $recall_base * $precision_base) / ($recall_base + $precision_base)+ 0.5) / 10;

  print "$num tokens, $words real words\n";
  print "pos_match =$pos_match, pos_words =$pos_words, pos_ct =$pos_ct\n";
  print "morf_match =$morf_match, morf_words =$morf_words, morf_ct =$morf_ct\n\n";
  print "syntax: recall=$recall_syn precision=$precision_syn F-score=$f_syn\n";
  print "pos: recall=$recall_pos precision=$precision_pos F-score=$f_pos\n";
  print "morphology: recall=$recall_morf precision=$precision_morf F-score=$f_morf\n";
  print "base form: recall=$recall_base precision=$precision_base F-score=$f_base\n";

  if ($sem_ct_gold) {
    $recall_sem = int (1000 * ($sem_match/$sem_ct_gold) + 0.5) / 10;
    $precision_sem = int (1000 * ($sem_match/$sem_ct)+ 0.5) / 10;

    $f_sem = int (10 *  (2 * $recall_sem * $precision_sem) / ($recall_sem + $precision_sem + 0.1)+ 0.5) / 10;
    print "extras: recall=$recall_sem precision=$precision_sem F-score=$f_sem\n";

### evaluate only existance of extra tag, not its type
    $recall_sem_exist = int (1000 * ($sem_exist_match/$sem_exist_ct_gold) + 0.5) / 10;
    $precision_sem_exist = int (1000 * ($sem_exist_match/$sem_exist_ct)+ 0.5) / 10;

    $f_sem_exist = int (10 *  (2 * $recall_sem_exist * $precision_sem_exist) / ($recall_sem_exist + $precision_sem_exist + 0.1)+ 0.5) / 10;
    print "--sem_exist_match=$sem_exist_match, sem_exist_ct_gold=$sem_exist_ct_gold, sem_exist_ct=$sem_exist_ct\n";
    print "extras_exist: recall=$recall_sem_exist precision=$precision_sem_exist F-score=$f_sem_exist\n";
  }


}
