diff options
author | Patrick Simianer <p@simianer.de> | 2017-11-09 16:12:02 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2017-11-09 16:12:02 +0100 |
commit | 8840c569c862e10c79f30015875bae1e03ae38b7 (patch) | |
tree | 33b3d08a3b476124e4a94df279e5f97078b0e290 | |
parent | 60b2027c4f1991276dc3cfd4a2359070f7fa9c62 (diff) |
rm
-rw-r--r-- | mteval-v14.pl | 1179 | ||||
-rwxr-xr-x | wrap-xml.perl | 41 |
2 files changed, 0 insertions, 1220 deletions
diff --git a/mteval-v14.pl b/mteval-v14.pl deleted file mode 100644 index 84a7549..0000000 --- a/mteval-v14.pl +++ /dev/null @@ -1,1179 +0,0 @@ -#!/usr/bin/env perl - -use warnings; -use strict; -use utf8; -use Encode; -use XML::Twig; -use Sort::Naturally; - -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - - -################################# -# History: -# -# version 14 -# (2016-03-29 lukas.diduch@nist.gov) -# * Fixed warning message in case seg-id is a string, by sorting in correct order using Sort::Naturally. -# -# version 13b -# * Fixed die 'bug' in case seg->id = 0 -# -# version 13a -# * modified the scoring functions to prevent division-by-zero errors when a system segment is empty -# * affected methods: 'bleu_score' and 'bleu_score_smoothing' -# -# version 13 -# * Uses a XML parser to read data (only when extension is .xml) -# * Smoothing of the segment-level BLEU scores, done by default -# * smoothing method similar to that of bleu-1.04.pl (IBM) -# * see comments above the 'bleu_score' method for more details on how the smoothing is computed -# * added a '--no-smoothing' option to simulate old scripts behavior -# * Introduction of the 'brevity-penalty' option, taking one of two values: -# * 'closest' (default) : act as IBM BLEU (taking the closest reference translation length) -# * in case two reference translations are at the same distance, will take the shortest one -# * for more details regarding how the BP is computed, see comments of the 'brevity_penalty_closest' function -# * 'shortest' : act as previous versions of the script (taking shortest reference translation length) -# * Introduction of the 'international-tokenization' option, boolean, disabled by default -# by default (when the option is not provided), uses 11b's tokenization function -# when option specified, uses v12's tokenization function -# * Introduction of a 'Metrics MATR output' flag (option '--metricsMATR') -# when used, creates three files for both BLEU score and NIST score: -# * BLEU-seg.scr and NIST-seg.scr: contain segment-level scores -# * BLEU-doc.scr and NIST-doc.scr: contain document-level scores -# * BLEU-sys.scr and NIST-sys.scr: contain system-level scores -# * SGML parsing -# * script will halt if source, reference and test files don't share the same setid attribute value (used for metricsMATR output) -# * correct segment IDs extracted from the files (was previously using an array, and using the index as a segID for output) -# * detailed output flag (-d) can now be used when running both BLEU and NIST -# -# version 12 -# * Text normalization changes: -# * convert entity references (only the entities declared in the DTD) -# * now uses unicode categories -# * tokenize punctuation unless followed AND preceded by digits -# * tokenize symbols -# * UTF-8 handling: -# * files are now read using utf8 mode -# * Added the '-e' command-line option to enclose non-ASCII characters between spaces -# -# version 11b -- text normalization modified: -# * take out the join digit line because it joins digits -# when it shouldn't have -# $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits -# -# version 11a -- corrected output of individual n-gram precision values -# -# version 11 -- bug fixes: -# * make filehandle operate in binary mode to prevent Perl from operating -# (by default in Red Hat 9) in UTF-8 -# * fix failure on joining digits -# version 10 -- updated output to include more details of n-gram scoring. -# Defaults to generate both NIST and BLEU scores. Use -b for BLEU -# only, use -n for NIST only -# -# version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4 -# being the max, regardless what was entered on the command line.) -# -# version 09c -- bug fix (During the calculation of ngram information, -# each ngram was being counted only once for each segment. This has -# been fixed so that each ngram is counted correctly in each segment.) -# -# version 09b -- text normalization modified: -# * option flag added to preserve upper case -# * non-ASCII characters left in place. -# -# version 09a -- text normalization modified: -# * " and & converted to "" and &, respectively -# * non-ASCII characters kept together (bug fix) -# -# version 09 -- modified to accommodate sgml tag and attribute -# names revised to conform to default SGML conventions. -# -# version 08 -- modifies the NIST metric in accordance with the -# findings on the 2001 Chinese-English dry run corpus. Also -# incorporates the BLEU metric as an option and supports the -# output of ngram detail. -# -# version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI -# Keep strings of non-ASCII characters together as one word -# (rather than splitting them into one-character words). -# Change length penalty so that translations that are longer than -# the average reference translation are not penalized. -# -# version 06 -# Prevent divide-by-zero when a segment has no evaluation N-grams. -# Correct segment index for level 3 debug output. -# -# version 05 -# improve diagnostic error messages -# -# version 04 -# tag segments -# -# version 03 -# add detailed output option (intermediate document and segment scores) -# -# version 02 -# accommodation of modified sgml tags and attributes -# -# version 01 -# same as bleu version 15, but modified to provide formal score output. -# -# original IBM version -# Author: Kishore Papineni -# Date: 06/10/2001 -################################# - -###### -# Intro -my ($date, $time) = date_time_stamp(); -print "MT evaluation scorer began on $date at $time\n"; -print "\ncommand line: ", $0, " ", join(" ", @ARGV), "\n"; -my $usage = "\n\nUsage: $0 -r <ref_file> -s <src_file> -t <tst_file>\n\n". - "Description: This Perl script evaluates MT system performance.\n". - "\n". - "Required arguments:\n". - " -r <ref_file> is a file containing the reference translations for\n". - " the documents to be evaluated.\n". - " -s <src_file> is a file containing the source documents for which\n". - " translations are to be evaluated\n". - " -t <tst_file> is a file containing the translations to be evaluated\n". - "\n". - "Optional arguments:\n". - " -h prints this help message to STDOUT\n". - " -c preserves upper-case alphabetic characters\n". - " -b generate BLEU scores only\n". - " -n generate NIST scores only\n". - " -d detailed output flag:\n". - " 0 (default) for system-level score only\n". - " 1 to include document-level scores\n". - " 2 to include segment-level scores\n". - " 3 to include ngram-level scores\n". - " -e enclose non-ASCII characters between spaces\n". - " --brevity-penalty ( closest | shortest )\n" . - " closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" . - " shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" . - " --international-tokenization\n" . - " when specified, uses Unicode-based (only) tokenization rules\n" . - " when not specified (default), uses default tokenization (some language-dependant rules)\n" . - " --metricsMATR : create three files for both BLEU scores and NIST scores:\n" . - " BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" . - " BLEU-doc.scr and NIST-doc.scr : document-level scores\n" . - " BLEU-sys.scr and NIST-sys.scr : system-level scores\n" . - " --no-smoothing : disable smoothing on BLEU scores\n" . - "\n"; - -use vars qw ($opt_r $opt_s $opt_t $opt_d $opt_h $opt_b $opt_n $opt_c $opt_x $opt_e); -use Getopt::Long; -my $ref_file = ''; -my $src_file = ''; -my $tst_file = ''; -my $detail = 0; -my $help = ''; -my $preserve_case = ''; -my $split_non_ASCII = ''; -my $brevity_penalty = 'closest'; -my $international_tokenization; -my $metricsMATR_output = ''; -my $no_smoothing = ''; -our $opt_x = ''; -our $opt_b = ''; -our $opt_n = ''; -GetOptions( - 'r=s' => \$ref_file, - 's=s' => \$src_file, - 't=s' => \$tst_file, - 'd:i' => \$detail, - 'h|help' => \$help, - 'b', - 'n', - 'c' => \$preserve_case, - 'x:s', - 'e' => \$split_non_ASCII, - 'brevity-penalty:s' => \$brevity_penalty, - 'international-tokenization' => \$international_tokenization, - 'metricsMATR-output' => \$metricsMATR_output, - 'no-smoothing' => \$no_smoothing -); -die $usage if $help; - -die "Error in command line: ref_file not defined$usage" unless ( $ref_file ); -die "Error in command line: src_file not defined$usage" unless ( $src_file ); -die "Error in command line: tst_file not defined$usage" unless ( $tst_file ); -my $BLEU_BP; -if ( !( $brevity_penalty cmp 'closest' ) ) -{ - $BLEU_BP = \&brevity_penalty_closest; -} -elsif ( !( $brevity_penalty cmp 'shortest' ) ) -{ - $BLEU_BP = \&brevity_penalty_shortest; -} -else -{ - die "Incorrect value supplied for 'brevity_penalty'$usage"; -} -my $TOKENIZATION = \&tokenization; -$TOKENIZATION = \&tokenization_international if ( $international_tokenization ); - -my $BLEU_SCORE = \&bleu_score; -$BLEU_SCORE = \&bleu_score_nosmoothing if ( $no_smoothing ); - -my $max_Ngram = 9; - -my $METHOD = "BOTH"; -if ( $opt_b ) { $METHOD = "BLEU"; } -if ( $opt_n ) { $METHOD = "NIST"; } -my $method; - -###### -# Global variables -my ($src_lang, $tgt_lang, @tst_sys, @ref_sys); # evaluation parameters -my (%tst_data, %ref_data); # the data -- with structure: {system}{document}{segments} -my ($src_id, $ref_id, $tst_id); # unique identifiers for ref and tst translation sets -my %eval_docs; # document information for the evaluation data set -my %ngram_info; # the information obtained from (the last word in) the ngram - -###### -# Get source document ID's -($src_id) = get_source_info ($src_file); - -###### -# Get reference translations -($ref_id) = get_MT_data (\%ref_data, "RefSet", $ref_file); - -compute_ngram_info (); - -###### -# Get translations to evaluate -($tst_id) = get_MT_data (\%tst_data, "TstSet", $tst_file); - -###### -# Check data for completeness and correctness -check_MT_data (); - -###### -# -my %NISTmt; -my %NISTOverall; -my %BLEUmt; -my %BLEUOverall; - -###### -# Evaluate -print "\nEvaluation of $src_lang-to-$tgt_lang translation using:\n"; -my $cum_seg = 0; -foreach my $doc (sort keys %eval_docs) -{ - $cum_seg += scalar( keys( %{$eval_docs{$doc}{SEGS}} ) ); -} -print " src set \"$src_id\" (", scalar keys %eval_docs, " docs, $cum_seg segs)\n"; -print " ref set \"$ref_id\" (", scalar keys %ref_data, " refs)\n"; -print " tst set \"$tst_id\" (", scalar keys %tst_data, " systems)\n\n"; - -foreach my $sys (sort @tst_sys) -{ - for (my $n=1; $n<=$max_Ngram; $n++) - { - $NISTmt{$n}{$sys}{cum} = 0; - $NISTmt{$n}{$sys}{ind} = 0; - $BLEUmt{$n}{$sys}{cum} = 0; - $BLEUmt{$n}{$sys}{ind} = 0; - } - if ( ($METHOD eq "BOTH") || ($METHOD eq "NIST") ) - { - $method="NIST"; - score_system ($sys, \%NISTmt, \%NISTOverall); - } - if ( ($METHOD eq "BOTH") || ($METHOD eq "BLEU") ) - { - $method="BLEU"; - score_system ($sys, \%BLEUmt, \%BLEUOverall); - } -} - -###### -printout_report (); -if ( $metricsMATR_output ) -{ - outputMetricsMATR( 'NIST', %NISTOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'NIST' ) ); - outputMetricsMATR( 'BLEU', %BLEUOverall ) if ( ( $METHOD eq 'BOTH' ) || ( $METHOD eq 'BLEU' ) ); -} - -($date, $time) = date_time_stamp(); -print "\nMT evaluation scorer ended on $date at $time\n"; - -exit 0; - -################################# - -sub get_source_info -{ - my ($file) = @_; - my ($name, $id, $src, $doc, $seg); - my ($data, $tag, $span); - - # Extension of the file determines the parser used: - # .xml : XML::Twig - # otherwise : simple SGML parsing functions - if ( $file =~ /\.xml$/i ) - { - my $twig = XML::Twig->new(); - $twig->parsefile( $file ); - my $root = $twig->root; - my $currentSet = $root->first_child( 'srcset' ); - die "Source XML file '$file' does not contain the 'srcset' element" if ( not $currentSet ); - $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; - $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No srcset 'srclang' attribute value in '$file'"; - die "Not the same srclang attribute values across sets" unless ( not defined $src_lang or $src eq $src_lang ); - $src_lang = $src; - foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) ) - { - my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'"; - foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) ) - { - - my $segID = $currentSeg->{ 'att' }->{ 'id' }; - die "No segment 'id' attribute value in '$file'" if (! defined $segID); - my $segData = $currentSeg->text; - ($eval_docs{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData ); - } - } - } - else - { - #read data from file - open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; - binmode FILE, ":utf8"; - $data .= $_ while <FILE>; - close (FILE); - - #get source set info - die "\n\nFATAL INPUT ERROR: no 'src_set' tag in src_file '$file'\n\n" - unless ($tag, $span, $data) = extract_sgml_tag_and_span ("SrcSet", $data); - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); - die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" - ." with $name in previous input data ('$src_lang')\n\n" - unless (not defined $src_lang or $src eq $src_lang); - $src_lang = $src; - - #get doc info -- ID and # of segs - $data = $span; - while (($tag, $span, $data) = extract_sgml_tag_and_span ("Doc", $data)) - { - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($doc) = extract_sgml_tag_attribute ($name="DocID", $tag); - die "\n\nFATAL INPUT ERROR: duplicate '$name' in file '$file'\n\n" - if defined $eval_docs{$doc}; - $span =~ s/[\s\n\r]+/ /g; # concatenate records - my $nseg=0, my $seg_data = $span; - while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) - { - die "\n\nFATAL INPUT ERROR: no attribute '$name' in file '$file'\n\n" - unless ($seg) = extract_sgml_tag_attribute( $name='id', $tag ); - ($eval_docs{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span ); - $nseg++; - } - die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" - if $nseg == 0; - } - die "\n\nFATAL INPUT ERROR: no documents in file '$file'\n\n" - unless keys %eval_docs > 0; - } - return $id; -} - -################################# - -sub get_MT_data -{ - my ($docs, $set_tag, $file) = @_; - my ($name, $id, $src, $tgt, $sys, $doc, $seg); - my ($tag, $span, $data); - - # Extension of the file determines the parser used: - # .xml : XML::Twig - # otherwise : simple SGML parsing functions - if ( $file =~ /\.xml$/i ) - { - my $twig = XML::Twig->new(); - $twig->parsefile( $file ); - my $root = $twig->root; - foreach my $currentSet ( $root->get_xpath( 'refset' ), $root->get_xpath( 'tstset' ) ) - { - $id = $currentSet->{ 'att' }->{ 'setid' } or die "No 'setid' attribute value in '$file'"; - $src = $currentSet->{ 'att' }->{ 'srclang' } or die "No 'srclang' attribute value in '$file'"; - $tgt = $currentSet->{ 'att' }->{ 'trglang' } or die "No 'trglang' attribute value in '$file'"; - die "Not the same 'srclang' attribute value across sets" unless ( $src eq $src_lang ); - die "Not the same 'trglang' attribute value across sets" unless ( ( not defined $tgt_lang ) or ( $tgt = $tgt_lang ) ); - $tgt_lang = $tgt; - my $sys; - if ( $currentSet->name eq 'tstset' ) - { - $sys = $currentSet->{ 'att' }->{ 'sysid' } or die "No 'sysid' attribute value in '$file'"; - } - else - { - $sys = $currentSet->{ 'att' }->{ 'refid' } or die "No 'refid' attribute value in '$file'"; - } - foreach my $currentDoc ( $currentSet->get_xpath( './/doc' ) ) - { - my $docID = $currentDoc->{ 'att' }->{ 'docid' } or die "No document 'docid' attribute value in '$file'"; - $docs->{ $sys }{ $docID }{ FILE } = $file; - foreach my $currentSeg ( $currentDoc->get_xpath( './/seg' ) ) - { - my $segID = $currentSeg->{ 'att' }->{ 'id' }; - die "No segment 'id' attribute value in '$file'" if (! defined $segID); - my $segData = $currentSeg->text; - ($docs->{$sys}{$docID}{SEGS}{$segID}) = &{ $TOKENIZATION }( $segData ); - } - } - } - } - else - { - #read data from file - open (FILE, $file) or die "\nUnable to open translation data file '$file'", $usage; - binmode FILE, ":utf8"; - $data .= $_ while <FILE>; - close (FILE); - - #get tag info - while (($tag, $span, $data) = extract_sgml_tag_and_span ($set_tag, $data)) - { - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($id) = extract_sgml_tag_attribute ($name="SetID", $tag); - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($src) = extract_sgml_tag_attribute ($name="SrcLang", $tag); - die "\n\nFATAL INPUT ERROR: $name ('$src') in file '$file' inconsistent\n" - ." with $name of source ('$src_lang')\n\n" - unless $src eq $src_lang; - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless ($tgt) = extract_sgml_tag_attribute ($name="TrgLang", $tag); - die "\n\nFATAL INPUT ERROR: $name ('$tgt') in file '$file' inconsistent\n" - ." with $name of the evaluation ('$tgt_lang')\n\n" - unless (not defined $tgt_lang or $tgt eq $tgt_lang); - $tgt_lang = $tgt; - - my $mtdata = $span; - while (($tag, $span, $mtdata) = extract_sgml_tag_and_span ("Doc", $mtdata)) - { - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless (my $sys) = extract_sgml_tag_attribute ($name="SysID", $tag); - die "\n\nFATAL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless $doc = extract_sgml_tag_attribute ($name="DocID", $tag); - die "\n\nFATAL INPUT ERROR: document '$doc' for system '$sys' in file '$file'\n" - ." previously loaded from file '$docs->{$sys}{$doc}{FILE}'\n\n" - unless (not defined $docs->{$sys}{$doc}); - - $span =~ s/[\s\n\r]+/ /g; # concatenate records - my $nseg=0, my $seg_data = $span; - while (($tag, $span, $seg_data) = extract_sgml_tag_and_span ("Seg", $seg_data)) - { - die "\n\nFATAIL INPUT ERROR: no tag attribute '$name' in file '$file'\n\n" - unless $seg = extract_sgml_tag_attribute( $name="id", $tag ); - ($docs->{$sys}{$doc}{SEGS}{$seg}) = &{ $TOKENIZATION }( $span ); - $nseg++; - } - die "\n\nFATAL INPUT ERROR: no segments in document '$doc' in file '$file'\n\n" if $nseg == 0; - $docs->{$sys}{$doc}{FILE} = $file; - } - } - } - return $id; -} - -################################# - -sub check_MT_data -{ - @tst_sys = sort keys %tst_data; - @ref_sys = sort keys %ref_data; - - die "Not the same 'setid' attribute values across files" unless ( ( $src_id eq $tst_id ) && ( $src_id eq $ref_id ) ); - -#every evaluation document must be represented for every system and every reference - foreach my $doc (sort keys %eval_docs) - { - my $nseg_source = scalar( keys( %{$eval_docs{$doc}{SEGS}} ) ); - foreach my $sys (@tst_sys) - { - die "\n\nFATAL ERROR: no document '$doc' for system '$sys'\n\n" unless defined $tst_data{$sys}{$doc}; - my $nseg = scalar( keys( %{$tst_data{$sys}{$doc}{SEGS}} ) ); - die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" - ." document '$doc' for system '$sys' contains $nseg segments, while\n" - ." the source document contains $nseg_source segments.\n\n" - unless $nseg == $nseg_source; - } - foreach my $sys (@ref_sys) - { - die "\n\nFATAL ERROR: no document '$doc' for reference '$sys'\n\n" unless defined $ref_data{$sys}{$doc}; - my $nseg = scalar( keys( %{$ref_data{$sys}{$doc}{SEGS}} ) ); - die "\n\nFATAL ERROR: translated documents must contain the same # of segments as the source, but\n" - ." document '$doc' for system '$sys' contains $nseg segments, while\n" - ." the source document contains $nseg_source segments.\n\n" - unless $nseg == $nseg_source; - } - } -} - -################################# - -sub compute_ngram_info -{ - my ($ref, $doc, $seg); - my (@wrds, $tot_wrds, %ngrams, $ngram, $mgram); - my (%ngram_count, @tot_ngrams); - - foreach $ref (keys %ref_data) - { - foreach $doc (keys %{$ref_data{$ref}}) - { - foreach $seg ( keys %{$ref_data{$ref}{$doc}{SEGS}}) - { - @wrds = split /\s+/, $ref_data{ $ref }{ $doc }{ SEGS }{ $seg }; - $tot_wrds += @wrds; - %ngrams = %{Words2Ngrams (@wrds)}; - foreach $ngram (keys %ngrams) - { - $ngram_count{$ngram} += $ngrams{$ngram}; - } - } - } - } - - foreach $ngram (keys %ngram_count) - { - @wrds = split / /, $ngram; - pop @wrds, $mgram = join " ", @wrds; - $ngram_info{$ngram} = - log ($mgram ? $ngram_count{$ngram}/$ngram_count{$mgram} : $ngram_count{$ngram}/$tot_wrds) / log 2; - if (defined $opt_x and $opt_x eq "ngram info") - { - @wrds = split / /, $ngram; - printf "ngram info:%9.4f%6d%6d%8d%3d %s\n", $ngram_info{$ngram}, $ngram_count{$ngram}, - $mgram ? $ngram_count{$mgram} : $tot_wrds, $tot_wrds, scalar @wrds, $ngram; - } - } -} - -################################# - -sub score_system -{ - my ($sys, $ref, $doc, $SCOREmt, $overallScore); - ($sys, $SCOREmt, $overallScore) = @_; - my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); - my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); - - $cum_ref_length = 0; - for (my $j=1; $j<=$max_Ngram; $j++) - { - $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; - } - foreach $doc (sort keys %eval_docs) - { - ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_document ($sys, $doc, $overallScore); - if ( $method eq "NIST" ) - { - my %DOCmt = (); - my $docScore = nist_score( scalar( @ref_sys ), $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, \%DOCmt ); - $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; - if ( $detail >= 1 ) - { - printf "$method score using 5-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", - $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; - } - } - - if ( $method eq "BLEU" ) - { - my %DOCmt = (); - my $docScore = &{$BLEU_SCORE}( $ref_length, $match_cnt, $tst_cnt, $sys, \%DOCmt ); - $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'score' } = $docScore; - if ( $detail >= 1 ) - { - printf "$method score using 4-grams = %.4f for system \"$sys\" on document \"$doc\" (%d segments, %d words)\n", - $docScore, scalar keys %{$tst_data{$sys}{$doc}{SEGS}}, $tst_cnt->[1]; - } - } - - $cum_ref_length += $ref_length; - for (my $j=1; $j<=$max_Ngram; $j++) - { - $cum_match[$j] += $match_cnt->[$j]; - $cum_tst_cnt[$j] += $tst_cnt->[$j]; - $cum_ref_cnt[$j] += $ref_cnt->[$j]; - $cum_tst_info[$j] += $tst_info->[$j]; - $cum_ref_info[$j] += $ref_info->[$j]; - printf "document info: $sys $doc %d-gram %d %d %d %9.4f %9.4f\n", $j, $match_cnt->[$j], - $tst_cnt->[$j], $ref_cnt->[$j], $tst_info->[$j], $ref_info->[$j] - if (defined $opt_x and $opt_x eq "document info"); - } - } - - if ($method eq "BLEU") - { - $overallScore->{ $sys }{ 'score' } = &{$BLEU_SCORE}($cum_ref_length, \@cum_match, \@cum_tst_cnt, $sys, $SCOREmt); - } - if ($method eq "NIST") - { - $overallScore->{ $sys }{ 'score' } = nist_score (scalar @ref_sys, \@cum_match, \@cum_tst_cnt, \@cum_ref_cnt, \@cum_tst_info, \@cum_ref_info, $sys, $SCOREmt); - } -} - -################################# - -sub score_document -{ - my ($sys, $ref, $doc, $overallScore); - ($sys, $doc, $overallScore) = @_; - my ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info); - my ($cum_ref_length, @cum_match, @cum_tst_cnt, @cum_ref_cnt, @cum_tst_info, @cum_ref_info); - - $cum_ref_length = 0; - for (my $j=1; $j<=$max_Ngram; $j++) - { - $cum_match[$j] = $cum_tst_cnt[$j] = $cum_ref_cnt[$j] = $cum_tst_info[$j] = $cum_ref_info[$j] = 0; - } - - # score each segment - foreach my $seg ( nsort keys( %{$tst_data{$sys}{$doc}{SEGS}} ) ) - { - - my @ref_segments = (); - foreach $ref (@ref_sys) - { - push @ref_segments, $ref_data{$ref}{$doc}{SEGS}{$seg}; - if ( $detail >= 3 ) - { - printf "ref '$ref', seg $seg: %s\n", $ref_data{$ref}{$doc}{SEGS}{$seg} - } - - } - - printf "sys '$sys', seg $seg: %s\n", $tst_data{$sys}{$doc}{SEGS}{$seg} if ( $detail >= 3 ); - ($ref_length, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info) = score_segment ($tst_data{$sys}{$doc}{SEGS}{$seg}, @ref_segments); - - if ( $method eq "BLEU" ) - { - my %DOCmt = (); - my $segScore = &{$BLEU_SCORE}($ref_length, $match_cnt, $tst_cnt, $sys, %DOCmt); - $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore; - if ( $detail >= 2 ) - { - printf " $method score using 4-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1] - } - } - if ( $method eq "NIST" ) - { - my %DOCmt = (); - my $segScore = nist_score (scalar @ref_sys, $match_cnt, $tst_cnt, $ref_cnt, $tst_info, $ref_info, $sys, %DOCmt); - $overallScore->{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' } = $segScore; - if ( $detail >= 2 ) - { - printf " $method score using 5-grams = %.4f for system \"$sys\" on segment $seg of document \"$doc\" (%d words)\n", $segScore, $tst_cnt->[1]; - } - } - $cum_ref_length += $ref_length; - for (my $j=1; $j<=$max_Ngram; $j++) - { - $cum_match[$j] += $match_cnt->[$j]; - $cum_tst_cnt[$j] += $tst_cnt->[$j]; - $cum_ref_cnt[$j] += $ref_cnt->[$j]; - $cum_tst_info[$j] += $tst_info->[$j]; - $cum_ref_info[$j] += $ref_info->[$j]; - } - } - return ($cum_ref_length, [@cum_match], [@cum_tst_cnt], [@cum_ref_cnt], [@cum_tst_info], [@cum_ref_info]); -} - -############################################################################################################################### -# function returning the shortest reference length -# takes as input: -# - currentLength : the current (shortest) reference length -# - referenceSentenceLength : the current reference sentence length -# - candidateSentenceLength : the current candidate sentence length (unused) -############################################################################################################################### -sub brevity_penalty_shortest -{ - my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_; - return ( $referenceSentenceLength < $currentLength ? $referenceSentenceLength : $currentLength ); -} - -############################################################################################################################### -# function returning the closest reference length (to the candidate sentence length) -# takes as input: -# - currentLength: the current (closest) reference length. -# - candidateSentenceLength : the current reference sentence length -# - candidateSentenceLength : the current candidate sentence length -# when two reference sentences are at the same distance, it will return the shortest reference sentence length -# example of 4 iterations, given: -# - one candidate sentence containing 7 tokens -# - one reference translation containing 11 tokens -# - one reference translation containing 8 tokens -# - one reference translation containing 6 tokens -# - one reference translation containing 7 tokens -# the multiple invokations will return: -# - currentLength is set to 11 (outside of this function) -# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 ) -# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8 -# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 ) -############################################################################################################################### -sub brevity_penalty_closest -{ - my ( $currentLength, $referenceSentenceLength, $candidateSentenceLength ) = @_; - my $result = $currentLength; - if ( abs( $candidateSentenceLength - $referenceSentenceLength ) <= abs( $candidateSentenceLength - $currentLength ) ) - { - if ( abs( $candidateSentenceLength - $referenceSentenceLength ) == abs( $candidateSentenceLength - $currentLength ) ) - { - if ( $currentLength > $referenceSentenceLength ) - { - $result = $referenceSentenceLength; - } - } - else - { - $result = $referenceSentenceLength; - } - } - return $result; -} - -################################# - -sub score_segment -{ - my ($tst_seg, @ref_segs) = @_; - my (@tst_wrds, %tst_ngrams, @match_count, @tst_count, @tst_info); - my (@ref_wrds, $ref_seg, %ref_ngrams, %ref_ngrams_max, @ref_count, @ref_info); - my ($ngram); - my (@nwrds_ref); - my $ref_length; - - for (my $j=1; $j<= $max_Ngram; $j++) - { - $match_count[$j] = $tst_count[$j] = $ref_count[$j] = $tst_info[$j] = $ref_info[$j] = 0; - } - -# get the ngram counts for the test segment - @tst_wrds = split /\s+/, $tst_seg; - %tst_ngrams = %{Words2Ngrams (@tst_wrds)}; - for (my $j=1; $j<=$max_Ngram; $j++) - { - # compute ngram counts - $tst_count[$j] = $j<=@tst_wrds ? (@tst_wrds - $j + 1) : 0; - } - -# get the ngram counts for the reference segments - foreach $ref_seg (@ref_segs) - { - @ref_wrds = split /\s+/, $ref_seg; - %ref_ngrams = %{Words2Ngrams (@ref_wrds)}; - foreach $ngram (keys %ref_ngrams) - { - # find the maximum # of occurrences - my @wrds = split / /, $ngram; - $ref_info[@wrds] += $ngram_info{$ngram}; - $ref_ngrams_max{$ngram} = defined $ref_ngrams_max{$ngram} ? max ($ref_ngrams_max{$ngram}, $ref_ngrams{$ngram}) : $ref_ngrams{$ngram}; - } - for (my $j=1; $j<=$max_Ngram; $j++) - { - # update ngram counts - $ref_count[$j] += $j<=@ref_wrds ? (@ref_wrds - $j + 1) : 0; - } - if ( not defined( $ref_length ) ) - { - $ref_length = scalar( @ref_wrds ); - } - else - { - $ref_length = &{$BLEU_BP}( $ref_length, scalar( @ref_wrds ), scalar( @tst_wrds ) ); - } - } - -# accumulate scoring stats for tst_seg ngrams that match ref_seg ngrams - foreach $ngram (keys %tst_ngrams) - { - next unless defined $ref_ngrams_max{$ngram}; - my @wrds = split / /, $ngram; - $tst_info[@wrds] += $ngram_info{$ngram} * min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); - $match_count[@wrds] += my $count = min($tst_ngrams{$ngram},$ref_ngrams_max{$ngram}); - printf "%.2f info for each of $count %d-grams = '%s'\n", $ngram_info{$ngram}, scalar @wrds, $ngram - if $detail >= 3; - } - - return ($ref_length, [@match_count], [@tst_count], [@ref_count], [@tst_info], [@ref_info]); -} - -################################# - -sub bleu_score_nosmoothing -{ - my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_; - my $score = 0; - my $iscore = 0; - - for ( my $j = 1; $j <= $max_Ngram; ++$j ) - { - if ($matching_ngrams->[ $j ] == 0) - { - $SCOREmt->{ $j }{ $sys }{ cum }=0; - } - else - { - my $len_score = min (0, 1-$ref_length/$tst_ngrams->[1]); - # Cumulative N-Gram score - $score += log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); - $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j + $len_score ); - # Individual N-Gram score - $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); - $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); - } - } - return $SCOREmt->{ 4 }{ $sys }{ cum }; -} - -############################################################################################################################### -# Default method used to compute the BLEU score, using smoothing. -# Note that the method used can be overridden using the '--no-smoothing' command-line argument -# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null -# k is 1 for the first 'n' value for which the n-gram match count is null -# For example, if the text contains: -# - one 2-gram match -# - and (consequently) two 1-gram matches -# the n-gram count for each individual precision score would be: -# - n=1 => prec_count = 2 (two unigrams) -# - n=2 => prec_count = 1 (one bigram) -# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) -# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) -############################################################################################################################### -sub bleu_score -{ - my ($ref_length, $matching_ngrams, $tst_ngrams, $sys, $SCOREmt) = @_; - my $score = 0; - my $iscore = 0; - my $exp_len_score = 0; - $exp_len_score = exp( min (0, 1 - $ref_length / $tst_ngrams->[ 1 ] ) ) if ( $tst_ngrams->[ 1 ] > 0 ); - my $smooth = 1; - for ( my $j = 1; $j <= $max_Ngram; ++$j ) - { - if ( $tst_ngrams->[ $j ] == 0 ) - { - $iscore = 0; - } - elsif ( $matching_ngrams->[ $j ] == 0 ) - { - $smooth *= 2; - $iscore = log( 1 / ( $smooth * $tst_ngrams->[ $j ] ) ); - } - else - { - $iscore = log( $matching_ngrams->[ $j ] / $tst_ngrams->[ $j ] ); - } - $SCOREmt->{ $j }{ $sys }{ ind } = exp( $iscore ); - $score += $iscore; - $SCOREmt->{ $j }{ $sys }{ cum } = exp( $score / $j ) * $exp_len_score; - } - return $SCOREmt->{ 4 }{ $sys }{ cum }; -} - -################################# - -sub nist_score -{ - my ($nsys, $matching_ngrams, $tst_ngrams, $ref_ngrams, $tst_info, $ref_info, $sys, $SCOREmt) = @_; - my $score = 0; - my $iscore = 0; - - for (my $n=1; $n<=$max_Ngram; $n++) - { - $score += $tst_info->[$n]/max($tst_ngrams->[$n],1); - $SCOREmt->{$n}{$sys}{cum} = $score * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); - $iscore = $tst_info->[$n]/max($tst_ngrams->[$n],1); - $SCOREmt->{$n}{$sys}{ind} = $iscore * nist_length_penalty($tst_ngrams->[1]/($ref_ngrams->[1]/$nsys)); - } - return $SCOREmt->{5}{$sys}{cum}; -} - -################################# - -sub Words2Ngrams -{ - #convert a string of words to an Ngram count hash - my %count = (); - - for (; @_; shift) - { - my ($j, $ngram, $word); - for ($j=0; $j<$max_Ngram and defined($word=$_[$j]); $j++) - { - $ngram .= defined $ngram ? " $word" : $word; - $count{$ngram}++; - } - } - return {%count}; -} - -################################# - -sub tokenization -{ - my ($norm_text) = @_; - -# language-independent part: - $norm_text =~ s/<skipped>//g; # strip "skipped" tags - $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines - $norm_text =~ s/\n/ /g; # join lines - $norm_text =~ s/"/"/g; # convert SGML tag for quote to " - $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & - $norm_text =~ s/</</g; # convert SGML tag for less-than to > - $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < - -# language-dependent part (assuming Western languages): - $norm_text = " $norm_text "; - $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; - $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation - $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit - $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit - $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit - $norm_text =~ s/\s+/ /g; # one space only between words - $norm_text =~ s/^\s+//; # no leading space - $norm_text =~ s/\s+$//; # no trailing space - - return $norm_text; -} - - -sub tokenization_international -{ - my ($norm_text) = @_; - - $norm_text =~ s/<skipped>//g; # strip "skipped" tags - #$norm_text =~ s/\p{Hyphen}\p{Zl}//g; # strip end-of-line hyphenation and join lines - $norm_text =~ s/\p{Zl}/ /g; # join lines - - # replace entities - $norm_text =~ s/"/\"/g; # quote to " - $norm_text =~ s/&/&/g; # ampersand to & - $norm_text =~ s/</</g; # less-than to < - $norm_text =~ s/>/>/g; # greater-than to > - $norm_text =~ s/'/\'/g; # apostrophe to ' - - $norm_text = lc( $norm_text ) unless $preserve_case; # lowercasing if needed - $norm_text =~ s/([^[:ascii:]])/ $1 /g if ( $split_non_ASCII ); - - # punctuation: tokenize any punctuation unless followed AND preceded by a digit - $norm_text =~ s/(\P{N})(\p{P})/$1 $2 /g; - $norm_text =~ s/(\p{P})(\P{N})/ $1 $2/g; - - $norm_text =~ s/(\p{S})/ $1 /g; # tokenize symbols - - $norm_text =~ s/\p{Z}+/ /g; # one space only between words - $norm_text =~ s/^\p{Z}+//; # no leading space - $norm_text =~ s/\p{Z}+$//; # no trailing space - - return $norm_text; -} - -################################# - -sub nist_length_penalty -{ - my ($ratio) = @_; - return 1 if $ratio >= 1; - return 0 if $ratio <= 0; - my $ratio_x = 1.5; - my $score_x = 0.5; - my $beta = -log($score_x)/log($ratio_x)/log($ratio_x); - return exp (-$beta*log($ratio)*log($ratio)); -} - -################################# - -sub date_time_stamp -{ - my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); - my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); - my ($date, $time); - $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; - $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; - return ($date, $time); -} - -################################# - -sub extract_sgml_tag_and_span -{ - my ($name, $data) = @_; - ($data =~ m|<$name\s*([^>]*)>(.*?)</$name\s*>(.*)|si) ? ($1, $2, $3) : (); -} - -################################# - -sub extract_sgml_tag_attribute -{ - my ($name, $data) = @_; - ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); -} - -################################# - -sub max -{ - my ($max, $next); - - return unless defined ($max=pop); - while (defined ($next=pop)) - { - $max = $next if $next > $max; - } - return $max; -} - -################################# - -sub min -{ - my ($min, $next); - - return unless defined ($min=pop); - while (defined ($next=pop)) - { - $min = $next if $next < $min; - } - return $min; -} - -################################# - -sub printout_report -{ - if ( $METHOD eq "BOTH" ) - { - foreach my $sys (sort @tst_sys) - { - printf "NIST score = %2.4f BLEU score = %.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum},$BLEUmt{4}{$sys}{cum}; - } - } - elsif ($METHOD eq "NIST" ) - { - foreach my $sys (sort @tst_sys) - { - printf "NIST score = %2.4f for system \"$sys\"\n",$NISTmt{5}{$sys}{cum}; - } - } - elsif ($METHOD eq "BLEU" ) - { - foreach my $sys (sort @tst_sys) - { - printf "\nBLEU score = %.4f for system \"$sys\"\n",$BLEUmt{4}{$sys}{cum}; - } - } - printf "\n# ------------------------------------------------------------------------\n\n"; - printf "Individual N-gram scoring\n"; - printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; - printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; - - if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "NIST") ) - { - foreach my $sys (sort @tst_sys) - { - printf " NIST:"; - for (my $i=1; $i<=$max_Ngram; $i++) - { - printf " %2.4f ",$NISTmt{$i}{$sys}{ind} - } - printf " \"$sys\"\n"; - } - printf "\n"; - } - - if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) - { - foreach my $sys (sort @tst_sys) - { - printf " BLEU:"; - for (my $i=1; $i<=$max_Ngram; $i++) - { - printf " %2.4f ",$BLEUmt{$i}{$sys}{ind} - } - printf " \"$sys\"\n"; - } - } - - printf "\n# ------------------------------------------------------------------------\n"; - printf "\nCumulative N-gram scoring\n"; - printf " 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram\n"; - printf " ------ ------ ------ ------ ------ ------ ------ ------ ------\n"; - - if (( $METHOD eq "BOTH" ) || ($METHOD eq "NIST")) - { - foreach my $sys (sort @tst_sys) - { - printf " NIST:"; - for (my $i=1; $i<=$max_Ngram; $i++) - { - printf " %2.4f ",$NISTmt{$i}{$sys}{cum} - } - printf " \"$sys\"\n"; - } - } - printf "\n"; - if ( ( $METHOD eq "BOTH" ) || ($METHOD eq "BLEU") ) - { - foreach my $sys (sort @tst_sys) - { - printf " BLEU:"; - for (my $i=1; $i<=$max_Ngram; $i++) - { - printf " %2.4f ",$BLEUmt{$i}{$sys}{cum} - } - printf " \"$sys\"\n"; - } - } -} - -############################################################################################################################### -# Create three files, by using: -# - $prefix : the prefix used for the output file names -# - %overall : a hash containing seg/doc/sys-level scores: -# - $overall{ $SYSTEM_ID }{ 'score' } => system-level score -# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'score' } => document-level score -# - $overall{ $SYSTEM_ID }{ 'documents' }{ $DOCUMENT_ID }{ 'segments' }{ $SEGMENT_ID } => segment-level score -############################################################################################################################### -sub outputMetricsMATR -{ - my ( $prefix, %overall ) = @_; - my $fileNameSys = $prefix . '-sys.scr'; - my $fileNameDoc = $prefix . '-doc.scr'; - my $fileNameSeg = $prefix . '-seg.scr'; - open FILEOUT_SYS, '>', $fileNameSys or die "Could not open file: ${fileNameSys}"; - open FILEOUT_DOC, '>', $fileNameDoc or die "Could not open file: ${fileNameDoc}"; - open FILEOUT_SEG, '>', $fileNameSeg or die "Could not open file: ${fileNameSeg}"; - foreach my $sys ( sort( keys( %overall ) ) ) - { - my $scoreSys = $overall{ $sys }{ 'score' }; - print FILEOUT_SYS "${tst_id}\t${sys}\t${scoreSys}\n"; - foreach my $doc ( sort( keys( %{$overall{ $sys }{ 'documents' }} ) ) ) - { - my $scoreDoc = $overall{ $sys }{ 'documents' }{ $doc }{ 'score' }; - print FILEOUT_DOC "${tst_id}\t${sys}\t${doc}\t${scoreDoc}\n"; - foreach my $seg ( nsort keys( %{$overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }} ) ) - { - my $scoreSeg = $overall{ $sys }{ 'documents' }{ $doc }{ 'segments' }{ $seg }{ 'score' }; - print FILEOUT_SEG "${tst_id}\t${sys}\t${doc}\t${seg}\t${scoreSeg}\n"; - } - } - } - close FILEOUT_SEG; - close FILEOUT_DOC; - close FILEOUT_SYS; -} - diff --git a/wrap-xml.perl b/wrap-xml.perl deleted file mode 100755 index 06303b7..0000000 --- a/wrap-xml.perl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl -w -# original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl -# (licensed under LGPL) - -use strict; - -my $src = $ARGV[0]; -my $language = $ARGV[1]; -die("syntax: wrap-xml.perl xml-frame language [system-name]") - unless $src && $language && -e $src; -my $system = "my-system"; -$system = $ARGV[2] if defined($ARGV[2]); - -open(SRC,$src); -my @OUT = <STDIN>; -chomp(@OUT); -#my @OUT = `cat $decoder_output`; -while(<SRC>) { - chomp; - if (/^<srcset/) { - s/<srcset/<tstset trglang="$language" sysid="$system"/; - } - elsif (/^<\/srcset/) { - s/<\/srcset/<\/tstset/; - } - elsif (/^<DOC/) { - s/<DOC/<DOC sysid="$system"/; - } - elsif (/<seg/) { - my $line = shift(@OUT); - $line = "" if $line =~ /NO BEST TRANSLATION/; - if (/<\/seg>/) { - s/(<seg[^>]+> *).+(<\/seg>)/$1$line$2/; - } - else { - s/(<seg[^>]+> *)[^<]+/$1$line/; - } - } - print $_."\n"; -} - |