diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-11-11 21:38:33 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-11-11 21:38:33 -0500 |
commit | fd5f83ba7c24d041e8378d6bbfb4325f20eb0dd7 (patch) | |
tree | 0f545ec67261fbfff82a620b556201625dd75c00 | |
parent | f56b7c1e35c38116d99b06458f2c63dda939a8cc (diff) |
switch to new multi-reference format for mert
-rwxr-xr-x | dpmert/dpmert.pl | 56 |
1 file changed, 38 insertions, 18 deletions
diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl index 52ce0fc0..2e6a9728 100755 --- a/dpmert/dpmert.pl +++ b/dpmert/dpmert.pl @@ -67,6 +67,7 @@ my $bleu_weight=1; my $use_make = 1; # use make to parallelize line search my $useqsub; my $pass_suffix = ''; +my $devset = ''; my $cpbin=1; # Process command-line options Getopt::Long::Configure("no_auto_abbrev"); @@ -85,12 +86,13 @@ if (GetOptions( "pmem=s" => \$pmem, "cpbin!" => \$cpbin, "random-directions=i" => \$rand_directions, + "devset=s" => \$devset, "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, "weights=s" => \$initialWeights, "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, + "opt-iterations=i" => \$optimization_iters, ) == 0 || @ARGV!=1 || $help) { print_help(); exit; @@ -102,8 +104,17 @@ if ($useqsub) { } my @missing_args = (); -if (!defined $srcFile) { push @missing_args, "--source-file"; } -if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (defined $srcFile || defined $refFiles) { + die <<EOT; + + The options --ref-files and --source-file are no longer supported. + Please specify the input file and its reference translations with + --devset FILE + +EOT +} + +if (!defined $devset) { push @missing_args, "--devset"; } if (!defined $initialWeights) { push @missing_args, "--weights"; } die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); @@ -133,8 +144,6 @@ if ($metric =~ /^ter$|^aer$/i) { $DIR_FLAG = ''; } -my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); - unless ($dir){ $dir = "dpmert"; } @@ -229,8 +238,10 @@ if ($dryrun){ check_call("cp $iniFile $newIniFile"); $iniFile = $newIniFile; +split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); +my $refs = "-r $dir/dev.refs"; my $newsrc = "$dir/dev.input"; -enseg($srcFile, $newsrc); +enseg("$dir/dev.input.raw", $newsrc); $srcFile = $newsrc; my $devSize = 0; open F, "<$srcFile" or die "Can't read $srcFile: $!"; @@ -293,7 +304,7 @@ while (1){ $retries++; } die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; @@ -346,7 +357,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -490,14 +501,6 @@ sub get_lines { return $lc; } -sub get_comma_sep_refs { - my ($r,$p) = @_; - my $o = check_output("echo $p"); - chomp $o; - my @files = split /\s+/, $o; - return "-$r " . 
join(" -$r ", @files); -} - sub read_weights_file { my ($file) = @_; open F, "<$file" or die "Couldn't read $file: $!"; @@ -530,8 +533,7 @@ sub write_config { print $fh "DECODER: $decoder\n"; print $fh "INI FILE: $iniFile\n"; print $fh "WORKING DIR: $dir\n"; - print $fh "SOURCE (DEV): $srcFile\n"; - print $fh "REFS (DEV): $refFiles\n"; + print $fh "DEVSET: $devset\n"; print $fh "EVAL METRIC: $metric\n"; print $fh "START ITERATION: $iteration\n"; print $fh "MAX ITERATIONS: $max_iterations\n"; @@ -698,3 +700,21 @@ sub escaped_shell_args_str { sub escaped_cmdline { return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); } + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + |