From 22b3bffd201fe43cea4c4639bcefc3c0e4746518 Mon Sep 17 00:00:00 2001 From: redpony Date: Thu, 22 Jul 2010 20:52:22 +0000 Subject: add additional filtering step git-svn-id: https://ws10smt.googlecode.com/svn/trunk@368 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/clsp.config | 2 +- gi/pipeline/evaluation-pipeline.pl | 18 ++++++++++++++---- gi/pipeline/scripts/refilter.pl | 37 +++++++++++++++++++++++++++++++++++++ gi/pipeline/scripts/rekey.pl | 8 ++++++++ gi/pipeline/scripts/sort-by-key.sh | 2 +- 5 files changed, 61 insertions(+), 6 deletions(-) create mode 100755 gi/pipeline/scripts/refilter.pl create mode 100755 gi/pipeline/scripts/rekey.pl (limited to 'gi') diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index f7f131a0..f8b899f9 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -4,7 +4,7 @@ btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /export/ws10smt/data/chinese-english corpus.zh-en.al -aren /export/ws10smt/data/arabic-english corpus.ar-en.al +aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index b2656985..13fe07cf 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -7,6 +7,7 @@ my $CWD = getcwd; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } my $JOBS = 15; +my $PMEM = "9G"; # featurize_grammar may add multiple features from a single feature extractor # the key in this map is the extractor name, the value is a list of the extracted features @@ -58,6 +59,7 @@ my @DEFAULT_FEATS = qw( PassThrough Glue GlueTop LanguageModel WordPenalty ); +my $FILTERBYF = "$SCRIPT_DIR/scripts/filter-by-f.pl"; my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; my $EXTOOLS = "$SCRIPT_DIR/../../extools"; @@ -67,7 +69,7 @@ die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; my $DISTVEST = "$VEST/dist-vest.pl"; my $FILTER = "$EXTOOLS/filter_grammar"; my $FEATURIZE = "$EXTOOLS/featurize_grammar"; -assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST); +assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF); my $numtopics = 25; @@ -126,6 +128,7 @@ if (GetOptions( "backoff-grammar=s" => \$bkoffgram, "glue-grammar=s" => \$gluegram, "data=s" => \$dataDir, + "pmem=s" => \$PMEM, "features=s@" => \@features, "use-fork" => \$usefork, "jobs=i" => \$JOBS, @@ -218,7 +221,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned'); if (-f $tuned_weights) { print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; } else { - my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --ref-files=$drefs --source-file=$dev --weights $weights $devini"; + my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; print STDERR "MERT COMMAND: $cmd\n"; `rm -rf $outdir/vest 2> /dev/null`; chdir $outdir or die "Can't chdir to $outdir: $!"; @@ -248,6 +251,10 @@ exit 0; sub write_random_weights_file { my ($file, @extras) = @_; + if (-f $file) { + print STDERR "$file exists - REUSING!\n"; + return; + } open F, ">$file" or die "Can't write $file: $!"; my @feats = (@DEFAULT_FEATS, @extras); for my $feat (@feats) { @@ -262,12 +269,15 @@ sub write_random_weights_file { sub filter { my ($grammar, $set, $name, $outdir) = @_; my $out1 = mydircat($outdir, "$name.filt.gz"); + my $out2 = mydircat($outdir, "$name.f_feat.gz"); my $outgrammar = mydircat($outdir, "$name.scfg.gz"); if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1"; safesystem($out1, $cmd) or die "Filtering failed."; - $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $outgrammar"; - safesystem($outgrammar, $cmd) or die "Featurizing failed"; + $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2"; + safesystem($out2, $cmd) or die "Featurizing failed"; + $cmd = "$FILTERBYF $out2 $outgrammar"; + safesystem($outgrammar, $cmd) or die "Secondary filtering failed"; } return $outgrammar; } diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl new file mode 100755 index 00000000..11a36ebe --- /dev/null +++ b/gi/pipeline/scripts/refilter.pl @@ -0,0 +1,37 @@ +#!/usr/bin/perl -w +use strict; + +my $NUM_TRANSLATIONS = 30; +my $pk = ''; +my %dict; +while(<>) { + s/^(.+)\t//; + my $key = $1; + if ($key ne $pk) { + if ($pk) { + emit_dict(); + } + %dict = (); + $pk = $key; + } + my ($lhs, $f, $e, $s) = split / \|\|\| /; + my $score = 0; + if ($s =~ /XEF=([^ ]+)/) { + $score += $1; + } else { die; } + if ($s =~ /GenerativeProb=([^ ]+)/) { + $score += ($1 / 10); + } else { die; } + $dict{"$lhs ||| $f ||| $e ||| $s"} = $score; +} +emit_dict(); + +sub emit_dict { + my $cc = 0; + for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) { + print "$k"; + $cc++; + if ($cc >= $NUM_TRANSLATIONS) { last; } + } +} + diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl new file mode 100755 index 00000000..31eb86b8 --- /dev/null +++ b/gi/pipeline/scripts/rekey.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +while(<>) { + my ($lhs, $f, $e, $s) = split / \|\|\| /; + $f =~ s/\[X[0-9]+\]/\[X\]/g; + print "$f\t$_"; +} + diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh index 948dd4df..7ae33e03 100755 --- a/gi/pipeline/scripts/sort-by-key.sh +++ b/gi/pipeline/scripts/sort-by-key.sh @@ -1,5 +1,5 @@ #!/bin/bash export LANG=C -sort -t $'\t' -k 1 +sort -t $'\t' -k 1 -T /tmp -S 6000000000 -- cgit v1.2.3