diff options
-rw-r--r-- | gi/pipeline/OLD.clsp.config | 9 | ||||
-rwxr-xr-x | gi/pipeline/OLD.evaluation-pipeline.pl | 4 | ||||
-rw-r--r-- | gi/pipeline/clsp.config | 4 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 107 |
4 files changed, 69 insertions, 55 deletions
diff --git a/gi/pipeline/OLD.clsp.config b/gi/pipeline/OLD.clsp.config new file mode 100644 index 00000000..cd0f9d65 --- /dev/null +++ b/gi/pipeline/OLD.clsp.config @@ -0,0 +1,9 @@ +# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED +# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... +btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al +zhen /export/ws10smt/data/chinese-english corpus.zh-en.al +aren /export/ws10smt/data/arabic-english corpus.ar-en.al +uren /export/ws10smt/data/urdu-english corpus.ur-en.al +nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al + diff --git a/gi/pipeline/OLD.evaluation-pipeline.pl b/gi/pipeline/OLD.evaluation-pipeline.pl index 06c89b60..49c303eb 100755 --- a/gi/pipeline/OLD.evaluation-pipeline.pl +++ b/gi/pipeline/OLD.evaluation-pipeline.pl @@ -43,7 +43,7 @@ my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); -my $config = "$SCRIPT_DIR/clsp.config"; +my $config = "$SCRIPT_DIR/OLD.clsp.config"; print STDERR "CORPORA CONFIGURATION: $config\n"; open CONF, "<$config" or die "Can't read $config: $!"; my %paths; @@ -233,7 +233,7 @@ EOT sub print_help { print STDERR<<EOT; -Usage: $0 [OPTIONS] language-pair grammar.bidir.gz +Usage: $0 [OPTIONS] language-pair unfiltered-grammar.gz Given an induced grammar for an entire corpus (i.e., generated by local-gi-pipeline.pl), filter and featurize it for a dev and test set, diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index cd0f9d65..49feada0 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -1,6 +1,6 @@ # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... -btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /export/ws10smt/data/chinese-english corpus.zh-en.al aren /export/ws10smt/data/arabic-english corpus.ar-en.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 06c89b60..8414308d 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -6,32 +6,37 @@ my $CWD = getcwd; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -my @DEFAULT_FEATS = qw( - LogRuleCount SingletonRule LexE2F LexF2E WordPenalty - LogFCount LanguageModel Glue GlueTop PassThrough SingletonF -); +# featurize_grammar may add multiple features from a single feature extractor +# the key in this map is the extractor name, the value is a list of the extracted features +my $feat_map = { + "LogRuleCount" => [ "LogRuleCount", "SingletonRule" ] , + "LexProb" => [ "LexE2F", "LexF2E" ] , +}; my %init_weights = qw( - LogRuleCount 0.2 - LexE2F -0.3 - LexF2E -0.3 - LogFCount 0.1 - WordPenalty -1.5 - LanguageModel 1.2 - Glue -1.0 - GlueTop 0.00001 - PassThrough -10.0 - SingletonRule -0.1 - X_EGivenF -0.3 - X_FGivenE -0.3 - X_LogECount -1 - X_LogFCount -0.1 - X_LogRuleCount 0.3 - X_SingletonE -0.1 - X_SingletonF -0.1 - X_SingletonRule -0.5 + EGivenF -0.735245 + FGivenE -0.219391 + Glue -0.306709 + GlueTop 0.0473331 + LanguageModel 2.40403 + LexE2F -0.266989 + LexF2E -0.550373 + LogECount -0.129853 + LogFCount -0.194037 + LogRuleCount 0.256706 + PassThrough -0.9304905 + SingletonE -3.04161 + SingletonF 0.0714027 + SingletonRule -0.889377 + WordPenalty -7.99495 ); + +# these features are included by default +my @DEFAULT_FEATS = qw( Glue GlueTop LanguageModel WordPenalty ); + + + my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; my $EXTOOLS = "$SCRIPT_DIR/../../extools"; @@ -39,9 +44,9 @@ die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; my $VEST = "$SCRIPT_DIR/../../vest"; die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; -my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; -assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); +my $FILTER = "$EXTOOLS/filter_grammar"; +my $FEATURIZE = "$EXTOOLS/featurize_grammar"; +assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST); my $config = "$SCRIPT_DIR/clsp.config"; print STDERR "CORPORA CONFIGURATION: $config\n"; @@ -53,7 +58,6 @@ my %devs; my %devrefs; my %tests; my %testevals; -my %xgrammars; print STDERR " LANGUAGE PAIRS:"; while(<CONF>) { chomp; @@ -61,11 +65,10 @@ while(<CONF>) { next if /^\s*$/; s/^\s+//; s/\s+$//; - my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/; + my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; $paths{$name} = $path; $corpora{$name} = $corpus; $lms{$name} = $lm; - $xgrammars{$name} = $xgrammar; $devs{$name} = $dev; $devrefs{$name} = $devref; $tests{$name} = $xtests[0]; @@ -78,16 +81,27 @@ my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); my $outdir = "$CWD/exp"; my $help; -my $XFEATS; -my $EXTRA_FILTER = ''; +my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; +my @features; if (GetOptions( "data=s" => \$dataDir, - "xfeats" => \$XFEATS, + "features=s@" => \@features, ) == 0 || @ARGV!=2 || $help) { print_help(); exit; } + +my @xfeats; +for my $feat (@features) { + my $rs = $feat_map->{$feat}; + if (!defined $rs) { die "DON'T KNOW ABOUT FEATURE $feat\n"; } + my @xfs = @$rs; + @xfeats = (@xfeats, @xfs); + $FEATURIZER_OPTS .= " -f $feat"; +} +print STDERR "X-FEATS: @xfeats\n"; + my $lp = $ARGV[0]; my $grammar = $ARGV[1]; print STDERR " CORPUS REPO: $dataDir\n"; @@ -113,17 +127,16 @@ my $teval = mydircat($corpdir, $testevals{$lp}); die "Can't find test: $test\n" unless -f $test; assert_exec($teval); -if ($XFEATS) { - my $xgram = mydircat($corpdir, $xgrammars{$lp}); - die "Can't find x-grammar: $xgram" unless -f $xgram; - $EXTRA_FILTER = "$ADDXFEATS $xgram |"; - print STDERR "ADDING X-FEATS FROM $xgram\n"; -} +`mkdir -p $outdir`; + +# CREATE INIT WEIGHTS +print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; +my $weights = mydircat($outdir, "weights.init"); +write_random_weights_file($weights, @xfeats); # MAKE DEV print STDERR "\nFILTERING FOR dev...\n"; print STDERR "DEV: $dev (REFS=$drefs)\n"; -`mkdir -p $outdir`; my $devgrammar = filter($grammar, $dev, 'dev', $outdir); my $devini = mydircat($outdir, "cdec-dev.ini"); write_cdec_ini($devini, $devgrammar); @@ -138,11 +151,6 @@ my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights); - # VEST print STDERR "\nMINIMUM ERROR TRAINING\n"; @@ -182,12 +190,6 @@ sub write_random_weights_file { my ($file, @extras) = @_; open F, ">$file" or die "Can't write $file: $!"; my @feats = (@DEFAULT_FEATS, @extras); - if ($XFEATS) { - my @xfeats = qw( - X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF - ); - @feats = (@feats, @xfeats); - } for my $feat (@feats) { my $r = rand(1.6); my $w = $init_weights{$feat} * $r; @@ -199,10 +201,13 @@ sub write_random_weights_file { sub filter { my ($grammar, $set, $name, $outdir) = @_; + my $out1 = mydircat($outdir, "$name.filt.gz"); my $outgrammar = mydircat($outdir, "$name.scfg.gz"); if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { - my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar"; - safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!"; + my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1"; + safesystem($out1, $cmd) or die "Filtering failed."; + $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $outgrammar"; + safesystem($outgrammar, $cmd) or die "Featurizing failed"; } return $outgrammar; } |