diff options
| -rw-r--r-- | gi/pipeline/OLD.clsp.config | 9 | ||||
| -rwxr-xr-x | gi/pipeline/OLD.evaluation-pipeline.pl | 4 | ||||
| -rw-r--r-- | gi/pipeline/clsp.config | 4 | ||||
| -rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 107 | 
4 files changed, 69 insertions, 55 deletions
| diff --git a/gi/pipeline/OLD.clsp.config b/gi/pipeline/OLD.clsp.config new file mode 100644 index 00000000..cd0f9d65 --- /dev/null +++ b/gi/pipeline/OLD.clsp.config @@ -0,0 +1,9 @@ +# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED +# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... +btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al +zhen /export/ws10smt/data/chinese-english corpus.zh-en.al +aren /export/ws10smt/data/arabic-english corpus.ar-en.al +uren /export/ws10smt/data/urdu-english corpus.ur-en.al +nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al + diff --git a/gi/pipeline/OLD.evaluation-pipeline.pl b/gi/pipeline/OLD.evaluation-pipeline.pl index 06c89b60..49c303eb 100755 --- a/gi/pipeline/OLD.evaluation-pipeline.pl +++ b/gi/pipeline/OLD.evaluation-pipeline.pl @@ -43,7 +43,7 @@ my $FILTSCORE = "$EXTOOLS/filter_score_grammar";  my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl";  assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); -my $config = "$SCRIPT_DIR/clsp.config"; +my $config = "$SCRIPT_DIR/OLD.clsp.config";  print STDERR "CORPORA CONFIGURATION: $config\n";  open CONF, "<$config" or die "Can't read $config: $!";  my %paths; @@ -233,7 +233,7 @@ EOT  sub print_help {    print STDERR<<EOT; -Usage: $0 [OPTIONS] language-pair grammar.bidir.gz +Usage: $0 [OPTIONS] language-pair unfiltered-grammar.gz  Given an induced grammar for an entire corpus (i.e., generated by  local-gi-pipeline.pl), filter and featurize it for a dev and test set, diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index cd0f9d65..49feada0 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -1,6 +1,6 @@  # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... -btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh  fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al  zhen /export/ws10smt/data/chinese-english corpus.zh-en.al  aren /export/ws10smt/data/arabic-english corpus.ar-en.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 06c89b60..8414308d 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -6,32 +6,37 @@ my $CWD = getcwd;  my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -my @DEFAULT_FEATS = qw( -  LogRuleCount SingletonRule LexE2F LexF2E WordPenalty -  LogFCount LanguageModel Glue GlueTop PassThrough SingletonF -); +# featurize_grammar may add multiple features from a single feature extractor +# the key in this map is the extractor name, the value is a list of the extracted features +my $feat_map = { +  "LogRuleCount" => [ "LogRuleCount", "SingletonRule" ] , +  "LexProb" => [ "LexE2F", "LexF2E" ] , +};  my %init_weights = qw( -  LogRuleCount 0.2 -  LexE2F -0.3 -  LexF2E -0.3 -  LogFCount 0.1 -  WordPenalty -1.5 -  LanguageModel 1.2 -  Glue -1.0 -  GlueTop 0.00001 -  PassThrough -10.0 -  SingletonRule -0.1 -  X_EGivenF -0.3 -  X_FGivenE -0.3 -  X_LogECount -1 -  X_LogFCount -0.1 -  X_LogRuleCount 0.3 -  X_SingletonE -0.1 -  X_SingletonF -0.1 -  X_SingletonRule -0.5 +  EGivenF -0.735245 +  FGivenE -0.219391 +  Glue -0.306709 +  GlueTop 0.0473331 +  LanguageModel 2.40403 +  LexE2F -0.266989 +  LexF2E -0.550373 +  LogECount -0.129853 +  LogFCount -0.194037 +  LogRuleCount 0.256706 +  PassThrough -0.9304905 +  SingletonE -3.04161 +  SingletonF 0.0714027 +  SingletonRule -0.889377 +  WordPenalty -7.99495  ); + +# these features are included by default +my @DEFAULT_FEATS = qw( Glue GlueTop LanguageModel WordPenalty ); + + +  my $CDEC = "$SCRIPT_DIR/../../decoder/cdec";  my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl";  my $EXTOOLS = "$SCRIPT_DIR/../../extools"; @@ -39,9 +44,9 @@ die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;  my $VEST = "$SCRIPT_DIR/../../vest";  die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;  my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; -my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; -assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); +my $FILTER = "$EXTOOLS/filter_grammar"; +my $FEATURIZE = "$EXTOOLS/featurize_grammar"; +assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST);  my $config = "$SCRIPT_DIR/clsp.config";  print STDERR "CORPORA CONFIGURATION: $config\n"; @@ -53,7 +58,6 @@ my %devs;  my %devrefs;  my %tests;  my %testevals; -my %xgrammars;  print STDERR "       LANGUAGE PAIRS:";  while(<CONF>) {    chomp; @@ -61,11 +65,10 @@ while(<CONF>) {    next if /^\s*$/;    s/^\s+//;    s/\s+$//; -  my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/; +  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/;    $paths{$name} = $path;    $corpora{$name} = $corpus;    $lms{$name} = $lm; -  $xgrammars{$name} = $xgrammar;    $devs{$name} = $dev;    $devrefs{$name} = $devref;    $tests{$name} = $xtests[0]; @@ -78,16 +81,27 @@ my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );  my $outdir = "$CWD/exp";  my $help; -my $XFEATS; -my $EXTRA_FILTER = ''; +my $FEATURIZER_OPTS = '';  my $dataDir = '/export/ws10smt/data'; +my @features;  if (GetOptions(          "data=s" => \$dataDir, -        "xfeats" => \$XFEATS, +        "features=s@" => \@features,  ) == 0 || @ARGV!=2 || $help) {          print_help();          exit;  } + +my @xfeats; +for my $feat (@features) { +  my $rs = $feat_map->{$feat}; +  if (!defined $rs) { die "DON'T KNOW ABOUT FEATURE $feat\n"; } +  my @xfs = @$rs; +  @xfeats = (@xfeats, @xfs); +  $FEATURIZER_OPTS .= " -f $feat"; +} +print STDERR "X-FEATS: @xfeats\n"; +  my $lp = $ARGV[0];  my $grammar = $ARGV[1];  print STDERR "   CORPUS REPO: $dataDir\n"; @@ -113,17 +127,16 @@ my $teval = mydircat($corpdir, $testevals{$lp});  die "Can't find test: $test\n" unless -f $test;  assert_exec($teval); -if ($XFEATS) { -  my $xgram = mydircat($corpdir, $xgrammars{$lp}); -  die "Can't find x-grammar: $xgram" unless -f $xgram; -  $EXTRA_FILTER = "$ADDXFEATS $xgram |"; -  print STDERR "ADDING X-FEATS FROM $xgram\n"; -} +`mkdir -p $outdir`; + +# CREATE INIT WEIGHTS +print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; +my $weights = mydircat($outdir, "weights.init"); +write_random_weights_file($weights, @xfeats);  # MAKE DEV  print STDERR "\nFILTERING FOR dev...\n";  print STDERR "DEV: $dev (REFS=$drefs)\n"; -`mkdir -p $outdir`;  my $devgrammar = filter($grammar, $dev, 'dev', $outdir);  my $devini = mydircat($outdir, "cdec-dev.ini");  write_cdec_ini($devini, $devgrammar); @@ -138,11 +151,6 @@ my $testini = mydircat($outdir, "cdec-test.ini");  write_cdec_ini($testini, $testgrammar); -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights); -  # VEST  print STDERR "\nMINIMUM ERROR TRAINING\n"; @@ -182,12 +190,6 @@ sub write_random_weights_file {    my ($file, @extras) = @_;    open F, ">$file" or die "Can't write $file: $!";    my @feats = (@DEFAULT_FEATS, @extras); -  if ($XFEATS) { -    my @xfeats = qw( -      X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF -    ); -    @feats = (@feats, @xfeats); -  }    for my $feat (@feats) {      my $r = rand(1.6);      my $w = $init_weights{$feat} * $r; @@ -199,10 +201,13 @@ sub write_random_weights_file {  sub filter {    my ($grammar, $set, $name, $outdir) = @_; +  my $out1 = mydircat($outdir, "$name.filt.gz");    my $outgrammar = mydircat($outdir, "$name.scfg.gz");    if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { -    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar"; -    safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!"; +    my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1"; +    safesystem($out1, $cmd) or die "Filtering failed."; +    $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $outgrammar"; +    safesystem($outgrammar, $cmd) or die "Featurizing failed";    }    return $outgrammar;  } | 
