diff options
Diffstat (limited to 'gi/pipeline')
| -rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 33 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 3 | 
2 files changed, 35 insertions, 1 deletions
| diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 3e6d0cfd..a1631b0f 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -65,6 +65,8 @@ my $FILTER = "$EXTOOLS/filter_grammar";  my $FEATURIZE = "$EXTOOLS/featurize_grammar";  assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST); +my $numtopics = 25; +  my $config = "$SCRIPT_DIR/clsp.config";  print STDERR "CORPORA CONFIGURATION: $config\n";  open CONF, "<$config" or die "Can't read $config: $!"; @@ -101,7 +103,9 @@ my $help;  my $FEATURIZER_OPTS = '';  my $dataDir = '/export/ws10smt/data';  my @features; +my $bkoffgram;  if (GetOptions( +        "backoff_grammar" => \$bkoffgram,          "data=s" => \$dataDir,          "features=s@" => \@features,  ) == 0 || @ARGV!=2 || $help) { @@ -156,6 +160,9 @@ write_random_weights_file($weights, @xfeats);  print STDERR "\nFILTERING FOR dev...\n";  print STDERR "DEV: $dev (REFS=$drefs)\n";  my $devgrammar = filter($grammar, $dev, 'dev', $outdir); +if($bkoffgram) { +  $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir); +}  my $devini = mydircat($outdir, "cdec-dev.ini");  write_cdec_ini($devini, $devgrammar); @@ -165,6 +172,9 @@ print STDERR "\nFILTERING FOR test...\n";  print STDERR "TEST: $test (EVAL=$teval)\n";  `mkdir -p $outdir`;  my $testgrammar = filter($grammar, $test, 'test', $outdir); +if($bkoffgram) { +  $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir); +}  my $testini = mydircat($outdir, "cdec-test.ini");  write_cdec_ini($testini, $testgrammar); @@ -230,6 +240,29 @@ sub filter {    return $outgrammar;  } +sub add_backoff { +  my ($grammar, $topics, $name, $outdir) = @_; +  my $out = mydircat($outdir, "backoff.$name.scfg"); +  my $outgrammar = mydircat($outdir, "$name.scfg.gz"); +  my $cmd = "zcat $grammar > $out"; +  safesystem($out,$cmd) or die "Adding backoff rules failed."; +  for(my $tpcnum=0;$tpcnum<$topics;$tpcnum++) { +    for(my $tpc2=0;$tpc2<$topics;$tpc2++) { +      my $bkoff = "1"; +      if($tpc2 == $tpcnum) { +        $bkoff = "0"; +      } +      my $rule = "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff"; +      $cmd = "echo '$rule' >> $out"; +      safesystem($out,$cmd) or die "Adding backoff rules failed."; +    } +  } +  $cmd = "cat $out | gzip > $outgrammar"; +  safesystem($outgrammar, $cmd) or die "Adding backoff rules failed."; +  return $outgrammar; +} +   +  sub mydircat {   my ($base, $suffix) = @_;   if ($suffix =~ /^\//) { return $suffix; } diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 817d5c90..259dcd9c 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -245,7 +245,8 @@ sub grammar_extract_bidir {    if (-e $OUTGRAMMAR) {      print STDERR "$OUTGRAMMAR exists, reusing...\n";    } else { -    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS -g | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; +    my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); +    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";    }    return $OUTGRAMMAR;  } | 
