diff options
Diffstat (limited to 'gi/pipeline')
| -rw-r--r-- | gi/pipeline/OLD.clsp.config | 9 | ||||
| -rwxr-xr-x | gi/pipeline/OLD.evaluation-pipeline.pl | 277 | ||||
| -rw-r--r-- | gi/pipeline/backoff-pipe.pl | 215 | ||||
| -rw-r--r-- | gi/pipeline/blacklight.config | 9 | ||||
| -rw-r--r-- | gi/pipeline/clsp.config | 10 | ||||
| -rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 364 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 465 | ||||
| -rw-r--r-- | gi/pipeline/lticluster.config | 9 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/filter-by-f.pl | 56 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 65 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/refilter.pl | 40 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/rekey.pl | 8 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 44 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/sort-by-key.sh | 5 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/xfeats.pl | 39 | ||||
| -rw-r--r-- | gi/pipeline/valhalla.config | 9 | 
17 files changed, 0 insertions, 1677 deletions
| diff --git a/gi/pipeline/OLD.clsp.config b/gi/pipeline/OLD.clsp.config deleted file mode 100644 index cd0f9d65..00000000 --- a/gi/pipeline/OLD.clsp.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... -btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al -aren /export/ws10smt/data/arabic-english corpus.ar-en.al -uren /export/ws10smt/data/urdu-english corpus.ur-en.al -nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/OLD.evaluation-pipeline.pl b/gi/pipeline/OLD.evaluation-pipeline.pl deleted file mode 100755 index 49c303eb..00000000 --- a/gi/pipeline/OLD.evaluation-pipeline.pl +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Getopt::Long; -use Cwd; -my $CWD = getcwd; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -my @DEFAULT_FEATS = qw( -  LogRuleCount SingletonRule LexE2F LexF2E WordPenalty -  LogFCount LanguageModel Glue GlueTop PassThrough SingletonF -); - -my %init_weights = qw( -  LogRuleCount 0.2 -  LexE2F -0.3 -  LexF2E -0.3 -  LogFCount 0.1 -  WordPenalty -1.5 -  LanguageModel 1.2 -  Glue -1.0 -  GlueTop 0.00001 -  PassThrough -10.0 -  SingletonRule -0.1 -  X_EGivenF -0.3 -  X_FGivenE -0.3 -  X_LogECount -1 -  X_LogFCount -0.1 -  X_LogRuleCount 0.3 -  X_SingletonE -0.1 -  X_SingletonF -0.1 -  X_SingletonRule -0.5 -); - -my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; -my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; -my $EXTOOLS = "$SCRIPT_DIR/../../extools"; -die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; -my $VEST = "$SCRIPT_DIR/../../vest"; -die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; -my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; -my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; -assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); - -my $config = "$SCRIPT_DIR/OLD.clsp.config"; -print STDERR "CORPORA CONFIGURATION: $config\n"; -open CONF, "<$config" or die "Can't read $config: $!"; -my %paths; -my %corpora; -my %lms; -my %devs; -my %devrefs; -my %tests; -my %testevals; -my %xgrammars; -print STDERR "       LANGUAGE PAIRS:"; -while(<CONF>) { -  chomp; -  next if /^#/; -  next if /^\s*$/; -  s/^\s+//; -  s/\s+$//; -  my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/; -  $paths{$name} = $path; -  $corpora{$name} = $corpus; -  $lms{$name} = $lm; -  $xgrammars{$name} = $xgrammar; -  $devs{$name} = $dev; -  $devrefs{$name} = $devref; -  $tests{$name} = $xtests[0]; -  $testevals{$name} = $xtests[1]; -  print STDERR " $name"; -} -print STDERR "\n"; - -my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); - -my $outdir = "$CWD/exp"; -my $help; -my $XFEATS; -my $EXTRA_FILTER = ''; -my $dataDir = '/export/ws10smt/data'; -if (GetOptions( -        "data=s" => \$dataDir, -        "xfeats" => \$XFEATS, -) == 0 || @ARGV!=2 || $help) { -        print_help(); -        exit; -} -my $lp = $ARGV[0]; -my $grammar = $ARGV[1]; -print STDERR "   CORPUS REPO: $dataDir\n"; -print STDERR " LANGUAGE PAIR: $lp\n"; -die "I don't know about that language pair\n" unless $paths{$lp}; -my $corpdir = "$dataDir"; -if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . $paths{$lp}; } -die "I can't find the corpora directory: $corpdir" unless -d $corpdir; -print STDERR "       GRAMMAR: $grammar\n"; -my $LANG_MODEL = mydircat($corpdir, $lms{$lp}); -print STDERR "            LM: $LANG_MODEL\n"; -my $CORPUS = mydircat($corpdir, $corpora{$lp}); -die "Can't find corpus: $CORPUS" unless -f $CORPUS; - -my $dev = mydircat($corpdir, $devs{$lp}); -my $drefs = $devrefs{$lp}; -die "Can't find dev: $dev\n" unless -f $dev; -die "Dev refs not set" unless $drefs; -$drefs = mydircat($corpdir, $drefs); - -my $test = mydircat($corpdir, $tests{$lp}); -my $teval = mydircat($corpdir, $testevals{$lp}); -die "Can't find test: $test\n" unless -f $test; -assert_exec($teval); - -if ($XFEATS) { -  my $xgram = mydircat($corpdir, $xgrammars{$lp}); -  die "Can't find x-grammar: $xgram" unless -f $xgram; -  $EXTRA_FILTER = "$ADDXFEATS $xgram |"; -  print STDERR "ADDING X-FEATS FROM $xgram\n"; -} - -# MAKE DEV -print STDERR "\nFILTERING FOR dev...\n"; -print STDERR "DEV: $dev (REFS=$drefs)\n"; -`mkdir -p $outdir`; -my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -my $devini = mydircat($outdir, "cdec-dev.ini"); -write_cdec_ini($devini, $devgrammar); - - -# MAKE TEST -print STDERR "\nFILTERING FOR test...\n"; -print STDERR "TEST: $test (EVAL=$teval)\n"; -`mkdir -p $outdir`; -my $testgrammar = filter($grammar, $test, 'test', $outdir); -my $testini = mydircat($outdir, "cdec-test.ini"); -write_cdec_ini($testini, $testgrammar); - - -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights); - - -# VEST -print STDERR "\nMINIMUM ERROR TRAINING\n"; -my $tuned_weights = mydircat($outdir, 'weights.tuned'); -if (-f $tuned_weights) { -  print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; -} else { -  my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; -  print STDERR "MERT COMMAND: $cmd\n"; -  `rm -rf $outdir/vest 2> /dev/null`; -  chdir $outdir or die "Can't chdir to $outdir: $!"; -  $weights = `$cmd`; -  die "MERT reported non-zero exit code" unless $? == 0; -  chomp $weights; -  safesystem($tuned_weights, "cp $weights $tuned_weights"); -  print STDERR "TUNED WEIGHTS: $tuned_weights\n"; -  die "$tuned_weights is missing!" unless -f $tuned_weights; -} - -# DECODE -print STDERR "\nDECODE TEST SET\n"; -my $decolog = mydircat($outdir, "test-decode.log"); -my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; -safesystem($testtrans, $cmd) or die "Failed to decode test set!"; - - -# EVALUATE -print STDERR "\nEVALUATE TEST SET\n"; -print STDERR "TEST: $testtrans\n"; -$cmd = "$teval $testtrans"; -safesystem(undef, $cmd) or die "Failed to evaluate!"; -exit 0; - - -sub write_random_weights_file { -  my ($file, @extras) = @_; -  open F, ">$file" or die "Can't write $file: $!"; -  my @feats = (@DEFAULT_FEATS, @extras); -  if ($XFEATS) { -    my @xfeats = qw( -      X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF -    ); -    @feats = (@feats, @xfeats); -  } -  for my $feat (@feats) { -    my $r = rand(1.6); -    my $w = $init_weights{$feat} * $r; -    if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } -    print F "$feat $w\n"; -  } -  close F; -} - -sub filter { -  my ($grammar, $set, $name, $outdir) = @_; -  my $outgrammar = mydircat($outdir, "$name.scfg.gz"); -  if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { -    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar"; -    safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!"; -  } -  return $outgrammar; -} - -sub mydircat { - my ($base, $suffix) = @_; - if ($suffix =~ /^\//) { return $suffix; } - my $res = $base . '/' . $suffix; - $res =~ s/\/\//\//g; - return $res; -} - -sub write_cdec_ini { -  my ($filename, $grammar_path) = (@_); -  open CDECINI, ">$filename" or die "Can't write $filename: $!"; -  print CDECINI <<EOT; -formalism=scfg -cubepruning_pop_limit=100 -add_pass_through_rules=true -scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz -grammar=$grammar_path -feature_function=WordPenalty -feature_function=LanguageModel -o 3 $LANG_MODEL -EOT -  close CDECINI; -}; - -sub print_help { -  print STDERR<<EOT; - -Usage: $0 [OPTIONS] language-pair unfiltered-grammar.gz - -Given an induced grammar for an entire corpus (i.e., generated by -local-gi-pipeline.pl), filter and featurize it for a dev and test set, -run MERT, report scores. - -EOT -} - -sub safesystem { -  my $output = shift @_; -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    if ($exitcode) { -      print STDERR "Exit code: $exitcode\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -    } -    return ! $exitcode; -  } -} - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; - diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl deleted file mode 100644 index ac103c8b..00000000 --- a/gi/pipeline/backoff-pipe.pl +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my @grammars; -my $OUTPUTPREFIX = './giwork/bo.hier.grammar'; -safemkdir($OUTPUTPREFIX); -my $backoff_levels = 1; -my $glue_levels = 1; - -usage() unless &GetOptions('grmr=s@' => \ @grammars, -                           'outprefix=s' => \ $OUTPUTPREFIX, -                           'bo-lvls=i' => \ $backoff_levels, -                           'glue-lvls=i' => \ $glue_levels, -); -                            -my $OUTDIR = $OUTPUTPREFIX . '/hier'; -print STDERR "@grammars\n"; - - -my %grmr = (); -foreach my $grammar (@grammars) { -    $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/; -    $grmr{$1} = $grammar; -} - -my @index = sort keys %grmr; -$OUTDIR = $OUTDIR . join('-',@index); -safemkdir($OUTDIR); -my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz'; -safesystem("echo \"\" | gzip > $BACKOFF_GRMR"); -my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz'; -safesystem("echo \"\" | gzip > $GLUE_GRMR"); -my $joinedgrammars = $OUTDIR . '/grammar.hier.gz'; - -join_grammars(); - -for my $i (0..(scalar @index)-2) { -    my $freqs = extract_freqs($index[$i], $index[$i+1]); -    if ($i < $backoff_levels) { -        create_backoff_rules($index[$i],$index[$i+1],$freqs); -    } -    if ($i < $glue_levels) { -        add_glue_rules($index[$i]); -    } -} - -output_grammar_info(); - - -sub usage { -  print <<EOT; - -Usage: $0 [OPTIONS] corpus.fr-en-al - -Induces a grammar using Pitman-Yor topic modeling or Posterior Regularisation. - -EOT -  exit 1; -}; - -sub safemkdir { -  my $dir = shift; -  if (-d $dir) { return 1; } -  return mkdir($dir); -} - - -sub safesystem { -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    print STDERR "Exit code: $exitcode\n" if $exitcode; -    return ! $exitcode; -  } -} - - -sub join_grammars { -    print STDERR "\n!!! JOINING GRAMMARS\n"; -    if(-e $joinedgrammars) { -        print STDERR "$joinedgrammars exists, reusing...\n"; -        return; -    } -    safesystem("echo \"\" | gzip > $joinedgrammars"); -    foreach my $i (@index) { -        my $g = $grmr{$i}; -        safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz"); -        safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz"); -        safesystem("mv $joinedgrammars.2.gz $joinedgrammars"); -    } -} - - -sub extract_freqs { -    my($grmr1,$grmr2) = @_; -    print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n"; -    my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt"; -    my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt"; -    my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt"; -    my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt"; -    if(-e $OUT_SPANS && -e $FREQS) { -        print STDERR "$OUT_SPANS exists, reusing...\n"; -        print STDERR "$FREQS exists, reusing...\n"; -        return $FREQS; -    } -     -    safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS"); -     -    my %FREQ_HIER = (); -    my %finehier = (); -     -    open SPANS, $OUT_SPANS or die $!; -    while (<SPANS>) { -        my ($tmp, $coarse, $fine) = split /\|\|\|/; -        my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g; -        my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g; -         -        foreach my $i (0..(scalar @coarse_spans)-1) { -            my $coarse_cat = $coarse_spans[$i]; -            my $fine_cat = $fine_spans[$i]; -             -            $FREQ_HIER{$coarse_cat}{$fine_cat}++; -        } -    } -    close SPANS; -    foreach (values %FREQ_HIER) { -        my $coarse_freq = $_; -        my $total = 0; -        $total+=$_ for (values %{ $coarse_freq }); -        $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq }); -    } -    open FREQS, ">", $FREQS or die $!; -    foreach my $coarse_cat (keys %FREQ_HIER) { -        print FREQS "$coarse_cat |||"; -        foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) { -            my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat}; -            print FREQS " $fine_cat:$freq"; -            if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $freq) { -               $finehier{$fine_cat} = $coarse_cat; -            }   -        } -        print FREQS "\n"; -    } -#    foreach my $fine_cat (keys %finehier) { -#        print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; -#    } -    close FREQS; -    return $FREQS; -} - - -sub create_backoff_rules { -    print STDERR "\n!!! CREATING BACKOFF RULES\n"; -    my ($grmr1, $grmr2, $freq) = @_; -    my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt"; -    if(-e $OUTFILE) { -        print STDERR "$OUTFILE exists, reusing...\n"; -        return; -    } -    open FREQS, $freq or die $!; -    open TMP, ">", $OUTFILE or die $!; -    while (<FREQS>) { -        my $line = $_; -        $line = m/^(\d+) \|\|\| (.+)$/; -        my $coarse = $1; -        $line = $2; -        my @finefreq = $line =~ m/(\d+):(\S+)/g; -        for(my $i = 0; $i < scalar @finefreq; $i+=2) { -            my $finecat = $finefreq[$i]; -            my $finefreq = $finefreq[$i+1]; -            print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq A=0-0\n"; -        } -    } -    close TMP; -    close FREQS; -    safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz"); -    safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR"); -} - -sub add_glue_rules { -    print STDERR "\n!!! CREATING GLUE RULES\n"; -    my ($grmr) = @_; -    my $OUTFILE = "$OUTDIR/glue.$grmr.gz"; -    if (-e $OUTFILE) { -        print STDERR "$OUTFILE exists, reusing...\n"; -        return; -    } -    open TMP, ">", $OUTFILE or die $!; -    for my $i (0..($grmr-1)) { -        print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n"; -        print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n"; -    } -    close TMP; -    safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz"); -    safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR"); -} - -sub output_grammar_info { -    print STDERR "\n!!! GRAMMAR INFORMATION\n"; -    print STDOUT "GRAMMAR: \t$joinedgrammars\n"; -    print STDOUT "GLUE: \t$GLUE_GRMR\n"; -    print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n"; -} diff --git a/gi/pipeline/blacklight.config b/gi/pipeline/blacklight.config deleted file mode 100644 index fc59a604..00000000 --- a/gi/pipeline/blacklight.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -/usr/users/0/cdyer/ws10smt/data -btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh -aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh -uren /usr/users/0/cdyer/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config deleted file mode 100644 index c23d409f..00000000 --- a/gi/pipeline/clsp.config +++ /dev/null @@ -1,10 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -/export/ws10smt/data -btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh -aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh -uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl deleted file mode 100755 index 4b4529d9..00000000 --- a/gi/pipeline/evaluation-pipeline.pl +++ /dev/null @@ -1,364 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Getopt::Long; -use Cwd; -my $CWD = getcwd; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } -use LocalConfig; - -my $JOBS = 15; -my $PMEM = "9G"; -my $NUM_TRANSLATIONS = 50; -my $GOAL = "S"; - -# featurize_grammar may add multiple features from a single feature extractor -# the key in this map is the extractor name, the value is a list of the extracted features -my $feat_map = { -  "LogRuleCount" => [ "LogRuleCount", "SingletonRule" ] , -#  "XFeatures" => [ "XFE","XEF" ] , -  "XFeatures" => [ "XFE","XEF","LabelledEF","LabelledFE"], # ,"XE_Singleton","XF_Singleton"] , -  "LabelledRuleConditionals" => [ "LabelledFE","LabelledEF" ] , -  "LexProb" => [ "LexE2F", "LexF2E" ] , -  "BackoffRule" => [ "BackoffRule" ] , -  "RulePenalty" => [ "RulePenalty" ] , -  "LHSProb" => [ "LHSProb" ] , -  "LabellingShape" => [ "LabellingShape" ] , -  "GenerativeProb" => [ "GenerativeProb" ] , -}; - -my %init_weights = qw( -  EGivenF -0.735245 -  FGivenE -0.219391 -  Glue -0.306709 -  GlueTop 0.0473331 -  LanguageModel 2.40403 -  LexE2F -0.266989 -  LexF2E -0.550373 -  LogECount -0.129853 -  LogFCount -0.194037 -  LogRuleCount 0.256706 -  BackoffRule 0.5 -  XFE -0.256706 -  XEF -0.256706 -  XF_Singleton -0.05 -  XE_Singleton -0.8 -  LabelledFE -0.256706 -  LabelledEF -0.256706 -  PassThrough -0.9304905 -  SingletonE -3.04161 -  SingletonF 0.0714027 -  SingletonRule -0.889377 -  WordPenalty -1.99495 -  RulePenalty -0.1 -  LabellingShape -0.1 -  LHSProb -0.1 -  GenerativeProb -0.1 -); - - -# these features are included by default -my @DEFAULT_FEATS = qw( PassThrough Glue GlueTop LanguageModel WordPenalty ); - - - -my $FILTERBYF = "$SCRIPT_DIR/scripts/filter-by-f.pl"; -my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; -my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; -my $EXTOOLS = "$SCRIPT_DIR/../../extools"; -die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; -my $VEST = "$SCRIPT_DIR/../../vest"; -die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; -my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTER = "$EXTOOLS/filter_grammar"; -my $FEATURIZE = "$EXTOOLS/featurize_grammar"; -assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF); - -my $numtopics = 25; - -my $config = "$SCRIPT_DIR/" . (lc environment_name()) . '.config'; -print STDERR "CORPORA CONFIGURATION: $config\n"; -open CONF, "<$config" or die "Can't read $config: $!"; -my %paths; -my %corpora; -my %lms; -my %devs; -my %devrefs; -my %tests; -my %testevals; -my $datadir; -print STDERR "       LANGUAGE PAIRS:"; -while(<CONF>) { -  chomp; -  next if /^#/; -  next if /^\s*$/; -  s/^\s+//; -  s/\s+$//; -  if (! defined $datadir) { $datadir = $_; next; } -  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; -  $paths{$name} = $path; -  $corpora{$name} = $corpus; -  $lms{$name} = $lm; -  $devs{$name} = $dev; -  $devrefs{$name} = $devref; -  $tests{$name} = $xtests[0]; -  $testevals{$name} = $xtests[1]; -  print STDERR " $name"; -} -print STDERR "\n"; - -my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); - -my $outdir = "$CWD/exp"; -my $help; -my $FEATURIZER_OPTS = ''; -my $dataDir = '/export/ws10smt/data'; -my @features; -my $bkoffgram; -my $gluegram; -my $oovgram; -my $usefork; -my $lmorder = 3; -my $density; -if (GetOptions( -        "backoff-grammar=s" => \$bkoffgram, -        "density-prune=f" => \$density, -        "glue-grammar=s" => \$gluegram, -        "oov-grammar=s" => \$oovgram, -        "data=s" => \$dataDir, -        "pmem=s" => \$PMEM, -        "n=i" => \$NUM_TRANSLATIONS, -        "features=s@" => \@features, -        "use-fork" => \$usefork, -        "jobs=i" => \$JOBS, -        "out-dir=s" => \$outdir, -        "lmorder=i" => \$lmorder, -        "goal=s" => \$GOAL, -) == 0 || @ARGV!=2 || $help) { -        print_help(); -        exit; -} -my $DENSITY_PRUNE = ''; -if ($density) { -  $DENSITY_PRUNE = "--density-prune $density"; -} -if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } -my @fkeys = keys %$feat_map; -die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; -my @xfeats; -for my $feat (@features) { -  my $rs = $feat_map->{$feat}; -  if (!defined $rs) { die "DON'T KNOW ABOUT FEATURE $feat\n"; } -  my @xfs = @$rs; -  @xfeats = (@xfeats, @xfs); -  $FEATURIZER_OPTS .= " -f $feat" unless $feat eq "BackoffRule"; -} -print STDERR "X-FEATS: @xfeats\n"; - -my $lp = $ARGV[0]; -my $grammar = $ARGV[1]; -print STDERR "   CORPUS REPO: $dataDir\n"; -print STDERR " LANGUAGE PAIR: $lp\n"; -die "I don't know about that language pair\n" unless $paths{$lp}; -my $corpdir = "$dataDir"; -if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . $paths{$lp}; } -die "I can't find the corpora directory: $corpdir" unless -d $corpdir; -print STDERR "       GRAMMAR: $grammar\n"; -my $LANG_MODEL = mydircat($corpdir, $lms{$lp}); -print STDERR "            LM: $LANG_MODEL\n"; -my $CORPUS = mydircat($corpdir, $corpora{$lp}); -die "Can't find corpus: $CORPUS" unless -f $CORPUS; - -my $dev = mydircat($corpdir, $devs{$lp}); -my $drefs = $devrefs{$lp}; -die "Can't find dev: $dev\n" unless -f $dev; -die "Dev refs not set" unless $drefs; -$drefs = mydircat($corpdir, $drefs); - -my $test = mydircat($corpdir, $tests{$lp}); -my $teval = mydircat($corpdir, $testevals{$lp}); -#die "Can't find test: $test\n" unless -f $test; -#assert_exec($teval); - -`mkdir -p $outdir`; - -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights, @xfeats); - -my $bkoff_grmr; -my $glue_grmr; -if($bkoffgram) { -    print STDERR "Placing backoff grammar…\n"; -    $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz"); -    print STDERR "cp $bkoffgram $bkoff_grmr\n"; -    safesystem(undef,"cp $bkoffgram $bkoff_grmr"); -} -if($gluegram) { -    print STDERR "Placing glue grammar…\n"; -    $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz"); -    print STDERR "cp $gluegram $glue_grmr\n"; -    safesystem(undef,"cp $gluegram $glue_grmr"); -} - -# MAKE DEV -print STDERR "\nFILTERING FOR dev...\n"; -print STDERR "DEV: $dev (REFS=$drefs)\n"; -my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -my $devini = mydircat($outdir, "cdec-dev.ini"); -write_cdec_ini($devini, $devgrammar); - - -# MAKE TEST -print STDERR "\nFILTERING FOR test...\n"; -print STDERR "TEST: $test (EVAL=$teval)\n"; -`mkdir -p $outdir`; -my $testgrammar = filter($grammar, $test, 'test', $outdir); -my $testini = mydircat($outdir, "cdec-test.ini"); -write_cdec_ini($testini, $testgrammar); - - -# VEST -print STDERR "\nMINIMUM ERROR TRAINING\n"; -my $tuned_weights = mydircat($outdir, 'weights.tuned'); -if (-f $tuned_weights) { -  print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; -} else { -  my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; -  print STDERR "MERT COMMAND: $cmd\n"; -  `rm -rf $outdir/vest 2> /dev/null`; -  chdir $outdir or die "Can't chdir to $outdir: $!"; -  $weights = `$cmd`; -  die "MERT reported non-zero exit code" unless $? == 0; -  chomp $weights; -  safesystem($tuned_weights, "cp $weights $tuned_weights"); -  print STDERR "TUNED WEIGHTS: $tuned_weights\n"; -  die "$tuned_weights is missing!" unless -f $tuned_weights; -} - -# DECODE -print STDERR "\nDECODE TEST SET\n"; -my $decolog = mydircat($outdir, "test-decode.log"); -my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE $usefork -j $JOBS -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; -safesystem($testtrans, $cmd) or die "Failed to decode test set!"; - - -# EVALUATE -print STDERR "\nEVALUATE TEST SET\n"; -print STDERR "TEST: $testtrans\n"; -$cmd = "$teval $testtrans"; -safesystem(undef, $cmd) or die "Failed to evaluate!"; -exit 0; - - -sub write_random_weights_file { -  my ($file, @extras) = @_; -  if (-f $file) { -    print STDERR "$file exists - REUSING!\n"; -    return; -  } -  open F, ">$file" or die "Can't write $file: $!"; -  my @feats = (@DEFAULT_FEATS, @extras); -  for my $feat (@feats) { -    my $r = rand(0.4) + 0.8; -    my $w = $init_weights{$feat} * $r; -    if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } -    print F "$feat $w\n"; -  } -  close F; -} - -sub filter { -  my ($grammar, $set, $name, $outdir) = @_; -  my $out1 = mydircat($outdir, "$name.filt.gz"); -  my $out2 = mydircat($outdir, "$name.f_feat.gz"); -  my $outgrammar = mydircat($outdir, "$name.scfg.gz"); -  if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { -    my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1"; -    safesystem($out1, $cmd) or die "Filtering failed."; -    $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2"; -    safesystem($out2, $cmd) or die "Featurizing failed"; -    $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar"; -    safesystem($outgrammar, $cmd) or die "Secondary filtering failed"; -  } -  return $outgrammar; -}   - -sub mydircat { - my ($base, $suffix) = @_; - if ($suffix =~ /^\//) { return $suffix; } - my $res = $base . '/' . $suffix; - $res =~ s/\/\//\//g; - return $res; -} - -sub write_cdec_ini { -  my ($filename, $grammar_path) = (@_); -  open CDECINI, ">$filename" or die "Can't write $filename: $!"; -  my $glue = ($gluegram ? "$glue_grmr" : "$datadir/glue/glue.scfg.gz"); -  my $oov = ($oovgram ? "$oovgram" : "$datadir/oov.scfg.gz"); -  print CDECINI <<EOT; -formalism=scfg -cubepruning_pop_limit=100 -add_pass_through_rules=true -scfg_extra_glue_grammar=$glue -grammar=$oov -grammar=$grammar_path -scfg_default_nt=OOV -scfg_no_hiero_glue_grammar=true -feature_function=WordPenalty -feature_function=LanguageModel -o $lmorder $LANG_MODEL -goal=$GOAL -EOT -  print CDECINI "grammar=$bkoff_grmr\n" if $bkoffgram; -  close CDECINI; -}; - -sub print_help { -  print STDERR<<EOT; - -Usage: $0 [-c data-config-file] [-n N] language-pair grammar.bidir.gz [OPTIONS] - -Given an induced grammar for an entire corpus (i.e., generated by -local-gi-pipeline.pl), filter and featurize it for a dev and test set, -run MERT, report scores. Use -n to specify the number of translations -to keep for a given source (30 is default). - -EOT -} - -sub safesystem { -  my $output = shift @_; -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    if ($exitcode) { -      print STDERR "Exit code: $exitcode\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -    } -    return ! $exitcode; -  } -} - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; - diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl deleted file mode 100755 index e31167a2..00000000 --- a/gi/pipeline/local-gi-pipeline.pl +++ /dev/null @@ -1,465 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use File::Copy; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -use Getopt::Long "GetOptions"; - -my $GZIP = 'gzip'; -my $ZCAT = 'gunzip -c'; -my $SED = 'sed -e'; -my $BASE_PHRASE_MAX_SIZE = 10; -my $COMPLETE_CACHE = 1; -my $ITEMS_IN_MEMORY = 10000000;  # cache size in extractors -my $NUM_TOPICS = 50; -my $NUM_TOPICS_COARSE; -my $NUM_TOPICS_FINE = $NUM_TOPICS; -my $NUM_SAMPLES = 1000; -my $CONTEXT_SIZE = 1; -my $BIDIR = 0; -my $TOPICS_CONFIG = "pyp-topics.conf"; -my $LANGUAGE = "target"; -my $LABEL_THRESHOLD = "0"; -my $PRESERVE_PHRASES; - -my $MODEL = "pyp"; -my $NUM_ITERS = 100; -my $PR_SCALE_P = 0; -my $PR_SCALE_C = 0; -my $PR_FLAGS = ""; -my $MORFMARK = ""; - -my $EXTOOLS = "$SCRIPT_DIR/../../extools"; -die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; -my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src"; -die "Can't find pyp-topics: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS; -my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts"; -die "Can't find pyp-topics: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS; -my $PRTOOLS = "$SCRIPT_DIR/../posterior-regularisation"; -die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PRTOOLS; -my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce"; -my $C2D = "$PYPSCRIPTS/contexts2documents.py"; -my $S2L = "$PYPSCRIPTS/spans2labels.py"; -my $SPLIT = "$SCRIPT_DIR/../posterior-regularisation/split-languages.py"; - -my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh"; - -my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh"; -my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl"; -my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl"; -my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl"; -my $EXTRACTOR = "$EXTOOLS/extractor"; -my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train"; -my $MORF_DOC_FILTER = "$SCRIPT_DIR/../morf-segmentation/filter_docs.pl"; - -assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, -            $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS, $MORF_DOC_FILTER); - -my $BACKOFF_GRAMMAR; -my $DEFAULT_CAT; -my $HIER_CAT; -my %FREQ_HIER = (); -my $TAGGED_CORPUS; - -my $NAME_SHORTCUT; - -my $OUTPUT = './giwork'; -usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, -                           'backoff_grammar' => \$BACKOFF_GRAMMAR, -                           'output=s' => \$OUTPUT, -                           'model=s' => \$MODEL, -                           'topics=i' => \$NUM_TOPICS_FINE, -                           'coarse_topics=i' => \$NUM_TOPICS_COARSE, -                           'trg_context=i' => \$CONTEXT_SIZE, -                           'samples=i' => \$NUM_SAMPLES, -                           'label_threshold=f' => \$LABEL_THRESHOLD, -                           'use_default_cat' => \$DEFAULT_CAT, -                           'topics-config=s' => \$TOPICS_CONFIG, -                           'iterations=i' => \$NUM_ITERS, -                           'pr-scale-phrase=f' => \$PR_SCALE_P, -                           'pr-scale-context=f' => \$PR_SCALE_C, -                           'pr-flags=s' => \$PR_FLAGS, -                           'tagged_corpus=s' => \$TAGGED_CORPUS, -                           'language=s' => \$LANGUAGE, -                           'get_name_only' => \$NAME_SHORTCUT, -                           'preserve_phrases' => \$PRESERVE_PHRASES, -                           'morf=s' => \$MORFMARK, -                          ); -if ($NAME_SHORTCUT) { -  $NUM_TOPICS = $NUM_TOPICS_FINE; -  print STDERR labeled_dir(); -  exit 0; -} -usage() unless scalar @ARGV == 1; -my $CORPUS = $ARGV[0]; -open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F; - -$NUM_TOPICS = $NUM_TOPICS_FINE; - -$HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 ); - -print STDERR "   Output: $OUTPUT\n"; -my $DATA_DIR = $OUTPUT . '/corpora'; -my $LEX_NAME = "corpus.f_e_a.$LANGUAGE.lex"; -my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME;  # corpus used to extract rules -my $CORPUS_CLUSTER = $DATA_DIR . "/corpus.f_e_a.$LANGUAGE.cluster"; # corpus used for clustering (often identical) - -my $CONTEXT_DIR = $OUTPUT . '/' . context_dir(); -my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir(); -my $LABELED_DIR = $OUTPUT . '/' . labeled_dir(); -my $CLUSTER_DIR_C; -my $CLUSTER_DIR_F; -my $LABELED_DIR_C; -my $LABELED_DIR_F; -if($HIER_CAT) { -    $CLUSTER_DIR_F = $CLUSTER_DIR; -    $LABELED_DIR_F = $LABELED_DIR; -    $NUM_TOPICS = $NUM_TOPICS_COARSE; -    $CLUSTER_DIR_C = $OUTPUT . '/' . cluster_dir(); -    $LABELED_DIR_C = $OUTPUT . '/' . labeled_dir(); -    $NUM_TOPICS = $NUM_TOPICS_FINE; -} -my $GRAMMAR_DIR = $OUTPUT . '/' . grammar_dir(); -print STDERR "  Context: $CONTEXT_DIR\n  Cluster: $CLUSTER_DIR\n  Labeled: $LABELED_DIR\n  Grammar: $GRAMMAR_DIR\n"; -safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; -safemkdir($DATA_DIR) or die "Couldn't create output directory $DATA_DIR: $!"; -safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!"; -safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!"; -if($HIER_CAT) { -    safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR_C: $!"; -    safemkdir($LABELED_DIR_C) or die "Couldn't create output directory $LABELED_DIR_C: $!"; -} -safemkdir($LABELED_DIR) or die "Couldn't create output directory $LABELED_DIR: $!"; -safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!"; -if(-e $TOPICS_CONFIG) { -    copy($TOPICS_CONFIG, $CLUSTER_DIR) or die "Copy failed: $!"; -} - -setup_data(); - -if (lc($MODEL) eq "blagree") { -    extract_bilingual_context(); -} else { -    extract_context(); -} - -if (lc($MODEL) eq "pyp") { -    if($HIER_CAT) { -        $NUM_TOPICS = $NUM_TOPICS_COARSE; -        $CLUSTER_DIR = $CLUSTER_DIR_C; -        topic_train(); -        $NUM_TOPICS = $NUM_TOPICS_FINE; -        $CLUSTER_DIR = $CLUSTER_DIR_F; -        topic_train(); -    } else { -        topic_train(); -    } -} elsif (lc($MODEL) =~ /pr|em|agree/) { -    prem_train(); -} else { die "Unsupported model type: $MODEL. Must be one of PYP or PREM.\n"; } -if($HIER_CAT) { -    $NUM_TOPICS = $NUM_TOPICS_COARSE; -    $CLUSTER_DIR = $CLUSTER_DIR_C; -    $LABELED_DIR = $LABELED_DIR_C; -    label_spans_with_topics(); -    $NUM_TOPICS = $NUM_TOPICS_FINE; -    $CLUSTER_DIR = $CLUSTER_DIR_F; -    $LABELED_DIR = $LABELED_DIR_F; -    label_spans_with_topics(); -    extract_freqs(); -} else { -    label_spans_with_topics(); -} -my $res; -if ($BIDIR) { -  $res = grammar_extract_bidir(); -} else { -  $res = grammar_extract(); -} -print STDERR "\n!!!COMPLETE!!!\n"; -print STDERR "GRAMMAR: $res\nYou should probably run: $SCRIPT_DIR/evaluation-pipeline.pl LANGPAIR giwork/ct1s0.L10.PYP.t4.s20.grammar/grammar.gz -f FEAT1 -f FEAT2\n\n"; -exit 0; - -sub setup_data { -  print STDERR "\n!!!PREPARE CORPORA!!!\n"; -  if (-f $CORPUS_LEX && $CORPUS_CLUSTER) { -    print STDERR "$CORPUS_LEX and $CORPUS_CLUSTER exist, reusing...\n"; -    return; -  } -  copy($CORPUS, $CORPUS_LEX); -  if ($TAGGED_CORPUS) { -    die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS; -    my $opt=""; -    $opt = "-s" if ($LANGUAGE eq "source"); -    $opt = $opt . " -a" if ($PRESERVE_PHRASES); -    my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER"; -    safesystem($cmd) or die "Failed to extract contexts."; -  } else { -    symlink($LEX_NAME, $CORPUS_CLUSTER); -  } -} - -sub context_dir { -  return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE"; -} - -sub cluster_dir { -    if (lc($MODEL) eq "pyp") { -        return context_dir() . ".PYP.t$NUM_TOPICS.s$NUM_SAMPLES"; -    } elsif (lc($MODEL) eq "em") { -        return context_dir() . ".EM.t$NUM_TOPICS.i$NUM_ITERS"; -    } elsif (lc($MODEL) eq "pr") { -        return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_ITERS.sp$PR_SCALE_P.sc$PR_SCALE_C"; -    } elsif (lc($MODEL) eq "agree") { -        return context_dir() . ".AGREE.t$NUM_TOPICS.i$NUM_ITERS"; -    } elsif (lc($MODEL) eq "blagree") { -        return context_dir() . ".BLAGREE.t$NUM_TOPICS.i$NUM_ITERS"; -    } -} - -sub labeled_dir { -  if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD ne "0") { -    return cluster_dir() . "_lt$LABEL_THRESHOLD"; -  } else { -    return cluster_dir(); -  } -} - -sub grammar_dir { -  # TODO add grammar config options -- adjacent NTs, etc -  if($HIER_CAT) { -    return cluster_dir() . ".hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.grammar"; -  } else { -    return labeled_dir() . ".grammar"; -  } -} - - - -sub safemkdir { -  my $dir = shift; -  if (-d $dir) { return 1; } -  return mkdir($dir); -} - -sub usage { -  print <<EOT; - -Usage: $0 [OPTIONS] corpus.fr-en-al - -Induces a grammar using Pitman-Yor topic modeling or Posterior Regularisation. - -EOT -  exit 1; -}; - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; - -sub extract_context { - print STDERR "\n!!!CONTEXT EXTRACTION\n";  - my $OUT_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; - if (-e $OUT_CONTEXTS) { -   print STDERR "$OUT_CONTEXTS exists, reusing...\n"; - } else { -   my $ccopt = "-c $ITEMS_IN_MEMORY"; -   my $postsort = "| $REDUCER "; -   if ($COMPLETE_CACHE) { -     print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; -     $ccopt = "-c 0"; -     $postsort = "" unless ($PRESERVE_PHRASES); -   } - -   my $presort = ($PRESERVE_PHRASES ? "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " : ""); - -   if ($MORFMARK ne "") {  -     $presort = $presort . "| $MORF_DOC_FILTER \"$MORFMARK\" ";  -   } - -   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE $presort | $SORT_KEYS $postsort | $GZIP > $OUT_CONTEXTS"; -   safesystem($cmd) or die "Failed to extract contexts."; -  } -} - -sub extract_bilingual_context { - print STDERR "\n!!!CONTEXT EXTRACTION\n";  - my $OUT_SRC_CONTEXTS = "$CONTEXT_DIR/context.source"; - my $OUT_TGT_CONTEXTS = "$CONTEXT_DIR/context.target"; - - if (-e $OUT_SRC_CONTEXTS . ".gz" and -e $OUT_TGT_CONTEXTS . ".gz") { -   print STDERR "$OUT_SRC_CONTEXTS.gz and $OUT_TGT_CONTEXTS.gz exist, reusing...\n"; - } else { -   my $OUT_BI_CONTEXTS = "$CONTEXT_DIR/context.bilingual.txt.gz"; -   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language both --context_language both | $SORT_KEYS | $REDUCER | $GZIP > $OUT_BI_CONTEXTS"; -   if ($COMPLETE_CACHE) { -     print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; -     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language both --context_language both  | $SORT_KEYS | $GZIP > $OUT_BI_CONTEXTS"; -   } -   safesystem($cmd) or die "Failed to extract contexts."; - -   safesystem("$ZCAT $OUT_BI_CONTEXTS | $SPLIT $OUT_SRC_CONTEXTS $OUT_TGT_CONTEXTS") or die "Failed to split contexts.\n"; -   safesystem("$GZIP -f $OUT_SRC_CONTEXTS") or die "Failed to zip output contexts.\n"; -   safesystem("$GZIP -f $OUT_TGT_CONTEXTS") or die "Failed to zip output contexts.\n"; - } -} - - -sub topic_train { -  print STDERR "\n!!!TRAIN PYP TOPICS\n"; -  my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; -  my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; -  if (-e $OUT_CLUSTERS) { -    print STDERR "$OUT_CLUSTERS exists, reusing...\n"; -  } else { -    safesystem("$TOPIC_TRAIN --data $IN_CONTEXTS --backoff-type simple -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -c $TOPICS_CONFIG -w /dev/null") or die "Topic training failed.\n"; -  } -} - -sub prem_train { -  print STDERR "\n!!!TRAIN PR/EM model\n"; -  my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; -  if (-e $OUT_CLUSTERS) { -    print STDERR "$OUT_CLUSTERS exists, reusing...\n"; -  } else { -    my $in = "--in $CONTEXT_DIR/context.txt.gz"; -    my $opts = ""; -    if (lc($MODEL) eq "pr") { -        $opts = "--scale-phrase $PR_SCALE_P --scale-context $PR_SCALE_C"; -    } elsif (lc($MODEL) eq "agree") { -        $opts = "--agree-direction"; -    } elsif (lc($MODEL) eq "blagree") { -        $in = "--in $CONTEXT_DIR/context.source.gz --in1 $CONTEXT_DIR/context.target.gz"; -        $opts = "--agree-language"; -    } -    safesystem("$PREM_TRAIN $in --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n"; -  } -} - -sub label_spans_with_topics { -  my ($file) = (@_); -  print STDERR "\n!!!LABEL SPANS\n"; -  my $IN_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz"; -  my $OUT_SPANS = "$LABELED_DIR/labeled_spans.txt"; -  if (-e $OUT_SPANS) { -    print STDERR "$OUT_SPANS exists, reusing...\n"; -  } else { -    my $extra = "tt"; -    if ($LANGUAGE eq "source") { -        $extra = "ss"; -    } elsif ($LANGUAGE eq "both") { -        $extra = "bb"; -    } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" }; -    $extra = $extra . " tok,tag" if ($PRESERVE_PHRASES); -    safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; -    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans"; -    unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; -    safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS | sed 's/ *||| *\$//'  > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; -  } -} - -sub extract_freqs { -    print STDERR "\n!!!EXTRACTING FREQUENCIES\n"; -    my $IN_COARSE = "$LABELED_DIR_C/labeled_spans.txt"; -    my $IN_FINE = "$LABELED_DIR_F/labeled_spans.txt"; -    my $OUT_SPANS = "$LABELED_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; -    my $FREQS = "$LABELED_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; -    my $COARSE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1c/g\'"; #' -    my $FINE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1f/g\'"; #' -    my %finehier = (); -    if (-e $OUT_SPANS) { -        print STDERR "$OUT_SPANS exists, reusing...\n"; -    } else { -        safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS"); -    } -    open SPANS, $OUT_SPANS or die $!; -    while (<SPANS>) { -        my ($tmp, $coarse, $fine) = split /\|\|\|/; -        my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g; -        my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g; -         -        foreach my $i (0..(scalar @coarse_spans)-1) { -            my $coarse_cat = $coarse_spans[$i]; -            my $fine_cat = $fine_spans[$i]; -             -            $FREQ_HIER{$coarse_cat}{$fine_cat}++; -        } -    } -    close SPANS; -    foreach (values %FREQ_HIER) { -        my $coarse_freq = $_; -        my $total = 0; -        $total+=$_ for (values %{ $coarse_freq }); -        $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq }); -    } -    open FREQS, ">", $FREQS or die $!; -    foreach my $coarse_cat (keys %FREQ_HIER) { -        print FREQS "$coarse_cat |||"; -        foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) { -            my $res = $FREQ_HIER{$coarse_cat}{$fine_cat}; -            print FREQS " $fine_cat:$res"; -            if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $res) { -               $finehier{$fine_cat} = $coarse_cat; -            }   -        } -        print FREQS "\n"; -    } -#    foreach my $fine_cat (keys %finehier) { -#        print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; -#    } -    close FREQS; -    $CLUSTER_DIR = $CLUSTER_DIR_F; -} - -sub grammar_extract { -  my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; -  print STDERR "\n!!!EXTRACTING GRAMMAR\n"; -  my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.gz"; -  if (-e $OUTGRAMMAR) { -    print STDERR "$OUTGRAMMAR exists, reusing...\n"; -  } else { -    my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); -    my $DEFAULT_CAT_ARG = ($DEFAULT_CAT ? "-d X" : ""); -    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG $DEFAULT_CAT_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; -  } -  return $OUTGRAMMAR; -} - -sub grammar_extract_bidir { -#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz -  my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; -  print STDERR "\n!!!EXTRACTING GRAMMAR\n"; -  my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz"; -  if (-e $OUTGRAMMAR) { -    print STDERR "$OUTGRAMMAR exists, reusing...\n"; -  } else { -    my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); -    safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; -  } -  return $OUTGRAMMAR; -} - -sub safesystem { -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    print STDERR "Exit code: $exitcode\n" if $exitcode; -    return ! $exitcode; -  } -} - diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config deleted file mode 100644 index 3e23c8cb..00000000 --- a/gi/pipeline/lticluster.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -/home/cdyer/ws10smt-data -btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh -aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh -uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl deleted file mode 100755 index 0cef0606..00000000 --- a/gi/pipeline/scripts/filter-by-f.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -my $REKEY="$SCRIPT_DIR/rekey.pl"; -my $REFILTER="$SCRIPT_DIR/refilter.pl"; -my $SORT="$SCRIPT_DIR/sort-by-key.sh"; -assert_exec($REKEY, $REFILTER, $SORT); - - -die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3; -my $translations = shift @ARGV; -die "Need number: $translations" unless $translations > 0; -die unless $ARGV[0] =~ /\.gz$/; -die unless $ARGV[1] =~ /\.gz$/; -die if $ARGV[0] eq $ARGV[1]; -die "Can't find $ARGV[0]" unless -f $ARGV[0]; - -my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]"; -safesystem($ARGV[1], $cmd) or die "Filtering failed"; -exit 0; - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; - -sub safesystem { -  my $output = shift @_; -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    if ($exitcode) { -      print STDERR "Exit code: $exitcode\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -    } -    return ! $exitcode; -  } -} - diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl deleted file mode 100755 index c0eec43e..00000000 --- a/gi/pipeline/scripts/patch-corpus.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $PATCH = shift @ARGV; -my $TGT = 1; -my $APPEND; -while ($PATCH eq "-s" || $PATCH eq "-a") { -    if ($PATCH eq "-s") { -        undef $TGT; -    } else { -        $APPEND = 1; -    } -    $PATCH = shift @ARGV; -} - -die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; - -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -my $first=<P>; close P; -my @fields = split / \|\|\| /, $first; -die "Bad format!" if (scalar @fields > 2); - -if (scalar @fields != 1) { -  # TODO support this -  die "Patching source and target not supported yet!"; -} - -my $line = 0; -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -while(my $pline = <P>) { -  chomp $pline; -  $line++; -  my $line = <>; -  die "Too few lines in lexical corpus!" unless $line; -  chomp $line; -  @fields = split / \|\|\| /, $line; -  my @pwords = split /\s+/, $pline; -  if ($TGT) { -      my @lwords = split /\s+/, $fields[1]; -      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      if ($APPEND) { -          foreach my $i (0..(scalar @pwords-1)) { -              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; -          } -          $fields[1] = join ' ', @lwords; -      } else { -          $fields[1] = $pline; -      } -  } else { # source side -      my @lwords = split /\s+/, $fields[0]; -      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      if ($APPEND) { -          foreach my $i (0..(scalar @pwords-1)) { -              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; -          } -          $fields[0] = join ' ', @lwords; -      } else { -          $fields[0] = $pline; -      } -  } -  print join ' ||| ', @fields; -  print "\n"; -} - - diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl deleted file mode 100755 index a783eb4e..00000000 --- a/gi/pipeline/scripts/refilter.pl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $NUM_TRANSLATIONS = shift @ARGV; -unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; } -print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n"; - -my $pk = ''; -my %dict; -while(<>) { -  s/^(.+)\t//; -  my $key = $1; -  if ($key ne $pk) { -    if ($pk) { -      emit_dict(); -    } -    %dict = (); -    $pk = $key; -  } -  my ($lhs, $f, $e, $s) = split / \|\|\| /; -  my $score = 0; -  if ($s =~ /XEF=([^ ]+)/) { -    $score += $1; -  } else { die; } -  if ($s =~ /GenerativeProb=([^ ]+)/) { -    $score += ($1 / 10); -  } else { die; } -  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score; -} -emit_dict(); - -sub emit_dict { -  my $cc = 0; -  for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) { -    print "$k"; -    $cc++; -    if ($cc >= $NUM_TRANSLATIONS) { last; } -  } -} - diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl deleted file mode 100755 index 31eb86b8..00000000 --- a/gi/pipeline/scripts/rekey.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { -  my ($lhs, $f, $e, $s) = split / \|\|\| /; -  $f =~ s/\[X[0-9]+\]/\[X\]/g; -  print "$f\t$_"; -} - diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl deleted file mode 100755 index 20698816..00000000 --- a/gi/pipeline/scripts/remove-tags-from-contexts.pl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  -    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); - -my $lno = 0; -while(my $line = <>) { -    $lno++; -    chomp $line; -    my @top = split /\t/, $line; -    die unless (scalar @top == 2);  - -    my @pwords = split /\s+/, $top[0]; -    foreach my $token (@pwords) { -        #print $token . "\n"; -        my @parts = split /_(?!.*_)/, $token; -        die unless (scalar @parts == 2);  -        if ($PHRASE eq "tok") { -            $token = $parts[0] -        } elsif ($PHRASE eq "tag") { -            $token = $parts[1] -        } -    } - -    my @fields = split / \|\|\| /, $top[1]; -    foreach my $i (0..((scalar @fields) / 2 - 1)) { -        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; -        my @cwords = split /\s+/, $fields[2*$i]; -        foreach my $token (@cwords) { -            #print $i . ": " . $token . "\n"; -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                if ($CONTEXT eq "tok") { -                    $token = $parts[0] -                } elsif ($CONTEXT eq "tag") { -                    $token = $parts[1] -                } -            } -        } -        $fields[2*$i] = join ' ', @cwords; -    } - -    print join ' ', @pwords; -    print "\t"; -    print join ' ||| ', @fields; -    print "\n"; -} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl deleted file mode 100755 index be3e97c0..00000000 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $LANGUAGE = shift @ARGV; -$LANGUAGE = 'target' unless ($LANGUAGE); - -my $lno = 0; -while(my $line = <>) { -    $lno++; -    chomp $line; - -    my @fields = split / \|\|\| /, $line; - -    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { -        my @cwords = split /\s+/, $fields[0]; -        foreach my $token (@cwords) { -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                $token = $parts[0] -            } else { -                print STDERR "WARNING: invalid tagged token $token\n"; -            } -        } -        $fields[0] = join ' ', @cwords; -    } - -    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { -        my @cwords = split /\s+/, $fields[1]; -        foreach my $token (@cwords) { -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                $token = $parts[1] -            } else { -                print STDERR "WARNING: invalid tagged token $token\n"; -            } -        } -        $fields[0] = join ' ', @cwords; -    } - -    print join ' ||| ', @fields; -    print "\n"; -} diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh deleted file mode 100755 index 7ae33e03..00000000 --- a/gi/pipeline/scripts/sort-by-key.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export LANG=C -sort -t $'\t' -k 1 -T /tmp -S 6000000000 - diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl deleted file mode 100755 index dc578513..00000000 --- a/gi/pipeline/scripts/xfeats.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0; - -my $xgrammar = shift @ARGV; -die "Can't find $xgrammar" unless -f $xgrammar; -my $fh; -if ($xgrammar =~ /\.gz$/) { -  open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!"; -} else { -  open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!"; -} -print STDERR "Reading X-feats from $xgrammar...\n"; -my %dict; -while(<$fh>) { -  chomp; -  my ($lhs, $f, $e, $feats) = split / \|\|\| /; -  my $xfeats; -  my $cc = 0; -  my @xfeats = (); -  while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) { -    push @xfeats, "X_$1=$2"; -  } -  #print "$lhs ||| $f ||| $e ||| @xfeats\n"; -  $dict{"$lhs ||| $f ||| $e"} = "@xfeats"; -} -close $fh; - -print STDERR "Add features...\n"; -while(<>) { -  chomp; -  my ($lhs, $f, $e) = split / \|\|\| /; -  $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g; -  my $xfeats = $dict{"[X] ||| $f ||| $e"}; -  die "Can't find x features for: $_\n" unless $xfeats; -  print "$_ $xfeats\n"; -} - diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config deleted file mode 100644 index e00a8485..00000000 --- a/gi/pipeline/valhalla.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -/home/chris/ws10smt/data -btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al -aren /home/chris/ws10smt/data/arabic-english corpus.ar-en.al -uren /home/chris/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /home/chris/ws10smt/data/dutch-french corpus.nl-fr.al | 
