diff options
Diffstat (limited to 'word-aligner/aligner.pl')
| -rwxr-xr-x | word-aligner/aligner.pl | 20 | 
1 files changed, 15 insertions, 5 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index d203fc53..7eec0e42 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,15 +6,20 @@ use Getopt::Long;  my $training_dir = "$SCRIPT_DIR/../training";  die "Can't find training dir: $training_dir" unless -d $training_dir; +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';  my $num_classes = 50;  my $nodes = 40;  my $pmem = "2500mb";  my $DECODER = "cdec";  GetOptions("cdec=s" => \$DECODER,             "jobs=i" => \$nodes, -           "pmem=s" => \$pmem +           "pmem=s" => \$pmem, +           "mkcls=s" => \$mkcls,            ) or usage();  usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; +  my $in_file = shift @ARGV;  die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);  my $f_lang = $1; @@ -22,13 +27,13 @@ my $e_lang = $2;  print STDERR "Source language: $f_lang\n";  print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n";  die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";  die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";  my @stages = qw(nopos relpos markov);  my @directions = qw(f-e e-f); -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';  my $corpus = 'c';  my $cwd = getcwd(); @@ -75,7 +80,7 @@ NCLASSES = $num_classes  TARGETS = @targets  PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary -PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5 +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15  export @@ -95,12 +100,16 @@ clean:  EOT  close TOPLEVEL; +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; +  sub make_stage {    my ($stage, $direction, $prev_stage) = @_;    my $stage_dir = "$align_dir/$stage-$direction";    my $first = $direction;    $first =~ s/^(.+)-.*$/$1/;    mkdir $stage_dir; +  my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";    open CDEC, ">$stage_dir/cdec.ini" or die;    print CDEC <<EOT;  formalism=lexcrf @@ -108,10 +117,11 @@ intersection_strategy=full  grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz  EOT    if ($stage =~ /relpos/) { -    print CDEC "feature_function=RelativeSentencePosition\n"; +    print CDEC "$RELPOS\n";    } elsif ($stage =~ /markov/) { -    print CDEC "feature_function=RelativeSentencePosition\n"; +    print CDEC "$RELPOS\n";      print CDEC "feature_function=MarkovJump\n"; +    print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";      print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";    }    close CDEC;  | 
