diff options
Diffstat (limited to 'gi/morf-segmentation/morf-pipeline.pl')
-rwxr-xr-x | gi/morf-segmentation/morf-pipeline.pl | 466 |
1 files changed, 466 insertions, 0 deletions
diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl new file mode 100755 index 00000000..da40eb57 --- /dev/null +++ b/gi/morf-segmentation/morf-pipeline.pl @@ -0,0 +1,466 @@ +#!/usr/bin/perl -w +use strict; +use File::Copy; + +#WARNING.. THIS SCRIPT IS CURRENTLY SOMEWHAT BROKEN. AND UGLY. + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } + +use Getopt::Long "GetOptions"; + +my $GZIP = 'gzip'; +my $ZCAT = 'gunzip -c'; +my $SED = 'sed -e'; + +my $MORF_TRAIN = "$SCRIPT_DIR/morftrain.sh"; +my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py"; + +my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py"; +my $ALIGNER = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar"; +#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log +assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER); + +my $OUTPUT = './morfwork'; +my $PPL_SRC = 50; +my $PPL_TRG = 50; +my $MARKER = "#"; +my $MAX_WORDS = 40; +my $SENTENCES;# = 100000; +my $SPLIT_TYPE = ""; +my $NAME_SHORTCUT; + +usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, + 'output=s' => \$OUTPUT, + 'ppl_src=i' => \$PPL_SRC, + 'ppl_trg=i' => \$PPL_TRG, + 'sentences=i' => \$SENTENCES, + 'marker=s' => \$MARKER, + 'split=s' => \$SPLIT_TYPE, + 'get_name_only' => \$NAME_SHORTCUT, + ); +#if ($NAME_SHORTCUT) { +# print STDERR labeled_dir(); +# exit 0; +#} + +usage() unless scalar @ARGV >= 2; + +my %CORPUS; # for (src,trg) it has (orig, name, filtered, final) + +$CORPUS{'src'}{'orig'} = $ARGV[0]; +open F, "<$CORPUS{'src'}{'orig'}" or die "Can't read $CORPUS{'src'}{'orig'}: $!"; close F; +$CORPUS{'src'}{'name'} = get_basename($CORPUS{'src'}{'orig'}); + +$CORPUS{'trg'}{'orig'} = $ARGV[1]; +open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F; +$CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); + + +my %DEV; # for (src,trg) has (orig, final.split final.unsplit + + +#my %TEST_SRC; #original, final + #trg has original, final.split final.recombined + +my $TEST_SRC; +my $TEST_TRG; + +my $TEST_SRC_name; +my $TEST_TRG_name; + +if (@ARGV >= 4) { + $DEV{'src'}{'orig'} = $ARGV[2]; + open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; + $DEV{'src'}{'name'} = get_basename($DEV{'src'}{'orig'}); + $DEV{'trg'}{'orig'} = $ARGV[3]; + open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F; + $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'}); +} +if (@ARGV >= 6) { + $TEST_SRC = $ARGV[4]; + open F, "<$TEST_SRC" or die "Can't read $TEST_SRC: $!"; close F; + $TEST_SRC_name = get_basename($TEST_SRC); + $TEST_TRG = $ARGV[5]; + open F, "<$TEST_TRG" or die "Can't read $TEST_TRG: $!"; close F; + $TEST_TRG_name = get_basename($TEST_TRG); +} + +my $SPLIT_SRC; #use these to check whether that part is being split +my $SPLIT_TRG; + +#OUTPUT WILL GO IN THESE +my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir(); #subsampled corpus +my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting.. +my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models +my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir(); #segmented copora+alignments +my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments'; + +$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}"; +$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'trg'}{'name'}"; + +print STDERR "Output: $OUTPUT\n"; +print STDERR "Corpus: $CORPUS_DIR\n"; +print STDERR "Model-src: $MODEL_SRC_DIR\n"; +print STDERR "Model-trg: $MODEL_TRG_DIR\n"; +print STDERR "Finaldir: $PROCESSED_DIR\n"; + +safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; +safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!"; +filter_corpus(); + +safemkdir($PROCESSED_DIR); +safemkdir($ALIGNMENT_DIR); + +if ($SPLIT_SRC) { + safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!"; + learn_segmentation("src"); + apply_segmentation("src"); +} +if ($SPLIT_TRG) { + safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; + learn_segmentation("trg"); + apply_segmentation("trg"); +} + +#copy corpora if they haven't been put in place by splitting operations +if (! -f "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") { + copy($CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") or die "Copy failed: $!"; +} +if (! -f "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") { + copy($CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") or die "Copy failed: $!"; +} +if ($DEV{'src'}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{'src}{'name'}") { + copy( +} +if ($TEST_SRC) { ifne_copy($TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); } +if ($TEST_TRG) { ifne_copy("$TEST_TRG.unsplit", "$PROCESSED_DIR/$TEST_TRG_name.unsplit"); } + + + +do_align(); + +system("date"); +print STDERR "All done. You now need to train a language model (if target split), preprocess the test data and put various things where the eval scripts can find them\n\n". + +sub ifne_copy { + my ($src, $dest) = @_; + if (! -f $dest) { + copy($src, $dest) or die "Copy failed: $!"; + } +} + +} + +sub filter_corpus { + print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; + if ( -f $CORPUS{'src'}{'filtered'} && -f $CORPUS{'trg'}{'filtered'}) { + print STDERR "$CORPUS{'src'}{'filtered'} and $CORPUS{'trg'}{'filtered'} exist, reusing...\n"; + return; + } + my $args = "$CORPUS{'src'}{'orig'} $CORPUS{'trg'}{'orig'} $MAX_WORDS"; + if ($SENTENCES) { $args = $args . " $SENTENCES"; } + safesystem("$LINESTRIPPER $args 1> $CORPUS{'src'}{'filtered'} 2> $CORPUS{'trg'}{'filtered'}") or die "Failed to filter training corpus for length."; +} + +sub learn_segmentation +{ + my $WHICH = shift; + my $corpus; my $dev; my $test; my $moddir; my $ppl; + + if ($WHICH eq "src") { + print STDERR "\n!!!LEARNING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; + $corpus = $CORPUS{'src'}{'filtered'}; + $dev = $DEV{'src'}{'orig'}; + $test = $TEST_SRC; + $moddir = $MODEL_SRC_DIR; + $ppl = $PPL_SRC; + } else { + print STDERR "\n!!!LEARNING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; + $corpus = $CORPUS{'trg'}{'filtered'}; + $dev = $DEV{'trg'}{'orig'}; + $test = $TEST_TRG; + $moddir = $MODEL_TRG_DIR; + $ppl = $PPL_TRG; + } + system("date"); + my $cmd = "cat $corpus"; + if ($dev) { $cmd = "$cmd $dev"; } + if ($test) { $cmd = "$cmd $test"; } + my $tmpfile = "$CORPUS_DIR/all.tmp.gz"; + safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; + learn_segmentation_any($tmpfile, $moddir, $ppl); + safesystem("rm $tmpfile"); +} + +sub learn_segmentation_any { + my($INPUT_FILE, $SEGOUT_DIR, $PPL) = @_; + my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; + if ( -f $SEG_FILE) { + print STDERR "$SEG_FILE exists, reusing...\n"; + return; + } + my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; + safesystem($cmd) or die "Failed to learn segmentation model"; +} + +sub do_align { + print STDERR "\n!!!WORD ALIGNMENT!!!\n"; + system("date"); + + my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align"; + if ( -f $ALIGNMENTS ) { + print STDERR "$ALIGNMENTS exists, reusing...\n"; + return; + } + my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; + + #decorate training files with identifiers to stop the aligner from training on dev and test too + #since they are in same directory + safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!"; + safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!"; + + write_wconf($conf_file, $PROCESSED_DIR); + safesystem("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log") or die "Failed to run word alignment."; + +} + +sub apply_segmentation { + my $WHICH = shift; + my $moddir; + my $datfile; + if ($WHICH eq "src") { + print STDERR "\n!!!APPLYING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; + apply_segmentation_any($MODEL_SRC_DIR, $CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}"); + if ($DEV{'src'}{'orig'}) { + apply_segmentation_any($MODEL_SRC_DIR, $DEV{'src'}{'orig'}, "$PROCESSED_DIR/$DEV{'src'}{'name'}"); + } + if ($TEST_SRC) { + apply_segmentation_any($MODEL_SRC_DIR, $TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); + } + } else { + print STDERR "\n!!!APPLYING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; + apply_segmentation_any($MODEL_TRG_DIR, $CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}"); + if ($DEV{'trg'}{'orig'}) { + $DEV{'trg'}{'final'} = "$PROCESSED_DIR/$DEV{'trg'}{'name'}"; + apply_segmentation_any($MODEL_TRG_DIR, $DEV{'trg'}{'orig'}, $DEV{'trg'}{'final'}); + } + if ($TEST_TRG) { + apply_segmentation_any($MODEL_TRG_DIR, $TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.split"); + copy($TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.unsplit") or die "Could not copy unsegmented test set"; + } + } + if ($WHICH eq "src" || $WHICH eq "trg") { + write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); + } +} + +sub apply_segmentation_any { + my($moddir, $datfile, $outfile) = @_; + if ( -f $outfile) { + print STDERR "$outfile exists, reusing...\n"; + return; + } + + my $args = "$moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\""; + safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile"; +} + +sub beautify_numlines { + return ($SENTENCES ? $SENTENCES : "_all"); +} + +sub corpus_dir { + return "s" . beautify_numlines() . ".w" . $MAX_WORDS; +} + +sub model_dir { + my $lang = shift; + if ($lang eq "src") { + return corpus_dir() . ".PPL" . $PPL_SRC . ".src"; + } elsif ($lang eq "trg") { + return corpus_dir() . ".PPL" . $PPL_TRG . ".trg"; + } else { + return "PPLundef"; + } +} + +sub split_name { + #parses SPLIT_TYPE, which can have the following values + # t|s|ts|st (last 2 are equiv) + # or is undefined when no splitting is done + my $name = ""; + + if ($SPLIT_TYPE) { + $SPLIT_SRC = lc($SPLIT_TYPE) =~ /s/; + $SPLIT_TRG = lc($SPLIT_TYPE) =~ /t/; + $name = $name . ($SPLIT_SRC ? $PPL_SRC : "0"); + $name = $name . "_" . ($SPLIT_TRG ? $PPL_TRG : "0"); + } else { + #no splitting + $name = "0"; + } + + return "sp_" . $name; + +} + +sub processed_dir { + return corpus_dir() . "." . split_name; +} + +sub usage { + print <<EOT; + +Usage: $0 [OPTIONS] corpus.src corpus.trg dev.src dev.trg test.src test.trg + +Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus. + +EOT + exit 1; +}; + +sub safemkdir { + my $dir = shift; + if (-d $dir) { return 1; } + return mkdir($dir); +} + +sub assert_exec { + my @files = @_; + for my $file (@files) { + die "Can't find $file - did you run make?\n" unless -e $file; + die "Can't execute $file" unless -e $file; + } +}; +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} + +sub get_basename +{ + my $x = shift; + $x = `basename $x`; + $x =~ s/\n//; + return $x; +} + +sub write_wconf { + my ($filename, $train_dir) = @_; + open WCONF, ">$filename" or die "Can't write $filename: $!"; + +#TODO CHANGE ITERATIONS BELOW!!! + print WCONF <<EOT; +## ---------------------- +## This is an example training script for the Berkeley +## word aligner. In this configuration it uses two HMM +## alignment models trained jointly and then decoded +## using the competitive thresholding heuristic. + +########################################## +# Training: Defines the training regimen +########################################## +forwardModels MODEL1 HMM +reverseModels MODEL1 HMM +mode JOINT JOINT +iters 1 1 + +############################################### +# Execution: Controls output and program flow +############################################### +execDir $ALIGNMENT_DIR +create +overwriteExecDir +saveParams true +numThreads 1 +msPerLine 10000 +alignTraining + +################# +# Language/Data +################# +foreignSuffix src +englishSuffix trg + +# Choose the training sources, which can either be directories or files that list files/directories +trainSources $train_dir/ +#trainSources $train_dir/sources +testSources +sentences MAX + +################# +# 1-best output +################# +competitiveThresholding + +EOT + close WCONF; +} + +sub write_eval_sh +{ + my ($filename) = @_; + open EVALFILE, ">$filename" or die "Can't write $filename: $!"; + + print EVALFILE <<EOT; +#!/bin/bash +d=`dirname \$0` + +EVAL_MAIN=/export/ws10smt/data/eval.sh +EOT + + if ($SPLIT_TRG) { + print EVALFILE <<EOT; +echo "OUTPUT EVALUATION" +echo "-----------------" +\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.split + +echo "RECOMBINED OUTPUT EVALUATION" +echo "----------------------------" +marker="$MARKER" +cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined" + +\$EVAL_MAIN "\$1.recombined" \$d/$TEST_TRG_name.unsplit +EOT + + } else { + print EVALFILE <<EOT; +#!/bin/bash +d=`dirname \$0` + +EVAL_MAIN=/export/ws10smt/data/eval.sh + +echo "ARTIFICIAL SPLIT EVALUATION" +echo "--------------------------" + +MARKER="$MARKER" +#split the output translation +cat "\$1" | $MORFSEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" + +\$EVAL_MAIN "i\$1.split" \$d/$TEST_TRG_name.split + + +echo "DIRECT EVALUATION" +echo "--------------------------" +\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.unsplit + +EOT + } + close EVALFILE; + +} + |