From 9a30e76dabf5d9d717c4f0522b7784a98e977c94 Mon Sep 17 00:00:00 2001 From: bothameister Date: Mon, 30 Aug 2010 11:26:49 +0000 Subject: cleaning up git-svn-id: https://ws10smt.googlecode.com/svn/trunk@632 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/morf-segmentation/morf-pipeline.pl | 262 ++++++++++++++++++---------------- gi/morf-segmentation/morfsegment.py | 2 +- 2 files changed, 142 insertions(+), 122 deletions(-) (limited to 'gi') diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl index da40eb57..46eb5b46 100755 --- a/gi/morf-segmentation/morf-pipeline.pl +++ b/gi/morf-segmentation/morf-pipeline.pl @@ -2,7 +2,13 @@ use strict; use File::Copy; -#WARNING.. THIS SCRIPT IS CURRENTLY SOMEWHAT BROKEN. AND UGLY. + +# Preprocessing pipeline to take care of word segmentation +# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data +# Applies the segmentation where necessary. +# Learns word alignments on the preprocessed training data. +# Outputs script files used later to score output. + my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } @@ -26,7 +32,7 @@ my $PPL_TRG = 50; my $MARKER = "#"; my $MAX_WORDS = 40; my $SENTENCES;# = 100000; -my $SPLIT_TYPE = ""; +my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string) my $NAME_SHORTCUT; usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, @@ -38,10 +44,6 @@ usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, 'split=s' => \$SPLIT_TYPE, 'get_name_only' => \$NAME_SHORTCUT, ); -#if ($NAME_SHORTCUT) { -# print STDERR labeled_dir(); -# exit 0; -#} usage() unless scalar @ARGV >= 2; @@ -55,19 +57,7 @@ $CORPUS{'trg'}{'orig'} = $ARGV[1]; open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F; $CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); - my %DEV; # for (src,trg) has (orig, final.split final.unsplit - - -#my %TEST_SRC; #original, final - #trg has original, final.split final.recombined - -my $TEST_SRC; -my $TEST_TRG; - -my $TEST_SRC_name; -my $TEST_TRG_name; - if (@ARGV >= 4) { $DEV{'src'}{'orig'} = $ARGV[2]; open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; @@ -76,13 +66,15 @@ if (@ARGV >= 4) { open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F; $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'}); } + +my %TEST; # for (src,trg) has (orig, name) if (@ARGV >= 6) { - $TEST_SRC = $ARGV[4]; - open F, "<$TEST_SRC" or die "Can't read $TEST_SRC: $!"; close F; - $TEST_SRC_name = get_basename($TEST_SRC); - $TEST_TRG = $ARGV[5]; - open F, "<$TEST_TRG" or die "Can't read $TEST_TRG: $!"; close F; - $TEST_TRG_name = get_basename($TEST_TRG); + $TEST{'src'}{'orig'} = $ARGV[4]; + open F, "<$TEST{'src'}{'orig'}" or die "Can't read $TEST{'src'}{'orig'}: $!"; close F; + $TEST{'src'}{'name'} = get_basename($TEST{'src'}{'orig'}); + $TEST{'trg'}{'orig'} = $ARGV[5]; + open F, "<$TEST{'trg'}{'orig'}" or die "Can't read $TEST{'trg'}{'orig'}: $!"; close F; + $TEST{'trg'}{'name'} = get_basename($TEST{'trg'}{'orig'}); } my $SPLIT_SRC; #use these to check whether that part is being split @@ -114,42 +106,45 @@ safemkdir($ALIGNMENT_DIR); if ($SPLIT_SRC) { safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!"; learn_segmentation("src"); - apply_segmentation("src"); -} -if ($SPLIT_TRG) { - safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; - learn_segmentation("trg"); - apply_segmentation("trg"); + apply_segmentation_side("src", $MODEL_SRC_DIR); } -#copy corpora if they haven't been put in place by splitting operations -if (! -f "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") { - copy($CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") or die "Copy failed: $!"; -} -if (! -f "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") { - copy($CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") or die "Copy failed: $!"; -} -if ($DEV{'src'}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{'src}{'name'}") { - copy( +#assume that unsplit hypotheses will be scored against an aritificially split target test set; thus obtain a target splitting model +#TODO: add a flag to override this behaviour +safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; +learn_segmentation("trg"); +$TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}"; +copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set"; + +if ($SPLIT_TRG) { + apply_segmentation_side("trg", $MODEL_TRG_DIR); + } else { + $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split"; + apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'}); } -if ($TEST_SRC) { ifne_copy($TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); } -if ($TEST_TRG) { ifne_copy("$TEST_TRG.unsplit", "$PROCESSED_DIR/$TEST_TRG_name.unsplit"); } +write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); +#copy corpora if they haven't been put in place by splitting operations +place_missing_data_side('src'); +place_missing_data_side('trg'); do_align(); -system("date"); -print STDERR "All done. You now need to train a language model (if target split), preprocess the test data and put various things where the eval scripts can find them\n\n". - -sub ifne_copy { - my ($src, $dest) = @_; - if (! -f $dest) { - copy($src, $dest) or die "Copy failed: $!"; - } +if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) { + print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n"; +#format is: + # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh + my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz"; + $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}"; + $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh"; + safesystem("echo '$line' > $PROCESSED_DIR/exp.config"); } -} +system("date"); +print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n"; + +############################## BILINGUAL ################################### sub filter_corpus { print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; @@ -167,40 +162,26 @@ sub learn_segmentation my $WHICH = shift; my $corpus; my $dev; my $test; my $moddir; my $ppl; + $corpus = $CORPUS{$WHICH}{'filtered'}; + $dev = $DEV{$WHICH}{'orig'}; + $test = $TEST{$WHICH}{'orig'}; + if ($WHICH eq "src") { - print STDERR "\n!!!LEARNING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; - $corpus = $CORPUS{'src'}{'filtered'}; - $dev = $DEV{'src'}{'orig'}; - $test = $TEST_SRC; $moddir = $MODEL_SRC_DIR; $ppl = $PPL_SRC; } else { - print STDERR "\n!!!LEARNING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; - $corpus = $CORPUS{'trg'}{'filtered'}; - $dev = $DEV{'trg'}{'orig'}; - $test = $TEST_TRG; $moddir = $MODEL_TRG_DIR; $ppl = $PPL_TRG; } - system("date"); my $cmd = "cat $corpus"; if ($dev) { $cmd = "$cmd $dev"; } if ($test) { $cmd = "$cmd $test"; } my $tmpfile = "$CORPUS_DIR/all.tmp.gz"; safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; - learn_segmentation_any($tmpfile, $moddir, $ppl); - safesystem("rm $tmpfile"); -} + assert_marker($tmpfile); -sub learn_segmentation_any { - my($INPUT_FILE, $SEGOUT_DIR, $PPL) = @_; - my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; - if ( -f $SEG_FILE) { - print STDERR "$SEG_FILE exists, reusing...\n"; - return; - } - my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; - safesystem($cmd) or die "Failed to learn segmentation model"; + learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH); + safesystem("rm $tmpfile"); } sub do_align { @@ -214,44 +195,69 @@ sub do_align { } my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; - #decorate training files with identifiers to stop the aligner from training on dev and test too - #since they are in same directory + #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future. safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!"; safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!"; write_wconf($conf_file, $PROCESSED_DIR); - safesystem("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log") or die "Failed to run word alignment."; + system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log"); + + if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";} + + my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS"; + $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al"; + safesystem($cmd) or die "Failed to paste into aligned corpus file."; } -sub apply_segmentation { - my $WHICH = shift; - my $moddir; - my $datfile; - if ($WHICH eq "src") { - print STDERR "\n!!!APPLYING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; - apply_segmentation_any($MODEL_SRC_DIR, $CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}"); - if ($DEV{'src'}{'orig'}) { - apply_segmentation_any($MODEL_SRC_DIR, $DEV{'src'}{'orig'}, "$PROCESSED_DIR/$DEV{'src'}{'name'}"); - } - if ($TEST_SRC) { - apply_segmentation_any($MODEL_SRC_DIR, $TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); - } - } else { - print STDERR "\n!!!APPLYING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; - apply_segmentation_any($MODEL_TRG_DIR, $CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}"); - if ($DEV{'trg'}{'orig'}) { - $DEV{'trg'}{'final'} = "$PROCESSED_DIR/$DEV{'trg'}{'name'}"; - apply_segmentation_any($MODEL_TRG_DIR, $DEV{'trg'}{'orig'}, $DEV{'trg'}{'final'}); - } - if ($TEST_TRG) { - apply_segmentation_any($MODEL_TRG_DIR, $TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.split"); - copy($TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.unsplit") or die "Could not copy unsegmented test set"; - } +############################# MONOLINGUAL ################################# + +#copy the necessary data files that weren't place by segmentation +sub place_missing_data_side { + my $side = shift; + + ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}") ; + + if ($DEV{$side}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{$side}{'name'}") { + $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; + copy($DEV{$side}{'orig'}, $DEV{$side}{'final'}) or die "Copy failed: $!"; + } + + if ($TEST{$side}{'orig'} && ! -f "$PROCESSED_DIR/$TEST{$side}{'name'}" && ! $TEST{$side}{'finalunsplit'}) { + $TEST{$side}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}"; + copy($TEST{$side}{'orig'}, $TEST{$side}{'finalunsplit'}) or die "Copy failed: $!"; + } + +} + +sub apply_segmentation_side { + my ($side, $moddir) = @_; + + print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n"; + apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}"); + if ($DEV{$side}{'orig'}) { + $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; + apply_segmentation_any($moddir, $DEV{$side}{'orig'}, "$DEV{$side}{'final'}"); } - if ($WHICH eq "src" || $WHICH eq "trg") { - write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); + if ($TEST{$side}{'orig'}) { + $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split"; + apply_segmentation_any($moddir, $TEST{$side}{'orig'}, $TEST{$side}{'finalsplit'} ); + } + +} + +sub learn_segmentation_side { + my($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_; + + print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n"; + system("date"); + my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; + if ( -f $SEG_FILE) { + print STDERR "$SEG_FILE exists, reusing...\n"; + return; } + my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; + safesystem($cmd) or die "Failed to learn segmentation model"; } sub apply_segmentation_any { @@ -265,6 +271,8 @@ sub apply_segmentation_any { safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile"; } +##################### PATH FUNCTIONS ########################## + sub beautify_numlines { return ($SENTENCES ? $SENTENCES : "_all"); } @@ -284,6 +292,19 @@ sub model_dir { } } +sub processed_dir { + return corpus_dir() . "." . split_name(); +} + +########################## HELPER FUNCTIONS ############################ + +sub ifne_copy { + my ($src, $dest) = @_; + if (! -f $dest) { + copy($src, $dest) or die "Copy failed: $!"; + } +} + sub split_name { #parses SPLIT_TYPE, which can have the following values # t|s|ts|st (last 2 are equiv) @@ -304,14 +325,10 @@ sub split_name { } -sub processed_dir { - return corpus_dir() . "." . split_name; -} - sub usage { print <$filename" or die "Can't write $filename: $!"; -#TODO CHANGE ITERATIONS BELOW!!! print WCONF < "\$1.recombined" -\$EVAL_MAIN "\$1.recombined" \$d/$TEST_TRG_name.unsplit +\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'} EOT } else { print EVALFILE < "\$1.split" - -\$EVAL_MAIN "i\$1.split" \$d/$TEST_TRG_name.split +cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" +\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'} echo "DIRECT EVALUATION" echo "--------------------------" -\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.unsplit +\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'} EOT + } close EVALFILE; } + + + diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py index e5597c0b..85b9d4fb 100755 --- a/gi/morf-segmentation/morfsegment.py +++ b/gi/morf-segmentation/morfsegment.py @@ -16,7 +16,7 @@ if len(sys.argv) < 3: #read index: split_index={} -marker="#" +marker="##" if len(sys.argv) > 3: marker=sys.argv[3] -- cgit v1.2.3