diff options
Diffstat (limited to 'gi/morf-segmentation')
| -rwxr-xr-x | gi/morf-segmentation/morf-pipeline.pl | 262 | ||||
| -rwxr-xr-x | gi/morf-segmentation/morfsegment.py | 2 | 
2 files changed, 142 insertions, 122 deletions
diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl index da40eb57..46eb5b46 100755 --- a/gi/morf-segmentation/morf-pipeline.pl +++ b/gi/morf-segmentation/morf-pipeline.pl @@ -2,7 +2,13 @@  use strict;  use File::Copy; -#WARNING.. THIS SCRIPT IS CURRENTLY SOMEWHAT BROKEN. AND UGLY. + +# Preprocessing pipeline to take care of word segmentation +# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data +# Applies the segmentation where necessary. +# Learns word alignments on the preprocessed training data. +# Outputs script files used later to score output. +  my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } @@ -26,7 +32,7 @@ my $PPL_TRG = 50;  my $MARKER = "#";  my $MAX_WORDS = 40;  my $SENTENCES;# = 100000; -my $SPLIT_TYPE = ""; +my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string)  my $NAME_SHORTCUT;  usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, @@ -38,10 +44,6 @@ usage() unless &GetOptions('max_words=i' => \$MAX_WORDS,                             'split=s' => \$SPLIT_TYPE,                             'get_name_only' => \$NAME_SHORTCUT,                            ); -#if ($NAME_SHORTCUT) { -#  print STDERR labeled_dir(); -#  exit 0; -#}  usage() unless scalar @ARGV >= 2; @@ -55,19 +57,7 @@ $CORPUS{'trg'}{'orig'} = $ARGV[1];  open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F;  $CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); -  my %DEV; # for (src,trg) has (orig, final.split final.unsplit - - -#my %TEST_SRC; #original, final -              #trg has original, final.split final.recombined - -my $TEST_SRC; -my $TEST_TRG; - -my $TEST_SRC_name; -my $TEST_TRG_name; -  if (@ARGV >= 4) {    $DEV{'src'}{'orig'} = $ARGV[2];    open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; @@ -76,13 +66,15 @@ if (@ARGV >= 4) {    open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F;    $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'});  } + +my %TEST; # for (src,trg) has (orig, name)   if (@ARGV >= 6) { -  $TEST_SRC = $ARGV[4]; -  open F, "<$TEST_SRC" or die "Can't read $TEST_SRC: $!"; close F; -  $TEST_SRC_name = get_basename($TEST_SRC); -  $TEST_TRG = $ARGV[5]; -  open F, "<$TEST_TRG" or die "Can't read $TEST_TRG: $!"; close F; -  $TEST_TRG_name = get_basename($TEST_TRG); +  $TEST{'src'}{'orig'} = $ARGV[4]; +  open F, "<$TEST{'src'}{'orig'}" or die "Can't read $TEST{'src'}{'orig'}: $!"; close F; +  $TEST{'src'}{'name'} = get_basename($TEST{'src'}{'orig'}); +  $TEST{'trg'}{'orig'} = $ARGV[5]; +  open F, "<$TEST{'trg'}{'orig'}" or die "Can't read $TEST{'trg'}{'orig'}: $!"; close F; +  $TEST{'trg'}{'name'} = get_basename($TEST{'trg'}{'orig'});  }  my $SPLIT_SRC; #use these to check whether that part is being split @@ -114,42 +106,45 @@ safemkdir($ALIGNMENT_DIR);  if ($SPLIT_SRC) {    safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!";    learn_segmentation("src"); -  apply_segmentation("src");   -} -if ($SPLIT_TRG) { -  safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; -  learn_segmentation("trg"); -  apply_segmentation("trg");   +  apply_segmentation_side("src", $MODEL_SRC_DIR);    } -#copy corpora if they haven't been put in place by splitting operations -if (! -f "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") { -  copy($CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}") or die "Copy failed: $!"; -} -if (! -f "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") { -  copy($CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}") or die "Copy failed: $!"; -} -if ($DEV{'src'}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{'src}{'name'}") { -  copy( +#assume that unsplit hypotheses will be scored against an aritificially split target test set; thus obtain a target splitting model   +#TODO: add a flag to override this behaviour +safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; +learn_segmentation("trg"); +$TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}"; +copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set";   + +if ($SPLIT_TRG) { +  apply_segmentation_side("trg", $MODEL_TRG_DIR);   +  } else { +  $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split"; +  apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'});    } -if ($TEST_SRC) { ifne_copy($TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); } -if ($TEST_TRG) { ifne_copy("$TEST_TRG.unsplit", "$PROCESSED_DIR/$TEST_TRG_name.unsplit"); } +write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); +#copy corpora if they haven't been put in place by splitting operations +place_missing_data_side('src'); +place_missing_data_side('trg');  do_align(); -system("date"); -print STDERR "All done. You now need to train a language model (if target split), preprocess the test data and put various things where the eval scripts can find them\n\n". - -sub ifne_copy { -  my ($src, $dest) = @_; -  if (! -f $dest) { -    copy($src, $dest) or die "Copy failed: $!"; -  } +if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) { +  print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n"; +#format is: +  # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh +  my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz"; +  $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}"; +  $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh"; +  safesystem("echo '$line' > $PROCESSED_DIR/exp.config");  } -} +system("date"); +print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n"; + +############################## BILINGUAL ###################################  sub filter_corpus {    print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; @@ -167,40 +162,26 @@ sub learn_segmentation    my $WHICH = shift;    my $corpus; my $dev; my $test; my $moddir;  my $ppl; +  $corpus = $CORPUS{$WHICH}{'filtered'}; +  $dev = $DEV{$WHICH}{'orig'}; +  $test = $TEST{$WHICH}{'orig'}; +    if ($WHICH eq "src") { -    print STDERR "\n!!!LEARNING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; -    $corpus = $CORPUS{'src'}{'filtered'}; -    $dev = $DEV{'src'}{'orig'}; -    $test = $TEST_SRC;      $moddir = $MODEL_SRC_DIR;      $ppl = $PPL_SRC;    } else { -    print STDERR "\n!!!LEARNING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; -    $corpus = $CORPUS{'trg'}{'filtered'}; -    $dev = $DEV{'trg'}{'orig'}; -    $test = $TEST_TRG;      $moddir = $MODEL_TRG_DIR;      $ppl = $PPL_TRG;    } -  system("date");    my $cmd = "cat $corpus";    if ($dev) { $cmd = "$cmd $dev"; }    if ($test) { $cmd = "$cmd $test"; }    my $tmpfile = "$CORPUS_DIR/all.tmp.gz";    safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; -  learn_segmentation_any($tmpfile, $moddir, $ppl); -  safesystem("rm $tmpfile"); -} +  assert_marker($tmpfile); -sub learn_segmentation_any { -  my($INPUT_FILE, $SEGOUT_DIR, $PPL) = @_; -  my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; -   if ( -f $SEG_FILE) { -    print STDERR "$SEG_FILE exists, reusing...\n"; -    return; -  } -  my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; -  safesystem($cmd) or die "Failed to learn segmentation model"; +  learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH); +  safesystem("rm $tmpfile");  }  sub do_align { @@ -214,44 +195,69 @@ sub do_align {    }     my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; -  #decorate training files with identifiers to stop the aligner from training on dev and test too -  #since they are in same directory +  #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future.    safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!";    safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!";    write_wconf($conf_file, $PROCESSED_DIR);   -  safesystem("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log") or die "Failed to run word alignment."; +  system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log"); + +  if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";} + +  my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS"; +  $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al"; +  safesystem($cmd) or die "Failed to paste into aligned corpus file.";  } -sub apply_segmentation { -  my $WHICH = shift; -  my $moddir; -  my $datfile; -  if ($WHICH eq "src") { -    print STDERR "\n!!!APPLYING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"; -    apply_segmentation_any($MODEL_SRC_DIR, $CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}"); -    if ($DEV{'src'}{'orig'}) { -      apply_segmentation_any($MODEL_SRC_DIR, $DEV{'src'}{'orig'}, "$PROCESSED_DIR/$DEV{'src'}{'name'}"); -    } -    if ($TEST_SRC) { -      apply_segmentation_any($MODEL_SRC_DIR, $TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); -    } -  } else { -    print STDERR "\n!!!APPLYING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n"; -    apply_segmentation_any($MODEL_TRG_DIR, $CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}"); -    if ($DEV{'trg'}{'orig'}) { -      $DEV{'trg'}{'final'} = "$PROCESSED_DIR/$DEV{'trg'}{'name'}"; -      apply_segmentation_any($MODEL_TRG_DIR, $DEV{'trg'}{'orig'}, $DEV{'trg'}{'final'}); -    } -    if ($TEST_TRG) { -      apply_segmentation_any($MODEL_TRG_DIR, $TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.split"); -      copy($TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.unsplit") or die "Could not copy unsegmented test set"; -    } +############################# MONOLINGUAL ################################# + +#copy the necessary data files that weren't place by segmentation +sub place_missing_data_side { +  my $side = shift; + +  ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}") ; + +  if ($DEV{$side}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{$side}{'name'}") { +    $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; +    copy($DEV{$side}{'orig'}, $DEV{$side}{'final'}) or die "Copy failed: $!"; +  } + +  if ($TEST{$side}{'orig'} && ! -f "$PROCESSED_DIR/$TEST{$side}{'name'}" && ! $TEST{$side}{'finalunsplit'}) { +    $TEST{$side}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}"; +    copy($TEST{$side}{'orig'}, $TEST{$side}{'finalunsplit'}) or die "Copy failed: $!"; +  } + +} + +sub apply_segmentation_side { +  my ($side, $moddir) = @_; +  +  print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n"; +  apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}"); +  if ($DEV{$side}{'orig'}) { +     $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; +    apply_segmentation_any($moddir, $DEV{$side}{'orig'}, "$DEV{$side}{'final'}");    } -  if ($WHICH eq "src" || $WHICH eq "trg") { -      write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); +  if ($TEST{$side}{'orig'}) { +    $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split"; +    apply_segmentation_any($moddir, $TEST{$side}{'orig'}, $TEST{$side}{'finalsplit'} ); +  }  + +} + +sub learn_segmentation_side { +  my($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_; + +  print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n"; +  system("date"); +  my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; +   if ( -f $SEG_FILE) { +    print STDERR "$SEG_FILE exists, reusing...\n"; +    return;    } +  my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; +  safesystem($cmd) or die "Failed to learn segmentation model";  }  sub apply_segmentation_any { @@ -265,6 +271,8 @@ sub apply_segmentation_any {    safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile";  } +##################### PATH FUNCTIONS ########################## +  sub beautify_numlines {    return ($SENTENCES ? $SENTENCES : "_all");  } @@ -284,6 +292,19 @@ sub model_dir {    }      } +sub processed_dir { +  return corpus_dir() . "." . split_name(); +} + +########################## HELPER FUNCTIONS ############################ + +sub ifne_copy { +  my ($src, $dest) = @_; +  if (! -f $dest) { +    copy($src, $dest) or die "Copy failed: $!"; +  } +} +  sub split_name {    #parses SPLIT_TYPE, which can have the following values    # t|s|ts|st (last 2 are equiv) @@ -304,14 +325,10 @@ sub split_name {  } -sub processed_dir { -  return corpus_dir() . "." . split_name; -} -  sub usage {    print <<EOT; -Usage: $0 [OPTIONS] corpus.src corpus.trg dev.src dev.trg test.src test.trg +Usage: $0 [OPTIONS] corpus.src corpus.trg [dev.src dev.trg [test.src test.trg]]  Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus. @@ -359,11 +376,18 @@ sub get_basename    return $x;  } +sub assert_marker { +  my $file = shift; +  my $result = `zcat $file| grep '$MARKER' | wc -l` or die "Cannot read $file: $!"; +  print $result;  +  if (scalar($result) != 0) { die "Data contains marker '$MARKER'; use something else.";} +} +########################### Dynamic config files ############################## +  sub write_wconf {    my ($filename, $train_dir) = @_;    open WCONF, ">$filename" or die "Can't write $filename: $!"; -#TODO CHANGE ITERATIONS BELOW!!!    print WCONF <<EOT;  ## ----------------------  ## This is an example training script for the Berkeley @@ -377,7 +401,7 @@ sub write_wconf {  forwardModels   MODEL1 HMM  reverseModels   MODEL1 HMM  mode    JOINT JOINT -iters   1 1 +iters   5 5  ###############################################  # Execution: Controls output and program flow @@ -418,49 +442,45 @@ sub write_eval_sh    print EVALFILE <<EOT;  #!/bin/bash -d=`dirname \$0`  EVAL_MAIN=/export/ws10smt/data/eval.sh +marker="$MARKER"  EOT    if ($SPLIT_TRG) {      print EVALFILE <<EOT;  echo "OUTPUT EVALUATION"  echo "-----------------" -\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.split +\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalsplit'}  echo "RECOMBINED OUTPUT EVALUATION"  echo "----------------------------" -marker="$MARKER"  cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined" -\$EVAL_MAIN "\$1.recombined" \$d/$TEST_TRG_name.unsplit +\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'}  EOT    } else {      print EVALFILE <<EOT; -#!/bin/bash -d=`dirname \$0` - -EVAL_MAIN=/export/ws10smt/data/eval.sh -  echo "ARTIFICIAL SPLIT EVALUATION"  echo "--------------------------" -MARKER="$MARKER"  #split the output translation -cat "\$1" | $MORFSEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" - -\$EVAL_MAIN "i\$1.split" \$d/$TEST_TRG_name.split +cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" +\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'}  echo "DIRECT EVALUATION"  echo "--------------------------" -\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.unsplit +\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'}  EOT +    }    close EVALFILE;  } + + + diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py index e5597c0b..85b9d4fb 100755 --- a/gi/morf-segmentation/morfsegment.py +++ b/gi/morf-segmentation/morfsegment.py @@ -16,7 +16,7 @@ if len(sys.argv) < 3:  #read index:  split_index={} -marker="#" +marker="##"  if len(sys.argv) > 3:    marker=sys.argv[3]  | 
