Adding morphology-segmentation stuff. Changes include: local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
author: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 18:03:47 +0000
committer: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 18:03:47 +0000
commit: c57c05d19fb306f7f50cc02516a8a2901c920cca (patch)
tree: 1120643e63ea2b46d6a3bc0b338fb225682c9dd7 /gi/morf-segmentation
parent: 58681ee5816d13c04002ca8aebe23c2768da4e5b (diff)
7 files changed, 736 insertions, 0 deletions
diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl
new file mode 100755
index 00000000..a78575da
--- /dev/null
+++ b/gi/morf-segmentation/filter_docs.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl
+
+#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries.
+
+#Usage: filter_docs.pl [mark]
+#  STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor
+#  STDOUT: the matching subset, same format
+
+use strict;
+use warnings;
+use utf8;
+use Encode qw(decode);
+
+# One "letter" is a base letter followed by any number of combining marks.
+my $letter=qr/\p{L}\p{M}*/; # see http://www.regular-expressions.info/unicode.html
+
+# A morph is a non-empty run of such letters.
+my $morph=qr/$letter+/;
+
+my $m = "##"; # marker used to indicate morphemes
+if ((scalar @ARGV) >= 1) {
+   # Consume the marker so that <> below does not try to open it as a file.
+   $m = shift @ARGV;
+}
+print STDERR "Using $m to filter for morphemes\n";
+
+# \p{L} only means "letter" on decoded characters, so both the marker and every
+# input line are decoded before matching.
+# NOTE(review): assumes the input and the marker are UTF-8 encoded - confirm.
+my $marker = decode('UTF-8', $m);
+
+# The phrase (everything before the first tab) must be: an optional word-initial
+# morph, any number of word-internal morphs, and an optional word-final morph.
+my $expr = qr/^($morph\Q$marker\E)? ?(\Q$marker\E$morph\Q$marker\E)* ?(\Q$marker\E$morph)?\t/; #\Q and \E bounded sections are escaped
+while (my $line = <>) {
+   # Match against a decoded copy but emit the original bytes untouched.
+   my $text = decode('UTF-8', $line);
+   print $line if $text =~ /$expr/;
+}
diff --git a/gi/morf-segmentation/invalid_vocab.patterns b/gi/morf-segmentation/invalid_vocab.patterns
new file mode 100644
index 00000000..473ce1b1
--- /dev/null
+++ b/gi/morf-segmentation/invalid_vocab.patterns
@@ -0,0 +1,6 @@
+[[:digit:]]
+[] !"#$%&()*+,./:;<=>?@[\^_`{|}~]
+^'$
+-$
+^-
+^$
diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py
new file mode 100755
index 00000000..04e9044a
--- /dev/null
+++ b/gi/morf-segmentation/linestripper.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+
+import sys
+
+#linestripper   file file maxlen [numlines]
+
+if len(sys.argv) < 3:
+  print "linestripper   file1 file2 maxlen [numlines]" 
+  print " outputs subset of file1 to stdout, ..of file2 to stderr"
+  sys.exit(1)
+
+
+f1 = open(sys.argv[1],'r')
+f2 = open(sys.argv[2],'r')
+
+maxlen=int(sys.argv[3])
+numlines = 0
+
+if len(sys.argv) > 4:
+  numlines = int(sys.argv[4])
+
+count=0
+for line1 in f1:
+  line2 = f2.readline()
+  
+  w1 = len(line1.strip().split())
+  w2 = len(line2.strip().split())
+
+  if w1 <= maxlen and w2 <= maxlen:
+    count = count + 1
+    sys.stdout.write(line1)
+    sys.stderr.write(line2)
+ 
+  if numlines > 0 and count >= numlines:
+    break
+
+f1.close()
+f2.close()
+  
+
diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl
new file mode 100755
index 00000000..da40eb57
--- /dev/null
+++ b/gi/morf-segmentation/morf-pipeline.pl
@@ -0,0 +1,466 @@
+#!/usr/bin/perl -w
+use strict;
+use File::Copy;
+use Cwd qw/ abs_path cwd /;
+use File::Basename;
+use Getopt::Long "GetOptions";
+
+#WARNING.. THIS SCRIPT IS CURRENTLY SOMEWHAT BROKEN. AND UGLY.
+
+# Directory containing this script. The helper scripts below are looked up in
+# it, and it is appended to the module search path.
+my $SCRIPT_DIR;
+BEGIN {
+  $SCRIPT_DIR = dirname(abs_path($0));
+  push @INC, $SCRIPT_DIR;
+}
+
+# External commands.
+my ($GZIP, $ZCAT, $SED) = ('gzip', 'gunzip -c', 'sed -e');
+
+# Helper programs used by the pipeline.
+my $MORF_TRAIN   = "$SCRIPT_DIR/morftrain.sh";
+my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py";
+my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py";
+my $ALIGNER      = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar";
+#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log
+assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER);
+
+# Option defaults; each can be overridden on the command line.
+my $OUTPUT     = './morfwork';
+my $PPL_SRC    = 50;
+my $PPL_TRG    = 50;
+my $MARKER     = "#";
+my $MAX_WORDS  = 40;
+my $SENTENCES;# = 100000;
+my $SPLIT_TYPE = "";
+my $NAME_SHORTCUT;
+
+# Show the usage text and exit when option parsing fails.
+GetOptions(
+  'max_words=i'   => \$MAX_WORDS,
+  'output=s'      => \$OUTPUT,
+  'ppl_src=i'     => \$PPL_SRC,
+  'ppl_trg=i'     => \$PPL_TRG,
+  'sentences=i'   => \$SENTENCES,
+  'marker=s'      => \$MARKER,
+  'split=s'       => \$SPLIT_TYPE,
+  'get_name_only' => \$NAME_SHORTCUT,
+) or usage();
+#if ($NAME_SHORTCUT) {
+#  print STDERR labeled_dir();
+#  exit 0;
+#}
+
+usage() unless scalar @ARGV >= 2;
+
+# Dies with a diagnostic unless $path can be opened for reading.
+# Uses three-argument open with a lexical handle so that a path starting with
+# '>' or ending in '|' is treated as a plain file name, never as a mode or pipe.
+sub assert_readable {
+  my ($path) = @_;
+  open(my $fh, '<', $path) or die "Can't read $path: $!";
+  close($fh);
+  return 1;
+}
+
+# Positional arguments: corpus.src corpus.trg [dev.src dev.trg [test.src test.trg]]
+
+my %CORPUS; # for (src,trg) it has (orig, name, filtered, final)
+
+$CORPUS{'src'}{'orig'} = $ARGV[0];
+assert_readable($CORPUS{'src'}{'orig'});
+$CORPUS{'src'}{'name'} = get_basename($CORPUS{'src'}{'orig'});
+
+$CORPUS{'trg'}{'orig'} = $ARGV[1];
+assert_readable($CORPUS{'trg'}{'orig'});
+$CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'});
+
+
+my %DEV; # for (src,trg) has (orig, final.split final.unsplit
+
+
+#my %TEST_SRC; #original, final
+              #trg has original, final.split final.recombined
+
+my $TEST_SRC;
+my $TEST_TRG;
+
+my $TEST_SRC_name;
+my $TEST_TRG_name;
+
+# The dev set is optional and is only read when both sides are given.
+if (@ARGV >= 4) {
+  $DEV{'src'}{'orig'} = $ARGV[2];
+  assert_readable($DEV{'src'}{'orig'});
+  $DEV{'src'}{'name'} = get_basename($DEV{'src'}{'orig'});
+  $DEV{'trg'}{'orig'} = $ARGV[3];
+  assert_readable($DEV{'trg'}{'orig'});
+  $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'});
+}
+# Likewise for the test set.
+if (@ARGV >= 6) {
+  $TEST_SRC = $ARGV[4];
+  assert_readable($TEST_SRC);
+  $TEST_SRC_name = get_basename($TEST_SRC);
+  $TEST_TRG = $ARGV[5];
+  assert_readable($TEST_TRG);
+  $TEST_TRG_name = get_basename($TEST_TRG);
+}
+
+# These flags are assigned inside split_name(), which runs as part of the
+# processed_dir() call below, so they must be declared before that call.
+my $SPLIT_SRC; #use these to check whether that part is being split
+my $SPLIT_TRG;
+
+#OUTPUT WILL GO IN THESE
+my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir();            #subsampled corpus
+my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting..
+my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models
+my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir();      #segmented corpora+alignments
+my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments';
+
+$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}";
+$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'trg'}{'name'}";
+
+print STDERR "Output: $OUTPUT\n";
+print STDERR "Corpus: $CORPUS_DIR\n";
+print STDERR "Model-src: $MODEL_SRC_DIR\n";
+print STDERR "Model-trg: $MODEL_TRG_DIR\n";
+print STDERR "Finaldir: $PROCESSED_DIR\n";
+
+safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!";
+safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!";
+filter_corpus();
+
+# Every later step writes into these two directories, so fail here with a clear
+# message instead of on the first write.
+safemkdir($PROCESSED_DIR) or die "Couldn't create output directory $PROCESSED_DIR: $!";
+safemkdir($ALIGNMENT_DIR) or die "Couldn't create output directory $ALIGNMENT_DIR: $!";
+
+# Learn and apply a segmentation model for each side that --split selected.
+if ($SPLIT_SRC) {
+  safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!";
+  learn_segmentation("src");
+  apply_segmentation("src");
+}
+if ($SPLIT_TRG) {
+  safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!";
+  learn_segmentation("trg");
+  apply_segmentation("trg");
+}
+
+#copy corpora if they haven't been put in place by splitting operations
+ifne_copy($CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}");
+ifne_copy($CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}");
+# NOTE(review): only the source side of the dev set is put in place here, as the
+# unfinished original statement suggested - confirm whether dev.trg needs it too.
+if ($DEV{'src'}{'orig'}) { ifne_copy($DEV{'src'}{'orig'}, "$PROCESSED_DIR/$DEV{'src'}{'name'}"); }
+if ($TEST_SRC) { ifne_copy($TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name"); }
+# The unsplit reference is a plain copy of the original test target, the same
+# source apply_segmentation() uses when it creates this file.
+if ($TEST_TRG) { ifne_copy($TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.unsplit"); }
+
+
+
+do_align();
+
+system("date");
+print STDERR "All done. You now need to train a language model (if target split), preprocess the test data and put various things where the eval scripts can find them\n\n";
+
+# Copies $src to $dest unless $dest already exists as a plain file.
+# Dies when the copy fails.
+sub ifne_copy {
+  my ($src, $dest) = @_;
+  if (! -f $dest) {
+    copy($src, $dest) or die "Copy failed: $!";
+  }
+}
+
+
+# Length-filters the training corpus with the line stripper, writing the source
+# side to $CORPUS{'src'}{'filtered'} and the target side to
+# $CORPUS{'trg'}{'filtered'}. Does nothing when both files already exist.
+sub filter_corpus {
+  print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n";
+  my ($src_out, $trg_out) = ($CORPUS{'src'}{'filtered'}, $CORPUS{'trg'}{'filtered'});
+  if (-f $src_out && -f $trg_out) {
+    print STDERR "$src_out and $trg_out exist, reusing...\n";
+    return;
+  }
+  # The sentence limit is optional and is passed only when one was requested.
+  my @args = ($CORPUS{'src'}{'orig'}, $CORPUS{'trg'}{'orig'}, $MAX_WORDS);
+  push @args, $SENTENCES if $SENTENCES;
+  # The line stripper emits the source side on stdout and the target side on stderr.
+  my $cmd = join(' ', $LINESTRIPPER, @args) . " 1> $src_out 2> $trg_out";
+  safesystem($cmd) or die "Failed to filter training corpus for length.";
+}
+
+# Learns a segmentation model for one language side.
+# $WHICH is "src" for the source side; any other value selects the target side.
+# The model is trained on the filtered corpus plus the dev and test files of
+# that side, when those were given.
+sub learn_segmentation
+{
+  my ($WHICH) = @_;
+  my $is_src = ($WHICH eq "src");
+  my $side = $is_src ? 'src' : 'trg';
+
+  print STDERR ($is_src
+    ? "\n!!!LEARNING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n"
+    : "\n!!!LEARNING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n");
+
+  my $test   = $is_src ? $TEST_SRC      : $TEST_TRG;
+  my $moddir = $is_src ? $MODEL_SRC_DIR : $MODEL_TRG_DIR;
+  my $ppl    = $is_src ? $PPL_SRC       : $PPL_TRG;
+
+  system("date");
+
+  # Concatenate all available data for this side into one gzipped temp file.
+  my @inputs = ($CORPUS{$side}{'filtered'});
+  my $dev = $DEV{$side}{'orig'};
+  push @inputs, $dev if $dev;
+  push @inputs, $test if $test;
+  my $cmd = "cat " . join(' ', @inputs);
+
+  my $tmpfile = "$CORPUS_DIR/all.tmp.gz";
+  safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning..";
+  learn_segmentation_any($tmpfile, $moddir, $ppl);
+  safesystem("rm $tmpfile");
+}
+
+# Trains a segmentation model on $INPUT_FILE into $SEGOUT_DIR with perplexity
+# threshold $PPL by running the training script. Skipped when
+# $SEGOUT_DIR/segmentation.ready already exists.
+sub learn_segmentation_any {
+  my ($INPUT_FILE, $SEGOUT_DIR, $PPL) = @_;
+  my $SEG_FILE = "$SEGOUT_DIR/segmentation.ready";
+  if (-f $SEG_FILE) {
+    print STDERR "$SEG_FILE exists, reusing...\n";
+    return;
+  }
+  # The marker is quoted for the shell because the default "#" starts a comment.
+  safesystem("$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\"")
+    or die "Failed to learn segmentation model";
+}
+
+# Runs the word aligner on the processed training corpus.
+# Skipped when $ALIGNMENT_DIR/training.align already exists. Otherwise writes
+# the aligner configuration and runs the aligner, logging to
+# $ALIGNMENT_DIR/aligner.log. Dies when symlinking or alignment fails.
+sub do_align {
+  print STDERR "\n!!!WORD ALIGNMENT!!!\n";
+  system("date");
+
+  my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align";
+  if ( -f $ALIGNMENTS ) {
+    print STDERR "$ALIGNMENTS  exists, reusing...\n";
+    return;
+  }
+  my $conf_file = "$ALIGNMENT_DIR/word-align.conf";
+
+  #decorate training files with identifiers to stop the aligner from training on dev and test too
+  #since they are in same directory
+  for my $side ('src', 'trg') {
+    # A previous run that failed during alignment leaves the links behind;
+    # "ln -s" would then fail, so an existing link is reused instead.
+    next if -l "$PROCESSED_DIR/corpus.$side";
+    safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{$side}{'name'} corpus.$side") or die "Failed to symlink: $!";
+  }
+
+  write_wconf($conf_file, $PROCESSED_DIR);
+  safesystem("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log") or die "Failed to run word alignment.";
+
+}
+
+# Applies the learned segmentation model of one language side to the filtered
+# training corpus and, when given, the dev and test files of that side.
+# $WHICH is "src" for the source side; any other value is handled as target.
+# For the target side the test file is written as "<name>.split" and the
+# original is also copied to "<name>.unsplit". The eval script is written
+# afterwards when $WHICH is "src" or "trg".
+sub apply_segmentation {
+  my ($WHICH) = @_;
+
+  if ($WHICH eq "src") {
+    print STDERR "\n!!!APPLYING SEGMENTATION MODEL (SOURCE LANGUAGE)!!!\n";
+    apply_segmentation_any($MODEL_SRC_DIR, $CORPUS{'src'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'src'}{'name'}");
+    apply_segmentation_any($MODEL_SRC_DIR, $DEV{'src'}{'orig'}, "$PROCESSED_DIR/$DEV{'src'}{'name'}")
+      if $DEV{'src'}{'orig'};
+    apply_segmentation_any($MODEL_SRC_DIR, $TEST_SRC, "$PROCESSED_DIR/$TEST_SRC_name")
+      if $TEST_SRC;
+  } else {
+    print STDERR "\n!!!APPLYING SEGMENTATION MODEL (TARGET LANGUAGE)!!!\n";
+    apply_segmentation_any($MODEL_TRG_DIR, $CORPUS{'trg'}{'filtered'}, "$PROCESSED_DIR/$CORPUS{'trg'}{'name'}");
+    if ($DEV{'trg'}{'orig'}) {
+      # Record where the segmented dev target ends up.
+      $DEV{'trg'}{'final'} = "$PROCESSED_DIR/$DEV{'trg'}{'name'}";
+      apply_segmentation_any($MODEL_TRG_DIR, $DEV{'trg'}{'orig'}, $DEV{'trg'}{'final'});
+    }
+    if ($TEST_TRG) {
+      apply_segmentation_any($MODEL_TRG_DIR, $TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.split");
+      copy($TEST_TRG, "$PROCESSED_DIR/$TEST_TRG_name.unsplit") or die "Could not copy unsegmented test set";
+    }
+  }
+
+  write_eval_sh("$PROCESSED_DIR/eval-devtest.sh")
+    if $WHICH eq "src" || $WHICH eq "trg";
+}
+
+# Segments $datfile with the model in $moddir and writes the result to $outfile.
+# Skipped when $outfile already exists. Dies when segmentation fails.
+sub apply_segmentation_any {
+  my($moddir, $datfile, $outfile) = @_;
+  if ( -f $outfile) {
+    print STDERR "$outfile exists, reusing...\n";
+    return;
+  }
+
+  my $args = "$moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\"";
+  # Plain ">" instead of "&>": system() runs the command through /bin/sh, where
+  # "&>" is not portable (a POSIX shell reads it as "&" followed by ">"), and
+  # diagnostics on stderr must not end up inside the segmented corpus.
+  unless (safesystem("cat $datfile | $MORF_SEGMENT $args > $outfile")) {
+    # Remove the partial output, otherwise the next run would reuse it.
+    unlink $outfile;
+    die "Could not segment $datfile";
+  }
+}
+
+# Returns the sentence limit for use in directory names, or "_all" when no
+# limit is set.
+sub beautify_numlines {
+  return $SENTENCES if $SENTENCES;
+  return "_all";
+}
+
+# Returns the name of the filtered-corpus directory, built from the sentence
+# limit and the maximum sentence length: "s<sentences>.w<max_words>".
+sub corpus_dir {
+  return join("", "s", beautify_numlines(), ".w", $MAX_WORDS);
+}
+
+# Returns the model directory name for language side $lang ("src" or "trg"):
+# the corpus directory name followed by the side's perplexity setting and the
+# side itself. Any other value yields "PPLundef".
+sub model_dir {
+  my ($lang) = @_;
+  return corpus_dir() . ".PPL" . $PPL_SRC . ".src" if $lang eq "src";
+  return corpus_dir() . ".PPL" . $PPL_TRG . ".trg" if $lang eq "trg";
+  return "PPLundef";
+}
+
+# Parses $SPLIT_TYPE (t|s|ts|st, the last two being equivalent; empty when no
+# splitting is done) and returns the "sp_..." name component.
+# Side effect: when $SPLIT_TYPE is non-empty, sets $SPLIT_SRC and $SPLIT_TRG
+# according to whether it contains "s" or "t" (case-insensitively).
+sub split_name {
+  #no splitting
+  return "sp_0" unless $SPLIT_TYPE;
+
+  my $type = lc($SPLIT_TYPE);
+  $SPLIT_SRC = $type =~ /s/;
+  $SPLIT_TRG = $type =~ /t/;
+
+  # A side that is not split contributes "0" instead of its perplexity setting.
+  my $src_part = $SPLIT_SRC ? $PPL_SRC : "0";
+  my $trg_part = $SPLIT_TRG ? $PPL_TRG : "0";
+  return "sp_" . $src_part . "_" . $trg_part;
+}
+
+# Returns the name of the directory for processed output: the corpus directory
+# name and the split name, joined by a dot.
+sub processed_dir {
+  return join(".", corpus_dir(), split_name());
+}
+
+# Prints the usage text to standard output and exits with status 1.
+sub usage {
+  my @lines = (
+    "",
+    "Usage: $0 [OPTIONS] corpus.src corpus.trg dev.src dev.trg test.src test.trg",
+    "",
+    "Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus.",
+    "",
+  );
+  print join("\n", @lines), "\n";
+  exit 1;
+};
+
+# Ensures that directory $dir exists. Returns 1 when it already exists,
+# otherwise the result of mkdir (true on success).
+sub safemkdir {
+  my ($dir) = @_;
+  return (-d $dir) ? 1 : mkdir($dir);
+}
+
+# Dies unless every given file exists and can be run.
+# Files ending in ".jar" only need to be readable; everything else must carry
+# the executable bit.
+sub assert_exec {
+  my @files = @_;
+  for my $file (@files) {
+    die "Can't find $file - did you run make?\n" unless -e $file;
+    if ($file =~ /\.jar$/) {
+      # A jar is started through "java -jar", so it is not executed directly.
+      die "Can't read $file" unless -r $file;
+    } else {
+      die "Can't execute $file" unless -x $file;
+    }
+  }
+};
+# Runs a command via system(), echoing it to STDERR first.
+# Returns true when the command exits with status 0 and false otherwise (the
+# non-zero exit code is reported on STDERR). Exits the whole script with
+# status 1 when the command cannot be started or is killed by a signal.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  my $status = $?;
+  if ($status == -1) {
+      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  if ($status & 127) {
+      # The command is passed as an argument rather than interpolated into the
+      # format string, so a "%" inside it cannot be taken as a directive.
+      printf STDERR "ERROR: Execution of: %s\n  died with signal %d, %s coredump\n",
+          "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  my $exitcode = $status >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}
+
+# Returns the last path component of $x.
+# Uses File::Basename (already loaded by this script) instead of spawning the
+# "basename" shell command, so paths containing spaces or shell
+# metacharacters are handled correctly.
+sub get_basename
+{
+  my $x = shift;
+  return basename($x);
+}
+
+# Writes the Berkeley aligner configuration to $filename.
+# $train_dir is the directory given as training source; the aligner's
+# execution directory is $ALIGNMENT_DIR. Dies when the file cannot be opened
+# or completely written.
+sub write_wconf {
+  my ($filename, $train_dir) = @_;
+  # Three-argument open with a lexical handle: the path is never parsed as a mode.
+  open(my $wconf, '>', $filename) or die "Can't write $filename: $!";
+
+#TODO CHANGE ITERATIONS BELOW!!!
+  print $wconf <<EOT;
+## ----------------------
+## This is an example training script for the Berkeley
+## word aligner.  In this configuration it uses two HMM
+## alignment models trained jointly and then decoded
+## using the competitive thresholding heuristic.
+
+##########################################
+# Training: Defines the training regimen
+##########################################
+forwardModels   MODEL1 HMM
+reverseModels   MODEL1 HMM
+mode    JOINT JOINT
+iters   1 1
+
+###############################################
+# Execution: Controls output and program flow
+###############################################
+execDir $ALIGNMENT_DIR
+create
+overwriteExecDir
+saveParams  true
+numThreads  1
+msPerLine   10000
+alignTraining
+
+#################
+# Language/Data
+#################
+foreignSuffix   src
+englishSuffix   trg
+
+# Choose the training sources, which can either be directories or files that list files/directories
+trainSources    $train_dir/
+#trainSources     $train_dir/sources
+testSources     
+sentences   MAX
+
+#################
+# 1-best output
+#################
+competitiveThresholding
+
+EOT
+  # Buffered write errors only surface at close.
+  close($wconf) or die "Can't write $filename: $!";
+}
+
+# Writes a bash evaluation script to $filename.
+# The generated script takes the translation output as its first argument.
+# When the target side is split ($SPLIT_TRG) it evaluates the output against
+# the split reference and a recombined copy against the unsplit reference;
+# otherwise it segments the output with the target model and evaluates both
+# the segmented and the original output.
+# Dies when the file cannot be opened or completely written.
+sub write_eval_sh
+{
+  my ($filename) = @_;
+  open(my $evalfile, '>', $filename) or die "Can't write $filename: $!";
+
+  # Common header, emitted once for both variants.
+  print $evalfile <<EOT;
+#!/bin/bash
+d=`dirname \$0`
+
+EVAL_MAIN=/export/ws10smt/data/eval.sh
+EOT
+
+  if ($SPLIT_TRG) {
+    print $evalfile <<EOT;
+echo "OUTPUT EVALUATION"
+echo "-----------------"
+\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.split
+
+echo "RECOMBINED OUTPUT EVALUATION"
+echo "----------------------------"
+marker="$MARKER"
+cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined"
+
+\$EVAL_MAIN "\$1.recombined" \$d/$TEST_TRG_name.unsplit
+EOT
+
+  } else {
+    # \$MORF_SEGMENT is the segmenter path declared at the top of this script.
+    print $evalfile <<EOT;
+
+echo "ARTIFICIAL SPLIT EVALUATION"
+echo "--------------------------"
+
+MARKER="$MARKER"
+#split the output translation
+cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split"
+
+\$EVAL_MAIN "\$1.split" \$d/$TEST_TRG_name.split
+
+
+echo "DIRECT EVALUATION"
+echo "--------------------------"
+\$EVAL_MAIN "\$1" \$d/$TEST_TRG_name.unsplit
+  
+EOT
+  }
+  close($evalfile) or die "Can't write $filename: $!";
+
+}
+
diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py
new file mode 100755
index 00000000..e5597c0b
--- /dev/null
+++ b/gi/morf-segmentation/morfsegment.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import sys
+import gzip
+
+#usage: morfsegment.py inputvocab.gz segmentation.ready
+#  stdin: the data to segment
+#  stdout: the segmented data
+
+if len(sys.argv) < 3:
+  print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]"
+  print "  stdin: the data to segment"
+  print "  stdout: the segmented data"
+  sys.exit()
+
+#read index:
+split_index={}
+
+marker="#"
+
+if len(sys.argv) > 3:
+  marker=sys.argv[3]
+
+word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz
+seg_vocab=open(sys.argv[2], 'r') #segm.ready..
+
+for seg in seg_vocab:
+  #seg = ver# #wonder\n
+  #wordline = 1 verwonder\n
+  word = word_vocab.readline().strip().split(' ')
+  assert(len(word) == 2)
+  word = word[1]
+  seg=seg.strip()
+
+  if seg != word:
+    split_index[word] = seg
+
+word_vocab.close()
+seg_vocab.close()
+
+for line in sys.stdin:
+  words = line.strip().split()
+
+  newsent = []
+  for word in words:
+    splitword = split_index.get(word, word)
+    newsent.append(splitword)
+
+  print ' '.join(newsent)
+
diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh
new file mode 100755
index 00000000..9004922f
--- /dev/null
+++ b/gi/morf-segmentation/morftrain.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# Only the corpus and the output directory are mandatory; PPL, marker and
+# lines are optional and fall back to the defaults set below.
+if [[ $# -lt 2 ]]; then
+	echo "Trains a morfessor model and places the result in writedir"
+	echo
+	echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]"
+	echo -e "\tcorpus_input_file contains a sentence per line."
+	exit 1
+fi
+
+MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2"
+SCRIPT_DIR=$(dirname `readlink -f $0`)
+
+MORFBINDIR="$MORFESSOR_DIR/bin"
+MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile"
+VOCABEXT="$SCRIPT_DIR/vocabextractor.sh"
+
+# Default morph boundary marker; overridden by the 4th argument.
+MARKER="#"
+
+# Both helper files are needed later on, so fail early when one is missing.
+if [[ ! -f $VOCABEXT ]]; then
+  echo "$VOCABEXT doesn't exist!"
+  exit 1
+fi
+if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then
+  echo "$MORFMAKEFILE_TRAIN doesn't exist!"
+  exit 1
+fi
+
+
+CORPUS="$1"
+WRITETODIR=$2
+
+if [[ ! -f $CORPUS ]]; then
+  echo "$CORPUS doesn't exist!"
+  exit 1
+fi
+
+# Defaults for the optional arguments. LINES=0 means "use the whole corpus".
+PPL=10
+LINES=0
+if [[ $# -gt 2 ]]; then
+  PPL=$3
+fi
+if [[ $# -gt 3 ]]; then
+  MARKER="$4"
+fi
+if [[ $# -gt 4 ]]; then
+  LINES=$5
+fi
+
+mkdir -p $WRITETODIR
+
+#extract vocabulary to train on
+echo "Extracting vocabulary..."
+vocab_gz=$WRITETODIR/inputvocab.gz
+if [[ -f $vocab_gz ]]; then
+  echo " ....$vocab_gz exists, reusing."
+else
+  # The line limit is passed to the extractor only when one was requested.
+  vocab_args=$CORPUS
+  if [[ $LINES -gt 0 ]]; then
+    vocab_args="$vocab_args $LINES"
+  fi
+  $VOCABEXT $vocab_args | gzip > $vocab_gz
+fi
+
+
+#train it
+echo "Training morf model..."
+if [[ -f $WRITETODIR/segmentation.final.gz ]]; then
+  echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.."
+else
+  # Remember where we started. A dedicated name is used because the shell
+  # itself maintains OLDPWD on every cd.
+  START_DIR=`pwd`
+  # Without this check a failed cd would write the Makefile into, and run make
+  # in, whatever directory the script was started from.
+  cd "$WRITETODIR" || { echo "Cannot change to $WRITETODIR"; exit 1; }
+  
+  #put the training Makefile in place, with appropriate modifications
+  sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/"  \
+    -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \
+    -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \
+    "$MORFMAKEFILE_TRAIN" > ./Makefile
+
+  date
+  make > ./trainmorf.log 2>&1
+  cd "$START_DIR" || { echo "Cannot change back to $START_DIR"; exit 1; }
+  
+  
+  echo "Post processing..."
+  #remove comments, counts and morph types
+  #mark morphs
+  
+  if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then
+     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written"
+     exit 1
+  fi
+
+  zcat $WRITETODIR/segmentation.final.gz | \
+    awk '$1 !~ /^#/ {print}' | \
+    cut -d ' ' --complement -f 1 | \
+    sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \
+    > $WRITETODIR/segmentation.ready
+
+  # The redirection above creates the file even when the pipeline fails, so
+  # test for a non-empty result rather than mere existence.
+  if [[ ! -s $WRITETODIR/segmentation.ready ]]; then
+     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.ready not written"
+     exit 1
+  fi
+
+
+
+  echo "Done training."
+  date
+fi
+echo "Segmentation model is $WRITETODIR/segmentation.ready."
+
+
diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh
new file mode 100755
index 00000000..00ae7109
--- /dev/null
+++ b/gi/morf-segmentation/vocabextractor.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+d=$(dirname `readlink -f $0`)
+if [ $# -lt 1 ]; then
+	# stdout carries the vocabulary and is normally redirected or piped, so the
+	# usage text goes to stderr and the misuse yields a non-zero exit status.
+	{
+		echo "Extracts unique words and their frequencies from a subset of a corpus."
+		echo
+		echo "Usage: `basename $0` input_file [number_of_lines] > output_file"
+		echo -e "\tinput_file contains a sentence per line."
+		echo
+		echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
+		echo
+	} >&2
+	exit 1
+fi
+
+srcname=$1
+reallen=0
+
+if [[ $# -gt 1 ]]; then
+  reallen=$2
+fi
+
+pattern_file=$d/invalid_vocab.patterns
+
+if [[ ! -f $pattern_file ]]; then
+  # Reported on stderr so the message cannot end up inside the vocabulary.
+  echo "Pattern file missing" >&2
+  exit 1 
+fi
+
+#egrep -v strips entries from the vocabulary if they match one of the patterns in the pattern file
+#invalid characters are digits and punctuation marks, and words beginning or ending with a dash
+#uniq -c extracts the unique words and counts the occurrences
+
+if [[ $reallen -eq 0 ]]; then
+	#when a zero is passed, use the whole file
+	zcat -f "$srcname"
+else
+	zcat -f "$srcname" | head -n "$reallen"
+fi | sed 's/ /\n/g' | egrep -v -f "$pattern_file" | sort | uniq -c | sed 's/^  *//'
+
+
author	bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 18:03:47 +0000
committer	bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 18:03:47 +0000
commit	c57c05d19fb306f7f50cc02516a8a2901c920cca (patch)
tree	1120643e63ea2b46d6a3bc0b338fb225682c9dd7 /gi/morf-segmentation
parent	58681ee5816d13c04002ca8aebe23c2768da4e5b (diff)