Merge branch 'master' of https://github.com/redpony/cdec

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-10-11 14:06:32 -0400
commit: 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch)
tree: 644496a1690d84d82a396bbc1e39160788beb2cd /gi/morf-segmentation
parent: 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff)
parent: a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff)
7 files changed, 0 insertions, 756 deletions
diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl
deleted file mode 100755
index a78575da..00000000
--- a/gi/morf-segmentation/filter_docs.pl
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/perl
-
-#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries.
-
-#Usage: filter_docs.pl [mark]
-#  STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor
-#  STDOUT: the matching subset, same format
-
-use utf8;
-# Decode stdin/stdout/stderr as UTF-8: \p{L} and \p{M} test code points, so on raw
-# bytes a multi-byte letter (e.g. "é" = C3 A9) would not be recognised as a letter.
-use open qw(:std :utf8);
-my $letter = qr/\p{L}\p{M}*/; # a base letter plus its combining marks; see http://www.regular-expressions.info/unicode.html
-my $morph = qr/$letter+/;     # a morph is a run of one or more letters
-
-my $m = "##"; # marker used to indicate morphemes
-if (@ARGV) {
-   $m = shift @ARGV;          # consume the marker so that <> below reads STDIN
-   utf8::decode($m);          # the marker arrives as bytes; match it as characters
-}
-print STDERR "Using $m to filter for morphemes\n";
-# Optional "morph<m>", any number of "<m>morph<m>", optional "<m>morph", then the tab; \Q..\E escapes the marker.
-my $expr = qr/^($morph\Q$m\E)? ?(\Q$m\E$morph\Q$m\E)* ?(\Q$m\E$morph)?\t/;
-while (my $line = <>) { print $line if $line =~ $expr; }
diff --git a/gi/morf-segmentation/invalid_vocab.patterns b/gi/morf-segmentation/invalid_vocab.patterns
deleted file mode 100644
index 473ce1b1..00000000
--- a/gi/morf-segmentation/invalid_vocab.patterns
+++ /dev/null
@@ -1,6 +0,0 @@
-[[:digit:]]
-[] !"#$%&()*+,./:;<=>?@[\^_`{|}~]
-^'$
--$
-^-
-^$
diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py
deleted file mode 100755
index 04e9044a..00000000
--- a/gi/morf-segmentation/linestripper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-# linestripper   file1 file2 maxlen [numlines]
-#
-# Filters a line-aligned parallel corpus by sentence length: a line pair is
-# kept only when both sides have at most maxlen whitespace-separated tokens.
-# Kept lines of file1 go to stdout and kept lines of file2 go to stderr; if
-# numlines is given and positive, output stops after that many pairs.
-
-# maxlen (argv[3]) is mandatory, so at least four argv entries are needed.
-if len(sys.argv) < 4:
-  print("linestripper   file1 file2 maxlen [numlines]")
-  print(" outputs subset of file1 to stdout, ..of file2 to stderr")
-  sys.exit(1)
-
-maxlen = int(sys.argv[3])
-numlines = 0  # 0 means "no limit"
-if len(sys.argv) > 4:
-  numlines = int(sys.argv[4])
-
-f1 = open(sys.argv[1], 'r')
-f2 = open(sys.argv[2], 'r')
-
-count = 0  # number of line pairs written so far
-for line1 in f1:
-  # NOTE: if file2 is shorter than file1, readline() returns '' (zero tokens).
-  line2 = f2.readline()
-
-  if len(line1.split()) <= maxlen and len(line2.split()) <= maxlen:
-    count = count + 1
-    sys.stdout.write(line1)
-    sys.stderr.write(line2)
-
-  if numlines > 0 and count >= numlines:
-    break
-
-f1.close()
-f2.close()
diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl
deleted file mode 100755
index 46eb5b46..00000000
--- a/gi/morf-segmentation/morf-pipeline.pl
+++ /dev/null
@@ -1,486 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-use File::Copy;
-
-
-# Preprocessing pipeline to take care of word segmentation
-# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data
-# Applies the segmentation where necessary.
-# Learns word alignments on the preprocessed training data.
-# Outputs script files used later to score output.
-
-# Directory holding this script; it is also appended to @INC and used to locate the helper scripts below.
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-
-use Getopt::Long "GetOptions";
-# External commands. Only $GZIP is referenced by the code in this script.
-my $GZIP = 'gzip';
-my $ZCAT = 'gunzip -c';
-my $SED = 'sed -e';
-# Helper scripts that are expected to sit next to this script.
-my $MORF_TRAIN = "$SCRIPT_DIR/morftrain.sh";
-my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py";
-
-my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py";
-my $ALIGNER = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar";
-#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log
-assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER); # dies at startup if any of these paths does not exist
-# Defaults for the command-line options parsed below.
-my $OUTPUT = './morfwork'; # --output: root directory for everything this script writes
-my $PPL_SRC = 50; # --ppl_src: PPL value handed to morftrain.sh for the source side
-my $PPL_TRG = 50; # --ppl_trg: likewise for the target side
-my $MARKER = "#"; # --marker: morph boundary marker; must not occur in the data (see assert_marker)
-my $MAX_WORDS = 40; # --max_words: length limit passed to linestripper.py as maxlen
-my $SENTENCES;# = 100000; --sentences: optional limit on the number of training sentence pairs
-my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string); set with --split
-my $NAME_SHORTCUT; # --get_name_only: parsed, but not read anywhere else in this script
-# An unknown or malformed option makes GetOptions return false, which triggers usage().
-usage() unless &GetOptions('max_words=i' => \$MAX_WORDS,
-                           'output=s' => \$OUTPUT,
-                           'ppl_src=i' => \$PPL_SRC,
-                           'ppl_trg=i' => \$PPL_TRG,
-                           'sentences=i' => \$SENTENCES,
-                           'marker=s' => \$MARKER,
-                           'split=s' => \$SPLIT_TYPE,
-                           'get_name_only' => \$NAME_SHORTCUT,
-                          );
-
-usage() unless scalar @ARGV >= 2;
-
-# File bookkeeping for the three data sets. Each hash is keyed by side ('src' or
-# 'trg') and then by role; 'orig' and 'name' are filled in here, the remaining
-# roles are filled in by later stages of the script.
-my %CORPUS; # for (src,trg) it has (orig, name, filtered, final)
-my %DEV; # for (src,trg) has (orig, name, final)
-my %TEST; # for (src,trg) has (orig, name, finalsplit, finalunsplit)
-
-# The data sets are positional src/trg pairs: corpus = ARGV[0,1], dev = ARGV[2,3],
-# test = ARGV[4,5]. Dev and test are optional; a pair is used only if both of its
-# files are present on the command line.
-for my $data_set ([\%CORPUS, 0], [\%DEV, 2], [\%TEST, 4]) {
-  my ($info, $first_arg) = @$data_set;
-  last if @ARGV < $first_arg + 2; # this pair and all later ones were not given
-
-  for my $offset (0, 1) {
-    my $side = $offset ? 'trg' : 'src';
-    my $path = $ARGV[$first_arg + $offset];
-
-    # Fail early if the file cannot be read. The three-argument open with a
-    # lexical handle keeps characters such as '>' or '|' in a path from being
-    # interpreted as an open mode.
-    open(my $probe, '<', $path) or die "Can't read $path: $!";
-    close($probe);
-
-    # 'name' is the bare file name, reused for the copies in the output directories.
-    $info->{$side}{'orig'} = $path;
-    $info->{$side}{'name'} = get_basename($path);
-  }
-}
-
-my $SPLIT_SRC; #use these to check whether that part is being split
-my $SPLIT_TRG; #both are assigned inside split_name() (only when --split is given), first reached via processed_dir() below
-
-#OUTPUT WILL GO IN THESE
-my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir();            #subsampled corpus
-my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting..
-my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models
-my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir();      #segmented corpora+alignments
-my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments';
-#length-filtered copies of the training corpus, written by filter_corpus()
-$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}";
-$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'trg'}{'name'}";
-#report the chosen locations before any work is done
-print STDERR "Output: $OUTPUT\n";
-print STDERR "Corpus: $CORPUS_DIR\n";
-print STDERR "Model-src: $MODEL_SRC_DIR\n";
-print STDERR "Model-trg: $MODEL_TRG_DIR\n";
-print STDERR "Finaldir: $PROCESSED_DIR\n";
-
-safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!";
-safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!";
-filter_corpus();
-
-safemkdir($PROCESSED_DIR) or die "Couldn't create output directory $PROCESSED_DIR: $!";
-safemkdir($ALIGNMENT_DIR) or die "Couldn't create output directory $ALIGNMENT_DIR: $!";
-
-if ($SPLIT_SRC) {
-  safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!";
-  learn_segmentation("src");
-  apply_segmentation_side("src", $MODEL_SRC_DIR);
-}
-
-#assume that unsplit hypotheses will be scored against an artificially split target test set; thus obtain a target splitting model
-#TODO: add a flag to override this behaviour
-safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!";
-learn_segmentation("trg");
-#the test set is optional (see usage), so every step that needs it is guarded instead of dying when it is absent
-if ($TEST{'trg'}{'orig'}) {
-  $TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}";
-  copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set";
-}
-if ($SPLIT_TRG) {
-  apply_segmentation_side("trg", $MODEL_TRG_DIR);
-} elsif ($TEST{'trg'}{'orig'}) {
-  $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split";
-  apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'});
-}
-write_eval_sh("$PROCESSED_DIR/eval-devtest.sh") if $TEST{'trg'}{'orig'};
-
-#copy corpora if they haven't been put in place by splitting operations
-place_missing_data_side('src');
-place_missing_data_side('trg');
-
-do_align();
-
-if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) {
-  print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n";
-#format is:
-  # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh
-  my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz";
-  $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}";
-  $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh";
-  safesystem("echo '$line' > $PROCESSED_DIR/exp.config");
-}
-system("date");
-print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n";
-
-############################## BILINGUAL ###################################
-
-sub filter_corpus {  # length-filters the training pair with linestripper.py unless both filtered files already exist
-  my ($src_out, $trg_out) = ($CORPUS{'src'}{'filtered'}, $CORPUS{'trg'}{'filtered'});
-  print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n";
-  unless (-f $src_out && -f $trg_out) {
-    my @args = ($CORPUS{'src'}{'orig'}, $CORPUS{'trg'}{'orig'}, $MAX_WORDS);  # file1 file2 maxlen [numlines]
-    push @args, $SENTENCES if $SENTENCES;
-    return safesystem("$LINESTRIPPER @args 1> $src_out 2> $trg_out") || die "Failed to filter training corpus for length.";
-  }
-  print STDERR "$src_out and $trg_out exist, reusing...\n"; return;
-}
-
-sub learn_segmentation {
-  # Trains the segmentation model for one side ("src" or "trg") on the filtered
-  # training corpus plus any dev/test data given for that side.
-  my ($WHICH) = @_;
-  my $is_src = $WHICH eq "src";
-  my $moddir = $is_src ? $MODEL_SRC_DIR : $MODEL_TRG_DIR;
-  my $ppl    = $is_src ? $PPL_SRC : $PPL_TRG;
-
-  # Training data first, then dev and test when they were supplied.
-  my @inputs = ($CORPUS{$WHICH}{'filtered'});
-  my ($dev, $test) = ($DEV{$WHICH}{'orig'}, $TEST{$WHICH}{'orig'});
-  push @inputs, $dev if $dev;
-  push @inputs, $test if $test;
-
-  # Everything is concatenated into one temporary gzipped file for the trainer.
-  my $tmpfile = "$CORPUS_DIR/all.tmp.gz";
-  my $cat_cmd = join(" ", "cat", @inputs);
-  safesystem("$cat_cmd | $GZIP > $tmpfile")
-    or die "Failed to concatenate data for model learning..";
-
-  # The marker must not already occur in the data (assert_marker dies if it does).
-  assert_marker($tmpfile);
-
-  learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH);
-  safesystem("rm $tmpfile");
-}
-
-# Word-aligns the processed training corpus with the Berkeley aligner, then pastes source, target and
-# alignments into corpus.src-trg.al with " ||| " between the columns. Skipped if training.align exists.
-sub do_align {
-  print STDERR "\n!!!WORD ALIGNMENT!!!\n";
-  system("date");
-  my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align";
-  if ( -f $ALIGNMENTS ) {
-    print STDERR "$ALIGNMENTS  exists, reusing...\n";
-    return;
-  }
-  my $conf_file = "$ALIGNMENT_DIR/word-align.conf";
-  #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future.
-  #ln -sf (rather than -s) replaces links left over from an earlier, interrupted run instead of failing on them
-  for my $side ('src', 'trg') {
-    safesystem("cd $PROCESSED_DIR && ln -sf $CORPUS{$side}{'name'} corpus.$side") or die "Failed to symlink: $!";
-  }
-  write_wconf($conf_file, $PROCESSED_DIR);
-  #the exit status of java is not inspected; success is judged by the alignment file alone
-  system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log");
-  if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";}
-
-  my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS";
-  $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al";
-  safesystem($cmd) or die "Failed to paste into aligned corpus file.";
-}
-
-############################# MONOLINGUAL #################################
-
-#copy the necessary data files that weren't placed by segmentation (training corpus, dev set, test set)
-sub place_missing_data_side {
-  my ($side) = @_;
-  ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}");
-  #dev set: copied as-is unless a file of that name is already in place
-  my $dev_dest = $DEV{$side}{'orig'} ? "$PROCESSED_DIR/$DEV{$side}{'name'}" : undef;
-  if (defined $dev_dest && ! -f $dev_dest) {
-    $DEV{$side}{'final'} = $dev_dest;
-    copy($DEV{$side}{'orig'}, $dev_dest) or die "Copy failed: $!";
-  }
-  #test set: also skipped when an unsplit copy has already been recorded
-  return unless $TEST{$side}{'orig'};
-  my $test_dest = "$PROCESSED_DIR/$TEST{$side}{'name'}";
-  return if -f $test_dest or $TEST{$side}{'finalunsplit'};
-  $TEST{$side}{'finalunsplit'} = $test_dest;
-  copy($TEST{$side}{'orig'}, $test_dest) or die "Copy failed: $!";
-}
-
-# Segments the filtered training corpus and, when given, the dev and test sets of
-# $side with the model in $moddir; results are written under the processed directory.
-sub apply_segmentation_side {
-  my ($side, $moddir) = @_;
-  print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n";
-
-  my $train_out = "$PROCESSED_DIR/$CORPUS{$side}{'name'}";
-  apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, $train_out);
-  if (my $dev_in = $DEV{$side}{'orig'}) {
-    apply_segmentation_any($moddir, $dev_in, $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}");
-  }
-  if (my $test_in = $TEST{$side}{'orig'}) {   # segmented test data gets a ".split" suffix
-    apply_segmentation_any($moddir, $test_in, $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split");
-  }
-}
-
-# Runs morftrain.sh on $INPUT_FILE, leaving the model in $SEGOUT_DIR; an existing
-# segmentation.ready there is reused. $LANG only labels the progress message.
-sub learn_segmentation_side {
-  my ($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_;
-  print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n";
-  system("date");
-  my $SEG_FILE = "$SEGOUT_DIR/segmentation.ready";
-  unless (-f $SEG_FILE) {
-    return safesystem(qq{$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL "$MARKER"}) || die "Failed to learn segmentation model";
-  }
-  print STDERR "$SEG_FILE exists, reusing...\n";
-  return;
-}
-
-# Segments $datfile with the model in $moddir into $outfile (an existing $outfile is reused); dies on failure.
-sub apply_segmentation_any {
-  my($moddir, $datfile, $outfile) = @_;
-  if ( -f $outfile) {
-    print STDERR "$outfile exists, reusing...\n";
-    return;
-  }
-  # Perl runs this through /bin/sh, where bash's "&>" is not portable (dash reads it as "&" followed by ">"); use "> file 2>&1".
-  safesystem("cat $datfile | $MORF_SEGMENT $moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\" > $outfile 2>&1") or die "Could not segment $datfile";
-}
-
-##################### PATH FUNCTIONS ##########################
-
-sub beautify_numlines {  # label for the sentence limit in directory names: the limit itself, or "_all" when it is unset (or 0)
-  return $SENTENCES || "_all";
-}
-
-sub corpus_dir {  # e.g. "s_all.w40": the sentence-limit label followed by the maximum sentence length
-  return join("", "s", beautify_numlines(), ".w", $MAX_WORDS);
-}
-
-# Directory name of the segmentation model for one side: the corpus directory
-# name followed by ".PPL<value>.<side>", using that side's PPL setting.
-# Any argument other than "src" or "trg" yields "PPLundef".
-sub model_dir {
-  my ($lang) = @_;
-  my %ppl_for = ('src' => $PPL_SRC, 'trg' => $PPL_TRG);
-
-  return "PPLundef" unless exists $ppl_for{$lang};
-  return corpus_dir() . ".PPL$ppl_for{$lang}.$lang";
-}
-
-sub processed_dir {  # corpus directory name plus the split label from split_name(), e.g. "s_all.w40.sp_0"
-  return join(".", corpus_dir(), split_name());
-}
-
-########################## HELPER FUNCTIONS ############################
-
-# "If not exists" copy: copies $src to $dest unless $dest is already a plain file; dies if the copy fails.
-sub ifne_copy {
-  my ($src, $dest) = @_;
-  return if -f $dest;
-  copy($src, $dest) or die "Copy failed: $!";
-}
-
-# Builds the split label "sp_<src>_<trg>" from SPLIT_TYPE, which may be
-#  t|s|ts|st (last 2 are equiv; letter case is ignored)
-# or empty when no splitting is done, in which case the label is "sp_0".
-# <src> and <trg> are the PPL value of a side that is split and "0" otherwise.
-# Side effect: stores the parsed choice in $SPLIT_SRC and $SPLIT_TRG; neither
-# is touched when SPLIT_TYPE is empty.
-sub split_name {
-  return "sp_0" unless $SPLIT_TYPE;   #no splitting
-
-  # each flag becomes 1 or "" depending on whether its letter occurs
-  my $type = lc($SPLIT_TYPE);
-  $SPLIT_SRC = $type =~ /s/;
-  $SPLIT_TRG = $type =~ /t/;
-
-  my $src_part = $SPLIT_SRC ? $PPL_SRC : "0";
-  my $trg_part = $SPLIT_TRG ? $PPL_TRG : "0";
-
-  return "sp_" . $src_part . "_" . $trg_part;
-}
-
-sub usage {  # prints the command-line synopsis on STDOUT, then exits with status 1
-  my @lines = (
-    "",
-    "Usage: $0 [OPTIONS] corpus.src corpus.trg [dev.src dev.trg [test.src test.trg]]",
-    "",
-    "Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus.",
-    "",
-  );
-  print join("\n", @lines), "\n"; exit 1;
-};
-
-# Ensures $dir exists: returns 1 if it is already a directory, otherwise the result of mkdir (false on failure, with $! set).
-sub safemkdir {
-  my ($dir) = @_;
-  return -d $dir ? 1 : mkdir($dir);
-}
-
-# Dies unless every given path exists. The former second check repeated the same -e test and could never fire, so it is gone.
-# NOTE(review): a -x test may have been intended, but callers also pass a .jar file, which need not be executable - confirm first.
-sub assert_exec {
-  for my $file (@_) {
-    die "Can't find $file - did you run make?\n" unless -e $file;
-  }
-};
-# Echoes a command to STDERR and runs it via system(); returns true iff it exited with status 0.
-# If it could not be started or died from a signal, reports that on STDERR and exits with 1.
-sub safesystem {
-  print STDERR "Executing: @_\n";
-  system(@_);
-  if ($? == -1) {
-    print STDERR "ERROR: Failed to execute: @_\n  $!\n";
-    exit(1);
-  }
-  if (my $signal = $? & 127) {
-    # pass the command as a %s argument so that a '%' inside it cannot corrupt the format
-    printf STDERR "ERROR: Execution of: %s\n  died with signal %d, %s coredump\n", "@_", $signal, ($? & 128) ? 'with' : 'without';
-    exit(1);
-  }
-  my $exitcode = $? >> 8;
-  print STDERR "Exit code: $exitcode\n" if $exitcode;
-  return ! $exitcode;
-}
-
-# Last path component of $x, like basename(1) but without a shell, so paths with spaces or metacharacters are safe.
-sub get_basename {
-  my ($x) = @_;
-  return "" unless defined $x && length $x;   # basename(1) printed nothing for a missing operand
-  require File::Basename;                      # normally already loaded by the script's BEGIN block
-  return File::Basename::basename($x);
-}
-
-sub assert_marker {  # dies if a line of the gzipped $file contains $MARKER; literal match, no shell or regex involved
-  my ($file, $hits) = (shift, 0);
-  open(my $in, '-|', 'gunzip', '-c', $file) or die "Cannot read $file: $!";
-  while (my $line = <$in>) { $hits++ if index($line, $MARKER) >= 0; }
-  close($in) or die "Cannot read $file (gunzip failed)"; print "$hits\n";   # the count is still printed, as before
-  die "Data contains marker '$MARKER'; use something else." if $hits;
-}
-########################### Dynamic config files ##############################
-
-sub write_wconf {  # writes the Berkeley aligner configuration to $filename; $train_dir becomes its trainSources directory
-  my ($filename, $train_dir) = @_;
-  open(my $wconf, '>', $filename) or die "Can't write $filename: $!";
-  # everything up to EOT is written verbatim, with $ALIGNMENT_DIR and $train_dir interpolated
-  print {$wconf} <<EOT;
-## ----------------------
-## This is an example training script for the Berkeley
-## word aligner.  In this configuration it uses two HMM
-## alignment models trained jointly and then decoded
-## using the competitive thresholding heuristic.
-
-##########################################
-# Training: Defines the training regimen
-##########################################
-forwardModels   MODEL1 HMM
-reverseModels   MODEL1 HMM
-mode    JOINT JOINT
-iters   5 5
-
-###############################################
-# Execution: Controls output and program flow
-###############################################
-execDir $ALIGNMENT_DIR
-create
-overwriteExecDir
-saveParams  true
-numThreads  1
-msPerLine   10000
-alignTraining
-
-#################
-# Language/Data
-#################
-foreignSuffix   src
-englishSuffix   trg
-
-# Choose the training sources, which can either be directories or files that list files/directories
-trainSources    $train_dir/
-#trainSources     $train_dir/sources
-testSources     
-sentences   MAX
-
-#################
-# 1-best output
-#################
-competitiveThresholding
-
-EOT
-  close($wconf) or die "Can't write $filename: $!";
-}
-
-# Writes the bash evaluation script $filename. The generated script scores the file
-# given as its first argument with eval.sh against the target test set, once in
-# segmented and once in unsegmented form; $SPLIT_TRG decides which form needs converting.
-sub write_eval_sh {
-  my ($filename) = @_;
-  open(my $evalfile, '>', $filename) or die "Can't write $filename: $!";
-  print {$evalfile} <<EOT;
-#!/bin/bash
-
-EVAL_MAIN=/export/ws10smt/data/eval.sh
-marker="$MARKER"
-EOT
-  if ($SPLIT_TRG) {
-    # target is segmented: score the output as is, then again with the markers removed
-    print {$evalfile} <<EOT;
-echo "OUTPUT EVALUATION"
-echo "-----------------"
-\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalsplit'}
-
-echo "RECOMBINED OUTPUT EVALUATION"
-echo "----------------------------"
-cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined"
-
-\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'}
-EOT
-  } else {
-    # target is unsegmented: score an artificially segmented copy, then the output as is
-    # (\$marker is the shell variable set in the script header; \$MARKER was never defined there)
-    print {$evalfile} <<EOT;
-echo "ARTIFICIAL SPLIT EVALUATION"
-echo "--------------------------"
-
-#split the output translation
-cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$marker" > "\$1.split"
-
-\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'}
-
-echo "DIRECT EVALUATION"
-echo "--------------------------"
-\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'}
-  
-EOT
-  }
-  close($evalfile) or die "Can't write $filename: $!";
-}
-
-
-
-
diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py
deleted file mode 100755
index 85b9d4fb..00000000
--- a/gi/morf-segmentation/morfsegment.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/python
-
-import sys
-import gzip
-
-#usage: morfsegment.py inputvocab.gz segmentation.ready [marker]
-#  stdin: the data to segment
-#  stdout: the segmented data
-#
-# inputvocab.gz and segmentation.ready are read in parallel, line by line:
-# line i of the vocabulary ("count word") names the word whose segmentation
-# is line i of segmentation.ready. Words whose segmentation differs from the
-# word itself are replaced in the input; all other tokens are passed through
-# unchanged.
-
-if len(sys.argv) < 3:
-  print("usage: morfsegment.py inputvocab.gz segmentation.ready [marker]")
-  print("  stdin: the data to segment")
-  print("  stdout: the segmented data")
-  sys.exit(1)  # a usage error must not look like success to the calling pipeline
-
-# The marker is accepted for interface compatibility but is not used below:
-# the lines of segmentation.ready already carry the marked-up morphs.
-marker = "##"
-if len(sys.argv) > 3:
-  marker = sys.argv[3]
-
-word_vocab = gzip.open(sys.argv[1], 'rb') #inputvocab.gz
-seg_vocab = open(sys.argv[2], 'r') #segm.ready..
-
-#read index: word -> segmentation, only for words that actually get split
-split_index = {}
-for seg in seg_vocab:
-  #seg = ver# #wonder\n
-  #wordline = 1 verwonder\n
-  fields = word_vocab.readline().strip().split(' ')
-  assert len(fields) == 2, "malformed vocabulary line"
-  word = fields[1]
-  seg = seg.strip()
-  if seg != word:
-    split_index[word] = seg
-
-word_vocab.close()
-seg_vocab.close()
-
-for line in sys.stdin:
-  # unknown and unsplit words map to themselves
-  newsent = [split_index.get(word, word) for word in line.split()]
-  sys.stdout.write(' '.join(newsent) + '\n')
-
diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh
deleted file mode 100755
index 9004922f..00000000
--- a/gi/morf-segmentation/morftrain.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/bash
-
-# Trains a Morfessor model on the vocabulary of a corpus. The results, including
-# inputvocab.gz and segmentation.ready, are left in writedir.
-
-# Only the corpus and the output directory are mandatory: PPL, marker and lines
-# all have defaults (see below), exactly as the usage line advertises.
-if [[ $# -lt 2 ]]; then
-	echo "Trains a morfessor model and places the result in writedir"
-	echo
-	echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]"
-	echo -e "\tcorpus_input_file contains a sentence per line."
-	exit 1
-fi
-
-MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2"
-SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-
-MORFBINDIR="$MORFESSOR_DIR/bin"
-MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile"
-VOCABEXT="$SCRIPT_DIR/vocabextractor.sh"
-
-# The vocabulary extractor and the Morfessor training Makefile must be installed.
-if [[ ! -f $VOCABEXT ]]; then
-  echo "$VOCABEXT doesn't exist!"
-  exit 1
-fi
-if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then
-  echo "$MORFMAKEFILE_TRAIN doesn't exist!"
-  exit 1
-fi
-
-# Positional arguments. "${N-default}" yields the default only when argument N
-# was not passed at all, so an explicitly passed value is always honoured.
-CORPUS="$1"
-WRITETODIR="$2"
-PPL="${3-10}"      # written into PPLTHRESH of the training Makefile
-MARKER="${4-#}"    # marks morph boundaries in segmentation.ready
-LINES="${5-0}"     # corpus lines to read the vocabulary from; 0 = whole corpus
-
-# The corpus itself must exist as well.
-if [[ ! -f $CORPUS ]]; then
-  echo "$CORPUS doesn't exist!"
-  exit 1
-fi
-
-# From here on CORPUS, WRITETODIR, PPL, MARKER and LINES are all set; the rest
-# of the script extracts the vocabulary and trains the model.
-
-mkdir -p "$WRITETODIR"
-#extract vocabulary to train on (an existing inputvocab.gz is reused)
-echo "Extracting vocabulary..."
-if [[ -f $WRITETODIR/inputvocab.gz ]]; then
-  echo " ....$WRITETODIR/inputvocab.gz exists, reusing."
-else
-  #pass the line limit on only when one was requested; expansions are quoted so paths with spaces survive
-  extract_args=("$CORPUS")
-  if [[ $LINES -gt 0 ]]; then
-    extract_args+=("$LINES")
-  fi
-  "$VOCABEXT" "${extract_args[@]}" | gzip > "$WRITETODIR/inputvocab.gz"
-fi
-
-
-#train it
-echo "Training morf model..."
-if [[ -f $WRITETODIR/segmentation.final.gz ]]; then
-  echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.."
-else
-  #run make inside a subshell so that the working directory of this script is left
-  #untouched and a failing cd cannot write the Makefile into the wrong place
-  (
-    cd "$WRITETODIR" || exit 1
-
-    #put the training Makefile in place, with appropriate modifications
-    sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/"  \
-      -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \
-      -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \
-      "$MORFMAKEFILE_TRAIN" > ./Makefile
-
-    date
-    make > ./trainmorf.log 2>&1
-  )
-
-  echo "Post processing..."
-  #remove comments, counts and morph types
-  #mark morphs
-  if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then
-     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written"
-     exit 1
-  fi
-
-  zcat "$WRITETODIR/segmentation.final.gz" | \
-    awk '$1 !~ /^#/ {print}' | \
-    cut -d ' ' --complement -f 1 | \
-    sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \
-    > "$WRITETODIR/segmentation.ready"
-
-  #the redirection above always creates the file, so test for content (-s)
-  #rather than mere existence, and name the file that is actually missing
-  if [[ ! -s $WRITETODIR/segmentation.ready ]]; then
-     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.ready not written"
-     exit 1
-  fi
-
-  echo "Done training."
-  date
-fi
-echo "Segmentation model is $WRITETODIR/segmentation.ready."
-
diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh
deleted file mode 100755
index 00ae7109..00000000
--- a/gi/morf-segmentation/vocabextractor.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-d=$(dirname "$(readlink -f "$0")")
-if [ $# -lt 1 ]; then
-	echo "Extracts unique words and their frequencies from a subset of a corpus."
-	echo
-	echo "Usage: `basename $0` input_file [number_of_lines] > output_file"
-	echo -e "\tinput_file contains a sentence per line."
-	echo
-	echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor."
-	echo
-	exit
-fi
-
-srcname=$1
-reallen=${2-0}   # number of leading lines to use; 0 (the default) means the whole file
-
-pattern_file=$d/invalid_vocab.patterns
-
-if [[ ! -f $pattern_file ]]; then
-  # stdout carries the vocabulary (callers pipe it into gzip), so report errors on stderr
-  echo "Pattern file missing" >&2
-  exit 1
-fi
-
-# Emits the first $reallen lines of the (optionally gzipped) input, or all of it when $reallen is 0.
-read_input() {
-  if [[ $reallen -eq 0 ]]; then
-    zcat -f "$srcname"
-  else
-    zcat -f "$srcname" | head -n "$reallen"
-  fi
-}
-
-#tr puts one token per line (portable, unlike a "\n" in a sed replacement);
-#egrep -v drops every token matching a pattern in $pattern_file
-#(digits, punctuation, leading/trailing dashes, a lone apostrophe, empty tokens);
-#sort | uniq -c counts each remaining word and the final sed strips uniq's leading padding
-read_input | tr ' ' '\n' | egrep -v -f "$pattern_file" | sort | uniq -c | sed 's/^  *//'
-
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-10-11 14:06:32 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-10-11 14:06:32 -0400
commit	07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch)
tree	644496a1690d84d82a396bbc1e39160788beb2cd /gi/morf-segmentation
parent	37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff)
parent	a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff)