diff options
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 | 
| commit | 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch) | |
| tree | 644496a1690d84d82a396bbc1e39160788beb2cd /gi/morf-segmentation | |
| parent | 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff) | |
| parent | a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff) | |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/morf-segmentation')
| -rwxr-xr-x | gi/morf-segmentation/filter_docs.pl | 24 | ||||
| -rw-r--r-- | gi/morf-segmentation/invalid_vocab.patterns | 6 | ||||
| -rwxr-xr-x | gi/morf-segmentation/linestripper.py | 40 | ||||
| -rwxr-xr-x | gi/morf-segmentation/morf-pipeline.pl | 486 | ||||
| -rwxr-xr-x | gi/morf-segmentation/morfsegment.py | 50 | ||||
| -rwxr-xr-x | gi/morf-segmentation/morftrain.sh | 110 | ||||
| -rwxr-xr-x | gi/morf-segmentation/vocabextractor.sh | 40 | 
7 files changed, 0 insertions, 756 deletions
| diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl deleted file mode 100755 index a78575da..00000000 --- a/gi/morf-segmentation/filter_docs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries. - -#Usage: filter_docs.pl [mark] -#  STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor -#  STDOUT: the matching subset, same format - -use utf8; -my $letter=qr/\p{L}\p{M}*/; # see http://www.regular-expressions.info/unicode.html - -my $morph=qr/$letter+/; - -my $m = "##"; # marker used to indicate morphemes -if ((scalar @ARGV) >= 1) { -   $m = $ARGV[0]; -   shift; -} -print STDERR "Using $m to filter for morphemes\n"; - -my $expr = qr/^($morph\Q$m\E)? ?(\Q$m\E$morph\Q$m\E)* ?(\Q$m\E$morph)?\t/; #\Q and \E bounded sections are escaped -while(<>) { -   /$expr/ && print; -} diff --git a/gi/morf-segmentation/invalid_vocab.patterns b/gi/morf-segmentation/invalid_vocab.patterns deleted file mode 100644 index 473ce1b1..00000000 --- a/gi/morf-segmentation/invalid_vocab.patterns +++ /dev/null @@ -1,6 +0,0 @@ -[[:digit:]] -[] !"#$%&()*+,./:;<=>?@[\^_`{|}~] -^'$ --$ -^- -^$ diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py deleted file mode 100755 index 04e9044a..00000000 --- a/gi/morf-segmentation/linestripper.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/python - -import sys - -#linestripper   file file maxlen [numlines] - -if len(sys.argv) < 3: -  print "linestripper   file1 file2 maxlen [numlines]"  -  print " outputs subset of file1 to stdout, ..of file2 to stderr" -  sys.exit(1) - - -f1 = open(sys.argv[1],'r') -f2 = open(sys.argv[2],'r') - -maxlen=int(sys.argv[3]) -numlines = 0 - -if len(sys.argv) > 4: -  numlines = int(sys.argv[4]) - -count=0 -for line1 in f1: -  line2 = f2.readline() -   -  w1 = len(line1.strip().split()) -  w2 = len(line2.strip().split()) - -  if w1 <= maxlen and w2 <= maxlen: -    count = count + 1 -    sys.stdout.write(line1) -    sys.stderr.write(line2) -  -  if numlines > 0 and count >= numlines: -    break - -f1.close() -f2.close() -   - diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl deleted file mode 100755 index 46eb5b46..00000000 --- a/gi/morf-segmentation/morf-pipeline.pl +++ /dev/null @@ -1,486 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use File::Copy; - - -# Preprocessing pipeline to take care of word segmentation -# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data -# Applies the segmentation where necessary. -# Learns word alignments on the preprocessed training data. -# Outputs script files used later to score output. - - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -use Getopt::Long "GetOptions"; - -my $GZIP = 'gzip'; -my $ZCAT = 'gunzip -c'; -my $SED = 'sed -e'; - -my $MORF_TRAIN = "$SCRIPT_DIR/morftrain.sh"; -my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py"; - -my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py"; -my $ALIGNER = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar"; -#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log -assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER); - -my $OUTPUT = './morfwork'; -my $PPL_SRC = 50; -my $PPL_TRG = 50; -my $MARKER = "#"; -my $MAX_WORDS = 40; -my $SENTENCES;# = 100000; -my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string) -my $NAME_SHORTCUT; - -usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, -                           'output=s' => \$OUTPUT, -                           'ppl_src=i' => \$PPL_SRC, -                           'ppl_trg=i' => \$PPL_TRG, -                           'sentences=i' => \$SENTENCES, -                           'marker=s' => \$MARKER, -                           'split=s' => \$SPLIT_TYPE, -                           'get_name_only' => \$NAME_SHORTCUT, -                          ); - -usage() unless scalar @ARGV >= 2; - -my %CORPUS; # for (src,trg) it has (orig, name, filtered, final) - -$CORPUS{'src'}{'orig'} = $ARGV[0]; -open F, "<$CORPUS{'src'}{'orig'}" or die "Can't read $CORPUS{'src'}{'orig'}: $!"; close F; -$CORPUS{'src'}{'name'} = get_basename($CORPUS{'src'}{'orig'}); - -$CORPUS{'trg'}{'orig'} = $ARGV[1]; -open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F; -$CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); - -my %DEV; # for (src,trg) has (orig, final.split final.unsplit -if (@ARGV >= 4) { -  $DEV{'src'}{'orig'} = $ARGV[2]; -  open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; -  $DEV{'src'}{'name'} = get_basename($DEV{'src'}{'orig'}); -  $DEV{'trg'}{'orig'} = $ARGV[3]; -  open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F; -  $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'}); -} - -my %TEST; # for (src,trg) has (orig, name)  -if (@ARGV >= 6) { -  $TEST{'src'}{'orig'} = $ARGV[4]; -  open F, "<$TEST{'src'}{'orig'}" or die "Can't read $TEST{'src'}{'orig'}: $!"; close F; -  $TEST{'src'}{'name'} = get_basename($TEST{'src'}{'orig'}); -  $TEST{'trg'}{'orig'} = $ARGV[5]; -  open F, "<$TEST{'trg'}{'orig'}" or die "Can't read $TEST{'trg'}{'orig'}: $!"; close F; -  $TEST{'trg'}{'name'} = get_basename($TEST{'trg'}{'orig'}); -} - -my $SPLIT_SRC; #use these to check whether that part is being split -my $SPLIT_TRG; - -#OUTPUT WILL GO IN THESE -my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir();            #subsampled corpus -my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting.. -my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models -my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir();      #segmented copora+alignments -my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments'; - -$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}"; -$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'trg'}{'name'}"; - -print STDERR "Output: $OUTPUT\n"; -print STDERR "Corpus: $CORPUS_DIR\n"; -print STDERR "Model-src: $MODEL_SRC_DIR\n"; -print STDERR "Model-trg: $MODEL_TRG_DIR\n"; -print STDERR "Finaldir: $PROCESSED_DIR\n"; - -safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; -safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!"; -filter_corpus(); - -safemkdir($PROCESSED_DIR); -safemkdir($ALIGNMENT_DIR); - -if ($SPLIT_SRC) { -  safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!"; -  learn_segmentation("src"); -  apply_segmentation_side("src", $MODEL_SRC_DIR);   -} - -#assume that unsplit hypotheses will be scored against an aritificially split target test set; thus obtain a target splitting model   -#TODO: add a flag to override this behaviour -safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; -learn_segmentation("trg"); -$TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}"; -copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set";   - -if ($SPLIT_TRG) { -  apply_segmentation_side("trg", $MODEL_TRG_DIR);   -  } else { -  $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split"; -  apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'});   -} - -write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); - -#copy corpora if they haven't been put in place by splitting operations -place_missing_data_side('src'); -place_missing_data_side('trg'); - -do_align(); - -if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) { -  print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n"; -#format is: -  # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh -  my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz"; -  $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}"; -  $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh"; -  safesystem("echo '$line' > $PROCESSED_DIR/exp.config"); -} - -system("date"); -print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n"; - -############################## BILINGUAL ################################### - -sub filter_corpus { -  print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; -  if ( -f $CORPUS{'src'}{'filtered'} && -f $CORPUS{'trg'}{'filtered'}) { -    print STDERR "$CORPUS{'src'}{'filtered'} and $CORPUS{'trg'}{'filtered'} exist, reusing...\n"; -    return; -  } -  my $args = "$CORPUS{'src'}{'orig'} $CORPUS{'trg'}{'orig'} $MAX_WORDS"; -  if ($SENTENCES) { $args = $args . " $SENTENCES"; }  -  safesystem("$LINESTRIPPER $args 1> $CORPUS{'src'}{'filtered'} 2> $CORPUS{'trg'}{'filtered'}") or die "Failed to filter training corpus for length."; -} - -sub learn_segmentation -{ -  my $WHICH = shift; -  my $corpus; my $dev; my $test; my $moddir;  my $ppl; - -  $corpus = $CORPUS{$WHICH}{'filtered'}; -  $dev = $DEV{$WHICH}{'orig'}; -  $test = $TEST{$WHICH}{'orig'}; - -  if ($WHICH eq "src") { -    $moddir = $MODEL_SRC_DIR; -    $ppl = $PPL_SRC; -  } else { -    $moddir = $MODEL_TRG_DIR; -    $ppl = $PPL_TRG; -  } -  my $cmd = "cat $corpus"; -  if ($dev) { $cmd = "$cmd $dev"; } -  if ($test) { $cmd = "$cmd $test"; } -  my $tmpfile = "$CORPUS_DIR/all.tmp.gz"; -  safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; -  assert_marker($tmpfile); - -  learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH); -  safesystem("rm $tmpfile"); -} - -sub do_align { -  print STDERR "\n!!!WORD ALIGNMENT!!!\n"; -  system("date"); - -  my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align"; -  if ( -f $ALIGNMENTS ) { -    print STDERR "$ALIGNMENTS  exists, reusing...\n"; -    return; -  }  -  my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; -     -  #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future. -  safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!"; -  safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!"; - -  write_wconf($conf_file, $PROCESSED_DIR);   -  system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log"); - -  if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";} - -  my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS"; -  $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al"; -  safesystem($cmd) or die "Failed to paste into aligned corpus file."; - -} - -############################# MONOLINGUAL ################################# - -#copy the necessary data files that weren't place by segmentation -sub place_missing_data_side { -  my $side = shift; - -  ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}") ; - -  if ($DEV{$side}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{$side}{'name'}") { -    $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; -    copy($DEV{$side}{'orig'}, $DEV{$side}{'final'}) or die "Copy failed: $!"; -  } - -  if ($TEST{$side}{'orig'} && ! -f "$PROCESSED_DIR/$TEST{$side}{'name'}" && ! $TEST{$side}{'finalunsplit'}) { -    $TEST{$side}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}"; -    copy($TEST{$side}{'orig'}, $TEST{$side}{'finalunsplit'}) or die "Copy failed: $!"; -  } - -} - -sub apply_segmentation_side { -  my ($side, $moddir) = @_; -  -  print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n"; -  apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}"); -  if ($DEV{$side}{'orig'}) { -     $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; -    apply_segmentation_any($moddir, $DEV{$side}{'orig'}, "$DEV{$side}{'final'}"); -  } -  if ($TEST{$side}{'orig'}) { -    $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split"; -    apply_segmentation_any($moddir, $TEST{$side}{'orig'}, $TEST{$side}{'finalsplit'} ); -  }  - -} - -sub learn_segmentation_side { -  my($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_; - -  print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n"; -  system("date"); -  my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; -   if ( -f $SEG_FILE) { -    print STDERR "$SEG_FILE exists, reusing...\n"; -    return; -  } -  my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; -  safesystem($cmd) or die "Failed to learn segmentation model"; -} - -sub apply_segmentation_any { -  my($moddir, $datfile, $outfile) = @_; -  if ( -f $outfile) { -    print STDERR "$outfile exists, reusing...\n"; -    return; -  } -   -  my $args = "$moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\""; -  safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile"; -} - -##################### PATH FUNCTIONS ########################## - -sub beautify_numlines { -  return ($SENTENCES ? $SENTENCES : "_all"); -} - -sub corpus_dir { -  return "s" . beautify_numlines() . ".w" . $MAX_WORDS; -} - -sub model_dir { -  my $lang = shift; -  if ($lang eq "src") {  -    return corpus_dir() . ".PPL" . $PPL_SRC . ".src"; -  } elsif ($lang eq "trg") { -    return corpus_dir() .  ".PPL" . $PPL_TRG . ".trg"; -  } else { -    return "PPLundef"; -  }     -} - -sub processed_dir { -  return corpus_dir() . "." . split_name(); -} - -########################## HELPER FUNCTIONS ############################ - -sub ifne_copy { -  my ($src, $dest) = @_; -  if (! -f $dest) { -    copy($src, $dest) or die "Copy failed: $!"; -  } -} - -sub split_name { -  #parses SPLIT_TYPE, which can have the following values -  # t|s|ts|st (last 2 are equiv) -  # or is undefined when no splitting is done -  my $name = ""; -   -  if ($SPLIT_TYPE) {  -    $SPLIT_SRC = lc($SPLIT_TYPE) =~ /s/; -    $SPLIT_TRG = lc($SPLIT_TYPE) =~ /t/; -    $name = $name . ($SPLIT_SRC ? $PPL_SRC : "0"); -    $name = $name . "_" . ($SPLIT_TRG ? $PPL_TRG : "0");  -  } else { -    #no splitting -    $name = "0"; -  } - -  return "sp_" . $name; -   -} - -sub usage { -  print <<EOT; - -Usage: $0 [OPTIONS] corpus.src corpus.trg [dev.src dev.trg [test.src test.trg]] - -Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus. - -EOT -  exit 1; -}; - -sub safemkdir { -  my $dir = shift; -  if (-d $dir) { return 1; } -  return mkdir($dir); -} - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; -sub safesystem { -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    print STDERR "Exit code: $exitcode\n" if $exitcode; -    return ! $exitcode; -  } -} - -sub get_basename -{ -  my $x = shift; -  $x = `basename $x`; -  $x =~ s/\n//; -  return $x; -} - -sub assert_marker { -  my $file = shift; -  my $result = `zcat $file| grep '$MARKER' | wc -l` or die "Cannot read $file: $!"; -  print $result;  -  if (scalar($result) != 0) { die "Data contains marker '$MARKER'; use something else.";} -} -########################### Dynamic config files ############################## - -sub write_wconf { -  my ($filename, $train_dir) = @_; -  open WCONF, ">$filename" or die "Can't write $filename: $!"; - -  print WCONF <<EOT; -## ---------------------- -## This is an example training script for the Berkeley -## word aligner.  In this configuration it uses two HMM -## alignment models trained jointly and then decoded -## using the competitive thresholding heuristic. - -########################################## -# Training: Defines the training regimen -########################################## -forwardModels   MODEL1 HMM -reverseModels   MODEL1 HMM -mode    JOINT JOINT -iters   5 5 - -############################################### -# Execution: Controls output and program flow -############################################### -execDir $ALIGNMENT_DIR -create -overwriteExecDir -saveParams  true -numThreads  1 -msPerLine   10000 -alignTraining - -################# -# Language/Data -################# -foreignSuffix   src -englishSuffix   trg - -# Choose the training sources, which can either be directories or files that list files/directories -trainSources    $train_dir/ -#trainSources     $train_dir/sources -testSources      -sentences   MAX - -################# -# 1-best output -################# -competitiveThresholding - -EOT -  close WCONF; -} - -sub write_eval_sh -{ -  my ($filename) = @_; -  open EVALFILE, ">$filename" or die "Can't write $filename: $!"; - -  print EVALFILE <<EOT; -#!/bin/bash - -EVAL_MAIN=/export/ws10smt/data/eval.sh -marker="$MARKER" -EOT - -  if ($SPLIT_TRG) { -    print EVALFILE <<EOT; -echo "OUTPUT EVALUATION" -echo "-----------------" -\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalsplit'} - -echo "RECOMBINED OUTPUT EVALUATION" -echo "----------------------------" -cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined" - -\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'} -EOT - -  } else { -    print EVALFILE <<EOT; -echo "ARTIFICIAL SPLIT EVALUATION" -echo "--------------------------" - -#split the output translation -cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" - -\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'} - -echo "DIRECT EVALUATION" -echo "--------------------------" -\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'} -   -EOT - -  } -  close EVALFILE; - -} - - - - diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py deleted file mode 100755 index 85b9d4fb..00000000 --- a/gi/morf-segmentation/morfsegment.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/python - -import sys -import gzip - -#usage: morfsegment.py inputvocab.gz segmentation.ready -#  stdin: the data to segment -#  stdout: the segmented data - -if len(sys.argv) < 3: -  print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]" -  print "  stdin: the data to segment" -  print "  stdout: the segmented data" -  sys.exit() - -#read index: -split_index={} - -marker="##" - -if len(sys.argv) > 3: -  marker=sys.argv[3] - -word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz -seg_vocab=open(sys.argv[2], 'r') #segm.ready.. - -for seg in seg_vocab: -  #seg = ver# #wonder\n -  #wordline = 1 verwonder\n -  word = word_vocab.readline().strip().split(' ') -  assert(len(word) == 2) -  word = word[1] -  seg=seg.strip() - -  if seg != word: -    split_index[word] = seg - -word_vocab.close() -seg_vocab.close() - -for line in sys.stdin: -  words = line.strip().split() - -  newsent = [] -  for word in words: -    splitword = split_index.get(word, word) -    newsent.append(splitword) - -  print ' '.join(newsent) - diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh deleted file mode 100755 index 9004922f..00000000 --- a/gi/morf-segmentation/morftrain.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash - -if [[ $# -lt 3 ]]; then -	echo "Trains a morfessor model and places the result in writedir" -	echo -	echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]" -	echo -e "\tcorpus_input_file contains a sentence per line." -	exit 1 -fi - -MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2" -SCRIPT_DIR=$(dirname `readlink -f $0`) - -MORFBINDIR="$MORFESSOR_DIR/bin" -MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile" -VOCABEXT="$SCRIPT_DIR/vocabextractor.sh" - -MARKER="#" - -if [[ ! -f $VOCABEXT ]]; then -  echo "$VOCABEXT doesn't exist!" -  exit 1 -fi -if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then -  echo "$MORFMAKEFILE_TRAIN doesn't exist!" -  exit 1 -fi - - -CORPUS="$1" -WRITETODIR=$2 - -if [[ ! -f $CORPUS ]]; then -  echo "$CORPUS doesn't exist!" -  exit 1 -fi - -PPL=10 -LINES=0 -if [[ $# -gt 2 ]]; then -  PPL=$3 -fi -if [[ $# -gt 3 ]]; then -  MARKER="$4" -fi -if [[ $# -gt 4 ]]; then -  LINES=$5 -fi - -mkdir -p $WRITETODIR - -#extract vocabulary to train on -echo "Extracting vocabulary..." -if [[ -f $WRITETODIR/inputvocab.gz ]]; then -  echo " ....$WRITETODIR/inputvocab.gz exists, reusing." -else -  if [[ $LINES -gt 0 ]]; then -    $VOCABEXT $CORPUS $LINES | gzip > $WRITETODIR/inputvocab.gz -  else -    $VOCABEXT $CORPUS | gzip > $WRITETODIR/inputvocab.gz -  fi -fi - - -#train it -echo "Training morf model..." -if [[ -f $WRITETODIR/segmentation.final.gz ]]; then -  echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.." -else -  OLDPWD=`pwd` -  cd $WRITETODIR -   -  #put the training Makefile in place, with appropriate modifications -  sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/"  \ -    -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \ -    -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \ -    $MORFMAKEFILE_TRAIN > ./Makefile - -  date -  make > ./trainmorf.log 2>&1 -  cd $OLDPWD -   -   -  echo "Post processing..." -  #remove comments, counts and morph types -  #mark morphs -   -  if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then -     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" -     exit 1 -  fi - -  zcat $WRITETODIR/segmentation.final.gz | \ -    awk '$1 !~ /^#/ {print}' | \ -    cut -d ' ' --complement -f 1 | \ -    sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \ -    > $WRITETODIR/segmentation.ready - -  if [[ ! -f $WRITETODIR/segmentation.ready ]]; then -     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" -     exit 1 -  fi - - - -  echo "Done training." -  date -fi -echo "Segmentation model is $WRITETODIR/segmentation.ready." - diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh deleted file mode 100755 index 00ae7109..00000000 --- a/gi/morf-segmentation/vocabextractor.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -d=$(dirname `readlink -f $0`) -if [ $# -lt 1 ]; then -	echo "Extracts unique words and their frequencies from a subset of a corpus." -	echo -	echo "Usage: `basename $0` input_file [number_of_lines] > output_file" -	echo -e "\tinput_file contains a sentence per line." -	echo -	echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor." -	echo -	exit -fi - -srcname=$1 -reallen=0 - -if [[ $# -gt 1 ]]; then -  reallen=$2 -fi - -pattern_file=$d/invalid_vocab.patterns - -if [[ ! -f $pattern_file ]]; then -  echo "Pattern file missing" -  exit 1  -fi - -#this awk strips entries from the vocabulary if they contain invalid characters -#invalid characters are digits and punctuation marks, and words beginning or ending with a dash -#uniq -c extracts the unique words and counts the occurrences - -if [[ $reallen -eq 0 ]]; then -	#when a zero is passed, use the whole file -  zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^  *//'  - -else -	zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^  *//' -fi - | 
