diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
commit | 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch) | |
tree | 644496a1690d84d82a396bbc1e39160788beb2cd /gi/morf-segmentation | |
parent | 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff) | |
parent | a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/morf-segmentation')
-rwxr-xr-x | gi/morf-segmentation/filter_docs.pl | 24 | ||||
-rw-r--r-- | gi/morf-segmentation/invalid_vocab.patterns | 6 | ||||
-rwxr-xr-x | gi/morf-segmentation/linestripper.py | 40 | ||||
-rwxr-xr-x | gi/morf-segmentation/morf-pipeline.pl | 486 | ||||
-rwxr-xr-x | gi/morf-segmentation/morfsegment.py | 50 | ||||
-rwxr-xr-x | gi/morf-segmentation/morftrain.sh | 110 | ||||
-rwxr-xr-x | gi/morf-segmentation/vocabextractor.sh | 40 |
7 files changed, 0 insertions, 756 deletions
diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl deleted file mode 100755 index a78575da..00000000 --- a/gi/morf-segmentation/filter_docs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries. - -#Usage: filter_docs.pl [mark] -# STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor -# STDOUT: the matching subset, same format - -use utf8; -my $letter=qr/\p{L}\p{M}*/; # see http://www.regular-expressions.info/unicode.html - -my $morph=qr/$letter+/; - -my $m = "##"; # marker used to indicate morphemes -if ((scalar @ARGV) >= 1) { - $m = $ARGV[0]; - shift; -} -print STDERR "Using $m to filter for morphemes\n"; - -my $expr = qr/^($morph\Q$m\E)? ?(\Q$m\E$morph\Q$m\E)* ?(\Q$m\E$morph)?\t/; #\Q and \E bounded sections are escaped -while(<>) { - /$expr/ && print; -} diff --git a/gi/morf-segmentation/invalid_vocab.patterns b/gi/morf-segmentation/invalid_vocab.patterns deleted file mode 100644 index 473ce1b1..00000000 --- a/gi/morf-segmentation/invalid_vocab.patterns +++ /dev/null @@ -1,6 +0,0 @@ -[[:digit:]] -[] !"#$%&()*+,./:;<=>?@[\^_`{|}~] -^'$ --$ -^- -^$ diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py deleted file mode 100755 index 04e9044a..00000000 --- a/gi/morf-segmentation/linestripper.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/python - -import sys - -#linestripper file file maxlen [numlines] - -if len(sys.argv) < 3: - print "linestripper file1 file2 maxlen [numlines]" - print " outputs subset of file1 to stdout, ..of file2 to stderr" - sys.exit(1) - - -f1 = open(sys.argv[1],'r') -f2 = open(sys.argv[2],'r') - -maxlen=int(sys.argv[3]) -numlines = 0 - -if len(sys.argv) > 4: - numlines = int(sys.argv[4]) - -count=0 -for line1 in f1: - line2 = f2.readline() - - w1 = len(line1.strip().split()) - w2 = len(line2.strip().split()) - - if w1 <= maxlen and w2 <= maxlen: - count = count + 1 - sys.stdout.write(line1) - sys.stderr.write(line2) - - if numlines > 0 and count >= numlines: - break - -f1.close() -f2.close() - - diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl deleted file mode 100755 index 46eb5b46..00000000 --- a/gi/morf-segmentation/morf-pipeline.pl +++ /dev/null @@ -1,486 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use File::Copy; - - -# Preprocessing pipeline to take care of word segmentation -# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data -# Applies the segmentation where necessary. -# Learns word alignments on the preprocessed training data. -# Outputs script files used later to score output. - - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -use Getopt::Long "GetOptions"; - -my $GZIP = 'gzip'; -my $ZCAT = 'gunzip -c'; -my $SED = 'sed -e'; - -my $MORF_TRAIN = "$SCRIPT_DIR/morftrain.sh"; -my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py"; - -my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py"; -my $ALIGNER = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar"; -#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log -assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER); - -my $OUTPUT = './morfwork'; -my $PPL_SRC = 50; -my $PPL_TRG = 50; -my $MARKER = "#"; -my $MAX_WORDS = 40; -my $SENTENCES;# = 100000; -my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string) -my $NAME_SHORTCUT; - -usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, - 'output=s' => \$OUTPUT, - 'ppl_src=i' => \$PPL_SRC, - 'ppl_trg=i' => \$PPL_TRG, - 'sentences=i' => \$SENTENCES, - 'marker=s' => \$MARKER, - 'split=s' => \$SPLIT_TYPE, - 'get_name_only' => \$NAME_SHORTCUT, - ); - -usage() unless scalar @ARGV >= 2; - -my %CORPUS; # for (src,trg) it has (orig, name, filtered, final) - -$CORPUS{'src'}{'orig'} = $ARGV[0]; -open F, "<$CORPUS{'src'}{'orig'}" or die "Can't read $CORPUS{'src'}{'orig'}: $!"; close F; -$CORPUS{'src'}{'name'} = get_basename($CORPUS{'src'}{'orig'}); - -$CORPUS{'trg'}{'orig'} = $ARGV[1]; -open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F; -$CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); - -my %DEV; # for (src,trg) has (orig, final.split final.unsplit -if (@ARGV >= 4) { - $DEV{'src'}{'orig'} = $ARGV[2]; - open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; - $DEV{'src'}{'name'} = get_basename($DEV{'src'}{'orig'}); - $DEV{'trg'}{'orig'} = $ARGV[3]; - open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F; - $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'}); -} - -my %TEST; # for (src,trg) has (orig, name) -if (@ARGV >= 6) { - $TEST{'src'}{'orig'} = $ARGV[4]; - open F, "<$TEST{'src'}{'orig'}" or die "Can't read $TEST{'src'}{'orig'}: $!"; close F; - $TEST{'src'}{'name'} = get_basename($TEST{'src'}{'orig'}); - $TEST{'trg'}{'orig'} = $ARGV[5]; - open F, "<$TEST{'trg'}{'orig'}" or die "Can't read $TEST{'trg'}{'orig'}: $!"; close F; - $TEST{'trg'}{'name'} = get_basename($TEST{'trg'}{'orig'}); -} - -my $SPLIT_SRC; #use these to check whether that part is being split -my $SPLIT_TRG; - -#OUTPUT WILL GO IN THESE -my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir(); #subsampled corpus -my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting.. -my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models -my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir(); #segmented copora+alignments -my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments'; - -$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}"; -$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'trg'}{'name'}"; - -print STDERR "Output: $OUTPUT\n"; -print STDERR "Corpus: $CORPUS_DIR\n"; -print STDERR "Model-src: $MODEL_SRC_DIR\n"; -print STDERR "Model-trg: $MODEL_TRG_DIR\n"; -print STDERR "Finaldir: $PROCESSED_DIR\n"; - -safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; -safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!"; -filter_corpus(); - -safemkdir($PROCESSED_DIR); -safemkdir($ALIGNMENT_DIR); - -if ($SPLIT_SRC) { - safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!"; - learn_segmentation("src"); - apply_segmentation_side("src", $MODEL_SRC_DIR); -} - -#assume that unsplit hypotheses will be scored against an aritificially split target test set; thus obtain a target splitting model -#TODO: add a flag to override this behaviour -safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; -learn_segmentation("trg"); -$TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}"; -copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set"; - -if ($SPLIT_TRG) { - apply_segmentation_side("trg", $MODEL_TRG_DIR); - } else { - $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split"; - apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'}); -} - -write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); - -#copy corpora if they haven't been put in place by splitting operations -place_missing_data_side('src'); -place_missing_data_side('trg'); - -do_align(); - -if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) { - print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n"; -#format is: - # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh - my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz"; - $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}"; - $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh"; - safesystem("echo '$line' > $PROCESSED_DIR/exp.config"); -} - -system("date"); -print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n"; - -############################## BILINGUAL ################################### - -sub filter_corpus { - print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; - if ( -f $CORPUS{'src'}{'filtered'} && -f $CORPUS{'trg'}{'filtered'}) { - print STDERR "$CORPUS{'src'}{'filtered'} and $CORPUS{'trg'}{'filtered'} exist, reusing...\n"; - return; - } - my $args = "$CORPUS{'src'}{'orig'} $CORPUS{'trg'}{'orig'} $MAX_WORDS"; - if ($SENTENCES) { $args = $args . " $SENTENCES"; } - safesystem("$LINESTRIPPER $args 1> $CORPUS{'src'}{'filtered'} 2> $CORPUS{'trg'}{'filtered'}") or die "Failed to filter training corpus for length."; -} - -sub learn_segmentation -{ - my $WHICH = shift; - my $corpus; my $dev; my $test; my $moddir; my $ppl; - - $corpus = $CORPUS{$WHICH}{'filtered'}; - $dev = $DEV{$WHICH}{'orig'}; - $test = $TEST{$WHICH}{'orig'}; - - if ($WHICH eq "src") { - $moddir = $MODEL_SRC_DIR; - $ppl = $PPL_SRC; - } else { - $moddir = $MODEL_TRG_DIR; - $ppl = $PPL_TRG; - } - my $cmd = "cat $corpus"; - if ($dev) { $cmd = "$cmd $dev"; } - if ($test) { $cmd = "$cmd $test"; } - my $tmpfile = "$CORPUS_DIR/all.tmp.gz"; - safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; - assert_marker($tmpfile); - - learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH); - safesystem("rm $tmpfile"); -} - -sub do_align { - print STDERR "\n!!!WORD ALIGNMENT!!!\n"; - system("date"); - - my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align"; - if ( -f $ALIGNMENTS ) { - print STDERR "$ALIGNMENTS exists, reusing...\n"; - return; - } - my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; - - #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future. - safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!"; - safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!"; - - write_wconf($conf_file, $PROCESSED_DIR); - system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log"); - - if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";} - - my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS"; - $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al"; - safesystem($cmd) or die "Failed to paste into aligned corpus file."; - -} - -############################# MONOLINGUAL ################################# - -#copy the necessary data files that weren't place by segmentation -sub place_missing_data_side { - my $side = shift; - - ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}") ; - - if ($DEV{$side}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{$side}{'name'}") { - $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; - copy($DEV{$side}{'orig'}, $DEV{$side}{'final'}) or die "Copy failed: $!"; - } - - if ($TEST{$side}{'orig'} && ! -f "$PROCESSED_DIR/$TEST{$side}{'name'}" && ! $TEST{$side}{'finalunsplit'}) { - $TEST{$side}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}"; - copy($TEST{$side}{'orig'}, $TEST{$side}{'finalunsplit'}) or die "Copy failed: $!"; - } - -} - -sub apply_segmentation_side { - my ($side, $moddir) = @_; - - print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n"; - apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}"); - if ($DEV{$side}{'orig'}) { - $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; - apply_segmentation_any($moddir, $DEV{$side}{'orig'}, "$DEV{$side}{'final'}"); - } - if ($TEST{$side}{'orig'}) { - $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split"; - apply_segmentation_any($moddir, $TEST{$side}{'orig'}, $TEST{$side}{'finalsplit'} ); - } - -} - -sub learn_segmentation_side { - my($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_; - - print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n"; - system("date"); - my $SEG_FILE = $SEGOUT_DIR . "/segmentation.ready"; - if ( -f $SEG_FILE) { - print STDERR "$SEG_FILE exists, reusing...\n"; - return; - } - my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; - safesystem($cmd) or die "Failed to learn segmentation model"; -} - -sub apply_segmentation_any { - my($moddir, $datfile, $outfile) = @_; - if ( -f $outfile) { - print STDERR "$outfile exists, reusing...\n"; - return; - } - - my $args = "$moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\""; - safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile"; -} - -##################### PATH FUNCTIONS ########################## - -sub beautify_numlines { - return ($SENTENCES ? $SENTENCES : "_all"); -} - -sub corpus_dir { - return "s" . beautify_numlines() . ".w" . $MAX_WORDS; -} - -sub model_dir { - my $lang = shift; - if ($lang eq "src") { - return corpus_dir() . ".PPL" . $PPL_SRC . ".src"; - } elsif ($lang eq "trg") { - return corpus_dir() . ".PPL" . $PPL_TRG . ".trg"; - } else { - return "PPLundef"; - } -} - -sub processed_dir { - return corpus_dir() . "." . split_name(); -} - -########################## HELPER FUNCTIONS ############################ - -sub ifne_copy { - my ($src, $dest) = @_; - if (! -f $dest) { - copy($src, $dest) or die "Copy failed: $!"; - } -} - -sub split_name { - #parses SPLIT_TYPE, which can have the following values - # t|s|ts|st (last 2 are equiv) - # or is undefined when no splitting is done - my $name = ""; - - if ($SPLIT_TYPE) { - $SPLIT_SRC = lc($SPLIT_TYPE) =~ /s/; - $SPLIT_TRG = lc($SPLIT_TYPE) =~ /t/; - $name = $name . ($SPLIT_SRC ? $PPL_SRC : "0"); - $name = $name . "_" . ($SPLIT_TRG ? $PPL_TRG : "0"); - } else { - #no splitting - $name = "0"; - } - - return "sp_" . $name; - -} - -sub usage { - print <<EOT; - -Usage: $0 [OPTIONS] corpus.src corpus.trg [dev.src dev.trg [test.src test.trg]] - -Learns a segmentation model and splits up corpora as necessary. Word alignments are trained on a specified subset of the training corpus. - -EOT - exit 1; -}; - -sub safemkdir { - my $dir = shift; - if (-d $dir) { return 1; } - return mkdir($dir); -} - -sub assert_exec { - my @files = @_; - for my $file (@files) { - die "Can't find $file - did you run make?\n" unless -e $file; - die "Can't execute $file" unless -e $file; - } -}; -sub safesystem { - print STDERR "Executing: @_\n"; - system(@_); - if ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - } - elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - } - else { - my $exitcode = $? >> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } -} - -sub get_basename -{ - my $x = shift; - $x = `basename $x`; - $x =~ s/\n//; - return $x; -} - -sub assert_marker { - my $file = shift; - my $result = `zcat $file| grep '$MARKER' | wc -l` or die "Cannot read $file: $!"; - print $result; - if (scalar($result) != 0) { die "Data contains marker '$MARKER'; use something else.";} -} -########################### Dynamic config files ############################## - -sub write_wconf { - my ($filename, $train_dir) = @_; - open WCONF, ">$filename" or die "Can't write $filename: $!"; - - print WCONF <<EOT; -## ---------------------- -## This is an example training script for the Berkeley -## word aligner. In this configuration it uses two HMM -## alignment models trained jointly and then decoded -## using the competitive thresholding heuristic. - -########################################## -# Training: Defines the training regimen -########################################## -forwardModels MODEL1 HMM -reverseModels MODEL1 HMM -mode JOINT JOINT -iters 5 5 - -############################################### -# Execution: Controls output and program flow -############################################### -execDir $ALIGNMENT_DIR -create -overwriteExecDir -saveParams true -numThreads 1 -msPerLine 10000 -alignTraining - -################# -# Language/Data -################# -foreignSuffix src -englishSuffix trg - -# Choose the training sources, which can either be directories or files that list files/directories -trainSources $train_dir/ -#trainSources $train_dir/sources -testSources -sentences MAX - -################# -# 1-best output -################# -competitiveThresholding - -EOT - close WCONF; -} - -sub write_eval_sh -{ - my ($filename) = @_; - open EVALFILE, ">$filename" or die "Can't write $filename: $!"; - - print EVALFILE <<EOT; -#!/bin/bash - -EVAL_MAIN=/export/ws10smt/data/eval.sh -marker="$MARKER" -EOT - - if ($SPLIT_TRG) { - print EVALFILE <<EOT; -echo "OUTPUT EVALUATION" -echo "-----------------" -\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalsplit'} - -echo "RECOMBINED OUTPUT EVALUATION" -echo "----------------------------" -cat "\$1" | sed -e "s/\$marker \$marker//g" -e "s/\$marker//g" > "\$1.recombined" - -\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'} -EOT - - } else { - print EVALFILE <<EOT; -echo "ARTIFICIAL SPLIT EVALUATION" -echo "--------------------------" - -#split the output translation -cat "\$1" | $MORF_SEGMENT $MODEL_TRG_DIR/inputvocab.gz $MODEL_TRG_DIR/segmentation.ready "\$MARKER" > "\$1.split" - -\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'} - -echo "DIRECT EVALUATION" -echo "--------------------------" -\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'} - -EOT - - } - close EVALFILE; - -} - - - - diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py deleted file mode 100755 index 85b9d4fb..00000000 --- a/gi/morf-segmentation/morfsegment.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/python - -import sys -import gzip - -#usage: morfsegment.py inputvocab.gz segmentation.ready -# stdin: the data to segment -# stdout: the segmented data - -if len(sys.argv) < 3: - print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]" - print " stdin: the data to segment" - print " stdout: the segmented data" - sys.exit() - -#read index: -split_index={} - -marker="##" - -if len(sys.argv) > 3: - marker=sys.argv[3] - -word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz -seg_vocab=open(sys.argv[2], 'r') #segm.ready.. - -for seg in seg_vocab: - #seg = ver# #wonder\n - #wordline = 1 verwonder\n - word = word_vocab.readline().strip().split(' ') - assert(len(word) == 2) - word = word[1] - seg=seg.strip() - - if seg != word: - split_index[word] = seg - -word_vocab.close() -seg_vocab.close() - -for line in sys.stdin: - words = line.strip().split() - - newsent = [] - for word in words: - splitword = split_index.get(word, word) - newsent.append(splitword) - - print ' '.join(newsent) - diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh deleted file mode 100755 index 9004922f..00000000 --- a/gi/morf-segmentation/morftrain.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash - -if [[ $# -lt 3 ]]; then - echo "Trains a morfessor model and places the result in writedir" - echo - echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]" - echo -e "\tcorpus_input_file contains a sentence per line." - exit 1 -fi - -MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2" -SCRIPT_DIR=$(dirname `readlink -f $0`) - -MORFBINDIR="$MORFESSOR_DIR/bin" -MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile" -VOCABEXT="$SCRIPT_DIR/vocabextractor.sh" - -MARKER="#" - -if [[ ! -f $VOCABEXT ]]; then - echo "$VOCABEXT doesn't exist!" - exit 1 -fi -if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then - echo "$MORFMAKEFILE_TRAIN doesn't exist!" - exit 1 -fi - - -CORPUS="$1" -WRITETODIR=$2 - -if [[ ! -f $CORPUS ]]; then - echo "$CORPUS doesn't exist!" - exit 1 -fi - -PPL=10 -LINES=0 -if [[ $# -gt 2 ]]; then - PPL=$3 -fi -if [[ $# -gt 3 ]]; then - MARKER="$4" -fi -if [[ $# -gt 4 ]]; then - LINES=$5 -fi - -mkdir -p $WRITETODIR - -#extract vocabulary to train on -echo "Extracting vocabulary..." -if [[ -f $WRITETODIR/inputvocab.gz ]]; then - echo " ....$WRITETODIR/inputvocab.gz exists, reusing." -else - if [[ $LINES -gt 0 ]]; then - $VOCABEXT $CORPUS $LINES | gzip > $WRITETODIR/inputvocab.gz - else - $VOCABEXT $CORPUS | gzip > $WRITETODIR/inputvocab.gz - fi -fi - - -#train it -echo "Training morf model..." -if [[ -f $WRITETODIR/segmentation.final.gz ]]; then - echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.." -else - OLDPWD=`pwd` - cd $WRITETODIR - - #put the training Makefile in place, with appropriate modifications - sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/" \ - -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \ - -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \ - $MORFMAKEFILE_TRAIN > ./Makefile - - date - make > ./trainmorf.log 2>&1 - cd $OLDPWD - - - echo "Post processing..." - #remove comments, counts and morph types - #mark morphs - - if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then - echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" - exit 1 - fi - - zcat $WRITETODIR/segmentation.final.gz | \ - awk '$1 !~ /^#/ {print}' | \ - cut -d ' ' --complement -f 1 | \ - sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \ - > $WRITETODIR/segmentation.ready - - if [[ ! -f $WRITETODIR/segmentation.ready ]]; then - echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" - exit 1 - fi - - - - echo "Done training." - date -fi -echo "Segmentation model is $WRITETODIR/segmentation.ready." - diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh deleted file mode 100755 index 00ae7109..00000000 --- a/gi/morf-segmentation/vocabextractor.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -d=$(dirname `readlink -f $0`) -if [ $# -lt 1 ]; then - echo "Extracts unique words and their frequencies from a subset of a corpus." - echo - echo "Usage: `basename $0` input_file [number_of_lines] > output_file" - echo -e "\tinput_file contains a sentence per line." - echo - echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor." - echo - exit -fi - -srcname=$1 -reallen=0 - -if [[ $# -gt 1 ]]; then - reallen=$2 -fi - -pattern_file=$d/invalid_vocab.patterns - -if [[ ! -f $pattern_file ]]; then - echo "Pattern file missing" - exit 1 -fi - -#this awk strips entries from the vocabulary if they contain invalid characters -#invalid characters are digits and punctuation marks, and words beginning or ending with a dash -#uniq -c extracts the unique words and counts the occurrences - -if [[ $reallen -eq 0 ]]; then - #when a zero is passed, use the whole file - zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' - -else - zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' -fi - |