summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/morftrain.sh
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commit5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/morf-segmentation/morftrain.sh
parentcf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/morf-segmentation/morftrain.sh')
-rwxr-xr-xgi/morf-segmentation/morftrain.sh110
1 files changed, 0 insertions, 110 deletions
diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh
deleted file mode 100755
index 9004922f..00000000
--- a/gi/morf-segmentation/morftrain.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/bash
-
-if [[ $# -lt 3 ]]; then
- echo "Trains a morfessor model and places the result in writedir"
- echo
- echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]"
- echo -e "\tcorpus_input_file contains a sentence per line."
- exit 1
-fi
-
-MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2"
-SCRIPT_DIR=$(dirname `readlink -f $0`)
-
-MORFBINDIR="$MORFESSOR_DIR/bin"
-MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile"
-VOCABEXT="$SCRIPT_DIR/vocabextractor.sh"
-
-MARKER="#"
-
-if [[ ! -f $VOCABEXT ]]; then
- echo "$VOCABEXT doesn't exist!"
- exit 1
-fi
-if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then
- echo "$MORFMAKEFILE_TRAIN doesn't exist!"
- exit 1
-fi
-
-
-CORPUS="$1"
-WRITETODIR=$2
-
-if [[ ! -f $CORPUS ]]; then
- echo "$CORPUS doesn't exist!"
- exit 1
-fi
-
-PPL=10
-LINES=0
-if [[ $# -gt 2 ]]; then
- PPL=$3
-fi
-if [[ $# -gt 3 ]]; then
- MARKER="$4"
-fi
-if [[ $# -gt 4 ]]; then
- LINES=$5
-fi
-
-mkdir -p $WRITETODIR
-
-#extract vocabulary to train on
-echo "Extracting vocabulary..."
-if [[ -f $WRITETODIR/inputvocab.gz ]]; then
- echo " ....$WRITETODIR/inputvocab.gz exists, reusing."
-else
- if [[ $LINES -gt 0 ]]; then
- $VOCABEXT $CORPUS $LINES | gzip > $WRITETODIR/inputvocab.gz
- else
- $VOCABEXT $CORPUS | gzip > $WRITETODIR/inputvocab.gz
- fi
-fi
-
-
-#train it
-echo "Training morf model..."
-if [[ -f $WRITETODIR/segmentation.final.gz ]]; then
- echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.."
-else
- OLDPWD=`pwd`
- cd $WRITETODIR
-
- #put the training Makefile in place, with appropriate modifications
- sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/" \
- -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \
- -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \
- $MORFMAKEFILE_TRAIN > ./Makefile
-
- date
- make > ./trainmorf.log 2>&1
- cd $OLDPWD
-
-
- echo "Post processing..."
- #remove comments, counts and morph types
- #mark morphs
-
- if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then
- echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written"
- exit 1
- fi
-
- zcat $WRITETODIR/segmentation.final.gz | \
- awk '$1 !~ /^#/ {print}' | \
- cut -d ' ' --complement -f 1 | \
- sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \
- > $WRITETODIR/segmentation.ready
-
- if [[ ! -f $WRITETODIR/segmentation.ready ]]; then
- echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written"
- exit 1
- fi
-
-
-
- echo "Done training."
- date
-fi
-echo "Segmentation model is $WRITETODIR/segmentation.ready."
-