blob: 9004922f49c62ffc86f5875326c0f9d8e0fedc2d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
#!/bin/bash
#
# Trains a Morfessor segmentation model on the vocabulary of a corpus and
# places the results in a working directory.
#
# Usage: <script> corpus_input_file writedir [PPL] [marker] [lines]
#   corpus_input_file  corpus with one sentence per line
#   writedir           output directory (created if it does not exist)
#   PPL                value written into the PPLTHRESH line of the Morfessor
#                      training Makefile (default: 10)
#   marker             string attached to both sides of every morph boundary
#                      in segmentation.ready, e.g. "a + b" -> "a# #b"
#                      (default: "#")
#   lines              when greater than 0, passed to vocabextractor.sh as its
#                      second argument (default: 0, argument omitted)
#
# Files written to writedir: inputvocab.gz, Makefile, trainmorf.log,
# segmentation.ready, plus whatever the Morfessor Makefile produces (it is
# expected to create segmentation.final.gz).
# Existing inputvocab.gz / segmentation.final.gz files are reused.

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Only the corpus and the output directory are mandatory; PPL, marker and
# lines all have defaults.
if [[ $# -lt 2 ]]; then
  echo "Trains a morfessor model and places the result in writedir"
  echo
  echo "Usage: $(basename "$0") corpus_input_file writedir [PPL] [marker] [lines]"
  echo -e "\tcorpus_input_file contains a sentence per line."
  exit 1
fi

MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2"
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
MORFBINDIR="$MORFESSOR_DIR/bin"
MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile"
VOCABEXT="$SCRIPT_DIR/vocabextractor.sh"

[[ -f "$VOCABEXT" ]] || die "$VOCABEXT doesn't exist!"
[[ -f "$MORFMAKEFILE_TRAIN" ]] || die "$MORFMAKEFILE_TRAIN doesn't exist!"

CORPUS=$1
WRITETODIR=$2
[[ -f "$CORPUS" ]] || die "$CORPUS doesn't exist!"

PPL=${3:-10}

# An explicitly passed empty marker is honoured, so only default when absent.
MARKER="#"
if [[ $# -ge 4 ]]; then
  MARKER=$4
fi

# Deliberately not named LINES: bash itself may overwrite LINES with the
# terminal height after an external command finishes (checkwinsize).
max_lines=${5:-0}
[[ "$max_lines" =~ ^[0-9]+$ ]] \
  || die "lines must be a non-negative integer, got '$max_lines'"

mkdir -p -- "$WRITETODIR" || die "Cannot create $WRITETODIR"

vocab_file="$WRITETODIR/inputvocab.gz"
model_file="$WRITETODIR/segmentation.final.gz"
ready_file="$WRITETODIR/segmentation.ready"

#extract vocabulary to train on
echo "Extracting vocabulary..."
if [[ -f "$vocab_file" ]]; then
  echo " ....$vocab_file exists, reusing."
else
  vocab_args=("$CORPUS")
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if (( 10#$max_lines > 0 )); then
    vocab_args+=("$max_lines")
  fi
  # Write to a temporary name first so that a failed extraction never leaves
  # a partial inputvocab.gz behind to be silently reused by the next run.
  vocab_tmp="$vocab_file.tmp.$$"
  "$VOCABEXT" "${vocab_args[@]}" | gzip > "$vocab_tmp"
  vocab_status=("${PIPESTATUS[@]}")
  for rc in "${vocab_status[@]}"; do
    if (( rc != 0 )); then
      rm -f -- "$vocab_tmp"
      die "Vocabulary extraction failed for $CORPUS"
    fi
  done
  mv -- "$vocab_tmp" "$vocab_file" || die "Cannot write $vocab_file"
fi

#train it
trained=0
echo "Training morf model..."
if [[ -f "$model_file" ]]; then
  echo " ....$model_file exists, reusing.."
else
  # Run in a subshell so the caller's working directory is untouched and a
  # failed cd can never cause the Makefile to be written somewhere else.
  (
    cd -- "$WRITETODIR" || exit 1
    #put the training Makefile in place, with appropriate modifications
    sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/" \
        -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \
        -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \
        "$MORFMAKEFILE_TRAIN" > ./Makefile || exit 1
    date
    make > ./trainmorf.log 2>&1
  ) || echo "Training command failed, see $WRITETODIR/trainmorf.log" >&2

  [[ -f "$model_file" ]] \
    || die "Failed to learn segmentation model: $model_file not written"
  trained=1
fi

# Post-process after a fresh training run, and also when a reused model has
# no segmentation.ready next to it yet.
if (( trained )) || [[ ! -f "$ready_file" ]]; then
  echo "Post processing..."
  # Escape the characters that are special in a sed replacement (\, / and &)
  # so any marker string is inserted literally.
  marker_escaped=$(printf '%s' "$MARKER" | sed -e 's|[\\/&]|\\&|g')

  #remove comments, counts and morph types
  #mark morphs
  ready_tmp="$ready_file.tmp.$$"
  zcat "$model_file" \
    | awk '$1 !~ /^#/ {print}' \
    | cut -d ' ' --complement -f 1 \
    | sed -e "s/\/...//g" -e "s/ + /$marker_escaped $marker_escaped/g" \
    > "$ready_tmp"
  post_status=("${PIPESTATUS[@]}")
  # The redirect always creates the output file, so success is judged by the
  # exit status of every pipeline stage rather than by the file's existence.
  for rc in "${post_status[@]}"; do
    if (( rc != 0 )); then
      rm -f -- "$ready_tmp"
      die "Failed to post-process segmentation model: $ready_file not written"
    fi
  done
  mv -- "$ready_tmp" "$ready_file" || die "Cannot write $ready_file"

  if (( trained )); then
    echo "Done training."
    date
  fi
fi

echo "Segmentation model is $WRITETODIR/segmentation.ready."
|