summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/morftrain.sh
blob: 9004922f49c62ffc86f5875326c0f9d8e0fedc2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
#
# Trains a Morfessor model on the vocabulary of a corpus and places the
# result in writedir.
#
# Usage: morftrain.sh corpus_input_file writedir [PPL] [marker] [lines]
#   corpus_input_file  text corpus, one sentence per line
#   writedir           output directory (created if missing); receives
#                      inputvocab.gz, Makefile, trainmorf.log and
#                      segmentation.ready
#   PPL                value written into the PPLTHRESH setting of the
#                      training Makefile (default: 10)
#   marker             string placed on both sides of every morph boundary
#                      in segmentation.ready (default: "#")
#   lines              when greater than 0, handed to vocabextractor.sh as
#                      its second argument (default: 0) -- presumably a cap
#                      on the number of corpus lines read; confirm against
#                      vocabextractor.sh

# Only the corpus and the output directory are mandatory; PPL, marker and
# lines all have defaults, so two arguments are enough.
if [[ $# -lt 2 ]]; then
	{
		echo "Trains a morfessor model and places the result in writedir"
		echo
		echo "Usage: $(basename "$0") corpus_input_file writedir [PPL] [marker] [lines]"
		echo -e "\tcorpus_input_file contains a sentence per line."
	} >&2
	exit 1
fi

# Root of the Morfessor installation. The historical location is kept as the
# default; export MORFESSOR_DIR before calling the script to use another one.
MORFESSOR_DIR="${MORFESSOR_DIR:-/export/ws10smt/software/morfessor_catmap0.9.2}"

# Directory holding this script (symlinks resolved); vocabextractor.sh is
# looked up next to it. Quoted so that paths containing whitespace work.
SCRIPT_DIR=$(dirname -- "$(readlink -f -- "$0")")

MORFBINDIR="$MORFESSOR_DIR/bin"
MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile"
VOCABEXT="$SCRIPT_DIR/vocabextractor.sh"

# Default morph-boundary marker; may be replaced by the 4th argument.
MARKER="#"

# Fail early when a required helper is missing.
if [[ ! -f "$VOCABEXT" ]]; then
  echo "$VOCABEXT doesn't exist!" >&2
  exit 1
fi
if [[ ! -f "$MORFMAKEFILE_TRAIN" ]]; then
  echo "$MORFMAKEFILE_TRAIN doesn't exist!" >&2
  exit 1
fi


# Mandatory arguments: the corpus to read and the directory to write into.
CORPUS="$1"
WRITETODIR="$2"

if [[ ! -f "$CORPUS" ]]; then
  echo "$CORPUS doesn't exist!" >&2
  exit 1
fi

# Optional arguments and their defaults.
PPL=10
# The line limit is deliberately NOT stored in a variable called LINES:
# bash maintains LINES itself and, with the checkwinsize option (enabled by
# default in recent bash versions), may reset it to the terminal height after
# any external command, which would silently turn "no limit" into a limit.
VOCAB_LINES=0
if [[ $# -gt 2 ]]; then
  PPL=$3
fi
if [[ $# -gt 3 ]]; then
  MARKER="$4"
fi
if [[ $# -gt 4 ]]; then
  VOCAB_LINES=$5
fi

if ! mkdir -p -- "$WRITETODIR"; then
  echo "Cannot create $WRITETODIR" >&2
  exit 1
fi

#extract vocabulary to train on
echo "Extracting vocabulary..."
if [[ -f "$WRITETODIR/inputvocab.gz" ]]; then
  echo " ....$WRITETODIR/inputvocab.gz exists, reusing."
else
  # The line count is only passed along when a positive limit was requested.
  vocab_cmd=("$VOCABEXT" "$CORPUS")
  if [[ "$VOCAB_LINES" -gt 0 ]]; then
    vocab_cmd+=("$VOCAB_LINES")
  fi

  # Build the archive under a temporary name and rename it only on success.
  # Otherwise a failed extraction would leave a bogus inputvocab.gz behind
  # that every later run would happily "reuse".
  vocab_tmp="$WRITETODIR/inputvocab.gz.tmp.$$"
  if ! ( set -o pipefail; "${vocab_cmd[@]}" | gzip > "$vocab_tmp" ); then
    rm -f -- "$vocab_tmp"
    echo "Failed to extract vocabulary from $CORPUS" >&2
    exit 1
  fi
  if ! mv -- "$vocab_tmp" "$WRITETODIR/inputvocab.gz"; then
    rm -f -- "$vocab_tmp"
    echo "Failed to write $WRITETODIR/inputvocab.gz" >&2
    exit 1
  fi
fi


#train it
echo "Training morf model..."

#######################################
# Turns $WRITETODIR/segmentation.final.gz into $WRITETODIR/segmentation.ready:
# removes comments (lines whose first field starts with '#'), counts (the
# first space-separated column) and morph types (every "/" together with the
# three characters that follow it), then marks morphs by rewriting each
# " + " separator as "<marker> <marker>".
# Globals:   WRITETODIR (read), MARKER (read)
# Arguments: none
# Outputs:   writes $WRITETODIR/segmentation.ready
# Returns:   0 on success; non-zero when a pipeline stage fails or the
#            result is empty (no output file is left behind in that case)
#######################################
morftrain_postprocess() {
  local final="$WRITETODIR/segmentation.final.gz"
  local ready="$WRITETODIR/segmentation.ready"
  local ready_tmp="$ready.tmp.$$"
  local marker_escaped

  # MARKER is user supplied and lands in the replacement half of a sed s///
  # command, where "&", "\" and the "/" delimiter are special. Escape them so
  # the marker is always inserted literally.
  marker_escaped=$(printf '%s' "$MARKER" | sed -e 's|[\\/&]|\\&|g') || return 1

  # pipefail is confined to the subshell so a failing zcat/awk/cut is noticed
  # without changing the option for the rest of the script.
  if ! ( set -o pipefail
         zcat "$final" \
           | awk '$1 !~ /^#/ {print}' \
           | cut -d ' ' --complement -f 1 \
           | sed -e "s/\/...//g" -e "s/ + /$marker_escaped $marker_escaped/g" \
           > "$ready_tmp" ); then
    rm -f -- "$ready_tmp"
    return 1
  fi

  # The redirection creates the file even when nothing was produced, so test
  # for content rather than mere existence.
  if [[ ! -s "$ready_tmp" ]]; then
    rm -f -- "$ready_tmp"
    return 1
  fi

  mv -- "$ready_tmp" "$ready"
}

if [[ -f "$WRITETODIR/segmentation.final.gz" ]]; then
  echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.."

  # An earlier run may have stopped after training but before (or during)
  # post-processing; make sure the file announced below really exists.
  if [[ ! -s "$WRITETODIR/segmentation.ready" ]]; then
    echo "Post processing..."
    if ! morftrain_postprocess; then
      echo "Failed to post-process segmentation model: $WRITETODIR/segmentation.ready not written" >&2
      exit 1
    fi
  fi
else
  # Train inside a subshell: the directory change stays local, so there is no
  # need to remember and restore the starting directory (and no need to
  # assign OLDPWD, which the cd builtin overwrites on its own).
  if ! (
    cd "$WRITETODIR" || exit 1

    #put the training Makefile in place, with appropriate modifications
    sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/" \
      -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \
      -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \
      "$MORFMAKEFILE_TRAIN" > ./Makefile || exit 1

    date
    make > ./trainmorf.log 2>&1
  ); then
    # Not fatal by itself: as before, success is judged by whether
    # segmentation.final.gz was produced (checked just below).
    echo "Training step reported an error; see $WRITETODIR/trainmorf.log" >&2
  fi

  echo "Post processing..."
  #remove comments, counts and morph types
  #mark morphs

  if [[ ! -f "$WRITETODIR/segmentation.final.gz" ]]; then
     echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" >&2
     exit 1
  fi

  if ! morftrain_postprocess; then
     echo "Failed to post-process segmentation model: $WRITETODIR/segmentation.ready not written" >&2
     exit 1
  fi

  echo "Done training."
  date
fi
echo "Segmentation model is $WRITETODIR/segmentation.ready."