#!/usr/bin/env bash
#
# cdec/train.sh -- prepare the news-commentary de/en data and build the
# models needed for a cdec system: preprocessed parallel text, a 4-gram
# KenLM language model on the English side, symmetrized word alignments,
# and per-sentence grammars for the German side.
#
# Provenance: commit 26c490f404731d053a6205719b6246502c07b449 ("init"),
# Patrick Simianer, Sat, 14 Jun 2014 16:46:27 +0200, file cdec/train.sh.
#
# Usage: run from the directory that should receive the outputs; the raw
# corpus is expected at ../data/news-commentary.{de,en}.raw.
#
# Outputs (relative to the current directory):
#   $NAME.{de,en}.nof   preprocessed text
#   $NAME.{de,en}       text after the no_empty filter
#   $NAME               both sides combined by paste-files.pl
#   lm/                 ARPA and binary language model (+ *.err logs)
#   a/                  forward/backward alignments and gdfa (+ *.err logs)
#   g/, $NAME.de.sgm    output of run_extractor

# Stop at the first failing step: every stage consumes the output of the
# previous one, so continuing after an error only produces garbage.
set -euo pipefail

readonly SCRIPTS=~/src/scripts
readonly CDEC=~/src/cdec-dtrain
readonly DATA=../data
readonly NAME=news-commentary

# --- Preprocessing ----------------------------------------------------------
# Disabled variant that fed a compound splitter (see below):
#$SCRIPTS/preprocess de < $DATA/$NAME.de.raw > $NAME.de.nof.nocs
"$SCRIPTS/preprocess" de < "$DATA/${NAME}.de.raw" > "${NAME}.de.nof"
"$SCRIPTS/preprocess" en < "$DATA/${NAME}.en.raw" > "${NAME}.en.nof"

# Disabled: Moses compound splitting of the German side.
# NOTE(review): $MOSES is not defined in this script; set it before
# re-enabling these lines.
#$MOSES/scripts/generic/compound-splitter.perl -train -corpus $NAME.de.nof.nocs -model cs_model.de 2>compound-splitter-train.de.err
#$MOSES/scripts/generic/compound-splitter.perl -model cs_model.de < $NAME.de.nof.nocs > $NAME.de.nof

# Filter the two sides in parallel (presumably drops pairs with an empty
# side -- confirm against the no_empty script).
"$SCRIPTS/no_empty" "${NAME}.de.nof" "${NAME}.en.nof" "${NAME}.de" "${NAME}.en"

# Combine both sides into the single bitext file used by the aligner and
# the grammar extractor.
"$CDEC/corpus/paste-files.pl" "${NAME}.de" "${NAME}.en" > "${NAME}"

# --- Language model ---------------------------------------------------------
# Runs in a subshell so the working directory is restored afterwards.
# mkdir -p keeps the script re-runnable when lm/ already exists.
(
  mkdir -p lm
  cd lm
  # -o 4: model order; -S 80%: memory budget; -T /tmp: temp file location.
  "$CDEC/klm/lm/builder/lmplz" -S 80% -T /tmp -o 4 \
    < "../${NAME}.en" > "${NAME}.arpa.4" 2> lmplz.err
  "$CDEC/klm/lm/build_binary" "${NAME}.arpa.4" "${NAME}.ken.4" \
    2> build_binary.err
)

# --- Word alignment ---------------------------------------------------------
# Also in a subshell: the extractor below needs paths relative to the
# top-level directory, not to a/.
(
  mkdir -p a
  cd a
  "$CDEC/word-aligner/fast_align" -d -v -o -i "../${NAME}" \
    > forward 2> forward.err
  # -r: align in the reverse direction.
  "$CDEC/word-aligner/fast_align" -d -v -o -r -i "../${NAME}" \
    > backward 2> backward.err
  # Symmetrize the two directions with the grow-diag-final-and heuristic.
  "$CDEC/utils/atools" -i forward -j backward -c grow-diag-final-and > gdfa
)

# --- Grammar extraction -----------------------------------------------------
# Must run from the top-level directory: a/gdfa, g/ and the corpus files
# are all given relative to it.
"$CDEC/extractor/run_extractor" -b "${NAME}" -a a/gdfa --leave_one_out \
  --grammars g/ < "${NAME}.de" > "${NAME}.de.sgm"