init

author: Patrick Simianer <p@simianer.de> 2014-06-14 16:46:27 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-14 16:46:27 +0200
commit: 26c490f404731d053a6205719b6246502c07b449 (patch)
tree: 3aa721098f1251dfbf2249ecd2736434c13b1d48 /cdec
15 files changed, 230 insertions, 0 deletions
diff --git a/cdec/cdec.ini b/cdec/cdec.ini
new file mode 100644
index 0000000..c77fa09
--- /dev/null
+++ b/cdec/cdec.ini
@@ -0,0 +1,23 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel ../data/lm/lm.1k.en.5
+grammar=../data/grammar
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/cdec/dtrain.ini b/cdec/dtrain.ini
new file mode 100644
index 0000000..857290d
--- /dev/null
+++ b/cdec/dtrain.ini
@@ -0,0 +1,18 @@
+k=100
+N=4
+learning_rate=0.00001
+gamma=0
+loss_margin=1.0
+epochs=15
+scorer=fixed_stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=avg
+print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
+decoder_config=/path/to/cdec.ini
+output=weights.avg.gz
+keep=true
+input=/path/to/input
+refs=/path/to/references
diff --git a/cdec/mert.sh b/cdec/mert.sh
new file mode 100755
index 0000000..3af9e65
--- /dev/null
+++ b/cdec/mert.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Runs cdec's dpmert.pl on DEVSET starting from weights.init; all output is logged to mert.out, then gzipped.
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+"$CDEC"/training/dpmert/dpmert.pl \
+  --devset "$DEVSET" \
+  --config "$(pwd)/cdec.ini" \
+  --weights "$(pwd)/weights.init" \
+  --metric IBM_BLEU \
+  --iterations 15 \
+  --random-directions 15 \
+  --output-dir "$(pwd)/work" \
+  --jobs 2 \
+  &> mert.out
+
+gzip mert.out
+
diff --git a/cdec/mira.sh b/cdec/mira.sh
new file mode 100755
index 0000000..4bb4f78
--- /dev/null
+++ b/cdec/mira.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Runs cdec's mira.py on DEVSET starting from weights.init; all output is logged to mira.out, then gzipped.
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+"$CDEC"/training/mira/mira.py \
+  --devset "$DEVSET" \
+  --config "$(pwd)/cdec.ini" \
+  --weights "$(pwd)/weights.init" \
+  --jobs 2 \
+  --output-dir "$(pwd)/work" \
+  --metric ibm_bleu \
+  --max-iterations 20 \
+  --metric-scale 1 \
+  -k 500 \
+  --step-size 0.001 \
+  --optimizer 2 \
+  --hope 1 \
+  --fear 1 \
+  &> mira.out
+
+gzip mira.out
diff --git a/cdec/mira.sh.old b/cdec/mira.sh.old
new file mode 100755
index 0000000..5e3c9c3
--- /dev/null
+++ b/cdec/mira.sh.old
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Runs cdec's kbest_mira binary on DEV_IN/DEV_REF starting from weights.init; output is logged to mira.out, then gzipped.
+CDEC=/cdec
+DEV_IN=/path/to/input/file
+DEV_REF=/path/to/refs/file
+
+"$CDEC"/mira/kbest_mira \
+  --input_weights weights.init \
+  --source "$DEV_IN" \
+  --reference "$DEV_REF" \
+  --passes 15 \
+  --mt_metric NIST_BLEU \
+  --max_step_size 0.01 \
+  --mt_metric_scale 1 \
+  --k_best_size 250 \
+  --sample_forest \
+  --sample_forest_unit_weight_vector \
+  --decoder_config "$(pwd)/cdec.ini" \
+  &> mira.out
+
+gzip mira.out
+
diff --git a/cdec/pro.sh b/cdec/pro.sh
new file mode 100755
index 0000000..48b0078
--- /dev/null
+++ b/cdec/pro.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Runs cdec's pro.pl on DEVSET starting from weights.init; all output is logged to pro.out, then gzipped.
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+"$CDEC"/training/pro/pro.pl \
+  --config "$(pwd)/cdec.ini" \
+  --devset "$DEVSET" \
+  --weights "$(pwd)/weights.init" \
+  --metric IBM_BLEU \
+  --output-dir "$(pwd)/work" \
+  --reg 500 \
+  --reg-previous 5000 \
+  --jobs 2 \
+  &> pro.out
+
+gzip pro.out
+
diff --git a/cdec/rampion.sh b/cdec/rampion.sh
new file mode 100755
index 0000000..a3fc362
--- /dev/null
+++ b/cdec/rampion.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Runs cdec's rampion.pl starting from weights.init; all output is logged to rampion.out, then gzipped.
+# NOTE(review): the config is read from ../../cdec.ini while the sibling scripts use ./cdec.ini; confirm.
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+"$CDEC"/training/rampion/rampion.pl \
+  --source-file "$DEVSET.de.sgm" \
+  --ref-files "$DEVSET.en" \
+  --weights "$(pwd)/weights.init" \
+  --max-iterations 30 \
+  --metric IBM_BLEU \
+  --workdir "$(pwd)/work" \
+  --reg 500 \
+  --jobs 2 \
+  "$(pwd)/../../cdec.ini" \
+  &> rampion.out
+
+gzip rampion.out
diff --git a/cdec/toy/cdec.ini b/cdec/toy/cdec.ini
new file mode 100644
index 0000000..d444989
--- /dev/null
+++ b/cdec/toy/cdec.ini
@@ -0,0 +1,5 @@
+formalism=scfg
+grammar=grammar
+add_pass_through_rules=true
+weights=weights
+
diff --git a/cdec/toy/grammar b/cdec/toy/grammar
new file mode 100644
index 0000000..382c94f
--- /dev/null
+++ b/cdec/toy/grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
+[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
+[V] ||| fand ||| found ||| logp=0
diff --git a/cdec/toy/in b/cdec/toy/in
new file mode 100644
index 0000000..e6df927
--- /dev/null
+++ b/cdec/toy/in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/cdec/toy/weights b/cdec/toy/weights
new file mode 100644
index 0000000..70075b7
--- /dev/null
+++ b/cdec/toy/weights
@@ -0,0 +1,3 @@
+logp 2
+use_house 0
+use_shell 1
diff --git a/cdec/train.sh b/cdec/train.sh
new file mode 100755
index 0000000..02c6c84
--- /dev/null
+++ b/cdec/train.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Preprocesses the $NAME corpus, then builds the LM (lm/), word alignments (a/) and grammars (g/).
+SCRIPTS=~/src/scripts
+CDEC=~/src/cdec-dtrain
+DATA=../data
+NAME=news-commentary
+
+#$SCRIPTS/preprocess de < $DATA/$NAME.de.raw > $NAME.de.nof.nocs
+"$SCRIPTS"/preprocess de < "$DATA/$NAME.de.raw" > "$NAME.de.nof"
+"$SCRIPTS"/preprocess en < "$DATA/$NAME.en.raw" > "$NAME.en.nof"
+
+#$MOSES/scripts/generic/compound-splitter.perl -train -corpus $NAME.de.nof.nocs -model cs_model.de 2>compound-splitter-train.de.err
+#$MOSES/scripts/generic/compound-splitter.perl -model cs_model.de < $NAME.de.nof.nocs > $NAME.de.nof
+
+"$SCRIPTS"/no_empty "$NAME.de.nof" "$NAME.en.nof" "$NAME.de" "$NAME.en"
+"$CDEC"/corpus/paste-files.pl "$NAME.de" "$NAME.en" > "$NAME"
+
+mkdir -p lm && cd lm || exit 1
+"$CDEC"/klm/lm/builder/lmplz -S 80% -T /tmp -o 4 < "../$NAME.en" > "$NAME.arpa.4" 2>lmplz.err
+"$CDEC"/klm/lm/build_binary "$NAME.arpa.4" "$NAME.ken.4" 2>build_binary.err
+
+cd .. && mkdir -p a && cd a || exit 1
+"$CDEC"/word-aligner/fast_align -d -v -o -i "../$NAME" > forward 2>forward.err
+"$CDEC"/word-aligner/fast_align -d -v -o -r -i "../$NAME" > backward 2>backward.err
+"$CDEC"/utils/atools -i forward -j backward -c grow-diag-final-and > gdfa
+# Return to the top-level directory: the paths below (a/gdfa, g/, $NAME*) are relative to it, not to a/.
+cd .. && "$CDEC"/extractor/run_extractor -b "$NAME" -a a/gdfa --leave_one_out --grammars g/ < "$NAME.de" > "$NAME.de.sgm"
diff --git a/cdec/weights.init b/cdec/weights.init
new file mode 100644
index 0000000..0d09f9f
--- /dev/null
+++ b/cdec/weights.init
@@ -0,0 +1,12 @@
+CountEF 0.1
+EgivenFCoherent -0.1
+Glue 0.01
+IsSingletonF -0.01
+IsSingletonFE -0.01
+LanguageModel 0.1
+LanguageModel_OOV -1
+MaxLexFgivenE -0.1
+MaxLexEgivenF -0.1
+PassThrough -0.1
+SampleCountF -0.1
+WordPenalty -0.1
diff --git a/cdec/weights.init.old b/cdec/weights.init.old
new file mode 100644
index 0000000..1c798b6
--- /dev/null
+++ b/cdec/weights.init.old
@@ -0,0 +1,12 @@
+PhraseModel_0 -0.25
+PhraseModel_1 0.1
+PhraseModel_2 -0.15
+PhraseModel_3 -0.2
+PhraseModel_4 -0.1
+PhraseModel_5 0.01
+PhraseModel_6 0.01
+Glue 0.0
+WordPenalty -2.0
+PassThrough -2.0
+LanguageModel 1.0
+LanguageModel_OOV -1.0
diff --git a/cdec/weights.init.passthrough b/cdec/weights.init.passthrough
new file mode 100644
index 0000000..33706b7
--- /dev/null
+++ b/cdec/weights.init.passthrough
@@ -0,0 +1,18 @@
+CountEF 0.1
+EgivenFCoherent -0.1
+Glue 0.01
+IsSingletonF -0.01
+IsSingletonFE -0.01
+LanguageModel 0.1
+LanguageModel_OOV -1
+MaxLexFgivenE -0.1
+MaxLexEgivenF -0.1
+PassThrough -0.1
+PassThrough_1 -0.1
+PassThrough_2 -0.1
+PassThrough_3 -0.1
+PassThrough_4 -0.1
+PassThrough_5 -0.1
+PassThrough_6 -0.1
+SampleCountF -0.1
+WordPenalty -0.1
author	Patrick Simianer <p@simianer.de>	2014-06-14 16:46:27 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-14 16:46:27 +0200
commit	26c490f404731d053a6205719b6246502c07b449 (patch)
tree	3aa721098f1251dfbf2249ecd2736434c13b1d48 /cdec