diff options
author | Patrick Simianer <p@simianer.de> | 2014-06-14 16:46:27 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-06-14 16:46:27 +0200 |
commit | 26c490f404731d053a6205719b6246502c07b449 (patch) | |
tree | 3aa721098f1251dfbf2249ecd2736434c13b1d48 /cdec |
init
Diffstat (limited to 'cdec')
-rw-r--r-- | cdec/cdec.ini | 23 | ||||
-rw-r--r-- | cdec/dtrain.ini | 18 | ||||
-rwxr-xr-x | cdec/mert.sh | 18 | ||||
-rwxr-xr-x | cdec/mira.sh | 22 | ||||
-rwxr-xr-x | cdec/mira.sh.old | 22 | ||||
-rwxr-xr-x | cdec/pro.sh | 18 | ||||
-rwxr-xr-x | cdec/rampion.sh | 19 | ||||
-rw-r--r-- | cdec/toy/cdec.ini | 5 | ||||
-rw-r--r-- | cdec/toy/grammar | 12 | ||||
-rw-r--r-- | cdec/toy/in | 1 | ||||
-rw-r--r-- | cdec/toy/weights | 3 | ||||
-rwxr-xr-x | cdec/train.sh | 27 | ||||
-rw-r--r-- | cdec/weights.init | 12 | ||||
-rw-r--r-- | cdec/weights.init.old | 12 | ||||
-rw-r--r-- | cdec/weights.init.passthrough | 18 |
15 files changed, 230 insertions, 0 deletions
```diff
diff --git a/cdec/cdec.ini b/cdec/cdec.ini
new file mode 100644
index 0000000..c77fa09
--- /dev/null
+++ b/cdec/cdec.ini
@@ -0,0 +1,23 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel ../data/lm/lm.1k.en.5
+grammar=../data/grammar
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/cdec/dtrain.ini b/cdec/dtrain.ini
new file mode 100644
index 0000000..857290d
--- /dev/null
+++ b/cdec/dtrain.ini
@@ -0,0 +1,18 @@
+k=100
+N=4
+learning_rate=0.00001
+gamma=0
+loss_margin=1.0
+epochs=15
+scorer=fixed_stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=avg
+print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
+decoder_config=/path/to/cdec.ini
+output=weights.avg.gz
+keep=true
+input=/path/to/input
+refs=/path/to/references
diff --git a/cdec/mert.sh b/cdec/mert.sh
new file mode 100755
index 0000000..3af9e65
--- /dev/null
+++ b/cdec/mert.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+$CDEC/training/dpmert/dpmert.pl \
+ --devset $DEVSET \
+ --config $(pwd)/cdec.ini \
+ --weights $(pwd)/weights.init \
+ --metric IBM_BLEU \
+ --iterations 15 \
+ --random-directions 15 \
+ --output-dir $(pwd)/work \
+ --jobs 2 \
+ &> mert.out
+
+gzip mert.out
+
diff --git a/cdec/mira.sh b/cdec/mira.sh
new file mode 100755
index 0000000..4bb4f78
--- /dev/null
+++ b/cdec/mira.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+CDEC=/path/tocdec
+DEVSET=/path/to/pasted/file
+
+$CDEC/training/mira/mira.py \
+ --devset $DEVSET \
+ --config $(pwd)/cdec.ini \
+ --weights $(pwd)/weights.init \
+ --jobs 2 \
+ --output-dir $(pwd)/work \
+ --metric ibm_bleu \
+ --max-iterations 20 \
+ --metric-scale 1 \
+ -k 500 \
+ --step-size 0.001 \
+ --optimizer 2 \
+ --hope 1 \
+ --fear 1 \
+
+gzip mira.out
+
diff --git a/cdec/mira.sh.old b/cdec/mira.sh.old
new file mode 100755
index 0000000..5e3c9c3
--- /dev/null
+++ b/cdec/mira.sh.old
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+CDEC=/cdec
+DEV_IN=/path/to/input/file
+DEV_REF=/path/to/refs/file
+
+$CDEC/mira/kbest_mira \
+ --input_weights weights.init \
+ --source $DEV_IN \
+ --reference $DEV_REF \
+ --passes 15 \
+ --mt_metric NIST_BLEU \
+ --max_step_size 0.01 \
+ --mt_metric_scale 1 \
+ --k_best_size 250 \
+ --sample_forest \
+ --sample_forest_unit_weight_vector \
+ --decoder_config $(pwd)/cdec.ini \
+ &> mira.out
+
+gzip mira.out
+
diff --git a/cdec/pro.sh b/cdec/pro.sh
new file mode 100755
index 0000000..48b0078
--- /dev/null
+++ b/cdec/pro.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+$CDEC/training/pro/pro.pl \
+ --config $(pwd)/cdec.ini \
+ --devset $DEVSET \
+ --weights $(pwd)/weights.init \
+ --metric IBM_BLEU \
+ --output-dir $(pwd)/work \
+ --reg 500 \
+ --reg-previous 5000 \
+ --jobs 2 \
+ &>pro.out
+
+gzip pro.out
+
diff --git a/cdec/rampion.sh b/cdec/rampion.sh
new file mode 100755
index 0000000..a3fc362
--- /dev/null
+++ b/cdec/rampion.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+CDEC=/path/to/cdec
+DEVSET=/path/to/pasted/file
+
+$CDEC/training/rampion/rampion.pl \
+ --source-file $DEVSET.de.sgm \
+ --ref-files $DEVSET.en \
+ --weights $(pwd)/weights.init \
+ --max-iterations 30 \
+ --metric IBM_BLEU \
+ --workdir $(pwd)/work \
+ --reg 500 \
+ --jobs 2 \
+ $(pwd)/../../cdec.ini \
+ &>rampion.out
+
+gzip rampion.out
+
diff --git a/cdec/toy/cdec.ini b/cdec/toy/cdec.ini
new file mode 100644
index 0000000..d444989
--- /dev/null
+++ b/cdec/toy/cdec.ini
@@ -0,0 +1,5 @@
+formalism=scfg
+grammar=grammar
+add_pass_through_rules=true
+weights=weights
+
diff --git a/cdec/toy/grammar b/cdec/toy/grammar
new file mode 100644
index 0000000..382c94f
--- /dev/null
+++ b/cdec/toy/grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
+[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
+[V] ||| fand ||| found ||| logp=0
diff --git a/cdec/toy/in b/cdec/toy/in
new file mode 100644
index 0000000..e6df927
--- /dev/null
+++ b/cdec/toy/in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/cdec/toy/weights b/cdec/toy/weights
new file mode 100644
index 0000000..70075b7
--- /dev/null
+++ b/cdec/toy/weights
@@ -0,0 +1,3 @@
+logp 2
+use_house 0
+use_shell 1
diff --git a/cdec/train.sh b/cdec/train.sh
new file mode 100755
index 0000000..02c6c84
--- /dev/null
+++ b/cdec/train.sh
@@ -0,0 +1,27 @@
+SCRIPTS=~/src/scripts/
+CDEC=~/src/cdec-dtrain/
+DATA=../data/
+NAME=news-commentary
+
+#$SCRIPTS/preprocess de < $DATA/$NAME.de.raw > $NAME.de.nof.nocs
+$SCRIPTS/preprocess de < $DATA/$NAME.de.raw > $NAME.de.nof
+$SCRIPTS/preprocess en < $DATA/$NAME.en.raw > $NAME.en.nof
+
+#$MOSES/scripts/generic/compound-splitter.perl -train -corpus $NAME.de.nof.nocs -model cs_model.de 2>compound-splitter-train.de.err
+#$MOSES/scripts/generic/compound-splitter.perl -model cs_model.de < $NAME.de.nof.nocs > $NAME.de.nof
+
+$SCRIPTS/no_empty $NAME.de.nof $NAME.en.nof $NAME.de $NAME.en
+
+$CDEC/corpus/paste-files.pl $NAME.de $NAME.en > $NAME
+
+mkdir lm && cd lm
+$CDEC/klm/lm/builder/lmplz -S 80% -T /tmp -o 4 < ../$NAME.en > $NAME.arpa.4 2>lmplz.err
+$CDEC/klm/lm/build_binary $NAME.arpa.4 $NAME.ken.4 2>build_binary.err
+
+cd .. && mkdir a && cd a
+$CDEC/word-aligner/fast_align -d -v -o -i ../$NAME > forward 2>forward.err
+$CDEC/word-aligner/fast_align -d -v -o -r -i ../$NAME > backward 2>backward.err
+$CDEC/utils/atools -i forward -j backward -c grow-diag-final-and > gdfa
+
+$CDEC/extractor/run_extractor -b news-commentary -a a/gdfa --leave_one_out --grammars g/ < news-commentary.de > news-commentary.de.sgm
+
diff --git a/cdec/weights.init b/cdec/weights.init
new file mode 100644
index 0000000..0d09f9f
--- /dev/null
+++ b/cdec/weights.init
@@ -0,0 +1,12 @@
+CountEF 0.1
+EgivenFCoherent -0.1
+Glue 0.01
+IsSingletonF -0.01
+IsSingletonFE -0.01
+LanguageModel 0.1
+LanguageModel_OOV -1
+MaxLexFgivenE -0.1
+MaxLexEgivenF -0.1
+PassThrough -0.1
+SampleCountF -0.1
+WordPenalty -0.1
diff --git a/cdec/weights.init.old b/cdec/weights.init.old
new file mode 100644
index 0000000..1c798b6
--- /dev/null
+++ b/cdec/weights.init.old
@@ -0,0 +1,12 @@
+PhraseModel_0 -0.25
+PhraseModel_1 0.1
+PhraseModel_2 -0.15
+PhraseModel_3 -0.2
+PhraseModel_4 -0.1
+PhraseModel_5 0.01
+PhraseModel_6 0.01
+Glue 0.0
+WordPenalty -2.0
+PassThrough -2.0
+LanguageModel 1.0
+LanguageModel_OOV -1.0
diff --git a/cdec/weights.init.passthrough b/cdec/weights.init.passthrough
new file mode 100644
index 0000000..33706b7
--- /dev/null
+++ b/cdec/weights.init.passthrough
@@ -0,0 +1,18 @@
+CountEF 0.1
+EgivenFCoherent -0.1
+Glue 0.01
+IsSingletonF -0.01
+IsSingletonFE -0.01
+LanguageModel 0.1
+LanguageModel_OOV -1
+MaxLexFgivenE -0.1
+MaxLexEgivenF -0.1
+PassThrough -0.1
+PassThrough_1 -0.1
+PassThrough_2 -0.1
+PassThrough_3 -0.1
+PassThrough_4 -0.1
+PassThrough_5 -0.1
+PassThrough_6 -0.1
+SampleCountF -0.1
+WordPenalty -0.1
```