Merge branch 'master' of https://github.com/redpony/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
committer: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
commit: e8b412577b9d3fe2090b9f48443f919cd268c809 (patch)
tree: b46a7b51d365519dfb5170d71bac33be6d3e29b9 /training/dtrain/examples/standard
parent: d189426a7ea56b71eb6e25ed02a7b0993cfb56a8 (diff)
parent: 5aee54869aa19cfe9be965e67a472e94449d16da (diff)
8 files changed, 143 insertions, 0 deletions
diff --git a/training/dtrain/examples/standard/README b/training/dtrain/examples/standard/README
new file mode 100644
index 00000000..ce37d31a
--- /dev/null
+++ b/training/dtrain/examples/standard/README
@@ -0,0 +1,2 @@
+Run `dtrain` from this folder with `../../dtrain -c dtrain.ini`.
+
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
new file mode 100644
index 00000000..e1edc68d
--- /dev/null
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -0,0 +1,26 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+grammar=nc-wmt11.grammar.gz
+feature_function=WordPenalty
+feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
+# all currently working feature functions for translation:
+# (with those features active that were used in the ACL paper)
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+feature_function=RuleIdentityFeatures
+feature_function=RuleSourceBigramFeatures
+feature_function=RuleTargetBigramFeatures
+feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
new file mode 100644
index 00000000..e1072d30
--- /dev/null
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -0,0 +1,24 @@
+input=./nc-wmt11.de.gz
+refs=./nc-wmt11.en.gz
+output=-                  # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID       # which weights to output: best, last or avg (VOID: discard them)
+decoder_config=./cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+# newer versions of the grammar extractor use different feature names:
+#print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
+stop_after=10 # stop epoch after 10 inputs
+
+# interesting stuff
+epochs=2                # run over input 2 times
+k=100                   # use 100-best lists
+N=4                     # optimize (approx) BLEU4
+scorer=stupid_bleu      # use 'stupid' BLEU+1
+learning_rate=1.0       # learning rate; irrelevant when gamma=0 (perceptron)
+gamma=0                 # SVM regularization strength (0 = no regularization, i.e. perceptron)
+sample_from=kbest       # use kbest lists (as opposed to forest)
+filter=uniq             # only unique entries in kbest (surface form)
+pair_sampling=XYX       # pair sampling strategy (partitions the k-best list, see hi_lo)
+hi_lo=0.1               # top 10% vs. middle 80% vs. bottom 10% of the list (10 vs 80 vs 10, and 80 vs 10 here)
+pair_threshold=0        # minimum distance in BLEU (here: > 0)
+loss_margin=0           # update if correctly ranked, but within this margin
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
new file mode 100644
index 00000000..7cd09dbf
--- /dev/null
+++ b/training/dtrain/examples/standard/expected-output
@@ -0,0 +1,91 @@
+                cdec cfg './cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ./nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+  Example feature: Shape_S00000_T00000
+Seeding random number sequence to 2679584485
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 2
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 1
+                   gamma 0
+             loss margin 0
+       faster perceptron 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg './cdec.ini'
+                   input './nc-wmt11.de.gz'
+                    refs './nc-wmt11.en.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -576
+       WordPenalty = +417.79
+     LanguageModel = +5117.5
+ LanguageModel_OOV = -1307
+     PhraseModel_0 = -1612
+     PhraseModel_1 = -2159.6
+     PhraseModel_2 = -677.36
+     PhraseModel_3 = +2663.8
+     PhraseModel_4 = -1025.9
+     PhraseModel_5 = -8
+     PhraseModel_6 = +70
+       PassThrough = -1455
+        ---
+       1best avg score: 0.27697 (+0.27697)
+ 1best avg model score: -47918 (-47918)
+           avg # pairs: 581.9 (meaningless)
+        avg # rank err: 581.9
+     avg # margin viol: 0
+    non0 feature count: 703
+           avg list sz: 90.9
+           avg f count: 100.09
+(time 0.25 min, 1.5 s/S)
+
+Iteration #2 of 2.
+ . 10
+WEIGHTS
+              Glue = -622
+       WordPenalty = +898.56
+     LanguageModel = +8066.2
+ LanguageModel_OOV = -2590
+     PhraseModel_0 = -4335.8
+     PhraseModel_1 = -5864.4
+     PhraseModel_2 = -1729.8
+     PhraseModel_3 = +2831.9
+     PhraseModel_4 = -5384.8
+     PhraseModel_5 = +1449
+     PhraseModel_6 = +480
+       PassThrough = -2578
+        ---
+       1best avg score: 0.37119 (+0.094226)
+ 1best avg model score: -1.3174e+05 (-83822)
+           avg # pairs: 584.1 (meaningless)
+        avg # rank err: 584.1
+     avg # margin viol: 0
+    non0 feature count: 1115
+           avg list sz: 91.3
+           avg f count: 90.755
+(time 0.3 min, 1.8 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 2 [SCORE 'stupid_bleu'=0.37119].
+This took 0.55 min.
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz
new file mode 100644
index 00000000..0741fd92
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.de.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz
new file mode 100644
index 00000000..1c0bd401
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
new file mode 100644
index 00000000..7ce81057
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.grammar.gz b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
new file mode 100644
index 00000000..ce4024a1
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
author	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
committer	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
commit	e8b412577b9d3fe2090b9f48443f919cd268c809 (patch)
tree	b46a7b51d365519dfb5170d71bac33be6d3e29b9 /training/dtrain/examples/standard
parent	d189426a7ea56b71eb6e25ed02a7b0993cfb56a8 (diff)
parent	5aee54869aa19cfe9be965e67a472e94449d16da (diff)