major restructure of the training code

author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-18 13:35:42 -0500
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-18 13:35:42 -0500
commit: 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch)
tree: 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /training/dtrain/test/example
parent: 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff)
4 files changed, 144 insertions, 0 deletions
diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README
new file mode 100644
index 00000000..6937b11b
--- /dev/null
+++ b/training/dtrain/test/example/README
@@ -0,0 +1,8 @@
+Small example of input format for distributed training.
+Call dtrain from cdec/training/dtrain/ with: ./dtrain -c test/example/dtrain.ini
+
+For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
+and recompile.
+
+Data is here: http://simianer.de/#dtrain
+
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
new file mode 100644
index 00000000..d5955f0e
--- /dev/null
+++ b/training/dtrain/test/example/cdec.ini
@@ -0,0 +1,25 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=30
+feature_function=WordPenalty
+feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+# all currently working feature functions for translation:
+# (with those features active that were used in the ACL paper)
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+feature_function=RuleIdentityFeatures
+feature_function=RuleSourceBigramFeatures
+feature_function=RuleTargetBigramFeatures
+feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini
new file mode 100644
index 00000000..72d50ca1
--- /dev/null
+++ b/training/dtrain/test/example/dtrain.ini
@@ -0,0 +1,22 @@
+input=test/example/nc-wmt11.1k.gz    # use '-' for STDIN
+output=-                             # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID                  # don't output weights
+decoder_config=test/example/cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+stop_after=10 # stop epoch after 10 inputs
+
+# interesting stuff
+epochs=2                # run over input 2 times
+k=100                   # use 100-best lists
+N=4                     # optimize (approx) BLEU4
+scorer=stupid_bleu      # use 'stupid' BLEU+1
+learning_rate=1.0       # learning rate; does not matter if gamma=0 (perceptron)
+gamma=0                 # SVM regularization strength (0 = no regularization, i.e. perceptron)
+sample_from=kbest       # use kbest lists (as opposed to forest)
+filter=uniq             # only unique entries in kbest (surface form)
+pair_sampling=XYX
+hi_lo=0.1               # top 10% vs. middle 80%, top 10% vs. bottom 10%, and middle 80% vs. bottom 10% here
+pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output
new file mode 100644
index 00000000..05326763
--- /dev/null
+++ b/training/dtrain/test/example/expected-output
@@ -0,0 +1,89 @@
+                cdec cfg 'test/example/cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading test/example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+  Example feature: Shape_S00000_T00000
+Seeding random number sequence to 2912000813
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 2
+                 scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 1
+                   gamma 0
+             loss margin 0
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'test/example/cdec.ini'
+                   input 'test/example/nc-wmt11.1k.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -637
+       WordPenalty = +1064
+     LanguageModel = +1175.3
+ LanguageModel_OOV = -1437
+     PhraseModel_0 = +1935.6
+     PhraseModel_1 = +2499.3
+     PhraseModel_2 = +964.96
+     PhraseModel_3 = +1410.8
+     PhraseModel_4 = -5977.9
+     PhraseModel_5 = +522
+     PhraseModel_6 = +1089
+       PassThrough = -1308
+        ---
+       1best avg score: 0.16963 (+0.16963)
+ 1best avg model score: 64485 (+64485)
+           avg # pairs: 1494.4
+        avg # rank err: 702.6
+     avg # margin viol: 0
+    non0 feature count: 528
+           avg list sz: 85.7
+           avg f count: 102.75
+(time 0.083 min, 0.5 s/S)
+
+Iteration #2 of 2.
+ . 10
+WEIGHTS
+              Glue = -1196
+       WordPenalty = +809.52
+     LanguageModel = +3112.1
+ LanguageModel_OOV = -1464
+     PhraseModel_0 = +3895.5
+     PhraseModel_1 = +4683.4
+     PhraseModel_2 = +1092.8
+     PhraseModel_3 = +1079.6
+     PhraseModel_4 = -6827.7
+     PhraseModel_5 = -888
+     PhraseModel_6 = +142
+       PassThrough = -1335
+        ---
+       1best avg score: 0.277 (+0.10736)
+ 1best avg model score: -3110.5 (-67595)
+           avg # pairs: 1144.2
+        avg # rank err: 529.1
+     avg # margin viol: 0
+    non0 feature count: 859
+           avg list sz: 74.9
+           avg f count: 112.84
+(time 0.067 min, 0.4 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 2 [SCORE 'stupid_bleu'=0.277].
+This took 0.15 min.
author	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2012-11-18 13:35:42 -0500
committer	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2012-11-18 13:35:42 -0500
commit	1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch)
tree	33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /training/dtrain/test/example
parent	7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff)