summaryrefslogtreecommitdiff
path: root/dtrain
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2012-11-05 10:25:54 -0800
committerChris Dyer <redpony@gmail.com>2012-11-05 10:25:54 -0800
commit2eb77b77d28f6d9dc88ecb6ca999994bdb555445 (patch)
tree1e19af403041bd86933ce88fa1a70030ab100ca2 /dtrain
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
parent0c54220adfaada6ad1e2d54f31a9895da35127fd (diff)
Merge pull request #10 from pks/master
renamed RuleNgramFeatures
Diffstat (limited to 'dtrain')
-rw-r--r--dtrain/dtrain.cc4
-rw-r--r--dtrain/dtrain.h3
-rwxr-xr-xdtrain/parallelize.rb79
-rw-r--r--dtrain/parallelize/test/cdec.ini22
-rw-r--r--dtrain/parallelize/test/dtrain.ini15
-rw-r--r--dtrain/parallelize/test/in10
-rw-r--r--dtrain/parallelize/test/refs10
-rw-r--r--dtrain/test/example/cdec.ini3
-rw-r--r--dtrain/test/example/dtrain.ini6
-rw-r--r--dtrain/test/example/expected-output130
-rw-r--r--dtrain/test/parallelize/cdec.ini22
-rw-r--r--dtrain/test/parallelize/dtrain.ini15
-rw-r--r--dtrain/test/parallelize/in10
-rw-r--r--dtrain/test/parallelize/refs10
14 files changed, 248 insertions, 91 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index b7a4bb6f..18286668 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -24,13 +24,13 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_")
- ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
+ ("learning_rate", po::value<weight_t>()->default_value(1.0), "learning rate")
("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
- ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO
+ ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 7e084a79..4b6f415c 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -3,7 +3,7 @@
#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
// DO NOT USE WITH SVM!
-#define DTRAIN_LOCAL
+//#define DTRAIN_LOCAL
#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
#define DTRAIN_SCALE 100000
@@ -22,7 +22,6 @@
#include "filelib.h"
-
using namespace std;
using namespace dtrain;
namespace po = boost::program_options;
diff --git a/dtrain/parallelize.rb b/dtrain/parallelize.rb
new file mode 100755
index 00000000..1d277ff6
--- /dev/null
+++ b/dtrain/parallelize.rb
@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+
+
+if ARGV.size != 5
+ STDERR.write "Usage: "
+ STDERR.write "ruby parallelize.rb <#shards> <input> <refs> <epochs> <dtrain.ini>\n"
+ exit
+end
+
+dtrain_bin = '/home/pks/bin/dtrain_local'
+ruby = '/usr/bin/ruby'
+lplp_rb = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb'
+lplp_args = 'l2 select_k 100000'
+gzip = '/bin/gzip'
+
+num_shards = ARGV[0].to_i
+input = ARGV[1]
+refs = ARGV[2]
+epochs = ARGV[3].to_i
+ini = ARGV[4]
+
+
+`mkdir work`
+
+def make_shards(input, refs, num_shards)
+ lc = `wc -l #{input}`.split.first.to_i
+ shard_sz = lc / num_shards
+ leftover = lc % num_shards
+ in_f = File.new input, 'r'
+ refs_f = File.new refs, 'r'
+ shard_in_files = []
+ shard_refs_files = []
+ 0.upto(num_shards-1) { |shard|
+ shard_in = File.new "work/shard.#{shard}.in", 'w+'
+ shard_refs = File.new "work/shard.#{shard}.refs", 'w+'
+ 0.upto(shard_sz-1) { |i|
+ shard_in.write in_f.gets
+ shard_refs.write refs_f.gets
+ }
+ shard_in_files << shard_in
+ shard_refs_files << shard_refs
+ }
+ while leftover > 0
+ shard_in_files[-1].write in_f.gets
+ shard_refs_files[-1].write refs_f.gets
+ leftover -= 1
+ end
+ (shard_in_files + shard_refs_files).each do |f| f.close end
+ in_f.close
+ refs_f.close
+end
+
+make_shards input, refs, num_shards
+
+0.upto(epochs-1) { |epoch|
+ pids = []
+ input_weights = ''
+ if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
+ weights_files = []
+ 0.upto(num_shards-1) { |shard|
+ pids << Kernel.fork {
+ `#{dtrain_bin} -c #{ini}\
+ --input work/shard.#{shard}.in\
+ --refs work/shard.#{shard}.refs #{input_weights}\
+ --output work/weights.#{shard}.#{epoch}\
+ &> work/out.#{shard}.#{epoch}`
+ }
+ weights_files << "work/weights.#{shard}.#{epoch}"
+ }
+ pids.each { |pid| Process.wait(pid) }
+ cat = File.new('work/weights_cat', 'w+')
+ weights_files.each { |f| cat.write File.new(f, 'r').read }
+ cat.close
+ `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat &> work/weights.#{epoch}`
+}
+
+`rm work/weights_cat`
+`#{gzip} work/*`
+
diff --git a/dtrain/parallelize/test/cdec.ini b/dtrain/parallelize/test/cdec.ini
new file mode 100644
index 00000000..72e99dc5
--- /dev/null
+++ b/dtrain/parallelize/test/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/dtrain/parallelize/test/dtrain.ini b/dtrain/parallelize/test/dtrain.ini
new file mode 100644
index 00000000..03f9d240
--- /dev/null
+++ b/dtrain/parallelize/test/dtrain.ini
@@ -0,0 +1,15 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+decoder_config=cdec.ini
diff --git a/dtrain/parallelize/test/in b/dtrain/parallelize/test/in
new file mode 100644
index 00000000..a312809f
--- /dev/null
+++ b/dtrain/parallelize/test/in
@@ -0,0 +1,10 @@
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/dtrain/parallelize/test/refs b/dtrain/parallelize/test/refs
new file mode 100644
index 00000000..4d3128cb
--- /dev/null
+++ b/dtrain/parallelize/test/refs
@@ -0,0 +1,10 @@
+barack obama becomes the fourth american president to receive the nobel peace prize
+the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
+he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
+the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
+first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
+the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
+he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
+the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
+the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
index 6642107f..d5955f0e 100644
--- a/dtrain/test/example/cdec.ini
+++ b/dtrain/test/example/cdec.ini
@@ -17,7 +17,8 @@ feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
#feature_function=NonLatinCount
#feature_function=OutputIndicator
feature_function=RuleIdentityFeatures
-feature_function=RuleNgramFeatures
+feature_function=RuleSourceBigramFeatures
+feature_function=RuleTargetBigramFeatures
feature_function=RuleShape
#feature_function=SourceSpanSizeFeatures
#feature_function=SourceWordPenalty
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index c8ac7c3f..72d50ca1 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,6 +1,6 @@
input=test/example/nc-wmt11.1k.gz # use '-' for STDIN
output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
-select_weights=VOID # don't output weights
+select_weights=VOID # don't output weights
decoder_config=test/example/cdec.ini # config for cdec
# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
@@ -8,11 +8,11 @@ tmp=/tmp
stop_after=10 # stop epoch after 10 inputs
# interesting stuff
-epochs=3 # run over input 3 times
+epochs=2 # run over input 2 times
k=100 # use 100best lists
N=4 # optimize (approx) BLEU4
scorer=stupid_bleu # use 'stupid' BLEU+1
-learning_rate=0.0001 # learning rate
+learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron)
gamma=0 # use SVM reg
sample_from=kbest # use kbest lists (as opposed to forest)
filter=uniq # only unique entries in kbest (surface form)
diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output
index 25d2c069..05326763 100644
--- a/dtrain/test/example/expected-output
+++ b/dtrain/test/example/expected-output
@@ -1,31 +1,20 @@
cdec cfg 'test/example/cdec.ini'
-feature: WordPenalty (no config parameters)
-State is 0 bytes for feature WordPenalty
-feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz')
Loading the LM will be faster if you build a binary file.
Reading test/example/nc-wmt11.en.srilm.gz
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
-Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581)
-State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz
-feature: RuleIdentityFeatures (no config parameters)
-State is 0 bytes for feature RuleIdentityFeatures
-feature: RuleNgramFeatures (no config parameters)
-State is 0 bytes for feature RuleNgramFeatures
-feature: RuleShape (no config parameters)
Example feature: Shape_S00000_T00000
-State is 0 bytes for feature RuleShape
-Seeding random number sequence to 1072059181
+Seeding random number sequence to 2912000813
dtrain
Parameters:
k 100
N 4
- T 3
+ T 2
scorer 'stupid_bleu'
sample from 'kbest'
filter 'uniq'
- learning rate 0.0001
+ learning rate 1
gamma 0
loss margin 0
pairs 'XYX'
@@ -33,93 +22,68 @@ Parameters:
pair threshold 0
select weights 'VOID'
l1 reg 0 'none'
+ max pairs 4294967295
cdec cfg 'test/example/cdec.ini'
input 'test/example/nc-wmt11.1k.gz'
output '-'
stop_after 10
(a dot represents 10 inputs)
-Iteration #1 of 3.
+Iteration #1 of 2.
. 10
Stopping after 10 input sentences.
WEIGHTS
- Glue = -0.0293
- WordPenalty = +0.049075
- LanguageModel = +0.24345
- LanguageModel_OOV = -0.2029
- PhraseModel_0 = +0.0084102
- PhraseModel_1 = +0.021729
- PhraseModel_2 = +0.014922
- PhraseModel_3 = +0.104
- PhraseModel_4 = -0.14308
- PhraseModel_5 = +0.0247
- PhraseModel_6 = -0.012
- PassThrough = -0.2161
+ Glue = -637
+ WordPenalty = +1064
+ LanguageModel = +1175.3
+ LanguageModel_OOV = -1437
+ PhraseModel_0 = +1935.6
+ PhraseModel_1 = +2499.3
+ PhraseModel_2 = +964.96
+ PhraseModel_3 = +1410.8
+ PhraseModel_4 = -5977.9
+ PhraseModel_5 = +522
+ PhraseModel_6 = +1089
+ PassThrough = -1308
---
- 1best avg score: 0.16872 (+0.16872)
- 1best avg model score: -1.8276 (-1.8276)
- avg # pairs: 1121.1
- avg # rank err: 555.6
+ 1best avg score: 0.16963 (+0.16963)
+ 1best avg model score: 64485 (+64485)
+ avg # pairs: 1494.4
+ avg # rank err: 702.6
avg # margin viol: 0
- non0 feature count: 277
- avg list sz: 77.2
- avg f count: 90.96
-(time 0.1 min, 0.6 s/S)
-
-Iteration #2 of 3.
- . 10
-WEIGHTS
- Glue = -0.3526
- WordPenalty = +0.067576
- LanguageModel = +1.155
- LanguageModel_OOV = -0.2728
- PhraseModel_0 = -0.025529
- PhraseModel_1 = +0.095869
- PhraseModel_2 = +0.094567
- PhraseModel_3 = +0.12482
- PhraseModel_4 = -0.36533
- PhraseModel_5 = +0.1068
- PhraseModel_6 = -0.1517
- PassThrough = -0.286
- ---
- 1best avg score: 0.18394 (+0.015221)
- 1best avg model score: 3.205 (+5.0326)
- avg # pairs: 1168.3
- avg # rank err: 594.8
- avg # margin viol: 0
- non0 feature count: 543
- avg list sz: 77.5
- avg f count: 85.916
+ non0 feature count: 528
+ avg list sz: 85.7
+ avg f count: 102.75
(time 0.083 min, 0.5 s/S)
-Iteration #3 of 3.
+Iteration #2 of 2.
. 10
WEIGHTS
- Glue = -0.392
- WordPenalty = +0.071963
- LanguageModel = +0.81266
- LanguageModel_OOV = -0.4177
- PhraseModel_0 = -0.2649
- PhraseModel_1 = -0.17931
- PhraseModel_2 = +0.038261
- PhraseModel_3 = +0.20261
- PhraseModel_4 = -0.42621
- PhraseModel_5 = +0.3198
- PhraseModel_6 = -0.1437
- PassThrough = -0.4309
+ Glue = -1196
+ WordPenalty = +809.52
+ LanguageModel = +3112.1
+ LanguageModel_OOV = -1464
+ PhraseModel_0 = +3895.5
+ PhraseModel_1 = +4683.4
+ PhraseModel_2 = +1092.8
+ PhraseModel_3 = +1079.6
+ PhraseModel_4 = -6827.7
+ PhraseModel_5 = -888
+ PhraseModel_6 = +142
+ PassThrough = -1335
---
- 1best avg score: 0.2962 (+0.11225)
- 1best avg model score: -36.274 (-39.479)
- avg # pairs: 1109.6
- avg # rank err: 515.9
+ 1best avg score: 0.277 (+0.10736)
+ 1best avg model score: -3110.5 (-67595)
+ avg # pairs: 1144.2
+ avg # rank err: 529.1
avg # margin viol: 0
- non0 feature count: 741
- avg list sz: 77
- avg f count: 88.982
-(time 0.083 min, 0.5 s/S)
+ non0 feature count: 859
+ avg list sz: 74.9
+ avg f count: 112.84
+(time 0.067 min, 0.4 s/S)
Writing weights file to '-' ...
done
---
-Best iteration: 3 [SCORE 'stupid_bleu'=0.2962].
-This took 0.26667 min.
+Best iteration: 2 [SCORE 'stupid_bleu'=0.277].
+This took 0.15 min.
diff --git a/dtrain/test/parallelize/cdec.ini b/dtrain/test/parallelize/cdec.ini
new file mode 100644
index 00000000..72e99dc5
--- /dev/null
+++ b/dtrain/test/parallelize/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/dtrain/test/parallelize/dtrain.ini b/dtrain/test/parallelize/dtrain.ini
new file mode 100644
index 00000000..03f9d240
--- /dev/null
+++ b/dtrain/test/parallelize/dtrain.ini
@@ -0,0 +1,15 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+decoder_config=cdec.ini
diff --git a/dtrain/test/parallelize/in b/dtrain/test/parallelize/in
new file mode 100644
index 00000000..a312809f
--- /dev/null
+++ b/dtrain/test/parallelize/in
@@ -0,0 +1,10 @@
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/dtrain/test/parallelize/refs b/dtrain/test/parallelize/refs
new file mode 100644
index 00000000..4d3128cb
--- /dev/null
+++ b/dtrain/test/parallelize/refs
@@ -0,0 +1,10 @@
+barack obama becomes the fourth american president to receive the nobel peace prize
+the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
+he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
+the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
+first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
+the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
+he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
+the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
+the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .