From 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 18 Nov 2012 13:35:42 -0500 Subject: major restructure of the training code --- training/dtrain/test/example/README | 8 +++ training/dtrain/test/example/cdec.ini | 25 ++++++++ training/dtrain/test/example/dtrain.ini | 22 +++++++ training/dtrain/test/example/expected-output | 89 ++++++++++++++++++++++++++++ training/dtrain/test/parallelize/cdec.ini | 22 +++++++ training/dtrain/test/parallelize/dtrain.ini | 15 +++++ training/dtrain/test/parallelize/in | 10 ++++ training/dtrain/test/parallelize/refs | 10 ++++ training/dtrain/test/toy/cdec.ini | 2 + training/dtrain/test/toy/dtrain.ini | 12 ++++ training/dtrain/test/toy/input | 2 + 11 files changed, 217 insertions(+) create mode 100644 training/dtrain/test/example/README create mode 100644 training/dtrain/test/example/cdec.ini create mode 100644 training/dtrain/test/example/dtrain.ini create mode 100644 training/dtrain/test/example/expected-output create mode 100644 training/dtrain/test/parallelize/cdec.ini create mode 100644 training/dtrain/test/parallelize/dtrain.ini create mode 100644 training/dtrain/test/parallelize/in create mode 100644 training/dtrain/test/parallelize/refs create mode 100644 training/dtrain/test/toy/cdec.ini create mode 100644 training/dtrain/test/toy/dtrain.ini create mode 100644 training/dtrain/test/toy/input (limited to 'training/dtrain/test') diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README new file mode 100644 index 00000000..6937b11b --- /dev/null +++ b/training/dtrain/test/example/README @@ -0,0 +1,8 @@ +Small example of input format for distributed training. +Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . + +For this to work, undef 'DTRAIN_LOCAL' in dtrain.h +and recompile. + +Data is here: http://simianer.de/#dtrain + diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini new file mode 100644 index 00000000..d5955f0e --- /dev/null +++ b/training/dtrain/test/example/cdec.ini @@ -0,0 +1,25 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=30 +feature_function=WordPenalty +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +# all currently working feature functions for translation: +# (with those features active that were used in the ACL paper) +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +feature_function=RuleIdentityFeatures +feature_function=RuleSourceBigramFeatures +feature_function=RuleTargetBigramFeatures +feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini new file mode 100644 index 00000000..72d50ca1 --- /dev/null +++ b/training/dtrain/test/example/dtrain.ini @@ -0,0 +1,22 @@ +input=test/example/nc-wmt11.1k.gz # use '-' for STDIN +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +select_weights=VOID # don't output weights +decoder_config=test/example/cdec.ini # config for cdec +# weights for these features will be printed on each iteration +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +stop_after=10 # stop epoch after 10 inputs + +# interesting stuff +epochs=2 # run over input 2 times +k=100 # use 100best lists +N=4 # optimize (approx) BLEU4 +scorer=stupid_bleu # use 'stupid' BLEU+1 +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +gamma=0 # use SVM reg +sample_from=kbest # use kbest lists (as opposed to forest) +filter=uniq # only unique entries in kbest (surface form) +pair_sampling=XYX +hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here +pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) +loss_margin=0 diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output new file mode 100644 index 00000000..05326763 --- /dev/null +++ b/training/dtrain/test/example/expected-output @@ -0,0 +1,89 @@ + cdec cfg 'test/example/cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading test/example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** + Example feature: Shape_S00000_T00000 +Seeding random number sequence to 2912000813 + +dtrain +Parameters: + k 100 + N 4 + T 2 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'VOID' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'test/example/cdec.ini' + input 'test/example/nc-wmt11.1k.gz' + output '-' + stop_after 10 +(a dot represents 10 inputs) +Iteration #1 of 2. + . 10 +Stopping after 10 input sentences. +WEIGHTS + Glue = -637 + WordPenalty = +1064 + LanguageModel = +1175.3 + LanguageModel_OOV = -1437 + PhraseModel_0 = +1935.6 + PhraseModel_1 = +2499.3 + PhraseModel_2 = +964.96 + PhraseModel_3 = +1410.8 + PhraseModel_4 = -5977.9 + PhraseModel_5 = +522 + PhraseModel_6 = +1089 + PassThrough = -1308 + --- + 1best avg score: 0.16963 (+0.16963) + 1best avg model score: 64485 (+64485) + avg # pairs: 1494.4 + avg # rank err: 702.6 + avg # margin viol: 0 + non0 feature count: 528 + avg list sz: 85.7 + avg f count: 102.75 +(time 0.083 min, 0.5 s/S) + +Iteration #2 of 2. + . 10 +WEIGHTS + Glue = -1196 + WordPenalty = +809.52 + LanguageModel = +3112.1 + LanguageModel_OOV = -1464 + PhraseModel_0 = +3895.5 + PhraseModel_1 = +4683.4 + PhraseModel_2 = +1092.8 + PhraseModel_3 = +1079.6 + PhraseModel_4 = -6827.7 + PhraseModel_5 = -888 + PhraseModel_6 = +142 + PassThrough = -1335 + --- + 1best avg score: 0.277 (+0.10736) + 1best avg model score: -3110.5 (-67595) + avg # pairs: 1144.2 + avg # rank err: 529.1 + avg # margin viol: 0 + non0 feature count: 859 + avg list sz: 74.9 + avg f count: 112.84 +(time 0.067 min, 0.4 s/S) + +Writing weights file to '-' ... +done + +--- +Best iteration: 2 [SCORE 'stupid_bleu'=0.277]. +This took 0.15 min. diff --git a/training/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini new file mode 100644 index 00000000..72e99dc5 --- /dev/null +++ b/training/dtrain/test/parallelize/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini new file mode 100644 index 00000000..03f9d240 --- /dev/null +++ b/training/dtrain/test/parallelize/dtrain.ini @@ -0,0 +1,15 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +decoder_config=cdec.ini diff --git a/training/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in new file mode 100644 index 00000000..a312809f --- /dev/null +++ b/training/dtrain/test/parallelize/in @@ -0,0 +1,10 @@ +barack obama erhält als vierter us @-@ präsident den frieden nobelpreis +der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen . +darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern . +der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein . +zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben . +das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant . +nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt . +diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt . +das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird . +der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt . diff --git a/training/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs new file mode 100644 index 00000000..4d3128cb --- /dev/null +++ b/training/dtrain/test/parallelize/refs @@ -0,0 +1,10 @@ +barack obama becomes the fourth american president to receive the nobel peace prize +the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . +he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . +the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . +first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . +the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . +then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . +he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . +the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . +the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . diff --git a/training/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini new file mode 100644 index 00000000..98b02d44 --- /dev/null +++ b/training/dtrain/test/toy/cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +add_pass_through_rules=true diff --git a/training/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini new file mode 100644 index 00000000..a091732f --- /dev/null +++ b/training/dtrain/test/toy/dtrain.ini @@ -0,0 +1,12 @@ +decoder_config=test/toy/cdec.ini +input=test/toy/input +output=- +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +k=4 +N=4 +epochs=2 +scorer=bleu +sample_from=kbest +filter=uniq +pair_sampling=all +learning_rate=1 diff --git a/training/dtrain/test/toy/input b/training/dtrain/test/toy/input new file mode 100644 index 00000000..4d10a9ea --- /dev/null +++ b/training/dtrain/test/toy/input @@ -0,0 +1,2 @@ +0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 +1 ich fand ein kleines haus i found a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 -- cgit v1.2.3