From 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 18 Nov 2012 13:35:42 -0500 Subject: major restructure of the training code --- dtrain/test/example/README | 8 ---- dtrain/test/example/cdec.ini | 25 ----------- dtrain/test/example/dtrain.ini | 22 --------- dtrain/test/example/expected-output | 89 ------------------------------------- dtrain/test/parallelize/cdec.ini | 22 --------- dtrain/test/parallelize/dtrain.ini | 15 ------- dtrain/test/parallelize/in | 10 ----- dtrain/test/parallelize/refs | 10 ----- dtrain/test/toy/cdec.ini | 2 - dtrain/test/toy/dtrain.ini | 12 ----- dtrain/test/toy/input | 2 - 11 files changed, 217 deletions(-) delete mode 100644 dtrain/test/example/README delete mode 100644 dtrain/test/example/cdec.ini delete mode 100644 dtrain/test/example/dtrain.ini delete mode 100644 dtrain/test/example/expected-output delete mode 100644 dtrain/test/parallelize/cdec.ini delete mode 100644 dtrain/test/parallelize/dtrain.ini delete mode 100644 dtrain/test/parallelize/in delete mode 100644 dtrain/test/parallelize/refs delete mode 100644 dtrain/test/toy/cdec.ini delete mode 100644 dtrain/test/toy/dtrain.ini delete mode 100644 dtrain/test/toy/input (limited to 'dtrain/test') diff --git a/dtrain/test/example/README b/dtrain/test/example/README deleted file mode 100644 index 6937b11b..00000000 --- a/dtrain/test/example/README +++ /dev/null @@ -1,8 +0,0 @@ -Small example of input format for distributed training. -Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . - -For this to work, undef 'DTRAIN_LOCAL' in dtrain.h -and recompile. - -Data is here: http://simianer.de/#dtrain - diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini deleted file mode 100644 index d5955f0e..00000000 --- a/dtrain/test/example/cdec.ini +++ /dev/null @@ -1,25 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -scfg_max_span_limit=15 -intersection_strategy=cube_pruning -cubepruning_pop_limit=30 -feature_function=WordPenalty -feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz -# all currently working feature functions for translation: -# (with those features active that were used in the ACL paper) -#feature_function=ArityPenalty -#feature_function=CMR2008ReorderingFeatures -#feature_function=Dwarf -#feature_function=InputIndicator -#feature_function=LexNullJump -#feature_function=NewJump -#feature_function=NgramFeatures -#feature_function=NonLatinCount -#feature_function=OutputIndicator -feature_function=RuleIdentityFeatures -feature_function=RuleSourceBigramFeatures -feature_function=RuleTargetBigramFeatures -feature_function=RuleShape -#feature_function=SourceSpanSizeFeatures -#feature_function=SourceWordPenalty -#feature_function=SpanFeatures diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini deleted file mode 100644 index 72d50ca1..00000000 --- a/dtrain/test/example/dtrain.ini +++ /dev/null @@ -1,22 +0,0 @@ -input=test/example/nc-wmt11.1k.gz # use '-' for STDIN -output=- # a weights file (add .gz for gzip compression) or STDOUT '-' -select_weights=VOID # don't output weights -decoder_config=test/example/cdec.ini # config for cdec -# weights for these features will be printed on each iteration -print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough -tmp=/tmp -stop_after=10 # stop epoch after 10 inputs - -# interesting stuff -epochs=2 # run over input 2 times -k=100 # use 100best lists -N=4 # optimize (approx) BLEU4 -scorer=stupid_bleu # use 'stupid' BLEU+1 -learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) -gamma=0 # use SVM reg -sample_from=kbest # use kbest lists (as opposed to forest) -filter=uniq # only unique entries in kbest (surface form) -pair_sampling=XYX -hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here -pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) -loss_margin=0 diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output deleted file mode 100644 index 05326763..00000000 --- a/dtrain/test/example/expected-output +++ /dev/null @@ -1,89 +0,0 @@ - cdec cfg 'test/example/cdec.ini' -Loading the LM will be faster if you build a binary file. -Reading test/example/nc-wmt11.en.srilm.gz -----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 -**************************************************************************************************** - Example feature: Shape_S00000_T00000 -Seeding random number sequence to 2912000813 - -dtrain -Parameters: - k 100 - N 4 - T 2 - scorer 'stupid_bleu' - sample from 'kbest' - filter 'uniq' - learning rate 1 - gamma 0 - loss margin 0 - pairs 'XYX' - hi lo 0.1 - pair threshold 0 - select weights 'VOID' - l1 reg 0 'none' - max pairs 4294967295 - cdec cfg 'test/example/cdec.ini' - input 'test/example/nc-wmt11.1k.gz' - output '-' - stop_after 10 -(a dot represents 10 inputs) -Iteration #1 of 2. - . 10 -Stopping after 10 input sentences. -WEIGHTS - Glue = -637 - WordPenalty = +1064 - LanguageModel = +1175.3 - LanguageModel_OOV = -1437 - PhraseModel_0 = +1935.6 - PhraseModel_1 = +2499.3 - PhraseModel_2 = +964.96 - PhraseModel_3 = +1410.8 - PhraseModel_4 = -5977.9 - PhraseModel_5 = +522 - PhraseModel_6 = +1089 - PassThrough = -1308 - --- - 1best avg score: 0.16963 (+0.16963) - 1best avg model score: 64485 (+64485) - avg # pairs: 1494.4 - avg # rank err: 702.6 - avg # margin viol: 0 - non0 feature count: 528 - avg list sz: 85.7 - avg f count: 102.75 -(time 0.083 min, 0.5 s/S) - -Iteration #2 of 2. - . 10 -WEIGHTS - Glue = -1196 - WordPenalty = +809.52 - LanguageModel = +3112.1 - LanguageModel_OOV = -1464 - PhraseModel_0 = +3895.5 - PhraseModel_1 = +4683.4 - PhraseModel_2 = +1092.8 - PhraseModel_3 = +1079.6 - PhraseModel_4 = -6827.7 - PhraseModel_5 = -888 - PhraseModel_6 = +142 - PassThrough = -1335 - --- - 1best avg score: 0.277 (+0.10736) - 1best avg model score: -3110.5 (-67595) - avg # pairs: 1144.2 - avg # rank err: 529.1 - avg # margin viol: 0 - non0 feature count: 859 - avg list sz: 74.9 - avg f count: 112.84 -(time 0.067 min, 0.4 s/S) - -Writing weights file to '-' ... -done - ---- -Best iteration: 2 [SCORE 'stupid_bleu'=0.277]. -This took 0.15 min. diff --git a/dtrain/test/parallelize/cdec.ini b/dtrain/test/parallelize/cdec.ini deleted file mode 100644 index 72e99dc5..00000000 --- a/dtrain/test/parallelize/cdec.ini +++ /dev/null @@ -1,22 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -intersection_strategy=cube_pruning -cubepruning_pop_limit=200 -scfg_max_span_limit=15 -feature_function=WordPenalty -feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 -#feature_function=ArityPenalty -#feature_function=CMR2008ReorderingFeatures -#feature_function=Dwarf -#feature_function=InputIndicator -#feature_function=LexNullJump -#feature_function=NewJump -#feature_function=NgramFeatures -#feature_function=NonLatinCount -#feature_function=OutputIndicator -#feature_function=RuleIdentityFeatures -#feature_function=RuleNgramFeatures -#feature_function=RuleShape -#feature_function=SourceSpanSizeFeatures -#feature_function=SourceWordPenalty -#feature_function=SpanFeatures diff --git a/dtrain/test/parallelize/dtrain.ini b/dtrain/test/parallelize/dtrain.ini deleted file mode 100644 index 03f9d240..00000000 --- a/dtrain/test/parallelize/dtrain.ini +++ /dev/null @@ -1,15 +0,0 @@ -k=100 -N=4 -learning_rate=0.0001 -gamma=0 -loss_margin=0 -epochs=1 -scorer=stupid_bleu -sample_from=kbest -filter=uniq -pair_sampling=XYX -hi_lo=0.1 -select_weights=last -print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough -tmp=/tmp -decoder_config=cdec.ini diff --git a/dtrain/test/parallelize/in b/dtrain/test/parallelize/in deleted file mode 100644 index a312809f..00000000 --- a/dtrain/test/parallelize/in +++ /dev/null @@ -1,10 +0,0 @@ -barack obama erhält als vierter us @-@ präsident den frieden nobelpreis -der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen . -darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern . -der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein . -zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben . -das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant . -nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt . -diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt . -das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird . -der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt . diff --git a/dtrain/test/parallelize/refs b/dtrain/test/parallelize/refs deleted file mode 100644 index 4d3128cb..00000000 --- a/dtrain/test/parallelize/refs +++ /dev/null @@ -1,10 +0,0 @@ -barack obama becomes the fourth american president to receive the nobel peace prize -the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . -he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . -the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . -first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . -the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . -then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . -he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . -the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . -the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . diff --git a/dtrain/test/toy/cdec.ini b/dtrain/test/toy/cdec.ini deleted file mode 100644 index 98b02d44..00000000 --- a/dtrain/test/toy/cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -add_pass_through_rules=true diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini deleted file mode 100644 index a091732f..00000000 --- a/dtrain/test/toy/dtrain.ini +++ /dev/null @@ -1,12 +0,0 @@ -decoder_config=test/toy/cdec.ini -input=test/toy/input -output=- -print_weights=logp shell_rule house_rule small_rule little_rule PassThrough -k=4 -N=4 -epochs=2 -scorer=bleu -sample_from=kbest -filter=uniq -pair_sampling=all -learning_rate=1 diff --git a/dtrain/test/toy/input b/dtrain/test/toy/input deleted file mode 100644 index 4d10a9ea..00000000 --- a/dtrain/test/toy/input +++ /dev/null @@ -1,2 +0,0 @@ -0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 -1 ich fand ein kleines haus i found a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 -- cgit v1.2.3