From 4d48a6d19521b24d9ac0987ce9a472d9ba574c4b Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 24 Jun 2012 16:42:56 +0200 Subject: RuleTargetBigramFeatures, parallelize.rb --- decoder/cdec_ff.cc | 3 +- decoder/ff_rules.cc | 48 ++++++++++++++++++++--- decoder/ff_rules.h | 19 ++++++++- dtrain/parallelize/parallelize.rb | 79 ++++++++++++++++++++++++++++++++++++++ dtrain/parallelize/test/cdec.ini | 22 +++++++++++ dtrain/parallelize/test/dtrain.ini | 15 ++++++++ dtrain/parallelize/test/in | 10 +++++ dtrain/parallelize/test/refs | 10 +++++ 8 files changed, 198 insertions(+), 8 deletions(-) create mode 100755 dtrain/parallelize/parallelize.rb create mode 100644 dtrain/parallelize/test/cdec.ini create mode 100644 dtrain/parallelize/test/dtrain.ini create mode 100644 dtrain/parallelize/test/in create mode 100644 dtrain/parallelize/test/refs diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index b516c386..d64bdada 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -47,8 +47,9 @@ void register_feature_functions() { ff_registry.Register("RuleIdentityFeatures", new FFFactory()); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); - ff_registry.Register("RuleNgramFeatures", new FFFactory()); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory()); + ff_registry.Register("RuleSourceBigramFeatures", new FFFactory()); + ff_registry.Register("RuleTargetBigramFeatures", new FFFactory()); ff_registry.Register("KLanguageModel", new KLanguageModelFactory()); ff_registry.Register("NonLatinCount", new FFFactory); ff_registry.Register("RuleShape", new FFFactory); diff --git a/decoder/ff_rules.cc b/decoder/ff_rules.cc index bd4c4cc0..3d0e514a 100644 --- a/decoder/ff_rules.cc +++ b/decoder/ff_rules.cc @@ -66,15 +66,15 @@ void RuleIdentityFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, features->add_value(it->second, 1); } -RuleNgramFeatures::RuleNgramFeatures(const std::string& param) { +RuleSourceBigramFeatures::RuleSourceBigramFeatures(const std::string& param) { } -void RuleNgramFeatures::PrepareForInput(const SentenceMetadata& smeta) { +void RuleSourceBigramFeatures::PrepareForInput(const SentenceMetadata& smeta) { // std::map > rule2_feats_.clear(); } -void RuleNgramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void RuleSourceBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector& ant_contexts, SparseVector* features, @@ -92,14 +92,52 @@ void RuleNgramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, assert(w > 0); const string& cur = TD::Convert(w); ostringstream os; - os << "RB:" << prev << '_' << cur; + os << "RBS:" << prev << '_' << cur; const int fid = FD::Convert(Escape(os.str())); if (fid <= 0) return; f.add_value(fid, 1.0); prev = cur; } ostringstream os; - os << "RB:" << prev << '_' << ""; + os << "RBS:" << prev << '_' << ""; + f.set_value(FD::Convert(Escape(os.str())), 1.0); + } + (*features) += it->second; +} + +RuleTargetBigramFeatures::RuleTargetBigramFeatures(const std::string& param) { +} + +void RuleTargetBigramFeatures::PrepareForInput(const SentenceMetadata& smeta) { + rule2_feats_.clear(); +} + +void RuleTargetBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + map >::iterator it = rule2_feats_.find(edge.rule_.get()); + if 
(it == rule2_feats_.end()) { + const TRule& rule = *edge.rule_; + it = rule2_feats_.insert(make_pair(&rule, SparseVector())).first; + SparseVector& f = it->second; + string prev = ""; + for (int i = 0; i < rule.e_.size(); ++i) { + WordID w = rule.e_[i]; + if (w < 0) w = -w; + if (w == 0) return; + const string& cur = TD::Convert(w); + ostringstream os; + os << "RBT:" << prev << '_' << cur; + const int fid = FD::Convert(Escape(os.str())); + if (fid <= 0) return; + f.add_value(fid, 1.0); + prev = cur; + } + ostringstream os; + os << "RBT:" << prev << '_' << ""; f.set_value(FD::Convert(Escape(os.str())), 1.0); } (*features) += it->second; diff --git a/decoder/ff_rules.h b/decoder/ff_rules.h index 48d8bd05..08b168b0 100644 --- a/decoder/ff_rules.h +++ b/decoder/ff_rules.h @@ -22,9 +22,24 @@ class RuleIdentityFeatures : public FeatureFunction { mutable std::map rule2_fid_; }; -class RuleNgramFeatures : public FeatureFunction { +class RuleSourceBigramFeatures : public FeatureFunction { public: - RuleNgramFeatures(const std::string& param); + RuleSourceBigramFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + mutable std::map > rule2_feats_; +}; + +class RuleTargetBigramFeatures : public FeatureFunction { + public: + RuleTargetBigramFeatures(const std::string& param); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, diff --git a/dtrain/parallelize/parallelize.rb b/dtrain/parallelize/parallelize.rb new file mode 100755 index 00000000..e88d9eef --- /dev/null +++ b/dtrain/parallelize/parallelize.rb @@ -0,0 +1,79 @@ +#!/usr/bin/env ruby + + +if ARGV.size != 5 + STDERR.write "Usage: " + STDERR.write "ruby parallelize.rb <#shards> \n" + exit +end + +dtrain_bin = '/home/pks/mt/cdec-dtrain/dtrain/dtrain_local_new_f' +ruby = '/usr/bin/ruby' +lplp_rb = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb' +lplp_args = 'l2 select_k 100000' +gzip = '/bin/gzip' + +num_shards = ARGV[0].to_i +input = ARGV[1] +refs = ARGV[2] +epochs = ARGV[3].to_i +ini = ARGV[4] + + +`mkdir work` + +def make_shards(input, refs, num_shards) + lc = `wc -l #{input}`.split.first.to_i + shard_sz = lc / num_shards + leftover = lc % num_shards + in_f = File.new input, 'r' + refs_f = File.new refs, 'r' + shard_in_files = [] + shard_refs_files = [] + 0.upto(num_shards-1) { |shard| + shard_in = File.new "work/shard.#{shard}.in", 'w+' + shard_refs = File.new "work/shard.#{shard}.refs", 'w+' + 0.upto(shard_sz-1) { |i| + shard_in.write in_f.gets + shard_refs.write refs_f.gets + } + shard_in_files << shard_in + shard_refs_files << shard_refs + } + while leftover > 0 + shard_in_files[-1].write in_f.gets + shard_refs_files[-1].write refs_f.gets + leftover -= 1 + end + (shard_in_files + shard_refs_files).each do |f| f.close end + in_f.close + refs_f.close +end + +make_shards input, refs, num_shards + +0.upto(epochs-1) { |epoch| + pids = [] + input_weights = '' + if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end + weights_files = [] + 0.upto(num_shards-1) { |shard| + pids << Kernel.fork { + `#{dtrain_bin} -c #{ini}\ + --input work/shard.#{shard}.in\ + --refs work/shard.#{shard}.refs #{input_weights}\ + --output work/weights.#{shard}.#{epoch}\ + &> 
work/out.#{shard}.#{epoch}` + } + weights_files << "work/weights.#{shard}.#{epoch}" + } + pids.each { |pid| Process.wait(pid) } + cat = File.new('work/weights_cat', 'w+') + weights_files.each { |f| cat.write File.new(f, 'r').read } + cat.close + `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat &> work/weights.#{epoch}` +} + +`rm work/weights_cat` +`#{gzip} work/*` + diff --git a/dtrain/parallelize/test/cdec.ini b/dtrain/parallelize/test/cdec.ini new file mode 100644 index 00000000..72e99dc5 --- /dev/null +++ b/dtrain/parallelize/test/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/parallelize/test/dtrain.ini b/dtrain/parallelize/test/dtrain.ini new file mode 100644 index 00000000..03f9d240 --- /dev/null +++ b/dtrain/parallelize/test/dtrain.ini @@ -0,0 +1,15 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +decoder_config=cdec.ini diff --git a/dtrain/parallelize/test/in b/dtrain/parallelize/test/in new file mode 100644 index 00000000..a312809f --- /dev/null +++ b/dtrain/parallelize/test/in @@ -0,0 +1,10 @@ +barack obama erhält als vierter us @-@ präsident den frieden nobelpreis +der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen . +darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern . +der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein . +zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben . +das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant . +nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt . +diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt . +das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird . 
+der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt . diff --git a/dtrain/parallelize/test/refs b/dtrain/parallelize/test/refs new file mode 100644 index 00000000..4d3128cb --- /dev/null +++ b/dtrain/parallelize/test/refs @@ -0,0 +1,10 @@ +barack obama becomes the fourth american president to receive the nobel peace prize +the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . +he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . +the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . +first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . +the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . +then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . +he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . +the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . +the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . -- cgit v1.2.3 From d04c516536db996e5fe5b94b8e5dea4ce2e04b4a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 24 Jun 2012 16:43:54 +0200 Subject: move --- dtrain/parallelize.rb | 79 +++++++++++++++++++++++++++++++++++++++ dtrain/parallelize/parallelize.rb | 79 --------------------------------------- 2 files changed, 79 insertions(+), 79 deletions(-) create mode 100755 dtrain/parallelize.rb delete mode 100755 dtrain/parallelize/parallelize.rb diff --git a/dtrain/parallelize.rb b/dtrain/parallelize.rb new file mode 100755 index 00000000..e88d9eef --- /dev/null +++ b/dtrain/parallelize.rb @@ -0,0 +1,79 @@ +#!/usr/bin/env ruby + + +if ARGV.size != 5 + STDERR.write "Usage: " + STDERR.write "ruby parallelize.rb <#shards> \n" + exit +end + +dtrain_bin = '/home/pks/mt/cdec-dtrain/dtrain/dtrain_local_new_f' +ruby = '/usr/bin/ruby' +lplp_rb = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb' +lplp_args = 'l2 select_k 100000' +gzip = '/bin/gzip' + +num_shards = ARGV[0].to_i +input = ARGV[1] +refs = ARGV[2] +epochs = ARGV[3].to_i +ini = ARGV[4] + + +`mkdir work` + +def make_shards(input, refs, num_shards) + lc = `wc -l #{input}`.split.first.to_i + shard_sz = lc / num_shards + leftover = lc % num_shards + in_f = File.new input, 'r' + refs_f = File.new refs, 'r' + shard_in_files = [] + shard_refs_files = [] + 0.upto(num_shards-1) { |shard| + shard_in = File.new "work/shard.#{shard}.in", 'w+' + shard_refs = File.new "work/shard.#{shard}.refs", 'w+' + 0.upto(shard_sz-1) { |i| + shard_in.write in_f.gets + shard_refs.write refs_f.gets + } + shard_in_files << shard_in + shard_refs_files << shard_refs + } + while leftover > 0 + shard_in_files[-1].write in_f.gets + shard_refs_files[-1].write refs_f.gets + leftover -= 1 + end + (shard_in_files + shard_refs_files).each do |f| f.close end + 
in_f.close + refs_f.close +end + +make_shards input, refs, num_shards + +0.upto(epochs-1) { |epoch| + pids = [] + input_weights = '' + if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end + weights_files = [] + 0.upto(num_shards-1) { |shard| + pids << Kernel.fork { + `#{dtrain_bin} -c #{ini}\ + --input work/shard.#{shard}.in\ + --refs work/shard.#{shard}.refs #{input_weights}\ + --output work/weights.#{shard}.#{epoch}\ + &> work/out.#{shard}.#{epoch}` + } + weights_files << "work/weights.#{shard}.#{epoch}" + } + pids.each { |pid| Process.wait(pid) } + cat = File.new('work/weights_cat', 'w+') + weights_files.each { |f| cat.write File.new(f, 'r').read } + cat.close + `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat &> work/weights.#{epoch}` +} + +`rm work/weights_cat` +`#{gzip} work/*` + diff --git a/dtrain/parallelize/parallelize.rb b/dtrain/parallelize/parallelize.rb deleted file mode 100755 index e88d9eef..00000000 --- a/dtrain/parallelize/parallelize.rb +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env ruby - - -if ARGV.size != 5 - STDERR.write "Usage: " - STDERR.write "ruby parallelize.rb <#shards> \n" - exit -end - -dtrain_bin = '/home/pks/mt/cdec-dtrain/dtrain/dtrain_local_new_f' -ruby = '/usr/bin/ruby' -lplp_rb = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb' -lplp_args = 'l2 select_k 100000' -gzip = '/bin/gzip' - -num_shards = ARGV[0].to_i -input = ARGV[1] -refs = ARGV[2] -epochs = ARGV[3].to_i -ini = ARGV[4] - - -`mkdir work` - -def make_shards(input, refs, num_shards) - lc = `wc -l #{input}`.split.first.to_i - shard_sz = lc / num_shards - leftover = lc % num_shards - in_f = File.new input, 'r' - refs_f = File.new refs, 'r' - shard_in_files = [] - shard_refs_files = [] - 0.upto(num_shards-1) { |shard| - shard_in = File.new "work/shard.#{shard}.in", 'w+' - shard_refs = File.new "work/shard.#{shard}.refs", 'w+' - 0.upto(shard_sz-1) { |i| - shard_in.write in_f.gets - shard_refs.write refs_f.gets - } - shard_in_files << shard_in - shard_refs_files << shard_refs - } - while leftover > 0 - shard_in_files[-1].write in_f.gets - shard_refs_files[-1].write refs_f.gets - leftover -= 1 - end - (shard_in_files + shard_refs_files).each do |f| f.close end - in_f.close - refs_f.close -end - -make_shards input, refs, num_shards - -0.upto(epochs-1) { |epoch| - pids = [] - input_weights = '' - if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end - weights_files = [] - 0.upto(num_shards-1) { |shard| - pids << Kernel.fork { - `#{dtrain_bin} -c #{ini}\ - --input work/shard.#{shard}.in\ - --refs work/shard.#{shard}.refs #{input_weights}\ - --output work/weights.#{shard}.#{epoch}\ - &> work/out.#{shard}.#{epoch}` - } - weights_files << "work/weights.#{shard}.#{epoch}" - } - pids.each { |pid| Process.wait(pid) } - cat = File.new('work/weights_cat', 'w+') - weights_files.each { |f| cat.write File.new(f, 'r').read } - cat.close - `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat &> work/weights.#{epoch}` -} - -`rm work/weights_cat` -`#{gzip} work/*` - -- cgit v1.2.3 From 3a94ac22e5c60aa205f2b3dadf81b0666500e0c3 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 8 Jul 2012 14:26:37 +0200 Subject: parallel. 
test --- dtrain/test/parallelize/cdec.ini | 22 ++++++++++++++++++++++ dtrain/test/parallelize/dtrain.ini | 15 +++++++++++++++ dtrain/test/parallelize/in | 10 ++++++++++ dtrain/test/parallelize/refs | 10 ++++++++++ 4 files changed, 57 insertions(+) create mode 100644 dtrain/test/parallelize/cdec.ini create mode 100644 dtrain/test/parallelize/dtrain.ini create mode 100644 dtrain/test/parallelize/in create mode 100644 dtrain/test/parallelize/refs diff --git a/dtrain/test/parallelize/cdec.ini b/dtrain/test/parallelize/cdec.ini new file mode 100644 index 00000000..72e99dc5 --- /dev/null +++ b/dtrain/test/parallelize/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/test/parallelize/dtrain.ini b/dtrain/test/parallelize/dtrain.ini new file mode 100644 index 00000000..03f9d240 --- /dev/null +++ b/dtrain/test/parallelize/dtrain.ini @@ -0,0 +1,15 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +decoder_config=cdec.ini diff --git a/dtrain/test/parallelize/in b/dtrain/test/parallelize/in new file mode 100644 index 00000000..a312809f --- /dev/null +++ b/dtrain/test/parallelize/in @@ -0,0 +1,10 @@ +barack obama erhält als vierter us @-@ präsident den frieden nobelpreis +der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen . +darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern . +der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein . +zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben . +das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant . +nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt . +diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt . +das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird . 
+der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt . diff --git a/dtrain/test/parallelize/refs b/dtrain/test/parallelize/refs new file mode 100644 index 00000000..4d3128cb --- /dev/null +++ b/dtrain/test/parallelize/refs @@ -0,0 +1,10 @@ +barack obama becomes the fourth american president to receive the nobel peace prize +the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . +he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . +the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . +first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . +the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . +then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . +he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . +the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . +the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . -- cgit v1.2.3 From bb9309432fd35e95cf88b630853a928a3e3228c3 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 1 Aug 2012 18:27:51 +0200 Subject: Makefile.am, dtrain output, python build --- .gitignore | 1 + Makefile.am | 2 +- dtrain/test/example/cdec.ini | 3 +- dtrain/test/example/dtrain.ini | 2 +- dtrain/test/example/expected-output | 148 +++++++++++++++++------------------- python/setup.py | 2 +- 6 files changed, 75 insertions(+), 83 deletions(-) diff --git a/.gitignore b/.gitignore index 571360ed..4c40fe81 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,4 @@ phrasinator/gibbs_train_plm phrasinator/gibbs_train_plm_notables previous.sh dist +python/build/ diff --git a/Makefile.am b/Makefile.am index 4df72cff..332542c4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -25,4 +25,4 @@ SUBDIRS = \ AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 -AM_CPPFLAGS = -D_GLIBCXX_PARALLEL +AM_CPPFLAGS = -D_GLIBCXX_PARALLEL -march=native -mtune=native -O2 -pipe -fomit-frame-pointer -Wall diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini index 6642107f..d5955f0e 100644 --- a/dtrain/test/example/cdec.ini +++ b/dtrain/test/example/cdec.ini @@ -17,7 +17,8 @@ feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz #feature_function=NonLatinCount #feature_function=OutputIndicator feature_function=RuleIdentityFeatures -feature_function=RuleNgramFeatures +feature_function=RuleSourceBigramFeatures +feature_function=RuleTargetBigramFeatures feature_function=RuleShape #feature_function=SourceSpanSizeFeatures #feature_function=SourceWordPenalty diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index c8ac7c3f..8338b2d3 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -5,7 +5,7 @@ 
decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=10 # stop epoch after 10 inputs +stop_after=100 # stop epoch after 10 inputs # interesting stuff epochs=3 # run over input 3 times diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output index 25d2c069..43798484 100644 --- a/dtrain/test/example/expected-output +++ b/dtrain/test/example/expected-output @@ -1,21 +1,10 @@ cdec cfg 'test/example/cdec.ini' -feature: WordPenalty (no config parameters) -State is 0 bytes for feature WordPenalty -feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz') Loading the LM will be faster if you build a binary file. Reading test/example/nc-wmt11.en.srilm.gz ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** -Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581) -State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz -feature: RuleIdentityFeatures (no config parameters) -State is 0 bytes for feature RuleIdentityFeatures -feature: RuleNgramFeatures (no config parameters) -State is 0 bytes for feature RuleNgramFeatures -feature: RuleShape (no config parameters) Example feature: Shape_S00000_T00000 -State is 0 bytes for feature RuleShape -Seeding random number sequence to 1072059181 +Seeding random number sequence to 2108658507 dtrain Parameters: @@ -33,93 +22,94 @@ Parameters: pair threshold 0 select weights 'VOID' l1 reg 0 'none' + max pairs 4294967295 cdec cfg 'test/example/cdec.ini' input 'test/example/nc-wmt11.1k.gz' output '-' - stop_after 10 + stop_after 100 (a dot represents 10 inputs) Iteration #1 of 3. - . 10 -Stopping after 10 input sentences. + .......... 100 +Stopping after 100 input sentences. WEIGHTS - Glue = -0.0293 - WordPenalty = +0.049075 - LanguageModel = +0.24345 - LanguageModel_OOV = -0.2029 - PhraseModel_0 = +0.0084102 - PhraseModel_1 = +0.021729 - PhraseModel_2 = +0.014922 - PhraseModel_3 = +0.104 - PhraseModel_4 = -0.14308 - PhraseModel_5 = +0.0247 - PhraseModel_6 = -0.012 - PassThrough = -0.2161 + Glue = -0.236 + WordPenalty = +0.056111 + LanguageModel = +0.71011 + LanguageModel_OOV = -0.489 + PhraseModel_0 = -0.21332 + PhraseModel_1 = -0.13038 + PhraseModel_2 = +0.085148 + PhraseModel_3 = -0.16982 + PhraseModel_4 = -0.026332 + PhraseModel_5 = +0.2133 + PhraseModel_6 = +0.1002 + PassThrough = -0.5541 --- - 1best avg score: 0.16872 (+0.16872) - 1best avg model score: -1.8276 (-1.8276) - avg # pairs: 1121.1 - avg # rank err: 555.6 + 1best avg score: 0.16928 (+0.16928) + 1best avg model score: 2.4454 (+2.4454) + avg # pairs: 1616.2 + avg # rank err: 769.6 avg # margin viol: 0 - non0 feature count: 277 - avg list sz: 77.2 - avg f count: 90.96 -(time 0.1 min, 0.6 s/S) + non0 feature count: 4068 + avg list sz: 96.65 + avg f count: 118.01 +(time 1.3 min, 0.79 s/S) Iteration #2 of 3. - . 10 + .......... 
100 WEIGHTS - Glue = -0.3526 - WordPenalty = +0.067576 - LanguageModel = +1.155 - LanguageModel_OOV = -0.2728 - PhraseModel_0 = -0.025529 - PhraseModel_1 = +0.095869 - PhraseModel_2 = +0.094567 - PhraseModel_3 = +0.12482 - PhraseModel_4 = -0.36533 - PhraseModel_5 = +0.1068 - PhraseModel_6 = -0.1517 - PassThrough = -0.286 + Glue = -0.1721 + WordPenalty = -0.14132 + LanguageModel = +0.56023 + LanguageModel_OOV = -0.6786 + PhraseModel_0 = +0.14155 + PhraseModel_1 = +0.34218 + PhraseModel_2 = +0.22954 + PhraseModel_3 = -0.24762 + PhraseModel_4 = -0.25848 + PhraseModel_5 = -0.0453 + PhraseModel_6 = -0.0264 + PassThrough = -0.7436 --- - 1best avg score: 0.18394 (+0.015221) - 1best avg model score: 3.205 (+5.0326) - avg # pairs: 1168.3 - avg # rank err: 594.8 + 1best avg score: 0.19585 (+0.02657) + 1best avg model score: -16.311 (-18.757) + avg # pairs: 1475.8 + avg # rank err: 668.48 avg # margin viol: 0 - non0 feature count: 543 - avg list sz: 77.5 - avg f count: 85.916 -(time 0.083 min, 0.5 s/S) + non0 feature count: 6300 + avg list sz: 96.08 + avg f count: 114.92 +(time 1.3 min, 0.76 s/S) Iteration #3 of 3. - . 10 + .......... 100 WEIGHTS - Glue = -0.392 - WordPenalty = +0.071963 - LanguageModel = +0.81266 - LanguageModel_OOV = -0.4177 - PhraseModel_0 = -0.2649 - PhraseModel_1 = -0.17931 - PhraseModel_2 = +0.038261 - PhraseModel_3 = +0.20261 - PhraseModel_4 = -0.42621 - PhraseModel_5 = +0.3198 - PhraseModel_6 = -0.1437 - PassThrough = -0.4309 + Glue = -0.1577 + WordPenalty = -0.086902 + LanguageModel = +0.30136 + LanguageModel_OOV = -0.7848 + PhraseModel_0 = +0.11743 + PhraseModel_1 = +0.11142 + PhraseModel_2 = -0.0053865 + PhraseModel_3 = -0.18731 + PhraseModel_4 = -0.67144 + PhraseModel_5 = +0.1236 + PhraseModel_6 = -0.2665 + PassThrough = -0.8498 --- - 1best avg score: 0.2962 (+0.11225) - 1best avg model score: -36.274 (-39.479) - avg # pairs: 1109.6 - avg # rank err: 515.9 + 1best avg score: 0.20034 (+0.0044978) + 1best avg model score: -7.2775 (+9.0336) + avg # pairs: 1578.6 + avg # rank err: 705.77 avg # margin viol: 0 - non0 feature count: 741 - avg list sz: 77 - avg f count: 88.982 -(time 0.083 min, 0.5 s/S) + non0 feature count: 7313 + avg list sz: 96.84 + avg f count: 124.48 +(time 1.5 min, 0.9 s/S) Writing weights file to '-' ... done --- -Best iteration: 3 [SCORE 'stupid_bleu'=0.2962]. -This took 0.26667 min. +Best iteration: 3 [SCORE 'stupid_bleu'=0.20034]. +This took 4.0833 min. 
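Note on the updated expected output above: the non-zero feature count climbs into the thousands, reflecting both the larger stop_after and the RuleSourceBigramFeatures / RuleTargetBigramFeatures now enabled in test/example/cdec.ini, which fire one indicator per adjacent token pair on the source and target side of every rule used in a derivation. A rough Ruby sketch of what gets counted — illustrative only, not the C++ above; the <r>/</r> boundary markers are an assumption:

# One indicator feature per token bigram inside a rule, padded with
# begin/end markers; RBS = source side, RBT = target side.
def rule_bigram_feats(tokens, prefix)
  feats = Hash.new(0)
  prev = '<r>'                          # assumed rule-begin marker
  tokens.each do |cur|
    feats["#{prefix}:#{prev}_#{cur}"] += 1
    prev = cur
  end
  feats["#{prefix}:#{prev}_</r>"] += 1  # assumed rule-end marker
  feats
end

# e.g. for a rule  [X] ||| der [X,1] mann ||| the [X,1] man
p rule_bigram_feats(%w(der [X,1] mann), 'RBS').keys
p rule_bigram_feats(%w(the [X,1] man),  'RBT').keys
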
diff --git a/python/setup.py b/python/setup.py index 7be976e8..cced4c1a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -6,7 +6,7 @@ import glob INC = ['..', 'src/', '../decoder', '../utils', '../mteval'] LIB = ['../decoder', '../utils', '../mteval', '../training', '../klm/lm', '../klm/util'] -LINK_ARGS = [] +LINK_ARGS = ['-lz'] # Detect Boost BOOST_ROOT = os.getenv('BOOST_ROOT') -- cgit v1.2.3 From 0c54220adfaada6ad1e2d54f31a9895da35127fd Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 5 Nov 2012 18:57:39 +0100 Subject: build fix, default learning rate --- decoder/ff_rules.h | 1 + dtrain/dtrain.cc | 4 +- dtrain/dtrain.h | 3 +- dtrain/test/example/dtrain.ini | 8 +-- dtrain/test/example/expected-output | 128 ++++++++++++++---------------------- 5 files changed, 59 insertions(+), 85 deletions(-) diff --git a/decoder/ff_rules.h b/decoder/ff_rules.h index dc9a15d5..b100ec34 100644 --- a/decoder/ff_rules.h +++ b/decoder/ff_rules.h @@ -5,6 +5,7 @@ #include #include "trule.h" #include "ff.h" +#include "hg.h" #include "array2d.h" #include "wordid.h" diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index b7a4bb6f..18286668 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -24,13 +24,13 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("pair_threshold", po::value()->default_value(0.), "bleu [0,1] threshold to filter pairs") ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") - ("learning_rate", po::value()->default_value(0.0001), "learning rate") + ("learning_rate", po::value()->default_value(1.0), "learning rate") ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") ("l1_reg_strength", po::value(), "l1 regularization strength") - ("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO + ("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 7e084a79..4b6f415c 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -3,7 +3,7 @@ #undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs // DO NOT USE WITH SVM! -#define DTRAIN_LOCAL +//#define DTRAIN_LOCAL #define DTRAIN_DOTS 10 // after how many inputs to display a '.' 
#define DTRAIN_GRAMMAR_DELIM "########EOS########" #define DTRAIN_SCALE 100000 @@ -22,7 +22,6 @@ #include "filelib.h" - using namespace std; using namespace dtrain; namespace po = boost::program_options; diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 8338b2d3..72d50ca1 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,18 +1,18 @@ input=test/example/nc-wmt11.1k.gz # use '-' for STDIN output=- # a weights file (add .gz for gzip compression) or STDOUT '-' -select_weights=VOID # don't output weights +select_weights=VOID # don't output weights decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=100 # stop epoch after 10 inputs +stop_after=10 # stop epoch after 10 inputs # interesting stuff -epochs=3 # run over input 3 times +epochs=2 # run over input 2 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 scorer=stupid_bleu # use 'stupid' BLEU+1 -learning_rate=0.0001 # learning rate +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique entries in kbest (surface form) diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output index 43798484..05326763 100644 --- a/dtrain/test/example/expected-output +++ b/dtrain/test/example/expected-output @@ -4,17 +4,17 @@ Reading test/example/nc-wmt11.en.srilm.gz ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** Example feature: Shape_S00000_T00000 -Seeding random number sequence to 2108658507 +Seeding random number sequence to 2912000813 dtrain Parameters: k 100 N 4 - T 3 + T 2 scorer 'stupid_bleu' sample from 'kbest' filter 'uniq' - learning rate 0.0001 + learning rate 1 gamma 0 loss margin 0 pairs 'XYX' @@ -26,90 +26,64 @@ Parameters: cdec cfg 'test/example/cdec.ini' input 'test/example/nc-wmt11.1k.gz' output '-' - stop_after 100 + stop_after 10 (a dot represents 10 inputs) -Iteration #1 of 3. - .......... 100 -Stopping after 100 input sentences. +Iteration #1 of 2. + . 10 +Stopping after 10 input sentences. 
WEIGHTS - Glue = -0.236 - WordPenalty = +0.056111 - LanguageModel = +0.71011 - LanguageModel_OOV = -0.489 - PhraseModel_0 = -0.21332 - PhraseModel_1 = -0.13038 - PhraseModel_2 = +0.085148 - PhraseModel_3 = -0.16982 - PhraseModel_4 = -0.026332 - PhraseModel_5 = +0.2133 - PhraseModel_6 = +0.1002 - PassThrough = -0.5541 + Glue = -637 + WordPenalty = +1064 + LanguageModel = +1175.3 + LanguageModel_OOV = -1437 + PhraseModel_0 = +1935.6 + PhraseModel_1 = +2499.3 + PhraseModel_2 = +964.96 + PhraseModel_3 = +1410.8 + PhraseModel_4 = -5977.9 + PhraseModel_5 = +522 + PhraseModel_6 = +1089 + PassThrough = -1308 --- - 1best avg score: 0.16928 (+0.16928) - 1best avg model score: 2.4454 (+2.4454) - avg # pairs: 1616.2 - avg # rank err: 769.6 + 1best avg score: 0.16963 (+0.16963) + 1best avg model score: 64485 (+64485) + avg # pairs: 1494.4 + avg # rank err: 702.6 avg # margin viol: 0 - non0 feature count: 4068 - avg list sz: 96.65 - avg f count: 118.01 -(time 1.3 min, 0.79 s/S) + non0 feature count: 528 + avg list sz: 85.7 + avg f count: 102.75 +(time 0.083 min, 0.5 s/S) -Iteration #2 of 3. - .......... 100 +Iteration #2 of 2. + . 10 WEIGHTS - Glue = -0.1721 - WordPenalty = -0.14132 - LanguageModel = +0.56023 - LanguageModel_OOV = -0.6786 - PhraseModel_0 = +0.14155 - PhraseModel_1 = +0.34218 - PhraseModel_2 = +0.22954 - PhraseModel_3 = -0.24762 - PhraseModel_4 = -0.25848 - PhraseModel_5 = -0.0453 - PhraseModel_6 = -0.0264 - PassThrough = -0.7436 + Glue = -1196 + WordPenalty = +809.52 + LanguageModel = +3112.1 + LanguageModel_OOV = -1464 + PhraseModel_0 = +3895.5 + PhraseModel_1 = +4683.4 + PhraseModel_2 = +1092.8 + PhraseModel_3 = +1079.6 + PhraseModel_4 = -6827.7 + PhraseModel_5 = -888 + PhraseModel_6 = +142 + PassThrough = -1335 --- - 1best avg score: 0.19585 (+0.02657) - 1best avg model score: -16.311 (-18.757) - avg # pairs: 1475.8 - avg # rank err: 668.48 + 1best avg score: 0.277 (+0.10736) + 1best avg model score: -3110.5 (-67595) + avg # pairs: 1144.2 + avg # rank err: 529.1 avg # margin viol: 0 - non0 feature count: 6300 - avg list sz: 96.08 - avg f count: 114.92 -(time 1.3 min, 0.76 s/S) - -Iteration #3 of 3. - .......... 100 -WEIGHTS - Glue = -0.1577 - WordPenalty = -0.086902 - LanguageModel = +0.30136 - LanguageModel_OOV = -0.7848 - PhraseModel_0 = +0.11743 - PhraseModel_1 = +0.11142 - PhraseModel_2 = -0.0053865 - PhraseModel_3 = -0.18731 - PhraseModel_4 = -0.67144 - PhraseModel_5 = +0.1236 - PhraseModel_6 = -0.2665 - PassThrough = -0.8498 - --- - 1best avg score: 0.20034 (+0.0044978) - 1best avg model score: -7.2775 (+9.0336) - avg # pairs: 1578.6 - avg # rank err: 705.77 - avg # margin viol: 0 - non0 feature count: 7313 - avg list sz: 96.84 - avg f count: 124.48 -(time 1.5 min, 0.9 s/S) + non0 feature count: 859 + avg list sz: 74.9 + avg f count: 112.84 +(time 0.067 min, 0.4 s/S) Writing weights file to '-' ... done --- -Best iteration: 3 [SCORE 'stupid_bleu'=0.20034]. -This took 4.0833 min. +Best iteration: 2 [SCORE 'stupid_bleu'=0.277]. +This took 0.15 min. 
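Note on this commit: with gamma=0 the dtrain.ini comment calls the optimizer a perceptron, and the default learning_rate is now 1.0, which is why the weights in the updated expected output sit in the hundreds instead of fractions. A minimal Ruby sketch of a pairwise-ranking perceptron step of this kind — an assumed illustration of the update rule, not the dtrain.cc code:

# Update weights on a k-best pair (hi, lo) where hi has the better BLEU:
# if the model does not already prefer hi, move the weights toward
# hi's features and away from lo's. eta plays the role of learning_rate.
def perceptron_update!(weights, hi_feats, lo_feats, eta = 1.0)
  score = ->(f) { f.sum { |k, v| weights.fetch(k, 0.0) * v } }
  return weights if score.(hi_feats) > score.(lo_feats)   # already ranked correctly
  (hi_feats.keys | lo_feats.keys).each do |k|
    weights[k] = weights.fetch(k, 0.0) +
                 eta * (hi_feats.fetch(k, 0.0) - lo_feats.fetch(k, 0.0))
  end
  weights
end

w = Hash.new(0.0)
perceptron_update!(w, { 'LanguageModel' => 1.2, 'Glue' => 2.0 },
                      { 'LanguageModel' => 0.3, 'Glue' => 3.0 })
# => w moves toward the higher-BLEU hypothesis

With a learning rate of 1.0 and raw feature values, repeated updates of this form over hundreds of pairs per sentence naturally produce weight magnitudes like the ones printed above.
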
-- cgit v1.2.3 From 782fb27af98ed98256cc25c832131c59c8e9ce9c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Nov 2012 21:34:14 -0500 Subject: script to add sos/eos --- corpus/add-sos-eos.pl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 corpus/add-sos-eos.pl diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl new file mode 100755 index 00000000..5e2d44cb --- /dev/null +++ b/corpus/add-sos-eos.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + chomp; + my @fields = split / \|\|\| /; + my ($ff, $ee, $aa) = @fields; + die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3; + my @fs = split /\s+/, $ff; + my @es = split /\s+/, $ee; + my @as = split /\s+/, $aa; + my @oas = (); + push @oas, '0-0'; + my $flen = scalar @fs; + my $elen = scalar @es; + for my $ap (@as) { + my ($a, $b) = split /-/, $ap; + die "Bad format in: @as" unless defined $a && defined $b; + push @oas, ($a + 1) . '-' . ($b + 1); + } + push @oas, ($flen + 1) . '-' . ($elen + 1); + print " $ff ||| $ee ||| @oas\n"; +} + -- cgit v1.2.3 From 9e5107a05bfabb76ce547d2849173c5a11aeba60 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Nov 2012 23:41:55 -0500 Subject: larger training data for semi-crf word segmenter --- compound-split/de/TRAIN | 3 +- compound-split/de/cdec-train.ini | 2 +- compound-split/de/dev.in-ref | 103 +++++++++++++++++++++++++++++++++++++ compound-split/de/large_dict.de.gz | Bin 4188658 -> 11796619 bytes compound-split/de/weights.trained | 40 +++++++------- 5 files changed, 126 insertions(+), 22 deletions(-) diff --git a/compound-split/de/TRAIN b/compound-split/de/TRAIN index 6f7184ea..2b48a8d2 100755 --- a/compound-split/de/TRAIN +++ b/compound-split/de/TRAIN @@ -1 +1,2 @@ -~/cdec/training/mpi_batch_optimize -w weights.cur.gz -t dev.in-ref -d cdec-train.ini -M 200 +~/cdec/training/mpi_batch_optimize -w weights.trained -t dev.in-ref -p -c cdec-train.ini -M 200 + diff --git a/compound-split/de/cdec-train.ini b/compound-split/de/cdec-train.ini index 383fa040..44f5934d 100644 --- a/compound-split/de/cdec-train.ini +++ b/compound-split/de/cdec-train.ini @@ -2,4 +2,4 @@ formalism=csplit # crf_uniform_empirical=true intersection_strategy=full feature_function=CSplit_BasicFeatures large_dict.de.gz badlist.de.gz -feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.klm +feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.lm.gz diff --git a/compound-split/de/dev.in-ref b/compound-split/de/dev.in-ref index ab6af9dd..b91daced 100644 --- a/compound-split/de/dev.in-ref +++ b/compound-split/de/dev.in-ref @@ -798,3 +798,106 @@ familie ||| # familie vielen ||| # vielen jahren ||| # jahren tageszeitung ||| ((('#',0,1),),(('tag',0,1),('tages',0,1),),(('zeitung',0,1),),) +washington ||| # washington +wahlschlacht ||| # wahl schlacht +letzte ||| # letzte +milliarden ||| # milliarden +dollar ||| # dollar +sollen ||| # sollen +wahlkämpfer ||| # wahlkämpfer +bislang ||| # bislang +kampagnen ||| # kampagnen +ausgegeben ||| # ausgegeben +abstimmung ||| # abstimmung +mobilisieren ||| # mobilisieren +letzten ||| # letzten +reserven ||| # reserven +wähler ||| # wähler +bekommen ||| # bekommen +herausforderer ||| # herausforderer +romney ||| # romney +kündigte ||| # kündigte +wahltag ||| # wahltag +selbst ||| # selbst +mehreren ||| # mehreren +bundesstaaten ||| # bundesstaaten +aufzutreten ||| # aufzutreten +ursprünglich ||| # ursprünglich +abschlussveranstaltung ||| # abschluss veranstaltung +montagabend ||| # montag abend +vorgesehen ||| # 
vorgesehen +schließung ||| # schließung +wahllokale ||| # wahl lokale +stimmen ||| # stimmen +werben ||| # werben +sprecher ||| # sprecher +wahlkampfteams ||| # wahlkampf teams +pennsylvania ||| # pennsylvania +natürlich ||| # natürlich +schicksalstaat ||| # schicksal staat +republikaner ||| # republikaner +präsident ||| # präsident +geworden ||| # geworden +gewonnen ||| # gewonnen +auswertung ||| # auswertung +portals ||| # portals +national ||| # national +sieben ||| # sieben +november ||| # november +umfragen ||| # umfragen +meistumkämpfte ||| # meist umkämpfte +mehrheit ||| # mehrheit +feststeht ||| # feststeht +wahlkämpfer ||| # wahlkämpfer +besonders ||| # besonders +relevant ||| # relevant +direkt ||| # direkt +gewählt ||| # gewählt +präsident ||| # präsident +wahlmännergremium ||| # wahlmänner gremium +spiegeln ||| # spiegeln +ergebnisse ||| # ergebnisse +einzelnen ||| # einzelnen +bundesstaaten ||| # bundesstaaten +präsident ||| # präsident +letzten ||| # letzten +seiner ||| # seiner +kampagne ||| # kampagne +vorgelegt ||| # vorgelegt +rocklegende ||| # rock legende +springsteen ||| # springsteen +botschafter ||| # botschafter +seiner ||| # seiner +kampagne ||| # kampagne +wisconsin ||| # wisconsin +dankte ||| # dankte +präsidenten ||| # präsidenten +während ||| # während +konzerts ||| # konzerts +gesundheitsreform ||| ((('#',0.0,1),),(('gesundheits',0.0,1),('gesundheit',0.0,1),),(('reform',0.0,1),),) +regulierung ||| # regulierung +street ||| # street +später ||| # später +auftritte ||| # auftritte +natürlich ||| # natürlich +summen ||| # summen +vibrieren ||| # vibrieren +duftstoffe ||| ((('#',0.0,1),),(('duftstoffe',0.0,2),('duft',0.0,1),),(('stoffe',0.0,1),),) +echten ||| # echten +verströmen ||| # verströmen +roboterbiene ||| # roboter biene +entwickelt ||| # entwickelt +wissenschaftlern ||| # wissenschaftlern +freien ||| # freien +universität ||| # universität +berlin ||| # berlin +künstlichen ||| # künstlichen +insekt ||| # insekt +wollen ||| # wollen +forscher ||| # forscher +futterquellen ||| # futter quellen +lotsen ||| # lotsen +geheimnis ||| # geheimnis +bienentanzes ||| # biene tanzes +entschlüsseln ||| # entschlüsseln +klimawandel ||| # klima wandel diff --git a/compound-split/de/large_dict.de.gz b/compound-split/de/large_dict.de.gz index 9af2c447..22b55908 100644 Binary files a/compound-split/de/large_dict.de.gz and b/compound-split/de/large_dict.de.gz differ diff --git a/compound-split/de/weights.trained b/compound-split/de/weights.trained index 4ae8a8ce..313d6ba9 100644 --- a/compound-split/de/weights.trained +++ b/compound-split/de/weights.trained @@ -1,20 +1,20 @@ -# Objective = 141.249 (eval count=281) -LettersSq -0.04232699523807458 -LettersSqrt 0.4355587430228624 -RevCharLM 0.41198831478844122 -FugS 0.075512682701211239 -FugN -0.61902217202456356 -WordCount -0.0082286209848003913 -InDict -0.98529136326577915 -InDictSubWord 1.0386001157542868 -Short 0.70242841302446457 -Long -0.69651861257390713 -OOV 0.97706274228074586 -OOVSubWord -0.76138571782502074 -ShortRange -1.1864424374105051 -HighFreq -4.1150415279961052 -MedFreq 0.014790338975451987 -Freq -0.28901069668114737 -Bad -3.8059407890457644 -FreqLen1 -0.3827361966178347 -FreqLen2 -0.17308899259418953 +# Objective = 164.304 (eval count=118) +WordCount 0.10973670965860199 +LettersSq -0.040894083779153829 +LettersSqrt 0.33757888570124861 +InDict 0.049573722026753933 +InDictSubWord 0.69975734577036308 +Short 0.33555471096277373 +Long -0.41832772458034762 +OOV 0.060162987632197122 +OOVSubWord 
0.068471660184093452 +ShortRange -0.95127190946200446 +HighFreq -2.4179829648207223 +MedFreq -0.21881857269052826 +Freq -0.32292456278379877 +FreqLen1 -0.34734160349477439 +FreqLen2 -0.1574854779610021 +Bad -1.7868318814690027 +RevCharLM 0.52613329631178929 +FugS 0.028876419364008043 +FugN -0.086060954349416352 -- cgit v1.2.3
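For reference, the 10-sentence test set added under dtrain/test/parallelize/ is enough to exercise parallelize.rb end to end. A sketch of the invocation, run from the dtrain/ directory — the argument order follows how the script reads ARGV; the ruby, dtrain and lplp.rb paths hardcoded at the top of parallelize.rb (and the cdec.ini / language-model paths referenced by the test configs) must exist or be adapted:

#!/usr/bin/env ruby
# Drive parallelize.rb on the bundled test data.
# ARGV order in the script: #shards, input, refs, epochs, dtrain.ini
args = %w(
  2
  test/parallelize/in
  test/parallelize/refs
  2
  test/parallelize/dtrain.ini
)
ok = system('ruby', 'parallelize.rb', *args)  # shards into ./work, forks one dtrain per shard
abort 'parallelize.rb failed' unless ok
puts Dir.glob('work/*')                       # per-shard outputs and merged weights.<epoch>, gzipped at the end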