From 7c4a9e0825b15ce6c08c45c7654c614d542cf93a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 24 Jan 2013 16:28:23 +0100 Subject: made examples work again --- training/dtrain/test/example/README | 4 ++-- training/dtrain/test/example/cdec.ini | 2 +- training/dtrain/test/example/dtrain.ini | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'training/dtrain/test/example') diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README index 6937b11b..2df77086 100644 --- a/training/dtrain/test/example/README +++ b/training/dtrain/test/example/README @@ -1,8 +1,8 @@ Small example of input format for distributed training. -Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . +Call dtrain from this folder with ../../dtrain -c test/example/dtrain.ini . For this to work, undef 'DTRAIN_LOCAL' in dtrain.h and recompile. -Data is here: http://simianer.de/#dtrain +data can be found here: http://simianer.de/#dtrain diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini index d5955f0e..068ebd4d 100644 --- a/training/dtrain/test/example/cdec.ini +++ b/training/dtrain/test/example/cdec.ini @@ -4,7 +4,7 @@ scfg_max_span_limit=15 intersection_strategy=cube_pruning cubepruning_pop_limit=30 feature_function=WordPenalty -feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz # all currently working feature functions for translation: # (with those features active that were used in the ACL paper) #feature_function=ArityPenalty diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini index 72d50ca1..97fce7f0 100644 --- a/training/dtrain/test/example/dtrain.ini +++ b/training/dtrain/test/example/dtrain.ini @@ -1,7 +1,7 @@ -input=test/example/nc-wmt11.1k.gz # use '-' for STDIN +input=./nc-wmt11.1k.gz # use '-' for STDIN output=- # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # don't output weights -decoder_config=test/example/cdec.ini # config for cdec +decoder_config=./cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -- cgit v1.2.3 From 529c8f0671ce0b09c2a797278a8f84242c86465d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 15 Mar 2013 10:29:13 +0100 Subject: removed hadoop/hstreaming mode --- training/dtrain/README.md | 28 +---- training/dtrain/dtrain.cc | 121 +------------------ training/dtrain/dtrain.h | 8 +- training/dtrain/hstreaming/avg.rb | 32 ----- training/dtrain/hstreaming/cdec.ini | 22 ---- training/dtrain/hstreaming/dtrain.ini | 15 --- training/dtrain/hstreaming/dtrain.sh | 9 -- training/dtrain/hstreaming/hadoop-streaming-job.sh | 30 ----- training/dtrain/hstreaming/lplp.rb | 131 --------------------- training/dtrain/hstreaming/red-test | 9 -- training/dtrain/lplp.rb | 131 +++++++++++++++++++++ training/dtrain/parallelize.rb | 4 +- training/dtrain/test/example/cdec.ini | 2 +- 13 files changed, 144 insertions(+), 398 deletions(-) delete mode 100755 training/dtrain/hstreaming/avg.rb delete mode 100644 training/dtrain/hstreaming/cdec.ini delete mode 100644 training/dtrain/hstreaming/dtrain.ini delete mode 100755 training/dtrain/hstreaming/dtrain.sh delete mode 100755 training/dtrain/hstreaming/hadoop-streaming-job.sh delete mode 100755 training/dtrain/hstreaming/lplp.rb delete mode 100644 training/dtrain/hstreaming/red-test create mode 100755 training/dtrain/lplp.rb (limited to 'training/dtrain/test/example') diff --git a/training/dtrain/README.md b/training/dtrain/README.md index 7edabbf1..2ab2f232 100644 --- a/training/dtrain/README.md +++ b/training/dtrain/README.md @@ -13,36 +13,18 @@ Builds when building cdec, see ../BUILDING . To build only parts needed for dtrain do ``` autoreconf -ifv - ./configure [--disable-gtest] - cd dtrain/; make + ./configure + cd training/dtrain/; make ``` Running ------- -To run this on a dev set locally: -``` - #define DTRAIN_LOCAL -``` -otherwise remove that line or undef, then recompile. You need a single -grammar file or input annotated with per-sentence grammars (psg) as you -would use with cdec. Additionally you need to give dtrain a file with -references (--refs) when running locally. - -The input for use with hadoop streaming looks like this: -``` - \t\t\t -``` -To convert a psg to this format you need to replace all "\n" -by "\t". Make sure there are no tabs in your data. - -For an example of local usage (with the 'distributed' format) -the see test/example/ . This expects dtrain to be built without -DTRAIN_LOCAL. +See directories under test/ . Legal ----- -Copyright (c) 2012 by Patrick Simianer +Copyright (c) 2012-2013 by Patrick Simianer -See the file ../LICENSE.txt for the licensing terms that this software is +See the file LICENSE.txt in the root folder for the licensing terms that this software is released under. diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 53487d34..dfb5b351 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("decoder_config", po::value(), "configuration file for cdec") ("print_weights", po::value(), "weights to print on each iteration") ("stop_after", po::value()->default_value(0), "stop after X input sentences") - ("tmp", po::value()->default_value("/tmp"), "temp dir to use") ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") - ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") ("k", po::value()->default_value(100), "how many translations to sample") ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") @@ -28,16 +26,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") - ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED") ("l1_reg_strength", po::value(), "l1 regularization strength") ("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.") -#ifdef DTRAIN_LOCAL ("refs,r", po::value(), "references in local mode") -#endif ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() @@ -55,16 +51,6 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) cerr << cl << endl; return false; } - if (cfg->count("hstreaming") && (*cfg)["output"].as() != "-") { - cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl; - return false; - } -#ifdef DTRAIN_LOCAL - if ((*cfg)["input"].as() == "-") { - cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl; - return false; - } -#endif if ((*cfg)["sample_from"].as() != "kbest" && (*cfg)["sample_from"].as() != "forest") { cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as() << "', use 'kbest' or 'forest'." << endl; @@ -111,17 +97,8 @@ main(int argc, char** argv) if (cfg.count("verbose")) verbose = true; bool noup = false; if (cfg.count("noup")) noup = true; - bool hstreaming = false; - string task_id; - if (cfg.count("hstreaming")) { - hstreaming = true; - quiet = true; - task_id = cfg["hstreaming"].as(); - cerr.precision(17); - } bool rescale = false; if (cfg.count("rescale")) rescale = true; - HSReporter rep(task_id); bool keep = false; if (cfg.count("keep")) keep = true; @@ -224,16 +201,8 @@ main(int argc, char** argv) // buffer input for t > 0 vector src_str_buf; // source strings (decoder takes only strings) vector > ref_ids_buf; // references as WordID vecs - // where temp files go - string tmp_path = cfg["tmp"].as(); -#ifdef DTRAIN_LOCAL string refs_fn = cfg["refs"].as(); ReadFile refs(refs_fn); -#else - string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars"); - ogzstream grammar_buf_out; - grammar_buf_out.open(grammar_buf_fn.c_str()); -#endif unsigned in_sz = std::numeric_limits::max(); // input index, input size vector > all_scores; @@ -270,9 +239,7 @@ main(int argc, char** argv) cerr << setw(25) << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; -#ifdef DTRAIN_LOCAL cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; -#endif cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as() << "'" << endl; @@ -285,14 +252,10 @@ main(int argc, char** argv) for (unsigned t = 0; t < T; t++) // T epochs { - if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl; - time_t start, end; time(&start); -#ifndef DTRAIN_LOCAL igzstream grammar_buf_in; if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str()); -#endif score_t score_sum = 0.; score_t model_sum(0); unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; @@ -340,52 +303,6 @@ main(int argc, char** argv) // getting input vector ref_ids; // reference as vector -#ifndef DTRAIN_LOCAL - vector in_split; // input: sid\tsrc\tref\tpsg - if (t == 0) { - // handling input - split_in(in, in_split); - if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl; - // getting reference - vector ref_tok; - boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); - register_and_convert(ref_tok, ref_ids); - ref_ids_buf.push_back(ref_ids); - // process and set grammar - bool broken_grammar = true; // ignore broken grammars - for (string::iterator it = in.begin(); it != in.end(); it++) { - if (!isspace(*it)) { - broken_grammar = false; - break; - } - } - if (broken_grammar) { - cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl; - continue; - } - boost::replace_all(in, "\t", "\n"); - in += "\n"; - grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; - decoder.AddSupplementalGrammarFromString(in); - src_str_buf.push_back(in_split[1]); - // decode - observer->SetRef(ref_ids); - decoder.Decode(in_split[1], observer); - } else { - // get buffered grammar - string grammar_str; - while (true) { - string rule; - getline(grammar_buf_in, rule); - if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break; - grammar_str += rule + "\n"; - } - decoder.AddSupplementalGrammarFromString(grammar_str); - // decode - observer->SetRef(ref_ids_buf[ii]); - decoder.Decode(src_str_buf[ii], observer); - } -#else if (t == 0) { string r_; getline(*refs, r_); @@ -402,7 +319,6 @@ main(int argc, char** argv) decoder.Decode(in, observer); else decoder.Decode(src_str_buf[ii], observer); -#endif // get (scored) samples vector* samples = observer->GetSamples(); @@ -505,11 +421,6 @@ main(int argc, char** argv) ++ii; - if (hstreaming) { - rep.update_counter("Seen #"+boost::lexical_cast(t+1), 1u); - rep.update_counter("Seen", 1u); - } - } // input loop if (average) w_average += lambdas; @@ -518,21 +429,8 @@ main(int argc, char** argv) if (t == 0) { in_sz = ii; // remember size of input (# lines) - if (hstreaming) { - rep.update_counter("|Input|", ii); - rep.update_gcounter("|Input|", ii); - rep.update_gcounter("Shards", 1u); - } } -#ifndef DTRAIN_LOCAL - if (t == 0) { - grammar_buf_out.close(); - } else { - grammar_buf_in.close(); - } -#endif - // print some stats score_t score_avg = score_sum/(score_t)in_sz; score_t model_avg = model_sum/(score_t)in_sz; @@ -546,7 +444,7 @@ main(int argc, char** argv) } unsigned nonz = 0; - if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero(); + if (!quiet) nonz = (unsigned)lambdas.num_nonzero(); if (!quiet) { cerr << _p5 << _p << "WEIGHTS" << endl; @@ -571,16 +469,6 @@ main(int argc, char** argv) cerr << " avg f count: " << f_count/(float)list_sz << endl; } - if (hstreaming) { - rep.update_counter("Score 1best avg #"+boost::lexical_cast(t+1), (unsigned)(score_avg*DTRAIN_SCALE)); - rep.update_counter("Model 1best avg #"+boost::lexical_cast(t+1), (unsigned)(model_avg*DTRAIN_SCALE)); - rep.update_counter("Pairs avg #"+boost::lexical_cast(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE)); - rep.update_counter("Rank errors avg #"+boost::lexical_cast(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE)); - rep.update_counter("Margin violations avg #"+boost::lexical_cast(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE)); - rep.update_counter("Non zero feature count #"+boost::lexical_cast(t+1), nonz); - rep.update_gcounter("Non zero feature count #"+boost::lexical_cast(t+1), nonz); - } - pair remember; remember.first = score_avg; remember.second = model_avg; @@ -611,10 +499,6 @@ main(int argc, char** argv) if (average) w_average /= (weight_t)T; -#ifndef DTRAIN_LOCAL - unlink(grammar_buf_fn.c_str()); -#endif - if (!noup) { if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl; if (select_weights == "last" || average) { // last, average @@ -651,7 +535,6 @@ main(int argc, char** argv) } } } - if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl; if (!quiet) cerr << "done" << endl; } diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h index 572fd613..f368d810 100644 --- a/training/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h @@ -1,14 +1,12 @@ #ifndef _DTRAIN_H_ #define _DTRAIN_H_ -#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs - // DO NOT USE WITH SVM! -#define DTRAIN_LOCAL +#undef DTRAIN_FASTER_PERCEPTRON // only consider actually misranked pairs + // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin! + #define DTRAIN_DOTS 10 // after how many inputs to display a '.' -#define DTRAIN_GRAMMAR_DELIM "########EOS########" #define DTRAIN_SCALE 100000 - #include #include #include diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb deleted file mode 100755 index 2599c732..00000000 --- a/training/dtrain/hstreaming/avg.rb +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env ruby -# first arg may be an int of custom shard count - -shard_count_key = "__SHARD_COUNT__" - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -w = {} -c = {} -w.default = 0 -c.default = 0 -while line = STDIN.gets - key, val = line.split /\s/ - w[key] += val.to_f - c[key] += 1 -end - -if ARGV.size == 0 - shard_count = w["__SHARD_COUNT__"] -else - shard_count = ARGV[0].to_f -end -w.each_key { |k| - if k == shard_count_key - next - else - puts "#{k}\t#{w[k]/shard_count}" - #puts "# #{c[k]}" - end -} - diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini deleted file mode 100644 index d4f5cecd..00000000 --- a/training/dtrain/hstreaming/cdec.ini +++ /dev/null @@ -1,22 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -scfg_max_span_limit=15 -intersection_strategy=cube_pruning -cubepruning_pop_limit=30 -feature_function=WordPenalty -feature_function=KLanguageModel nc-wmt11.en.srilm.gz -#feature_function=ArityPenalty -#feature_function=CMR2008ReorderingFeatures -#feature_function=Dwarf -#feature_function=InputIndicator -#feature_function=LexNullJump -#feature_function=NewJump -#feature_function=NgramFeatures -#feature_function=NonLatinCount -#feature_function=OutputIndicator -#feature_function=RuleIdentityFeatures -#feature_function=RuleNgramFeatures -#feature_function=RuleShape -#feature_function=SourceSpanSizeFeatures -#feature_function=SourceWordPenalty -#feature_function=SpanFeatures diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini deleted file mode 100644 index a2c219a1..00000000 --- a/training/dtrain/hstreaming/dtrain.ini +++ /dev/null @@ -1,15 +0,0 @@ -input=- -output=- -decoder_config=cdec.ini -tmp=/var/hadoop/mapred/local/ -epochs=1 -k=100 -N=4 -learning_rate=0.0001 -gamma=0 -scorer=stupid_bleu -sample_from=kbest -filter=uniq -pair_sampling=XYX -pair_threshold=0 -select_weights=last diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh deleted file mode 100755 index 877ff94c..00000000 --- a/training/dtrain/hstreaming/dtrain.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# script to run dtrain with a task id - -pushd . &>/dev/null -cd .. -ID=$(basename $(pwd)) # attempt_... -popd &>/dev/null -./dtrain -c dtrain.ini --hstreaming $ID - diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh deleted file mode 100755 index 92419956..00000000 --- a/training/dtrain/hstreaming/hadoop-streaming-job.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/sh - -EXP=a_simple_test - -# change these vars to fit your hadoop installation -HADOOP_HOME=/usr/lib/hadoop-0.20 -JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar -HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" - - IN=input_on_hdfs -OUT=output_weights_on_hdfs - -# you can -reducer to NONE if you want to -# do feature selection/averaging locally (e.g. to -# keep weights of all epochs) -$HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "ruby lplp.rb l2 select_k 100000" \ - -input $IN \ - -output $OUT \ - -file dtrain.sh \ - -file lplp.rb \ - -file ../dtrain \ - -file dtrain.ini \ - -file cdec.ini \ - -file ../test/example/nc-wmt11.en.srilm.gz \ - -jobconf mapred.reduce.tasks=30 \ - -jobconf mapred.max.map.failures.percent=0 \ - -jobconf mapred.job.name="dtrain $EXP" - diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb deleted file mode 100755 index f0cd58c5..00000000 --- a/training/dtrain/hstreaming/lplp.rb +++ /dev/null @@ -1,131 +0,0 @@ -# lplp.rb - -# norms -def l0(feature_column, n) - if feature_column.size >= n then return 1 else return 0 end -end - -def l1(feature_column, n=-1) - return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i } -end - -def l2(feature_column, n=-1) - return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i } -end - -def linfty(feature_column, n=-1) - return feature_column.map { |i| i.abs }.max -end - -# stats -def median(feature_column, n) - return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] -end - -def mean(feature_column, n) - return feature_column.reduce { |sum, i| sum+i } / n -end - -# selection -def select_k(weights, norm_fun, n, k=10000) - weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| - puts "#{p[0]}\t#{mean(p[1], n)}" - k -= 1 - if k == 0 then break end - } -end - -def cut(weights, norm_fun, n, epsilon=0.0001) - weights.each { |k,v| - if norm_fun.call(v, n).abs >= epsilon - puts "#{k}\t#{mean(v, n)}" - end - } -end - -# test -def _test() - puts - w = {} - w["a"] = [1, 2, 3] - w["b"] = [1, 2] - w["c"] = [66] - w["d"] = [10, 20, 30] - n = 3 - puts w.to_s - puts - puts "select_k" - puts "l0 expect ad" - select_k(w, method(:l0), n, 2) - puts "l1 expect cd" - select_k(w, method(:l1), n, 2) - puts "l2 expect c" - select_k(w, method(:l2), n, 1) - puts - puts "cut" - puts "l1 expect cd" - cut(w, method(:l1), n, 7) - puts - puts "median" - a = [1,2,3,4,5] - puts a.to_s - puts median(a, 5) - puts - puts "#{median(a, 7)} <- that's because we add missing 0s:" - puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s - puts - puts "mean expect bc" - w.clear - w["a"] = [2] - w["b"] = [2.1] - w["c"] = [2.2] - cut(w, method(:mean), 1, 2.05) - exit -end -#_test() - -# actually do something -def usage() - puts "lplp.rb [n] < " - puts " l0...: norms for selection" - puts "select_k: only output top k (according to the norm of their column vector) features" - puts " cut: output features with weight >= threshold" - puts " n: if we do not have a shard count use this number for averaging" - exit -end - -if ARGV.size < 3 then usage end -norm_fun = method(ARGV[0].to_sym) -type = ARGV[1] -x = ARGV[2].to_f - -shard_count_key = "__SHARD_COUNT__" - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -w = {} -shard_count = 0 -while line = STDIN.gets - key, val = line.split /\s+/ - if key == shard_count_key - shard_count += 1 - next - end - if w.has_key? key - w[key].push val.to_f - else - w[key] = [val.to_f] - end -end - -if ARGV.size == 4 then shard_count = ARGV[3].to_f end - -if type == 'cut' - cut(w, norm_fun, shard_count, x) -elsif type == 'select_k' - select_k(w, norm_fun, shard_count, x) -else - puts "oh oh" -end - diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test deleted file mode 100644 index 2623d697..00000000 --- a/training/dtrain/hstreaming/red-test +++ /dev/null @@ -1,9 +0,0 @@ -a 1 -b 2 -c 3.5 -a 1 -b 2 -c 3.5 -d 1 -e 2 -__SHARD_COUNT__ 2 diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb new file mode 100755 index 00000000..f0cd58c5 --- /dev/null +++ b/training/dtrain/lplp.rb @@ -0,0 +1,131 @@ +# lplp.rb + +# norms +def l0(feature_column, n) + if feature_column.size >= n then return 1 else return 0 end +end + +def l1(feature_column, n=-1) + return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i } +end + +def l2(feature_column, n=-1) + return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i } +end + +def linfty(feature_column, n=-1) + return feature_column.map { |i| i.abs }.max +end + +# stats +def median(feature_column, n) + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] +end + +def mean(feature_column, n) + return feature_column.reduce { |sum, i| sum+i } / n +end + +# selection +def select_k(weights, norm_fun, n, k=10000) + weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + puts "#{p[0]}\t#{mean(p[1], n)}" + k -= 1 + if k == 0 then break end + } +end + +def cut(weights, norm_fun, n, epsilon=0.0001) + weights.each { |k,v| + if norm_fun.call(v, n).abs >= epsilon + puts "#{k}\t#{mean(v, n)}" + end + } +end + +# test +def _test() + puts + w = {} + w["a"] = [1, 2, 3] + w["b"] = [1, 2] + w["c"] = [66] + w["d"] = [10, 20, 30] + n = 3 + puts w.to_s + puts + puts "select_k" + puts "l0 expect ad" + select_k(w, method(:l0), n, 2) + puts "l1 expect cd" + select_k(w, method(:l1), n, 2) + puts "l2 expect c" + select_k(w, method(:l2), n, 1) + puts + puts "cut" + puts "l1 expect cd" + cut(w, method(:l1), n, 7) + puts + puts "median" + a = [1,2,3,4,5] + puts a.to_s + puts median(a, 5) + puts + puts "#{median(a, 7)} <- that's because we add missing 0s:" + puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s + puts + puts "mean expect bc" + w.clear + w["a"] = [2] + w["b"] = [2.1] + w["c"] = [2.2] + cut(w, method(:mean), 1, 2.05) + exit +end +#_test() + +# actually do something +def usage() + puts "lplp.rb [n] < " + puts " l0...: norms for selection" + puts "select_k: only output top k (according to the norm of their column vector) features" + puts " cut: output features with weight >= threshold" + puts " n: if we do not have a shard count use this number for averaging" + exit +end + +if ARGV.size < 3 then usage end +norm_fun = method(ARGV[0].to_sym) +type = ARGV[1] +x = ARGV[2].to_f + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +shard_count = 0 +while line = STDIN.gets + key, val = line.split /\s+/ + if key == shard_count_key + shard_count += 1 + next + end + if w.has_key? key + w[key].push val.to_f + else + w[key] = [val.to_f] + end +end + +if ARGV.size == 4 then shard_count = ARGV[3].to_f end + +if type == 'cut' + cut(w, norm_fun, shard_count, x) +elsif type == 'select_k' + select_k(w, norm_fun, shard_count, x) +else + puts "oh oh" +end + diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index fca9b10d..24e7f49e 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -80,7 +80,7 @@ def make_shards(input, refs, num_shards, epoch, rand) shard_refs = File.new refs_fn, 'w+' refs_fns << refs_fn 0.upto(shard_sz-1) { |i| - j = index.pop + j = index.pop shard_in.write in_lines[j] shard_refs.write refs_lines[j] } @@ -125,7 +125,7 @@ end if use_qsub qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \"" qsub_str_end = "\"" - local_end = '' + local_end = '' else local_end = "&>work/out.#{shard}.#{epoch}" end diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini index 068ebd4d..0215416d 100644 --- a/training/dtrain/test/example/cdec.ini +++ b/training/dtrain/test/example/cdec.ini @@ -2,7 +2,7 @@ formalism=scfg add_pass_through_rules=true scfg_max_span_limit=15 intersection_strategy=cube_pruning -cubepruning_pop_limit=30 +cubepruning_pop_limit=200 feature_function=WordPenalty feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz # all currently working feature functions for translation: -- cgit v1.2.3 From 2a48d73eb794fdd736d1df035c8a31af887cde0a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 15 Mar 2013 11:31:18 +0100 Subject: overhauled ruby scripts and examples --- training/dtrain/dtrain.cc | 2 - training/dtrain/examples/parallelized/README | 5 + training/dtrain/examples/parallelized/cdec.ini | 22 + training/dtrain/examples/parallelized/dtrain.ini | 16 + .../examples/parallelized/grammar/grammar.out.0.gz | Bin 0 -> 8318 bytes .../examples/parallelized/grammar/grammar.out.1.gz | Bin 0 -> 358560 bytes .../examples/parallelized/grammar/grammar.out.2.gz | Bin 0 -> 1014466 bytes .../examples/parallelized/grammar/grammar.out.3.gz | Bin 0 -> 391811 bytes .../examples/parallelized/grammar/grammar.out.4.gz | Bin 0 -> 149590 bytes .../examples/parallelized/grammar/grammar.out.5.gz | Bin 0 -> 537024 bytes .../examples/parallelized/grammar/grammar.out.6.gz | Bin 0 -> 291286 bytes .../examples/parallelized/grammar/grammar.out.7.gz | Bin 0 -> 1038140 bytes .../examples/parallelized/grammar/grammar.out.8.gz | Bin 0 -> 419889 bytes .../examples/parallelized/grammar/grammar.out.9.gz | Bin 0 -> 409140 bytes training/dtrain/examples/parallelized/in | 10 + training/dtrain/examples/parallelized/refs | 10 + training/dtrain/examples/parallelized/work/out.0.0 | 61 + training/dtrain/examples/parallelized/work/out.0.1 | 62 + training/dtrain/examples/parallelized/work/out.1.0 | 61 + training/dtrain/examples/parallelized/work/out.1.1 | 62 + .../dtrain/examples/parallelized/work/shard.0.0.in | 5 + .../examples/parallelized/work/shard.0.0.refs | 5 + .../dtrain/examples/parallelized/work/shard.1.0.in | 5 + .../examples/parallelized/work/shard.1.0.refs | 5 + .../dtrain/examples/parallelized/work/weights.0 | 12 + .../dtrain/examples/parallelized/work/weights.0.0 | 12 + .../dtrain/examples/parallelized/work/weights.0.1 | 12 + .../dtrain/examples/parallelized/work/weights.1 | 12 + .../dtrain/examples/parallelized/work/weights.1.0 | 11 + .../dtrain/examples/parallelized/work/weights.1.1 | 12 + training/dtrain/examples/standard/README | 2 + training/dtrain/examples/standard/cdec.ini | 26 + training/dtrain/examples/standard/dtrain.ini | 24 + training/dtrain/examples/standard/expected-output | 1206 ++++++++++++++++++++ training/dtrain/examples/standard/nc-wmt11.de.gz | Bin 0 -> 58324 bytes training/dtrain/examples/standard/nc-wmt11.en.gz | Bin 0 -> 49600 bytes .../dtrain/examples/standard/nc-wmt11.en.srilm.gz | Bin 0 -> 16017291 bytes .../dtrain/examples/standard/nc-wmt11.grammar.gz | Bin 0 -> 1399924 bytes training/dtrain/examples/toy/cdec.ini | 3 + training/dtrain/examples/toy/dtrain.ini | 13 + training/dtrain/examples/toy/expected-output | 77 ++ training/dtrain/examples/toy/grammar.gz | Bin 0 -> 219 bytes training/dtrain/examples/toy/src | 2 + training/dtrain/examples/toy/tgt | 2 + training/dtrain/lplp.rb | 18 +- training/dtrain/parallelize.rb | 26 +- training/dtrain/test/example/README | 8 - training/dtrain/test/example/cdec.ini | 25 - training/dtrain/test/example/dtrain.ini | 22 - training/dtrain/test/example/expected-output | 89 -- training/dtrain/test/parallelize/README | 5 - training/dtrain/test/parallelize/cdec.ini | 22 - training/dtrain/test/parallelize/dtrain.ini | 15 - .../dtrain/test/parallelize/g/grammar.out.0.gz | Bin 8318 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.1.gz | Bin 358560 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.2.gz | Bin 1014466 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.3.gz | Bin 391811 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.4.gz | Bin 149590 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.5.gz | Bin 537024 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.6.gz | Bin 291286 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.7.gz | Bin 1038140 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.8.gz | Bin 419889 -> 0 bytes .../dtrain/test/parallelize/g/grammar.out.9.gz | Bin 409140 -> 0 bytes training/dtrain/test/parallelize/in | 10 - training/dtrain/test/parallelize/refs | 10 - training/dtrain/test/toy/cdec.ini | 2 - training/dtrain/test/toy/dtrain.ini | 12 - training/dtrain/test/toy/input | 2 - 68 files changed, 1771 insertions(+), 252 deletions(-) create mode 100644 training/dtrain/examples/parallelized/README create mode 100644 training/dtrain/examples/parallelized/cdec.ini create mode 100644 training/dtrain/examples/parallelized/dtrain.ini create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.0.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.1.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.2.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.3.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.4.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.5.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.6.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.7.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.8.gz create mode 100644 training/dtrain/examples/parallelized/grammar/grammar.out.9.gz create mode 100644 training/dtrain/examples/parallelized/in create mode 100644 training/dtrain/examples/parallelized/refs create mode 100644 training/dtrain/examples/parallelized/work/out.0.0 create mode 100644 training/dtrain/examples/parallelized/work/out.0.1 create mode 100644 training/dtrain/examples/parallelized/work/out.1.0 create mode 100644 training/dtrain/examples/parallelized/work/out.1.1 create mode 100644 training/dtrain/examples/parallelized/work/shard.0.0.in create mode 100644 training/dtrain/examples/parallelized/work/shard.0.0.refs create mode 100644 training/dtrain/examples/parallelized/work/shard.1.0.in create mode 100644 training/dtrain/examples/parallelized/work/shard.1.0.refs create mode 100644 training/dtrain/examples/parallelized/work/weights.0 create mode 100644 training/dtrain/examples/parallelized/work/weights.0.0 create mode 100644 training/dtrain/examples/parallelized/work/weights.0.1 create mode 100644 training/dtrain/examples/parallelized/work/weights.1 create mode 100644 training/dtrain/examples/parallelized/work/weights.1.0 create mode 100644 training/dtrain/examples/parallelized/work/weights.1.1 create mode 100644 training/dtrain/examples/standard/README create mode 100644 training/dtrain/examples/standard/cdec.ini create mode 100644 training/dtrain/examples/standard/dtrain.ini create mode 100644 training/dtrain/examples/standard/expected-output create mode 100644 training/dtrain/examples/standard/nc-wmt11.de.gz create mode 100644 training/dtrain/examples/standard/nc-wmt11.en.gz create mode 100644 training/dtrain/examples/standard/nc-wmt11.en.srilm.gz create mode 100644 training/dtrain/examples/standard/nc-wmt11.grammar.gz create mode 100644 training/dtrain/examples/toy/cdec.ini create mode 100644 training/dtrain/examples/toy/dtrain.ini create mode 100644 training/dtrain/examples/toy/expected-output create mode 100644 training/dtrain/examples/toy/grammar.gz create mode 100644 training/dtrain/examples/toy/src create mode 100644 training/dtrain/examples/toy/tgt delete mode 100644 training/dtrain/test/example/README delete mode 100644 training/dtrain/test/example/cdec.ini delete mode 100644 training/dtrain/test/example/dtrain.ini delete mode 100644 training/dtrain/test/example/expected-output delete mode 100644 training/dtrain/test/parallelize/README delete mode 100644 training/dtrain/test/parallelize/cdec.ini delete mode 100644 training/dtrain/test/parallelize/dtrain.ini delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.0.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.1.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.2.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.3.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.4.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.5.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.6.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.7.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.8.gz delete mode 100644 training/dtrain/test/parallelize/g/grammar.out.9.gz delete mode 100644 training/dtrain/test/parallelize/in delete mode 100644 training/dtrain/test/parallelize/refs delete mode 100644 training/dtrain/test/toy/cdec.ini delete mode 100644 training/dtrain/test/toy/dtrain.ini delete mode 100644 training/dtrain/test/toy/input (limited to 'training/dtrain/test/example') diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index dfb5b351..fcb46db2 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -254,8 +254,6 @@ main(int argc, char** argv) time_t start, end; time(&start); - igzstream grammar_buf_in; - if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str()); score_t score_sum = 0.; score_t model_sum(0); unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; diff --git a/training/dtrain/examples/parallelized/README b/training/dtrain/examples/parallelized/README new file mode 100644 index 00000000..89715105 --- /dev/null +++ b/training/dtrain/examples/parallelized/README @@ -0,0 +1,5 @@ +run for example + ../../parallelize.rb ./dtrain.ini 4 false 2 2 ./in ./refs + +final weights will be in the file work/weights.3 + diff --git a/training/dtrain/examples/parallelized/cdec.ini b/training/dtrain/examples/parallelized/cdec.ini new file mode 100644 index 00000000..e43ba1c4 --- /dev/null +++ b/training/dtrain/examples/parallelized/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel ../example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/examples/parallelized/dtrain.ini b/training/dtrain/examples/parallelized/dtrain.ini new file mode 100644 index 00000000..f19ef891 --- /dev/null +++ b/training/dtrain/examples/parallelized/dtrain.ini @@ -0,0 +1,16 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=1.0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +# newer version of the grammar extractor use different feature names: +#print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +decoder_config=cdec.ini diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz new file mode 100644 index 00000000..1e28a24b Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz new file mode 100644 index 00000000..372f5675 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz new file mode 100644 index 00000000..145d0dc0 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz new file mode 100644 index 00000000..105593ff Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz new file mode 100644 index 00000000..30781f48 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz new file mode 100644 index 00000000..834ee759 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz new file mode 100644 index 00000000..2e76f348 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz new file mode 100644 index 00000000..3741a887 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz new file mode 100644 index 00000000..ebf6bd0c Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz differ diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz new file mode 100644 index 00000000..c1791059 Binary files /dev/null and b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz differ diff --git a/training/dtrain/examples/parallelized/in b/training/dtrain/examples/parallelized/in new file mode 100644 index 00000000..51d01fe7 --- /dev/null +++ b/training/dtrain/examples/parallelized/in @@ -0,0 +1,10 @@ +europas nach rassen geteiltes haus +ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen . +der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln . +während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden . +eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern . +die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden . +das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt . +die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen . +der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken . +genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen . diff --git a/training/dtrain/examples/parallelized/refs b/training/dtrain/examples/parallelized/refs new file mode 100644 index 00000000..632e27b0 --- /dev/null +++ b/training/dtrain/examples/parallelized/refs @@ -0,0 +1,10 @@ +europe 's divided racial house +a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge . +the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them . +while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon . +an aging population at home and ever more open borders imply increasing racial fragmentation in european countries . +mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear . +it will not , as america 's racial history clearly shows . +race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes . +the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths . +this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us . diff --git a/training/dtrain/examples/parallelized/work/out.0.0 b/training/dtrain/examples/parallelized/work/out.0.0 new file mode 100644 index 00000000..7a00ed0f --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.0.0 @@ -0,0 +1,61 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 3121929377 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.0.0.in' + refs 'work/shard.0.0.refs' + output 'work/weights.0.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = +0.2663 + WordPenalty = -0.0079042 + LanguageModel = +0.44782 + LanguageModel_OOV = -0.0401 + PhraseModel_0 = -0.193 + PhraseModel_1 = +0.71321 + PhraseModel_2 = +0.85196 + PhraseModel_3 = -0.43986 + PhraseModel_4 = -0.44803 + PhraseModel_5 = -0.0538 + PhraseModel_6 = -0.1788 + PassThrough = -0.1477 + --- + 1best avg score: 0.17521 (+0.17521) + 1best avg model score: 21.556 (+21.556) + avg # pairs: 1671.2 + avg # rank err: 1118.6 + avg # margin viol: 552.6 + non0 feature count: 12 + avg list sz: 100 + avg f count: 11.32 +(time 0.37 min, 4.4 s/S) + +Writing weights file to 'work/weights.0.0' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.17521]. +This took 0.36667 min. diff --git a/training/dtrain/examples/parallelized/work/out.0.1 b/training/dtrain/examples/parallelized/work/out.0.1 new file mode 100644 index 00000000..e2bd6649 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.0.1 @@ -0,0 +1,62 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 2767202922 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.0.0.in' + refs 'work/shard.0.0.refs' + output 'work/weights.0.1' + weights in 'work/weights.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = -0.2699 + WordPenalty = +0.080605 + LanguageModel = -0.026572 + LanguageModel_OOV = -0.30025 + PhraseModel_0 = -0.32076 + PhraseModel_1 = +0.67451 + PhraseModel_2 = +0.92 + PhraseModel_3 = -0.36402 + PhraseModel_4 = -0.592 + PhraseModel_5 = -0.0269 + PhraseModel_6 = -0.28755 + PassThrough = -0.33285 + --- + 1best avg score: 0.26638 (+0.26638) + 1best avg model score: 53.197 (+53.197) + avg # pairs: 2028.6 + avg # rank err: 998.2 + avg # margin viol: 918.8 + non0 feature count: 12 + avg list sz: 100 + avg f count: 10.496 +(time 0.32 min, 3.8 s/S) + +Writing weights file to 'work/weights.0.1' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.26638]. +This took 0.31667 min. diff --git a/training/dtrain/examples/parallelized/work/out.1.0 b/training/dtrain/examples/parallelized/work/out.1.0 new file mode 100644 index 00000000..6e790e38 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.1.0 @@ -0,0 +1,61 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 1432415010 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.1.0.in' + refs 'work/shard.1.0.refs' + output 'work/weights.1.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = -0.3815 + WordPenalty = +0.20064 + LanguageModel = +0.95304 + LanguageModel_OOV = -0.264 + PhraseModel_0 = -0.22362 + PhraseModel_1 = +0.12254 + PhraseModel_2 = +0.26328 + PhraseModel_3 = +0.38018 + PhraseModel_4 = -0.48654 + PhraseModel_5 = +0 + PhraseModel_6 = -0.3645 + PassThrough = -0.2216 + --- + 1best avg score: 0.10863 (+0.10863) + 1best avg model score: -4.9841 (-4.9841) + avg # pairs: 1345.4 + avg # rank err: 822.4 + avg # margin viol: 501 + non0 feature count: 11 + avg list sz: 100 + avg f count: 11.814 +(time 0.45 min, 5.4 s/S) + +Writing weights file to 'work/weights.1.0' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.10863]. +This took 0.45 min. diff --git a/training/dtrain/examples/parallelized/work/out.1.1 b/training/dtrain/examples/parallelized/work/out.1.1 new file mode 100644 index 00000000..0b984761 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.1.1 @@ -0,0 +1,62 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 1771918374 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.1.0.in' + refs 'work/shard.1.0.refs' + output 'work/weights.1.1' + weights in 'work/weights.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = -0.3178 + WordPenalty = +0.11092 + LanguageModel = +0.17269 + LanguageModel_OOV = -0.13485 + PhraseModel_0 = -0.45371 + PhraseModel_1 = +0.38789 + PhraseModel_2 = +0.75311 + PhraseModel_3 = -0.38163 + PhraseModel_4 = -0.58817 + PhraseModel_5 = -0.0269 + PhraseModel_6 = -0.27315 + PassThrough = -0.16745 + --- + 1best avg score: 0.13169 (+0.13169) + 1best avg model score: 24.226 (+24.226) + avg # pairs: 1951.2 + avg # rank err: 985.4 + avg # margin viol: 951 + non0 feature count: 12 + avg list sz: 100 + avg f count: 11.224 +(time 0.42 min, 5 s/S) + +Writing weights file to 'work/weights.1.1' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.13169]. +This took 0.41667 min. diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.in b/training/dtrain/examples/parallelized/work/shard.0.0.in new file mode 100644 index 00000000..92f9c78e --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.0.0.in @@ -0,0 +1,5 @@ +europas nach rassen geteiltes haus +ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen . +der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln . +während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden . +eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern . diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.refs b/training/dtrain/examples/parallelized/work/shard.0.0.refs new file mode 100644 index 00000000..bef68fee --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.0.0.refs @@ -0,0 +1,5 @@ +europe 's divided racial house +a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge . +the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them . +while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon . +an aging population at home and ever more open borders imply increasing racial fragmentation in european countries . diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.in b/training/dtrain/examples/parallelized/work/shard.1.0.in new file mode 100644 index 00000000..b7695ce7 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.1.0.in @@ -0,0 +1,5 @@ +die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden . +das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt . +die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen . +der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken . +genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen . diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.refs b/training/dtrain/examples/parallelized/work/shard.1.0.refs new file mode 100644 index 00000000..6076f6d5 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.1.0.refs @@ -0,0 +1,5 @@ +mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear . +it will not , as america 's racial history clearly shows . +race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes . +the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths . +this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us . diff --git a/training/dtrain/examples/parallelized/work/weights.0 b/training/dtrain/examples/parallelized/work/weights.0 new file mode 100644 index 00000000..ddd595a8 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0 @@ -0,0 +1,12 @@ +LanguageModel 0.7004298992212881 +PhraseModel_2 0.5576194336478857 +PhraseModel_1 0.41787318415343155 +PhraseModel_4 -0.46728502545635164 +PhraseModel_3 -0.029839521598455515 +Glue -0.05760000000000068 +PhraseModel_6 -0.2716499999999978 +PhraseModel_0 -0.20831031065605327 +LanguageModel_OOV -0.15205000000000077 +PassThrough -0.1846500000000006 +WordPenalty 0.09636994553433414 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.0.0 b/training/dtrain/examples/parallelized/work/weights.0.0 new file mode 100644 index 00000000..c9370b18 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0.0 @@ -0,0 +1,12 @@ +WordPenalty -0.0079041595706392243 +LanguageModel 0.44781580828279532 +LanguageModel_OOV -0.04010000000000042 +Glue 0.26629999999999948 +PhraseModel_0 -0.19299677809125185 +PhraseModel_1 0.71321026861732773 +PhraseModel_2 0.85195540993310537 +PhraseModel_3 -0.43986310822842656 +PhraseModel_4 -0.44802855630415955 +PhraseModel_5 -0.053800000000000514 +PhraseModel_6 -0.17879999999999835 +PassThrough -0.14770000000000036 diff --git a/training/dtrain/examples/parallelized/work/weights.0.1 b/training/dtrain/examples/parallelized/work/weights.0.1 new file mode 100644 index 00000000..8fad3de8 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0.1 @@ -0,0 +1,12 @@ +WordPenalty 0.080605055841244472 +LanguageModel -0.026571720531022844 +LanguageModel_OOV -0.30024999999999141 +Glue -0.26989999999999842 +PhraseModel_2 0.92000295209089566 +PhraseModel_1 0.67450748692470841 +PhraseModel_4 -0.5920000014976784 +PhraseModel_3 -0.36402437203127397 +PhraseModel_6 -0.28754999999999603 +PhraseModel_0 -0.32076244202907672 +PassThrough -0.33284999999999004 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.1 b/training/dtrain/examples/parallelized/work/weights.1 new file mode 100644 index 00000000..03058a16 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.1 @@ -0,0 +1,12 @@ +PhraseModel_2 0.8365578543552836 +PhraseModel_4 -0.5900840266009169 +PhraseModel_1 0.5312000609786991 +PhraseModel_0 -0.3872342271319619 +PhraseModel_3 -0.3728279676912084 +Glue -0.2938500000000036 +PhraseModel_6 -0.2803499999999967 +PassThrough -0.25014999999999626 +LanguageModel_OOV -0.21754999999999702 +LanguageModel 0.07306061161169894 +WordPenalty 0.09576193325966899 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.1.0 b/training/dtrain/examples/parallelized/work/weights.1.0 new file mode 100644 index 00000000..6a6a65c1 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.1.0 @@ -0,0 +1,11 @@ +WordPenalty 0.20064405063930751 +LanguageModel 0.9530439901597807 +LanguageModel_OOV -0.26400000000000112 +Glue -0.38150000000000084 +PhraseModel_0 -0.22362384322085468 +PhraseModel_1 0.12253609968953538 +PhraseModel_2 0.26328345736266612 +PhraseModel_3 0.38018406503151553 +PhraseModel_4 -0.48654149460854373 +PhraseModel_6 -0.36449999999999722 +PassThrough -0.22160000000000085 diff --git a/training/dtrain/examples/parallelized/work/weights.1.1 b/training/dtrain/examples/parallelized/work/weights.1.1 new file mode 100644 index 00000000..f56ea4a2 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.1.1 @@ -0,0 +1,12 @@ +WordPenalty 0.1109188106780935 +LanguageModel 0.17269294375442074 +LanguageModel_OOV -0.13485000000000266 +Glue -0.3178000000000088 +PhraseModel_2 0.75311275661967159 +PhraseModel_1 0.38789263503268989 +PhraseModel_4 -0.58816805170415531 +PhraseModel_3 -0.38163156335114284 +PhraseModel_6 -0.27314999999999739 +PhraseModel_0 -0.45370601223484697 +PassThrough -0.16745000000000249 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/standard/README b/training/dtrain/examples/standard/README new file mode 100644 index 00000000..ce37d31a --- /dev/null +++ b/training/dtrain/examples/standard/README @@ -0,0 +1,2 @@ +Call `dtrain` from this folder with ../../dtrain -c dtrain.ini . + diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini new file mode 100644 index 00000000..e1edc68d --- /dev/null +++ b/training/dtrain/examples/standard/cdec.ini @@ -0,0 +1,26 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +grammar=nc-wmt11.grammar.gz +feature_function=WordPenalty +feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz +# all currently working feature functions for translation: +# (with those features active that were used in the ACL paper) +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +feature_function=RuleIdentityFeatures +feature_function=RuleSourceBigramFeatures +feature_function=RuleTargetBigramFeatures +feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini new file mode 100644 index 00000000..a05e9c29 --- /dev/null +++ b/training/dtrain/examples/standard/dtrain.ini @@ -0,0 +1,24 @@ +input=./nc-wmt11.de.gz +refs=./nc-wmt11.en.gz +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +select_weights=avg # output average (over epochs) weight vector +decoder_config=./cdec.ini # config for cdec +# weights for these features will be printed on each iteration +print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV +# newer version of the grammar extractor use different feature names: +#print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +stop_after=10 # stop epoch after 10 inputs + +# interesting stuff +epochs=2 # run over input 2 times +k=100 # use 100best lists +N=4 # optimize (approx) BLEU4 +scorer=stupid_bleu # use 'stupid' BLEU+1 +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +gamma=0 # use SVM reg +sample_from=kbest # use kbest lists (as opposed to forest) +filter=uniq # only unique entries in kbest (surface form) +pair_sampling=XYX # +hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here +pair_threshold=0 # minimum distance in BLEU (here: > 0) +loss_margin=0 diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output new file mode 100644 index 00000000..8d72f4c3 --- /dev/null +++ b/training/dtrain/examples/standard/expected-output @@ -0,0 +1,1206 @@ + cdec cfg './cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ./nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** + Example feature: Shape_S00000_T00000 +Seeding random number sequence to 1511823303 + +dtrain +Parameters: + k 100 + N 4 + T 2 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'avg' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg './cdec.ini' + input './nc-wmt11.de.gz' + refs './nc-wmt11.en.gz' + output '-' + stop_after 10 +(a dot represents 10 inputs) +Iteration #1 of 2. + . 10 +Stopping after 10 input sentences. +WEIGHTS + EgivenFCoherent = +0 + SampleCountF = +0 + CountEF = +0 + MaxLexFgivenE = +0 + MaxLexEgivenF = +0 + IsSingletonF = +0 + IsSingletonFE = +0 + Glue = -576 + WordPenalty = +417.79 + PassThrough = -1455 + LanguageModel = +5117.5 + LanguageModel_OOV = -1307 + --- + 1best avg score: 0.27697 (+0.27697) + 1best avg model score: -47918 (-47918) + avg # pairs: 1129.8 + avg # rank err: 581.9 + avg # margin viol: 0 + non0 feature count: 703 + avg list sz: 90.9 + avg f count: 100.09 +(time 0.33 min, 2 s/S) + +Iteration #2 of 2. + . 10 +WEIGHTS + EgivenFCoherent = +0 + SampleCountF = +0 + CountEF = +0 + MaxLexFgivenE = +0 + MaxLexEgivenF = +0 + IsSingletonF = +0 + IsSingletonFE = +0 + Glue = -622 + WordPenalty = +898.56 + PassThrough = -2578 + LanguageModel = +8066.2 + LanguageModel_OOV = -2590 + --- + 1best avg score: 0.37119 (+0.094226) + 1best avg model score: -1.3174e+05 (-83822) + avg # pairs: 1214.9 + avg # rank err: 584.1 + avg # margin viol: 0 + non0 feature count: 1115 + avg list sz: 91.3 + avg f count: 90.755 +(time 0.27 min, 1.6 s/S) + +Writing weights file to '-' ... +R:X:NX_sein:N1_its 61.5 +WordPenalty 658.17328732437022 +LanguageModel 6591.8747593425214 +LanguageModel_OOV -1948.5 +R:X:das_NX:this_N1 12 +R:X:NX_sein_NX:N1_from_ever_being_able_to_N2 30 +R:X:NX_bemühen:N1_effort 2.5 +RBS:X_bemühen 2.5 +R:X:sich:sich -17.5 +RBT:_sich -17.5 +RBT:sich_ -17.5 +RBS:sich_X 17.5 +RBS:_als 147 +RBS:als_ -59 +Shape_S10000_T10000 -1711.5 +RBT:_when 84 +R:X:zum_NX:as_N1 -134 +RBS:_zum -30 +R:X:als_NX:as_N1 63 +R:X:zum_NX:'s_N1 33 +R:X:zum_NX:the_N1 24 +RBS:X_sich -12 +R:X:zum_NX:to_N1 -36 +R:X:zum_NX:with_the_N1 83 +R:X:NX_zum:N1_the -66 +R:X:NX_zum:N1_to 66 +R:X:als_NX:when_N1 84 +RBS:als_das 59 +RBS:X_das -104 +R:X:NX_das:N1_a 28.5 +R:X:er_sich_NX:he_N1 86.5 +RBS:er_sich 29.5 +R:X:NX_das:N1_it -6 +R:X:er_sich_NX:him_N1 -57 +RBT:_declared -488 +R:X:NX_das:N1_that -5 +RBT:declared_ -8 +R:X:NX_das:N1_the -57 +R:X:NX_das:N1_this -17 +R:X:NX_.:N1_. -323 +RBS:X_. 134 +R:X:NX_.:N1_debate_. 6.5 +R:X:NX_.:N1_disruptions_. -14.5 +R:X:NX_.:N1_established_. 7.5 +R:X:NX_.:N1_heading_. 17 +R:X:NX_.:N1_on_. 94 +R:X:NX_.:N1_pace_. 51.5 +R:X:NX_das_NX:N1_a_growing_N2 -45 +R:X:general:general -23.5 +R:X:NX_.:N1_politics_. 84 +R:X:NX_das_NX:N1_a_N2 -0.5 +R:X:NX_.:N1_power_. -99.5 +RBS:general_ -23.5 +R:X:NX_.:N1_-_range_missiles_. -28.5 +Shape_S11000_T11000 40 +RBT:general_ -23.5 +RBT:_. -645 +R:X:betrat:entered -91 +R:X:NX_.:N1_war_. 68.5 +RBS:_betrat 23.5 +Shape_S11000_T01100 475.5 +RBT:_entered -91 +RBT:entered_ -91 +R:X:NX_das_NX:N1_the_N2 -2 +R:X:betrat:betrat 114.5 +RBT:_betrat 114.5 +RBT:betrat_ 114.5 +R:X:12:12 79 +R:X:maßnahmen:action 24 +R:X:.:. -566 +RBS:12_ 79 +RBS:_maßnahmen -44.5 +RBS:_. -645 +RBT:._ -566 +RBT:_action 24 +RBT:12_ 79 +RBT:action_ 24 +R:X:maßnahmen:actions -13 +RBT:_actions -13 +RBT:actions_ -13 +R:X:12_NX:12_N1 -79 +RBT:declared_a -428 +RBS:12_X -79 +RBT:a_state -428 +RBT:state_of -428 +R:X:maßnahmen:maßnahmen -55.5 +R:X:internationale_NX:global_N1 -270 +RBS:X_am 316.5 +RBT:_maßnahmen -55.5 +RBS:am_ 267.5 +RBT:maßnahmen_ -55.5 +RBS:_den 883 +R:X:internationale_NX:international_N1 270 +RBS:den_X -286.5 +R:X:NX_am:N1_of 267.5 +R:X:NX_als:N1_a -273.5 +RBS:am_X -281 +R:X:den_NX:'s_N1 -31 +R:X:NX_am_NX:N1_of_N2 -30 +R:X:NX_am_NX:N1_on_N2 79 +R:X:NX_als:N1_'s 273.5 +R:X:NX_betrat:N1_entered -23.5 +R:X:ins_NX:into_the_N1 -32.5 +RBS:X_betrat -23.5 +RBT:into_the -55 +R:X:ins_NX:into_N1 32.5 +RBT:_their 303 +R:X:general_NX:general_N1 23.5 +RBS:general_X 23.5 +RBS:_am -316.5 +R:X:den_NX:the_N1 89 +R:X:den_NX_.:the_N1_. 86.5 +R:X:NX_und:and_N1 -216 +RBS:X_und -203.5 +RBS:und_ 522.5 +RBT:_and 438.5 +R:X:am_NX:at_N1 23 +R:X:NX_als_das:N1_than_the 59 +R:X:NX_und:N1_- -114 +R:X:NX_und:N1_, 114 +R:X:am_NX:of_N1 -4 +R:X:am_NX:on_N1 -158.5 +R:X:am_NX:the_N1 -190 +RBS:_seine -16.5 +RBS:seine_ 39 +R:X:oktober:october -79.5 +R:X:seine:his -5.5 +RBS:_oktober -79.5 +R:X:seine:its 50 +RBT:_october -79.5 +RBT:october_ -79.5 +R:X:seine_NX:a_N1 7.5 +RBS:seine_X -39 +R:X:NX_und_NX:and_N1_N2 -22 +RBS:und_X 160.5 +R:X:seine_NX:his_N1 -97 +R:X:seine_NX:its_N1 102.5 +R:X:NX_und_NX:N1_,_and_N2 -4 +R:X:NX_maßnahmen:N1_actions 44.5 +RBS:X_maßnahmen 44.5 +R:X:seine_NX_als:his_N1_than 5.5 +R:X:seine_NX_als:its_N1_as -64.5 +R:X:NX_und_NX:N1_,_N2 -7 +Shape_S01100_T11000 -312.5 +RBS:und_den -822.5 +Shape_S01100_T01100 -537.5 +Shape_S01100_T11100 15 +R:X:NX_seine:'s_N1 -5.5 +RBS:X_seine 16.5 +RBS:X_den -38 +R:X:amerika_NX_sich_NX:america_N1_N2 -12 +R:X:NX_seine_NX:'s_N1_N2 22 +R:X:auf_NX_den_NX:to_N1_the_N2 -23 +R:X:auf_NX_den_NX:to_N1_N2 -23 +RBS:_unterstützen -716 +RBS:unterstützen_ -1 +Shape_S11100_T11000 783.5 +Shape_S11100_T01100 -716 +Shape_S11100_T11100 488 +R:X:unterstützen:unterstützen -1 +RBT:_unterstützen -1 +RBT:unterstützen_ -1 +R:X:unterstützen_NX:support_N1 -715 +RBS:unterstützen_X -715 +RBT:_will -6 +RBS:X_unterstützen 716 +RBT:_if 35 +R:X:NX_den_NX_.:N1_N2_. 41 +R:X:verfassung:constitution 15 +RBS:_verfassung -43 +RBT:_constitution 15 +RBT:constitution_ 15 +R:X:verfassung:constitutional 9.5 +RBT:_constitutional 9.5 +RBS:unterstützen_. 716 +RBT:constitutional_ 9.5 +R:X:NX_unterstützen_.:N1_. 716 +R:X:verfassung:verfassung -67.5 +R:X:eine_NX:an_N1 162 +RBT:_verfassung -67.5 +RBT:verfassung_ -67.5 +R:X:und:, -21.5 +R:X:,_NX_zu_NX:to_N2_N1 -153 +RBS:_und -389.5 +R:X:und:and -35 +RBS:angeführten_ -716 +RBT:and_ -35 +RBT:_as 63 +RBS:versucht_ 68 +R:X:und:with -3 +R:X:eine_NX:is_N1 -162 +RBS:angeführten_X 716 +R:X:und:und 91 +RBT:_und 91 +RBT:und_ 91 +R:X:versucht:tried 68 +RBT:tried_ 68 +RBS:versucht_X -68 +R:X:versucht_NX:tried_N1 -68 +R:X:und_NX:and_N1 250 +R:X:und_NX:with_N1 -18 +R:X:und_NX:,_N1 -7 +R:X:und_NX:N1_and -12 +R:X:und_den_NX:and_N1 -716 +R:X:er:he 17 +R:X:NX_eine:N1_is -7 +RBS:_er -47.5 +RBS:er_ 54 +RBT:_he 485.5 +RBT:he_ 17 +RBT:_him -1 +R:X:und_NX_.:,_N1_. -3 +R:X:er:his 91 +R:X:und_den_NX_.:and_the_N1_. 88 +R:X:NX_eine:N1_will 7 +R:X:er:it 3 +R:X:und_den_NX_.:and_N1_. -216.5 +R:X:er:er -196 +RBT:_er -196 +RBT:er_ -196 +RBS:er_X 8 +R:X:er_NX:he_N1 399 +R:X:er_NX:it_N1 -379 +Shape_S01010_T01010 -599 +RBS:pakistanischen_ 43 +R:X:NX_versucht:N1_tried 196 +RBT:_pakistan -43 +RBT:_pakistani 2 +R:X:er_NX_,_NX:he_N1_N2 -12 +R:X:NX_hat_er:N1_,_he_has 196 +RBS:hat_er 196 +R:X:NX_er:he_N1 -17 +RBS:X_er -148.5 +RBS:pakistanischen_X -43 +R:X:NX_er:it_N1 -7 +RBS:X_verfassung 43 +R:X:NX_verfassung:N1_'s_constitution 43 +R:X:NX_hat_NX_versucht:N1_N2_has_tried -190 +R:X:NX_hat_NX_versucht:N1_,_N2_has_tried -6 +RBS:der_pakistanischen 43 +RBS:X_pakistanischen -43 +RBS:_aber 46 +RBS:,_als -147 +RBT:_but -321 +R:X:aber_NX:but_N1 46 +R:X:von_NX_angeführten:N1_-_led -716 +R:X:von_NX_angeführten_NX:N1_-_led_N2 716 +RBS:,_aber -114 +RBS:X_aber 68 +R:X:,_als_NX:,_as_N1 -40 +R:X:NX_aber_NX_,:N1_N2_to 68 +R:X:NX_pakistanischen_NX_.:pakistan_N1_N2_. -43 +R:X:NX_,_aber_NX:N1_,_N2 -114 +RBS:_rahmen 43 +RBS:rahmen_ 43 +R:X:rahmen:within 20 +R:X:rahmen:rahmen 23 +RBT:_rahmen 23 +RBT:rahmen_ 23 +Shape_S01110_T11010 35.5 +R:X:NX_der_pakistanischen:N1_pakistan 43 +Shape_S01110_T01110 -1195 +Shape_S01110_T11110 -6.5 +R:X:NX_,_NX_er:N1_N2_he -33 +RBS:geben_X -577.5 +RBS:_gestalten 196 +Shape_S01110_T01011 278 +RBS:gestalten_ 196 +RBS:geben_und 577.5 +R:X:gestalten:more 221 +Shape_S01110_T01111 -181.5 +RBT:_more 221 +RBT:more_ 221 +R:X:gestalten:gestalten -25 +RBT:_gestalten -25 +RBT:gestalten_ -25 +R:X:effektiver:effectively -151 +RBS:_effektiver 54 +RBS:effektiver_ -221 +RBT:_effectively -151 +RBT:effectively_ -151 +R:X:effektiver:effektiver -99 +RBT:_effektiver -99 +RBT:effektiver_ -99 +Shape_S11110_T11010 -1130 +RBS:zu_geben -107.5 +R:X:effektiver_zu_NX:N1_effectively 304 +RBS:effektiver_zu 221 +RBS:X_geben 107.5 +Shape_S11110_T01110 621 +Shape_S11110_T11110 -75 +RBS:X_gestalten -196 +R:X:NX_gestalten_.:N1_. -196 +RBS:gestalten_. -196 +R:X:terror:terror 672 +RBS:_terror -16 +RBS:terror_ 640 +R:X:den:- -4 +RBT:_terror 136 +RBT:terror_ 646 +RBS:den_ 42.5 +R:X:den:for -11.5 +R:X:terror:terrorism -54 +RBT:_terrorism -54 +Shape_S11110_T11011 -4.5 +RBT:terrorism_ -54 +R:X:terror_NX:terror_N1 -634 +R:X:den:of -17 +RBS:terror_X -640 +R:X:den:'s 32.5 +Shape_S11110_T01111 -1.5 +R:X:NX_effektiver:N1_more_effectively 29 +RBS:X_effektiver -54 +R:X:den:the 68 +R:X:NX_geben_und:N1_and 107.5 +R:X:NX_effektiver_zu_NX:N1_N2_effectively -83 +R:X:den:to -33 +RBS:1999_ -302.5 +R:X:,_NX_zu_geben_NX:to_N1_N2 -577.5 +R:X:den:with -10 +RBS:X_terror -4.5 +R:X:,_NX_zu_geben_und:to_N1_and 470 +R:X:NX_1999:N1_1999 -302.5 +R:X:NX_1999_NX:N2_N1_1999 302.5 +RBS:1999_X 302.5 +R:X:den_NX_zu:to_N1 783.5 +R:X:NX_rahmen_der:N1_the -43 +RBS:X_rahmen -43 +RBS:rahmen_der -43 +RBS:gegen_ 22.5 +R:X:gegen:against -2 +RBT:_against -2 +RBT:against_ -2 +R:X:._NX:._N1 -79 +RBS:._X -79.5 +RBS:gegen_den -22.5 +R:X:NX_._oktober:october_N1 79.5 +RBS:._oktober 79.5 +R:X:am_NX_._NX:the_N2_N1 -0.5 +R:X:gegen_den_NX:on_N1 2 +RBS:den_terror 20.5 +RBT:on_terror -26 +R:X:NX_den_terror:the_N1_terror 29 +R:X:den_NX_den_NX:the_N1_N2 -110.5 +R:X:den_NX_den_NX:N2_the_N1 -95 +RBT:_the -1.5 +R:X:krieg:war -4.5 +RBS:_krieg -22 +R:X:musharraf:musharraf 43 +RBS:krieg_ -4.5 +RBT:_war -22 +RBS:_musharraf 66.5 +RBS:musharraf_ -23.5 +RBT:war_ -4.5 +R:X:musharraf_NX:musharraf_imposed_N1 23.5 +RBS:musharraf_X 23.5 +RBT:musharraf_imposed 23.5 +RBS:krieg_gegen 4.5 +R:X:musharraf_NX:musharraf_N1 107 +R:X:krieg_gegen:war_on 24.5 +RBT:war_on -17.5 +RBS:X_gegen -4.5 +R:X:musharraf_NX_,_als_NX:musharraf_N1_as_N2 -20 +R:X:musharraf_NX_,_als_NX:musharraf_N1_N2 -87 +R:X:krieg_gegen_den_NX:war_on_N1 -16 +R:X:krieg_gegen_den_terror:war_on_terror -26 +R:X:pervez:pervez 22 +RBS:_pervez 22 +RBS:pervez_ 57.5 +RBS:X_krieg 22 +RBT:_pervez 22 +RBT:pervez_ 22 +RBS:pervez_musharraf -57.5 +RBS:X_musharraf -9 +R:X:NX_musharraf:N1_musharraf -9 +R:X:den_NX_gegen_den:the_N1_on -4.5 +R:X:den_NX_den_terror:the_N1_terror -3 +R:X:NX_krieg_gegen_den_terror:N1_war_on_terror 22 +R:X:den_NX_den_terror_NX:N2_the_N1_terror -1.5 +RBT:_project 91 +RBS:hat_ 2 +RBS:X_- 14 +R:X:NX_-:,_N1 48.5 +R:X:NX_-:N1_months_of 32 +R:X:NX_-:N1_relief_and 64 +R:X:NX_-:N1_'s -144.5 +RBS:hat_X -198 +R:X:und_NX_terror_NX:and_N2_N1_terror -4.5 +RBT:and_ -4.5 +R:X:sorgen:bring -19 +RBS:X_pervez -22 +RBT:_bring -19 +RBT:bring_ -19 +R:X:sorgen:ensure 19 +RBT:_ensure 19 +RBT:ensure_ 19 +R:X:NX_-_NX:N1_N2_security -4 +R:X:NX_projekt_NX:N2_N1_project -156 +R:X:NX_-_NX_.:N1_N2_. 18 +R:X:NX_projekt_NX_.:N2_N1_project_. 156 +RBS:_- -14 +RBT:to_ensure 0.5 +R:X:NX_hat:has_N1 -5 +R:X:NX_hat:N1_, 3 +R:X:NX_hat:,_N1 21.5 +R:X:NX_hat:N1_has -17 +R:X:NX_hat:N1_is -0.5 +R:X:-_NX:of_N1 -26 +R:X:-_NX:'s_N1 -58 +R:X:NX_hat_NX:N1_,_N2 -73 +R:X:NX_hat_NX:N1_N2_has 28 +R:X:-_NX:-_N1 122 +R:X:NX_hat_NX:N1_,_N2_has 21 +R:X:-_NX:--_N1 -21 +R:X:-_NX:,_N1 -31 +R:X:stabilität:stability -118 +RBS:_stabilität -129 +RBT:_stability -118 +RBT:stability_ -118 +R:X:stabilität:stabilität -11 +RBT:_stabilität -11 +RBT:stabilität_ -11 +RBT:_country 253 +RBS:_für 101 +RBS:für_ 129 +RBS:X_ihres -16 +R:X:NX_ihres_NX:N1_of_their_N2 -16 +R:X:für:that 129 +RBT:_political -16 +RBS:für_X -129 +R:X:,_NX_und_NX:,_N1_N2 -2 +R:X:für_NX:to_N1 -28 +R:X:NX_stabilität:N1_stability 129 +RBS:X_stabilität 129 +RBS:X_für 22 +RBT:_with -109 +RBS:,_für -123 +R:X:,_für_NX:,_N1 15.5 +R:X:,_NX_den_NX_zu:to_N2_N1 69 +R:X:NX_für_NX_.:N1_N2_. 22 +RBS:_ihres 16 +R:X:ihres_NX:its_N1 -50 +R:X:ihres_NX:their_N1 66 +R:X:NX_zu_verkaufen_NX:sell_N1_N2 140.5 +RBS:verkaufen_X 140.5 +RBS:_würde -204 +RBS:würde_ -117 +R:X:würde:would -204 +RBS:würde_X 126 +R:X:in_NX_hat_NX:in_N1_N2 22 +R:X:NX_dem_NX_pervez:N1_N2_pervez 35.5 +RBS:_halten 284 +RBS:halten_ 204 +R:X:NX_dem_NX_pervez_musharraf:N1_N2_pervez_musharraf -57.5 +Shape_S01111_T01011 560.5 +Shape_S01111_T11011 -20.5 +Shape_S01111_T01111 -5 +RBT:_maintain 30 +R:X:halten:halten 284 +RBT:_halten 284 +RBT:halten_ 284 +RBS:halten_X -204 +R:X:NX_würde:if_N1 35 +RBS:X_würde 204 +R:X:NX_würde:will_N1 -6 +Shape_S11111_T11010 69 +R:X:NX_würde:would_face_a_N1 -9.5 +RBT:would_face -18.5 +RBT:face_a -18.5 +Shape_S11111_T11110 -57 +R:X:NX_würde:would_N1 78 +R:X:NX_würde:N1_will -10.5 +R:X:NX_würde_NX:would_N1_N2 126 +R:X:NX_würde_.:would_face_a_N1_. -9 +RBS:würde_. -9 +PhraseModel_0 -2973.8953021225416 +R:X:vielleicht:may -177 +PhraseModel_1 -4012.0052074229625 +PhraseModel_2 -1203.5725821427027 +RBS:vielleicht_ -284 +PhraseModel_3 2747.8420998127522 +PhraseModel_4 -3205.3163436680484 +PhraseModel_5 720.5 +PhraseModel_6 275 +R:X:vielleicht:vielleicht -107 +RBT:_vielleicht -107 +RBT:vielleicht_ -107 +R:X:vielleicht_NX:perhaps_N1 284 +RBS:vielleicht_X 284 +R:X:NX_halten:maintain_the_N1 -29 +RBS:X_halten -284 +RBT:maintain_the -174 +R:X:NX_halten:N1_hold -51 +R:X:NX_halten_NX:N2_maintain_the_N1 -204 +RBT:_maintain -204 +RBS:_versprechen 30 +RBS:versprechen_ -75 +RBT:_commitment 107 +R:X:versprechen_NX:commitment_N1 30 +RBS:versprechen_X 75 +R:X:NX_versprechen:N1_commitment -75 +RBS:X_versprechen -30 +R:X:NX_,_für_NX:N1_,_N2 -138.5 +R:X:NX_versprechen_NX:N1_commitment_N2 45 +RBS:_dass -451 +RBS:dass_ -91.5 +R:X:dass_NX:that_N1 -451 +RBS:dass_X 91.5 +R:X:NX_er_sein:N1_to_make_up_for_his -91.5 +RBS:er_sein -91.5 +R:X:seine_NX_und:a_N1_, -15 +R:X:NX_,_NX_und:N1_N2_, 129 +RBS:,_dass 851.5 +R:X:NX_,_dass:N1_keep -27 +R:X:NX_,_dass:N1_said_that -0.5 +R:X:NX_,_dass:N1_to_let -9.5 +R:X:NX_dass:that_N1 -8.5 +RBS:X_dass -400.5 +R:X:NX_dass:N1_let -51.5 +R:X:NX_dass:N1_see -243.5 +R:X:NX_dass:N1_thought -97 +R:X:NX_,_dass_NX:N1_that_N2 134 +Glue -599 +PassThrough -2016.5 +R:X:musharrafs:his 2 +RBS:musharrafs_ -29 +R:X:NX_und_den:N1_and_the 22 +RBT:_his 250.5 +RBT:his_ 160.5 +R:X:musharrafs:musharraf -1.5 +RBT:_musharraf 135.5 +RBT:musharraf_ 41.5 +R:X:NX_,_dass_NX_.:N1_N2_. 91.5 +R:X:musharrafs:musharrafs -29.5 +RBT:_musharrafs -29.5 +RBT:musharrafs_ -29.5 +RBS:sie_X 346 +RBS:_X -1369.5 +R:X:dies:so -74.5 +RBS:X_ -1743 +RBS:dies_ -348 +R:X:dies:so_,_this 47 +RBT:so_, 47 +R:X:sie_NX:it_N1 22 +RBT:,_this 47 +R:X:dies:that -256.5 +R:X:NX_?:N1_? -134.5 +R:X:dies:these -5.5 +RBS:X_? -235 +RBT:_these -5.5 +RBT:these_ -5.5 +R:X:NX_?:N1_consulting_? -100.5 +R:X:dies:this -58.5 +R:X:letzter_NX:last_N1 -14 +RBS:_letzter -20 +RBS:letzter_X 19.5 +RBT:_last -2 +R:X:letzter:last 7 +RBS:letzter_ -19.5 +R:X:sein:be 1.5 +RBT:last_ 7 +R:X:letzter:late 11.5 +RBT:_they -6 +RBS:sein_ 68 +RBT:_late 11.5 +R:X:ist_NX:be_N1 464.5 +RBT:_be -10.5 +RBT:late_ 11.5 +R:X:sie_NX:they_N1 -22 +RBS:_ist 415.5 +RBT:be_ 120 +R:X:letzter:letzter -24.5 +RBS:ist_X 8 +R:X:sein:being -16 +RBT:_letzter -24.5 +R:X:ist_NX:has_N1 16 +RBT:_being -79 +RBT:letzter_ -24.5 +R:X:ist_NX:is_at_N1 6 +RBT:being_ -16 +R:X:musharrafs_NX:his_N1 -25 +R:X:sein:his 73 +RBS:musharrafs_X 29 +R:X:ist_NX:is_well_N1 6 +R:X:sein:its -15.5 +R:X:musharrafs_NX:musharraf_'s_N1 77.5 +R:X:sein:sein 55 +RBT:musharraf_'s 55.5 +R:X:ist_NX:is_N1 23 +RBT:_sein 55 +R:X:musharrafs_NX:musharraf_N1 -23.5 +R:X:ist_NX:more_N1 -130.5 +RBT:sein_ 55 +R:X:NX_letzter:N1_late -26.5 +R:X:ist_NX:N1_be 176 +R:X:ziel:aim -32.5 +RBS:X_letzter 20 +R:X:ist_NX:N1_has -67 +RBS:_ziel -143 +R:X:NX_letzter:N1_'s_last 13 +R:X:ist_NX:N1_is -19 +RBS:ziel_ -219 +R:S:NS_NX:N1_N2 -599 +R:X:ist_NX:N1_,_is 18 +RBT:_aim -32.5 +RBS:_S -599 +R:X:ist_NX:N1_it_is 49 +RBT:aim_ -32.5 +RBS:S_X -599 +R:X:ist:are -65.5 +R:X:ziel:goal 45 +R:X:NX_letzter_NX:N1_'s_last_N2 33.5 +RBS:ist_ -8 +RBT:_goal 45 +R:X:?:? 235 +RBT:goal_ 45 +RBS:_? 235 +R:X:ziel:target -22.5 +RBT:_? 235 +RBS:X__ -347 +RBT:_target -22.5 +RBT:?_ 235 +RBT:target_ -22.5 +R:X:ist:'s -61 +R:X:ziel:targets -18 +RBS:in_ -22 +RBT:_targets -18 +RBT:targets_ -18 +RBT:_, 24.5 +R:X:ziel:ziel -125 +RBT:,_ -38 +R:X:NX___NX:N1___N2 -347 +R:X:dies_NX:so_N1 200 +RBT:_ziel -125 +RBS:dies_X 256 +RBT:ziel_ -125 +RBT:_at 23 +R:X:dies_NX:this_to_N1 156.5 +R:X:ziel_NX:goal_N1 49 +RBT:this_to 156.5 +RBS:ziel_X 219 +R:X:dies_NX:this_N1 -100.5 +R:X:ziel_NX:targets_N1 -19 +R:X:dies_ist:could_be 118.5 +R:X:ziel_NX:target_N1 -20 +RBS:dies_ist 92 +R:X:sein_NX:being_able_to_N1 -71.5 +RBT:in_ -65.5 +R:X:in:for 31 +RBT:_could 118.5 +RBS:sein_X -68 +RBT:could_be 118.5 +RBT:being_able -63 +RBT:_for 14.5 +RBT:able_to -63 +RBT:for_ 14.5 +R:X:sein_NX:be_N1 -10 +R:X:sein_NX:his_N1 184.5 +RBS:X_ist -507.5 +R:X:sein_NX:its_N1 -26.5 +R:X:in:in -53 +R:X:sein_NX:N1_be -174.5 +R:X:NX_ziel:N1_aim -32.5 +RBT:_in -75.5 +RBS:X_ziel 143 +R:X:NX_ziel:N1_goal 20 +R:X:NX_ziel:N1_target -26.5 +R:X:NX_ziel:N1_targets -27 +RBT:_into -270 +R:X:NX_ziel_NX:N1_goal_N2 60 +R:X:NX_ziel_NX:N1_targets_N2 -6 +R:X:NX_sie_NX_,_dass:N1_N2_that 346 +R:X:NX_ziel_NX:N1_target_N2 -6 +R:X:dies_ist_NX:this_is_N1 -26.5 +R:X:NX_ziel_NX:N2_N1_goal 161 +RBT:_of -38 +RBT:of_ -17 +R:X:NX_ist_NX:is_N1_N2 -129 +RBS:_die 428.5 +R:X:NX_ist_NX:is_N1_,_N2 16.5 +RBS:die_ -116 +RBT:_on -653.5 +RBT:on_ 84.5 +R:X:NX_ist_NX:'s_N1_N2 -41.5 +R:X:die:, -9 +RBT:_over 45 +R:X:die:a -5 +R:X:NX_ist_NX:N1_has_N2 -104.5 +R:X:blieben_NX:remained_N1 135 +R:X:die:an -123 +R:X:NX_ist_NX:N1_is_at_N2 -5.5 +RBS:_blieben 187.5 +R:X:NX_ist_NX:N1_is_well_N2 -5 +RBS:blieben_X -13 +RBT:_are -65.5 +RBT:_'s 16 +R:X:NX_ist_NX:N1_is_N2 -31 +RBT:are_ -65.5 +RBT:'s_ -28.5 +R:X:blieben_NX:N1_remained 81.5 +R:X:NX_ist_NX:N1_,_is_N2 59.5 +R:X:die:by -10 +R:X:die:its 302.5 +RBS:_pakistanis 57 +RBS:pakistanis_ 116.5 +RBT:_to 93.5 +RBT:_pakistanis 161 +R:X:NX_ist_NX:N1_N2_has -75 +R:X:die:the -28 +RBT:to_ 18 +R:X:NX_ist_NX:N1_N2_is -97.5 +R:X:pakistanis_NX:pakistanis_N1 57 +R:X:NX_ist_NX:N1_,_N2_is -1 +RBT:_those -6 +RBT:_within 20 +RBT:within_ 20 +RBS:pakistanis_X -116.5 +R:X:NX_blieben_NX:N1_,_N2_remained -229.5 +R:X:NX_ist_NX:N2_is_N1 -47 +RBS:X_blieben -187.5 +RBT:_is -21 +R:X:NX_pakistanis:pakistanis_,_N1 235.5 +RBS:X_pakistanis -57 +RBT:pakistanis_, 104 +R:X:NX_pakistanis:N1_pakistanis -119 +R:X:NX_ist_NX:N2_N1_is -46.5 +RBS:blieben_ 13 +RBT:_is -251 +R:X:blieben:blieben -29 +RBT:_blieben -29 +RBT:blieben_ -29 +R:X:NX_pakistanis_NX:pakistanis_,_N1_,_N2 -23 +RBS:_zu -560 +R:X:NX_pakistanis_NX:N1_pakistanis_N2 -150.5 +RBS:zu_X -717.5 +R:X:NX_blieben:N1_,_remained 42 +RBS:__ 347 +RBS:_ein 37.5 +RBS:ein_ -9.5 +RBS:der_ -88.5 +R:X:zu_NX:for_N1 43 +R:X:__NX:__N1 -97 +RBT:_- 113 +RBT:-_ -4 +R:X:__NX:,_N1 444 +R:X:zu_NX:in_N1 37.5 +RBT:_a -27.5 +RBT:a_ -5 +RBS:sie_ -346 +RBT:the_ 40 +R:X:zu_NX:to_N1 -716 +R:X:zu_NX:with_N1 40.5 +R:X:zu_NX:N1_on 30 +RBT:_the 324.5 +R:X:NX_sie:but_N1 -346 +RBS:X_ein -37.5 +RBT:be_transformed -12 +R:X:medien:media 299.5 +RBS:_medien -71.5 +RBT:_with 54.5 +RBS:medien_ -156 +RBT:with_ -19 +RBT:_media 299.5 +R:X:NX_ein:N1_has_an -3.5 +RBT:media_ 299.5 +R:X:NX_ein:N1_put_forward_a -6 +R:X:medien:medien -371 +RBT:_medien -371 +RBT:medien_ -371 +RBS:der_X 45 +RBS:medien_X 156 +R:X:NX_zu_NX:in_N2_N1 -9.5 +RBS:X_zu 339 +RBT:in_ -2.5 +R:X:NX_zu_NX:of_N2_N1 -52.5 +RBT:to_ -102.5 +RBT:_to 30 +R:X:,_dass_NX:that_N1 317 +R:X:NX_zu_NX:to_N2_N1 19 +R:X:NX_zu_NX:N1_in_N2 -2 +R:X:NX_zu_NX:N1_is_N2 -2 +RBS:X_macht -0.5 +R:X:NX_zu_NX:N1_to_N2 48 +R:X:NX_macht_NX:N1_N2_does -0.5 +R:X:NX_zu_NX:N2_N1_to -28 +R:X:NX_zu_NX_.:to_N2_N1_. 22.5 +RBS:an_ 28 +R:X:NX_zu_NX_.:N1_is_N2_. -3.5 +R:X:NX_zu_NX_.:N1_to_N2_. 7.5 +R:X:NX_zu_NX_.:N1_with_N2_. -3 +R:X:NX_zu_NX_.:N1_N2_. -221.5 +R:X:NX_zu_NX_.:N2_N1_. 4.5 +R:X:freien:free -83.5 +RBS:_freien -118 +RBS:freien_ -201.5 +RBT:_free 210 +RBT:free_ -83.5 +R:X:freien:freien -276 +RBT:_freien -276 +RBT:freien_ -276 +RBT:_an 31.5 +R:X:freien_NX:free_N1 248 +RBT:an_ -123 +RBS:freien_X 201.5 +R:X:NX_medien:N1_media -90 +RBS:X_medien 71.5 +R:X:amerika:america 193 +RBS:_amerika -36 +R:X:NX_medien_NX:N2_N1_media 5 +R:X:an_NX:in_N1 210 +R:X:freien_NX_.:free_N1_. -6.5 +RBS:amerika_ -131 +R:X:NX_medien_NX_.:N2_N1_media_. 151 +RBT:_america 283.5 +RBT:america_ 193 +R:X:die_NX:an_N1 -7.5 +R:X:amerika:american -3 +RBS:die_X -45.5 +RBT:_american -3 +RBT:american_ -3 +R:X:amerika:amerika -321 +RBS:_jener 62.5 +R:X:die_NX:a_N1 19 +RBT:_amerika -321 +RBS:jener_X 62.5 +RBT:amerika_ -321 +R:X:jener_NX:the_N1 62.5 +R:X:an_NX:to_N1 -210 +RBS:X_jener -62.5 +RBS:amerika_X 131 +R:X:amerika_NX:america_N1 107 +R:X:die_NX:is_N1 -2.5 +RBS:an_der -28 +R:X:auf:, -5 +R:X:die_NX:its_N1 -14 +RBS:auf_ 46.5 +R:X:die_NX:'s_N1 46.5 +RBS:X_der 71 +R:X:NX_der:N1_for -74 +R:X:NX_der:N1_in -43 +R:X:auf:in -5.5 +RBT:_choice -103 +R:X:die_NX:the_N1 -86.5 +RBT:_decision 103 +R:X:auf:on 60 +R:X:die_NX:those_N1 -6 +R:X:NX_der:N1_to 72 +R:X:entscheidung_NX:choice_is_N1 -103 +R:X:die_NX:with_N1 73.5 +R:X:auf:auf -3 +RBT:choice_is -103 +RBT:_auf -3 +R:X:entscheidung_NX:decision_N1 103 +R:X:die_NX:,_N1 57 +R:X:die_NX:N1_is -0.5 +RBT:auf_ -3 +R:X:die_NX:N1_'s -1 +RBS:auf_X -46.5 +R:X:die_NX:N1_the -1 +R:X:NX_freien:N1_free 158 +RBT:of_ -13 +RBS:X_freien 118 +R:X:NX_der_NX:over_N2_N1 45 +R:X:NX_freien_NX:N1_free_N2 -34 +R:X:NX_freien_NX:N1_free_,_N2 -6 +RBT:over_ 45 +R:X:die_NX_medien:the_N1_media 5.5 +R:X:auf_NX:in_N1 -46.5 +RBT:the_ -0.5 +R:X:auf_NX:on_N1 66 +R:X:auf_NX:to_N1 -2 +R:X:auf_NX:,_N1 -18 +RBS:X_amerika 36 +RBT:_may -177 +RBS:und_die 139.5 +RBT:may_ -177 +RBT:_ 585.5 +RBT:_would -18.5 +RBS:X_die -568 +RBT:would_ -204 +R:X:NX_die:the_N1 34.5 +R:X:NX_amerika_NX:N2_N1_america 36 +R:X:terroranschläge:terrorist -22 +R:X:NX_die:,_N1 -42 +R:X:NX_die:N1_, -173 +RBS:_terroranschläge -161.5 +RBS:der_macht 0.5 +R:X:NX_die:-_N1 -5 +RBS:terroranschläge_ -46 +R:X:NX_die:N1_a -1 +RBT:_terrorist -119.5 +R:X:NX_der_macht_NX:N1_hold_N2_power 28 +RBT:terrorist_ -22 +R:X:,:, -2.5 +RBT:terrorist_attacks 77.5 +RBS:_, -182 +RBT:attacks_ 28 +RBS:,_ -160.5 +R:X:terroranschläge:terroranschläge -52 +RBT:_terroranschläge -52 +RBT:__ -139 +RBT:terroranschläge_ -52 +R:X:NX_die:N1_its -128.5 +RBS:terroranschläge_X 46 +RBT:_-- -64 +R:X:terroranschläge_NX:terrorist_attacks_N1 -87.5 +RBT:_by -10 +RBT:by_ -10 +R:X:,:out -3.5 +RBT:_out -3.5 +R:X:und_die_NX:and_N1 218 +RBT:out_ -3.5 +RBT:_that -261.5 +R:X:NX_die_NX:the_N1_N2 -1 +RBT:that_ -127.5 +R:X:NX_die_NX:the_N2_N1 -4 +RBS:,_X -335 +RBT:,_as -40 +R:X:,_NX:in_N1 -239 +R:X:,_NX:of_N1 -4 +R:X:,_NX:on_N1 -166 +R:X:,_NX:to_N1 649 +R:X:NX_die_NX:N1_the_N2 -4 +R:X:,_NX:,_N1 -399 +R:X:,_NX:__N1 -42 +R:X:,_NX:--_N1 -102 +R:X:,_an:to 28 +RBS:,_an 28 +R:X:NX_die_NX:N1_,_N2 -5 +R:X:NX_die_NX:N1_N2_the -4 +RBS:X_an -28 +RBS:die_terroranschläge 161.5 +R:X:die_terroranschläge:,_terrorist_attacks 28 +RBT:,_terrorist 175 +R:X:die_terroranschläge_NX:,_terrorist_attacks_N1 147 +R:X:NX_so:N1_as -1.5 +R:X:justiz:judiciary -90 +RBS:_justiz -1 +RBS:justiz_ -220.5 +R:X:NX_so:N1_that -14 +RBT:_judiciary 215 +R:X:NX_so:N1_the 15.5 +RBT:judiciary_ -90 +R:X:justiz:justiz -216 +RBT:_justiz -216 +RBT:justiz_ -216 +R:X:justiz_NX:judiciary_N1 305 +RBS:justiz_X 205 +RBS:_brachten -28 +RBS:justiz_und 15.5 +RBS:brachten_ -175 +R:X:NX_und_die:'s_N1_and -5 +R:X:brachten:brachten -175 +RBT:_brachten -175 +RBT:brachten_ -175 +R:X:NX_an_der:N1_the -0.5 +R:X:brachten_NX:N1_brought 147 +RBS:brachten_X 175 +R:X:NX_die_terroranschläge_NX:,_terrorist_attacks_N2_N1 -13.5 +R:X:NX_und_die:N1_'s -12 +R:X:NX_und_die_NX:'s_N2_N1 -16 +RBS:_2001 -14.5 +RBS:2001_ 28 +RBT:_2001 37.5 +R:X:NX_und_die_NX:N1_and_N2 -159 +RBT:2001_ 28 +R:X:2001_NX:2001_N1 147 +RBS:2001_X -28 +R:X:NX_brachten_NX:N1_N2_brought 28 +RBS:X_brachten 28 +RBT:,_ -109.5 +R:X:2001_NX_die_NX:2001_,_N2_N1 -161.5 +R:X:unabhängige:independent 38 +RBT:2001_, -109.5 +RBS:_unabhängige 127 +RBS:unabhängige_ -197 +RBT:_independent 343 +RBT:independent_ 38 +RBT:_september -13.5 +R:X:unabhängige:unabhängige -198 +RBT:_unabhängige -198 +RBS:ein_X 9.5 +RBT:unabhängige_ -198 +RBS:september_X -14.5 +R:X:unabhängige_NX:independent_N1 287 +R:X:ein_NX:an_N1 132 +R:X:ein_NX:any_N1 25 +RBS:unabhängige_X 197 +R:X:NX_justiz:N1_judiciary 85.5 +R:X:NX_an_der_macht_NX:N1_of_power_N2 -27.5 +RBS:X_justiz 1 +R:X:NX_justiz_NX:N1_judiciary_N2 -43 +R:X:NX_justiz_und:N1_judiciary_and 15.5 +RBS:_11 -13.5 +R:X:NX_unabhängige:N1_independent -37 +R:X:ein_NX:a_N1 -93 +RBS:X_unabhängige -127 +R:X:ein_NX:one_N1 -15 +R:X:NX_unabhängige_NX:N1_independent_N2 -90 +R:X:ein_NX:-_N1 -11.5 +R:X:NX_ein_NX:an_N1_N2 -6 +R:X:NX_ein_NX:be_transformed_N1_N2 -22 +RBS:X_, -3.5 +RBS:september_2001 14.5 +RBT:,_2001 14.5 +R:X:NX_,:to_N1 68 +R:X:NX_,:N1__ 1 +R:X:NX_,:N1_-- -172.5 +R:X:11_._september_2001_NX:september_11_,_2001_N1 -13.5 +R:X:die_NX_und_NX:the_N1_N2 -10 +R:X:NX_,:N1_for -127.5 +R:X:NX_,:N1_in -13.5 +R:X:NX_,:N1_of -55 +R:X:NX_,:N1_on 257.5 +R:X:NX_,:N1_out -58 +RBS:am_11 13.5 +R:X:die_NX_justiz_NX_die:the_N1_judiciary_N2 -57 +R:X:NX_,:N1_refuses_to -232.5 +R:X:die_NX_und_die:the_N1_and 148 +R:X:die_NX_und_die:the_N1_and_the -2.5 +RBT:the_september 13.5 +R:X:die_NX_die_NX:the_N1_N2 -3 +R:X:am_11_._september_NX:the_september_11_,_N1 -14.5 +R:X:die_NX_und_die_NX:the_N1_and_N2 -32 +RBS:zu_ 672 +R:X:NX_,_NX:N1_,_N2 -78 +R:X:NX_,_NX:N1_N2_, 80 +R:X:am_11_._september_2001:the_september_11_,_2001 28 +R:X:zu:for -5 +R:X:zu:in -7 +R:X:zu:to 23 +R:X:taliban:taliban -251.5 +RBS:_taliban -223.5 +RBS:taliban_ -157.5 +R:X:zu:with -6 +R:X:verzweifelten:desperate 28.5 +RBT:_taliban -205.5 +RBT:_desperate 28.5 +RBT:taliban_ -107 +RBT:desperate_ 28.5 +R:X:taliban_NX:taliban_N1 28 +R:X:verzweifelten:verzweifelten -28.5 +RBS:taliban_X 157.5 +R:X:NX_zu:to_N1 -229 +RBT:_verzweifelten -28.5 +R:X:den_taliban:the_taliban 144.5 +RBT:verzweifelten_ -28.5 +RBS:den_taliban 223.5 +RBT:the_taliban 144.5 +R:X:NX_zu:N1_for -152 +R:X:NX_zu:N1_in -6 +R:X:NX_zu:N1_is 251 +R:X:NX_zu:N1_of -49.5 +RBS:_dem 22 +RBT:_its 458 +RBT:its_ 337 +R:X:NX_den_taliban:N1_taliban -50.5 +R:X:NX_den_taliban_NX:N1_taliban_N2 -2.5 +R:X:NX_den_taliban_NX:N2_N1_taliban 132 +R:X:erklärte:declared -8 +RBS:_erklärte -185.5 +RBS:erklärte_ -124.5 +RBT:_declaring -9 +R:X:erklärte:erklärte -116.5 +RBT:_erklärte -116.5 +RBT:erklärte_ -116.5 +R:X:erklärte_NX:declared_N1 -52 +RBS:erklärte_X -61 +RBS:jener_ -62.5 +R:X:erklärte_NX:declaring_N1 -9 +RBS:erklärte_, 185.5 +R:X:NX_jener:N1_of -62.5 +R:X:dem_NX:the_N1 22 +R:X:verkaufen:sell -153 +RBS:_verkaufen -153 +RBS:verkaufen_ -140.5 +RBT:sell_ -153 +RBS:bereit_ 86 +RBS:zu_verkaufen 153 +RBS:_bemühen -2.5 +R:X:bereit:bereit 86 +RBT:_bereit 86 +RBT:bereit_ 86 +R:X:bereit_NX:ready_N1 -31 +RBS:bereit_X -86 +R:X:bereit_NX:N1_ready -55 +RBS:X_zum 30 +R:X:bemühen:bemühen -2.5 +R:X:NX_erklärte_,:N1_, 110 +RBT:_bemühen -2.5 +RBS:X_erklärte 185.5 +RBT:bemühen_ -2.5 +R:X:NX_erklärte_,_NX:N1_,_N2 75.5 +RBS:in_X 22 +RBS:_sich -17.5 +R:X:NX_zu_verkaufen:sell_N1 12.5 +RBS:sich_ -17.5 +R:X:NX_zum_NX:N2_to_further_N1 30 +RBS:_das 45 +RBS:das_ 2.5 +RBT:to_further 30 +RBT:_it -381 +RBT:it_ 3 +RBT:_so 172.5 +RBT:so_ -74.5 +RBT:_this 9.5 +RBT:this_ -11.5 +RBS:X_dem -22 +R:X:das_NX:a_growing_N1 77 +RBS:das_X -2.5 +RBT:a_growing -41 +R:X:das_NX:be_N1 169 +R:X:das_NX:its_N1 -95 +R:X:das_NX:so_N1 -38 +RBS:X_sein 91.5 +R:X:das_NX:the_N1 -80 +done + +--- +Best iteration: 2 [SCORE 'stupid_bleu'=0.37119]. +This took 0.6 min. diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz new file mode 100644 index 00000000..0741fd92 Binary files /dev/null and b/training/dtrain/examples/standard/nc-wmt11.de.gz differ diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz new file mode 100644 index 00000000..1c0bd401 Binary files /dev/null and b/training/dtrain/examples/standard/nc-wmt11.en.gz differ diff --git a/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz new file mode 100644 index 00000000..7ce81057 Binary files /dev/null and b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz differ diff --git a/training/dtrain/examples/standard/nc-wmt11.grammar.gz b/training/dtrain/examples/standard/nc-wmt11.grammar.gz new file mode 100644 index 00000000..ce4024a1 Binary files /dev/null and b/training/dtrain/examples/standard/nc-wmt11.grammar.gz differ diff --git a/training/dtrain/examples/toy/cdec.ini b/training/dtrain/examples/toy/cdec.ini new file mode 100644 index 00000000..b14f4819 --- /dev/null +++ b/training/dtrain/examples/toy/cdec.ini @@ -0,0 +1,3 @@ +formalism=scfg +add_pass_through_rules=true +grammar=grammar.gz diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini new file mode 100644 index 00000000..cd715f26 --- /dev/null +++ b/training/dtrain/examples/toy/dtrain.ini @@ -0,0 +1,13 @@ +decoder_config=cdec.ini +input=src +refs=tgt +output=- +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +k=4 +N=4 +epochs=2 +scorer=bleu +sample_from=kbest +filter=uniq +pair_sampling=all +learning_rate=1 diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output new file mode 100644 index 00000000..1da2aadd --- /dev/null +++ b/training/dtrain/examples/toy/expected-output @@ -0,0 +1,77 @@ +Warning: hi_lo only works with pair_sampling XYX. + cdec cfg 'cdec.ini' +Seeding random number sequence to 1664825829 + +dtrain +Parameters: + k 4 + N 4 + T 2 + scorer 'bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + pairs 'all' + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'src' + refs 'tgt' + output '-' +(a dot represents 10 inputs) +Iteration #1 of 2. + 2 +WEIGHTS + logp = +0 + shell_rule = -1 + house_rule = +2 + small_rule = -2 + little_rule = +3 + PassThrough = -5 + --- + 1best avg score: 0.5 (+0.5) + 1best avg model score: 2.5 (+2.5) + avg # pairs: 4 + avg # rank err: 1.5 + avg # margin viol: 0 + non0 feature count: 6 + avg list sz: 4 + avg f count: 2.875 +(time 0 min, 0 s/S) + +Iteration #2 of 2. + 2 +WEIGHTS + logp = +0 + shell_rule = -1 + house_rule = +2 + small_rule = -2 + little_rule = +3 + PassThrough = -5 + --- + 1best avg score: 1 (+0.5) + 1best avg model score: 5 (+2.5) + avg # pairs: 5 + avg # rank err: 0 + avg # margin viol: 0 + non0 feature count: 6 + avg list sz: 4 + avg f count: 3 +(time 0 min, 0 s/S) + +Writing weights file to '-' ... +house_rule 2 +little_rule 3 +Glue -4 +PassThrough -5 +small_rule -2 +shell_rule -1 +done + +--- +Best iteration: 2 [SCORE 'bleu'=1]. +This took 0 min. diff --git a/training/dtrain/examples/toy/grammar.gz b/training/dtrain/examples/toy/grammar.gz new file mode 100644 index 00000000..8eb0d29e Binary files /dev/null and b/training/dtrain/examples/toy/grammar.gz differ diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src new file mode 100644 index 00000000..87e39ef2 --- /dev/null +++ b/training/dtrain/examples/toy/src @@ -0,0 +1,2 @@ +ich sah ein kleines haus +ich fand ein kleines haus diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt new file mode 100644 index 00000000..174926b3 --- /dev/null +++ b/training/dtrain/examples/toy/tgt @@ -0,0 +1,2 @@ +i saw a little house +i found a little house diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb index f0cd58c5..86e835e8 100755 --- a/training/dtrain/lplp.rb +++ b/training/dtrain/lplp.rb @@ -84,34 +84,28 @@ def _test() end #_test() -# actually do something + def usage() - puts "lplp.rb [n] < " + puts "lplp.rb <#shards> < " puts " l0...: norms for selection" puts "select_k: only output top k (according to the norm of their column vector) features" puts " cut: output features with weight >= threshold" puts " n: if we do not have a shard count use this number for averaging" - exit + exit 1 end -if ARGV.size < 3 then usage end +if ARGV.size < 4 then usage end norm_fun = method(ARGV[0].to_sym) type = ARGV[1] x = ARGV[2].to_f - -shard_count_key = "__SHARD_COUNT__" +shard_count = ARGV[3].to_f STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' w = {} -shard_count = 0 while line = STDIN.gets key, val = line.split /\s+/ - if key == shard_count_key - shard_count += 1 - next - end if w.has_key? key w[key].push val.to_f else @@ -119,8 +113,6 @@ while line = STDIN.gets end end -if ARGV.size == 4 then shard_count = ARGV[3].to_f end - if type == 'cut' cut(w, norm_fun, shard_count, x) elsif type == 'select_k' diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 24e7f49e..e661416e 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -3,16 +3,15 @@ require 'trollop' def usage - if ARGV.size != 8 - STDERR.write "Usage: " - STDERR.write "ruby parallelize.rb -c -e [--randomize/-z] [--reshard/-y] -s <#shards|0> -p -i -r [--qsub/-q] --dtrain_binary -l \"l2 select_k 100000\"\n" - exit 1 - end + STDERR.write "Usage: " + STDERR.write "ruby parallelize.rb -c [-e ] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p ] -i -r [--qsub/-q] [--dtrain_binary ] [-l \"l2 select_k 100000\"]\n" + exit 1 end opts = Trollop::options do opt :config, "dtrain config file", :type => :string - opt :epochs, "number of epochs", :type => :int + opt :epochs, "number of epochs", :type => :int, :default => 10 + opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000" opt :randomize, "randomize shards before each epoch", :type => :bool, :short => '-z', :default => false opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false opt :shards, "number of shards", :type => :int @@ -21,8 +20,8 @@ opts = Trollop::options do opt :references, "references", :type => :string opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string - opt :lplp_args, "arguments for lplp arguments", :type => :string, :default => "l2 select_k 100000" end +usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] dtrain_dir = File.expand_path File.dirname(__FILE__) @@ -32,16 +31,14 @@ else dtrain_bin = opts[:dtrain_binary] end ruby = '/usr/bin/ruby' -lplp_rb = "#{dtrain_dir}/hstreaming/lplp.rb" +lplp_rb = "#{dtrain_dir}/lplp.rb" lplp_args = opts[:lplp_args] cat = '/bin/cat' ini = opts[:config] epochs = opts[:epochs] -rand = false -rand = true if opts[:randomize] -reshard = false -reshard = true if opts[:reshard] +rand = opts[:randomize] +reshard = opts[:reshard] predefined_shards = false if opts[:shards] == 0 predefined_shards = true @@ -49,11 +46,10 @@ if opts[:shards] == 0 else num_shards = opts[:shards] end -shards_at_once = opts[:processes_at_once] input = opts[:input] refs = opts[:references] -use_qsub = false -use_qsub = true if opts[:qsub] +use_qsub = opts[:qsub] +shards_at_once = opts[:processes_at_once] `mkdir work` diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README deleted file mode 100644 index 2df77086..00000000 --- a/training/dtrain/test/example/README +++ /dev/null @@ -1,8 +0,0 @@ -Small example of input format for distributed training. -Call dtrain from this folder with ../../dtrain -c test/example/dtrain.ini . - -For this to work, undef 'DTRAIN_LOCAL' in dtrain.h -and recompile. - -data can be found here: http://simianer.de/#dtrain - diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini deleted file mode 100644 index 0215416d..00000000 --- a/training/dtrain/test/example/cdec.ini +++ /dev/null @@ -1,25 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -scfg_max_span_limit=15 -intersection_strategy=cube_pruning -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz -# all currently working feature functions for translation: -# (with those features active that were used in the ACL paper) -#feature_function=ArityPenalty -#feature_function=CMR2008ReorderingFeatures -#feature_function=Dwarf -#feature_function=InputIndicator -#feature_function=LexNullJump -#feature_function=NewJump -#feature_function=NgramFeatures -#feature_function=NonLatinCount -#feature_function=OutputIndicator -feature_function=RuleIdentityFeatures -feature_function=RuleSourceBigramFeatures -feature_function=RuleTargetBigramFeatures -feature_function=RuleShape -#feature_function=SourceSpanSizeFeatures -#feature_function=SourceWordPenalty -#feature_function=SpanFeatures diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini deleted file mode 100644 index 97fce7f0..00000000 --- a/training/dtrain/test/example/dtrain.ini +++ /dev/null @@ -1,22 +0,0 @@ -input=./nc-wmt11.1k.gz # use '-' for STDIN -output=- # a weights file (add .gz for gzip compression) or STDOUT '-' -select_weights=VOID # don't output weights -decoder_config=./cdec.ini # config for cdec -# weights for these features will be printed on each iteration -print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough -tmp=/tmp -stop_after=10 # stop epoch after 10 inputs - -# interesting stuff -epochs=2 # run over input 2 times -k=100 # use 100best lists -N=4 # optimize (approx) BLEU4 -scorer=stupid_bleu # use 'stupid' BLEU+1 -learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) -gamma=0 # use SVM reg -sample_from=kbest # use kbest lists (as opposed to forest) -filter=uniq # only unique entries in kbest (surface form) -pair_sampling=XYX -hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here -pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) -loss_margin=0 diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output deleted file mode 100644 index 05326763..00000000 --- a/training/dtrain/test/example/expected-output +++ /dev/null @@ -1,89 +0,0 @@ - cdec cfg 'test/example/cdec.ini' -Loading the LM will be faster if you build a binary file. -Reading test/example/nc-wmt11.en.srilm.gz -----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 -**************************************************************************************************** - Example feature: Shape_S00000_T00000 -Seeding random number sequence to 2912000813 - -dtrain -Parameters: - k 100 - N 4 - T 2 - scorer 'stupid_bleu' - sample from 'kbest' - filter 'uniq' - learning rate 1 - gamma 0 - loss margin 0 - pairs 'XYX' - hi lo 0.1 - pair threshold 0 - select weights 'VOID' - l1 reg 0 'none' - max pairs 4294967295 - cdec cfg 'test/example/cdec.ini' - input 'test/example/nc-wmt11.1k.gz' - output '-' - stop_after 10 -(a dot represents 10 inputs) -Iteration #1 of 2. - . 10 -Stopping after 10 input sentences. -WEIGHTS - Glue = -637 - WordPenalty = +1064 - LanguageModel = +1175.3 - LanguageModel_OOV = -1437 - PhraseModel_0 = +1935.6 - PhraseModel_1 = +2499.3 - PhraseModel_2 = +964.96 - PhraseModel_3 = +1410.8 - PhraseModel_4 = -5977.9 - PhraseModel_5 = +522 - PhraseModel_6 = +1089 - PassThrough = -1308 - --- - 1best avg score: 0.16963 (+0.16963) - 1best avg model score: 64485 (+64485) - avg # pairs: 1494.4 - avg # rank err: 702.6 - avg # margin viol: 0 - non0 feature count: 528 - avg list sz: 85.7 - avg f count: 102.75 -(time 0.083 min, 0.5 s/S) - -Iteration #2 of 2. - . 10 -WEIGHTS - Glue = -1196 - WordPenalty = +809.52 - LanguageModel = +3112.1 - LanguageModel_OOV = -1464 - PhraseModel_0 = +3895.5 - PhraseModel_1 = +4683.4 - PhraseModel_2 = +1092.8 - PhraseModel_3 = +1079.6 - PhraseModel_4 = -6827.7 - PhraseModel_5 = -888 - PhraseModel_6 = +142 - PassThrough = -1335 - --- - 1best avg score: 0.277 (+0.10736) - 1best avg model score: -3110.5 (-67595) - avg # pairs: 1144.2 - avg # rank err: 529.1 - avg # margin viol: 0 - non0 feature count: 859 - avg list sz: 74.9 - avg f count: 112.84 -(time 0.067 min, 0.4 s/S) - -Writing weights file to '-' ... -done - ---- -Best iteration: 2 [SCORE 'stupid_bleu'=0.277]. -This took 0.15 min. diff --git a/training/dtrain/test/parallelize/README b/training/dtrain/test/parallelize/README deleted file mode 100644 index 89715105..00000000 --- a/training/dtrain/test/parallelize/README +++ /dev/null @@ -1,5 +0,0 @@ -run for example - ../../parallelize.rb ./dtrain.ini 4 false 2 2 ./in ./refs - -final weights will be in the file work/weights.3 - diff --git a/training/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini deleted file mode 100644 index e43ba1c4..00000000 --- a/training/dtrain/test/parallelize/cdec.ini +++ /dev/null @@ -1,22 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -intersection_strategy=cube_pruning -cubepruning_pop_limit=200 -scfg_max_span_limit=15 -feature_function=WordPenalty -feature_function=KLanguageModel ../example/nc-wmt11.en.srilm.gz -#feature_function=ArityPenalty -#feature_function=CMR2008ReorderingFeatures -#feature_function=Dwarf -#feature_function=InputIndicator -#feature_function=LexNullJump -#feature_function=NewJump -#feature_function=NgramFeatures -#feature_function=NonLatinCount -#feature_function=OutputIndicator -#feature_function=RuleIdentityFeatures -#feature_function=RuleNgramFeatures -#feature_function=RuleShape -#feature_function=SourceSpanSizeFeatures -#feature_function=SourceWordPenalty -#feature_function=SpanFeatures diff --git a/training/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini deleted file mode 100644 index 03f9d240..00000000 --- a/training/dtrain/test/parallelize/dtrain.ini +++ /dev/null @@ -1,15 +0,0 @@ -k=100 -N=4 -learning_rate=0.0001 -gamma=0 -loss_margin=0 -epochs=1 -scorer=stupid_bleu -sample_from=kbest -filter=uniq -pair_sampling=XYX -hi_lo=0.1 -select_weights=last -print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough -tmp=/tmp -decoder_config=cdec.ini diff --git a/training/dtrain/test/parallelize/g/grammar.out.0.gz b/training/dtrain/test/parallelize/g/grammar.out.0.gz deleted file mode 100644 index 1e28a24b..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.0.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.1.gz b/training/dtrain/test/parallelize/g/grammar.out.1.gz deleted file mode 100644 index 372f5675..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.1.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.2.gz b/training/dtrain/test/parallelize/g/grammar.out.2.gz deleted file mode 100644 index 145d0dc0..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.2.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.3.gz b/training/dtrain/test/parallelize/g/grammar.out.3.gz deleted file mode 100644 index 105593ff..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.3.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.4.gz b/training/dtrain/test/parallelize/g/grammar.out.4.gz deleted file mode 100644 index 30781f48..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.4.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.5.gz b/training/dtrain/test/parallelize/g/grammar.out.5.gz deleted file mode 100644 index 834ee759..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.5.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.6.gz b/training/dtrain/test/parallelize/g/grammar.out.6.gz deleted file mode 100644 index 2e76f348..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.6.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.7.gz b/training/dtrain/test/parallelize/g/grammar.out.7.gz deleted file mode 100644 index 3741a887..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.7.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.8.gz b/training/dtrain/test/parallelize/g/grammar.out.8.gz deleted file mode 100644 index ebf6bd0c..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.8.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/g/grammar.out.9.gz b/training/dtrain/test/parallelize/g/grammar.out.9.gz deleted file mode 100644 index c1791059..00000000 Binary files a/training/dtrain/test/parallelize/g/grammar.out.9.gz and /dev/null differ diff --git a/training/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in deleted file mode 100644 index 3b7dec39..00000000 --- a/training/dtrain/test/parallelize/in +++ /dev/null @@ -1,10 +0,0 @@ -europas nach rassen geteiltes haus -ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen . -der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln . -während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden . -eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern . -die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden . -das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt . -die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen . -der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken . -genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen . diff --git a/training/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs deleted file mode 100644 index 632e27b0..00000000 --- a/training/dtrain/test/parallelize/refs +++ /dev/null @@ -1,10 +0,0 @@ -europe 's divided racial house -a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge . -the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them . -while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon . -an aging population at home and ever more open borders imply increasing racial fragmentation in european countries . -mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear . -it will not , as america 's racial history clearly shows . -race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes . -the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths . -this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us . diff --git a/training/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini deleted file mode 100644 index 98b02d44..00000000 --- a/training/dtrain/test/toy/cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -add_pass_through_rules=true diff --git a/training/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini deleted file mode 100644 index a091732f..00000000 --- a/training/dtrain/test/toy/dtrain.ini +++ /dev/null @@ -1,12 +0,0 @@ -decoder_config=test/toy/cdec.ini -input=test/toy/input -output=- -print_weights=logp shell_rule house_rule small_rule little_rule PassThrough -k=4 -N=4 -epochs=2 -scorer=bleu -sample_from=kbest -filter=uniq -pair_sampling=all -learning_rate=1 diff --git a/training/dtrain/test/toy/input b/training/dtrain/test/toy/input deleted file mode 100644 index 4d10a9ea..00000000 --- a/training/dtrain/test/toy/input +++ /dev/null @@ -1,2 +0,0 @@ -0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 -1 ich fand ein kleines haus i found a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 -- cgit v1.2.3