diff options
| author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 | 
| commit | ef6085e558e26c8819f1735425761103021b6470 (patch) | |
| tree | 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /pro-train | |
| parent | 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff) | |
| parent | dfbc278c1057555fda9312291c8024049e00b7d8 (diff) | |
merge with upstream
Diffstat (limited to 'pro-train')
| -rwxr-xr-x | pro-train/dist-pro.pl | 6 | ||||
| -rw-r--r-- | pro-train/mr_pro_map.cc | 37 | 
2 files changed, 27 insertions, 16 deletions
| diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 5db053de..31258fa6 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -12,7 +12,7 @@ use POSIX ":sys_wait_h";  my $QSUB_CMD = qsub_args(mert_memory());  my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../vest"; +my $VEST_DIR="$SCRIPT_DIR/../dpmert";  require "$VEST_DIR/libcall.pl";  # Default settings @@ -288,7 +288,7 @@ while (1){  	    $retries++;  	}  	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); -	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); +	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");  	chomp $dec_score;  	print STDERR "DECODER SCORE: $dec_score\n"; @@ -338,7 +338,7 @@ while (1){  		$mapoutput =~ s/mapinput/mapoutput/;  		push @mapoutputs, "$dir/splag.$im1/$mapoutput";  		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; -		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; +		my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";  		if ($use_make) {  			my $script_file = "$dir/scripts/map.$shard";  			open F, ">$script_file" or die "Can't write $script_file: $!"; diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 0a9b75d7..52b67f32 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -13,11 +13,12 @@  #include "filelib.h"  #include "stringlib.h"  #include "weights.h" -#include "scorer.h"  #include "inside_outside.h"  #include "hg_io.h"  #include "kbest.h"  #include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h"  // This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) @@ -80,7 +81,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {          ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")          ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")          ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)") -        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") +        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")          ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")          ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")          ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -109,9 +110,12 @@ struct HypInfo {    HypInfo(const vector<WordID>& h, const SparseVector<weight_t>& feats) : hyp(h), g_(-100.0f), x(feats) {}    // lazy evaluation -  double g(const SentenceScorer& scorer) const { -    if (g_ == -100.0f) -      g_ = scorer.ScoreCandidate(hyp)->ComputeScore(); +  double g(const SegmentEvaluator& scorer, const EvaluationMetric* metric) const { +    if (g_ == -100.0f) { +      SufficientStats ss; +      scorer.Evaluate(hyp, &ss); +      g_ = metric->ComputeScore(ss); +    }      return g_;    }    vector<WordID> hyp; @@ -233,15 +237,21 @@ struct DiffOrder {    }  }; -void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) { +void Sample(const unsigned gamma, +            const unsigned xi, +            const vector<HypInfo>& J_i, +            const SegmentEvaluator& scorer, +            const EvaluationMetric* metric, +            vector<TrainingInstance>* pv) { +  const bool invert_score = metric->IsErrorMetric();    vector<TrainingInstance> v1, v2;    float avg_diff = 0;    for (unsigned i = 0; i < gamma; ++i) {      const size_t a = rng->inclusive(0, J_i.size() - 1)();      const size_t b = rng->inclusive(0, J_i.size() - 1)();      if (a == b) continue; -    float ga = J_i[a].g(scorer); -    float gb = J_i[b].g(scorer); +    float ga = J_i[a].g(scorer, metric); +    float gb = J_i[b].g(scorer, metric);      bool positive = gb < ga;      if (invert_score) positive = !positive;      const float gdiff = fabs(ga - gb); @@ -288,11 +298,12 @@ int main(int argc, char** argv) {      rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));    else      rng.reset(new MT19937); -  const string loss_function = conf["loss_function"].as<string>(); +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); + +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; -  ScoreType type = ScoreTypeFromString(loss_function); -  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); -  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;    Hypergraph hg;    string last_file;    ReadFile in_read(conf["input"].as<string>()); @@ -335,7 +346,7 @@ int main(int argc, char** argv) {      Dedup(&J_i);      WriteKBest(kbest_file, J_i); -    Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); +    Sample(gamma, xi, J_i, *ds[sent_id], metric, &v);      for (unsigned i = 0; i < v.size(); ++i) {        const TrainingInstance& vi = v[i];        cout << vi.y << "\t" << vi.x << endl; | 
