summaryrefslogtreecommitdiff
path: root/pro-train
diff options
context:
space:
mode:
Diffstat (limited to 'pro-train')
-rwxr-xr-xpro-train/dist-pro.pl6
-rw-r--r--pro-train/mr_pro_map.cc37
2 files changed, 27 insertions, 16 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 5db053de..31258fa6 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -12,7 +12,7 @@ use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
my $default_jobs = env_default_jobs();
-my $VEST_DIR="$SCRIPT_DIR/../vest";
+my $VEST_DIR="$SCRIPT_DIR/../dpmert";
require "$VEST_DIR/libcall.pl";
# Default settings
@@ -288,7 +288,7 @@ while (1){
$retries++;
}
die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
- my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric");
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
chomp $dec_score;
print STDERR "DECODER SCORE: $dec_score\n";
@@ -338,7 +338,7 @@ while (1){
$mapoutput =~ s/mapinput/mapoutput/;
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+ my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 0a9b75d7..52b67f32 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -13,11 +13,12 @@
#include "filelib.h"
#include "stringlib.h"
#include "weights.h"
-#include "scorer.h"
#include "inside_outside.h"
#include "hg_io.h"
#include "kbest.h"
#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
@@ -80,7 +81,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
- ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -109,9 +110,12 @@ struct HypInfo {
HypInfo(const vector<WordID>& h, const SparseVector<weight_t>& feats) : hyp(h), g_(-100.0f), x(feats) {}
// lazy evaluation
- double g(const SentenceScorer& scorer) const {
- if (g_ == -100.0f)
- g_ = scorer.ScoreCandidate(hyp)->ComputeScore();
+ double g(const SegmentEvaluator& scorer, const EvaluationMetric* metric) const {
+ if (g_ == -100.0f) {
+ SufficientStats ss;
+ scorer.Evaluate(hyp, &ss);
+ g_ = metric->ComputeScore(ss);
+ }
return g_;
}
vector<WordID> hyp;
@@ -233,15 +237,21 @@ struct DiffOrder {
}
};
-void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+void Sample(const unsigned gamma,
+ const unsigned xi,
+ const vector<HypInfo>& J_i,
+ const SegmentEvaluator& scorer,
+ const EvaluationMetric* metric,
+ vector<TrainingInstance>* pv) {
+ const bool invert_score = metric->IsErrorMetric();
vector<TrainingInstance> v1, v2;
float avg_diff = 0;
for (unsigned i = 0; i < gamma; ++i) {
const size_t a = rng->inclusive(0, J_i.size() - 1)();
const size_t b = rng->inclusive(0, J_i.size() - 1)();
if (a == b) continue;
- float ga = J_i[a].g(scorer);
- float gb = J_i[b].g(scorer);
+ float ga = J_i[a].g(scorer, metric);
+ float gb = J_i[b].g(scorer, metric);
bool positive = gb < ga;
if (invert_score) positive = !positive;
const float gdiff = fabs(ga - gb);
@@ -288,11 +298,12 @@ int main(int argc, char** argv) {
rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
else
rng.reset(new MT19937);
- const string loss_function = conf["loss_function"].as<string>();
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
- ScoreType type = ScoreTypeFromString(loss_function);
- DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
- cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
Hypergraph hg;
string last_file;
ReadFile in_read(conf["input"].as<string>());
@@ -335,7 +346,7 @@ int main(int argc, char** argv) {
Dedup(&J_i);
WriteKBest(kbest_file, J_i);
- Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
+ Sample(gamma, xi, J_i, *ds[sent_id], metric, &v);
for (unsigned i = 0; i < v.size(); ++i) {
const TrainingInstance& vi = v[i];
cout << vi.y << "\t" << vi.x << endl;