adaptive hope-fear learner

author: Chris Dyer <redpony@gmail.com> 2014-02-09 20:50:41 -0500
committer: Chris Dyer <redpony@gmail.com> 2014-02-09 20:50:41 -0500
commit: 9b83a2e82aba73b5ff0e848182a8726481a10485 (patch)
tree: 81e7ade548bdffaa8534705c2d34beb8c752dc24 /mteval
parent: 702591b3296af472cc5c7c4720f1c21b2a6e34b1 (diff)
4 files changed, 46 insertions, 0 deletions
diff --git a/mteval/ns.h b/mteval/ns.h
index ac7b0a23..153bf0b8 100644
--- a/mteval/ns.h
+++ b/mteval/ns.h
@@ -78,6 +78,7 @@ inline const SufficientStats operator-(const SufficientStats& a, const Sufficien
 struct SegmentEvaluator {
   virtual ~SegmentEvaluator();
   virtual void Evaluate(const std::vector<WordID>& hyp, SufficientStats* out) const = 0;
+  std::string src; // source-side text of this segment; assigned by the DocumentScorer constructor that reads "SRC ||| REF" files, so it may not always be available
 };
 
 // Instructions for implementing a new metric
diff --git a/mteval/ns_docscorer.cc b/mteval/ns_docscorer.cc
index 83bd1a29..242f134a 100644
--- a/mteval/ns_docscorer.cc
+++ b/mteval/ns_docscorer.cc
@@ -13,6 +13,40 @@ DocumentScorer::~DocumentScorer() {}
 
 DocumentScorer::DocumentScorer() {}
 
+DocumentScorer::DocumentScorer(const EvaluationMetric* metric, // metric used to create one segment evaluator per input line
+                               const string& src_ref_file) { // file whose lines have the form "SRC ||| REF [||| REF2 ||| REF3 ...]"
+  const WordID kDIV = TD::Convert("|||"); // word id of the "|||" token that separates references
+  assert(!src_ref_file.empty());
+  cerr << "Loading source and references from " << src_ref_file << "...\n";
+  ReadFile rf(src_ref_file);
+  istream& in = *rf.stream();
+  unsigned lc = 0; // 1-based line counter, used only in the error message below
+  string src_ref; // current raw input line
+  vector<WordID> tmp; // word ids of everything after the source side, dividers included
+  vector<vector<WordID> > refs; // references of the current line, one vector per reference
+  while(getline(in, src_ref)) {
+    ++lc;
+    size_t end_src = src_ref.find(" ||| "); // first delimiter marks the end of the source side
+    if (end_src == string::npos) { // a line without any " ||| " has no reference: report and abort
+      cerr << "Expected SRC ||| REF [||| REF2 ||| REF3 ...] in line " << lc << endl;
+      abort();
+    }
+    refs.clear(); // per-line buffers are reused across iterations
+    tmp.clear();
+    TD::ConvertSentence(src_ref, &tmp, end_src + 5); // 5 == length of " ||| "; presumably the third argument is the start offset, so only the reference part is converted -- TODO confirm
+    unsigned last = 0; // index in tmp where the current reference begins
+    for (unsigned j = 0; j < tmp.size(); ++j) {
+      if (tmp[j] == kDIV) { // divider found: tokens [last, j) form one complete reference
+        refs.push_back(vector<WordID>(tmp.begin() + last, tmp.begin() + j));
+        last = j + 1; // next reference starts right after the divider
+      }
+    }
+    refs.push_back(vector<WordID>(tmp.begin() + last, tmp.end())); // final reference has no trailing divider
+    scorers_.push_back(metric->CreateSegmentEvaluator(refs)); // one evaluator per input line
+    scorers_.back()->src = src_ref.substr(0, end_src); // keep the raw source text that precedes the first delimiter
+  }
+}
+
 void DocumentScorer::Init(const EvaluationMetric* metric,
             const vector<string>& ref_files,
             const string& src_file,
diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc
index da07f154..cd27f020 100644
--- a/mteval/scorer_test.cc
+++ b/mteval/scorer_test.cc
@@ -3,6 +3,7 @@
 #include <boost/test/unit_test.hpp>
 #include <boost/test/floating_point_comparison.hpp>
 
+#include "ns_docscorer.h"
 #include "ns.h"
 #include "tdict.h"
 #include "scorer.h"
@@ -223,4 +224,12 @@ BOOST_AUTO_TEST_CASE(NewScoreAPI) {
   //cerr << metric->ComputeScore(statse) << endl;
 }
 
+BOOST_AUTO_TEST_CASE(HybridSourceReferenceFileFormat) { // checks loading of a combined "SRC ||| REF ..." file through the DocumentScorer constructor
+  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); // data directory: argv[1] when argc is exactly 2, otherwise TEST_DATA
+  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+  DocumentScorer ds(metric, path + "/devset.txt"); // devset.txt is the two-line fixture under test_data
+  BOOST_CHECK_EQUAL(2, ds.size()); // the constructor adds one scorer per input line
+  BOOST_CHECK_EQUAL("Quelltext hier .", ds[0]->src); // src is the text before the first " ||| " of line 1
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/mteval/test_data/devset.txt b/mteval/test_data/devset.txt
new file mode 100644
index 00000000..f9135d98
--- /dev/null
+++ b/mteval/test_data/devset.txt
@@ -0,0 +1,2 @@
+Quelltext hier . ||| source text here . ||| original text . ||| some source text .
+ein anderer Satz . ||| another sentence . ||| a different sentece .
author	Chris Dyer <redpony@gmail.com>	2014-02-09 20:50:41 -0500
committer	Chris Dyer <redpony@gmail.com>	2014-02-09 20:50:41 -0500
commit	9b83a2e82aba73b5ff0e848182a8726481a10485 (patch)
tree	81e7ade548bdffaa8534705c2d34beb8c752dc24 /mteval
parent	702591b3296af472cc5c7c4720f1c21b2a6e34b1 (diff)