diff options
Diffstat (limited to 'mteval')
-rw-r--r-- | mteval/ns.h | 1 | ||||
-rw-r--r-- | mteval/ns_docscorer.cc | 34 | ||||
-rw-r--r-- | mteval/scorer_test.cc | 9 | ||||
-rw-r--r-- | mteval/test_data/devset.txt | 2 |
4 files changed, 46 insertions, 0 deletions
diff --git a/mteval/ns.h b/mteval/ns.h index ac7b0a23..153bf0b8 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -78,6 +78,7 @@ inline const SufficientStats operator-(const SufficientStats& a, const Sufficien struct SegmentEvaluator { virtual ~SegmentEvaluator(); virtual void Evaluate(const std::vector<WordID>& hyp, SufficientStats* out) const = 0; + std::string src; // this may not always be available }; // Instructions for implementing a new metric diff --git a/mteval/ns_docscorer.cc b/mteval/ns_docscorer.cc index 83bd1a29..242f134a 100644 --- a/mteval/ns_docscorer.cc +++ b/mteval/ns_docscorer.cc @@ -13,6 +13,40 @@ DocumentScorer::~DocumentScorer() {} DocumentScorer::DocumentScorer() {} +DocumentScorer::DocumentScorer(const EvaluationMetric* metric, + const string& src_ref_file) { + const WordID kDIV = TD::Convert("|||"); + assert(!src_ref_file.empty()); + cerr << "Loading source and references from " << src_ref_file << "...\n"; + ReadFile rf(src_ref_file); + istream& in = *rf.stream(); + unsigned lc = 0; + string src_ref; + vector<WordID> tmp; + vector<vector<WordID> > refs; + while(getline(in, src_ref)) { + ++lc; + size_t end_src = src_ref.find(" ||| "); + if (end_src == string::npos) { + cerr << "Expected SRC ||| REF [||| REF2 ||| REF3 ...] in line " << lc << endl; + abort(); + } + refs.clear(); + tmp.clear(); + TD::ConvertSentence(src_ref, &tmp, end_src + 5); + unsigned last = 0; + for (unsigned j = 0; j < tmp.size(); ++j) { + if (tmp[j] == kDIV) { + refs.push_back(vector<WordID>(tmp.begin() + last, tmp.begin() + j)); + last = j + 1; + } + } + refs.push_back(vector<WordID>(tmp.begin() + last, tmp.end())); + scorers_.push_back(metric->CreateSegmentEvaluator(refs)); + scorers_.back()->src = src_ref.substr(0, end_src); + } +} + void DocumentScorer::Init(const EvaluationMetric* metric, const vector<string>& ref_files, const string& src_file, diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc index da07f154..cd27f020 100644 --- a/mteval/scorer_test.cc +++ b/mteval/scorer_test.cc @@ -3,6 +3,7 @@ #include <boost/test/unit_test.hpp> #include <boost/test/floating_point_comparison.hpp> +#include "ns_docscorer.h" #include "ns.h" #include "tdict.h" #include "scorer.h" @@ -223,4 +224,12 @@ BOOST_AUTO_TEST_CASE(NewScoreAPI) { //cerr << metric->ComputeScore(statse) << endl; } +BOOST_AUTO_TEST_CASE(HybridSourceReferenceFileFormat) { + std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + DocumentScorer ds(metric, path + "/devset.txt"); + BOOST_CHECK_EQUAL(2, ds.size()); + BOOST_CHECK_EQUAL("Quelltext hier .", ds[0]->src); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/mteval/test_data/devset.txt b/mteval/test_data/devset.txt new file mode 100644 index 00000000..f9135d98 --- /dev/null +++ b/mteval/test_data/devset.txt @@ -0,0 +1,2 @@ +Quelltext hier . ||| source text here . ||| original text . ||| some source text . +ein anderer Satz . ||| another sentence . ||| a different sentece . |