From c9804461426ad400e8189ff8217e93f13b199ded Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 4 Sep 2014 13:24:26 -0400 Subject: word error rate metric --- mteval/Makefile.am | 3 +++ mteval/ns.cc | 3 +++ mteval/ns_cer.cc | 26 ++++---------------------- mteval/ns_cer.h | 3 --- mteval/ns_wer.cc | 35 +++++++++++++++++++++++++++++++++++ mteval/ns_wer.h | 20 ++++++++++++++++++++ 6 files changed, 65 insertions(+), 25 deletions(-) create mode 100644 mteval/ns_wer.cc create mode 100644 mteval/ns_wer.h diff --git a/mteval/Makefile.am b/mteval/Makefile.am index c833eb01..aac3e6b5 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -14,6 +14,7 @@ libmteval_a_SOURCES = \ aer_scorer.h \ comb_scorer.h \ external_scorer.h \ + levenshtein.h \ ns.h \ ns_cer.h \ ns_comb.h \ @@ -21,6 +22,7 @@ libmteval_a_SOURCES = \ ns_ext.h \ ns_ssk.h \ ns_ter.h \ + ns_wer.h \ scorer.h \ ter.h \ aer_scorer.cc \ @@ -34,6 +36,7 @@ libmteval_a_SOURCES = \ ns_ext.cc \ ns_ssk.cc \ ns_ter.cc \ + ns_wer.cc \ scorer.cc \ ter.cc diff --git a/mteval/ns.cc b/mteval/ns.cc index c1ea238b..075e0121 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -3,6 +3,7 @@ #include "ns_ext.h" #include "ns_comb.h" #include "ns_cer.h" +#include "ns_wer.h" #include "ns_ssk.h" #include @@ -285,6 +286,8 @@ EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) { m = new CombinationMetric(metric_id); } else if (metric_id == "CER") { m = new CERMetric; + } else if (metric_id == "WER") { + m = new WERMetric; } else { cerr << "Implement please: " << metric_id << endl; abort(); diff --git a/mteval/ns_cer.cc b/mteval/ns_cer.cc index a843d471..da6683b1 100644 --- a/mteval/ns_cer.cc +++ b/mteval/ns_cer.cc @@ -1,5 +1,6 @@ #include "ns_cer.h" #include "tdict.h" +#include "levenshtein.h" static const unsigned kNUMFIELDS = 2; static const unsigned kEDITDISTANCE = 0; @@ -13,27 +14,6 @@ unsigned CERMetric::SufficientStatisticsVectorSize() const { return 2; } -unsigned CERMetric::EditDistance(const std::string& hyp, - const std::string& ref) const { - const unsigned m = hyp.size(), n = ref.size(); - std::vector edit((m + 1) * 2); - for(unsigned i = 0; i < n + 1; i++) { - for(unsigned j = 0; j < m + 1; j++) { - if(i == 0) - edit[j] = j; - else if(j == 0) - edit[(i%2)*(m+1)] = i; - else - edit[(i%2)*(m+1) + j] = std::min(std::min(edit[(i%2)*(m+1) + j-1] + 1, - edit[((i-1)%2)*(m+1) + j] + 1), - edit[((i-1)%2)*(m+1) + (j-1)] - + (hyp[j-1] == ref[i-1] ? 0 : 1)); - - } - } - return edit[(n%2)*(m+1) + m]; -} - void CERMetric::ComputeSufficientStatistics(const std::vector& hyp, const std::vector >& refs, SufficientStats* out) const { @@ -42,7 +22,7 @@ void CERMetric::ComputeSufficientStatistics(const std::vector& hyp, float best_score = hyp_str.size(); for (size_t i = 0; i < refs.size(); ++i) { std::string ref_str(TD::GetString(refs[i])); - float score = EditDistance(hyp_str, ref_str); + float score = cdec::LevenshteinDistance(hyp_str, ref_str); if (score < best_score) { out->fields[kEDITDISTANCE] = score; out->fields[kCHARCOUNT] = ref_str.size(); @@ -50,6 +30,8 @@ void CERMetric::ComputeSufficientStatistics(const std::vector& hyp, } } } + float CERMetric::ComputeScore(const SufficientStats& stats) const { return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT]; } + diff --git a/mteval/ns_cer.h b/mteval/ns_cer.h index 9d211181..cb2b4b4a 100644 --- a/mteval/ns_cer.h +++ b/mteval/ns_cer.h @@ -5,9 +5,6 @@ class CERMetric : public EvaluationMetric { friend class EvaluationMetric; - private: - unsigned EditDistance(const std::string& hyp, - const std::string& ref) const; protected: CERMetric() : EvaluationMetric("CER") {} diff --git a/mteval/ns_wer.cc b/mteval/ns_wer.cc new file mode 100644 index 00000000..f9b2bbbb --- /dev/null +++ b/mteval/ns_wer.cc @@ -0,0 +1,35 @@ +#include "ns_wer.h" +#include "tdict.h" +#include "levenshtein.h" + +static const unsigned kNUMFIELDS = 2; +static const unsigned kEDITDISTANCE = 0; +static const unsigned kCHARCOUNT = 1; + +bool WERMetric::IsErrorMetric() const { + return true; +} + +unsigned WERMetric::SufficientStatisticsVectorSize() const { + return 2; +} + +void WERMetric::ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const { + out->fields.resize(kNUMFIELDS); + float best_score = hyp.size(); + for (size_t i = 0; i < refs.size(); ++i) { + float score = cdec::LevenshteinDistance(hyp, refs[i]); + if (score < best_score) { + out->fields[kEDITDISTANCE] = score; + out->fields[kCHARCOUNT] = refs[i].size(); + best_score = score; + } + } +} + +float WERMetric::ComputeScore(const SufficientStats& stats) const { + return stats.fields[kEDITDISTANCE] / stats.fields[kCHARCOUNT]; +} + diff --git a/mteval/ns_wer.h b/mteval/ns_wer.h new file mode 100644 index 00000000..24c85d83 --- /dev/null +++ b/mteval/ns_wer.h @@ -0,0 +1,20 @@ +#ifndef _NS_WER_H_ +#define _NS_WER_H_ + +#include "ns.h" + +class WERMetric : public EvaluationMetric { + friend class EvaluationMetric; + protected: + WERMetric() : EvaluationMetric("WER") {} + + public: + virtual bool IsErrorMetric() const; + virtual unsigned SufficientStatisticsVectorSize() const; + virtual void ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const; + virtual float ComputeScore(const SufficientStats& stats) const; +}; + +#endif -- cgit v1.2.3