From 899032c9728c7a1c9c97f624ba0cc49b0814277b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 3 Jan 2016 16:17:35 -0500 Subject: corpus stats script --- corpus/corpus-stats.pl | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100755 corpus/corpus-stats.pl diff --git a/corpus/corpus-stats.pl b/corpus/corpus-stats.pl new file mode 100755 index 00000000..0bbd49b4 --- /dev/null +++ b/corpus/corpus-stats.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w +use strict; + +my $f = <>; +my $IS_PARALLEL = ($f =~ / \|\|\| /); +if ($IS_PARALLEL) { + die "This script is only valid for monolingual corpora, but file contains |||\n"; +} + +my %d; +my $tc = 0; +my $lc = 0; +while($f) { + $lc++; + chomp $f; + my @toks = split /\s+/, $f; + for my $t (@toks) { + $d{$t}++; + $tc++; + } + $f=<>; +} + +my $types = scalar keys %d; +my $ttr = $tc / $types; +my @mfts; +for my $k (sort {$d{$b} <=> $d{$a}} keys %d) { + push @mfts, $k; + last if scalar @mfts > 24; +} +my $sing = 0; +for my $k (keys %d) { + if ($d{$k} == 1) { $sing++; } +} +my $stypes = sqrt($types); + +print < Date: Thu, 14 Jan 2016 21:22:56 -0500 Subject: Added character-level BLEU metric --- mteval/comb_scorer.cc | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ mteval/comb_scorer.h | 11 ++++++ mteval/ns.cc | 26 ++++++++------ mteval/ns.h | 3 ++ mteval/scorer.cc | 58 +++++++++++++++++++++++++++++++- mteval/scorer.h | 6 +++- mteval/wer.cc | 16 +++++---- 7 files changed, 194 insertions(+), 19 deletions(-) diff --git a/mteval/comb_scorer.cc b/mteval/comb_scorer.cc index 9fc37868..63f327ca 100644 --- a/mteval/comb_scorer.cc +++ b/mteval/comb_scorer.cc @@ -95,3 +95,96 @@ ScoreP BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); return ScoreP(r); } + + +class BLEUCBLEUCombinationScore : public ScoreBase { + friend class BLEUCBLEUCombinationScorer; + public: + ~BLEUCBLEUCombinationScore(); + float ComputePartialScore() const { return 0.0;} + float ComputeScore() const { + return (bleu->ComputeScore() + cbleu->ComputeScore()) / 2.0f; + } + void ScoreDetails(string* details) const { + char buf[160]; + sprintf(buf, "Combi = %.2f, BLEU = %.2f, CBLEU = %.2f", + ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, cbleu->ComputeScore()*100.0f); + *details = buf; + } + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + + void PlusEquals(const Score& delta, const float scale) { + bleu->PlusEquals(*static_cast(delta).bleu, scale); + cbleu->PlusEquals(*static_cast(delta).cbleu, scale); + } + void PlusEquals(const Score& delta) { + bleu->PlusEquals(*static_cast(delta).bleu); + cbleu->PlusEquals(*static_cast(delta).cbleu); + } + + + + ScoreP GetOne() const { + BLEUCBLEUCombinationScore* res = new BLEUCBLEUCombinationScore; + res->bleu = bleu->GetOne(); + res->cbleu = cbleu->GetOne(); + return ScoreP(res); + } + ScoreP GetZero() const { + BLEUCBLEUCombinationScore* res = new BLEUCBLEUCombinationScore; + res->bleu = bleu->GetZero(); + res->cbleu = cbleu->GetZero(); + return ScoreP(res); + } + void Subtract(const Score& rhs, Score* res) const { + bleu->Subtract(*static_cast(rhs).bleu, + static_cast(res)->bleu.get()); + cbleu->Subtract(*static_cast(rhs).cbleu, + static_cast(res)->cbleu.get()); + } + void Encode(std::string* out) const { + string bs, ts; + bleu->Encode(&bs); + cbleu->Encode(&ts); + out->clear(); + (*out) += static_cast(bs.size()); + (*out) += bs; + (*out) += ts; + } + bool IsAdditiveIdentity() const { + return bleu->IsAdditiveIdentity() && cbleu->IsAdditiveIdentity(); + } + private: + ScoreP bleu; + ScoreP cbleu; +}; + +BLEUCBLEUCombinationScore::~BLEUCBLEUCombinationScore() { +} + +BLEUCBLEUCombinationScorer::BLEUCBLEUCombinationScorer(const vector >& refs) { + bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); + cbleu_ = SentenceScorer::CreateSentenceScorer(CBLEU, refs); +} + +BLEUCBLEUCombinationScorer::~BLEUCBLEUCombinationScorer() { +} + +ScoreP BLEUCBLEUCombinationScorer::ScoreCCandidate(const vector& hyp) const { + return ScoreP(); +} + +ScoreP BLEUCBLEUCombinationScorer::ScoreCandidate(const std::vector& hyp) const { + BLEUCBLEUCombinationScore* res = new BLEUCBLEUCombinationScore; + res->bleu = bleu_->ScoreCandidate(hyp); + res->cbleu = cbleu_->ScoreCandidate(hyp); + return ScoreP(res); +} + +ScoreP BLEUCBLEUCombinationScorer::ScoreFromString(const std::string& in) { + int bss = in[0]; + BLEUCBLEUCombinationScore* r = new BLEUCBLEUCombinationScore; + r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); + r->cbleu = SentenceScorer::CreateScoreFromString(CBLEU, in.substr(1 + bss)); + return ScoreP(r); +} diff --git a/mteval/comb_scorer.h b/mteval/comb_scorer.h index d17d089d..1e2f0c25 100644 --- a/mteval/comb_scorer.h +++ b/mteval/comb_scorer.h @@ -14,4 +14,15 @@ class BLEUTERCombinationScorer : public SentenceScorer { ScorerP bleu_,ter_; }; +class BLEUCBLEUCombinationScorer : public SentenceScorer { + public: + BLEUCBLEUCombinationScorer(const std::vector >& refs); + ~BLEUCBLEUCombinationScorer(); + ScoreP ScoreCandidate(const std::vector& hyp) const; + ScoreP ScoreCCandidate(const std::vector& hyp) const; + static ScoreP ScoreFromString(const std::string& in); + private: + ScorerP bleu_, cbleu_; +}; + #endif diff --git a/mteval/ns.cc b/mteval/ns.cc index 2c8bd806..1d37c436 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -65,38 +65,41 @@ string EvaluationMetric::DetailedScore(const SufficientStats& stats) const { } enum BleuType { IBM, Koehn, NIST, QCRI }; -template +template struct BleuSegmentEvaluator : public SegmentEvaluator { BleuSegmentEvaluator(const vector >& refs, const EvaluationMetric* em) : evaluation_metric(em) { - assert(refs.size() > 0); + const vector >& local_refs = (CharBased ? Characterize(refs) : refs); + + assert(local_refs.size() > 0); float tot = 0; int smallest = 9999999; - for (vector >::const_iterator ci = refs.begin(); - ci != refs.end(); ++ci) { + for (vector >::const_iterator ci = local_refs.begin(); + ci != local_refs.end(); ++ci) { lengths_.push_back(ci->size()); tot += lengths_.back(); if (lengths_.back() < smallest) smallest = lengths_.back(); CountRef(*ci); } if (BrevityType == Koehn) - lengths_[0] = tot / refs.size(); + lengths_[0] = tot / local_refs.size(); if (BrevityType == NIST) lengths_[0] = smallest; } void Evaluate(const vector& hyp, SufficientStats* out) const { + const vector& local_hyp = (CharBased ? Characterize(hyp) : hyp); out->fields.resize(N + N + 2); out->id_ = evaluation_metric->MetricId(); for (unsigned i = 0; i < N+N+2; ++i) out->fields[i] = 0; - ComputeNgramStats(hyp, &out->fields[0], &out->fields[N], true); + ComputeNgramStats(local_hyp, &out->fields[0], &out->fields[N], true); float& hyp_len = out->fields[2*N]; float& ref_len = out->fields[2*N + 1]; - hyp_len = hyp.size(); + hyp_len = local_hyp.size(); ref_len = lengths_[0]; if (lengths_.size() > 1 && (BrevityType == IBM || BrevityType == QCRI)) { float bestd = 2000000; - float hl = hyp.size(); + float hl = local_hyp.size(); float bl = -1; for (vector::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { if (fabs(*ci - hl) < bestd) { @@ -187,12 +190,12 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { mutable NGramCountMap ngrams_; }; -template +template struct BleuMetric : public EvaluationMetric { BleuMetric() : EvaluationMetric(BrevityType == IBM ? "IBM_BLEU" : (BrevityType == Koehn ? "KOEHN_BLEU" : (BrevityType == NIST ? "NIST_BLEU" : "QCRI_BLEU"))) {} unsigned SufficientStatisticsVectorSize() const { return N*2 + 2; } boost::shared_ptr CreateSegmentEvaluator(const vector >& refs) const { - return boost::shared_ptr(new BleuSegmentEvaluator(refs, this)); + return boost::shared_ptr(new BleuSegmentEvaluator(refs, this)); } float ComputeBreakdown(const SufficientStats& stats, float* bp, vector* out) const { if (out) { out->clear(); } @@ -290,6 +293,8 @@ EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) { m = new CERMetric; } else if (metric_id == "WER") { m = new WERMetric; + } else if (metric_id == "CBLEU") { + return new BleuMetric<5, IBM, true>; } else { cerr << "Implement please: " << metric_id << endl; abort(); @@ -322,4 +327,3 @@ void SufficientStats::Encode(string* out) const { os << ' ' << fields[i]; *out = os.str(); } - diff --git a/mteval/ns.h b/mteval/ns.h index f6329b65..16edfdf0 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -8,6 +8,9 @@ #include "wordid.h" #include +std::vector Characterize(const std::vector& reference); +std::vector > Characterize(const std::vector >& references); + class SufficientStats { public: SufficientStats() : id_() {} diff --git a/mteval/scorer.cc b/mteval/scorer.cc index 4c05dbd8..71e05e9c 100644 --- a/mteval/scorer.cc +++ b/mteval/scorer.cc @@ -49,12 +49,17 @@ ScoreType ScoreTypeFromString(const string& st) { return METEOR; if (sl == "wer") return WER; + if (sl == "cbleu") + return CBLEU; + if (sl == "bleu_cbleu") + return BLEU_plus_CBLEU_over_2; cerr << "Don't understand score type '" << st << "', defaulting to ibm_bleu.\n"; + assert (false); return IBM_BLEU; } static char const* score_names[]={ - "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3", "METEOR", "WER" + "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3", "METEOR", "WER", "CBLEU", "BLEU_plus_CBLEU_over_2" }; std::string StringFromScoreType(ScoreType st) { @@ -291,6 +296,21 @@ ScoreP BLEUScorerBase::ScoreFromString(const string& in) { return ScoreP(r); } +class CBLEUScorer : public BLEUScorerBase { + public: + CBLEUScorer(const vector >& references, + int n=5) : BLEUScorerBase(Characterize(references), n), lengths_(references.size()) { + for (unsigned i=0; i < references.size(); ++i) + lengths_[i] = Characterize(references[i]).size(); + } + + float ComputeRefLength(const vector& hyp) const { + return 1000; + } + private: + vector lengths_; +}; + class IBM_BLEUScorer : public BLEUScorerBase { public: IBM_BLEUScorer(const vector >& references, @@ -362,8 +382,10 @@ ScorerP SentenceScorer::CreateSentenceScorer(const ScoreType type, case TER: r = new TERScorer(refs);break; case SER: r = new SERScorer(refs);break; case BLEU_minus_TER_over_2: r = new BLEUTERCombinationScorer(refs);break; + case BLEU_plus_CBLEU_over_2: r = new BLEUCBLEUCombinationScorer(refs); break; case METEOR: r = new ExternalSentenceScorer(ScoreServerManager::Instance("meteor"), refs); break; case WER: r = new WERScorer(refs);break; + case CBLEU: r = new CBLEUScorer(refs, 5); break; default: assert(!"Not implemented!"); } @@ -410,6 +432,10 @@ ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& return ExternalSentenceScorer::ScoreFromString(ScoreServerManager::Instance("meteor"), in); case WER: return WERScorer::ScoreFromString(in); + case CBLEU: + return CBLEUScorer::ScoreFromString(in); + case BLEU_plus_CBLEU_over_2: + return BLEUCBLEUCombinationScorer::ScoreFromString(in); default: assert(!"Not implemented!"); } @@ -685,3 +711,33 @@ void DocStreamScorer::update(const std::string& ref) { TD::ConvertSentence(ref, &refs[0]); scorer = ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line)); } + +vector Characterize(const vector& reference) { + vector r; + string space = " "; + for (WordID word_id: reference) { + string word = TD::Convert(word_id); + unsigned i = 0; + while (i < word.length()) { + unsigned char_length = UTF8Len(word[i]); + string c = word.substr(i, char_length); + i += char_length; + r.push_back(TD::Convert(c)); + } + r.push_back(TD::Convert(space)); + } + + // Remove the last space + if (r.size() > 0) { + r.pop_back(); + } + return r; +} + +vector> Characterize(const vector >& references) { + vector > r; + for (const vector& reference : references) { + r.push_back(Characterize(reference)); + } + return r; +} diff --git a/mteval/scorer.h b/mteval/scorer.h index a411f14b..e7de0118 100644 --- a/mteval/scorer.h +++ b/mteval/scorer.h @@ -17,10 +17,14 @@ class ErrorSurface; class Hypergraph; // needed for alignment //TODO: BLEU N (N separate arg, not part of enum)? -enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3, METEOR, WER }; +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3, METEOR, WER, CBLEU, BLEU_plus_CBLEU_over_2 }; ScoreType ScoreTypeFromString(const std::string& st); std::string StringFromScoreType(ScoreType st); +std::vector Characterize(const std::vector& reference); +std::vector > Characterize(const std::vector >& references); + + class Score : public boost::intrusive_refcount { public: virtual ~Score(); diff --git a/mteval/wer.cc b/mteval/wer.cc index c806b3be..b8cfd3d8 100644 --- a/mteval/wer.cc +++ b/mteval/wer.cc @@ -31,16 +31,17 @@ class WERScore : public ScoreBase { WERScore() : stats(0,kDUMMY_LAST_ENTRY) {} float ComputePartialScore() const { return 0.0;} float ComputeScore() const { + if (static_cast(stats[kCHARCOUNT]) < 0.5) + return 0; return static_cast(stats[kEDITDISTANCE]) / static_cast(stats[kCHARCOUNT]); } void ScoreDetails(string* details) const; void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} void PlusEquals(const Score& delta, const float scale) { - if (scale==1) - stats += static_cast(delta).stats; - if (scale==-1) - stats -= static_cast(delta).stats; - throw std::runtime_error("WERScore::PlusEquals with scale != +-1"); + const WERScore& delta_stats = static_cast(delta); + for (unsigned i = 0; i < kDUMMY_LAST_ENTRY; ++i) { + stats[i] += scale * static_cast(delta_stats.stats[i]); + } } void PlusEquals(const Score& delta) { stats += static_cast(delta).stats; @@ -88,7 +89,7 @@ void WERScore::ScoreDetails(std::string* details) const { } WERScorer::~WERScorer() {} -WERScorer::WERScorer(const vector >& refs) {} +WERScorer::WERScorer(const vector >& refs) {this->refs = refs;} ScoreP WERScorer::ScoreCCandidate(const vector& hyp) const { return ScoreP(); @@ -97,6 +98,9 @@ ScoreP WERScorer::ScoreCCandidate(const vector& hyp) const { float WERScorer::Calculate(const std::vector& hyp, const Sentence& ref, int& edits, int& char_count) const { edits = cdec::LevenshteinDistance(hyp, ref); char_count = ref.size(); + if (char_count == 0) { + return 0; + } return static_cast(edits) / static_cast(char_count); } -- cgit v1.2.3 From ee4f3c5581e43510d98de1274c6c1c2984c87faf Mon Sep 17 00:00:00 2001 From: armatthews Date: Sun, 17 Jan 2016 04:03:35 -0500 Subject: bug fixes when training with WER --- mteval/ns_wer.cc | 4 ++-- training/mira/kbest_cut_mira.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mteval/ns_wer.cc b/mteval/ns_wer.cc index f9b2bbbb..057ad49e 100644 --- a/mteval/ns_wer.cc +++ b/mteval/ns_wer.cc @@ -18,10 +18,10 @@ void WERMetric::ComputeSufficientStatistics(const std::vector& hyp, const std::vector >& refs, SufficientStats* out) const { out->fields.resize(kNUMFIELDS); - float best_score = hyp.size(); + float best_score = 0; for (size_t i = 0; i < refs.size(); ++i) { float score = cdec::LevenshteinDistance(hyp, refs[i]); - if (score < best_score) { + if (score < best_score || i == 0) { out->fields[kEDITDISTANCE] = score; out->fields[kCHARCOUNT] = refs[i].size(); best_score = score; diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 724b1853..5d8385c2 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -645,7 +645,7 @@ int main(int argc, char** argv) { ScoreType type = ScoreTypeFromString(metric_name); //establish metric used for tuning - if (type == TER) { + if (type == TER || type == WER) { invert_score = true; } else { invert_score = false; -- cgit v1.2.3