From 2a9ee1febae6a63173f74ae24e2bfe439e409525 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 18 Mar 2014 02:05:25 -0400
Subject: chris edits

---
 corpus/support/tokenizer.pl | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'corpus/support')

diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 7771201f..f57bc87a 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -240,6 +240,10 @@ sub proc_token {
       return $token;
   }
 
+  if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){
+      return $token;
+  }
+
   ## 1,234,345.34
   if($token =~ /^\d+(\.\d{3})*,\d+$/){ ## number
-- 
cgit v1.2.3


From b66e838ed52decc0be1eb5817b2a77c3840db2c5 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 3 Jun 2014 16:58:29 -0400
Subject: fix for nonjoining chars

---
 corpus/support/quote-norm.pl |  1 +
 training/pro/mr_pro_map.cc  | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'corpus/support')

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while(<STDIN>) {
 
   # Regularlize spaces:
   s/\x{ad}//g; # soft hyphen
+  s/\x{200C}//g; # zero-width non-joiner
   s/\x{a0}/ /g; # non-breaking space
   s/\x{2009}/ /g; # thin space
   s/\x{2028}/ /g; # "line separator"
diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
index a5e6e48f..da58cd24 100644
--- a/training/pro/mr_pro_map.cc
+++ b/training/pro/mr_pro_map.cc
@@ -88,23 +88,43 @@ struct DiffOrder {
   }
 };
 
-void Sample(const unsigned gamma,
+double LengthDifferenceStdDev(const training::CandidateSet& J_i, int n) {
+  double sum = 0;
+  for (int i = 0; i < n; ++i) {
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) { --i; continue; }
+    double p = J_i[a].ewords.size();
+    p -= J_i[b].ewords.size();
+    sum += p * p;  // mean is 0 by construction
+  }
+  return max(sqrt(sum / n), 2.0);
+};
+
+void Sample(const int gamma,
             const unsigned xi,
             const training::CandidateSet& J_i,
             const EvaluationMetric* metric,
             vector<TrainingInstance>* pv) {
+  const double len_stddev = LengthDifferenceStdDev(J_i, 5000);
   const bool invert_score = metric->IsErrorMetric();
   vector<TrainingInstance> v1, v2;
   float avg_diff = 0;
-  for (unsigned i = 0; i < gamma; ++i) {
+  const double z_score_threshold=2;
+  for (int i = 0; i < gamma; ++i) {
     const size_t a = rng->inclusive(0, J_i.size() - 1)();
     const size_t b = rng->inclusive(0, J_i.size() - 1)();
-    if (a == b) continue;
+    if (a == b) { --i; continue; }
+    double z_score = fabs(((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) / len_stddev);
+    // variation on Nakov et al. (2011)
+    if (z_score > z_score_threshold) { --i; continue; }
     float ga = metric->ComputeScore(J_i[a].eval_feats);
     float gb = metric->ComputeScore(J_i[b].eval_feats);
     bool positive = gb < ga;
     if (invert_score) positive = !positive;
     const float gdiff = fabs(ga - gb);
+    //cerr << ((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) << endl;
+    //cerr << (ga - gb) << endl;
     if (!gdiff) continue;
     avg_diff += gdiff;
     SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
-- 
cgit v1.2.3
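
Note on the second patch: the substantive change in Sample() first estimates the standard deviation of target-length differences over randomly drawn candidate pairs, then redraws any sampled pair whose length difference exceeds two standard deviations (described in the code as a variation on Nakov et al. (2011)). The following is a minimal, self-contained C++ sketch of that filtering logic only; the Candidate struct, SamplePairs() function, std::mt19937 generator, and the toy candidate list in main() are hypothetical stand-ins for cdec's CandidateSet, Sample(), and rng, not code from the patch.

// Sketch (not cdec code) of the length-difference z-score filter added in
// training/pro/mr_pro_map.cc. All names below are illustrative stand-ins.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <random>
#include <string>
#include <utility>
#include <vector>

struct Candidate {
  std::vector<std::string> ewords;  // target-side tokens of one hypothesis
};

// Estimate the standard deviation of length differences between randomly
// drawn candidate pairs; the mean is 0 by construction, and the result is
// floored at 2.0, mirroring LengthDifferenceStdDev() in the patch.
double LengthDifferenceStdDev(const std::vector<Candidate>& cands, int n,
                              std::mt19937& rng) {
  std::uniform_int_distribution<std::size_t> pick(0, cands.size() - 1);
  double sum = 0;
  for (int i = 0; i < n; ++i) {
    const std::size_t a = pick(rng);
    const std::size_t b = pick(rng);
    if (a == b) { --i; continue; }  // redraw identical pairs
    const double d = static_cast<double>(cands[a].ewords.size()) -
                     static_cast<double>(cands[b].ewords.size());
    sum += d * d;
  }
  return std::max(std::sqrt(sum / n), 2.0);
}

// Draw gamma candidate pairs, redrawing any pair whose length difference is
// an outlier (|z| > z_threshold) -- the "variation on Nakov et al. (2011)".
std::vector<std::pair<std::size_t, std::size_t> > SamplePairs(
    const std::vector<Candidate>& cands, int gamma, double z_threshold,
    std::mt19937& rng) {
  const double len_stddev = LengthDifferenceStdDev(cands, 5000, rng);
  std::uniform_int_distribution<std::size_t> pick(0, cands.size() - 1);
  std::vector<std::pair<std::size_t, std::size_t> > pairs;
  for (int i = 0; i < gamma; ++i) {
    const std::size_t a = pick(rng);
    const std::size_t b = pick(rng);
    if (a == b) { --i; continue; }
    const double z = std::fabs(
        (static_cast<double>(cands[a].ewords.size()) -
         static_cast<double>(cands[b].ewords.size())) / len_stddev);
    if (z > z_threshold) { --i; continue; }  // skip length-mismatched pairs
    pairs.push_back(std::make_pair(a, b));
  }
  return pairs;
}

int main() {
  std::mt19937 rng(1);
  // Toy candidate list: hypotheses of 3 to 12 tokens.
  std::vector<Candidate> cands;
  for (int len = 3; len <= 12; ++len) {
    Candidate c;
    c.ewords.assign(len, "w");
    cands.push_back(c);
  }
  const std::vector<std::pair<std::size_t, std::size_t> > pairs =
      SamplePairs(cands, 20, 2.0, rng);
  for (std::size_t i = 0; i < pairs.size(); ++i) {
    std::cout << cands[pairs[i].first].ewords.size() << " vs "
              << cands[pairs[i].second].ewords.size() << "\n";
  }
  return 0;
}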