author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-06-03 16:58:29 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-06-03 16:58:29 -0400 |
commit | b66e838ed52decc0be1eb5817b2a77c3840db2c5 (patch) | |
tree | 5e3646d827d0932399d0930e9c65ae572f16c662 | |
parent | dc372570c906d1b7d4c856132f8be925fd7ba8b0 (diff) | |
fix for nonjoining chars
-rwxr-xr-x | corpus/support/quote-norm.pl | 1 |
-rw-r--r-- | training/pro/mr_pro_map.cc | 26 |
2 files changed, 24 insertions, 3 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while(<STDIN>) {
 
   # Regularlize spaces:
   s/\x{ad}//g; # soft hyphen
+  s/\x{200C}//g; # zero-width non-joiner
   s/\x{a0}/ /g; # non-breaking space
   s/\x{2009}/ /g; # thin space
   s/\x{2028}/ /g; # "line separator"
diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
index a5e6e48f..da58cd24 100644
--- a/training/pro/mr_pro_map.cc
+++ b/training/pro/mr_pro_map.cc
@@ -88,23 +88,43 @@ struct DiffOrder {
   }
 };
 
-void Sample(const unsigned gamma,
+double LengthDifferenceStdDev(const training::CandidateSet& J_i, int n) {
+  double sum = 0;
+  for (int i = 0; i < n; ++i) {
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) { --i; continue; }
+    double p = J_i[a].ewords.size();
+    p -= J_i[b].ewords.size();
+    sum += p * p;  // mean is 0 by construction
+  }
+  return max(sqrt(sum / n), 2.0);
+};
+
+void Sample(const int gamma,
             const unsigned xi,
             const training::CandidateSet& J_i,
             const EvaluationMetric* metric,
             vector<TrainingInstance>* pv) {
+  const double len_stddev = LengthDifferenceStdDev(J_i, 5000);
   const bool invert_score = metric->IsErrorMetric();
   vector<TrainingInstance> v1, v2;
   float avg_diff = 0;
-  for (unsigned i = 0; i < gamma; ++i) {
+  const double z_score_threshold=2;
+  for (int i = 0; i < gamma; ++i) {
     const size_t a = rng->inclusive(0, J_i.size() - 1)();
     const size_t b = rng->inclusive(0, J_i.size() - 1)();
-    if (a == b) continue;
+    if (a == b) { --i; continue; }
+    double z_score = fabs(((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) / len_stddev);
+    // variation on Nakov et al. (2011)
+    if (z_score > z_score_threshold) { --i; continue; }
     float ga = metric->ComputeScore(J_i[a].eval_feats);
     float gb = metric->ComputeScore(J_i[b].eval_feats);
     bool positive = gb < ga;
     if (invert_score) positive = !positive;
     const float gdiff = fabs(ga - gb);
+    //cerr << ((int)J_i[a].ewords.size() - (int)J_i[b].ewords.size()) << endl;
+    //cerr << (ga - gb) << endl;
     if (!gdiff) continue;
     avg_diff += gdiff;
     SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
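
About the quote-norm.pl hunk: the script already deletes soft hyphens (U+00AD), and this commit gives the same treatment to U+200C, the zero-width non-joiner that appears in Persian and other Arabic-script text. The C++ sketch below is not part of cdec; the helper name and the sample token are made up for illustration. It removes the character's UTF-8 encoding (the byte sequence E2 80 8C), which has the same effect as the Perl substitution s/\x{200C}//g:

```cpp
#include <iostream>
#include <string>

// Delete every occurrence of a byte sequence from a UTF-8 encoded string.
// Same effect as Perl's s/\x{200C}//g when seq is the UTF-8 form of U+200C.
std::string StripAll(std::string text, const std::string& seq) {
  std::string::size_type pos = 0;
  while ((pos = text.find(seq, pos)) != std::string::npos)
    text.erase(pos, seq.size());
  return text;
}

int main() {
  const std::string zwnj = "\xE2\x80\x8C";           // UTF-8 bytes of U+200C
  const std::string token = "mi" + zwnj + "khaham";  // stand-in for a Persian token containing a ZWNJ
  std::cout << StripAll(token, zwnj) << "\n";        // prints "mikhaham"
  return 0;
}
```

Perl's substitution operates on Unicode code points directly, while std::string is byte-oriented, so the C++ version matches the three-byte UTF-8 sequence instead.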
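
The mr_pro_map.cc hunk changes how PRO samples candidate pairs: it first estimates the standard deviation of the hypothesis-length difference between two randomly drawn candidates (with a floor of 2), then rejects any sampled pair whose length gap has a z-score above 2, a variation on Nakov et al. (2011). The sketch below is a standalone illustration of that filter, not cdec code: a plain vector of hypothesis lengths stands in for training::CandidateSet, std::mt19937 stands in for cdec's RNG, and all names and sample values are invented.

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <random>
#include <utility>
#include <vector>

// Each candidate is reduced to its hypothesis length in words (illustrative only).
using CandidateLengths = std::vector<int>;

std::mt19937 rng(12345);  // stand-in for cdec's global RNG

// Estimate the standard deviation of the length difference between two
// randomly drawn candidates; the mean difference is 0 by symmetry.
double LengthDifferenceStdDev(const CandidateLengths& lens, int n) {
  std::uniform_int_distribution<size_t> pick(0, lens.size() - 1);
  double sum = 0;
  for (int i = 0; i < n; ++i) {
    const size_t a = pick(rng), b = pick(rng);
    if (a == b) { --i; continue; }
    const double d = lens[a] - lens[b];
    sum += d * d;
  }
  return std::max(std::sqrt(sum / n), 2.0);  // floor keeps the filter sane for near-uniform lengths
}

// Draw gamma candidate pairs, skipping pairs whose length difference is more
// than z_threshold standard deviations out.
std::vector<std::pair<size_t, size_t>> SamplePairs(const CandidateLengths& lens,
                                                   int gamma, double z_threshold) {
  const double stddev = LengthDifferenceStdDev(lens, 5000);
  std::uniform_int_distribution<size_t> pick(0, lens.size() - 1);
  std::vector<std::pair<size_t, size_t>> pairs;
  for (int i = 0; i < gamma; ++i) {
    const size_t a = pick(rng), b = pick(rng);
    if (a == b) { --i; continue; }
    const double z = std::fabs((lens[a] - lens[b]) / stddev);
    if (z > z_threshold) { --i; continue; }  // reject wildly length-mismatched pairs
    pairs.emplace_back(a, b);
  }
  return pairs;
}

int main() {
  // Mostly similar lengths plus two outliers; a pair like (40, 5) exceeds the
  // z-score threshold and is skipped.
  const CandidateLengths lens = {18, 20, 21, 22, 19, 23, 20, 40, 5, 21};
  for (const auto& p : SamplePairs(lens, 10, 2.0))
    std::cout << lens[p.first] << " vs " << lens[p.second] << "\n";
  return 0;
}
```

In the commit itself the rejection happens inline in Sample(), and the loop counter is decremented on every rejected draw so the number of accepted pairs still reaches gamma.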