diff options
-rwxr-xr-x | corpus/support/quote-norm.pl | 49 | ||||
-rw-r--r-- | decoder/decoder.cc | 2 | ||||
-rw-r--r-- | decoder/ff_ngrams.cc | 16 |
3 files changed, 45 insertions, 22 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 7bdcee67..1d9bb96f 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -8,20 +8,8 @@ while(<STDIN>) { chomp; $_ = " $_ "; - # Regularlize spaces: - s/\x{a0}/ /g; # non-breaking space - s/\x{2009}/ /g; # thin space - s/\x{2028}/ /g; # "line separator" - s/\x{2029}/ /g; # "paragraph separator" - s/\x{202a}/ /g; # "left-to-right embedding" - s/\x{202b}/ /g; # "right-to-left embedding" - s/\x{202c}/ /g; # "pop directional formatting" - s/\x{202d}/ /g; # "left-to-right override" - s/\x{202e}/ /g; # "right-to-left override" - s/\x{85}/ /g; # "next line" - s/\x{fffd}/ /g; # "replacement character" - s/\x{feff}/ /g; # byte-order mark - s/\x{fdd3}/ /g; # "unicode non-character" + # Delete control characters: + s/[\x{00}-\x{1f}]//g; # Regularize named HTML/XML escapes: s/&\s*lt\s*;/</gi; # HTML opening angle bracket @@ -41,6 +29,21 @@ while(<STDIN>) { s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge; s/&\#([0-9]+);/pack("U", $1)/ge; + # Regularlize spaces: + s/\x{a0}/ /g; # non-breaking space + s/\x{2009}/ /g; # thin space + s/\x{2028}/ /g; # "line separator" + s/\x{2029}/ /g; # "paragraph separator" + s/\x{202a}/ /g; # "left-to-right embedding" + s/\x{202b}/ /g; # "right-to-left embedding" + s/\x{202c}/ /g; # "pop directional formatting" + s/\x{202d}/ /g; # "left-to-right override" + s/\x{202e}/ /g; # "right-to-left override" + s/\x{85}/ /g; # "next line" + s/\x{fffd}/ /g; # "replacement character" + s/\x{feff}/ /g; # byte-order mark + s/\x{fdd3}/ /g; # "unicode non-character" + # Convert other Windows 1252 characters to UTF-8 s/\x{80}/\x{20ac}/g; # euro sign s/\x{95}/\x{2022}/g; # bullet @@ -53,7 +56,7 @@ while(<STDIN>) { s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g; s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; - # Ridiculous double conversions(?) (news commentary and Giga-FrEn): + # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8: s/�c/--/g; # long dash s/\x{e2}\x{20ac}oe/\"/g; # opening double quote s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote @@ -63,6 +66,19 @@ while(<STDIN>) { s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash? s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote? s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote? + s/\x{c3}\x{9f}/\x{df}/g; # esset + s/\x{c3}\x{0178}/\x{df}/g; # esset + s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut + s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut + s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut + s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this + s/\x{c3}\x{96}/\x{d6}/g; # O umlaut + s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut + s/\x{c3}\x{bc}/\x{dc}/g; # U umlaut + s/\x{80}/\x{20ac}/g; # euro sign + s/\x{95}/\x{2022}/g; # bullet + s/\x{99}/\x{2122}/g; # trademark sign # Regularize quotes: s/ˇ/'/g; # caron @@ -132,6 +148,7 @@ while(<STDIN>) { s/–/--/g; s/─/--/g; s/—/--/g; + s/\x{97}/--/g; s/•/ * /g; s/\*/ * /g; s/،/,/g; @@ -160,8 +177,6 @@ while(<STDIN>) { s/^\s+//; s/\s+$//; - # Delete control characters: - s/[\x{00}-\x{1f}]//g; print "$_\n"; } diff --git a/decoder/decoder.cc b/decoder/decoder.cc index e02c7730..7b49fcfa 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -408,7 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart") ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion") ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)") - ("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules") + ("get_oracle_forest,o", "Calculate rescored hypergraph using approximate BLEU scoring of rules") ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d337b28b..0bc14e5a 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -60,8 +60,9 @@ namespace { } } -static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file) { +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file, string* featname) { vector<string> const& argv=SplitOnWhitespace(in); + *featname = ""; *explicit_markers = false; *order = 3; prefixes.push_back("NOT-USED"); @@ -83,6 +84,9 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, case 'x': *explicit_markers = true; break; + case 'n': + LMSPEC_NEXTARG; *featname=*i; + break; case 'U': LMSPEC_NEXTARG; prefixes[1] = *i; @@ -226,6 +230,7 @@ class NgramDetectorImpl { ++n; if (!fid) { ostringstream os; + os << featname_; os << prefixes_[n]; for (int i = n-1; i >= 0; --i) { os << (i != n-1 ? target_separator_ : ""); @@ -404,7 +409,8 @@ class NgramDetectorImpl { public: explicit NgramDetectorImpl(bool explicit_markers, unsigned order, - vector<string>& prefixes, string& target_separator, const string& clusters) : + vector<string>& prefixes, string& target_separator, const string& clusters, + const string& featname) : kCDEC_UNK(TD::Convert("<unk>")) , add_sos_eos_(!explicit_markers) { order_ = order; @@ -414,6 +420,7 @@ class NgramDetectorImpl { unscored_words_offset_ = is_complete_offset_ + 1; prefixes_ = prefixes; target_separator_ = target_separator; + featname_ = featname; // special handling of beginning / ending sentence markers dummy_state_ = new char[state_size_]; @@ -454,6 +461,7 @@ class NgramDetectorImpl { TRulePtr dummy_rule_; vector<string> prefixes_; string target_separator_; + string featname_; struct FidTree { map<WordID, int> fids; map<WordID, FidTree> levels; @@ -467,9 +475,9 @@ NgramDetector::NgramDetector(const string& param) { bool explicit_markers = false; unsigned order = 3; string clusters; - ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters); + ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters, &featname); pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, - target_separator, clusters); + target_separator, clusters, featname); SetStateSize(pimpl_->ReserveStateSize()); } |