summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x corpus/support/quote-norm.pl 49
-rw-r--r-- decoder/decoder.cc 2
-rw-r--r-- decoder/ff_ngrams.cc 16
3 files changed, 45 insertions, 22 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 7bdcee67..1d9bb96f 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -8,20 +8,8 @@ while(<STDIN>) {
chomp;
$_ = " $_ ";
- # Regularlize spaces:
- s/\x{a0}/ /g; # non-breaking space
- s/\x{2009}/ /g; # thin space
- s/\x{2028}/ /g; # "line separator"
- s/\x{2029}/ /g; # "paragraph separator"
- s/\x{202a}/ /g; # "left-to-right embedding"
- s/\x{202b}/ /g; # "right-to-left embedding"
- s/\x{202c}/ /g; # "pop directional formatting"
- s/\x{202d}/ /g; # "left-to-right override"
- s/\x{202e}/ /g; # "right-to-left override"
- s/\x{85}/ /g; # "next line"
- s/\x{fffd}/ /g; # "replacement character"
- s/\x{feff}/ /g; # byte-order mark
- s/\x{fdd3}/ /g; # "unicode non-character"
+ # Delete control characters:
+ s/[\x{00}-\x{1f}]//g;
# Regularize named HTML/XML escapes:
s/&\s*lt\s*;/</gi; # HTML opening angle bracket
@@ -41,6 +29,21 @@ while(<STDIN>) {
s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
s/&\#([0-9]+);/pack("U", $1)/ge;
+ # Regularize spaces:
+ s/\x{a0}/ /g; # non-breaking space
+ s/\x{2009}/ /g; # thin space
+ s/\x{2028}/ /g; # "line separator"
+ s/\x{2029}/ /g; # "paragraph separator"
+ s/\x{202a}/ /g; # "left-to-right embedding"
+ s/\x{202b}/ /g; # "right-to-left embedding"
+ s/\x{202c}/ /g; # "pop directional formatting"
+ s/\x{202d}/ /g; # "left-to-right override"
+ s/\x{202e}/ /g; # "right-to-left override"
+ s/\x{85}/ /g; # "next line"
+ s/\x{fffd}/ /g; # "replacement character"
+ s/\x{feff}/ /g; # byte-order mark
+ s/\x{fdd3}/ /g; # "unicode non-character"
+
# Convert other Windows 1252 characters to UTF-8
s/\x{80}/\x{20ac}/g; # euro sign
s/\x{95}/\x{2022}/g; # bullet
@@ -53,7 +56,7 @@ while(<STDIN>) {
s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
- # Ridiculous double conversions(?) (news commentary and Giga-FrEn):
+ # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8:
s/�c/--/g; # long dash
s/\x{e2}\x{20ac}oe/\"/g; # opening double quote
s/\x{e2}\x{20ac}\x{9c}/\"/g; # opening double quote
@@ -63,6 +66,19 @@ while(<STDIN>) {
s/\x{e2}\x{20ac}\x{201d}/ -- /g; # em dash?
s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote?
s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote?
+ s/\x{c3}\x{9f}/\x{df}/g; # eszett
+ s/\x{c3}\x{0178}/\x{df}/g; # eszett
+ s/\x{c3}\x{a4}/\x{e4}/g; # a umlaut
+ s/\x{c3}\x{b6}/\x{f6}/g; # o umlaut
+ s/\x{c3}\x{bc}/\x{fc}/g; # u umlaut
+ s/\x{c3}\x{84}/\x{c4}/g; # A umlaut: create no C4s after this
+ s/\x{c3}\x{201e}/\x{c4}/g; # A umlaut: create no C4s after this
+ s/\x{c3}\x{96}/\x{d6}/g; # O umlaut
+ s/\x{c3}\x{2013}/\x{d6}/g; # O umlaut
+ s/\x{c3}\x{bc}/\x{dc}/g; # U umlaut
+ s/\x{80}/\x{20ac}/g; # euro sign
+ s/\x{95}/\x{2022}/g; # bullet
+ s/\x{99}/\x{2122}/g; # trademark sign
# Regularize quotes:
s/ˇ/'/g; # caron
@@ -132,6 +148,7 @@ while(<STDIN>) {
s/–/--/g;
s/─/--/g;
s/—/--/g;
+ s/\x{97}/--/g;
s/•/ * /g;
s/\*/ * /g;
s/،/,/g;
@@ -160,8 +177,6 @@ while(<STDIN>) {
s/^\s+//;
s/\s+$//;
- # Delete control characters:
- s/[\x{00}-\x{1f}]//g;
print "$_\n";
}
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index e02c7730..7b49fcfa 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -408,7 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
- ("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules")
+ ("get_oracle_forest,o", "Calculate rescored hypergraph using approximate BLEU scoring of rules")
("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d337b28b..0bc14e5a 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -60,8 +60,9 @@ namespace {
}
}
-static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file) {
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file, string* featname) {
vector<string> const& argv=SplitOnWhitespace(in);
+ *featname = "";
*explicit_markers = false;
*order = 3;
prefixes.push_back("NOT-USED");
@@ -83,6 +84,9 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order,
case 'x':
*explicit_markers = true;
break;
+ case 'n':
+ LMSPEC_NEXTARG; *featname=*i;
+ break;
case 'U':
LMSPEC_NEXTARG;
prefixes[1] = *i;
@@ -226,6 +230,7 @@ class NgramDetectorImpl {
++n;
if (!fid) {
ostringstream os;
+ os << featname_;
os << prefixes_[n];
for (int i = n-1; i >= 0; --i) {
os << (i != n-1 ? target_separator_ : "");
@@ -404,7 +409,8 @@ class NgramDetectorImpl {
public:
explicit NgramDetectorImpl(bool explicit_markers, unsigned order,
- vector<string>& prefixes, string& target_separator, const string& clusters) :
+ vector<string>& prefixes, string& target_separator, const string& clusters,
+ const string& featname) :
kCDEC_UNK(TD::Convert("<unk>")) ,
add_sos_eos_(!explicit_markers) {
order_ = order;
@@ -414,6 +420,7 @@ class NgramDetectorImpl {
unscored_words_offset_ = is_complete_offset_ + 1;
prefixes_ = prefixes;
target_separator_ = target_separator;
+ featname_ = featname;
// special handling of beginning / ending sentence markers
dummy_state_ = new char[state_size_];
@@ -454,6 +461,7 @@ class NgramDetectorImpl {
TRulePtr dummy_rule_;
vector<string> prefixes_;
string target_separator_;
+ string featname_;
struct FidTree {
map<WordID, int> fids;
map<WordID, FidTree> levels;
@@ -467,9 +475,9 @@ NgramDetector::NgramDetector(const string& param) {
bool explicit_markers = false;
unsigned order = 3;
string clusters;
- ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters);
+ ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters, &featname);
pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes,
- target_separator, clusters);
+ target_separator, clusters, featname);
SetStateSize(pimpl_->ReserveStateSize());
}