Merge branch 'master' of github.com:redpony/cdec

author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-02-23 02:13:40 -0500
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2014-02-23 02:13:40 -0500
commit: dd555f71427c72753bb0a2451de05d8d7125717c (patch)
tree: eabc2d2fda8e4cd5f9ad34cd9bf18e1ff9bf93ef
parent: 3ec30b72f47e063d94648a9823653e6ec3e17401 (diff)
parent: fbdc905f6f201e2cc0dbee89ef81e36a53bb3c42 (diff)
4 files changed, 47 insertions, 24 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 7bdcee67..1d9bb96f 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -8,20 +8,8 @@ while(<STDIN>) {
   chomp;
   $_ = " $_ ";
 
-  # Regularlize spaces:
-  s/\x{a0}/ /g;       # non-breaking space
-  s/\x{2009}/ /g;     # thin space
-  s/\x{2028}/ /g;     # "line separator"
-  s/\x{2029}/ /g;     # "paragraph separator"
-  s/\x{202a}/ /g;     # "left-to-right embedding"
-  s/\x{202b}/ /g;     # "right-to-left embedding"
-  s/\x{202c}/ /g;     # "pop directional formatting"
-  s/\x{202d}/ /g;     # "left-to-right override"
-  s/\x{202e}/ /g;     # "right-to-left override"
-  s/\x{85}/ /g;       # "next line"
-  s/\x{fffd}/ /g;     # "replacement character"
-  s/\x{feff}/ /g;     # byte-order mark
-  s/\x{fdd3}/ /g;     # "unicode non-character"
+  # Delete control characters:
+  s/[\x{00}-\x{1f}]//g; 
 
   # Regularize named HTML/XML escapes:
   s/&\s*lt\s*;/</gi;    # HTML opening angle bracket
@@ -41,6 +29,21 @@ while(<STDIN>) {
   s/&\#x([0-9A-Fa-f]+);/pack("U", hex($1))/ge;
   s/&\#([0-9]+);/pack("U", $1)/ge;
 
+  # Regularlize spaces:
+  s/\x{a0}/ /g;       # non-breaking space
+  s/\x{2009}/ /g;     # thin space
+  s/\x{2028}/ /g;     # "line separator"
+  s/\x{2029}/ /g;     # "paragraph separator"
+  s/\x{202a}/ /g;     # "left-to-right embedding"
+  s/\x{202b}/ /g;     # "right-to-left embedding"
+  s/\x{202c}/ /g;     # "pop directional formatting"
+  s/\x{202d}/ /g;     # "left-to-right override"
+  s/\x{202e}/ /g;     # "right-to-left override"
+  s/\x{85}/ /g;       # "next line"
+  s/\x{fffd}/ /g;     # "replacement character"
+  s/\x{feff}/ /g;     # byte-order mark
+  s/\x{fdd3}/ /g;     # "unicode non-character"
+
   # Convert other Windows 1252 characters to UTF-8 
   s/\x{80}/\x{20ac}/g;    # euro sign
   s/\x{95}/\x{2022}/g;    # bullet
@@ -53,7 +56,7 @@ while(<STDIN>) {
   s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
   s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
 
-  # Ridiculous double conversions(?) (news commentary and Giga-FrEn):
+  # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8:
   s/ï¿½c/--/g;                        # long dash
   s/\x{e2}\x{20ac}oe/\"/g;            # opening double quote
   s/\x{e2}\x{20ac}\x{9c}/\"/g;        # opening double quote
@@ -63,6 +66,19 @@ while(<STDIN>) {
   s/\x{e2}\x{20ac}\x{201d}/ -- /g;    # em dash? 
   s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; # single quote?
   s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; # double quote?
+  s/\x{c3}\x{9f}/\x{df}/g;            # esset
+  s/\x{c3}\x{0178}/\x{df}/g;          # esset
+  s/\x{c3}\x{a4}/\x{e4}/g;            # a umlaut
+  s/\x{c3}\x{b6}/\x{f6}/g;            # o umlaut
+  s/\x{c3}\x{bc}/\x{fc}/g;            # u umlaut
+  s/\x{c3}\x{84}/\x{c4}/g;            # A umlaut: create no C4s after this
+  s/\x{c3}\x{201e}/\x{c4}/g;          # A umlaut: create no C4s after this
+  s/\x{c3}\x{96}/\x{d6}/g;            # O umlaut
+  s/\x{c3}\x{2013}/\x{d6}/g;          # O umlaut
+  s/\x{c3}\x{bc}/\x{dc}/g;            # U umlaut
+  s/\x{80}/\x{20ac}/g;                # euro sign
+  s/\x{95}/\x{2022}/g;                # bullet
+  s/\x{99}/\x{2122}/g;                # trademark sign
 
   # Regularize quotes:
   s/ˇ/'/g;            # caron
@@ -132,6 +148,7 @@ while(<STDIN>) {
   s/–/--/g;
   s/─/--/g;
   s/—/--/g;
+  s/\x{97}/--/g;
   s/•/ * /g;
   s/\*/ * /g;
   s/،/,/g;
@@ -160,8 +177,6 @@ while(<STDIN>) {
   s/^\s+//;
   s/\s+$//;
 
-  # Delete control characters:
-  s/[\x{00}-\x{1f}]//g; 
   print "$_\n";
 }
 
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index f8104c5e..31049216 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -408,7 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
         ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
         ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
-        ("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules")
+        ("get_oracle_forest,o", "Calculate rescored hypergraph using approximate BLEU scoring of rules")
         ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
         ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
         ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d337b28b..0bc14e5a 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -60,8 +60,9 @@ namespace {
   }
 }
 
-static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file) {
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator, string* cluster_file, string* featname) {
   vector<string> const& argv=SplitOnWhitespace(in);
+  *featname = "";
   *explicit_markers = false;
   *order = 3;
   prefixes.push_back("NOT-USED");
@@ -83,6 +84,9 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order,
       case 'x':
         *explicit_markers = true;
         break;
+      case 'n':
+        LMSPEC_NEXTARG; *featname=*i;
+        break;
       case 'U':
 	LMSPEC_NEXTARG;
 	prefixes[1] = *i;
@@ -226,6 +230,7 @@ class NgramDetectorImpl {
       ++n;
       if (!fid) {
         ostringstream os;
+        os << featname_;
         os << prefixes_[n];
         for (int i = n-1; i >= 0; --i) {
           os << (i != n-1 ? target_separator_ : "");
@@ -404,7 +409,8 @@ class NgramDetectorImpl {
 
  public:
   explicit NgramDetectorImpl(bool explicit_markers, unsigned order,
-			     vector<string>& prefixes, string& target_separator, const string& clusters) :
+			     vector<string>& prefixes, string& target_separator, const string& clusters,
+                             const string& featname) :
       kCDEC_UNK(TD::Convert("<unk>")) ,
       add_sos_eos_(!explicit_markers) {
     order_ = order;
@@ -414,6 +420,7 @@ class NgramDetectorImpl {
     unscored_words_offset_ = is_complete_offset_ + 1;
     prefixes_ = prefixes;
     target_separator_ = target_separator;
+    featname_ = featname;
 
     // special handling of beginning / ending sentence markers
     dummy_state_ = new char[state_size_];
@@ -454,6 +461,7 @@ class NgramDetectorImpl {
   TRulePtr dummy_rule_;
   vector<string> prefixes_;
   string target_separator_;
+  string featname_;
   struct FidTree {
     map<WordID, int> fids;
     map<WordID, FidTree> levels;
@@ -467,9 +475,9 @@ NgramDetector::NgramDetector(const string& param) {
   bool explicit_markers = false;
   unsigned order = 3;
   string clusters;
-  ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters);
+  ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters, &featname);
   pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, 
-				 target_separator, clusters);
+				 target_separator, clusters, featname);
   SetStateSize(pimpl_->ReserveStateSize());
 }
 
diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc
index e075bed3..1a6415be 100644
--- a/training/mira/kbest_cut_mira.cc
+++ b/training/mira/kbest_cut_mira.cc
@@ -82,14 +82,14 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
     ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (SGD=1, PA MIRA w/Delta=2, Cutting Plane MIRA=3, PA MIRA=4, Triple nbest list MIRA=5)")
     ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, maxcost=2, maxscore=3)")
     ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, mincost=2)")
-    ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
+    ("max_step_size,C", po::value<double>()->default_value(0.001), "regularization strength (C)")
     ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
     ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
     ("sent_approx,a", "Use smoothed sentence-level BLEU score for approximate scoring")
     ("pseudo_doc,e", "Use pseudo-document BLEU score for approximate scoring")
     ("no_reweight,d","Do not reweight forest for cutting plane")
     ("no_select,n", "Do not use selection heuristic")
-    ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+    ("k_best_size,k", po::value<int>()->default_value(500), "Size of hypothesis list to search for oracles")
     ("update_k_best,b", po::value<int>()->default_value(1), "Size of good, bad lists to perform update with")
     ("unique_k_best,u", "Unique k-best translation list")
     ("stream,t", "Stream mode (used for realtime)")
author	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2014-02-23 02:13:40 -0500
committer	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2014-02-23 02:13:40 -0500
commit	dd555f71427c72753bb0a2451de05d8d7125717c (patch)
tree	eabc2d2fda8e4cd5f9ad34cd9bf18e1ff9bf93ef
parent	3ec30b72f47e063d94648a9823653e6ec3e17401 (diff)
parent	fbdc905f6f201e2cc0dbee89ef81e36a53bb3c42 (diff)