From 3dfa575d202c9277060bc43a7af9351702da9f12 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 09:05:04 -0500
Subject: fix my dumb bug that killed qsub functionality

---
 vest/dist-vest.pl   | 2 --
 vest/parallelize.pl | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 973a29ef..f6f661b9 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -194,7 +194,6 @@ sub modbin {
         my $src=$$_;
         $$_="$bindir/".basename($src);
         check_call("cp -p $src $$_");
-        die "cp $src $$_ failed: $!" unless $? == 0;
     }
 }
 sub dirsize {
@@ -374,7 +373,6 @@ while (1){
 				$nmappers++;
 				my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
 				my $jobid = check_output("$qcmd");
-				die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0;
 				chomp $jobid;
 				$jobid =~ s/^(\d+)(.*?)$/\1/g;
 				$jobid =~ s/^Your job (\d+) .*$/\1/;
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index 47b77c79..2798a303 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -82,7 +82,7 @@ sub preview_files {
     my @f=grep { ! ($skipempty && -z $_) } @$l;
     my $fn=join(' ',map {escape_shell($_)} @f);
     my $cmd="tail -n $n $fn";
-    check_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+    unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
 }
 sub prefix_dirname($) {
     #like `dirname but if ends in / then return the whole thing
@@ -323,7 +323,7 @@ sub launch_job {
             }
       if ($joblist == "") { $joblist = $jobid; }
       else {$joblist = $joblist . "\|" . $jobid; }
-            my $cleanfn=check_output("qdel $jobid 2> /dev/null");
+      my $cleanfn="qdel $jobid 2> /dev/null";
       push(@cleanup_cmds, $cleanfn);
     }
     close QOUT;
-- 
cgit v1.2.3


From 13b15df6a00137395eae03ba3f33a987a916257b Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:00:53 -0500
Subject: another dumb bug involving cleanup being executed preemptively

---
 vest/dist-vest.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index f6f661b9..c27af804 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -371,12 +371,12 @@ while (1){
 				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
 
 				$nmappers++;
-				my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
 				my $jobid = check_output("$qcmd");
 				chomp $jobid;
 				$jobid =~ s/^(\d+)(.*?)$/\1/g;
 				$jobid =~ s/^Your job (\d+) .*$/\1/;
-		 	 	push(@cleanupcmds, check_output("qdel $jobid 2> /dev/null"));
+		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
 				print STDERR " $jobid";
 				if ($joblist == "") { $joblist = $jobid; }
 				else {$joblist = $joblist . "\|" . $jobid; }
-- 
cgit v1.2.3


From 702df29b83ca10998ea3a8f84bc2e0e6c9e86eea Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:01:09 -0500
Subject: ignore emacs temp files

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3892891c..2a287bbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -120,3 +120,4 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar
 *.dvi
 *.ps
 *.toc
+*~
\ No newline at end of file
-- 
cgit v1.2.3


From 25d5729b850d1dc62eaf151b5550bd83963b08e8 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:09:18 -0500
Subject: dont die when there are no running jobs

---
 vest/dist-vest.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index c27af804..cfddf61c 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#grep!/usr/bin/env perl
 use strict;
 my @ORIG_ARGV=@ARGV;
 use Cwd qw(getcwd);
@@ -396,7 +396,7 @@ while (1){
 			print STDERR "Waiting for mappers to complete...\n";
 			while ($nmappers > 0) {
 			  sleep 5;
-			  my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | grep -v ' C '")));
+			  my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | awk '{if($0 !~ \" C \"){print}}'")));
 			  $nmappers = scalar @livejobs;
 			}
 			print STDERR "All mappers complete.\n";
-- 
cgit v1.2.3


From db200aeefcfad33e789a8790961ef5c0f66d8ba3 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:13:20 -0500
Subject: fail

---
 vest/dist-vest.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index cfddf61c..6a5959dc 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -1,4 +1,4 @@
-#grep!/usr/bin/env perl
+#!/usr/bin/env perl
 use strict;
 my @ORIG_ARGV=@ARGV;
 use Cwd qw(getcwd);
-- 
cgit v1.2.3


From 92ca6e23b39043ad026c07a5aab71ffc750c1db2 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:22:31 -0500
Subject: just use grep and dont check return code

---
 vest/dist-vest.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 6a5959dc..f95754dc 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -396,7 +396,7 @@ while (1){
 			print STDERR "Waiting for mappers to complete...\n";
 			while ($nmappers > 0) {
 			  sleep 5;
-			  my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | awk '{if($0 !~ \" C \"){print}}'")));
+			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
 			  $nmappers = scalar @livejobs;
 			}
 			print STDERR "All mappers complete.\n";
-- 
cgit v1.2.3


From 6b25a85dd45af5982e07577b33c64e3b577579c3 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 10:27:43 -0500
Subject: dont fail on possibly temporary qstat errors

---
 vest/parallelize.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index 2798a303..c2526503 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -283,7 +283,8 @@ sub numof_live_jobs {
   if ($use_fork) {
     die "not implemented";
   } else {
-    my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat")));
+    # We can probably continue decoding if the qstat error is only temporary
+    my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
     return ($#livejobs + 1);
   }
 }
-- 
cgit v1.2.3


From eda8d83cd957463d32980da7c60085a820f7eae0 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Fri, 11 Mar 2011 11:06:48 -0500
Subject: be more verbose when running each child decoder process when forking.
 also, avoid some non-bash errors

---
 vest/parallelize.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index c2526503..b4783f91 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -347,7 +347,7 @@ sub launch_job_fork {
     my ($fh, $scr_name) = get_temp_script();
     print $fh $script;
     close $fh;
-    my $todo = "/bin/sh $scr_name 1> $outfile 2> $errorfile";
+    my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
     print STDERR "EXEC: $todo\n";
     my $out = check_output("$todo");
     print STDERR "RES: $out\n";
-- 
cgit v1.2.3


From dccf47501f078a354375b9f3edd481d8c8d30268 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Mon, 14 Mar 2011 17:03:51 -0400
Subject: more paranoid checking when (idiot/time-crunched) user tries to
 define his own <seg> tags during tuning

---
 vest/dist-vest.pl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index f95754dc..d17d7de1 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -573,7 +573,11 @@ sub enseg {
 	while (my $line=<SRC>){
 		chomp $line;
 		if ($line =~ /^\s*<seg/i) {
+		    if($line =~ /id="[0-9]+"/) {
 			print NEWSRC "$line\n";
+		    } else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		    }
 		} else {
 			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
 		}
-- 
cgit v1.2.3


From 5d0f3c6aa4e78aea09952a7a65f61d3c4dce0a0e Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Mon, 14 Mar 2011 17:05:14 -0400
Subject: Fix wordset to override features() so that we can safely use multiple
 instances of it

---
 decoder/ff_wordset.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
index 00e1145b..643097ef 100644
--- a/decoder/ff_wordset.h
+++ b/decoder/ff_wordset.h
@@ -32,6 +32,7 @@ class WordSet : public FeatureFunction {
   ~WordSet() {
   }
 
+  Features features() const { return single_feature(fid_); }
 
  protected:
   virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-- 
cgit v1.2.3


From 237de3db6d5917707b745e3df7be42f2497e3783 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Mon, 14 Mar 2011 17:39:04 -0400
Subject: Get enough compiling with scons to finish off the emnlp paper

---
 SConstruct | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/SConstruct b/SConstruct
index 1a7885bc..c21d85d5 100644
--- a/SConstruct
+++ b/SConstruct
@@ -11,6 +11,9 @@ AddOption('--with-glc', dest='glc', type='string', nargs=1, action='store', meta
 AddOption('--efence', dest='efence', action='store_true',
                   help='use electric fence for debugging memory corruptions')
 
+# TODO: Troll http://www.scons.org/wiki/SconsAutoconf
+# for some initial autoconf-like steps
+
 platform = ARGUMENTS.get('OS', Platform())
 include = Split('decoder utils klm mteval .')
 env = Environment(PREFIX=GetOption('prefix'),
@@ -45,7 +48,7 @@ if glc:
    srcs.append(glc+'/feature-factory.cc')
    srcs.append(glc+'/cdec/ff_glc.cc')
 
-for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc']:
+for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc', 'vest/*.cc']:
     srcs.extend([ file for file in Glob(pattern)
     		       if not 'test' in str(file)
 		       	  and 'build_binary.cc' not in str(file)
@@ -53,6 +56,30 @@ for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mtev
 			  and 'mbr_kbest.cc' not in str(file)
 			  and 'sri.cc' not in str(file)
 			  and 'fast_score.cc' not in str(file)
+                          and 'cdec.cc' not in str(file)
+                          and 'mr_' not in str(file)
 		])
 
-env.Program(target='decoder/cdec', source=srcs)
+print 'Found {0} source files'.format(len(srcs))
+def comb(cc, srcs):
+   x = [cc]
+   x.extend(srcs)
+   return x
+
+env.Program(target='decoder/cdec', source=comb('decoder/cdec.cc', srcs))
+# TODO: The various decoder tests
+# TODO: extools
+env.Program(target='klm/lm/build_binary', source=comb('klm/lm/build_binary.cc', srcs))
+# TODO: klm ngram_query and tests
+env.Program(target='mteval/fast_score', source=comb('mteval/fast_score.cc', srcs))
+env.Program(target='mteval/mbr_kbest', source=comb('mteval/mbr_kbest.cc', srcs))
+#env.Program(target='mteval/scorer_test', source=comb('mteval/fast_score.cc', srcs))
+# TODO: phrasinator
+# TODO: Various training binaries
+env.Program(target='vest/sentserver', source=['vest/sentserver.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/sentclient', source=['vest/sentclient.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/mr_vest_generate_mapper_input', source=comb('vest/mr_vest_generate_mapper_input.cc', srcs))
+env.Program(target='vest/mr_vest_map', source=comb('vest/mr_vest_map.cc', srcs))
+env.Program(target='vest/mr_vest_reduce', source=comb('vest/mr_vest_reduce.cc', srcs))
+#env.Program(target='vest/lo_test', source=comb('vest/lo_test.cc', srcs))
+# TODO: util tests
-- 
cgit v1.2.3


From 6b6eeff3130bcb40980886d8179ba4ad6842325e Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 16 Mar 2011 19:48:41 -0400
Subject: explicit markers turned on by default

---
 decoder/ff_klm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc
index adc2c8bf..62908cdc 100644
--- a/decoder/ff_klm.cc
+++ b/decoder/ff_klm.cc
@@ -21,7 +21,7 @@ static const unsigned char MASK             = 7;
 // -n NAME : feature id is NAME
 bool ParseLMArgs(string const& in, string* filename, string* mapfile, bool* explicit_markers, string* featname) {
   vector<string> const& argv=SplitOnWhitespace(in);
-  *explicit_markers = true;
+  *explicit_markers = false;
   *featname="LanguageModel";
   *mapfile = "";
 #define LMSPEC_NEXTARG if (i==argv.end()) {            \
-- 
cgit v1.2.3


From 95e50962fe307b930e835513e4d9998df91426a4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 16 Mar 2011 20:30:37 -0400
Subject: possible mert bug with rules with alignments

---
 decoder/trule.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/decoder/trule.cc b/decoder/trule.cc
index 9820e6d5..fda62741 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -145,7 +145,9 @@ bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
       getline(is, ss);
       //cerr << "L: " << ss << endl;
       int start = 0;
-      const int len = ss.size();
+      int len = ss.size();
+      const size_t ppos = ss.find(" |||");
+      if (ppos != string::npos) { len = ppos; }
       while (start < len) {
         while(start < len && (ss[start] == ' ' || ss[start] == ';'))
           ++start;
-- 
cgit v1.2.3


From 9f78539edbbe00feeee618932fc5d51f5c5b9eb4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 17 Mar 2011 22:29:43 -0400
Subject: enable weights to be frozen during training

---
 training/mpi_online_optimize.cc | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 325ba030..1367581a 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -64,6 +64,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("input_weights,w",po::value<string>(),"Input feature weights file")
+        ("frozen_features,z",po::value<string>(), "List of features not to optimize")
         ("training_data,t",po::value<string>(),"Training data corpus")
         ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
         ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
@@ -254,6 +255,20 @@ int main(int argc, char** argv) {
   if (conf.count("input_weights"))
     weights.InitFromFile(conf["input_weights"].as<string>());
 
+  vector<int> frozen_fids;
+  if (conf.count("frozen_features")) {
+    ReadFile rf(conf["frozen_features"].as<string>());
+    istream& in = *rf.stream();
+    string line;
+    while(in) {
+      getline(in, line);
+      if (line.empty()) continue;
+      if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); }
+      frozen_fids.push_back(FD::Convert(line));
+    }
+    if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n";
+  }
+
   vector<string> corpus;
   vector<int> ids;
   ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
@@ -362,6 +377,8 @@ int main(int argc, char** argv) {
       g.swap(local_grad);
 #endif
       local_grad.clear();
+      for (int i = 0; i < frozen_fids.size(); ++i)
+        g.erase(frozen_fids[i]);
       if (rank == 0) {
         g /= (size_per_proc * size);
         o->UpdateWeights(g, FD::NumFeats(), &x);
-- 
cgit v1.2.3


From 7079e3685def6f231ecf9f0c3f31b5c03a46d858 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 17 Mar 2011 22:46:35 -0400
Subject: freeze features, including penalty

---
 training/mpi_online_optimize.cc |  4 +---
 training/online_optimizer.h     | 17 ++++++++++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 1367581a..32033c19 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -299,7 +299,7 @@ int main(int argc, char** argv) {
     const string omethod = conf["optimization_method"].as<string>();
     if (omethod == "sgd") {
       const double C = conf["regularization_strength"].as<double>();
-      o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C));
+      o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids));
     } else {
       assert(!"fail");
     }
@@ -377,8 +377,6 @@ int main(int argc, char** argv) {
       g.swap(local_grad);
 #endif
       local_grad.clear();
-      for (int i = 0; i < frozen_fids.size(); ++i)
-        g.erase(frozen_fids[i]);
       if (rank == 0) {
         g /= (size_per_proc * size);
         o->UpdateWeights(g, FD::NumFeats(), &x);
diff --git a/training/online_optimizer.h b/training/online_optimizer.h
index 312aabae..61d62a37 100644
--- a/training/online_optimizer.h
+++ b/training/online_optimizer.h
@@ -2,6 +2,7 @@
 #define _ONL_OPTIMIZE_H_
 
 #include <tr1/memory>
+#include <set>
 #include <string>
 #include <cmath>
 #include "sparse_vector.h"
@@ -56,8 +57,12 @@ class OnlineOptimizer {
  public:
   virtual ~OnlineOptimizer();
   OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                  size_t batch_size)
-    : N_(batch_size),schedule_(s),k_() {}
+                  size_t batch_size,
+                  const std::vector<int>& frozen_feats = std::vector<int>())
+      : N_(batch_size),schedule_(s),k_() {
+    for (int i = 0; i < frozen_feats.size(); ++i)
+      frozen_.insert(frozen_feats[i]);
+  }
   void ResetEpoch() { k_ = 0; ResetEpochImpl(); }
   void UpdateWeights(const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
     ++k_;
@@ -69,6 +74,7 @@ class OnlineOptimizer {
   virtual void ResetEpochImpl();
   virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) = 0;
   const size_t N_; // number of training instances per batch
+  std::set<int> frozen_;  // frozen (non-optimizing) features
 
  private:
   std::tr1::shared_ptr<LearningRateSchedule> schedule_;
@@ -78,8 +84,9 @@ class OnlineOptimizer {
 class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
  public:
   CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                              size_t training_instances, double C) :
-    OnlineOptimizer(s, training_instances), C_(C), u_() {}
+                              size_t training_instances, double C,
+                              const std::vector<int>& frozen) :
+    OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {}
 
  protected:
   void ResetEpochImpl() { u_ = 0; }
@@ -87,7 +94,7 @@ class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
     u_ += eta * C_ / N_;
     (*weights) += eta * approx_g;
     for (int i = 1; i < max_feat; ++i)
-      ApplyPenalty(i, weights);
+      if (frozen_.count(i) == 0) ApplyPenalty(i, weights);
   }
 
  private:
-- 
cgit v1.2.3


From 4482fe7a82e3f9a197bf65d60635885c4bfab195 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Thu, 17 Mar 2011 22:53:19 -0400
Subject: try 2

---
 training/online_optimizer.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/training/online_optimizer.h b/training/online_optimizer.h
index 61d62a37..28d89344 100644
--- a/training/online_optimizer.h
+++ b/training/online_optimizer.h
@@ -92,7 +92,11 @@ class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
   void ResetEpochImpl() { u_ = 0; }
   void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
     u_ += eta * C_ / N_;
-    (*weights) += eta * approx_g;
+    for (SparseVector<double>::const_iterator it = approx_g.begin(); 
+         it != approx_g.end(); ++it) {
+      if (frozen_.count(it->first) == 0)
+        weights->add_value(it->first, eta * it->second);
+    }
     for (int i = 1; i < max_feat; ++i)
       if (frozen_.count(i) == 0) ApplyPenalty(i, weights);
   }
-- 
cgit v1.2.3


From ed47102885e52c52146fc8631ff624779bd7eb0a Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 18 Mar 2011 10:36:26 -0400
Subject: compile fix

---
 Makefile.am               | 4 +++-
 training/optimize_test.cc | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index a808c211..bd46bd91 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,9 @@
 # warning - the subdirectories in the following list should
 # be kept in topologically sorted order. Also, DO NOT introduce
 # cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools
+
+#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
 
 AUTOMAKE_OPTIONS = foreign
 ACLOCAL_AMFLAGS = -I m4
diff --git a/training/optimize_test.cc b/training/optimize_test.cc
index 6fa5efd4..fe7ca70f 100644
--- a/training/optimize_test.cc
+++ b/training/optimize_test.cc
@@ -104,7 +104,7 @@ void TestOnline() {
   double eta0 = 0.2;
   shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
   //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
-  CumulativeL1OnlineOptimizer opt(r, N, C);
+  CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>());
   assert(r->eta(10) < r->eta(1));
 }
 
-- 
cgit v1.2.3


From 21a136afad4d1b04cddc3ff1e105b0fc7e9d8c2c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sun, 20 Mar 2011 16:16:11 -0400
Subject: prevent over-aggressive error checking in vest script

---
 vest/dist-vest.pl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index d17d7de1..80d2471e 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -417,7 +417,8 @@ while (1){
 		print STDERR "COMMAND:\n$cmd\n";
 		check_bash_call($cmd);
 		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
-		my $best=check_bash_output("$cmd"); chomp $best;
+		# The pipeline can report failure even though sort did its job: presumably head -1 exits early and sort gets SIGPIPE (TODO confirm), so don't check the status
+		my $best=unchecked_output("$cmd"); chomp $best;
 		print STDERR "$best\n";
 		my ($oa, $x, $xscore) = split /\|/, $best;
 		$score = $xscore;
@@ -450,7 +451,7 @@ while (1){
 			my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
 			print W "$k $v\n";
 		}
-		check_call("rm -rf $dir/splag.$im1");
+		check_call("rm $dir/splag.$im1/*");
 		$inweights = $finalFile;
 	}
 	$lastWeightsFile = "$dir/weights.$iteration";
-- 
cgit v1.2.3


From 78af1ef80f84023b4ff0661c47201850dbd46363 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kheafiel@cluster17.lti.ece.cmu.local>
Date: Mon, 21 Mar 2011 15:55:14 -0400
Subject: Update error handling and a corner case of trie.

---
 klm/lm/build_binary.cc | 102 +++++++++++++++++++++++--------------------------
 klm/lm/config.cc       |   2 +-
 klm/lm/config.hh       |   2 +-
 klm/lm/model.cc        |   2 +-
 klm/lm/search_trie.cc  |  20 +++++++---
 klm/lm/vocab.cc        |   2 +-
 klm/util/exception.cc  |   4 +-
 7 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index d6dd5994..920ff080 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,8 +15,9 @@ namespace ngram {
 namespace {
 
 void Usage(const char *name) {
-  std::cerr << "Usage: " << name << " [-u unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
-"-u sets the default probability for <unk> if the ARPA file does not have one.\n"
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
+"-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
+"one.\n"
 "-s allows models to be built even if they do not have <s> and </s>.\n\n"
 "type is one of probing, trie, or sorted:\n\n"
 "probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
@@ -69,65 +70,58 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
 } // namespace lm
 } // namespace
 
-void terminate_handler() {
-  try { throw; }
-  catch(const std::exception& e) {
-    std::cerr << e.what() << std::endl;
-  }
-  catch(...) {
-    std::cerr << "A non-standard exception was thrown." << std::endl;
-  }
-  std::abort();
-}
-
 int main(int argc, char *argv[]) {
   using namespace lm::ngram;
 
-  std::set_terminate(terminate_handler);
-
-  lm::ngram::Config config;
-  int opt;
-  while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
-    switch(opt) {
-      case 'u':
-        config.unknown_missing_prob = ParseFloat(optarg);
-        break;
-      case 'p':
-        config.probing_multiplier = ParseFloat(optarg);
-        break;
-      case 't':
-        config.temporary_directory_prefix = optarg;
-        break;
-      case 'm':
-        config.building_memory = ParseUInt(optarg) * 1048576;
-        break;
-      case 's':
-        config.sentence_marker_missing = lm::ngram::Config::SILENT;
-        break;
-      default:
-        Usage(argv[0]);
+  try {
+    lm::ngram::Config config;
+    int opt;
+    while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
+      switch(opt) {
+        case 'u':
+          config.unknown_missing_logprob = ParseFloat(optarg);
+          break;
+        case 'p':
+          config.probing_multiplier = ParseFloat(optarg);
+          break;
+        case 't':
+          config.temporary_directory_prefix = optarg;
+          break;
+        case 'm':
+          config.building_memory = ParseUInt(optarg) * 1048576;
+          break;
+        case 's':
+          config.sentence_marker_missing = lm::ngram::Config::SILENT;
+          break;
+        default:
+          Usage(argv[0]);
+      }
     }
-  }
-  if (optind + 1 == argc) {
-    ShowSizes(argv[optind], config);
-  } else if (optind + 2 == argc) {
-    config.write_mmap = argv[optind + 1];
-    ProbingModel(argv[optind], config);
-  } else if (optind + 3 == argc) {
-    const char *model_type = argv[optind];
-    const char *from_file = argv[optind + 1];
-    config.write_mmap = argv[optind + 2];
-    if (!strcmp(model_type, "probing")) {
-      ProbingModel(from_file, config);
-    } else if (!strcmp(model_type, "sorted")) {
-      SortedModel(from_file, config);
-    } else if (!strcmp(model_type, "trie")) {
-      TrieModel(from_file, config);
+    if (optind + 1 == argc) {
+      ShowSizes(argv[optind], config);
+    } else if (optind + 2 == argc) {
+      config.write_mmap = argv[optind + 1];
+      ProbingModel(argv[optind], config);
+    } else if (optind + 3 == argc) {
+      const char *model_type = argv[optind];
+      const char *from_file = argv[optind + 1];
+      config.write_mmap = argv[optind + 2];
+      if (!strcmp(model_type, "probing")) {
+        ProbingModel(from_file, config);
+      } else if (!strcmp(model_type, "sorted")) {
+        SortedModel(from_file, config);
+      } else if (!strcmp(model_type, "trie")) {
+        TrieModel(from_file, config);
+      } else {
+        Usage(argv[0]);
+      }
     } else {
       Usage(argv[0]);
     }
-  } else {
-    Usage(argv[0]);
+  }
+  catch (std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    abort();
   }
   return 0;
 }
diff --git a/klm/lm/config.cc b/klm/lm/config.cc
index d8773fe5..71646e51 100644
--- a/klm/lm/config.cc
+++ b/klm/lm/config.cc
@@ -10,7 +10,7 @@ Config::Config() :
   enumerate_vocab(NULL),
   unknown_missing(COMPLAIN),
   sentence_marker_missing(THROW_UP),
-  unknown_missing_prob(0.0),
+  unknown_missing_logprob(-100.0),
   probing_multiplier(1.5),
   building_memory(1073741824ULL), // 1 GB
   temporary_directory_prefix(NULL),
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 17f67df3..1f7762be 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -36,7 +36,7 @@ struct Config {
 
   // The probability to substitute for <unk> if it's missing from the model.  
   // No effect if the model has <unk> or unknown_missing == THROW_UP.
-  float unknown_missing_prob;
+  float unknown_missing_logprob;
 
   // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
   // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index 14949e97..1492276a 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -86,7 +86,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
     assert(config.unknown_missing != Config::THROW_UP);
     // Default probabilities for unknown.  
     search_.unigram.Unknown().backoff = 0.0;
-    search_.unigram.Unknown().prob = config.unknown_missing_prob;
+    search_.unigram.Unknown().prob = config.unknown_missing_logprob;
   }
   FinishFile(config, kModelType, counts, backing_);
 }
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 63631223..b830dfc3 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -535,13 +535,16 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
   }
 }
 
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, const std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
   {
     std::string unigram_name = file_prefix + "unigrams";
     util::scoped_fd unigram_file;
-    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), counts[0] * sizeof(ProbBackoff), unigram_file), counts[0] * sizeof(ProbBackoff));
+    // Reserve one extra slot: counts[0] is incremented below when <unk> was not seen.
+    size_t extra_count = counts[0] + 1;
+    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), extra_count * sizeof(ProbBackoff), unigram_file), extra_count * sizeof(ProbBackoff));
     Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()));
     CheckSpecials(config, vocab);
+    if (!vocab.SawUnk()) ++counts[0];
   }
 
   // Only use as much buffer as we need.  
@@ -572,7 +575,7 @@ bool HeadMatch(const WordIndex *words, const WordIndex *const words_end, const W
   return true;
 }
 
-// Counting phrase
+// Phase to count n-grams, including blanks inserted because they were pruned but have extensions
 class JustCount {
   public:
     JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, BitPackedMiddle * /*middle*/, BitPackedLongest &/*longest*/, uint64_t *counts, unsigned char order)
@@ -603,6 +606,7 @@ class JustCount {
     uint64_t *const counts_, *const longest_counts_;
 };
 
+// Phase to actually write n-grams to the trie.  
 class WriteEntries {
   public:
     WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) : 
@@ -764,7 +768,7 @@ template <class Doing> class RecursiveInsert {
 
 void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) {
   if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]);
-  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant");
+  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back());
   for (unsigned char i = 0; i < initial.size(); ++i) {
     if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected.  This shouldn't happen");
   }
@@ -789,6 +793,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
     RecursiveInsert<JustCount> counter(inputs, contexts, NULL, &*out.middle.begin(), out.longest, &*fixed_counts.begin(), counts.size());
     counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]);
   }
+  for (SortedFileReader *i = inputs; i < inputs + counts.size() - 1; ++i) {
+    if (!i->Ended()) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
+  }
   SanityCheckCounts(counts, fixed_counts);
   counts = fixed_counts;
 
@@ -805,7 +812,7 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
   }
 
   // Fill unigram probabilities.  
-  {
+  try {
     std::string name(file_prefix + "unigrams");
     util::scoped_FILE file(OpenOrThrow(name.c_str(), "r"));
     for (WordIndex i = 0; i < counts[0]; ++i) {
@@ -816,6 +823,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
       }
     }
     RemoveOrThrow(name.c_str());
+  } catch (util::Exception &e) {
+    e << " while re-reading unigram probabilities";
+    throw;
   }
 
   // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.   
diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc
index 415f8331..fd11ad2c 100644
--- a/klm/lm/vocab.cc
+++ b/klm/lm/vocab.cc
@@ -192,7 +192,7 @@ void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
     case Config::SILENT:
       return;
     case Config::COMPLAIN:
-      if (config.messages) *config.messages << "The ARPA file is missing <unk>.  Substituting probability " << config.unknown_missing_prob << "." << std::endl;
+      if (config.messages) *config.messages << "The ARPA file is missing <unk>.  Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl;
       break;
     case Config::THROW_UP:
       UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing <unk> and the model is configured to throw an exception.");
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 077405f4..84f9fe7c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -9,11 +9,11 @@ Exception::Exception() throw() {}
 Exception::~Exception() throw() {}
 
 Exception::Exception(const Exception &from) : std::exception() {
-  stream_.str(from.stream_.str());
+  stream_ << from.stream_.str();
 }
 
 Exception &Exception::operator=(const Exception &from) {
-  stream_.str(from.stream_.str());
+  if (this != &from) { stream_.str(""); stream_ << from.stream_.str(); }  // assignment replaces the old message (plain << would append to it); << leaves the put position at the end so later appends extend the text
   return *this;
 }
 
-- 
cgit v1.2.3


From 2d0c2c2bf7c35db2e16d16f39cb9749a04bc91df Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kheafiel@cluster10.lti.ece.cmu.local>
Date: Mon, 21 Mar 2011 22:00:29 -0400
Subject: compiler warnings

---
 klm/util/bit_packing.hh | 7 +++++--
 klm/util/have.hh        | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 70cfc2d2..5c71c792 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -28,16 +28,19 @@ namespace util {
  * but it may be called multiple times when that's inconvenient.  
  */
 
-inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
+
 // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.  
 #if BYTE_ORDER == LITTLE_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
   return bit;
+}
 #elif BYTE_ORDER == BIG_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
   return 64 - length - bit;
+}
 #else
 #error "Bit packing code isn't written for your byte order."
 #endif
-}
 
 /* Pack integers up to 57 bits using their least significant digits. 
  * The length is specified using mask:
diff --git a/klm/util/have.hh b/klm/util/have.hh
index 7cf62008..f2f0cf90 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -2,8 +2,14 @@
 #ifndef UTIL_HAVE__
 #define UTIL_HAVE__
 
+#ifndef HAVE_ZLIB
 #define HAVE_ZLIB
+#endif
+
 // #define HAVE_ICU
+
+#ifndef HAVE_BOOST
 #define HAVE_BOOST
+#endif
 
 #endif // UTIL_HAVE__
-- 
cgit v1.2.3


From da6c892bc05a5520910e23089d83ceb1f2a0fbb4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 21 Mar 2011 22:10:02 -0400
Subject: add support for normalized 'summary features'- seemingly sound way of
 dealing with normalization problems in embedded crf translation models

---
 decoder/decoder.cc | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 95ff6270..8a03c5c9 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -141,12 +141,13 @@ inline shared_ptr<FsaFeatureFunction> make_fsa_ff(string const& ffp,bool verbose
 // and then prune the resulting (rescored) hypergraph. All feature values from previous
 // passes are carried over into subsequent passes (where they may have different weights).
 struct RescoringPass {
-  RescoringPass() : density_prune(), beam_prune() {}
+  RescoringPass() : fid_summary(), density_prune(), beam_prune() {}
   shared_ptr<ModelSet> models;
   shared_ptr<IntersectionConfiguration> inter_conf;
   vector<const FeatureFunction*> ffs;
   shared_ptr<Weights> w;      // null == use previous weights
   vector<double> weight_vector;
+  int fid_summary;            // 0 == no summary feature
   double density_prune;       // 0 == don't density prune
   double beam_prune;          // 0 == don't beam prune
 };
@@ -155,6 +156,7 @@ ostream& operator<<(ostream& os, const RescoringPass& rp) {
   os << "[num_fn=" << rp.ffs.size();
   if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; }
   if (rp.w) os << " new_weights";
+  if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary);
   if (rp.density_prune) os << " density_prune=" << rp.density_prune;
   if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune;
   os << ']';
@@ -361,18 +363,21 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("weights,w",po::value<string>(),"Feature weights file (initial forest / pass 1)")
         ("feature_function,F",po::value<vector<string> >()->composing(), "Pass 1 additional feature function(s) (-L for list)")
         ("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Pass 1 intersection strategy for incorporating finite-state features; values include Cube_pruning, Full")
+        ("summary_feature", po::value<string>(), "Compute a 'summary feature' at the end of the pass (before any pruning) with name=arg and value=inside-outside/Z")
         ("density_prune", po::value<double>(), "Pass 1 pruning: keep no more than this many times the number of edges used in the best derivation tree (>=1.0)")
         ("beam_prune", po::value<double>(), "Pass 1 pruning: Prune paths from scored forest, keep paths within exp(alpha>=0)")
 
         ("weights2",po::value<string>(),"Optional pass 2")
         ("feature_function2",po::value<vector<string> >()->composing(), "Optional pass 2")
         ("intersection_strategy2",po::value<string>()->default_value("cube_pruning"), "Optional pass 2")
+        ("summary_feature2", po::value<string>(), "Optional pass 2")
         ("density_prune2", po::value<double>(), "Optional pass 2")
         ("beam_prune2", po::value<double>(), "Optional pass 2")
 
         ("weights3",po::value<string>(),"Optional pass 3")
         ("feature_function3",po::value<vector<string> >()->composing(), "Optional pass 3")
         ("intersection_strategy3",po::value<string>()->default_value("cube_pruning"), "Optional pass 3")
+        ("summary_feature3", po::value<string>(), "Optional pass 3")
         ("density_prune3", po::value<double>(), "Optional pass 3")
         ("beam_prune3", po::value<double>(), "Optional pass 3")
 
@@ -559,6 +564,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   for (int pass = 0; pass < MAX_PASSES; ++pass) {
     string ws = "weights" + StringSuffixForRescoringPass(pass);
     string ff = "feature_function" + StringSuffixForRescoringPass(pass);
+    string sf = "summary_feature" + StringSuffixForRescoringPass(pass);
     string bp = "beam_prune" + StringSuffixForRescoringPass(pass);
     string dp = "density_prune" + StringSuffixForRescoringPass(pass);
     bool first_pass_condition = ((pass == 0) && (conf.count(ff) || conf.count(bp) || conf.count(dp)));
@@ -583,6 +589,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
           if (p->IsStateful()) { has_stateful = true; }
         }
       }
+      if (conf.count(sf)) {
+        rp.fid_summary = FD::Convert(conf[sf].as<string>());
+        assert(rp.fid_summary > 0);
+        // TODO assert that weights for this pass have coef(fid_summary) == 0.0?
+      }
       if (conf.count(bp)) { rp.beam_prune = conf[bp].as<double>(); }
       if (conf.count(dp)) { rp.density_prune = conf[dp].as<double>(); }
       int palg = (has_stateful ? 1 : 0);  // if there are no stateful featueres, default to FULL
@@ -794,6 +805,15 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       cerr << "  " << passtr << " partition     log(Z): " << log(z) << endl;
     }
 
+    if (rp.fid_summary) {
+      Hypergraph::EdgeProbs posteriors;
+      const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors);
+      if (!SILENT) { cerr << "  " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
+      assert(forest.edges_.size() == posteriors.size());
+      for (int i = 0; i < posteriors.size(); ++i)
+        forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z));
+    }
+
     string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass);
     string fulldp = "density_prune" + StringSuffixForRescoringPass(pass);
     maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen);
-- 
cgit v1.2.3


From 4bc9ea17ba9f85c899e35a9d657ee3f174ff2863 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 22 Mar 2011 11:35:45 -0400
Subject: check for infs

---
 decoder/decoder.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 8a03c5c9..a16a9b5a 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -810,8 +810,12 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors);
       if (!SILENT) { cerr << "  " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
       assert(forest.edges_.size() == posteriors.size());
-      for (int i = 0; i < posteriors.size(); ++i)
-        forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z));
+      if (!isfinite(log(z)) || isnan(log(z))) {
+        cerr << "  " << passtr << " !!! Invalid partition detected, abandoning.\n";
+      } else {
+        for (int i = 0; i < posteriors.size(); ++i)
+          forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z));
+      }
     }
 
     string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass);
-- 
cgit v1.2.3


From c0ae6f362b245ccf2ab3b8d6dc7e367cbcc64c1c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 22 Mar 2011 14:44:46 -0400
Subject: fix local normalizer code for summary features

---
 decoder/decoder.cc | 16 +++++++++++-----
 decoder/hg.cc      |  5 +++--
 decoder/hg.h       |  2 +-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index a16a9b5a..89425198 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -806,15 +806,21 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
     }
 
     if (rp.fid_summary) {
-      Hypergraph::EdgeProbs posteriors;
-      const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors);
+      const prob_t z = forest.PushWeightsToGoal(1.0);
       if (!SILENT) { cerr << "  " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
-      assert(forest.edges_.size() == posteriors.size());
       if (!isfinite(log(z)) || isnan(log(z))) {
         cerr << "  " << passtr << " !!! Invalid partition detected, abandoning.\n";
       } else {
-        for (int i = 0; i < posteriors.size(); ++i)
-          forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z));
+        for (int i = 0; i < forest.edges_.size(); ++i) {
+          const double log_prob_transition = log(forest.edges_[i].edge_prob_); // locally normalized by the edge
+                                                                            // head node by forest.PushWeightsToGoal
+          if (!isfinite(log_prob_transition) || isnan(log_prob_transition)) {
+            cerr << "Edge: i=" << i << " got bad inside prob: " << *forest.edges_[i].rule_ << endl;
+            abort();
+          }
+
+          forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition);
+        }
       }
     }
 
diff --git a/decoder/hg.cc b/decoder/hg.cc
index 39ac5132..a4028b0e 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -226,9 +226,9 @@ prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) {
 }
 
 
-void Hypergraph::PushWeightsToGoal(double scale) {
+prob_t Hypergraph::PushWeightsToGoal(double scale) {
   vector<prob_t> posts;
-  ComputeEdgePosteriors(scale, &posts);
+  const prob_t inside_z = ComputeEdgePosteriors(scale, &posts);
   for (int i = 0; i < nodes_.size(); ++i) {
     const Hypergraph::Node& node = nodes_[i];
     prob_t z = prob_t::Zero();
@@ -238,6 +238,7 @@ void Hypergraph::PushWeightsToGoal(double scale) {
       edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z;
     }
   }
+  return inside_z;
 }
 
 struct EdgeExistsWeightFunction {
diff --git a/decoder/hg.h b/decoder/hg.h
index aa1202b1..e5ef05f8 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -449,7 +449,7 @@ public:
   void PushWeightsToSource(double scale = 1.0);
   // same, except weights are pushed to the goal, works for HGs,
   // not just lattices
-  void PushWeightsToGoal(double scale = 1.0);
+  prob_t PushWeightsToGoal(double scale = 1.0);
 
   // contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid.  note that p_viterbi becomes 1; k*p_viterbi becomes k.  also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight)
   // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after).  returns inside(goal)
-- 
cgit v1.2.3


From 12ece6ddfa91ec61cdeee698db2c7edac941e096 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 22 Mar 2011 16:48:28 -0400
Subject: reweight after weight pushing to avoid weird output

---
 decoder/decoder.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 89425198..ac063659 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -821,6 +821,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
 
           forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition);
         }
+        forest.Reweight(cur_weights);  // reset weights
       }
     }
 
-- 
cgit v1.2.3


From 57a218e86e30d57d9795bccd280737c431f6b4e4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 23 Mar 2011 12:15:55 -0400
Subject: yet another feature attempt

---
 decoder/decoder.cc | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index ac063659..b7774acc 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -806,6 +806,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
     }
 
     if (rp.fid_summary) {
+#if 0
       const prob_t z = forest.PushWeightsToGoal(1.0);
       if (!SILENT) { cerr << "  " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
       if (!isfinite(log(z)) || isnan(log(z))) {
@@ -823,6 +824,26 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
         }
         forest.Reweight(cur_weights);  // reset weights
       }
+#endif
+      Hypergraph::EdgeProbs posts;
+      const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts);
+      if (!isfinite(log(z)) || isnan(log(z))) {
+        cerr << "  " << passtr << " !!! Invalid partition detected, abandoning.\n";
+      } else {
+        for (int i = 0; i < forest.nodes_.size(); ++i) {
+          const Hypergraph::EdgesVector& in_edges = forest.nodes_[i].in_edges_;
+          prob_t node_post = prob_t(0);
+          for (int j = 0; j < in_edges.size(); ++j)
+            node_post += (posts[in_edges[j]] / z);
+          const double log_np = log(node_post);
+          if (!isfinite(log_np) || isnan(log_np)) {
+            cerr << "got bad posterior prob for node " << i << endl;
+            abort();
+          }
+          for (int j = 0; j < in_edges.size(); ++j)
+            forest.edges_[in_edges[j]].feature_values_.set_value(rp.fid_summary, exp(log_np));
+        }
+      }
     }
 
     string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass);
-- 
cgit v1.2.3


From 918ed4bf919a55e3eb5d99d98c9b915921dc11ab Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 23 Mar 2011 22:53:44 -0400
Subject: remove thread-local stuff which was fragile on some build systems

---
 decoder/trule.cc    |  3 +--
 utils/static_utoa.h |  2 +-
 utils/tdict.cc      |  1 -
 utils/threadlocal.h | 71 -----------------------------------------------------
 4 files changed, 2 insertions(+), 75 deletions(-)
 delete mode 100755 utils/threadlocal.h

diff --git a/decoder/trule.cc b/decoder/trule.cc
index fda62741..40235542 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -5,7 +5,6 @@
 #include "stringlib.h"
 #include "tdict.h"
 #include "rule_lexer.h"
-#include "threadlocal.h"
 
 using namespace std;
 
@@ -99,7 +98,7 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) {
 
 namespace {
 // callback for lexer
-THREADLOCAL int n_assigned=0;
+int n_assigned=0;
 void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
   TRule *assignto=(TRule *)extra;
   *assignto=*new_rule;
diff --git a/utils/static_utoa.h b/utils/static_utoa.h
index d15ed35b..bb3d821f 100755
--- a/utils/static_utoa.h
+++ b/utils/static_utoa.h
@@ -7,7 +7,7 @@
 namespace {
 static const int utoa_bufsize=40; // 64bit safe.
 static const int utoa_bufsizem1=utoa_bufsize-1; // 64bit safe.
-THREADLOCAL char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20]
+static char utoa_buf[utoa_bufsize]; // one scratch buffer per translation unit, shared by all threads (no longer thread-local). NOTE(review): the old comment put the terminator at buf[20], which does not match utoa_bufsize=40 - presumably utoa_bufsizem1; confirm
 }
 
 inline char *static_utoa(unsigned n) {
diff --git a/utils/tdict.cc b/utils/tdict.cc
index 23a298f8..c21b2b48 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -8,7 +8,6 @@
 #include "dict.h"
 #include "tdict.h"
 #include "stringlib.h"
-#include "threadlocal.h"
 
 using namespace std;
 
diff --git a/utils/threadlocal.h b/utils/threadlocal.h
deleted file mode 100755
index d79f5d9d..00000000
--- a/utils/threadlocal.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef THREADLOCAL_H
-#define THREADLOCAL_H
-
-#ifndef SETLOCAL_SWAP
-# define SETLOCAL_SWAP 0
-#endif
-
-#ifdef BOOST_NO_MT
-
-# define THREADLOCAL
-
-#else
-
-#ifdef _MSC_VER
-
-//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html)
-# define THREADLOCAL __declspec(thread)
-
-#else
-
-# define THREADLOCAL __thread
-
-#endif
-
-#endif
-
-#include <algorithm> //swap
-
-// naturally, the below are only thread-safe if value is THREADLOCAL
-template <class D>
-struct SaveLocal {
-    D &value;
-    D old_value;
-    SaveLocal(D& val) : value(val), old_value(val) {}
-    ~SaveLocal() {
-#if SETLOCAL_SWAP
-      swap(value,old_value);
-#else
-      value=old_value;
-#endif
-    }
-};
-
-template <class D>
-struct SetLocal {
-    D &value;
-    D old_value;
-    SetLocal(D& val,const D &new_value) : value(val), old_value(
-#if SETLOCAL_SWAP
-      new_value
-#else
-      val
-#endif
-      ) {
-#if SETLOCAL_SWAP
-      swap(value,old_value);
-#else
-      value=new_value;
-#endif
-    }
-    ~SetLocal() {
-#if SETLOCAL_SWAP
-      swap(value,old_value);
-#else
-      value=old_value;
-#endif
-    }
-};
-
-
-#endif
-- 
cgit v1.2.3


From e03a6c2b2e3cc21d75904300d34249cd1e2e032b Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 23 Mar 2011 22:55:50 -0400
Subject: refactor makefile

---
 decoder/Makefile.am | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index e1dba497..244da2de 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -6,16 +6,13 @@ noinst_PROGRAMS = \
   hg_test \
   ff_test \
   parser_test \
-  grammar_test \
-  cfg_test
-TESTS = trule_test ff_test parser_test grammar_test hg_test cfg_test
-endif
-
-cdec_SOURCES = cdec.cc
-cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-cfg_test_SOURCES = cfg_test.cc
-cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+  grammar_test
+ 
+ # cfg_test
+TESTS = trule_test ff_test parser_test grammar_test hg_test
+# cfg_test
+#cfg_test_SOURCES = cfg_test.cc
+#cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 parser_test_SOURCES = parser_test.cc
 parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 ff_test_SOURCES = ff_test.cc
@@ -26,6 +23,11 @@ hg_test_SOURCES = hg_test.cc
 hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 trule_test_SOURCES = trule_test.cc
 trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+endif
+
+cdec_SOURCES = cdec.cc
+cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 AM_CPPFLAGS = -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
 
 rule_lexer.cc: rule_lexer.l
-- 
cgit v1.2.3


From 972c40c819bb1e6ea8c78eb2e067f014713adf86 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 23 Mar 2011 23:00:32 -0400
Subject: fail if no lex program is found (i think)

---
 configure.ac | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/configure.ac b/configure.ac
index 56f08147..b323576f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,6 +3,9 @@ AM_INIT_AUTOMAKE(cdec,0.1)
 AC_CONFIG_HEADERS(config.h)
 AC_PROG_LIBTOOL
 AC_PROG_LEX
+case $LEX in 
+:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);; 
+esac 
 AC_PROG_CC
 AC_PROG_CXX
 AC_LANG_CPLUSPLUS
-- 
cgit v1.2.3


From a580faa8177331cf51138a2208e276b703470934 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 23 Mar 2011 23:12:31 -0400
Subject: remove dependency on boost thread library

---
 configure.ac | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index b323576f..da66c3fb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -11,10 +11,12 @@ AC_PROG_CXX
 AC_LANG_CPLUSPLUS
 BOOST_REQUIRE
 BOOST_PROGRAM_OPTIONS
-BOOST_THREADS
+#BOOST_THREADS
 CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
+# $BOOST_THREAD_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
+# $BOOST_THREAD_LIBS"
 
 AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
                [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
-- 
cgit v1.2.3