From 8aa29810bb77611cc20b7a384897ff6703783ea1 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Sun, 18 Nov 2012 13:35:42 -0500
Subject: major restructure of the training code

---
 word-aligner/makefiles/makefile.grammars | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'word-aligner/makefiles/makefile.grammars')
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 08ff33e1..ce3e1638 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -16,7 +16,7 @@ STEM_E = $(SCRIPT_DIR)/stemmers/$(E_LANG).pl
 
 CLASSIFY = $(SUPPORT_DIR)/classify.pl
 MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
-MODEL1 = $(TRAINING_DIR)/fast_align
+MODEL1 = $(SCRIPT_DIR)/fast_align
 MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
 
 e.voc: corpus.e
-- 
cgit v1.2.3


From 7493f3b3b8a7d398f7f959a974ce1cf878dbe613 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Tue, 20 Nov 2012 13:56:08 -0500
Subject: fixes for 2011 optimizer

---
 training/crf/mpi_online_optimize.cc       | 12 +++++++++++-
 word-aligner/aligner.pl                   | 19 ++++++++++++++++++-
 word-aligner/makefiles/makefile.grammars  |  2 +-
 word-aligner/makefiles/makefile.model.f-e | 14 ++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 word-aligner/makefiles/makefile.model.f-e

(limited to 'word-aligner/makefiles/makefile.grammars')

diff --git a/training/crf/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc
index d6968848..9e1ae34c 100644
--- a/training/crf/mpi_online_optimize.cc
+++ b/training/crf/mpi_online_optimize.cc
@@ -5,6 +5,7 @@
 #include <cassert>
 #include <cmath>
 #include <tr1/memory>
+#include <ctime>
 
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
@@ -41,6 +42,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
         ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
         ("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)")
+        ("max_walltime", po::value<unsigned>(), "Maximum walltime to run (in minutes)")
         ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
         ("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)")
         ("L1,1","Use L1 regularization")
@@ -304,6 +306,9 @@ int main(int argc, char** argv) {
   int write_weights_every_ith = 100; // TODO configure
   int titer = -1;
 
+  unsigned timeout = 0;
+  if (conf.count("max_walltime")) timeout = 60 * conf["max_walltime"].as<unsigned>();
+  const time_t start_time = time(NULL);
   for (int ai = 0; ai < agenda.size(); ++ai) {
     const string& cur_config = agenda[ai].first;
     const unsigned max_iteration = agenda[ai].second;
@@ -336,9 +341,14 @@ int main(int argc, char** argv) {
           ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
           fname = o.str();
         }
+        const time_t cur_time = time(NULL);
+        if (timeout) {
+          if ((cur_time - start_time) > timeout) converged = true;
+        }
         if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
         ostringstream vv;
-        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
+        double minutes = (cur_time - start_time) / 60.0;
+        vv << "total walltime=" << minutes << "min iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
         const string svv = vv.str();
         cerr << svv << endl;
         Weights::WriteToFile(fname, lambdas, true, &svv);
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index c5078645..cbccb94a 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -51,6 +51,8 @@ while(<IN>) {
   chomp;
   my ($f, $e) = split / \|\|\| /;
   die "Bad format, excepted ||| separated line" unless defined $f && defined $e;
+  $f =~ s/\[/(/g;
+  $e =~ s/\]/)/g;
   print F "$f\n";
   print E "$e\n";
 }
@@ -80,6 +82,11 @@ NCLASSES = $num_classes
 TARGETS = @targets
 PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
 PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
+#MPIJOBS = 4
+#MPIRUN = mpirun -np \$(MPIJOBS)
+MPIRUN=
+# Optimizer walltime limit in minutes (override with: make WALLTIME=120)
+WALLTIME=90
 
 export
 
@@ -99,7 +106,15 @@ clean:
 EOT
 close TOPLEVEL;
 
-print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+print STDERR <<EOT;
+Created alignment task. To start, run:
+cd talign/
+make
+
+To specify the walltime *in minutes* used by the optimizer, use
+make WALLTIME=120
+
+EOT
 exit 0;
 
 sub make_stage {
@@ -142,6 +157,8 @@ EOT
   open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
   print AGENDA "cdec.ini $TRAINING_ITERATIONS\n";
   close AGENDA;
+  `cp $SCRIPT_DIR/makefiles/makefile.model.$direction $stage_dir/Makefile`;
+  die unless $? == 0;
 }
 
 sub usage {
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index ce3e1638..8d3ea8cb 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -4,7 +4,7 @@ clean:
 	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs*
 
 SUPPORT_DIR = $(SCRIPT_DIR)/support
-GZIP = /usr/bin/gzip
+GZIP = gzip
 ZCAT = zcat
 EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
diff --git a/word-aligner/makefiles/makefile.model.f-e b/word-aligner/makefiles/makefile.model.f-e
new file mode 100644
index 00000000..404f5b30
--- /dev/null
+++ b/word-aligner/makefiles/makefile.model.f-e
@@ -0,0 +1,14 @@
+all: output.f-e.aligned
+
+clean:
+	$(RM) output.f-e.a output.f-e.aligned weights.cur.gz
+
+CDEC = $(SCRIPT_DIR)/../decoder/cdec
+OPTIMIZE = $(SCRIPT_DIR)/../training/crf/mpi_online_optimize
+# WALLTIME (minutes) is taken from the invoking make, e.g. `make WALLTIME=120`; it is not defaulted here.
+weights.cur.gz: ../grammars/wordpairs.f-e.features.gz
+	$(MPIRUN) $(OPTIMIZE) -a agenda.txt -1 -C 1.0 -t ../grammars/corpus.f-e --max_walltime $(WALLTIME)
+
+output.f-e.aligned: weights.cur.gz
+	$(CDEC) -c cdec.ini -w $< --lextrans_align_only -i ../grammars/corpus.f-e -a > $@
+
-- 
cgit v1.2.3