From 3dfa575d202c9277060bc43a7af9351702da9f12 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 09:05:04 -0500 Subject: fix my dumb bug that killed qsub functionality --- vest/dist-vest.pl | 2 -- vest/parallelize.pl | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 973a29ef..f6f661b9 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -194,7 +194,6 @@ sub modbin { my $src=$$_; $$_="$bindir/".basename($src); check_call("cp -p $src $$_"); - die "cp $src $$_ failed: $!" unless $? == 0; } } sub dirsize { @@ -374,7 +373,6 @@ while (1){ $nmappers++; my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; my $jobid = check_output("$qcmd"); - die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0; chomp $jobid; $jobid =~ s/^(\d+)(.*?)$/\1/g; $jobid =~ s/^Your job (\d+) .*$/\1/; diff --git a/vest/parallelize.pl b/vest/parallelize.pl index 47b77c79..2798a303 100755 --- a/vest/parallelize.pl +++ b/vest/parallelize.pl @@ -82,7 +82,7 @@ sub preview_files { my @f=grep { ! ($skipempty && -z $_) } @$l; my $fn=join(' ',map {escape_shell($_)} @f); my $cmd="tail -n $n $fn"; - check_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); } sub prefix_dirname($) { #like `dirname but if ends in / then return the whole thing @@ -323,7 +323,7 @@ sub launch_job { } if ($joblist == "") { $joblist = $jobid; } else {$joblist = $joblist . "\|" . $jobid; } - my $cleanfn=check_output("qdel $jobid 2> /dev/null"); + my $cleanfn="qdel $jobid 2> /dev/null"; push(@cleanup_cmds, $cleanfn); } close QOUT; -- cgit v1.2.3 From 13b15df6a00137395eae03ba3f33a987a916257b Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:00:53 -0500 Subject: another dumb bug involving cleanup being executed preemptively --- vest/dist-vest.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index f6f661b9..c27af804 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -371,12 +371,12 @@ while (1){ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } $nmappers++; - my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; my $jobid = check_output("$qcmd"); chomp $jobid; $jobid =~ s/^(\d+)(.*?)$/\1/g; $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, check_output("qdel $jobid 2> /dev/null")); + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); print STDERR " $jobid"; if ($joblist == "") { $joblist = $jobid; } else {$joblist = $joblist . "\|" . $jobid; } -- cgit v1.2.3 From 702df29b83ca10998ea3a8f84bc2e0e6c9e86eea Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:01:09 -0500 Subject: ignore emacs temp files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3892891c..2a287bbc 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,4 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar *.dvi *.ps *.toc +*~ \ No newline at end of file -- cgit v1.2.3 From 25d5729b850d1dc62eaf151b5550bd83963b08e8 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:09:18 -0500 Subject: dont die when there are no running jobs --- vest/dist-vest.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index c27af804..cfddf61c 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#grep!/usr/bin/env perl use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); @@ -396,7 +396,7 @@ while (1){ print STDERR "Waiting for mappers to complete...\n"; while ($nmappers > 0) { sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | grep -v ' C '"))); + my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | awk '{if($0 !~ \" C \"){print}}'"))); $nmappers = scalar @livejobs; } print STDERR "All mappers complete.\n"; -- cgit v1.2.3 From db200aeefcfad33e789a8790961ef5c0f66d8ba3 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:13:20 -0500 Subject: fail --- vest/dist-vest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index cfddf61c..6a5959dc 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -1,4 +1,4 @@ -#grep!/usr/bin/env perl +#!/usr/bin/env perl use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -- cgit v1.2.3 From 92ca6e23b39043ad026c07a5aab71ffc750c1db2 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:22:31 -0500 Subject: just use grep and dont check return code --- vest/dist-vest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 6a5959dc..f95754dc 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -396,7 +396,7 @@ while (1){ print STDERR "Waiting for mappers to complete...\n"; while ($nmappers > 0) { sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | awk '{if($0 !~ \" C \"){print}}'"))); + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); $nmappers = scalar @livejobs; } print STDERR "All mappers complete.\n"; -- cgit v1.2.3 From 6b25a85dd45af5982e07577b33c64e3b577579c3 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 10:27:43 -0500 Subject: dont fail on possibly temporary qstat errors --- vest/parallelize.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vest/parallelize.pl b/vest/parallelize.pl index 2798a303..c2526503 100755 --- a/vest/parallelize.pl +++ b/vest/parallelize.pl @@ -283,7 +283,8 @@ sub numof_live_jobs { if ($use_fork) { die "not implemented"; } else { - my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat"))); + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); return ($#livejobs + 1); } } -- cgit v1.2.3 From eda8d83cd957463d32980da7c60085a820f7eae0 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Fri, 11 Mar 2011 11:06:48 -0500 Subject: be more verbose when running each child decoder process when forking. also, avoid some non-bash errors --- vest/parallelize.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vest/parallelize.pl b/vest/parallelize.pl index c2526503..b4783f91 100755 --- a/vest/parallelize.pl +++ b/vest/parallelize.pl @@ -347,7 +347,7 @@ sub launch_job_fork { my ($fh, $scr_name) = get_temp_script(); print $fh $script; close $fh; - my $todo = "/bin/sh $scr_name 1> $outfile 2> $errorfile"; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; print STDERR "EXEC: $todo\n"; my $out = check_output("$todo"); print STDERR "RES: $out\n"; -- cgit v1.2.3 From dccf47501f078a354375b9f3edd481d8c8d30268 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Mon, 14 Mar 2011 17:03:51 -0400 Subject: more paranoid checking when (idiot/time-crunched) user tries to define his own tags during tuning --- vest/dist-vest.pl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index f95754dc..d17d7de1 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -573,7 +573,11 @@ sub enseg { while (my $line=){ chomp $line; if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } } else { print NEWSRC "$line\n"; } -- cgit v1.2.3 From 5d0f3c6aa4e78aea09952a7a65f61d3c4dce0a0e Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Mon, 14 Mar 2011 17:05:14 -0400 Subject: Fix wordset to override features() so that we can safely use multiple instances of it --- decoder/ff_wordset.h | 1 + 1 file changed, 1 insertion(+) diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h index 00e1145b..643097ef 100644 --- a/decoder/ff_wordset.h +++ b/decoder/ff_wordset.h @@ -32,6 +32,7 @@ class WordSet : public FeatureFunction { ~WordSet() { } + Features features() const { return single_feature(fid_); } protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, -- cgit v1.2.3 From 237de3db6d5917707b745e3df7be42f2497e3783 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Mon, 14 Mar 2011 17:39:04 -0400 Subject: Get enough compiling with scons to finish off the emnlp paper --- SConstruct | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/SConstruct b/SConstruct index 1a7885bc..c21d85d5 100644 --- a/SConstruct +++ b/SConstruct @@ -11,6 +11,9 @@ AddOption('--with-glc', dest='glc', type='string', nargs=1, action='store', meta AddOption('--efence', dest='efence', action='store_true', help='use electric fence for debugging memory corruptions') +# TODO: Troll http://www.scons.org/wiki/SconsAutoconf +# for some initial autoconf-like steps + platform = ARGUMENTS.get('OS', Platform()) include = Split('decoder utils klm mteval .') env = Environment(PREFIX=GetOption('prefix'), @@ -45,7 +48,7 @@ if glc: srcs.append(glc+'/feature-factory.cc') srcs.append(glc+'/cdec/ff_glc.cc') -for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc']: +for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc', 'vest/*.cc']: srcs.extend([ file for file in Glob(pattern) if not 'test' in str(file) and 'build_binary.cc' not in str(file) @@ -53,6 +56,30 @@ for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mtev and 'mbr_kbest.cc' not in str(file) and 'sri.cc' not in str(file) and 'fast_score.cc' not in str(file) + and 'cdec.cc' not in str(file) + and 'mr_' not in str(file) ]) -env.Program(target='decoder/cdec', source=srcs) +print 'Found {0} source files'.format(len(srcs)) +def comb(cc, srcs): + x = [cc] + x.extend(srcs) + return x + +env.Program(target='decoder/cdec', source=comb('decoder/cdec.cc', srcs)) +# TODO: The various decoder tests +# TODO: extools +env.Program(target='klm/lm/build_binary', source=comb('klm/lm/build_binary.cc', srcs)) +# TODO: klm ngram_query and tests +env.Program(target='mteval/fast_score', source=comb('mteval/fast_score.cc', srcs)) +env.Program(target='mteval/mbr_kbest', source=comb('mteval/mbr_kbest.cc', srcs)) +#env.Program(target='mteval/scorer_test', source=comb('mteval/fast_score.cc', srcs)) +# TODO: phrasinator +# TODO: Various training binaries +env.Program(target='vest/sentserver', source=['vest/sentserver.c'], LINKFLAGS='-all-static') +env.Program(target='vest/sentclient', source=['vest/sentclient.c'], LINKFLAGS='-all-static') +env.Program(target='vest/mr_vest_generate_mapper_input', source=comb('vest/mr_vest_generate_mapper_input.cc', srcs)) +env.Program(target='vest/mr_vest_map', source=comb('vest/mr_vest_map.cc', srcs)) +env.Program(target='vest/mr_vest_reduce', source=comb('vest/mr_vest_reduce.cc', srcs)) +#env.Program(target='vest/lo_test', source=comb('vest/lo_test.cc', srcs)) +# TODO: util tests -- cgit v1.2.3 From 6b6eeff3130bcb40980886d8179ba4ad6842325e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 16 Mar 2011 19:48:41 -0400 Subject: explicit markers turned on by default --- decoder/ff_klm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index adc2c8bf..62908cdc 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -21,7 +21,7 @@ static const unsigned char MASK = 7; // -n NAME : feature id is NAME bool ParseLMArgs(string const& in, string* filename, string* mapfile, bool* explicit_markers, string* featname) { vector const& argv=SplitOnWhitespace(in); - *explicit_markers = true; + *explicit_markers = false; *featname="LanguageModel"; *mapfile = ""; #define LMSPEC_NEXTARG if (i==argv.end()) { \ -- cgit v1.2.3 From 95e50962fe307b930e835513e4d9998df91426a4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 16 Mar 2011 20:30:37 -0400 Subject: possible mert bug with rules with alignments --- decoder/trule.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/decoder/trule.cc b/decoder/trule.cc index 9820e6d5..fda62741 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -145,7 +145,9 @@ bool TRule::ReadFromString(const string& line, bool strict, bool mono) { getline(is, ss); //cerr << "L: " << ss << endl; int start = 0; - const int len = ss.size(); + int len = ss.size(); + const size_t ppos = ss.find(" |||"); + if (ppos != string::npos) { len = ppos; } while (start < len) { while(start < len && (ss[start] == ' ' || ss[start] == ';')) ++start; -- cgit v1.2.3 From 9f78539edbbe00feeee618932fc5d51f5c5b9eb4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 17 Mar 2011 22:29:43 -0400 Subject: enable weights to be frozen during training --- training/mpi_online_optimize.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 325ba030..1367581a 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -64,6 +64,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("input_weights,w",po::value(),"Input feature weights file") + ("frozen_features,z",po::value(), "List of features not to optimize") ("training_data,t",po::value(),"Training data corpus") ("training_agenda,a",po::value(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively") ("minibatch_size_per_proc,s", po::value()->default_value(5), "Number of training instances evaluated per processor in each minibatch") @@ -254,6 +255,20 @@ int main(int argc, char** argv) { if (conf.count("input_weights")) weights.InitFromFile(conf["input_weights"].as()); + vector frozen_fids; + if (conf.count("frozen_features")) { + ReadFile rf(conf["frozen_features"].as()); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (line.empty()) continue; + if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); } + frozen_fids.push_back(FD::Convert(line)); + } + if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n"; + } + vector corpus; vector ids; ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); @@ -362,6 +377,8 @@ int main(int argc, char** argv) { g.swap(local_grad); #endif local_grad.clear(); + for (int i = 0; i < frozen_fids.size(); ++i) + g.erase(frozen_fids[i]); if (rank == 0) { g /= (size_per_proc * size); o->UpdateWeights(g, FD::NumFeats(), &x); -- cgit v1.2.3 From 7079e3685def6f231ecf9f0c3f31b5c03a46d858 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 17 Mar 2011 22:46:35 -0400 Subject: freeze features, including penalty --- training/mpi_online_optimize.cc | 4 +--- training/online_optimizer.h | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 1367581a..32033c19 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -299,7 +299,7 @@ int main(int argc, char** argv) { const string omethod = conf["optimization_method"].as(); if (omethod == "sgd") { const double C = conf["regularization_strength"].as(); - o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C)); + o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids)); } else { assert(!"fail"); } @@ -377,8 +377,6 @@ int main(int argc, char** argv) { g.swap(local_grad); #endif local_grad.clear(); - for (int i = 0; i < frozen_fids.size(); ++i) - g.erase(frozen_fids[i]); if (rank == 0) { g /= (size_per_proc * size); o->UpdateWeights(g, FD::NumFeats(), &x); diff --git a/training/online_optimizer.h b/training/online_optimizer.h index 312aabae..61d62a37 100644 --- a/training/online_optimizer.h +++ b/training/online_optimizer.h @@ -2,6 +2,7 @@ #define _ONL_OPTIMIZE_H_ #include +#include #include #include #include "sparse_vector.h" @@ -56,8 +57,12 @@ class OnlineOptimizer { public: virtual ~OnlineOptimizer(); OnlineOptimizer(const std::tr1::shared_ptr& s, - size_t batch_size) - : N_(batch_size),schedule_(s),k_() {} + size_t batch_size, + const std::vector& frozen_feats = std::vector()) + : N_(batch_size),schedule_(s),k_() { + for (int i = 0; i < frozen_feats.size(); ++i) + frozen_.insert(frozen_feats[i]); + } void ResetEpoch() { k_ = 0; ResetEpochImpl(); } void UpdateWeights(const SparseVector& approx_g, int max_feat, SparseVector* weights) { ++k_; @@ -69,6 +74,7 @@ class OnlineOptimizer { virtual void ResetEpochImpl(); virtual void UpdateWeightsImpl(const double& eta, const SparseVector& approx_g, int max_feat, SparseVector* weights) = 0; const size_t N_; // number of training instances per batch + std::set frozen_; // frozen (non-optimizing) features private: std::tr1::shared_ptr schedule_; @@ -78,8 +84,9 @@ class OnlineOptimizer { class CumulativeL1OnlineOptimizer : public OnlineOptimizer { public: CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr& s, - size_t training_instances, double C) : - OnlineOptimizer(s, training_instances), C_(C), u_() {} + size_t training_instances, double C, + const std::vector& frozen) : + OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {} protected: void ResetEpochImpl() { u_ = 0; } @@ -87,7 +94,7 @@ class CumulativeL1OnlineOptimizer : public OnlineOptimizer { u_ += eta * C_ / N_; (*weights) += eta * approx_g; for (int i = 1; i < max_feat; ++i) - ApplyPenalty(i, weights); + if (frozen_.count(i) == 0) ApplyPenalty(i, weights); } private: -- cgit v1.2.3 From 4482fe7a82e3f9a197bf65d60635885c4bfab195 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 17 Mar 2011 22:53:19 -0400 Subject: try 2 --- training/online_optimizer.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/training/online_optimizer.h b/training/online_optimizer.h index 61d62a37..28d89344 100644 --- a/training/online_optimizer.h +++ b/training/online_optimizer.h @@ -92,7 +92,11 @@ class CumulativeL1OnlineOptimizer : public OnlineOptimizer { void ResetEpochImpl() { u_ = 0; } void UpdateWeightsImpl(const double& eta, const SparseVector& approx_g, int max_feat, SparseVector* weights) { u_ += eta * C_ / N_; - (*weights) += eta * approx_g; + for (SparseVector::const_iterator it = approx_g.begin(); + it != approx_g.end(); ++it) { + if (frozen_.count(it->first) == 0) + weights->add_value(it->first, eta * it->second); + } for (int i = 1; i < max_feat; ++i) if (frozen_.count(i) == 0) ApplyPenalty(i, weights); } -- cgit v1.2.3 From ed47102885e52c52146fc8631ff624779bd7eb0a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 18 Mar 2011 10:36:26 -0400 Subject: compile fix --- Makefile.am | 4 +++- training/optimize_test.cc | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index a808c211..bd46bd91 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,9 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! -SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools + +#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 diff --git a/training/optimize_test.cc b/training/optimize_test.cc index 6fa5efd4..fe7ca70f 100644 --- a/training/optimize_test.cc +++ b/training/optimize_test.cc @@ -104,7 +104,7 @@ void TestOnline() { double eta0 = 0.2; shared_ptr r(new ExponentialDecayLearningRate(N, eta0, 0.85)); //shared_ptr r(new StandardLearningRate(N, eta0)); - CumulativeL1OnlineOptimizer opt(r, N, C); + CumulativeL1OnlineOptimizer opt(r, N, C, std::vector()); assert(r->eta(10) < r->eta(1)); } -- cgit v1.2.3 From 21a136afad4d1b04cddc3ff1e105b0fc7e9d8c2c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 20 Mar 2011 16:16:11 -0400 Subject: prevent over-aggressive error checking in vest script --- vest/dist-vest.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index d17d7de1..80d2471e 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -417,7 +417,8 @@ while (1){ print STDERR "COMMAND:\n$cmd\n"; check_bash_call($cmd); $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; - my $best=check_bash_output("$cmd"); chomp $best; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; print STDERR "$best\n"; my ($oa, $x, $xscore) = split /\|/, $best; $score = $xscore; @@ -450,7 +451,7 @@ while (1){ my $v = ($ori{$k} + $axi{$k} * $x) / $norm; print W "$k $v\n"; } - check_call("rm -rf $dir/splag.$im1"); + check_call("rm $dir/splag.$im1/*"); $inweights = $finalFile; } $lastWeightsFile = "$dir/weights.$iteration"; -- cgit v1.2.3 From 78af1ef80f84023b4ff0661c47201850dbd46363 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 21 Mar 2011 15:55:14 -0400 Subject: Update error handling and a corner case of trie. --- klm/lm/build_binary.cc | 102 +++++++++++++++++++++++-------------------------- klm/lm/config.cc | 2 +- klm/lm/config.hh | 2 +- klm/lm/model.cc | 2 +- klm/lm/search_trie.cc | 20 +++++++--- klm/lm/vocab.cc | 2 +- klm/util/exception.cc | 4 +- 7 files changed, 69 insertions(+), 65 deletions(-) diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index d6dd5994..920ff080 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -15,8 +15,9 @@ namespace ngram { namespace { void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n" -"-u sets the default probability for if the ARPA file does not have one.\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n" +"-u sets the default log10 probability for if the ARPA file does not have\n" +"one.\n" "-s allows models to be built even if they do not have and .\n\n" "type is one of probing, trie, or sorted:\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" @@ -69,65 +70,58 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { } // namespace lm } // namespace -void terminate_handler() { - try { throw; } - catch(const std::exception& e) { - std::cerr << e.what() << std::endl; - } - catch(...) { - std::cerr << "A non-standard exception was thrown." << std::endl; - } - std::abort(); -} - int main(int argc, char *argv[]) { using namespace lm::ngram; - std::set_terminate(terminate_handler); - - lm::ngram::Config config; - int opt; - while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) { - switch(opt) { - case 'u': - config.unknown_missing_prob = ParseFloat(optarg); - break; - case 'p': - config.probing_multiplier = ParseFloat(optarg); - break; - case 't': - config.temporary_directory_prefix = optarg; - break; - case 'm': - config.building_memory = ParseUInt(optarg) * 1048576; - break; - case 's': - config.sentence_marker_missing = lm::ngram::Config::SILENT; - break; - default: - Usage(argv[0]); + try { + lm::ngram::Config config; + int opt; + while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) { + switch(opt) { + case 'u': + config.unknown_missing_logprob = ParseFloat(optarg); + break; + case 'p': + config.probing_multiplier = ParseFloat(optarg); + break; + case 't': + config.temporary_directory_prefix = optarg; + break; + case 'm': + config.building_memory = ParseUInt(optarg) * 1048576; + break; + case 's': + config.sentence_marker_missing = lm::ngram::Config::SILENT; + break; + default: + Usage(argv[0]); + } } - } - if (optind + 1 == argc) { - ShowSizes(argv[optind], config); - } else if (optind + 2 == argc) { - config.write_mmap = argv[optind + 1]; - ProbingModel(argv[optind], config); - } else if (optind + 3 == argc) { - const char *model_type = argv[optind]; - const char *from_file = argv[optind + 1]; - config.write_mmap = argv[optind + 2]; - if (!strcmp(model_type, "probing")) { - ProbingModel(from_file, config); - } else if (!strcmp(model_type, "sorted")) { - SortedModel(from_file, config); - } else if (!strcmp(model_type, "trie")) { - TrieModel(from_file, config); + if (optind + 1 == argc) { + ShowSizes(argv[optind], config); + } else if (optind + 2 == argc) { + config.write_mmap = argv[optind + 1]; + ProbingModel(argv[optind], config); + } else if (optind + 3 == argc) { + const char *model_type = argv[optind]; + const char *from_file = argv[optind + 1]; + config.write_mmap = argv[optind + 2]; + if (!strcmp(model_type, "probing")) { + ProbingModel(from_file, config); + } else if (!strcmp(model_type, "sorted")) { + SortedModel(from_file, config); + } else if (!strcmp(model_type, "trie")) { + TrieModel(from_file, config); + } else { + Usage(argv[0]); + } } else { Usage(argv[0]); } - } else { - Usage(argv[0]); + } + catch (std::exception &e) { + std::cerr << e.what() << std::endl; + abort(); } return 0; } diff --git a/klm/lm/config.cc b/klm/lm/config.cc index d8773fe5..71646e51 100644 --- a/klm/lm/config.cc +++ b/klm/lm/config.cc @@ -10,7 +10,7 @@ Config::Config() : enumerate_vocab(NULL), unknown_missing(COMPLAIN), sentence_marker_missing(THROW_UP), - unknown_missing_prob(0.0), + unknown_missing_logprob(-100.0), probing_multiplier(1.5), building_memory(1073741824ULL), // 1 GB temporary_directory_prefix(NULL), diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 17f67df3..1f7762be 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -36,7 +36,7 @@ struct Config { // The probability to substitute for if it's missing from the model. // No effect if the model has or unknown_missing == THROW_UP. - float unknown_missing_prob; + float unknown_missing_logprob; // Size multiplier for probing hash table. Must be > 1. Space is linear in // this. Time is probing_multiplier / (probing_multiplier - 1). No effect diff --git a/klm/lm/model.cc b/klm/lm/model.cc index 14949e97..1492276a 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -86,7 +86,7 @@ template void GenericModel &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { +void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { { std::string unigram_name = file_prefix + "unigrams"; util::scoped_fd unigram_file; - util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), counts[0] * sizeof(ProbBackoff), unigram_file), counts[0] * sizeof(ProbBackoff)); + // In case appears. + size_t extra_count = counts[0] + 1; + util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), extra_count * sizeof(ProbBackoff), unigram_file), extra_count * sizeof(ProbBackoff)); Read1Grams(f, counts[0], vocab, reinterpret_cast(unigram_mmap.get())); CheckSpecials(config, vocab); + if (!vocab.SawUnk()) ++counts[0]; } // Only use as much buffer as we need. @@ -572,7 +575,7 @@ bool HeadMatch(const WordIndex *words, const WordIndex *const words_end, const W return true; } -// Counting phrase +// Phase to count n-grams, including blanks inserted because they were pruned but have extensions class JustCount { public: JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, BitPackedMiddle * /*middle*/, BitPackedLongest &/*longest*/, uint64_t *counts, unsigned char order) @@ -603,6 +606,7 @@ class JustCount { uint64_t *const counts_, *const longest_counts_; }; +// Phase to actually write n-grams to the trie. class WriteEntries { public: WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) : @@ -764,7 +768,7 @@ template class RecursiveInsert { void SanityCheckCounts(const std::vector &initial, const std::vector &fixed) { if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]); - if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant"); + if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back()); for (unsigned char i = 0; i < initial.size(); ++i) { if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. This shouldn't happen"); } @@ -789,6 +793,9 @@ void BuildTrie(const std::string &file_prefix, std::vector &counts, co RecursiveInsert counter(inputs, contexts, NULL, &*out.middle.begin(), out.longest, &*fixed_counts.begin(), counts.size()); counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]); } + for (SortedFileReader *i = inputs; i < inputs + counts.size() - 1; ++i) { + if (!i->Ended()) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); + } SanityCheckCounts(counts, fixed_counts); counts = fixed_counts; @@ -805,7 +812,7 @@ void BuildTrie(const std::string &file_prefix, std::vector &counts, co } // Fill unigram probabilities. - { + try { std::string name(file_prefix + "unigrams"); util::scoped_FILE file(OpenOrThrow(name.c_str(), "r")); for (WordIndex i = 0; i < counts[0]; ++i) { @@ -816,6 +823,9 @@ void BuildTrie(const std::string &file_prefix, std::vector &counts, co } } RemoveOrThrow(name.c_str()); + } catch (util::Exception &e) { + e << " while re-reading unigram probabilities"; + throw; } // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc index 415f8331..fd11ad2c 100644 --- a/klm/lm/vocab.cc +++ b/klm/lm/vocab.cc @@ -192,7 +192,7 @@ void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { case Config::SILENT: return; case Config::COMPLAIN: - if (config.messages) *config.messages << "The ARPA file is missing . Substituting probability " << config.unknown_missing_prob << "." << std::endl; + if (config.messages) *config.messages << "The ARPA file is missing . Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl; break; case Config::THROW_UP: UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing and the model is configured to throw an exception."); diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 077405f4..84f9fe7c 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -9,11 +9,11 @@ Exception::Exception() throw() {} Exception::~Exception() throw() {} Exception::Exception(const Exception &from) : std::exception() { - stream_.str(from.stream_.str()); + stream_ << from.stream_.str(); } Exception &Exception::operator=(const Exception &from) { - stream_.str(from.stream_.str()); + stream_ << from.stream_.str(); return *this; } -- cgit v1.2.3 From 2d0c2c2bf7c35db2e16d16f39cb9749a04bc91df Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 21 Mar 2011 22:00:29 -0400 Subject: compiler warnings --- klm/util/bit_packing.hh | 7 +++++-- klm/util/have.hh | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 70cfc2d2..5c71c792 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -28,16 +28,19 @@ namespace util { * but it may be called multiple times when that's inconvenient. */ -inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { + // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. #if BYTE_ORDER == LITTLE_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { return bit; +} #elif BYTE_ORDER == BIG_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { return 64 - length - bit; +} #else #error "Bit packing code isn't written for your byte order." #endif -} /* Pack integers up to 57 bits using their least significant digits. * The length is specified using mask: diff --git a/klm/util/have.hh b/klm/util/have.hh index 7cf62008..f2f0cf90 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -2,8 +2,14 @@ #ifndef UTIL_HAVE__ #define UTIL_HAVE__ +#ifndef HAVE_ZLIB #define HAVE_ZLIB +#endif + // #define HAVE_ICU + +#ifndef HAVE_BOOST #define HAVE_BOOST +#endif #endif // UTIL_HAVE__ -- cgit v1.2.3 From da6c892bc05a5520910e23089d83ceb1f2a0fbb4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 21 Mar 2011 22:10:02 -0400 Subject: add support for normalized 'summary features'- seemingly sound way of dealing with normalization problems in embedded crf translation models --- decoder/decoder.cc | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 95ff6270..8a03c5c9 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -141,12 +141,13 @@ inline shared_ptr make_fsa_ff(string const& ffp,bool verbose // and then prune the resulting (rescored) hypergraph. All feature values from previous // passes are carried over into subsequent passes (where they may have different weights). struct RescoringPass { - RescoringPass() : density_prune(), beam_prune() {} + RescoringPass() : fid_summary(), density_prune(), beam_prune() {} shared_ptr models; shared_ptr inter_conf; vector ffs; shared_ptr w; // null == use previous weights vector weight_vector; + int fid_summary; // 0 == no summary feature double density_prune; // 0 == don't density prune double beam_prune; // 0 == don't beam prune }; @@ -155,6 +156,7 @@ ostream& operator<<(ostream& os, const RescoringPass& rp) { os << "[num_fn=" << rp.ffs.size(); if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; } if (rp.w) os << " new_weights"; + if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary); if (rp.density_prune) os << " density_prune=" << rp.density_prune; if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune; os << ']'; @@ -361,18 +363,21 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("weights,w",po::value(),"Feature weights file (initial forest / pass 1)") ("feature_function,F",po::value >()->composing(), "Pass 1 additional feature function(s) (-L for list)") ("intersection_strategy,I",po::value()->default_value("cube_pruning"), "Pass 1 intersection strategy for incorporating finite-state features; values include Cube_pruning, Full") + ("summary_feature", po::value(), "Compute a 'summary feature' at the end of the pass (before any pruning) with name=arg and value=inside-outside/Z") ("density_prune", po::value(), "Pass 1 pruning: keep no more than this many times the number of edges used in the best derivation tree (>=1.0)") ("beam_prune", po::value(), "Pass 1 pruning: Prune paths from scored forest, keep paths within exp(alpha>=0)") ("weights2",po::value(),"Optional pass 2") ("feature_function2",po::value >()->composing(), "Optional pass 2") ("intersection_strategy2",po::value()->default_value("cube_pruning"), "Optional pass 2") + ("summary_feature2", po::value(), "Optional pass 2") ("density_prune2", po::value(), "Optional pass 2") ("beam_prune2", po::value(), "Optional pass 2") ("weights3",po::value(),"Optional pass 3") ("feature_function3",po::value >()->composing(), "Optional pass 3") ("intersection_strategy3",po::value()->default_value("cube_pruning"), "Optional pass 3") + ("summary_feature3", po::value(), "Optional pass 3") ("density_prune3", po::value(), "Optional pass 3") ("beam_prune3", po::value(), "Optional pass 3") @@ -559,6 +564,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream for (int pass = 0; pass < MAX_PASSES; ++pass) { string ws = "weights" + StringSuffixForRescoringPass(pass); string ff = "feature_function" + StringSuffixForRescoringPass(pass); + string sf = "summary_feature" + StringSuffixForRescoringPass(pass); string bp = "beam_prune" + StringSuffixForRescoringPass(pass); string dp = "density_prune" + StringSuffixForRescoringPass(pass); bool first_pass_condition = ((pass == 0) && (conf.count(ff) || conf.count(bp) || conf.count(dp))); @@ -583,6 +589,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream if (p->IsStateful()) { has_stateful = true; } } } + if (conf.count(sf)) { + rp.fid_summary = FD::Convert(conf[sf].as()); + assert(rp.fid_summary > 0); + // TODO assert that weights for this pass have coef(fid_summary) == 0.0? + } if (conf.count(bp)) { rp.beam_prune = conf[bp].as(); } if (conf.count(dp)) { rp.density_prune = conf[dp].as(); } int palg = (has_stateful ? 1 : 0); // if there are no stateful featueres, default to FULL @@ -794,6 +805,15 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { cerr << " " << passtr << " partition log(Z): " << log(z) << endl; } + if (rp.fid_summary) { + Hypergraph::EdgeProbs posteriors; + const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors); + if (!SILENT) { cerr << " " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; } + assert(forest.edges_.size() == posteriors.size()); + for (int i = 0; i < posteriors.size(); ++i) + forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z)); + } + string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass); string fulldp = "density_prune" + StringSuffixForRescoringPass(pass); maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen); -- cgit v1.2.3 From 4bc9ea17ba9f85c899e35a9d657ee3f174ff2863 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Mar 2011 11:35:45 -0400 Subject: check for infs --- decoder/decoder.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 8a03c5c9..a16a9b5a 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -810,8 +810,12 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors); if (!SILENT) { cerr << " " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; } assert(forest.edges_.size() == posteriors.size()); - for (int i = 0; i < posteriors.size(); ++i) - forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z)); + if (!isfinite(log(z)) || isnan(log(z))) { + cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; + } else { + for (int i = 0; i < posteriors.size(); ++i) + forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z)); + } } string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass); -- cgit v1.2.3 From c0ae6f362b245ccf2ab3b8d6dc7e367cbcc64c1c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Mar 2011 14:44:46 -0400 Subject: fix local normalizer code for summary features --- decoder/decoder.cc | 16 +++++++++++----- decoder/hg.cc | 5 +++-- decoder/hg.h | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index a16a9b5a..89425198 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -806,15 +806,21 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (rp.fid_summary) { - Hypergraph::EdgeProbs posteriors; - const prob_t z = forest.ComputeEdgePosteriors(1.0, &posteriors); + const prob_t z = forest.PushWeightsToGoal(1.0); if (!SILENT) { cerr << " " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; } - assert(forest.edges_.size() == posteriors.size()); if (!isfinite(log(z)) || isnan(log(z))) { cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; } else { - for (int i = 0; i < posteriors.size(); ++i) - forest.edges_[i].feature_values_.set_value(rp.fid_summary, log(posteriors[i] / z)); + for (int i = 0; i < forest.edges_.size(); ++i) { + const double log_prob_transition = log(forest.edges_[i].edge_prob_); // locally normalized by the edge + // head node by forest.PushWeightsToGoal + if (!isfinite(log_prob_transition) || isnan(log_prob_transition)) { + cerr << "Edge: i=" << i << " got bad inside prob: " << *forest.edges_[i].rule_ << endl; + abort(); + } + + forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition); + } } } diff --git a/decoder/hg.cc b/decoder/hg.cc index 39ac5132..a4028b0e 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -226,9 +226,9 @@ prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) { } -void Hypergraph::PushWeightsToGoal(double scale) { +prob_t Hypergraph::PushWeightsToGoal(double scale) { vector posts; - ComputeEdgePosteriors(scale, &posts); + const prob_t inside_z = ComputeEdgePosteriors(scale, &posts); for (int i = 0; i < nodes_.size(); ++i) { const Hypergraph::Node& node = nodes_[i]; prob_t z = prob_t::Zero(); @@ -238,6 +238,7 @@ void Hypergraph::PushWeightsToGoal(double scale) { edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z; } } + return inside_z; } struct EdgeExistsWeightFunction { diff --git a/decoder/hg.h b/decoder/hg.h index aa1202b1..e5ef05f8 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -449,7 +449,7 @@ public: void PushWeightsToSource(double scale = 1.0); // same, except weights are pushed to the goal, works for HGs, // not just lattices - void PushWeightsToGoal(double scale = 1.0); + prob_t PushWeightsToGoal(double scale = 1.0); // contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid. note that p_viterbi becomes 1; k*p_viterbi becomes k. also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight) // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after). returns inside(goal) -- cgit v1.2.3 From 12ece6ddfa91ec61cdeee698db2c7edac941e096 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Mar 2011 16:48:28 -0400 Subject: reweight after weight pushing to avoid weird output --- decoder/decoder.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 89425198..ac063659 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -821,6 +821,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition); } + forest.Reweight(cur_weights); // reset weights } } -- cgit v1.2.3 From 57a218e86e30d57d9795bccd280737c431f6b4e4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 23 Mar 2011 12:15:55 -0400 Subject: yet another feature attempt --- decoder/decoder.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index ac063659..b7774acc 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -806,6 +806,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (rp.fid_summary) { +#if 0 const prob_t z = forest.PushWeightsToGoal(1.0); if (!SILENT) { cerr << " " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; } if (!isfinite(log(z)) || isnan(log(z))) { @@ -823,6 +824,26 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } forest.Reweight(cur_weights); // reset weights } +#endif + Hypergraph::EdgeProbs posts; + const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts); + if (!isfinite(log(z)) || isnan(log(z))) { + cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n"; + } else { + for (int i = 0; i < forest.nodes_.size(); ++i) { + const Hypergraph::EdgesVector& in_edges = forest.nodes_[i].in_edges_; + prob_t node_post = prob_t(0); + for (int j = 0; j < in_edges.size(); ++j) + node_post += (posts[in_edges[j]] / z); + const double log_np = log(node_post); + if (!isfinite(log_np) || isnan(log_np)) { + cerr << "got bad posterior prob for node " << i << endl; + abort(); + } + for (int j = 0; j < in_edges.size(); ++j) + forest.edges_[in_edges[j]].feature_values_.set_value(rp.fid_summary, exp(log_np)); + } + } } string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass); -- cgit v1.2.3 From 918ed4bf919a55e3eb5d99d98c9b915921dc11ab Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 23 Mar 2011 22:53:44 -0400 Subject: remove thread-local stuff which was fragile on some build systems --- decoder/trule.cc | 3 +-- utils/static_utoa.h | 2 +- utils/tdict.cc | 1 - utils/threadlocal.h | 71 ----------------------------------------------------- 4 files changed, 2 insertions(+), 75 deletions(-) delete mode 100755 utils/threadlocal.h diff --git a/decoder/trule.cc b/decoder/trule.cc index fda62741..40235542 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -5,7 +5,6 @@ #include "stringlib.h" #include "tdict.h" #include "rule_lexer.h" -#include "threadlocal.h" using namespace std; @@ -99,7 +98,7 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) { namespace { // callback for lexer -THREADLOCAL int n_assigned=0; +int n_assigned=0; void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) { TRule *assignto=(TRule *)extra; *assignto=*new_rule; diff --git a/utils/static_utoa.h b/utils/static_utoa.h index d15ed35b..bb3d821f 100755 --- a/utils/static_utoa.h +++ b/utils/static_utoa.h @@ -7,7 +7,7 @@ namespace { static const int utoa_bufsize=40; // 64bit safe. static const int utoa_bufsizem1=utoa_bufsize-1; // 64bit safe. -THREADLOCAL char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20] +static char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20] } inline char *static_utoa(unsigned n) { diff --git a/utils/tdict.cc b/utils/tdict.cc index 23a298f8..c21b2b48 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -8,7 +8,6 @@ #include "dict.h" #include "tdict.h" #include "stringlib.h" -#include "threadlocal.h" using namespace std; diff --git a/utils/threadlocal.h b/utils/threadlocal.h deleted file mode 100755 index d79f5d9d..00000000 --- a/utils/threadlocal.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef THREADLOCAL_H -#define THREADLOCAL_H - -#ifndef SETLOCAL_SWAP -# define SETLOCAL_SWAP 0 -#endif - -#ifdef BOOST_NO_MT - -# define THREADLOCAL - -#else - -#ifdef _MSC_VER - -//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html) -# define THREADLOCAL __declspec(thread) - -#else - -# define THREADLOCAL __thread - -#endif - -#endif - -#include //swap - -// naturally, the below are only thread-safe if value is THREADLOCAL -template -struct SaveLocal { - D &value; - D old_value; - SaveLocal(D& val) : value(val), old_value(val) {} - ~SaveLocal() { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=old_value; -#endif - } -}; - -template -struct SetLocal { - D &value; - D old_value; - SetLocal(D& val,const D &new_value) : value(val), old_value( -#if SETLOCAL_SWAP - new_value -#else - val -#endif - ) { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=new_value; -#endif - } - ~SetLocal() { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=old_value; -#endif - } -}; - - -#endif -- cgit v1.2.3 From e03a6c2b2e3cc21d75904300d34249cd1e2e032b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 23 Mar 2011 22:55:50 -0400 Subject: refactor makefile --- decoder/Makefile.am | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/decoder/Makefile.am b/decoder/Makefile.am index e1dba497..244da2de 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -6,16 +6,13 @@ noinst_PROGRAMS = \ hg_test \ ff_test \ parser_test \ - grammar_test \ - cfg_test -TESTS = trule_test ff_test parser_test grammar_test hg_test cfg_test -endif - -cdec_SOURCES = cdec.cc -cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -cfg_test_SOURCES = cfg_test.cc -cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz + grammar_test + + # cfg_test +TESTS = trule_test ff_test parser_test grammar_test hg_test +# cfg_test +#cfg_test_SOURCES = cfg_test.cc +#cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz parser_test_SOURCES = parser_test.cc parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz ff_test_SOURCES = ff_test.cc @@ -26,6 +23,11 @@ hg_test_SOURCES = hg_test.cc hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz trule_test_SOURCES = trule_test.cc trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz +endif + +cdec_SOURCES = cdec.cc +cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + AM_CPPFLAGS = -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm rule_lexer.cc: rule_lexer.l -- cgit v1.2.3 From 972c40c819bb1e6ea8c78eb2e067f014713adf86 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 23 Mar 2011 23:00:32 -0400 Subject: fail if no lex program is found (i think) --- configure.ac | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configure.ac b/configure.ac index 56f08147..b323576f 100644 --- a/configure.ac +++ b/configure.ac @@ -3,6 +3,9 @@ AM_INIT_AUTOMAKE(cdec,0.1) AC_CONFIG_HEADERS(config.h) AC_PROG_LIBTOOL AC_PROG_LEX +case $LEX in +:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);; +esac AC_PROG_CC AC_PROG_CXX AC_LANG_CPLUSPLUS -- cgit v1.2.3 From a580faa8177331cf51138a2208e276b703470934 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 23 Mar 2011 23:12:31 -0400 Subject: remove dependency on boost thread library --- configure.ac | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index b323576f..da66c3fb 100644 --- a/configure.ac +++ b/configure.ac @@ -11,10 +11,12 @@ AC_PROG_CXX AC_LANG_CPLUSPLUS BOOST_REQUIRE BOOST_PROGRAM_OPTIONS -BOOST_THREADS +#BOOST_THREADS CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" -LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS" -LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS" +LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS" +# $BOOST_THREAD_LDFLAGS" +LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS" +# $BOOST_THREAD_LIBS" AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp, [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])]) -- cgit v1.2.3