author     Jonathan Clark <jon.h.clark@gmail.com>   2011-03-24 09:51:40 -0400
committer  Jonathan Clark <jon.h.clark@gmail.com>   2011-03-24 09:51:40 -0400
commit     eb33700d1c868662b5d0abedaaf3fa47948a89d0 (patch)
tree       ed70be84820d243524bab0b59a84b8da033a9c41
parent     ba4f147f84aa0d4623da640a2d0de7e6242a53af (diff)
parent     a580faa8177331cf51138a2208e276b703470934 (diff)
Undo some silly local changes so we can pull
-rw-r--r--   .gitignore                      |   1
-rw-r--r--   Makefile.am                     |   4
-rw-r--r--   SConstruct                      | 106
-rw-r--r--   configure.ac                    |  11
-rw-r--r--   decoder/Makefile.am             |  26
-rw-r--r--   decoder/decoder.cc              |  54
-rw-r--r--   decoder/ff_klm.cc               |   2
-rw-r--r--   decoder/ff_wordset.h            |   1
-rw-r--r--   decoder/hg.cc                   |   5
-rw-r--r--   decoder/hg.h                    |   2
-rw-r--r--   decoder/trule.cc                |   7
-rw-r--r--   klm/lm/build_binary.cc          | 102
-rw-r--r--   klm/lm/config.cc                |   2
-rw-r--r--   klm/lm/config.hh                |   2
-rw-r--r--   klm/lm/model.cc                 |   2
-rw-r--r--   klm/lm/search_trie.cc           |  20
-rw-r--r--   klm/lm/vocab.cc                 |   2
-rw-r--r--   klm/util/bit_packing.hh         |   7
-rw-r--r--   klm/util/exception.cc           |   4
-rw-r--r--   klm/util/have.hh                |   6
-rw-r--r--   training/mpi_online_optimize.cc |  17
-rw-r--r--   training/online_optimizer.h     |  23
-rw-r--r--   training/optimize_test.cc       |   2
-rwxr-xr-x   utils/static_utoa.h             |   2
-rw-r--r--   utils/tdict.cc                  |   1
-rwxr-xr-x   utils/threadlocal.h             |  71
-rwxr-xr-x   vest/dist-vest.pl               |  17
-rwxr-xr-x   vest/parallelize.pl             |   9
28 files changed, 289 insertions, 219 deletions
diff --git a/.gitignore b/.gitignore
index 3892891c..2a287bbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -120,3 +120,4 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar
*.dvi
*.ps
*.toc
+*~
\ No newline at end of file
diff --git a/Makefile.am b/Makefile.am
index a808c211..bd46bd91 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,9 @@
# warning - the subdirectories in the following list should
# be kept in topologically sorted order. Also, DO NOT introduce
# cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools
+
+#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
diff --git a/SConstruct b/SConstruct
index dc5497ae..c21d85d5 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1,23 +1,54 @@
-AddOption('--prefix',
- dest='prefix',
- type='string',
- nargs=1,
- action='store',
- metavar='DIR',
- help='installation prefix')
-
-AddOption('--with-boost',
- dest='boost',
- type='string',
- nargs=1,
- action='store',
- metavar='DIR',
+#!/usr/bin/python
+
+# EXPERIMENTAL and HACKY version of cdec build in scons
+
+AddOption('--prefix', dest='prefix', type='string', nargs=1, action='store', metavar='DIR',
+ help='installation prefix')
+AddOption('--with-boost', dest='boost', type='string', nargs=1, action='store', metavar='DIR',
help='boost installation directory (if in a non-standard location)')
+AddOption('--with-glc', dest='glc', type='string', nargs=1, action='store', metavar='DIR',
+ help='path to Global Lexical Coherence package (optional)')
+AddOption('--efence', dest='efence', action='store_true',
+ help='use electric fence for debugging memory corruptions')
+
+# TODO: Troll http://www.scons.org/wiki/SconsAutoconf
+# for some initial autoconf-like steps
platform = ARGUMENTS.get('OS', Platform())
+include = Split('decoder utils klm mteval .')
+env = Environment(PREFIX=GetOption('prefix'),
+ PLATFORM = platform,
+# BINDIR = bin,
+# INCDIR = include,
+# LIBDIR = lib,
+ CPPPATH = include,
+ LIBPATH = [],
+ LIBS = Split('boost_program_options boost_serialization boost_thread z'),
+ CCFLAGS=Split('-g -O3'))
+
+boost = GetOption('boost')
+if boost:
+ print 'Using Boost at {0}'.format(boost)
+ env.Append(CPPPATH=boost+'/include',
+ LIBPATH=boost+'/lib')
+
+if GetOption('efence'):
+ env.Append(LIBS=Split('efence Segfault'))
srcs = []
-for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc']:
+
+# TODO: Get rid of config.h
+
+glc = GetOption('glc')
+if glc:
+ print 'Using Global Lexical Coherence package at {0}'.format(glc)
+ env.Append(CCFLAGS='-DHAVE_GLC',
+ CPPPATH=[glc, glc+'/cdec'])
+ srcs.append(glc+'/string_util.cc')
+ srcs.append(glc+'/feature-factory.cc')
+ srcs.append(glc+'/cdec/ff_glc.cc')
+
+for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc', 'vest/*.cc']:
srcs.extend([ file for file in Glob(pattern)
if not 'test' in str(file)
and 'build_binary.cc' not in str(file)
@@ -25,27 +56,30 @@ for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mtev
and 'mbr_kbest.cc' not in str(file)
and 'sri.cc' not in str(file)
and 'fast_score.cc' not in str(file)
+ and 'cdec.cc' not in str(file)
+ and 'mr_' not in str(file)
])
-include = Split('decoder utils klm mteval .')
-libPaths = []
-
-boost = GetOption('boost')
-if boost:
- include.append(boost+'/include')
- libPaths.append(boost+'/lib')
-
-glcDir = None
-glcDir = '../GlobalLexicalCoherence'
-if glcDir:
- include.append(glcDir)
+print 'Found {0} source files'.format(len(srcs))
+def comb(cc, srcs):
+ x = [cc]
+ x.extend(srcs)
+ return x
-env = Environment(PREFIX=GetOption('prefix'),
- PLATFORM = platform,
-# BINDIR = bin,
- INCDIR = include,
-# LIBDIR = lib,
- CPPPATH = [include, '.'],
- LIBPATH = libPaths,
- LIBS = Split('boost_program_options boost_serialization boost_thread z'))
-env.Program(target='decoder/cdec', source=srcs)
+env.Program(target='decoder/cdec', source=comb('decoder/cdec.cc', srcs))
+# TODO: The various decoder tests
+# TODO: extools
+env.Program(target='klm/lm/build_binary', source=comb('klm/lm/build_binary.cc', srcs))
+# TODO: klm ngram_query and tests
+env.Program(target='mteval/fast_score', source=comb('mteval/fast_score.cc', srcs))
+env.Program(target='mteval/mbr_kbest', source=comb('mteval/mbr_kbest.cc', srcs))
+#env.Program(target='mteval/scorer_test', source=comb('mteval/fast_score.cc', srcs))
+# TODO: phrasinator
+# TODO: Various training binaries
+env.Program(target='vest/sentserver', source=['vest/sentserver.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/sentclient', source=['vest/sentclient.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/mr_vest_generate_mapper_input', source=comb('vest/mr_vest_generate_mapper_input.cc', srcs))
+env.Program(target='vest/mr_vest_map', source=comb('vest/mr_vest_map.cc', srcs))
+env.Program(target='vest/mr_vest_reduce', source=comb('vest/mr_vest_reduce.cc', srcs))
+#env.Program(target='vest/lo_test', source=comb('vest/lo_test.cc', srcs))
+# TODO: util tests
diff --git a/configure.ac b/configure.ac
index 56f08147..da66c3fb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,15 +3,20 @@ AM_INIT_AUTOMAKE(cdec,0.1)
AC_CONFIG_HEADERS(config.h)
AC_PROG_LIBTOOL
AC_PROG_LEX
+case $LEX in
+:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);;
+esac
AC_PROG_CC
AC_PROG_CXX
AC_LANG_CPLUSPLUS
BOOST_REQUIRE
BOOST_PROGRAM_OPTIONS
-BOOST_THREADS
+#BOOST_THREADS
CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
+# $BOOST_THREAD_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
+# $BOOST_THREAD_LIBS"
AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
[AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 9cf4c3c4..244da2de 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -6,16 +6,13 @@ noinst_PROGRAMS = \
hg_test \
ff_test \
parser_test \
- grammar_test \
- cfg_test
-TESTS = trule_test ff_test parser_test grammar_test hg_test cfg_test
-endif
-
-cdec_SOURCES = cdec.cc
-cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-cfg_test_SOURCES = cfg_test.cc
-cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+ grammar_test
+
+ # cfg_test
+TESTS = trule_test ff_test parser_test grammar_test hg_test
+# cfg_test
+#cfg_test_SOURCES = cfg_test.cc
+#cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
parser_test_SOURCES = parser_test.cc
parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
ff_test_SOURCES = ff_test.cc
@@ -26,7 +23,12 @@ hg_test_SOURCES = hg_test.cc
hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
trule_test_SOURCES = trule_test.cc
trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
+endif
+
+cdec_SOURCES = cdec.cc
+cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
rule_lexer.cc: rule_lexer.l
$(LEX) -s -CF -8 -o$@ $<
@@ -82,5 +84,5 @@ libcdec_a_SOURCES = \
if GLC
# Until we build GLC as a library...
- libcdec_a_SOURCES += ff_glc.cc
+ libcdec_a_SOURCES += ff_glc.cc string_util.cc feature-factory.cc
endif
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 95ff6270..b7774acc 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -141,12 +141,13 @@ inline shared_ptr<FsaFeatureFunction> make_fsa_ff(string const& ffp,bool verbose
// and then prune the resulting (rescored) hypergraph. All feature values from previous
// passes are carried over into subsequent passes (where they may have different weights).
struct RescoringPass {
- RescoringPass() : density_prune(), beam_prune() {}
+ RescoringPass() : fid_summary(), density_prune(), beam_prune() {}
shared_ptr<ModelSet> models;
shared_ptr<IntersectionConfiguration> inter_conf;
vector<const FeatureFunction*> ffs;
shared_ptr<Weights> w; // null == use previous weights
vector<double> weight_vector;
+ int fid_summary; // 0 == no summary feature
double density_prune; // 0 == don't density prune
double beam_prune; // 0 == don't beam prune
};
@@ -155,6 +156,7 @@ ostream& operator<<(ostream& os, const RescoringPass& rp) {
os << "[num_fn=" << rp.ffs.size();
if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; }
if (rp.w) os << " new_weights";
+ if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary);
if (rp.density_prune) os << " density_prune=" << rp.density_prune;
if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune;
os << ']';
@@ -361,18 +363,21 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("weights,w",po::value<string>(),"Feature weights file (initial forest / pass 1)")
("feature_function,F",po::value<vector<string> >()->composing(), "Pass 1 additional feature function(s) (-L for list)")
("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Pass 1 intersection strategy for incorporating finite-state features; values include Cube_pruning, Full")
+ ("summary_feature", po::value<string>(), "Compute a 'summary feature' at the end of the pass (before any pruning) with name=arg and value=inside-outside/Z")
("density_prune", po::value<double>(), "Pass 1 pruning: keep no more than this many times the number of edges used in the best derivation tree (>=1.0)")
("beam_prune", po::value<double>(), "Pass 1 pruning: Prune paths from scored forest, keep paths within exp(alpha>=0)")
("weights2",po::value<string>(),"Optional pass 2")
("feature_function2",po::value<vector<string> >()->composing(), "Optional pass 2")
("intersection_strategy2",po::value<string>()->default_value("cube_pruning"), "Optional pass 2")
+ ("summary_feature2", po::value<string>(), "Optional pass 2")
("density_prune2", po::value<double>(), "Optional pass 2")
("beam_prune2", po::value<double>(), "Optional pass 2")
("weights3",po::value<string>(),"Optional pass 3")
("feature_function3",po::value<vector<string> >()->composing(), "Optional pass 3")
("intersection_strategy3",po::value<string>()->default_value("cube_pruning"), "Optional pass 3")
+ ("summary_feature3", po::value<string>(), "Optional pass 3")
("density_prune3", po::value<double>(), "Optional pass 3")
("beam_prune3", po::value<double>(), "Optional pass 3")
@@ -559,6 +564,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
for (int pass = 0; pass < MAX_PASSES; ++pass) {
string ws = "weights" + StringSuffixForRescoringPass(pass);
string ff = "feature_function" + StringSuffixForRescoringPass(pass);
+ string sf = "summary_feature" + StringSuffixForRescoringPass(pass);
string bp = "beam_prune" + StringSuffixForRescoringPass(pass);
string dp = "density_prune" + StringSuffixForRescoringPass(pass);
bool first_pass_condition = ((pass == 0) && (conf.count(ff) || conf.count(bp) || conf.count(dp)));
@@ -583,6 +589,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
if (p->IsStateful()) { has_stateful = true; }
}
}
+ if (conf.count(sf)) {
+ rp.fid_summary = FD::Convert(conf[sf].as<string>());
+ assert(rp.fid_summary > 0);
+ // TODO assert that weights for this pass have coef(fid_summary) == 0.0?
+ }
if (conf.count(bp)) { rp.beam_prune = conf[bp].as<double>(); }
if (conf.count(dp)) { rp.density_prune = conf[dp].as<double>(); }
int palg = (has_stateful ? 1 : 0); // if there are no stateful featueres, default to FULL
@@ -794,6 +805,47 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
cerr << " " << passtr << " partition log(Z): " << log(z) << endl;
}
+ if (rp.fid_summary) {
+#if 0
+ const prob_t z = forest.PushWeightsToGoal(1.0);
+ if (!SILENT) { cerr << " " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
+ if (!isfinite(log(z)) || isnan(log(z))) {
+ cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n";
+ } else {
+ for (int i = 0; i < forest.edges_.size(); ++i) {
+ const double log_prob_transition = log(forest.edges_[i].edge_prob_); // locally normalized by the edge
+ // head node by forest.PushWeightsToGoal
+ if (!isfinite(log_prob_transition) || isnan(log_prob_transition)) {
+ cerr << "Edge: i=" << i << " got bad inside prob: " << *forest.edges_[i].rule_ << endl;
+ abort();
+ }
+
+ forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition);
+ }
+ forest.Reweight(cur_weights); // reset weights
+ }
+#endif
+ Hypergraph::EdgeProbs posts;
+ const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts);
+ if (!isfinite(log(z)) || isnan(log(z))) {
+ cerr << " " << passtr << " !!! Invalid partition detected, abandoning.\n";
+ } else {
+ for (int i = 0; i < forest.nodes_.size(); ++i) {
+ const Hypergraph::EdgesVector& in_edges = forest.nodes_[i].in_edges_;
+ prob_t node_post = prob_t(0);
+ for (int j = 0; j < in_edges.size(); ++j)
+ node_post += (posts[in_edges[j]] / z);
+ const double log_np = log(node_post);
+ if (!isfinite(log_np) || isnan(log_np)) {
+ cerr << "got bad posterior prob for node " << i << endl;
+ abort();
+ }
+ for (int j = 0; j < in_edges.size(); ++j)
+ forest.edges_[in_edges[j]].feature_values_.set_value(rp.fid_summary, exp(log_np));
+ }
+ }
+ }
+
string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass);
string fulldp = "density_prune" + StringSuffixForRescoringPass(pass);
maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen);
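
Note: the summary-feature block added above stores, on each incoming edge of a
node, that node's posterior probability: the sum of the edge posteriors from
ComputeEdgePosteriors divided by the partition Z. A minimal self-contained
sketch of that arithmetic, using plain doubles and vectors as stand-ins for
cdec's prob_t and Hypergraph types (the toy forest layout here is assumed):

    #include <cstdio>
    #include <vector>

    int main() {
      const double z = 10.0;                      // partition (inside score of goal)
      const double posts[] = {10.0, 4.0, 6.0};    // unnormalized edge posteriors
      std::vector<std::vector<int> > in_edges(2); // node id -> incoming edge ids
      in_edges[0].push_back(0);
      in_edges[1].push_back(1);
      in_edges[1].push_back(2);
      for (size_t i = 0; i < in_edges.size(); ++i) {
        double node_post = 0.0;
        for (size_t j = 0; j < in_edges[i].size(); ++j)
          node_post += posts[in_edges[i][j]] / z; // posterior mass of node i
        // in the real pass, every edge in in_edges[i] now gets
        // feature fid_summary = node_post
        std::printf("node %d posterior = %g\n", (int)i, node_post);
      }
      return 0;
    }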
diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc
index adc2c8bf..62908cdc 100644
--- a/decoder/ff_klm.cc
+++ b/decoder/ff_klm.cc
@@ -21,7 +21,7 @@ static const unsigned char MASK = 7;
// -n NAME : feature id is NAME
bool ParseLMArgs(string const& in, string* filename, string* mapfile, bool* explicit_markers, string* featname) {
vector<string> const& argv=SplitOnWhitespace(in);
- *explicit_markers = true;
+ *explicit_markers = false;
*featname="LanguageModel";
*mapfile = "";
#define LMSPEC_NEXTARG if (i==argv.end()) { \
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
index 00e1145b..643097ef 100644
--- a/decoder/ff_wordset.h
+++ b/decoder/ff_wordset.h
@@ -32,6 +32,7 @@ class WordSet : public FeatureFunction {
~WordSet() {
}
+ Features features() const { return single_feature(fid_); }
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
diff --git a/decoder/hg.cc b/decoder/hg.cc
index 39ac5132..a4028b0e 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -226,9 +226,9 @@ prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) {
}
-void Hypergraph::PushWeightsToGoal(double scale) {
+prob_t Hypergraph::PushWeightsToGoal(double scale) {
vector<prob_t> posts;
- ComputeEdgePosteriors(scale, &posts);
+ const prob_t inside_z = ComputeEdgePosteriors(scale, &posts);
for (int i = 0; i < nodes_.size(); ++i) {
const Hypergraph::Node& node = nodes_[i];
prob_t z = prob_t::Zero();
@@ -238,6 +238,7 @@ void Hypergraph::PushWeightsToGoal(double scale) {
edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z;
}
}
+ return inside_z;
}
struct EdgeExistsWeightFunction {
diff --git a/decoder/hg.h b/decoder/hg.h
index aa1202b1..e5ef05f8 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -449,7 +449,7 @@ public:
void PushWeightsToSource(double scale = 1.0);
// same, except weights are pushed to the goal, works for HGs,
// not just lattices
- void PushWeightsToGoal(double scale = 1.0);
+ prob_t PushWeightsToGoal(double scale = 1.0);
// contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid. note that p_viterbi becomes 1; k*p_viterbi becomes k. also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight)
// afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after). returns inside(goal)
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 9820e6d5..40235542 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -5,7 +5,6 @@
#include "stringlib.h"
#include "tdict.h"
#include "rule_lexer.h"
-#include "threadlocal.h"
using namespace std;
@@ -99,7 +98,7 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) {
namespace {
// callback for lexer
-THREADLOCAL int n_assigned=0;
+int n_assigned=0;
void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
TRule *assignto=(TRule *)extra;
*assignto=*new_rule;
@@ -145,7 +144,9 @@ bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
getline(is, ss);
//cerr << "L: " << ss << endl;
int start = 0;
- const int len = ss.size();
+ int len = ss.size();
+ const size_t ppos = ss.find(" |||");
+ if (ppos != string::npos) { len = ppos; }
while (start < len) {
while(start < len && (ss[start] == ' ' || ss[start] == ';'))
++start;
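
Note: the parsing change above stops the word scan at the first " |||" field
separator, so trailing rule fields are no longer consumed as words. The string
handling in isolation (the sample rule text is made up):

    #include <cstdio>
    #include <string>

    int main() {
      std::string ss = "der Mann ||| the man";     // hypothetical rule segment
      size_t len = ss.size();
      const size_t ppos = ss.find(" |||");
      if (ppos != std::string::npos) len = ppos;   // only scan up to the delimiter
      std::printf("word region: '%s'\n", ss.substr(0, len).c_str());
      return 0;
    }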
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index d6dd5994..920ff080 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,8 +15,9 @@ namespace ngram {
namespace {
void Usage(const char *name) {
- std::cerr << "Usage: " << name << " [-u unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
-"-u sets the default probability for <unk> if the ARPA file does not have one.\n"
+ std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
+"-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
+"one.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n\n"
"type is one of probing, trie, or sorted:\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
@@ -69,65 +70,58 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
} // namespace lm
} // namespace
-void terminate_handler() {
- try { throw; }
- catch(const std::exception& e) {
- std::cerr << e.what() << std::endl;
- }
- catch(...) {
- std::cerr << "A non-standard exception was thrown." << std::endl;
- }
- std::abort();
-}
-
int main(int argc, char *argv[]) {
using namespace lm::ngram;
- std::set_terminate(terminate_handler);
-
- lm::ngram::Config config;
- int opt;
- while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
- switch(opt) {
- case 'u':
- config.unknown_missing_prob = ParseFloat(optarg);
- break;
- case 'p':
- config.probing_multiplier = ParseFloat(optarg);
- break;
- case 't':
- config.temporary_directory_prefix = optarg;
- break;
- case 'm':
- config.building_memory = ParseUInt(optarg) * 1048576;
- break;
- case 's':
- config.sentence_marker_missing = lm::ngram::Config::SILENT;
- break;
- default:
- Usage(argv[0]);
+ try {
+ lm::ngram::Config config;
+ int opt;
+ while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
+ switch(opt) {
+ case 'u':
+ config.unknown_missing_logprob = ParseFloat(optarg);
+ break;
+ case 'p':
+ config.probing_multiplier = ParseFloat(optarg);
+ break;
+ case 't':
+ config.temporary_directory_prefix = optarg;
+ break;
+ case 'm':
+ config.building_memory = ParseUInt(optarg) * 1048576;
+ break;
+ case 's':
+ config.sentence_marker_missing = lm::ngram::Config::SILENT;
+ break;
+ default:
+ Usage(argv[0]);
+ }
}
- }
- if (optind + 1 == argc) {
- ShowSizes(argv[optind], config);
- } else if (optind + 2 == argc) {
- config.write_mmap = argv[optind + 1];
- ProbingModel(argv[optind], config);
- } else if (optind + 3 == argc) {
- const char *model_type = argv[optind];
- const char *from_file = argv[optind + 1];
- config.write_mmap = argv[optind + 2];
- if (!strcmp(model_type, "probing")) {
- ProbingModel(from_file, config);
- } else if (!strcmp(model_type, "sorted")) {
- SortedModel(from_file, config);
- } else if (!strcmp(model_type, "trie")) {
- TrieModel(from_file, config);
+ if (optind + 1 == argc) {
+ ShowSizes(argv[optind], config);
+ } else if (optind + 2 == argc) {
+ config.write_mmap = argv[optind + 1];
+ ProbingModel(argv[optind], config);
+ } else if (optind + 3 == argc) {
+ const char *model_type = argv[optind];
+ const char *from_file = argv[optind + 1];
+ config.write_mmap = argv[optind + 2];
+ if (!strcmp(model_type, "probing")) {
+ ProbingModel(from_file, config);
+ } else if (!strcmp(model_type, "sorted")) {
+ SortedModel(from_file, config);
+ } else if (!strcmp(model_type, "trie")) {
+ TrieModel(from_file, config);
+ } else {
+ Usage(argv[0]);
+ }
} else {
Usage(argv[0]);
}
- } else {
- Usage(argv[0]);
+ }
+ catch (std::exception &e) {
+ std::cerr << e.what() << std::endl;
+ abort();
}
return 0;
}
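
Note: the restructuring above drops the custom std::set_terminate handler in
favor of a try/catch around the body of main, so any failure still prints its
message before abort(). The pattern in isolation (the error text is a
stand-in, not a real build_binary message):

    #include <cstdlib>
    #include <exception>
    #include <iostream>
    #include <stdexcept>

    int main() {
      try {
        // stand-in for the option parsing and model building in build_binary
        throw std::runtime_error("bad ARPA file");
      } catch (std::exception &e) {
        std::cerr << e.what() << std::endl;
        std::abort();  // same abnormal-exit behavior the old handler provided
      }
      return 0;
    }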
diff --git a/klm/lm/config.cc b/klm/lm/config.cc
index d8773fe5..71646e51 100644
--- a/klm/lm/config.cc
+++ b/klm/lm/config.cc
@@ -10,7 +10,7 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
- unknown_missing_prob(0.0),
+ unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB
temporary_directory_prefix(NULL),
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 17f67df3..1f7762be 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -36,7 +36,7 @@ struct Config {
// The probability to substitute for <unk> if it's missing from the model.
// No effect if the model has <unk> or unknown_missing == THROW_UP.
- float unknown_missing_prob;
+ float unknown_missing_logprob;
// Size multiplier for probing hash table. Must be > 1. Space is linear in
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
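
Note: the rename to unknown_missing_logprob comes with a default change from
0.0 to -100.0 because the field is stored directly as a log10 probability,
ARPA-style; under that reading the old default assigned <unk> probability 1.
A quick check of what the two defaults denote:

    #include <cmath>
    #include <cstdio>

    int main() {
      std::printf("old default: 10^%g = %g\n", 0.0, std::pow(10.0, 0.0));
      std::printf("new default: 10^%g = %g\n", -100.0, std::pow(10.0, -100.0));
      return 0;  // prints 1 and 1e-100
    }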
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index 14949e97..1492276a 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -86,7 +86,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
assert(config.unknown_missing != Config::THROW_UP);
// Default probabilities for unknown.
search_.unigram.Unknown().backoff = 0.0;
- search_.unigram.Unknown().prob = config.unknown_missing_prob;
+ search_.unigram.Unknown().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, counts, backing_);
}
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 63631223..b830dfc3 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -535,13 +535,16 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
}
}
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, const std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
{
std::string unigram_name = file_prefix + "unigrams";
util::scoped_fd unigram_file;
- util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), counts[0] * sizeof(ProbBackoff), unigram_file), counts[0] * sizeof(ProbBackoff));
+ // In case <unk> appears.
+ size_t extra_count = counts[0] + 1;
+ util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), extra_count * sizeof(ProbBackoff), unigram_file), extra_count * sizeof(ProbBackoff));
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()));
CheckSpecials(config, vocab);
+ if (!vocab.SawUnk()) ++counts[0];
}
// Only use as much buffer as we need.
@@ -572,7 +575,7 @@ bool HeadMatch(const WordIndex *words, const WordIndex *const words_end, const W
return true;
}
-// Counting phrase
+// Phase to count n-grams, including blanks inserted because they were pruned but have extensions
class JustCount {
public:
JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, BitPackedMiddle * /*middle*/, BitPackedLongest &/*longest*/, uint64_t *counts, unsigned char order)
@@ -603,6 +606,7 @@ class JustCount {
uint64_t *const counts_, *const longest_counts_;
};
+// Phase to actually write n-grams to the trie.
class WriteEntries {
public:
WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) :
@@ -764,7 +768,7 @@ template <class Doing> class RecursiveInsert {
void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) {
if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]);
- if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant");
+ if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back());
for (unsigned char i = 0; i < initial.size(); ++i) {
if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. This shouldn't happen");
}
@@ -789,6 +793,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
RecursiveInsert<JustCount> counter(inputs, contexts, NULL, &*out.middle.begin(), out.longest, &*fixed_counts.begin(), counts.size());
counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]);
}
+ for (SortedFileReader *i = inputs; i < inputs + counts.size() - 1; ++i) {
+ if (!i->Ended()) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
+ }
SanityCheckCounts(counts, fixed_counts);
counts = fixed_counts;
@@ -805,7 +812,7 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
}
// Fill unigram probabilities.
- {
+ try {
std::string name(file_prefix + "unigrams");
util::scoped_FILE file(OpenOrThrow(name.c_str(), "r"));
for (WordIndex i = 0; i < counts[0]; ++i) {
@@ -816,6 +823,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
}
}
RemoveOrThrow(name.c_str());
+ } catch (util::Exception &e) {
+ e << " while re-reading unigram probabilities";
+ throw;
}
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
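
Note: the try/catch added around the unigram-probability pass relies on klm's
util::Exception accepting operator<<, so callers can append context before
rethrowing. A minimal analogue of that catch-annotate-rethrow pattern (this
class only imitates the idea; it is not the real util::Exception):

    #include <exception>
    #include <iostream>
    #include <string>

    class AnnotatedError : public std::exception {
     public:
      explicit AnnotatedError(const std::string &what) : what_(what) {}
      ~AnnotatedError() throw() {}
      AnnotatedError &operator<<(const std::string &more) {
        what_ += more;
        return *this;
      }
      const char *what() const throw() { return what_.c_str(); }
     private:
      std::string what_;
    };

    int main() {
      try {
        try {
          throw AnnotatedError("bad unigram record");
        } catch (AnnotatedError &e) {
          e << " while re-reading unigram probabilities";
          throw;  // rethrows the annotated exception
        }
      } catch (const std::exception &e) {
        std::cerr << e.what() << std::endl;
      }
      return 0;
    }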
diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc
index 415f8331..fd11ad2c 100644
--- a/klm/lm/vocab.cc
+++ b/klm/lm/vocab.cc
@@ -192,7 +192,7 @@ void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
case Config::SILENT:
return;
case Config::COMPLAIN:
- if (config.messages) *config.messages << "The ARPA file is missing <unk>. Substituting probability " << config.unknown_missing_prob << "." << std::endl;
+ if (config.messages) *config.messages << "The ARPA file is missing <unk>. Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl;
break;
case Config::THROW_UP:
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing <unk> and the model is configured to throw an exception.");
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 70cfc2d2..5c71c792 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -28,16 +28,19 @@ namespace util {
* but it may be called multiple times when that's inconvenient.
*/
-inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
+
// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
#if BYTE_ORDER == LITTLE_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
return bit;
+}
#elif BYTE_ORDER == BIG_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
return 64 - length - bit;
+}
#else
#error "Bit packing code isn't written for your byte order."
#endif
-}
/* Pack integers up to 57 bits using their least significant digits.
* The length is specified using mask:
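
Note: the edit above hoists the byte-order conditional out of the function so
each case has a complete definition of BitPackShift (and the little-endian
version can mark length as unused). The two shift rules side by side, with
illustrative sample values:

    #include <cstdio>
    #include <stdint.h>

    // little-endian: bit offsets count up from the low end of the 64-bit word
    inline uint8_t ShiftLE(uint8_t bit, uint8_t /*length*/) { return bit; }
    // big-endian: count down from the high end, past the field's own length
    inline uint8_t ShiftBE(uint8_t bit, uint8_t length) { return 64 - length - bit; }

    int main() {
      std::printf("bit=3 length=7 -> LE shift %d, BE shift %d\n",
                  (int)ShiftLE(3, 7), (int)ShiftBE(3, 7));  // 3 and 54
      return 0;
    }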
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 077405f4..84f9fe7c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -9,11 +9,11 @@ Exception::Exception() throw() {}
Exception::~Exception() throw() {}
Exception::Exception(const Exception &from) : std::exception() {
- stream_.str(from.stream_.str());
+ stream_ << from.stream_.str();
}
Exception &Exception::operator=(const Exception &from) {
- stream_.str(from.stream_.str());
+ stream_ << from.stream_.str();
return *this;
}
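
Note: the switch from stream_.str(...) to stream_ << ... matters because
ostringstream::str(s) resets the put position to the start of the buffer, so
anything streamed into the copied exception afterwards would overwrite the
message instead of appending to it. Demonstrated on plain ostringstreams:

    #include <cstdio>
    #include <sstream>

    int main() {
      std::ostringstream a, b;
      a.str("copied"); a << "X";  // put position at 0: overwrites -> "Xopied"
      b << "copied";   b << "X";  // put position at end: appends -> "copiedX"
      std::printf("str(): %s  operator<<: %s\n", a.str().c_str(), b.str().c_str());
      return 0;
    }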
diff --git a/klm/util/have.hh b/klm/util/have.hh
index 7cf62008..f2f0cf90 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -2,8 +2,14 @@
#ifndef UTIL_HAVE__
#define UTIL_HAVE__
+#ifndef HAVE_ZLIB
#define HAVE_ZLIB
+#endif
+
// #define HAVE_ICU
+
+#ifndef HAVE_BOOST
#define HAVE_BOOST
+#endif
#endif // UTIL_HAVE__
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 325ba030..32033c19 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -64,6 +64,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("input_weights,w",po::value<string>(),"Input feature weights file")
+ ("frozen_features,z",po::value<string>(), "List of features not to optimize")
("training_data,t",po::value<string>(),"Training data corpus")
("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
@@ -254,6 +255,20 @@ int main(int argc, char** argv) {
if (conf.count("input_weights"))
weights.InitFromFile(conf["input_weights"].as<string>());
+ vector<int> frozen_fids;
+ if (conf.count("frozen_features")) {
+ ReadFile rf(conf["frozen_features"].as<string>());
+ istream& in = *rf.stream();
+ string line;
+ while(in) {
+ getline(in, line);
+ if (line.empty()) continue;
+ if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); }
+ frozen_fids.push_back(FD::Convert(line));
+ }
+ if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n";
+ }
+
vector<string> corpus;
vector<int> ids;
ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
@@ -284,7 +299,7 @@ int main(int argc, char** argv) {
const string omethod = conf["optimization_method"].as<string>();
if (omethod == "sgd") {
const double C = conf["regularization_strength"].as<double>();
- o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C));
+ o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids));
} else {
assert(!"fail");
}
diff --git a/training/online_optimizer.h b/training/online_optimizer.h
index 312aabae..28d89344 100644
--- a/training/online_optimizer.h
+++ b/training/online_optimizer.h
@@ -2,6 +2,7 @@
#define _ONL_OPTIMIZE_H_
#include <tr1/memory>
+#include <set>
#include <string>
#include <cmath>
#include "sparse_vector.h"
@@ -56,8 +57,12 @@ class OnlineOptimizer {
public:
virtual ~OnlineOptimizer();
OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
- size_t batch_size)
- : N_(batch_size),schedule_(s),k_() {}
+ size_t batch_size,
+ const std::vector<int>& frozen_feats = std::vector<int>())
+ : N_(batch_size),schedule_(s),k_() {
+ for (int i = 0; i < frozen_feats.size(); ++i)
+ frozen_.insert(frozen_feats[i]);
+ }
void ResetEpoch() { k_ = 0; ResetEpochImpl(); }
void UpdateWeights(const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
++k_;
@@ -69,6 +74,7 @@ class OnlineOptimizer {
virtual void ResetEpochImpl();
virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) = 0;
const size_t N_; // number of training instances per batch
+ std::set<int> frozen_; // frozen (non-optimizing) features
private:
std::tr1::shared_ptr<LearningRateSchedule> schedule_;
@@ -78,16 +84,21 @@ class OnlineOptimizer {
class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
public:
CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
- size_t training_instances, double C) :
- OnlineOptimizer(s, training_instances), C_(C), u_() {}
+ size_t training_instances, double C,
+ const std::vector<int>& frozen) :
+ OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {}
protected:
void ResetEpochImpl() { u_ = 0; }
void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
u_ += eta * C_ / N_;
- (*weights) += eta * approx_g;
+ for (SparseVector<double>::const_iterator it = approx_g.begin();
+ it != approx_g.end(); ++it) {
+ if (frozen_.count(it->first) == 0)
+ weights->add_value(it->first, eta * it->second);
+ }
for (int i = 1; i < max_feat; ++i)
- ApplyPenalty(i, weights);
+ if (frozen_.count(i) == 0) ApplyPenalty(i, weights);
}
private:
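
Note: the frozen-feature support threaded through the two hunks above skips
both the gradient step and the cumulative L1 penalty for any feature id in
frozen_, so those weights keep their input values exactly. The core of the
update in isolation (plain vectors stand in for SparseVector):

    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
      std::set<int> frozen;
      frozen.insert(1);  // feature id 1 is frozen
      std::vector<double> w(3, 0.5), g(3, 1.0);
      const double eta = 0.1;
      for (int i = 0; i < (int)w.size(); ++i)
        if (frozen.count(i) == 0)
          w[i] += eta * g[i];  // frozen ids are never touched
      std::printf("w = %g %g %g\n", w[0], w[1], w[2]);  // 0.6 0.5 0.6
      return 0;
    }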
diff --git a/training/optimize_test.cc b/training/optimize_test.cc
index 6fa5efd4..fe7ca70f 100644
--- a/training/optimize_test.cc
+++ b/training/optimize_test.cc
@@ -104,7 +104,7 @@ void TestOnline() {
double eta0 = 0.2;
shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
//shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
- CumulativeL1OnlineOptimizer opt(r, N, C);
+ CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>());
assert(r->eta(10) < r->eta(1));
}
diff --git a/utils/static_utoa.h b/utils/static_utoa.h
index d15ed35b..bb3d821f 100755
--- a/utils/static_utoa.h
+++ b/utils/static_utoa.h
@@ -7,7 +7,7 @@
namespace {
static const int utoa_bufsize=40; // 64bit safe.
static const int utoa_bufsizem1=utoa_bufsize-1; // 64bit safe.
-THREADLOCAL char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20]
+static char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20]
}
inline char *static_utoa(unsigned n) {
diff --git a/utils/tdict.cc b/utils/tdict.cc
index 23a298f8..c21b2b48 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -8,7 +8,6 @@
#include "dict.h"
#include "tdict.h"
#include "stringlib.h"
-#include "threadlocal.h"
using namespace std;
diff --git a/utils/threadlocal.h b/utils/threadlocal.h
deleted file mode 100755
index d79f5d9d..00000000
--- a/utils/threadlocal.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef THREADLOCAL_H
-#define THREADLOCAL_H
-
-#ifndef SETLOCAL_SWAP
-# define SETLOCAL_SWAP 0
-#endif
-
-#ifdef BOOST_NO_MT
-
-# define THREADLOCAL
-
-#else
-
-#ifdef _MSC_VER
-
-//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html)
-# define THREADLOCAL __declspec(thread)
-
-#else
-
-# define THREADLOCAL __thread
-
-#endif
-
-#endif
-
-#include <algorithm> //swap
-
-// naturally, the below are only thread-safe if value is THREADLOCAL
-template <class D>
-struct SaveLocal {
- D &value;
- D old_value;
- SaveLocal(D& val) : value(val), old_value(val) {}
- ~SaveLocal() {
-#if SETLOCAL_SWAP
- swap(value,old_value);
-#else
- value=old_value;
-#endif
- }
-};
-
-template <class D>
-struct SetLocal {
- D &value;
- D old_value;
- SetLocal(D& val,const D &new_value) : value(val), old_value(
-#if SETLOCAL_SWAP
- new_value
-#else
- val
-#endif
- ) {
-#if SETLOCAL_SWAP
- swap(value,old_value);
-#else
- value=new_value;
-#endif
- }
- ~SetLocal() {
-#if SETLOCAL_SWAP
- swap(value,old_value);
-#else
- value=old_value;
-#endif
- }
-};
-
-
-#endif
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 973a29ef..80d2471e 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -194,7 +194,6 @@ sub modbin {
my $src=$$_;
$$_="$bindir/".basename($src);
check_call("cp -p $src $$_");
- die "cp $src $$_ failed: $!" unless $? == 0;
}
}
sub dirsize {
@@ -372,13 +371,12 @@ while (1){
if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
$nmappers++;
- my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
my $jobid = check_output("$qcmd");
- die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0;
chomp $jobid;
$jobid =~ s/^(\d+)(.*?)$/\1/g;
$jobid =~ s/^Your job (\d+) .*$/\1/;
- push(@cleanupcmds, check_output("qdel $jobid 2> /dev/null"));
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
print STDERR " $jobid";
if ($joblist == "") { $joblist = $jobid; }
else {$joblist = $joblist . "\|" . $jobid; }
@@ -398,7 +396,7 @@ while (1){
print STDERR "Waiting for mappers to complete...\n";
while ($nmappers > 0) {
sleep 5;
- my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | grep -v ' C '")));
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
$nmappers = scalar @livejobs;
}
print STDERR "All mappers complete.\n";
@@ -419,7 +417,8 @@ while (1){
print STDERR "COMMAND:\n$cmd\n";
check_bash_call($cmd);
$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
- my $best=check_bash_output("$cmd"); chomp $best;
+ # sort returns failure even when it doesn't fail for some reason
+ my $best=unchecked_output("$cmd"); chomp $best;
print STDERR "$best\n";
my ($oa, $x, $xscore) = split /\|/, $best;
$score = $xscore;
@@ -452,7 +451,7 @@ while (1){
my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
print W "$k $v\n";
}
- check_call("rm -rf $dir/splag.$im1");
+ check_call("rm $dir/splag.$im1/*");
$inweights = $finalFile;
}
$lastWeightsFile = "$dir/weights.$iteration";
@@ -575,7 +574,11 @@ sub enseg {
while (my $line=<SRC>){
chomp $line;
if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
} else {
print NEWSRC "<seg id=\"$i\">$line</seg>\n";
}
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index 47b77c79..b4783f91 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -82,7 +82,7 @@ sub preview_files {
my @f=grep { ! ($skipempty && -z $_) } @$l;
my $fn=join(' ',map {escape_shell($_)} @f);
my $cmd="tail -n $n $fn";
- check_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+ unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
}
sub prefix_dirname($) {
#like `dirname but if ends in / then return the whole thing
@@ -283,7 +283,8 @@ sub numof_live_jobs {
if ($use_fork) {
die "not implemented";
} else {
- my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat")));
+ # We can probably continue decoding if the qstat error is only temporary
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
return ($#livejobs + 1);
}
}
@@ -323,7 +324,7 @@ sub launch_job {
}
if ($joblist == "") { $joblist = $jobid; }
else {$joblist = $joblist . "\|" . $jobid; }
- my $cleanfn=check_output("qdel $jobid 2> /dev/null");
+ my $cleanfn="qdel $jobid 2> /dev/null";
push(@cleanup_cmds, $cleanfn);
}
close QOUT;
@@ -346,7 +347,7 @@ sub launch_job_fork {
my ($fh, $scr_name) = get_temp_script();
print $fh $script;
close $fh;
- my $todo = "/bin/sh $scr_name 1> $outfile 2> $errorfile";
+ my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
print STDERR "EXEC: $todo\n";
my $out = check_output("$todo");
print STDERR "RES: $out\n";