-rw-r--r--  .gitignore                       |   1
-rw-r--r--  Makefile.am                      |   4
-rw-r--r--  SConstruct                       | 106
-rw-r--r--  configure.ac                     |  11
-rw-r--r--  decoder/Makefile.am              |  26
-rw-r--r--  decoder/decoder.cc               |  54
-rw-r--r--  decoder/ff_klm.cc                |   2
-rw-r--r--  decoder/ff_wordset.h             |   1
-rw-r--r--  decoder/hg.cc                    |   5
-rw-r--r--  decoder/hg.h                     |   2
-rw-r--r--  decoder/trule.cc                 |   7
-rw-r--r--  klm/lm/build_binary.cc           | 102
-rw-r--r--  klm/lm/config.cc                 |   2
-rw-r--r--  klm/lm/config.hh                 |   2
-rw-r--r--  klm/lm/model.cc                  |   2
-rw-r--r--  klm/lm/search_trie.cc            |  20
-rw-r--r--  klm/lm/vocab.cc                  |   2
-rw-r--r--  klm/util/bit_packing.hh          |   7
-rw-r--r--  klm/util/exception.cc            |   4
-rw-r--r--  klm/util/have.hh                 |   6
-rw-r--r--  training/mpi_online_optimize.cc  |  17
-rw-r--r--  training/online_optimizer.h      |  23
-rw-r--r--  training/optimize_test.cc        |   2
-rwxr-xr-x  utils/static_utoa.h              |   2
-rw-r--r--  utils/tdict.cc                   |   1
-rwxr-xr-x  utils/threadlocal.h              |  71
-rwxr-xr-x  vest/dist-vest.pl                |  17
-rwxr-xr-x  vest/parallelize.pl              |   9
28 files changed, 289 insertions, 219 deletions
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -120,3 +120,4 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar
 *.dvi
 *.ps
 *.toc
+*~
\ No newline at end of file
diff --git a/Makefile.am b/Makefile.am
index a808c211..bd46bd91 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,9 @@
 # warning - the subdirectories in the following list should
 # be kept in topologically sorted order. Also, DO NOT introduce
 # cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools
+
+#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
 
 AUTOMAKE_OPTIONS = foreign
 ACLOCAL_AMFLAGS = -I m4
diff --git a/SConstruct b/SConstruct
--- a/SConstruct
+++ b/SConstruct
@@ -1,23 +1,54 @@
-AddOption('--prefix',
-          dest='prefix',
-          type='string',
-          nargs=1,
-          action='store',
-          metavar='DIR',
-          help='installation prefix')
-
-AddOption('--with-boost',
-          dest='boost',
-          type='string',
-          nargs=1,
-          action='store',
-          metavar='DIR',
+#!/usr/bin/python
+
+# EXPERIMENTAL and HACKY version of cdec build in scons
+
+AddOption('--prefix', dest='prefix', type='string', nargs=1, action='store', metavar='DIR',
+          help='installation prefix')
+AddOption('--with-boost', dest='boost', type='string', nargs=1, action='store', metavar='DIR',
           help='boost installation directory (if in a non-standard location)')
+AddOption('--with-glc', dest='glc', type='string', nargs=1, action='store', metavar='DIR',
+          help='path to Global Lexical Coherence package (optional)')
+AddOption('--efence', dest='efence', action='store_true',
+          help='use electric fence for debugging memory corruptions')
+
+# TODO: Troll http://www.scons.org/wiki/SconsAutoconf
+# for some initial autoconf-like steps
 
 platform = ARGUMENTS.get('OS', Platform())
+include = Split('decoder utils klm mteval .')
+env = Environment(PREFIX=GetOption('prefix'),
+                  PLATFORM = platform,
+#                  BINDIR = bin,
+#                  INCDIR = include,
+#                  LIBDIR = lib,
+                  CPPPATH = include,
+                  LIBPATH = [],
+                  LIBS = Split('boost_program_options boost_serialization boost_thread z'),
+                  CCFLAGS=Split('-g -O3'))
+
+boost = GetOption('boost')
+if boost:
+  print 'Using Boost at {0}'.format(boost)
+  env.Append(CPPPATH=boost+'/include',
+             LIBPATH=boost+'/lib')
+
+if GetOption('efence'):
+  env.Append(LIBS=Split('efence Segfault'))
 
 srcs = []
-for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc']:
+
+# TODO: Get rid of config.h
+
+glc = GetOption('glc')
+if glc:
+  print 'Using Global Lexical Coherence package at {0}'.format(glc)
+  env.Append(CCFLAGS='-DHAVE_GLC',
+             CPPPATH=[glc, glc+'/cdec'])
+  srcs.append(glc+'/string_util.cc')
+  srcs.append(glc+'/feature-factory.cc')
+  srcs.append(glc+'/cdec/ff_glc.cc')
+
+for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc', 'vest/*.cc']:
   srcs.extend([ file for file in Glob(pattern)
                 if not 'test' in str(file)
                 and 'build_binary.cc' not in str(file)
@@ -25,27 +56,30 @@ for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mtev
                 and 'mbr_kbest.cc' not in str(file)
                 and 'sri.cc' not in str(file)
                 and 'fast_score.cc' not in str(file)
+                and 'cdec.cc' not in str(file)
+                and 'mr_' not in str(file)
               ])
 
-include = Split('decoder utils klm mteval .')
-libPaths = []
-
-boost = GetOption('boost')
-if boost:
-  include.append(boost+'/include')
-  libPaths.append(boost+'/lib')
-
-glcDir = None
-glcDir = '../GlobalLexicalCoherence'
-if glcDir:
-  include.append(glcDir)
+print 'Found {0} source files'.format(len(srcs))
 
+def comb(cc, srcs):
+  x = [cc]
+  x.extend(srcs)
+  return x
 
-env = Environment(PREFIX=GetOption('prefix'),
-                  PLATFORM = platform,
-#                  BINDIR = bin,
-                  INCDIR = include,
-#                  LIBDIR = lib,
-                  CPPPATH = [include, '.'],
-                  LIBPATH = libPaths,
-                  LIBS = Split('boost_program_options boost_serialization boost_thread z'))
-env.Program(target='decoder/cdec', source=srcs)
+env.Program(target='decoder/cdec', source=comb('decoder/cdec.cc', srcs))
+# TODO: The various decoder tests
+# TODO: extools
+env.Program(target='klm/lm/build_binary', source=comb('klm/lm/build_binary.cc', srcs))
+# TODO: klm ngram_query and tests
+env.Program(target='mteval/fast_score', source=comb('mteval/fast_score.cc', srcs))
+env.Program(target='mteval/mbr_kbest', source=comb('mteval/mbr_kbest.cc', srcs))
+#env.Program(target='mteval/scorer_test', source=comb('mteval/fast_score.cc', srcs))
+# TODO: phrasinator
+# TODO: Various training binaries
+env.Program(target='vest/sentserver', source=['vest/sentserver.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/sentclient', source=['vest/sentclient.c'], LINKFLAGS='-all-static')
+env.Program(target='vest/mr_vest_generate_mapper_input', source=comb('vest/mr_vest_generate_mapper_input.cc', srcs))
+env.Program(target='vest/mr_vest_map', source=comb('vest/mr_vest_map.cc', srcs))
+env.Program(target='vest/mr_vest_reduce', source=comb('vest/mr_vest_reduce.cc', srcs))
+#env.Program(target='vest/lo_test', source=comb('vest/lo_test.cc', srcs))
+# TODO: util tests
diff --git a/configure.ac b/configure.ac
index 56f08147..da66c3fb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,15 +3,20 @@ AM_INIT_AUTOMAKE(cdec,0.1)
 AC_CONFIG_HEADERS(config.h)
 AC_PROG_LIBTOOL
 AC_PROG_LEX
+case $LEX in
+:) AC_MSG_ERROR([No lex (Flex, lex, etc.) program found]);;
+esac
 AC_PROG_CC
 AC_PROG_CXX
 AC_LANG_CPLUSPLUS
 BOOST_REQUIRE
 BOOST_PROGRAM_OPTIONS
-BOOST_THREADS
+#BOOST_THREADS
 CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
+# $BOOST_THREAD_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
+# $BOOST_THREAD_LIBS"
 AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
                 [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 9cf4c3c4..244da2de 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -6,16 +6,13 @@ noinst_PROGRAMS = \
   hg_test \
   ff_test \
   parser_test \
-  grammar_test \
-  cfg_test
-TESTS = trule_test ff_test parser_test grammar_test hg_test cfg_test
-endif
-
-cdec_SOURCES = cdec.cc
-cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-cfg_test_SOURCES = cfg_test.cc
-cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+  grammar_test
+
+ # cfg_test
+TESTS = trule_test ff_test parser_test grammar_test hg_test
+# cfg_test
+#cfg_test_SOURCES = cfg_test.cc
+#cfg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 parser_test_SOURCES = parser_test.cc
 parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 ff_test_SOURCES = ff_test.cc
@@ -26,7 +23,12 @@ hg_test_SOURCES = hg_test.cc
 hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
 trule_test_SOURCES = trule_test.cc
 trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
+endif
+
+cdec_SOURCES = cdec.cc
+cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
 
 rule_lexer.cc: rule_lexer.l
 	$(LEX) -s -CF -8 -o$@ $<
@@ -82,5 +84,5 @@ libcdec_a_SOURCES = \
 
 if GLC
   # Until we build GLC as a library...
-  libcdec_a_SOURCES += ff_glc.cc
+  libcdec_a_SOURCES += ff_glc.cc string_util.cc feature-factory.cc
 endif
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 95ff6270..b7774acc 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -141,12 +141,13 @@ inline shared_ptr<FsaFeatureFunction> make_fsa_ff(string const& ffp,bool verbose
 // and then prune the resulting (rescored) hypergraph. All feature values from previous
 // passes are carried over into subsequent passes (where they may have different weights).
 struct RescoringPass {
-  RescoringPass() : density_prune(), beam_prune() {}
+  RescoringPass() : fid_summary(), density_prune(), beam_prune() {}
   shared_ptr<ModelSet> models;
   shared_ptr<IntersectionConfiguration> inter_conf;
   vector<const FeatureFunction*> ffs;
   shared_ptr<Weights> w;  // null == use previous weights
   vector<double> weight_vector;
+  int fid_summary;        // 0 == no summary feature
   double density_prune;   // 0 == don't density prune
   double beam_prune;      // 0 == don't beam prune
 };
@@ -155,6 +156,7 @@ ostream& operator<<(ostream& os, const RescoringPass& rp) {
   os << "[num_fn=" << rp.ffs.size();
   if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; }
   if (rp.w) os << " new_weights";
+  if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary);
   if (rp.density_prune) os << " density_prune=" << rp.density_prune;
   if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune;
   os << ']';
@@ -361,18 +363,21 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("weights,w",po::value<string>(),"Feature weights file (initial forest / pass 1)")
         ("feature_function,F",po::value<vector<string> >()->composing(), "Pass 1 additional feature function(s) (-L for list)")
         ("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Pass 1 intersection strategy for incorporating finite-state features; values include Cube_pruning, Full")
+        ("summary_feature", po::value<string>(), "Compute a 'summary feature' at the end of the pass (before any pruning) with name=arg and value=inside-outside/Z")
         ("density_prune", po::value<double>(), "Pass 1 pruning: keep no more than this many times the number of edges used in the best derivation tree (>=1.0)")
         ("beam_prune", po::value<double>(), "Pass 1 pruning: Prune paths from scored forest, keep paths within exp(alpha>=0)")
 
         ("weights2",po::value<string>(),"Optional pass 2")
         ("feature_function2",po::value<vector<string> >()->composing(), "Optional pass 2")
         ("intersection_strategy2",po::value<string>()->default_value("cube_pruning"), "Optional pass 2")
+        ("summary_feature2", po::value<string>(), "Optional pass 2")
         ("density_prune2", po::value<double>(), "Optional pass 2")
         ("beam_prune2", po::value<double>(), "Optional pass 2")
 
         ("weights3",po::value<string>(),"Optional pass 3")
         ("feature_function3",po::value<vector<string> >()->composing(), "Optional pass 3")
         ("intersection_strategy3",po::value<string>()->default_value("cube_pruning"), "Optional pass 3")
+        ("summary_feature3", po::value<string>(), "Optional pass 3")
         ("density_prune3", po::value<double>(), "Optional pass 3")
         ("beam_prune3", po::value<double>(), "Optional pass 3")
 
@@ -559,6 +564,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   for (int pass = 0; pass < MAX_PASSES; ++pass) {
     string ws = "weights" + StringSuffixForRescoringPass(pass);
     string ff = "feature_function" + StringSuffixForRescoringPass(pass);
+    string sf = "summary_feature" + StringSuffixForRescoringPass(pass);
     string bp = "beam_prune" + StringSuffixForRescoringPass(pass);
     string dp = "density_prune" + StringSuffixForRescoringPass(pass);
     bool first_pass_condition = ((pass == 0) && (conf.count(ff) || conf.count(bp) || conf.count(dp)));
@@ -583,6 +589,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         if (p->IsStateful()) { has_stateful = true; }
       }
     }
+    if (conf.count(sf)) {
+      rp.fid_summary = FD::Convert(conf[sf].as<string>());
+      assert(rp.fid_summary > 0);
+      // TODO assert that weights for this pass have coef(fid_summary) == 0.0?
+    }
     if (conf.count(bp)) { rp.beam_prune = conf[bp].as<double>(); }
     if (conf.count(dp)) { rp.density_prune = conf[dp].as<double>(); }
     int palg = (has_stateful ? 1 : 0);  // if there are no stateful featueres, default to FULL
@@ -794,6 +805,47 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       cerr << "  " << passtr << " partition     log(Z): " << log(z) << endl;
     }
 
+    if (rp.fid_summary) {
+#if 0
+      const prob_t z = forest.PushWeightsToGoal(1.0);
+      if (!SILENT) { cerr << "  " << passtr << " adding summary feature " << FD::Convert(rp.fid_summary) << " log(Z)=" << log(z) << endl; }
+      if (!isfinite(log(z)) || isnan(log(z))) {
+        cerr << "  " << passtr << " !!! Invalid partition detected, abandoning.\n";
+      } else {
+        for (int i = 0; i < forest.edges_.size(); ++i) {
+          const double log_prob_transition = log(forest.edges_[i].edge_prob_); // locally normalized by the edge
+                                                                               // head node by forest.PushWeightsToGoal
+          if (!isfinite(log_prob_transition) || isnan(log_prob_transition)) {
+            cerr << "Edge: i=" << i << " got bad inside prob: " << *forest.edges_[i].rule_ << endl;
+            abort();
+          }
+
+          forest.edges_[i].feature_values_.set_value(rp.fid_summary, log_prob_transition);
+        }
+        forest.Reweight(cur_weights);  // reset weights
+      }
+#endif
+      Hypergraph::EdgeProbs posts;
+      const prob_t z = forest.ComputeEdgePosteriors(1.0, &posts);
+      if (!isfinite(log(z)) || isnan(log(z))) {
+        cerr << "  " << passtr << " !!! Invalid partition detected, abandoning.\n";
+      } else {
+        for (int i = 0; i < forest.nodes_.size(); ++i) {
+          const Hypergraph::EdgesVector& in_edges = forest.nodes_[i].in_edges_;
+          prob_t node_post = prob_t(0);
+          for (int j = 0; j < in_edges.size(); ++j)
+            node_post += (posts[in_edges[j]] / z);
+          const double log_np = log(node_post);
+          if (!isfinite(log_np) || isnan(log_np)) {
+            cerr << "got bad posterior prob for node " << i << endl;
+            abort();
+          }
+          for (int j = 0; j < in_edges.size(); ++j)
+            forest.edges_[in_edges[j]].feature_values_.set_value(rp.fid_summary, exp(log_np));
+        }
+      }
+    }
+
     string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass);
     string fulldp = "density_prune" + StringSuffixForRescoringPass(pass);
     maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen);
diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc
index adc2c8bf..62908cdc 100644
--- a/decoder/ff_klm.cc
+++ b/decoder/ff_klm.cc
@@ -21,7 +21,7 @@ static const unsigned char MASK = 7;
 //   -n NAME : feature id is NAME
 bool ParseLMArgs(string const& in, string* filename, string* mapfile, bool* explicit_markers, string* featname) {
   vector<string> const& argv=SplitOnWhitespace(in);
-  *explicit_markers = true;
+  *explicit_markers = false;
   *featname="LanguageModel";
   *mapfile = "";
 #define LMSPEC_NEXTARG if (i==argv.end()) {            \
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
index 00e1145b..643097ef 100644
--- a/decoder/ff_wordset.h
+++ b/decoder/ff_wordset.h
@@ -32,6 +32,7 @@ class WordSet : public FeatureFunction {
 
   ~WordSet() { }
 
+  Features features() const { return single_feature(fid_); }
  protected:
   virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
diff --git a/decoder/hg.cc b/decoder/hg.cc
index 39ac5132..a4028b0e 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -226,9 +226,9 @@ prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) {
 }
 
 
-void Hypergraph::PushWeightsToGoal(double scale) {
+prob_t Hypergraph::PushWeightsToGoal(double scale) {
   vector<prob_t> posts;
-  ComputeEdgePosteriors(scale, &posts);
+  const prob_t inside_z = ComputeEdgePosteriors(scale, &posts);
   for (int i = 0; i < nodes_.size(); ++i) {
     const Hypergraph::Node& node = nodes_[i];
     prob_t z = prob_t::Zero();
@@ -238,6 +238,7 @@ void Hypergraph::PushWeightsToGoal(double scale) {
       edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z;
     }
   }
+  return inside_z;
 }
 
 struct EdgeExistsWeightFunction {
diff --git a/decoder/hg.h b/decoder/hg.h
index aa1202b1..e5ef05f8 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -449,7 +449,7 @@ public:
   void PushWeightsToSource(double scale = 1.0);
   // same, except weights are pushed to the goal, works for HGs,
   // not just lattices
-  void PushWeightsToGoal(double scale = 1.0);
+  prob_t PushWeightsToGoal(double scale = 1.0);
 
   // contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid. note that p_viterbi becomes 1; k*p_viterbi becomes k. also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight)
   // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after). returns inside(goal)
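On the hg.cc/hg.h change: PushWeightsToGoal still locally renormalizes each node's incoming edge posteriors so they sum to one, but it now also reports inside(goal) to the caller instead of discarding it. A toy sketch of the per-node step, again with hypothetical stand-in types rather than cdec's Hypergraph:

#include <cstdio>
#include <vector>

// Renormalize one node's in-edge posteriors in place and hand the
// precomputed inside score of the goal back to the caller, as the new
// prob_t return value does.
double PushWeightsToGoalNode(std::vector<double>* in_posts, double inside_z) {
  double z = 0;
  for (size_t j = 0; j < in_posts->size(); ++j) z += (*in_posts)[j];
  for (size_t j = 0; j < in_posts->size(); ++j) (*in_posts)[j] /= z;
  return inside_z;
}

int main() {
  std::vector<double> posts;
  posts.push_back(0.1); posts.push_back(0.3);
  const double z = PushWeightsToGoalNode(&posts, 0.4);
  std::printf("edge probs %g %g, inside(goal) = %g\n", posts[0], posts[1], z);
  return 0;
}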
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 9820e6d5..40235542 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -5,7 +5,6 @@
 #include "stringlib.h"
 #include "tdict.h"
 #include "rule_lexer.h"
-#include "threadlocal.h"
 
 using namespace std;
 
@@ -99,7 +98,7 @@ TRule* TRule::CreateRuleMonolingual(const string& rule) {
 
 namespace {
 // callback for lexer
-THREADLOCAL int n_assigned=0;
+int n_assigned=0;
 void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
   TRule *assignto=(TRule *)extra;
   *assignto=*new_rule;
@@ -145,7 +144,9 @@ bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
     getline(is, ss);
     //cerr << "L: " << ss << endl;
     int start = 0;
-    const int len = ss.size();
+    int len = ss.size();
+    const size_t ppos = ss.find(" |||");
+    if (ppos != string::npos) { len = ppos; }
     while (start < len) {
       while(start < len && (ss[start] == ' ' || ss[start] == ';'))
         ++start;
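The trule.cc change stops the word scanner at the first " |||" field separator, so any trailing fields on the rule line are no longer parsed as words. The same truncation in isolation (the sample string is made up):

#include <cstdio>
#include <string>

int main() {
  std::string ss = "der Hund ; die Katze ||| extra fields";  // hypothetical rule tail
  int len = ss.size();
  const size_t ppos = ss.find(" |||");
  if (ppos != std::string::npos) { len = ppos; }  // scan only up to the separator
  std::printf("parsed segment: '%s'\n", ss.substr(0, len).c_str());
  return 0;
}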
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index d6dd5994..920ff080 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,8 +15,9 @@ namespace ngram {
 namespace {
 
 void Usage(const char *name) {
-  std::cerr << "Usage: " << name << " [-u unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
-"-u sets the default probability for <unk> if the ARPA file does not have one.\n"
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
+"-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
+"one.\n"
 "-s allows models to be built even if they do not have <s> and </s>.\n\n"
 "type is one of probing, trie, or sorted:\n\n"
 "probing uses a probing hash table.  It is the fastest but uses the most memory.\n"
@@ -69,65 +70,58 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
 } // namespace
 } // namespace lm
 } // namespace
 
-void terminate_handler() {
-  try { throw; }
-  catch(const std::exception& e) {
-    std::cerr << e.what() << std::endl;
-  }
-  catch(...) {
-    std::cerr << "A non-standard exception was thrown." << std::endl;
-  }
-  std::abort();
-}
-
 int main(int argc, char *argv[]) {
   using namespace lm::ngram;
 
-  std::set_terminate(terminate_handler);
-
-  lm::ngram::Config config;
-  int opt;
-  while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
-    switch(opt) {
-      case 'u':
-        config.unknown_missing_prob = ParseFloat(optarg);
-        break;
-      case 'p':
-        config.probing_multiplier = ParseFloat(optarg);
-        break;
-      case 't':
-        config.temporary_directory_prefix = optarg;
-        break;
-      case 'm':
-        config.building_memory = ParseUInt(optarg) * 1048576;
-        break;
-      case 's':
-        config.sentence_marker_missing = lm::ngram::Config::SILENT;
-        break;
-      default:
-        Usage(argv[0]);
+  try {
+    lm::ngram::Config config;
+    int opt;
+    while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
+      switch(opt) {
+        case 'u':
+          config.unknown_missing_logprob = ParseFloat(optarg);
+          break;
+        case 'p':
+          config.probing_multiplier = ParseFloat(optarg);
+          break;
+        case 't':
+          config.temporary_directory_prefix = optarg;
+          break;
+        case 'm':
+          config.building_memory = ParseUInt(optarg) * 1048576;
+          break;
+        case 's':
+          config.sentence_marker_missing = lm::ngram::Config::SILENT;
+          break;
+        default:
+          Usage(argv[0]);
+      }
     }
-  }
-  if (optind + 1 == argc) {
-    ShowSizes(argv[optind], config);
-  } else if (optind + 2 == argc) {
-    config.write_mmap = argv[optind + 1];
-    ProbingModel(argv[optind], config);
-  } else if (optind + 3 == argc) {
-    const char *model_type = argv[optind];
-    const char *from_file = argv[optind + 1];
-    config.write_mmap = argv[optind + 2];
-    if (!strcmp(model_type, "probing")) {
-      ProbingModel(from_file, config);
-    } else if (!strcmp(model_type, "sorted")) {
-      SortedModel(from_file, config);
-    } else if (!strcmp(model_type, "trie")) {
-      TrieModel(from_file, config);
+    if (optind + 1 == argc) {
+      ShowSizes(argv[optind], config);
+    } else if (optind + 2 == argc) {
+      config.write_mmap = argv[optind + 1];
+      ProbingModel(argv[optind], config);
+    } else if (optind + 3 == argc) {
+      const char *model_type = argv[optind];
+      const char *from_file = argv[optind + 1];
+      config.write_mmap = argv[optind + 2];
+      if (!strcmp(model_type, "probing")) {
+        ProbingModel(from_file, config);
+      } else if (!strcmp(model_type, "sorted")) {
+        SortedModel(from_file, config);
+      } else if (!strcmp(model_type, "trie")) {
+        TrieModel(from_file, config);
+      } else {
+        Usage(argv[0]);
+      }
     } else {
       Usage(argv[0]);
     }
-  } else {
-    Usage(argv[0]);
+  }
+  catch (std::exception &e) {
+    std::cerr << e.what() << std::endl;
+    abort();
   }
   return 0;
 }
diff --git a/klm/lm/config.cc b/klm/lm/config.cc
index d8773fe5..71646e51 100644
--- a/klm/lm/config.cc
+++ b/klm/lm/config.cc
@@ -10,7 +10,7 @@ Config::Config() :
   enumerate_vocab(NULL),
   unknown_missing(COMPLAIN),
   sentence_marker_missing(THROW_UP),
-  unknown_missing_prob(0.0),
+  unknown_missing_logprob(-100.0),
   probing_multiplier(1.5),
   building_memory(1073741824ULL), // 1 GB
   temporary_directory_prefix(NULL),
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 17f67df3..1f7762be 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -36,7 +36,7 @@ struct Config {
 
   // The probability to substitute for <unk> if it's missing from the model.
   // No effect if the model has <unk> or unknown_missing == THROW_UP.
-  float unknown_missing_prob;
+  float unknown_missing_logprob;
 
   // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
   // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
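The config.cc/config.hh rename makes explicit that the -u value is a base-10 log probability, matching ARPA file convention; the new default of -100 stands for a probability of 10^-100 rather than the old literal 0.0, which is not a usable probability in log space. Concretely:

#include <cmath>
#include <cstdio>

int main() {
  const float unknown_missing_logprob = -100.0f;  // default from config.cc above
  // The substituted probability for <unk> is 10 raised to this value.
  std::printf("p(<unk>) = %g\n", std::pow(10.0, (double)unknown_missing_logprob));
  return 0;
}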
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index 14949e97..1492276a 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -86,7 +86,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
     assert(config.unknown_missing != Config::THROW_UP);
     // Default probabilities for unknown.
     search_.unigram.Unknown().backoff = 0.0;
-    search_.unigram.Unknown().prob = config.unknown_missing_prob;
+    search_.unigram.Unknown().prob = config.unknown_missing_logprob;
   }
   FinishFile(config, kModelType, counts, backing_);
 }
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 63631223..b830dfc3 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -535,13 +535,16 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
   }
 }
 
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, const std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
   {
     std::string unigram_name = file_prefix + "unigrams";
     util::scoped_fd unigram_file;
-    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), counts[0] * sizeof(ProbBackoff), unigram_file), counts[0] * sizeof(ProbBackoff));
+    // In case <unk> appears.
+    size_t extra_count = counts[0] + 1;
+    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), extra_count * sizeof(ProbBackoff), unigram_file), extra_count * sizeof(ProbBackoff));
     Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()));
     CheckSpecials(config, vocab);
+    if (!vocab.SawUnk()) ++counts[0];
   }
 
   // Only use as much buffer as we need.
@@ -572,7 +575,7 @@ bool HeadMatch(const WordIndex *words, const WordIndex *const words_end, const W
   return true;
 }
 
-// Counting phrase
+// Phase to count n-grams, including blanks inserted because they were pruned but have extensions
 class JustCount {
   public:
     JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, BitPackedMiddle * /*middle*/, BitPackedLongest &/*longest*/, uint64_t *counts, unsigned char order)
@@ -603,6 +606,7 @@ class JustCount {
     uint64_t *const counts_, *const longest_counts_;
 };
 
+// Phase to actually write n-grams to the trie.
 class WriteEntries {
   public:
     WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) :
@@ -764,7 +768,7 @@ template <class Doing> class RecursiveInsert {
 
 void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) {
   if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]);
-  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant");
+  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back());
   for (unsigned char i = 0; i < initial.size(); ++i) {
     if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected.  This shouldn't happen");
   }
@@ -789,6 +793,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
     RecursiveInsert<JustCount> counter(inputs, contexts, NULL, &*out.middle.begin(), out.longest, &*fixed_counts.begin(), counts.size());
     counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]);
   }
+  for (SortedFileReader *i = inputs; i < inputs + counts.size() - 1; ++i) {
+    if (!i->Ended()) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
+  }
   SanityCheckCounts(counts, fixed_counts);
   counts = fixed_counts;
 
@@ -805,7 +812,7 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
   }
 
   // Fill unigram probabilities.
-  {
+  try {
     std::string name(file_prefix + "unigrams");
     util::scoped_FILE file(OpenOrThrow(name.c_str(), "r"));
     for (WordIndex i = 0; i < counts[0]; ++i) {
@@ -816,6 +823,9 @@ void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, co
       }
     }
     RemoveOrThrow(name.c_str());
+  } catch (util::Exception &e) {
+    e << " while re-reading unigram probabilities";
+    throw;
   }
 
   // Do not disable this error message or else too little state will be returned.  Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc
index 415f8331..fd11ad2c 100644
--- a/klm/lm/vocab.cc
+++ b/klm/lm/vocab.cc
@@ -192,7 +192,7 @@ void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
     case Config::SILENT:
       return;
     case Config::COMPLAIN:
-      if (config.messages) *config.messages << "The ARPA file is missing <unk>.  Substituting probability " << config.unknown_missing_prob << "." << std::endl;
+      if (config.messages) *config.messages << "The ARPA file is missing <unk>.  Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl;
       break;
     case Config::THROW_UP:
       UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing <unk> and the model is configured to throw an exception.");
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 70cfc2d2..5c71c792 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -28,16 +28,19 @@ namespace util {
  * but it may be called multiple times when that's inconvenient.
  */
 
-inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
+// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
 #if BYTE_ORDER == LITTLE_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
   return bit;
+}
 #elif BYTE_ORDER == BIG_ENDIAN
+inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
   return 64 - length - bit;
+}
 #else
 #error "Bit packing code isn't written for your byte order."
 #endif
-}
 
 /* Pack integers up to 57 bits using their least significant digits.
 * The length is specified using mask:
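For the bit_packing.hh hunk: BitPackShift turns a bit offset into a shift amount for a 64-bit load, and the right amount depends on byte order; it is the offset itself on little-endian and the mirrored 64 - length - bit on big-endian. A standalone little-endian illustration (not klm's actual reader; the packed value and offsets are made up):

#include <stdint.h>
#include <cstdio>

inline uint8_t BitPackShiftLE(uint8_t bit, uint8_t /*length*/) { return bit; }

int main() {
  const uint8_t bit = 3, length = 4;
  const uint64_t word = 5ull << bit;               // value 5 packed at bit offset 3
  const uint64_t mask = (1ull << length) - 1;      // keep only `length` bits
  const uint64_t value = (word >> BitPackShiftLE(bit, length)) & mask;
  std::printf("unpacked %llu\n", (unsigned long long)value);  // prints 5
  return 0;
}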
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 077405f4..84f9fe7c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -9,11 +9,11 @@ Exception::Exception() throw() {}
 Exception::~Exception() throw() {}
 
 Exception::Exception(const Exception &from) : std::exception() {
-  stream_.str(from.stream_.str());
+  stream_ << from.stream_.str();
 }
 
 Exception &Exception::operator=(const Exception &from) {
-  stream_.str(from.stream_.str());
+  stream_ << from.stream_.str();
   return *this;
 }
diff --git a/klm/util/have.hh b/klm/util/have.hh
index 7cf62008..f2f0cf90 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -2,8 +2,14 @@
 #ifndef UTIL_HAVE__
 #define UTIL_HAVE__
 
+#ifndef HAVE_ZLIB
 #define HAVE_ZLIB
+#endif
+
 // #define HAVE_ICU
+
+#ifndef HAVE_BOOST
 #define HAVE_BOOST
+#endif
 
 #endif // UTIL_HAVE__
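The exception.cc change is subtle but matters for appends like the " while re-reading unigram probabilities" seen in search_trie.cc above: ostringstream::str(s) replaces the buffer but leaves the write position at the beginning, so the next << overwrites the copied text, whereas streaming the string in leaves the position at the end. A small demonstration of the pitfall:

#include <iostream>
#include <sstream>

int main() {
  std::ostringstream a, b;
  a.str("original message");  // write position stays at the start
  b << "original message";    // write position moves to the end
  a << '!';
  b << '!';
  std::cout << a.str() << '\n'   // "!riginal message" -- clobbered
            << b.str() << '\n';  // "original message!" -- appended
  return 0;
}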
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 325ba030..32033c19 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -64,6 +64,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("input_weights,w",po::value<string>(),"Input feature weights file")
+        ("frozen_features,z",po::value<string>(), "List of features not to optimize")
         ("training_data,t",po::value<string>(),"Training data corpus")
         ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
         ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
@@ -254,6 +255,20 @@ int main(int argc, char** argv) {
   if (conf.count("input_weights"))
     weights.InitFromFile(conf["input_weights"].as<string>());
 
+  vector<int> frozen_fids;
+  if (conf.count("frozen_features")) {
+    ReadFile rf(conf["frozen_features"].as<string>());
+    istream& in = *rf.stream();
+    string line;
+    while(in) {
+      getline(in, line);
+      if (line.empty()) continue;
+      if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); }
+      frozen_fids.push_back(FD::Convert(line));
+    }
+    if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n";
+  }
+
   vector<string> corpus;
   vector<int> ids;
   ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
@@ -284,7 +299,7 @@ int main(int argc, char** argv) {
       const string omethod = conf["optimization_method"].as<string>();
       if (omethod == "sgd") {
         const double C = conf["regularization_strength"].as<double>();
-        o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C));
+        o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids));
       } else {
         assert(!"fail");
       }
diff --git a/training/online_optimizer.h b/training/online_optimizer.h
index 312aabae..28d89344 100644
--- a/training/online_optimizer.h
+++ b/training/online_optimizer.h
@@ -2,6 +2,7 @@
 #define _ONL_OPTIMIZE_H_
 
 #include <tr1/memory>
+#include <set>
 #include <string>
 #include <cmath>
 #include "sparse_vector.h"
@@ -56,8 +57,12 @@ class OnlineOptimizer {
  public:
   virtual ~OnlineOptimizer();
   OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                  size_t batch_size)
-    : N_(batch_size),schedule_(s),k_() {}
+                  size_t batch_size,
+                  const std::vector<int>& frozen_feats = std::vector<int>())
+    : N_(batch_size),schedule_(s),k_() {
+    for (int i = 0; i < frozen_feats.size(); ++i)
+      frozen_.insert(frozen_feats[i]);
+  }
   void ResetEpoch() { k_ = 0; ResetEpochImpl(); }
   void UpdateWeights(const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
     ++k_;
@@ -69,6 +74,7 @@ class OnlineOptimizer {
   virtual void ResetEpochImpl();
   virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) = 0;
   const size_t N_; // number of training instances per batch
+  std::set<int> frozen_;   // frozen (non-optimizing) features
 
  private:
   std::tr1::shared_ptr<LearningRateSchedule> schedule_;
@@ -78,16 +84,21 @@ class OnlineOptimizer {
 class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
  public:
   CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                              size_t training_instances, double C) :
-    OnlineOptimizer(s, training_instances), C_(C), u_() {}
+                              size_t training_instances, double C,
+                              const std::vector<int>& frozen) :
+    OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {}
 
  protected:
   void ResetEpochImpl() { u_ = 0; }
   void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
     u_ += eta * C_ / N_;
-    (*weights) += eta * approx_g;
+    for (SparseVector<double>::const_iterator it = approx_g.begin();
+         it != approx_g.end(); ++it) {
+      if (frozen_.count(it->first) == 0)
+        weights->add_value(it->first, eta * it->second);
+    }
     for (int i = 1; i < max_feat; ++i)
-      ApplyPenalty(i, weights);
+      if (frozen_.count(i) == 0) ApplyPenalty(i, weights);
   }
 
  private:
diff --git a/training/optimize_test.cc b/training/optimize_test.cc
index 6fa5efd4..fe7ca70f 100644
--- a/training/optimize_test.cc
+++ b/training/optimize_test.cc
@@ -104,7 +104,7 @@ void TestOnline() {
   double eta0 = 0.2;
   shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
   //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
-  CumulativeL1OnlineOptimizer opt(r, N, C);
+  CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>());
   assert(r->eta(10) < r->eta(1));
 }
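The optimizer change excludes frozen feature ids from both the gradient step and the cumulative L1 penalty, so their weights stay exactly where the input weights file put them. A dense-vector toy version of the update (SparseVector and ApplyPenalty are cdec types, simplified away here; the numbers are made up):

#include <cstdio>
#include <set>
#include <vector>

int main() {
  std::vector<double> w(3, 0.5), g(3, 1.0);  // weights and minibatch gradient
  std::set<int> frozen;
  frozen.insert(1);                          // feature id 1 is frozen
  const double eta = 0.1;
  for (int i = 0; i < 3; ++i)
    if (frozen.count(i) == 0) w[i] += eta * g[i];  // gradient step, frozen skipped
  // The real UpdateWeightsImpl also skips ApplyPenalty(i, ...) for frozen ids.
  std::printf("%g %g %g\n", w[0], w[1], w[2]);     // 0.6 0.5 0.6
  return 0;
}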
diff --git a/utils/static_utoa.h b/utils/static_utoa.h
index d15ed35b..bb3d821f 100755
--- a/utils/static_utoa.h
+++ b/utils/static_utoa.h
@@ -7,7 +7,7 @@ namespace {
 static const int utoa_bufsize=40; // 64bit safe.
 static const int utoa_bufsizem1=utoa_bufsize-1; // 64bit safe.
-THREADLOCAL char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20]
+static char utoa_buf[utoa_bufsize]; // to put end of string character at buf[20]
 }
 
 inline char *static_utoa(unsigned n) {
diff --git a/utils/tdict.cc b/utils/tdict.cc
index 23a298f8..c21b2b48 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -8,7 +8,6 @@
 #include "dict.h"
 #include "tdict.h"
 #include "stringlib.h"
-#include "threadlocal.h"
 
 using namespace std;
diff --git a/utils/threadlocal.h b/utils/threadlocal.h
deleted file mode 100755
index d79f5d9d..00000000
--- a/utils/threadlocal.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef THREADLOCAL_H
-#define THREADLOCAL_H
-
-#ifndef SETLOCAL_SWAP
-# define SETLOCAL_SWAP 0
-#endif
-
-#ifdef BOOST_NO_MT
-
-# define THREADLOCAL
-
-#else
-
-#ifdef _MSC_VER
-
-//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html)
-# define THREADLOCAL __declspec(thread)
-
-#else
-
-# define THREADLOCAL __thread
-
-#endif
-
-#endif
-
-#include <algorithm> //swap
-
-// naturally, the below are only thread-safe if value is THREADLOCAL
-template <class D>
-struct SaveLocal {
-  D &value;
-  D old_value;
-  SaveLocal(D& val) : value(val), old_value(val) {}
-  ~SaveLocal() {
-#if SETLOCAL_SWAP
-    swap(value,old_value);
-#else
-    value=old_value;
-#endif
-  }
-};
-
-template <class D>
-struct SetLocal {
-  D &value;
-  D old_value;
-  SetLocal(D& val,const D &new_value) : value(val), old_value(
-#if SETLOCAL_SWAP
-    new_value
-#else
-    val
-#endif
-    ) {
-#if SETLOCAL_SWAP
-    swap(value,old_value);
-#else
-    value=new_value;
-#endif
-  }
-  ~SetLocal() {
-#if SETLOCAL_SWAP
-    swap(value,old_value);
-#else
-    value=old_value;
-#endif
-  }
-};
-
-
-#endif
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 973a29ef..80d2471e 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -194,7 +194,6 @@ sub modbin {
     my $src=$$_;
     $$_="$bindir/".basename($src);
     check_call("cp -p $src $$_");
-    die "cp $src $$_ failed: $!" unless $? == 0;
   }
 }
 sub dirsize {
@@ -372,13 +371,12 @@ while (1){
 	if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
 
 	$nmappers++;
-	my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+	my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
 	my $jobid = check_output("$qcmd");
-	die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0;
 	chomp $jobid;
 	$jobid =~ s/^(\d+)(.*?)$/\1/g;
 	$jobid =~ s/^Your job (\d+) .*$/\1/;
-	push(@cleanupcmds, check_output("qdel $jobid 2> /dev/null"));
+	push(@cleanupcmds, "qdel $jobid 2> /dev/null");
 	print STDERR " $jobid";
 	if ($joblist == "") { $joblist = $jobid; }
 	else {$joblist = $joblist . "\|" . $jobid; }
@@ -398,7 +396,7 @@ while (1){
 	print STDERR "Waiting for mappers to complete...\n";
 	while ($nmappers > 0) {
 	  sleep 5;
-	  my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | grep -v ' C '")));
+	  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
 	  $nmappers = scalar @livejobs;
 	}
 	print STDERR "All mappers complete.\n";
@@ -419,7 +417,8 @@ while (1){
 	print STDERR "COMMAND:\n$cmd\n";
 	check_bash_call($cmd);
 	$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
-	my $best=check_bash_output("$cmd"); chomp $best;
+	# sort returns failure even when it doesn't fail for some reason
+	my $best=unchecked_output("$cmd"); chomp $best;
 	print STDERR "$best\n";
 	my ($oa, $x, $xscore) = split /\|/, $best;
 	$score = $xscore;
@@ -452,7 +451,7 @@ while (1){
 		my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
 		print W "$k $v\n";
 	}
-	check_call("rm -rf $dir/splag.$im1");
+	check_call("rm $dir/splag.$im1/*");
 	$inweights = $finalFile;
 }
 $lastWeightsFile = "$dir/weights.$iteration";
@@ -575,7 +574,11 @@ sub enseg {
 	while (my $line=<SRC>){
 		chomp $line;
 		if ($line =~ /^\s*<seg/i) {
+		    if($line =~ /id="[0-9]+"/) {
 			print NEWSRC "$line\n";
+		    } else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		    }
 		} else {
 			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
 		}
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index 47b77c79..b4783f91 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -82,7 +82,7 @@ sub preview_files {
     my @f=grep { ! ($skipempty && -z $_) } @$l;
     my $fn=join(' ',map {escape_shell($_)} @f);
     my $cmd="tail -n $n $fn";
-    check_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+    unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
 }
 sub prefix_dirname($) { #like `dirname but if ends in / then return the whole thing
@@ -283,7 +283,8 @@ sub numof_live_jobs {
     if ($use_fork) {
 	die "not implemented";
     } else {
-	my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat")));
+	# We can probably continue decoding if the qstat error is only temporary
+	my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
 	return ($#livejobs + 1);
     }
 }
@@ -323,7 +324,7 @@ sub launch_job {
     }
     if ($joblist == "") { $joblist = $jobid; }
     else {$joblist = $joblist . "\|" . $jobid; }
-    my $cleanfn=check_output("qdel $jobid 2> /dev/null");
+    my $cleanfn="qdel $jobid 2> /dev/null";
     push(@cleanup_cmds, $cleanfn);
 }
 close QOUT;
@@ -346,7 +347,7 @@ sub launch_job_fork {
 	my ($fh, $scr_name) = get_temp_script();
 	print $fh $script;
 	close $fh;
-	my $todo = "/bin/sh $scr_name 1> $outfile 2> $errorfile";
+	my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
 	print STDERR "EXEC: $todo\n";
 	my $out = check_output("$todo");
 	print STDERR "RES: $out\n";