summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BUILDING2
-rw-r--r--decoder/Makefile.am1
-rw-r--r--decoder/ff_csplit.cc2
-rw-r--r--decoder/ff_klm.cc2
-rw-r--r--decoder/hg_sampler.cc73
-rw-r--r--decoder/hg_sampler.h27
-rw-r--r--environment/LocalConfig.pm43
-rw-r--r--klm/lm/bhiksha.hh5
-rw-r--r--klm/lm/build_binary.cc2
-rw-r--r--klm/lm/config.hh6
-rw-r--r--klm/lm/enumerate_vocab.hh2
-rw-r--r--klm/lm/left.hh43
-rw-r--r--klm/lm/model.cc4
-rw-r--r--klm/lm/read_arpa.cc2
-rw-r--r--klm/lm/search_hashed.cc16
-rw-r--r--klm/lm/search_trie.cc3
-rw-r--r--klm/lm/sri.cc108
-rw-r--r--klm/lm/sri.hh102
-rw-r--r--klm/lm/vocab.cc1
-rw-r--r--klm/lm/vocab.hh3
-rw-r--r--klm/util/mmap.cc2
-rw-r--r--klm/util/murmur_hash.cc15
-rw-r--r--klm/util/probing_hash_table.hh4
-rw-r--r--klm/util/tokenize_piece.hh75
-rw-r--r--klm/util/tokenize_piece_test.cc94
-rw-r--r--mira/kbest_mira.cc79
-rwxr-xr-xpro-train/dist-pro.pl114
-rw-r--r--pro-train/mr_pro_reduce.cc84
-rw-r--r--training/augment_grammar.cc2
-rw-r--r--training/test_ngram.cc2
-rwxr-xr-xvest/dist-vest.pl96
-rwxr-xr-xvest/parallelize.pl2
32 files changed, 587 insertions, 429 deletions
diff --git a/BUILDING b/BUILDING
index b7535d70..c7b954c7 100644
--- a/BUILDING
+++ b/BUILDING
@@ -33,7 +33,7 @@ Instructions for building
If you're building on cygwin, their libtool is buggy; this make command
works for now:
- make LIBS+="-loolm -ldstruct -lmisc -lz -lboost_program_options" \
+ make LIBS+="-lz -lboost_program_options" \
CFLAGS+="-Wno-sign-compare"
5) Test
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 6b9360d8..30eaf04d 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -51,6 +51,7 @@ libcdec_a_SOURCES = \
hg_io.cc \
decoder.cc \
hg_intersect.cc \
+ hg_sampler.cc \
factored_lexicon_helper.cc \
viterbi.cc \
lattice.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index dee6f4f9..3991d38f 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -155,7 +155,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl(
}
namespace {
-struct CSVMapper : public lm::ngram::EnumerateVocab {
+struct CSVMapper : public lm::EnumerateVocab {
CSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
void Add(lm::WordIndex index, const StringPiece &str) {
const WordID cdec_id = TD::Convert(str.as_string());
diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc
index ed6f731e..a4b26f7c 100644
--- a/decoder/ff_klm.cc
+++ b/decoder/ff_klm.cc
@@ -70,7 +70,7 @@ string KLanguageModel<Model>::usage(bool /*param*/,bool /*verbose*/) {
namespace {
-struct VMapper : public lm::ngram::EnumerateVocab {
+struct VMapper : public lm::EnumerateVocab {
VMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
void Add(lm::WordIndex index, const StringPiece &str) {
const WordID cdec_id = TD::Convert(str.as_string());
diff --git a/decoder/hg_sampler.cc b/decoder/hg_sampler.cc
new file mode 100644
index 00000000..cdf0ec3c
--- /dev/null
+++ b/decoder/hg_sampler.cc
@@ -0,0 +1,73 @@
+#include "hg_sampler.h"
+
+#include <queue>
+
+#include "viterbi.h"
+#include "inside_outside.h"
+
+using namespace std;
+
+struct SampledDerivationWeightFunction {
+ typedef double Weight;
+ explicit SampledDerivationWeightFunction(const vector<bool>& sampled) : sampled_edges(sampled) {}
+ double operator()(const Hypergraph::Edge& e) const {
+ return static_cast<double>(sampled_edges[e.id_]);
+ }
+ const vector<bool>& sampled_edges;
+};
+
+void HypergraphSampler::sample_hypotheses(const Hypergraph& hg,
+ unsigned n,
+ MT19937* rng,
+ vector<Hypothesis>* hypos) {
+ hypos->clear();
+ hypos->resize(n);
+
+ // compute inside probabilities
+ vector<prob_t> node_probs;
+ Inside<prob_t, EdgeProb>(hg, &node_probs, EdgeProb());
+
+ vector<bool> sampled_edges(hg.edges_.size());
+ queue<unsigned> q;
+ SampleSet<prob_t> ss;
+ for (unsigned i = 0; i < n; ++i) {
+ fill(sampled_edges.begin(), sampled_edges.end(), false);
+ // sample derivation top down
+ assert(q.empty());
+ Hypothesis& hyp = (*hypos)[i];
+ SparseVector<double>& deriv_features = hyp.fmap;
+ q.push(hg.nodes_.size() - 1);
+ prob_t& model_score = hyp.model_score;
+ model_score = prob_t::One();
+ while(!q.empty()) {
+ unsigned cur_node_id = q.front();
+ q.pop();
+ const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+ const unsigned num_in_edges = node.in_edges_.size();
+ unsigned sampled_edge_idx = 0;
+ if (num_in_edges == 1) {
+ sampled_edge_idx = node.in_edges_[0];
+ } else {
+ assert(num_in_edges > 1);
+ ss.clear();
+ for (unsigned j = 0; j < num_in_edges; ++j) {
+ const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+ prob_t p = edge.edge_prob_; // edge weight
+ for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+ p *= node_probs[edge.tail_nodes_[k]]; // tail node inside weight
+ ss.add(p);
+ }
+ sampled_edge_idx = node.in_edges_[rng->SelectSample(ss)];
+ }
+ sampled_edges[sampled_edge_idx] = true;
+ const Hypergraph::Edge& sampled_edge = hg.edges_[sampled_edge_idx];
+ deriv_features += sampled_edge.feature_values_;
+ model_score *= sampled_edge.edge_prob_;
+ //sampled_deriv->push_back(sampled_edge_idx);
+ for (unsigned j = 0; j < sampled_edge.tail_nodes_.size(); ++j) {
+ q.push(sampled_edge.tail_nodes_[j]);
+ }
+ }
+ Viterbi(hg, &hyp.words, ESentenceTraversal(), SampledDerivationWeightFunction(sampled_edges));
+ }
+}
diff --git a/decoder/hg_sampler.h b/decoder/hg_sampler.h
new file mode 100644
index 00000000..bf4e1eb0
--- /dev/null
+++ b/decoder/hg_sampler.h
@@ -0,0 +1,27 @@
+#ifndef _HG_SAMPLER_H_
+#define _HG_SAMPLER_H_
+
+
+#include <vector>
+#include "sparse_vector.h"
+#include "sampler.h"
+#include "wordid.h"
+
+class Hypergraph;
+
+struct HypergraphSampler {
+
+ struct Hypothesis {
+ std::vector<WordID> words;
+ SparseVector<double> fmap;
+ prob_t model_score; // log unnormalized probability
+ };
+
+ static void
+ sample_hypotheses(const Hypergraph& hg,
+ unsigned n, // how many samples to draw
+ MT19937* rng,
+ std::vector<Hypothesis>* hypos);
+};
+
+#endif
diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm
index db83696c..252463aa 100644
--- a/environment/LocalConfig.pm
+++ b/environment/LocalConfig.pm
@@ -4,7 +4,7 @@ use strict;
use warnings;
use base 'Exporter';
-our @EXPORT = qw( qsub_args mert_memory environment_name );
+our @EXPORT = qw( qsub_args mert_memory environment_name env_default_jobs has_qsub );
use Net::Domain qw(hostname hostfqdn hostdomain domainname);
@@ -14,43 +14,58 @@ my $host = domainname;
my $CCONFIG = {
'StarCluster' => {
'HOST_REGEXP' => qr/compute-\d+\.internal$/,
+ 'JobControl' => 'qsub',
'QSubMemFlag' => '-l mem',
+ 'DefaultJobs' => 20,
},
'LTICluster' => {
'HOST_REGEXP' => qr/^cluster\d+\.lti\.cs\.cmu\.edu$/,
+ 'JobControl' => 'qsub',
'QSubMemFlag' => '-l h_vmem=',
'QSubExtraFlags' => '-l walltime=0:45:00',
+ 'DefaultJobs' => 15,
#'QSubQueue' => '-q long',
},
'UMIACS' => {
'HOST_REGEXP' => qr/^d.*\.umiacs\.umd\.edu$/,
+ 'JobControl' => 'qsub',
'QSubMemFlag' => '-l pmem=',
'QSubQueue' => '-q batch',
'QSubExtraFlags' => '-l walltime=144:00:00',
+ 'DefaultJobs' => 15,
},
'CLSP' => {
'HOST_REGEXP' => qr/\.clsp\.jhu\.edu$/,
+ 'JobControl' => 'qsub',
'QSubMemFlag' => '-l mem_free=',
'MERTMem' => '9G',
+ 'DefaultJobs' => 15,
},
'Valhalla' => {
'HOST_REGEXP' => qr/^(thor|tyr)\.inf\.ed\.ac\.uk$/,
+ 'JobControl' => 'fork',
+ 'DefaultJobs' => 8,
},
'Blacklight' => {
'HOST_REGEXP' => qr/^(tg-login1.blacklight.psc.teragrid.org|blacklight.psc.edu|bl1.psc.teragrid.org|bl0.psc.teragrid.org)$/,
- 'QSubMemFlag' => '-l pmem=',
+ 'JobControl' => 'fork',
+ 'DefaultJobs' => 32,
},
'Barrow/Chicago' => {
'HOST_REGEXP' => qr/^(barrow|chicago).lti.cs.cmu.edu$/,
- 'QSubMemFlag' => '-l pmem=',
+ 'JobControl' => 'fork',
+ 'DefaultJobs' => 8,
},
'OxfordDeathSnakes' => {
'HOST_REGEXP' => qr/^(taipan|tiger).cs.ox.ac.uk$/,
- 'QSubMemFlag' => '-l pmem=',
+ 'JobControl' => 'fork',
+ 'DefaultJobs' => 12,
},
- 'LOCALx' => {
- 'HOST_REGEXP' => qr/local\./,
+ 'LOCAL' => { # LOCAL must be last in the list!!!
+ 'HOST_REGEXP' => qr//,
'QSubMemFlag' => ' ',
+ 'JobControl' => 'fork',
+ 'DefaultJobs' => 2,
},
'LOCAL' => {
'HOST_REGEXP' => qr/coltrane/,
@@ -58,7 +73,7 @@ my $CCONFIG = {
},
};
-our $senvironment_name;
+our $senvironment_name = 'LOCAL';
for my $config_key (keys %$CCONFIG) {
my $re = $CCONFIG->{$config_key}->{'HOST_REGEXP'};
die "Can't find HOST_REGEXP for $config_key" unless $re;
@@ -67,15 +82,23 @@ for my $config_key (keys %$CCONFIG) {
}
}
-die "NO ENVIRONMENT INFO FOR HOST: $host\nPLEASE EDIT LocalConfig.pm\n" unless $senvironment_name;
-
our %CONFIG = %{$CCONFIG->{$senvironment_name}};
-print STDERR "**Environment: $senvironment_name\n";
+print STDERR "**Environment: $senvironment_name";
+print STDERR " (has qsub)" if has_qsub();
+print STDERR "\n";
+
+sub has_qsub {
+ return ($CONFIG{'JobControl'} eq 'qsub');
+}
sub environment_name {
return $senvironment_name;
}
+sub env_default_jobs {
+ return 1 * $CONFIG{'DefaultJobs'};
+}
+
sub qsub_args {
my $mem = shift @_;
die "qsub_args requires a memory amount as a parameter, e.g. 4G" unless $mem;
diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh
index bc705959..3df43dda 100644
--- a/klm/lm/bhiksha.hh
+++ b/klm/lm/bhiksha.hh
@@ -10,6 +10,9 @@
* Currently only used for next pointers.
*/
+#ifndef LM_BHIKSHA__
+#define LM_BHIKSHA__
+
#include <inttypes.h>
#include <assert.h>
@@ -108,3 +111,5 @@ class ArrayBhiksha {
} // namespace trie
} // namespace ngram
} // namespace lm
+
+#endif // LM_BHIKSHA__
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc
index b7aee4de..fdb62a71 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary.cc
@@ -15,7 +15,7 @@ namespace ngram {
namespace {
void Usage(const char *name) {
- std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-c bits] [type] input.arpa [output.mmap]\n\n"
+ std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
diff --git a/klm/lm/config.hh b/klm/lm/config.hh
index 227b8512..8564661b 100644
--- a/klm/lm/config.hh
+++ b/klm/lm/config.hh
@@ -8,10 +8,12 @@
/* Configuration for ngram model. Separate header to reduce pollution. */
-namespace lm { namespace ngram {
-
+namespace lm {
+
class EnumerateVocab;
+namespace ngram {
+
struct Config {
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
diff --git a/klm/lm/enumerate_vocab.hh b/klm/lm/enumerate_vocab.hh
index e734316b..27263621 100644
--- a/klm/lm/enumerate_vocab.hh
+++ b/klm/lm/enumerate_vocab.hh
@@ -5,7 +5,6 @@
#include "util/string_piece.hh"
namespace lm {
-namespace ngram {
/* If you need the actual strings in the vocabulary, inherit from this class
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
@@ -23,7 +22,6 @@ class EnumerateVocab {
EnumerateVocab() {}
};
-} // namespace ngram
} // namespace lm
#endif // LM_ENUMERATE_VOCAB__
diff --git a/klm/lm/left.hh b/klm/lm/left.hh
index bb3f5539..41f71f84 100644
--- a/klm/lm/left.hh
+++ b/klm/lm/left.hh
@@ -175,22 +175,14 @@ template <class M> class RuleScore {
float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
float *back = backoffs, *back2 = backoffs2;
- unsigned char next_use;
- FullScoreReturn ret;
- ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use));
- if (!next_use) {
- left_done_ = true;
- out_.right = in.right;
- return;
- }
- unsigned char extend_length = 2;
- for (const uint64_t *i = in.left.pointers + 1; i < in.left.pointers + in.left.length; ++i, ++extend_length) {
- ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use));
- if (!next_use) {
- left_done_ = true;
- out_.right = in.right;
- return;
- }
+ unsigned char next_use = out_.right.length;
+
+ // First word
+ if (ExtendLeft(in, next_use, 1, out_.right.backoff, back)) return;
+
+ // Words after the first, so extending a bigram to begin with
+ for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) {
+ if (ExtendLeft(in, next_use, extend_length, back, back2)) return;
std::swap(back, back2);
}
@@ -226,6 +218,25 @@ template <class M> class RuleScore {
}
private:
+ bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) {
+ ProcessRet(model_.ExtendLeft(
+ out_.right.words, out_.right.words + next_use, // Words to extend into
+ back_in, // Backoffs to use
+ in.left.pointers[extend_length - 1], extend_length, // Words to be extended
+ back_out, // Backoffs for the next score
+ next_use)); // Length of n-gram to use in next scoring.
+ if (next_use != out_.right.length) {
+ left_done_ = true;
+ if (!next_use) {
+ out_.right = in.right;
+ // Early exit.
+ return true;
+ }
+ }
+ // Continue scoring.
+ return false;
+ }
+
void ProcessRet(const FullScoreReturn &ret) {
prob_ += ret.prob;
if (left_done_) return;
diff --git a/klm/lm/model.cc b/klm/lm/model.cc
index 25f1ab7c..e4c1ec1d 100644
--- a/klm/lm/model.cc
+++ b/klm/lm/model.cc
@@ -91,8 +91,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
- if (ret.ngram_length - 1 < in_state.length) {
- ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob);
+ for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {
+ ret.prob += *i;
}
return ret;
}
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index 455bc4ba..dce73f77 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -38,6 +38,8 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
}
if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic)
UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
+ UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?");
+ UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst.");
UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\.");
}
while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc
index 334adf12..247832b0 100644
--- a/klm/lm/search_hashed.cc
+++ b/klm/lm/search_hashed.cc
@@ -87,14 +87,14 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
ReadNGramHeader(f, n);
// vocab ids of words in reverse order
- WordIndex vocab_ids[n];
- uint64_t keys[n - 1];
+ std::vector<WordIndex> vocab_ids(n);
+ std::vector<uint64_t> keys(n-1);
typename Store::Packing::Value value;
typename Middle::MutableIterator found;
for (size_t i = 0; i < count; ++i) {
- ReadNGram(f, n, vocab, vocab_ids, value, warn);
+ ReadNGram(f, n, vocab, &*vocab_ids.begin(), value, warn);
- keys[0] = detail::CombineWordHash(static_cast<uint64_t>(*vocab_ids), vocab_ids[1]);
+ keys[0] = detail::CombineWordHash(static_cast<uint64_t>(vocab_ids.front()), vocab_ids[1]);
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
@@ -106,9 +106,9 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
util::FloatEnc fix_prob;
for (lower = n - 3; ; --lower) {
if (lower == -1) {
- fix_prob.f = unigrams[vocab_ids[0]].prob;
+ fix_prob.f = unigrams[vocab_ids.front()].prob;
fix_prob.i &= ~util::kSignBit;
- unigrams[vocab_ids[0]].prob = fix_prob.f;
+ unigrams[vocab_ids.front()].prob = fix_prob.f;
break;
}
if (middle[lower].UnsafeMutableFind(keys[lower], found)) {
@@ -120,8 +120,8 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
break;
}
}
- if (lower != static_cast<int>(n) - 3) FixSRI(lower, fix_prob.f, n, keys, vocab_ids, unigrams, middle);
- activate(vocab_ids, n);
+ if (lower != static_cast<int>(n) - 3) FixSRI(lower, fix_prob.f, n, &*keys.begin(), &*vocab_ids.begin(), unigrams, middle);
+ activate(&*vocab_ids.begin(), n);
}
store.FinishedInserting();
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 5d8c70db..4bd3f4ee 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -358,6 +358,7 @@ template <class Doing> class BlankManager {
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
basis_[blank - 1] = kBadProb;
}
+ *pre = *cur;
been_length_ = length;
}
@@ -493,7 +494,7 @@ template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_pre
util::scoped_FILE unigram_file;
{
std::string name(file_prefix + "unigrams");
- unigram_file.reset(OpenOrThrow(name.c_str(), "r"));
+ unigram_file.reset(OpenOrThrow(name.c_str(), "r+"));
util::RemoveOrThrow(name.c_str());
}
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
diff --git a/klm/lm/sri.cc b/klm/lm/sri.cc
deleted file mode 100644
index 825f699b..00000000
--- a/klm/lm/sri.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "lm/lm_exception.hh"
-#include "lm/sri.hh"
-
-#include <Ngram.h>
-#include <Vocab.h>
-
-#include <errno.h>
-
-namespace lm {
-namespace sri {
-
-Vocabulary::Vocabulary() : sri_(new Vocab) {}
-
-Vocabulary::~Vocabulary() {}
-
-WordIndex Vocabulary::Index(const char *str) const {
- WordIndex ret = sri_->getIndex(str);
- // NGram wants the index of Vocab_Unknown for unknown words, but for some reason SRI returns Vocab_None here :-(.
- if (ret == Vocab_None) {
- return not_found_;
- } else {
- return ret;
- }
-}
-
-const char *Vocabulary::Word(WordIndex index) const {
- return sri_->getWord(index);
-}
-
-void Vocabulary::FinishedLoading() {
- SetSpecial(
- sri_->ssIndex(),
- sri_->seIndex(),
- sri_->unkIndex());
-}
-
-namespace {
-Ngram *MakeSRIModel(const char *file_name, unsigned int ngram_length, Vocab &sri_vocab) {
- sri_vocab.unkIsWord() = true;
- std::auto_ptr<Ngram> ret(new Ngram(sri_vocab, ngram_length));
- File file(file_name, "r");
- errno = 0;
- if (!ret->read(file)) {
- UTIL_THROW(FormatLoadException, "reading file " << file_name << " with SRI failed.");
- }
- return ret.release();
-}
-} // namespace
-
-Model::Model(const char *file_name, unsigned int ngram_length) : sri_(MakeSRIModel(file_name, ngram_length, *vocab_.sri_)) {
- if (!sri_->setorder()) {
- UTIL_THROW(FormatLoadException, "Can't have an SRI model with order 0.");
- }
- vocab_.FinishedLoading();
- State begin_state = State();
- begin_state.valid_length_ = 1;
- if (kMaxOrder > 1) {
- begin_state.history_[0] = vocab_.BeginSentence();
- if (kMaxOrder > 2) begin_state.history_[1] = Vocab_None;
- }
- State null_state = State();
- null_state.valid_length_ = 0;
- if (kMaxOrder > 1) null_state.history_[0] = Vocab_None;
- Init(begin_state, null_state, vocab_, sri_->setorder());
- not_found_ = vocab_.NotFound();
-}
-
-Model::~Model() {}
-
-namespace {
-
-/* Argh SRI's wordProb knows the ngram length but doesn't return it. One more
- * reason you should use my model. */
-// TODO(stolcke): fix SRILM so I don't have to do this.
-unsigned int MatchedLength(Ngram &model, const WordIndex new_word, const SRIVocabIndex *const_history) {
- unsigned int out_length = 0;
- // This gets the length of context used, which is ngram_length - 1 unless new_word is OOV in which case it is 0.
- model.contextID(new_word, const_history, out_length);
- return out_length + 1;
-}
-
-} // namespace
-
-FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
- // If you get a compiler in this function, change SRIVocabIndex in sri.hh to match the one found in SRI's Vocab.h.
- const SRIVocabIndex *const_history;
- SRIVocabIndex local_history[Order()];
- if (in_state.valid_length_ < kMaxOrder - 1) {
- const_history = in_state.history_;
- } else {
- std::copy(in_state.history_, in_state.history_ + in_state.valid_length_, local_history);
- local_history[in_state.valid_length_] = Vocab_None;
- const_history = local_history;
- }
- FullScoreReturn ret;
- ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
- out_state.history_[0] = new_word;
- out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
- std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
- if (out_state.valid_length_ < kMaxOrder - 1) {
- out_state.history_[out_state.valid_length_] = Vocab_None;
- }
- ret.prob = sri_->wordProb(new_word, const_history);
- return ret;
-}
-
-} // namespace sri
-} // namespace lm
diff --git a/klm/lm/sri.hh b/klm/lm/sri.hh
deleted file mode 100644
index b57e9b73..00000000
--- a/klm/lm/sri.hh
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef LM_SRI__
-#define LM_SRI__
-
-#include "lm/facade.hh"
-#include "util/murmur_hash.hh"
-
-#include <cmath>
-#include <exception>
-#include <memory>
-
-class Ngram;
-class Vocab;
-
-/* The ngram length reported uses some random API I found and may be wrong.
- *
- * See ngram, which should return equivalent results.
- */
-
-namespace lm {
-namespace sri {
-
-static const unsigned int kMaxOrder = 6;
-
-/* This should match VocabIndex found in SRI's Vocab.h
- * The reason I define this here independently is that SRI's headers
- * pollute and increase compile time.
- * It's difficult to extract this from their header and anyway would
- * break packaging.
- * If these differ there will be a compiler error in ActuallyCall.
- */
-typedef unsigned int SRIVocabIndex;
-
-class State {
- public:
- // You shouldn't need to touch these, but they're public so State will be a POD.
- // If valid_length_ < kMaxOrder - 1 then history_[valid_length_] == Vocab_None.
- SRIVocabIndex history_[kMaxOrder - 1];
- unsigned char valid_length_;
-};
-
-inline bool operator==(const State &left, const State &right) {
- if (left.valid_length_ != right.valid_length_) {
- return false;
- }
- for (const SRIVocabIndex *l = left.history_, *r = right.history_;
- l != left.history_ + left.valid_length_;
- ++l, ++r) {
- if (*l != *r) return false;
- }
- return true;
-}
-
-inline size_t hash_value(const State &state) {
- return util::MurmurHashNative(&state.history_, sizeof(SRIVocabIndex) * state.valid_length_);
-}
-
-class Vocabulary : public base::Vocabulary {
- public:
- Vocabulary();
-
- ~Vocabulary();
-
- WordIndex Index(const StringPiece &str) const {
- std::string temp(str.data(), str.length());
- return Index(temp.c_str());
- }
- WordIndex Index(const std::string &str) const {
- return Index(str.c_str());
- }
- WordIndex Index(const char *str) const;
-
- const char *Word(WordIndex index) const;
-
- private:
- friend class Model;
- void FinishedLoading();
-
- // The parent class isn't copyable so auto_ptr is the same as scoped_ptr
- // but without the boost dependence.
- mutable std::auto_ptr<Vocab> sri_;
-};
-
-class Model : public base::ModelFacade<Model, State, Vocabulary> {
- public:
- Model(const char *file_name, unsigned int ngram_length);
-
- ~Model();
-
- FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
-
- private:
- Vocabulary vocab_;
-
- mutable std::auto_ptr<Ngram> sri_;
-
- WordIndex not_found_;
-};
-
-} // namespace sri
-} // namespace lm
-
-#endif // LM_SRI__
diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc
index 03b0767a..ffec41ca 100644
--- a/klm/lm/vocab.cc
+++ b/klm/lm/vocab.cc
@@ -135,6 +135,7 @@ void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
ReadWords(fd, to);
SetSpecial(Index("<s>"), Index("</s>"), 0);
+ bound_ = end_ - begin_ + 1;
}
namespace {
diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh
index 41e97052..3c3414fb 100644
--- a/klm/lm/vocab.hh
+++ b/klm/lm/vocab.hh
@@ -15,10 +15,10 @@
namespace lm {
class ProbBackoff;
+class EnumerateVocab;
namespace ngram {
class Config;
-class EnumerateVocab;
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len);
@@ -66,7 +66,6 @@ class SortedVocabulary : public base::Vocabulary {
static size_t Size(std::size_t entries, const Config &config);
// Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
- // While this number is correct, ProbingVocabulary::Bound might not be correct in some cases.
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 5ce7adc9..279bafa8 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -15,7 +15,7 @@ namespace util {
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- // Thanks Denis Filimonov for pointing on NFS likes msync first.
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
abort();
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index fec47fd9..ef5783fe 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -117,13 +117,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
return h;
}
+// Trick to test for 64-bit architecture at compile time.
+namespace {
+template <unsigned L> uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) {
+ return MurmurHash64A(key, len, seed);
+}
+template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) {
+ return MurmurHash64B(key, len, seed);
+}
+} // namespace
uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
- if (sizeof(int) == 4) {
- return MurmurHash64B(key, len, seed);
- } else {
- return MurmurHash64A(key, len, seed);
- }
+ return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
}
} // namespace util
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 2ec342a6..8122d69c 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -61,14 +61,14 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
#endif
{}
- template <class T> void Insert(const T &t) {
+ template <class T> MutableIterator Insert(const T &t) {
if (++entries_ >= buckets_)
UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
- if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+ if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
if (++i == end_) { i = begin_; }
}
}
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index ee1c7ab2..413bda0b 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -5,6 +5,9 @@
#include <boost/iterator/iterator_facade.hpp>
+#include <algorithm>
+#include <iostream>
+
/* Usage:
*
* for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
@@ -64,6 +67,78 @@ template <char d> class PieceIterator : public boost::iterator_facade<PieceItera
StringPiece after_;
};
+class MultiCharacter {
+ public:
+ explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
+
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
+ }
+
+ private:
+ StringPiece delimiter_;
+};
+
+class AnyCharacter {
+ public:
+ explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
+
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
+ }
+
+ private:
+ StringPiece chars_;
+};
+
+template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
+ public:
+ TokenIter() {}
+
+ TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) {
+ increment();
+ }
+
+ bool operator!() const {
+ return current_.data() == 0;
+ }
+ operator bool() const {
+ return current_.data() != 0;
+ }
+
+ static TokenIter<Find> end() {
+ return TokenIter<Find>();
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ do {
+ StringPiece found(finder_.Find(after_));
+ current_ = StringPiece(after_.data(), found.data() - after_.data());
+ if (found.data() == after_.data() + after_.size()) {
+ after_ = StringPiece(NULL, 0);
+ } else {
+ after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
+ }
+ } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
+ }
+
+ bool equal(const TokenIter<Find> &other) const {
+ return after_.data() == other.after_.data();
+ }
+
+ const StringPiece &dereference() const {
+ return current_;
+ }
+
+ StringPiece current_;
+ StringPiece after_;
+
+ Find finder_;
+};
+
} // namespace util
#endif // UTIL_TOKENIZE_PIECE__
diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc
new file mode 100644
index 00000000..e07ebcf5
--- /dev/null
+++ b/klm/util/tokenize_piece_test.cc
@@ -0,0 +1,94 @@
+#include "util/tokenize_piece.hh"
+#include "util/string_piece.hh"
+
+#define BOOST_TEST_MODULE TokenIteratorTest
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+
+namespace util {
+namespace {
+
+BOOST_AUTO_TEST_CASE(simple) {
+ PieceIterator<' '> it("single spaced words.");
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("single"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("spaced"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("words."), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_delimiter) {
+ const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0";
+ PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("first"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("second"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("third"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("fourth"), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_entries) {
+ const char str[] = "\0split\0\0 \0me\0 ";
+ PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1));
+ BOOST_REQUIRE(it);
+ const char first[] = "\0split\0\0";
+ BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ const char second[] = "\0me\0";
+ BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
+ const char str[] = "nodelimit at all";
+ TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(str), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+BOOST_AUTO_TEST_CASE(pipe_pipe_two) {
+ const char str[] = "|||";
+ TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(remove_empty) {
+ const char str[] = "|||";
+ TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+ BOOST_CHECK(!it);
+}*/
+
+BOOST_AUTO_TEST_CASE(remove_empty_keep) {
+ const char str[] = " |||";
+ TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(" "), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+} // namespace
+} // namespace util
diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc
index 811bbd4b..9fda9b32 100644
--- a/mira/kbest_mira.cc
+++ b/mira/kbest_mira.cc
@@ -10,6 +10,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "hg_sampler.h"
#include "sentence_metadata.h"
#include "scorer.h"
#include "verbose.h"
@@ -54,6 +55,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
//("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+ ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest")
+ ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples")
("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
("decoder_config,c",po::value<string>(),"Decoder configuration file");
po::options_description clo("Command line options");
@@ -91,11 +94,12 @@ struct GoodBadOracle {
};
struct TrainingObserver : public DecoderObserver {
- TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k) {}
+ TrainingObserver(const int k, const DocScorer& d, bool sf, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k), sample_forest(sf) {}
const DocScorer& ds;
vector<GoodBadOracle>& oracles;
shared_ptr<HypothesisInfo> cur_best;
const int kbest_size;
+ const bool sample_forest;
const HypothesisInfo& GetCurrentBestHypothesis() const {
return *cur_best;
@@ -116,24 +120,43 @@ struct TrainingObserver : public DecoderObserver {
shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good;
shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad;
cur_bad.reset(); // TODO get rid of??
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
- for (int i = 0; i < kbest_size; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest(forest.nodes_.size() - 1, i);
- if (!d) break;
- float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
- if (invert_score) sentscore *= -1.0;
- // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
- if (i == 0)
- cur_best = MakeHypothesisInfo(d->feature_values, sentscore);
- if (!cur_good || sentscore > cur_good->mt_metric)
- cur_good = MakeHypothesisInfo(d->feature_values, sentscore);
- if (!cur_bad || sentscore < cur_bad->mt_metric)
- cur_bad = MakeHypothesisInfo(d->feature_values, sentscore);
+
+ if (sample_forest) {
+ vector<WordID> cur_prediction;
+ ViterbiESentence(forest, &cur_prediction);
+ float sentscore = ds[sent_id]->ScoreCandidate(cur_prediction)->ComputeScore();
+ cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore);
+
+ vector<HypergraphSampler::Hypothesis> samples;
+ HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples);
+ for (unsigned i = 0; i < samples.size(); ++i) {
+ sentscore = ds[sent_id]->ScoreCandidate(samples[i].words)->ComputeScore();
+ if (invert_score) sentscore *= -1.0;
+ if (!cur_good || sentscore > cur_good->mt_metric)
+ cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore);
+ if (!cur_bad || sentscore < cur_bad->mt_metric)
+ cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore);
+ }
+ } else {
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
+ for (int i = 0; i < kbest_size; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
+ if (invert_score) sentscore *= -1.0;
+ // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+ if (i == 0)
+ cur_best = MakeHypothesisInfo(d->feature_values, sentscore);
+ if (!cur_good || sentscore > cur_good->mt_metric)
+ cur_good = MakeHypothesisInfo(d->feature_values, sentscore);
+ if (!cur_bad || sentscore < cur_bad->mt_metric)
+ cur_bad = MakeHypothesisInfo(d->feature_values, sentscore);
+ }
+ //cerr << "GOOD: " << cur_good->mt_metric << endl;
+ //cerr << " CUR: " << cur_best->mt_metric << endl;
+ //cerr << " BAD: " << cur_bad->mt_metric << endl;
}
- //cerr << "GOOD: " << cur_good->mt_metric << endl;
- //cerr << " CUR: " << cur_best->mt_metric << endl;
- //cerr << " BAD: " << cur_bad->mt_metric << endl;
}
};
@@ -164,6 +187,12 @@ int main(int argc, char** argv) {
rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
else
rng.reset(new MT19937);
+ const bool sample_forest = conf.count("sample_forest") > 0;
+ const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0;
+ if (sample_forest_unit_weight_vector && !sample_forest) {
+ cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl;
+ return 1;
+ }
vector<string> corpus;
ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
const string metric_name = conf["mt_metric"].as<string>();
@@ -195,7 +224,7 @@ int main(int argc, char** argv) {
assert(corpus.size() > 0);
vector<GoodBadOracle> oracles(corpus.size());
- TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles);
+ TrainingObserver observer(conf["k_best_size"].as<int>(), ds, sample_forest, &oracles);
int cur_sent = 0;
int lcount = 0;
int normalizer = 0;
@@ -234,7 +263,19 @@ int main(int argc, char** argv) {
cerr << "PASS " << (lcount / corpus.size() + 1) << endl;
}
decoder.SetId(order[cur_sent]);
+ double sc = 1.0;
+ if (sample_forest_unit_weight_vector) {
+ sc = lambdas.l2norm();
+ if (sc > 0) {
+ for (unsigned i = 0; i < dense_weights.size(); ++i)
+ dense_weights[i] /= sc;
+ }
+ }
decoder.Decode(corpus[order[cur_sent]], &observer); // update oracles
+ if (sc && sc != 1.0) {
+ for (unsigned i = 0; i < dense_weights.size(); ++i)
+ dense_weights[i] *= sc;
+ }
const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis();
const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good;
const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad;
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index dbfa329a..5db053de 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -10,6 +10,7 @@ use Getopt::Long;
use IPC::Open2;
use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
my $VEST_DIR="$SCRIPT_DIR/../vest";
require "$VEST_DIR/libcall.pl";
@@ -39,10 +40,11 @@ die "Can't find $libcall" unless -e $libcall;
my $decoder = $cdec;
my $lines_per_mapper = 30;
my $iteration = 1;
-my $run_local = 0;
my $best_weights;
-my $max_iterations = 30;
-my $decode_nodes = 15; # number of decode nodes
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
my $pmem = "4g";
my $disable_clean = 0;
my %seen_weights;
@@ -54,30 +56,32 @@ my $metric = "ibm_bleu";
my $dir;
my $iniFile;
my $weights;
-my $use_make; # use make to parallelize
-my $usefork;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
my $initial_weights;
my $pass_suffix = '';
my $cpbin=1;
# regularization strength
my $tune_regularizer = 0;
-my $reg = 1e-2;
+my $reg = 500;
+my $reg_previous = 5000;
# Process command-line options
Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
- "decode-nodes=i" => \$decode_nodes,
+ "jobs=i" => \$jobs,
"dont-clean" => \$disable_clean,
"pass-suffix=s" => \$pass_suffix,
- "use-fork" => \$usefork,
+ "qsub" => \$useqsub,
"dry-run" => \$dryrun,
"epsilon=s" => \$epsilon,
+ "interpolate-with-weights=f" => \$psi,
"help" => \$help,
"weights=s" => \$initial_weights,
"tune-regularizer" => \$tune_regularizer,
"reg=f" => \$reg,
- "local" => \$run_local,
+ "reg-previous=f" => \$reg_previous,
"use-make=i" => \$use_make,
"max-iterations=i" => \$max_iterations,
"pmem=s" => \$pmem,
@@ -91,7 +95,18 @@ if (GetOptions(
exit;
}
-if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
if ($metric =~ /^(combi|ter)$/i) {
$lines_per_mapper = 5;
@@ -248,13 +263,10 @@ while (1){
`rm -f $dir/hgs/*.gz`;
my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
my $pcmd;
- if ($run_local) {
- $pcmd = "cat $srcFile |";
- } elsif ($use_make) {
- # TODO: Throw error when decode_nodes is specified along with use_make
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
} else {
- $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --";
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
}
my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
print STDERR "COMMAND:\n$cmd\n";
@@ -327,10 +339,7 @@ while (1){
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
- if ($run_local) {
- print STDERR "COMMAND:\n$script\n";
- check_bash_call($script);
- } elsif ($use_make) {
+ if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
print F "#!/bin/bash\n";
@@ -376,12 +385,10 @@ while (1){
} else {
@dev_outs = @mapoutputs;
}
- if ($run_local) {
- print STDERR "\nCompleted extraction of training exemplars.\n";
- } elsif ($use_make) {
+ if ($use_make) {
print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
close $mkfile;
- my $mcmd = "make -j $use_make -f $mkfilename";
+ my $mcmd = "make -j $jobs -f $mkfilename";
print STDERR "\nExecuting: $mcmd\n";
check_call($mcmd);
} else {
@@ -411,7 +418,7 @@ while (1){
}
print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
print STDERR unchecked_output("date");
- $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+ $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
if ($tune_regularizer) {
$cmd .= " -T -t $dev_test_file";
}
@@ -492,7 +499,7 @@ sub write_config {
print $fh "REFS (DEV): $refFiles\n";
print $fh "EVAL METRIC: $metric\n";
print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "JOBS: $jobs\n";
print $fh "HEAD NODE: $host\n";
print $fh "PMEM (DECODING): $pmem\n";
print $fh "CLEANUP: $cleanup\n";
@@ -541,16 +548,12 @@ sub enseg {
sub print_help {
my $executable = check_output("basename $0"); chomp $executable;
- print << "Help";
+ print << "Help";
Usage: $executable [options] <ini file>
$executable [options] <ini file>
- Runs a complete MERT optimization and test set decoding, using
- the decoder configuration in ini file. Note that many of the
- options have default values that are inferred automatically
- based on certain conventions. For details, refer to descriptions
- of the options --decoder, --weights, and --workdir.
+ Runs a complete PRO optimization using the ini file specified.
Required:
@@ -567,18 +570,12 @@ Required:
General options:
- --local
- Run the decoder and optimizer locally with a single thread.
-
- --decode-nodes <I>
- Number of decoder processes to run in parallel. [default=15]
-
--help
Print this message and exit.
--max-iterations <M>
Maximum number of iterations to run. If not specified, defaults
- to 10.
+ to $default_max_iter.
--metric <method>
Metric to optimize.
@@ -588,13 +585,6 @@ General options:
If the decoder is doing multi-pass decoding, the pass suffix "2",
"3", etc., is used to control what iteration of weights is set.
- --pmem <N>
- Amount of physical memory requested for parallel decoding jobs.
-
- --use-make <I>
- Use make -j <I> to run the optimizer commands (useful on large
- shared-memory machines where qsub is unavailable).
-
--workdir <dir>
Directory for intermediate and output files. If not specified, the
name is derived from the ini filename. Assuming that the ini
@@ -605,11 +595,35 @@ General options:
Regularization options:
- --tune-regularizer
- Hold out one third of the tuning data and used this to tune the
- regularization parameter.
-
--reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+ --reg-previous <F>
+ l2 penalty for moving away from the weights from the previous
+ iteration. [default=5000]. The greater this value, the closer
+ to the previous iteration's weights the next iteration's weights
+ will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Deprecated options:
+
+ --interpolate-with-weights <F>
+ [deprecated] At each iteration the resulting weights are
+ interpolated with the weights from the previous iteration, with
+ this factor. [default=1.0, i.e., no effect]
Help
}
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index aff410a0..6362ce47 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -23,13 +23,14 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
- ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
- ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
- ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
- ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght")
- ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght")
+ ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength")
+ ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
+ ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
+ ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght")
+ ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght")
("testset,t",po::value<string>(), "Optional held-out test set")
("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
+ ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
("help,h", "Help");
po::options_description dcmdline_options;
dcmdline_options.add(opts);
@@ -95,6 +96,27 @@ void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_
}
}
+double ApplyRegularizationTerms(const double C,
+ const double T,
+ const vector<weight_t>& weights,
+ const vector<weight_t>& prev_weights,
+ vector<weight_t>* g) {
+ assert(weights.size() == g->size());
+ double reg = 0;
+ for (size_t i = 0; i < weights.size(); ++i) {
+ const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
+ const double& w_i = weights[i];
+ double& g_i = (*g)[i];
+ reg += C * w_i * w_i;
+ g_i += 2 * C * w_i;
+
+ const double diff_i = w_i - prev_w_i;
+ reg += T * diff_i * diff_i;
+ g_i += 2 * T * diff_i;
+ }
+ return reg;
+}
+
double TrainingInference(const vector<weight_t>& x,
const vector<pair<bool, SparseVector<weight_t> > >& corpus,
vector<weight_t>* g = NULL) {
@@ -134,8 +156,10 @@ double TrainingInference(const vector<weight_t>& x,
// return held-out log likelihood
double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
const vector<pair<bool, SparseVector<weight_t> > >& testing,
- const double sigsq,
+ const double C,
+ const double T,
const unsigned memory_buffers,
+ const vector<weight_t>& prev_x,
vector<weight_t>* px) {
vector<weight_t>& x = *px;
vector<weight_t> vg(FD::NumFeats(), 0.0);
@@ -157,26 +181,12 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
}
// handle regularizer
-#if 1
- double norm = 0;
- for (int i = 1; i < x.size(); ++i) {
- const double mean_i = 0.0;
- const double param = (x[i] - mean_i);
- norm += param * param;
- vg[i] += param / sigsq;
- }
- const double reg = norm / (2.0 * sigsq);
-#else
- double reg = 0;
-#endif
+ double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
cll += reg;
- cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
+ cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
try {
- vector<weight_t> old_x = x;
- do {
- opt.Optimize(cll, vg, &x);
- converged = opt.HasConverged();
- } while (!converged && x == old_x);
+ opt.Optimize(cll, vg, &x);
+ converged = opt.HasConverged();
} catch (...) {
cerr << "Exception caught, assuming convergence is close enough...\n";
converged = true;
@@ -201,13 +211,14 @@ int main(int argc, char** argv) {
}
const double min_reg = conf["min_reg"].as<double>();
const double max_reg = conf["max_reg"].as<double>();
- double sigsq = conf["sigma_squared"].as<double>(); // will be overridden if parameter is tuned
- assert(sigsq > 0.0);
+ double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
+ const double T = conf["regularize_to_weights"].as<double>();
+ assert(C > 0.0);
assert(min_reg > 0.0);
assert(max_reg > 0.0);
assert(max_reg > min_reg);
- const double psi = conf["interpolation"].as<double>();
- if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+ const double psi = conf["interpolate_with_weights"].as<double>();
+ if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
ReadCorpus(&cin, &training);
if (conf.count("testset")) {
ReadFile rf(conf["testset"].as<string>());
@@ -231,14 +242,15 @@ int main(int argc, char** argv) {
vector<pair<double,double> > sp;
vector<double> smoothed;
if (tune_regularizer) {
- sigsq = min_reg;
+ C = min_reg;
const double steps = 18;
double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
cerr << "SWEEP FACTOR: " << sweep_factor << endl;
- while(sigsq < max_reg) {
- tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
- sp.push_back(make_pair(sigsq, tppl));
- sigsq *= sweep_factor;
+ while(C < max_reg) {
+ cerr << "C=" << C << "\tT=" <<T << endl;
+ tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+ sp.push_back(make_pair(C, tppl));
+ C *= sweep_factor;
}
smoothed.resize(sp.size(), 0);
smoothed[0] = sp[0].second;
@@ -257,16 +269,16 @@ int main(int argc, char** argv) {
best_i = i;
}
}
- sigsq = sp[best_i].first;
+ C = sp[best_i].first;
} // tune regularizer
- tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+ tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
if (conf.count("weights")) {
for (int i = 1; i < x.size(); ++i) {
x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
}
}
cout.precision(15);
- cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+ cout << "# C=" << C << "\theld out perplexity=";
if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
if (sp.size()) {
cout << "# Parameter sweep:\n";
diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc
index e89a92d5..1e5af9a1 100644
--- a/training/augment_grammar.cc
+++ b/training/augment_grammar.cc
@@ -18,7 +18,7 @@ using namespace std;
vector<lm::WordIndex> word_map;
lm::ngram::ProbingModel* ngram;
-struct VMapper : public lm::ngram::EnumerateVocab {
+struct VMapper : public lm::EnumerateVocab {
VMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); }
void Add(lm::WordIndex index, const StringPiece &str) {
const WordID cdec_id = TD::Convert(str.as_string());
diff --git a/training/test_ngram.cc b/training/test_ngram.cc
index c481b564..4597cc01 100644
--- a/training/test_ngram.cc
+++ b/training/test_ngram.cc
@@ -12,7 +12,7 @@ namespace po = boost::program_options;
using namespace std;
lm::ngram::ProbingModel* ngram;
-struct GetVocab : public lm::ngram::EnumerateVocab {
+struct GetVocab : public lm::EnumerateVocab {
GetVocab(vector<lm::WordIndex>* out) : out_(out) { }
void Add(lm::WordIndex index, const StringPiece &str) {
out_->push_back(index);
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index b7a862c4..11e791c1 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -16,6 +16,7 @@ require "libcall.pl";
# Default settings
my $srcFile;
my $refFiles;
+my $default_jobs = env_default_jobs();
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
my $FAST_SCORE="$bin_dir/../mteval/fast_score";
@@ -39,11 +40,10 @@ my $decoder = $cdec;
my $lines_per_mapper = 400;
my $rand_directions = 15;
my $iteration = 1;
-my $run_local = 0;
my $best_weights;
my $max_iterations = 15;
my $optimization_iters = 6;
-my $decode_nodes = 15; # number of decode nodes
+my $jobs = $default_jobs; # number of decode nodes
my $pmem = "9g";
my $disable_clean = 0;
my %seen_weights;
@@ -64,28 +64,25 @@ my $maxsim=0;
my $oraclen=0;
my $oracleb=20;
my $bleu_weight=1;
-my $use_make; # use make to parallelize line search
+my $use_make = 1; # use make to parallelize line search
my $dirargs='';
my $density_prune;
-my $usefork;
+my $useqsub;
my $pass_suffix = '';
my $cpbin=1;
# Process command-line options
Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
"decoder=s" => \$decoderOpt,
- "decode-nodes=i" => \$decode_nodes,
+ "jobs=i" => \$jobs,
"density-prune=f" => \$density_prune,
"dont-clean" => \$disable_clean,
"pass-suffix=s" => \$pass_suffix,
- "use-fork" => \$usefork,
"dry-run" => \$dryrun,
"epsilon=s" => \$epsilon,
"help" => \$help,
"interval" => \$interval,
- "iteration=i" => \$iteration,
- "local" => \$run_local,
- "use-make=i" => \$use_make,
+ "qsub" => \$useqsub,
"max-iterations=i" => \$max_iterations,
"normalize=s" => \$normalize,
"pmem=s" => \$pmem,
@@ -114,7 +111,16 @@ if (defined $density_prune) {
die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
}
-if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initialWeights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
if ($metric =~ /^(combi|ter)$/i) {
$lines_per_mapper = 40;
@@ -276,17 +282,11 @@ while (1){
my $im1 = $iteration - 1;
my $weightsFile="$dir/weights.$im1";
my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
- if ($density_prune) {
- $decoder_cmd .= " --density_prune $density_prune";
- }
my $pcmd;
- if ($run_local) {
- $pcmd = "cat $srcFile |";
- } elsif ($use_make) {
- # TODO: Throw error when decode_nodes is specified along with use_make
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
} else {
- $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --";
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
}
my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
print STDERR "COMMAND:\n$cmd\n";
@@ -365,10 +365,7 @@ while (1){
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
- if ($run_local) {
- print STDERR "COMMAND:\n$script\n";
- check_bash_call($script);
- } elsif ($use_make) {
+ if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
print F "#!/bin/bash\n";
@@ -398,12 +395,10 @@ while (1){
else {$joblist = $joblist . "\|" . $jobid; }
}
}
- if ($run_local) {
- print STDERR "\nProcessing line search complete.\n";
- } elsif ($use_make) {
+ if ($use_make) {
print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
close $mkfile;
- my $mcmd = "make -j $use_make -f $mkfilename";
+ my $mcmd = "make -j $jobs -f $mkfilename";
print STDERR "\nExecuting: $mcmd\n";
check_call($mcmd);
} else {
@@ -558,7 +553,7 @@ sub write_config {
print $fh "EVAL METRIC: $metric\n";
print $fh "START ITERATION: $iteration\n";
print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "PARALLEL JOBS: $jobs\n";
print $fh "HEAD NODE: $host\n";
print $fh "PMEM (DECODING): $pmem\n";
print $fh "CLEANUP: $cleanup\n";
@@ -612,37 +607,15 @@ sub print_help {
Usage: $executable [options] <ini file>
$executable [options] <ini file>
- Runs a complete MERT optimization and test set decoding, using
- the decoder configuration in ini file. Note that many of the
- options have default values that are inferred automatically
- based on certain conventions. For details, refer to descriptions
- of the options --decoder, --weights, and --workdir.
+ Runs a complete MERT optimization using the decoder configuration
+ in <ini file>. Required options are --weights, --source-file, and
+ --ref-files.
Options:
- --local
- Run the decoder and optimizer locally with a single thread.
-
- --use-make <I>
- Use make -j <I> to run the optimizer commands (useful on large
- shared-memory machines where qsub is unavailable).
-
- --decode-nodes <I>
- Number of decoder processes to run in parallel. [default=15]
-
- --decoder <decoder path>
- Decoder binary to use.
-
- --density-prune <N>
- Limit the density of the hypergraph on each iteration to N times
- the number of edges on the Viterbi path.
-
--help
Print this message and exit.
- --iteration <I>
- Starting iteration number. If not specified, defaults to 1.
-
--max-iterations <M>
Maximum number of iterations to run. If not specified, defaults
to 10.
@@ -651,9 +624,6 @@ Options:
If the decoder is doing multi-pass decoding, the pass suffix "2",
"3", etc., is used to control what iteration of weights is set.
- --pmem <N>
- Amount of physical memory requested for parallel decoding jobs.
-
--ref-files <files>
Dev set ref files. This option takes only a single string argument.
To use multiple files (including file globbing), this argument should
@@ -678,6 +648,7 @@ Options:
A file specifying initial feature weights. The format is
FeatureName_1 value1
FeatureName_2 value2
+ **All and only the weights listed in <file> will be optimized!**
--workdir <dir>
Directory for intermediate and output files. If not specified, the
@@ -687,6 +658,19 @@ Options:
the filename. E.g. an ini file named decoder.foo.ini would have
a default working directory name foo.
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
Help
}
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index 869f430b..7d0365cc 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -396,7 +396,7 @@ usage: $name [options]
options:
- --fork
+ --use-fork
Instead of using qsub, use fork.
-e, --error-dir <dir>