From edb0cc0cbae1e75e4aeedb6360eab325effe6573 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 9 Sep 2011 15:33:35 +0200
Subject: partial merge, ruleid feature
---
decoder/Makefile.am | 2 +
decoder/cdec_ff.cc | 9 +-
decoder/ff_klm.cc | 38 ++++-
decoder/ff_klm.h | 7 +-
decoder/ff_ngrams.cc | 341 +++++++++++++++++++++++++++++++++++++++++
decoder/ff_ngrams.h | 29 ++++
decoder/ff_rules.cc | 107 +++++++++++++
decoder/ff_rules.h | 40 +++++
decoder/ff_spans.cc | 74 +++------
decoder/ff_spans.h | 15 --
dtrain/dtrain.cc | 50 +++---
dtrain/run.sh | 5 +-
dtrain/sample.h | 18 +--
dtrain/test/EXAMPLE/dtrain.ini | 2 +-
klm/lm/Makefile.am | 1 +
klm/lm/bhiksha.cc | 93 +++++++++++
klm/lm/bhiksha.hh | 108 +++++++++++++
klm/lm/binary_format.cc | 13 +-
klm/lm/binary_format.hh | 9 +-
klm/lm/build_binary.cc | 54 ++++---
klm/lm/config.cc | 1 +
klm/lm/config.hh | 5 +-
klm/lm/model.cc | 67 ++++----
klm/lm/model.hh | 12 +-
klm/lm/model_test.cc | 73 +++++++--
klm/lm/ngram_query.cc | 9 ++
klm/lm/quantize.cc | 1 +
klm/lm/quantize.hh | 4 +-
klm/lm/read_arpa.cc | 6 +-
klm/lm/search_hashed.cc | 2 +-
klm/lm/search_hashed.hh | 3 +-
klm/lm/search_trie.cc | 45 +++---
klm/lm/search_trie.hh | 20 +--
klm/lm/test_nounk.arpa | 120 +++++++++++++++
klm/lm/trie.cc | 57 +++----
klm/lm/trie.hh | 24 +--
klm/lm/vocab.cc | 6 +-
klm/lm/vocab.hh | 4 +
klm/util/bit_packing.hh | 13 +-
klm/util/exception.cc | 28 ++++
klm/util/exception.hh | 56 ++++++-
klm/util/file_piece.cc | 42 +++--
klm/util/file_piece.hh | 34 ++--
klm/util/murmur_hash.cc | 258 +++++++++++++++----------------
klm/util/probing_hash_table.hh | 2 +-
klm/util/sorted_uniform.hh | 23 ++-
46 files changed, 1490 insertions(+), 440 deletions(-)
create mode 100644 decoder/ff_ngrams.cc
create mode 100644 decoder/ff_ngrams.h
create mode 100644 decoder/ff_rules.cc
create mode 100644 decoder/ff_rules.h
create mode 100644 klm/lm/bhiksha.cc
create mode 100644 klm/lm/bhiksha.hh
create mode 100644 klm/lm/test_nounk.arpa
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 244da2de..e5f7505f 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -61,10 +61,12 @@ libcdec_a_SOURCES = \
phrasetable_fst.cc \
trule.cc \
ff.cc \
+ ff_rules.cc \
ff_wordset.cc \
ff_charset.cc \
ff_lm.cc \
ff_klm.cc \
+ ff_ngrams.cc \
ff_spans.cc \
ff_ruleshape.cc \
ff_wordalign.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 31f88a4f..588842f1 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -4,10 +4,12 @@
#include "ff_spans.h"
#include "ff_lm.h"
#include "ff_klm.h"
+#include "ff_ngrams.h"
#include "ff_csplit.h"
#include "ff_wordalign.h"
#include "ff_tagger.h"
#include "ff_factory.h"
+#include "ff_rules.h"
#include "ff_ruleshape.h"
#include "ff_bleu.h"
#include "ff_lm_fsa.h"
@@ -51,12 +53,11 @@ void register_feature_functions() {
ff_registry.Register("RandLM", new FFFactory);
#endif
ff_registry.Register("SpanFeatures", new FFFactory());
+ ff_registry.Register("NgramFeatures", new FFFactory());
+ ff_registry.Register("RuleIdentityFeatures", new FFFactory());
ff_registry.Register("RuleNgramFeatures", new FFFactory());
ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory());
- ff_registry.Register("KLanguageModel", new FFFactory >());
- ff_registry.Register("KLanguageModel_Trie", new FFFactory >());
- ff_registry.Register("KLanguageModel_QuantTrie", new FFFactory >());
- ff_registry.Register("KLanguageModel_Probing", new FFFactory >());
+ ff_registry.Register("KLanguageModel", new KLanguageModelFactory());
ff_registry.Register("NonLatinCount", new FFFactory);
ff_registry.Register("RuleShape", new FFFactory);
ff_registry.Register("RelativeSentencePosition", new FFFactory);
diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc
index 9b7fe2d3..24dcb9c3 100644
--- a/decoder/ff_klm.cc
+++ b/decoder/ff_klm.cc
@@ -9,6 +9,7 @@
#include "stringlib.h"
#include "hg.h"
#include "tdict.h"
+#include "lm/model.hh"
#include "lm/enumerate_vocab.hh"
using namespace std;
@@ -434,8 +435,37 @@ void KLanguageModel<Model>::FinalTraversalFeatures(const void* ant_state,
features->set_value(oov_fid_, oovs);
}
-// instantiate templates
-template class KLanguageModel<lm::ngram::ProbingModel>;
-template class KLanguageModel<lm::ngram::TrieModel>;
-template class KLanguageModel<lm::ngram::QuantTrieModel>;
+template <class Model> boost::shared_ptr<FeatureFunction> CreateModel(const std::string &param) {
+ KLanguageModel<Model> *ret = new KLanguageModel<Model>(param);
+ ret->Init();
+ return boost::shared_ptr<FeatureFunction>(ret);
+}
+boost::shared_ptr<FeatureFunction> KLanguageModelFactory::Create(std::string param) const {
+ using namespace lm::ngram;
+ std::string filename, ignored_map;
+ bool ignored_markers;
+ std::string ignored_featname;
+ ParseLMArgs(param, &filename, &ignored_map, &ignored_markers, &ignored_featname);
+ ModelType m;
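+ // RecognizeBinary fails for plain (non-binary) ARPA files; fall back to the probing hash model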
+ if (!RecognizeBinary(filename.c_str(), m)) m = HASH_PROBING;
+
+ switch (m) {
+ case HASH_PROBING:
+ return CreateModel<ProbingModel>(param);
+ case TRIE_SORTED:
+ return CreateModel<TrieModel>(param);
+ case ARRAY_TRIE_SORTED:
+ return CreateModel<ArrayTrieModel>(param);
+ case QUANT_TRIE_SORTED:
+ return CreateModel<QuantTrieModel>(param);
+ case QUANT_ARRAY_TRIE_SORTED:
+ return CreateModel<QuantArrayTrieModel>(param);
+ default:
+ UTIL_THROW(util::Exception, "Unrecognized kenlm binary file type " << (unsigned)m);
+ }
+}
+
+std::string KLanguageModelFactory::usage(bool params,bool verbose) const {
+ return KLanguageModel<lm::ngram::Model>::usage(params, verbose);
+}
diff --git a/decoder/ff_klm.h b/decoder/ff_klm.h
index 5eafe8be..6efe50f6 100644
--- a/decoder/ff_klm.h
+++ b/decoder/ff_klm.h
@@ -4,8 +4,8 @@
#include <vector>
#include <string>
+#include "ff_factory.h"
#include "ff.h"
-#include "lm/model.hh"
template <class Model> struct KLanguageModelImpl;
@@ -34,4 +34,9 @@ class KLanguageModel : public FeatureFunction {
KLanguageModelImpl* pimpl_;
};
+struct KLanguageModelFactory : public FactoryBase<FeatureFunction> {
+ FP Create(std::string param) const;
+ std::string usage(bool params,bool verbose) const;
+};
+
#endif
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
new file mode 100644
index 00000000..04dd1906
--- /dev/null
+++ b/decoder/ff_ngrams.cc
@@ -0,0 +1,341 @@
+#include "ff_ngrams.h"
+
+#include <cstring>
+#include <iostream>
+
+#include <boost/scoped_ptr.hpp>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "hg.h"
+#include "tdict.h"
+
+using namespace std;
+
+static const unsigned char HAS_FULL_CONTEXT = 1;
+static const unsigned char HAS_EOS_ON_RIGHT = 2;
+static const unsigned char MASK = 7;
+
+namespace {
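+// Fixed-capacity n-gram context state: the most recent (order - 1) words, oldest first.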
+template <unsigned MAX_ORDER>
+struct State {
+ explicit State() {
+ memset(state, 0, sizeof(state));
+ }
+ explicit State(int order) {
+ memset(state, 0, (order - 1) * sizeof(WordID));
+ }
+ State(char order, const WordID* mem) {
+ memcpy(state, mem, (order - 1) * sizeof(WordID));
+ }
+ State(const State& other) {
+ memcpy(state, other.state, sizeof(state));
+ }
+ const State& operator=(const State& other) {
+ memcpy(state, other.state, sizeof(state));
+ }
+ explicit State(const State& other, unsigned order, WordID extend) {
+ char om1 = order - 1;
+ assert(om1 > 0);
+ for (char i = 1; i < om1; ++i) state[i - 1]= other.state[i];
+ state[om1 - 1] = extend;
+ }
+ const WordID& operator[](size_t i) const { return state[i]; }
+ WordID& operator[](size_t i) { return state[i]; }
+ WordID state[MAX_ORDER];
+};
+}
+
+namespace {
+ string Escape(const string& x) {
+ string y = x;
+ for (int i = 0; i < y.size(); ++i) {
+ if (y[i] == '=') y[i]='_';
+ if (y[i] == ';') y[i]='_';
+ }
+ return y;
+ }
+}
+
+class NgramDetectorImpl {
+
+ // returns the number of unscored words at the left edge of a span
+ inline int UnscoredSize(const void* state) const {
+ return *(static_cast<const char*>(state) + unscored_size_offset_);
+ }
+
+ inline void SetUnscoredSize(int size, void* state) const {
+ *(static_cast<char*>(state) + unscored_size_offset_) = size;
+ }
+
+ inline State<5> RemnantLMState(const void* cstate) const {
+ return State<5>(order_, static_cast<const WordID*>(cstate));
+ }
+
+ inline const State<5> BeginSentenceState() const {
+ State<5> state(order_);
+ state.state[0] = kSOS_;
+ return state;
+ }
+
+ inline void SetRemnantLMState(const State<5>& lmstate, void* state) const {
+ // if we were clever, we could use the memory pointed to by state to do all
+ // the work, avoiding this copy
+ memcpy(state, lmstate.state, (order_-1) * sizeof(WordID));
+ }
+
+ WordID IthUnscoredWord(int i, const void* state) const {
+ const WordID* const mem = reinterpret_cast<const WordID* const>(static_cast<const char*>(state) + unscored_words_offset_);
+ return mem[i];
+ }
+
+ void SetIthUnscoredWord(int i, const WordID index, void *state) const {
+ WordID* mem = reinterpret_cast<WordID*>(static_cast<char*>(state) + unscored_words_offset_);
+ mem[i] = index;
+ }
+
+ inline bool GetFlag(const void *state, unsigned char flag) const {
+ return (*(static_cast<const char*>(state) + is_complete_offset_) & flag);
+ }
+
+ inline void SetFlag(bool on, unsigned char flag, void *state) const {
+ if (on) {
+ *(static_cast<char*>(state) + is_complete_offset_) |= flag;
+ } else {
+ *(static_cast(state) + is_complete_offset_) &= (MASK ^ flag);
+ }
+ }
+
+ inline bool HasFullContext(const void *state) const {
+ return GetFlag(state, HAS_FULL_CONTEXT);
+ }
+
+ inline void SetHasFullContext(bool flag, void *state) const {
+ SetFlag(flag, HAS_FULL_CONTEXT, state);
+ }
+
+ void FireFeatures(const State<5>& state, WordID cur, SparseVector<double>* feats) {
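+ // fires one indicator feature per n-gram (n = 1 .. order_) ending in 'cur',
+ // walking backwards through the context; feature ids are cached in the FidTree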
+ FidTree* ft = &fidroot_;
+ int n = 0;
+ WordID buf[10];
+ int ci = order_ - 1;
+ WordID curword = cur;
+ while(curword) {
+ buf[n] = curword;
+ int& fid = ft->fids[curword];
+ ++n;
+ if (!fid) {
+ const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.)
+ ostringstream os;
+ os << code[n] << ':';
+ for (int i = n-1; i >= 0; --i) {
+ os << (i != n-1 ? "_" : "");
+ const string& tok = TD::Convert(buf[i]);
+ if (tok.find('=') == string::npos)
+ os << tok;
+ else
+ os << Escape(tok);
+ }
+ fid = FD::Convert(os.str());
+ }
+ feats->set_value(fid, 1);
+ ft = &ft->levels[curword];
+ --ci;
+ if (ci < 0) break;
+ curword = state[ci];
+ }
+ }
+
+ public:
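+ // Scores the words introduced by a rule: terminals and each antecedent's
+ // still-unscored left words fire n-gram features. Features seen with full
+ // context accumulate in *feats; provisional ones go to *est_feats.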
+ void LookupWords(const TRule& rule, const vector& ant_states, SparseVector* feats, SparseVector* est_feats, void* remnant) {
+ double sum = 0.0;
+ double est_sum = 0.0;
+ int num_scored = 0;
+ int num_estimated = 0;
+ bool saw_eos = false;
+ bool has_some_history = false;
+ State<5> state;
+ const vector& e = rule.e();
+ bool context_complete = false;
+ for (int j = 0; j < e.size(); ++j) {
+ if (e[j] < 1) { // handle non-terminal substitution
+ const void* astate = (ant_states[-e[j]]);
+ int unscored_ant_len = UnscoredSize(astate);
+ for (int k = 0; k < unscored_ant_len; ++k) {
+ const WordID cur_word = IthUnscoredWord(k, astate);
+ const bool is_oov = (cur_word == 0);
+ SparseVector<double> p;
+ if (cur_word == kSOS_) {
+ state = BeginSentenceState();
+ if (has_some_history) { // this is immediately fully scored, and bad
+ p.set_value(FD::Convert("Malformed"), 1.0);
+ context_complete = true;
+ } else { // this might be a real <s>
+ num_scored = max(0, order_ - 2);
+ }
+ } else {
+ FireFeatures(state, cur_word, &p);
+ const State<5> scopy = State<5>(state, order_, cur_word);
+ state = scopy;
+ if (saw_eos) { p.set_value(FD::Convert("Malformed"), 1.0); }
+ saw_eos = (cur_word == kEOS_);
+ }
+ has_some_history = true;
+ ++num_scored;
+ if (!context_complete) {
+ if (num_scored >= order_) context_complete = true;
+ }
+ if (context_complete) {
+ (*feats) += p;
+ } else {
+ if (remnant)
+ SetIthUnscoredWord(num_estimated, cur_word, remnant);
+ ++num_estimated;
+ (*est_feats) += p;
+ }
+ }
+ saw_eos = GetFlag(astate, HAS_EOS_ON_RIGHT);
+ if (HasFullContext(astate)) { // this is equivalent to the "star" in Chiang 2007
+ state = RemnantLMState(astate);
+ context_complete = true;
+ }
+ } else { // handle terminal
+ const WordID cur_word = e[j];
+ SparseVector<double> p;
+ if (cur_word == kSOS_) {
+ state = BeginSentenceState();
+ if (has_some_history) { // this is immediately fully scored, and bad
+ p.set_value(FD::Convert("Malformed"), -100);
+ context_complete = true;
+ } else { // this might be a real <s>
+ num_scored = max(0, order_ - 2);
+ }
+ } else {
+ FireFeatures(state, cur_word, &p);
+ const State<5> scopy = State<5>(state, order_, cur_word);
+ state = scopy;
+ if (saw_eos) { p.set_value(FD::Convert("Malformed"), 1.0); }
+ saw_eos = (cur_word == kEOS_);
+ }
+ has_some_history = true;
+ ++num_scored;
+ if (!context_complete) {
+ if (num_scored >= order_) context_complete = true;
+ }
+ if (context_complete) {
+ (*feats) += p;
+ } else {
+ if (remnant)
+ SetIthUnscoredWord(num_estimated, cur_word, remnant);
+ ++num_estimated;
+ (*est_feats) += p;
+ }
+ }
+ }
+ if (remnant) {
+ SetFlag(saw_eos, HAS_EOS_ON_RIGHT, remnant);
+ SetRemnantLMState(state, remnant);
+ SetUnscoredSize(num_estimated, remnant);
+ SetHasFullContext(context_complete || (num_scored >= order_), remnant);
+ }
+ }
+
+ // this assumes no target words on final unary -> goal rule. is that ok?
+ // for (n-1 left words) and (n-1 right words)
+ void FinalTraversal(const void* state, SparseVector<double>* feats) {
+ if (add_sos_eos_) { // rules do not produce <s> </s>, so do it here
+ SetRemnantLMState(BeginSentenceState(), dummy_state_);
+ SetHasFullContext(1, dummy_state_);
+ SetUnscoredSize(0, dummy_state_);
+ dummy_ants_[1] = state;
+ LookupWords(*dummy_rule_, dummy_ants_, feats, NULL, NULL);
+ } else { // rules DO produce <s> ... </s>
+#if 0
+ double p = 0;
+ if (!GetFlag(state, HAS_EOS_ON_RIGHT)) { p -= 100; }
+ if (UnscoredSize(state) > 0) { // are there unscored words
+ if (kSOS_ != IthUnscoredWord(0, state)) {
+ p -= 100 * UnscoredSize(state);
+ }
+ }
+ return p;
+#endif
+ }
+ }
+
+ public:
+ explicit NgramDetectorImpl(bool explicit_markers) :
+ kCDEC_UNK(TD::Convert("<unk>")) ,
+ add_sos_eos_(!explicit_markers) {
+ order_ = 3;
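+ // state layout: (order_-1) WordIDs of remnant LM context, a 1-byte
+ // unscored-word count, a 1-byte flag field, then up to (order_-1) unscored WordIDs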
+ state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID);
+ unscored_size_offset_ = (order_ - 1) * sizeof(WordID);
+ is_complete_offset_ = unscored_size_offset_ + 1;
+ unscored_words_offset_ = is_complete_offset_ + 1;
+
+ // special handling of beginning / ending sentence markers
+ dummy_state_ = new char[state_size_];
+ memset(dummy_state_, 0, state_size_);
+ dummy_ants_.push_back(dummy_state_);
+ dummy_ants_.push_back(NULL);
+ dummy_rule_.reset(new TRule("[DUMMY] ||| [BOS] [DUMMY] ||| [1] [2] ||| X=0"));
+ kSOS_ = TD::Convert("<s>");
+ kEOS_ = TD::Convert("</s>");
+ }
+
+ ~NgramDetectorImpl() {
+ delete[] dummy_state_;
+ }
+
+ int ReserveStateSize() const { return state_size_; }
+
+ private:
+ const WordID kCDEC_UNK;
+ WordID kSOS_; // <s> - requires special handling.
+ WordID kEOS_; // </s>
+ const bool add_sos_eos_; // flag indicating whether the hypergraph produces <s> and </s>
+ // if this is true, FinalTransitionFeatures will "add" <s> and </s>
+ // if false, FinalTransitionFeatures will score anything with the
+ // markers in the right place (i.e., the beginning and end of
+ // the sentence) with 0, and anything else with -100
+
+ int order_;
+ int state_size_;
+ int unscored_size_offset_;
+ int is_complete_offset_;
+ int unscored_words_offset_;
+ char* dummy_state_;
+ vector<const void*> dummy_ants_;
+ TRulePtr dummy_rule_;
+ struct FidTree {
+ map<WordID, int> fids;
+ map<WordID, FidTree> levels;
+ };
+ mutable FidTree fidroot_;
+};
+
+NgramDetector::NgramDetector(const string& param) {
+ string filename, mapfile, featname;
+ bool explicit_markers = (param == "-x");
+ pimpl_ = new NgramDetectorImpl(explicit_markers);
+ SetStateSize(pimpl_->ReserveStateSize());
+}
+
+NgramDetector::~NgramDetector() {
+ delete pimpl_;
+}
+
+void NgramDetector::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */,
+ const Hypergraph::Edge& edge,
+ const vector& ant_states,
+ SparseVector* features,
+ SparseVector* estimated_features,
+ void* state) const {
+ pimpl_->LookupWords(*edge.rule_, ant_states, features, estimated_features, state);
+}
+
+void NgramDetector::FinalTraversalFeatures(const void* ant_state,
+ SparseVector<double>* features) const {
+ pimpl_->FinalTraversal(ant_state, features);
+}
+
diff --git a/decoder/ff_ngrams.h b/decoder/ff_ngrams.h
new file mode 100644
index 00000000..82f61b33
--- /dev/null
+++ b/decoder/ff_ngrams.h
@@ -0,0 +1,29 @@
+#ifndef _NGRAMS_FF_H_
+#define _NGRAMS_FF_H_
+
+#include <vector>
+#include <string>