diff options
-rw-r--r-- | compound-split/cdec-de.ini | 4 | ||||
-rw-r--r-- | compound-split/de/cdec-train.ini | 2 | ||||
-rw-r--r-- | compound-split/de/charlm.rev.5gm.de.klm | bin | 0 -> 14148755 bytes | |||
-rw-r--r-- | configure.ac | 27 | ||||
-rw-r--r-- | decoder/decoder.cc | 9 | ||||
-rw-r--r-- | decoder/ff_bleu.cc | 5 | ||||
-rw-r--r-- | decoder/ff_csplit.cc | 93 | ||||
-rw-r--r-- | decoder/ff_csplit.h | 5 | ||||
-rwxr-xr-x | decoder/ff_from_fsa.h | 15 | ||||
-rw-r--r-- | decoder/ff_lm.cc | 111 | ||||
-rw-r--r-- | decoder/ff_lm.h | 5 | ||||
-rwxr-xr-x | decoder/ff_lm_fsa.h | 13 | ||||
-rwxr-xr-x | decoder/ff_sample_fsa.h | 2 | ||||
-rw-r--r-- | utils/sparse_vector.h | 5 | ||||
-rw-r--r-- | utils/tdict.cc | 74 | ||||
-rw-r--r-- | utils/tdict.h | 23 |
16 files changed, 111 insertions, 282 deletions
diff --git a/compound-split/cdec-de.ini b/compound-split/cdec-de.ini index 65065487..2bfe63d2 100644 --- a/compound-split/cdec-de.ini +++ b/compound-split/cdec-de.ini @@ -1,5 +1,5 @@ formalism=csplit -cubepruning_pop_limit=100000 +intersection_strategy=full weights=de/weights.trained feature_function=CSplit_BasicFeatures de/large_dict.de.gz de/badlist.de.gz -feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.lm.gz +feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.klm diff --git a/compound-split/de/cdec-train.ini b/compound-split/de/cdec-train.ini index 44f5934d..383fa040 100644 --- a/compound-split/de/cdec-train.ini +++ b/compound-split/de/cdec-train.ini @@ -2,4 +2,4 @@ formalism=csplit # crf_uniform_empirical=true intersection_strategy=full feature_function=CSplit_BasicFeatures large_dict.de.gz badlist.de.gz -feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.lm.gz +feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.klm diff --git a/compound-split/de/charlm.rev.5gm.de.klm b/compound-split/de/charlm.rev.5gm.de.klm Binary files differnew file mode 100644 index 00000000..574898dc --- /dev/null +++ b/compound-split/de/charlm.rev.5gm.de.klm diff --git a/configure.ac b/configure.ac index c84d671f..56f08147 100644 --- a/configure.ac +++ b/configure.ac @@ -36,13 +36,6 @@ then LIBS="$LIBS -lboost_mpi $BOOST_SERIALIZATION_LIBS -lmpi++ -lmpi" fi -AM_CONDITIONAL([SRI_LM], false) -AC_ARG_WITH(srilm, - [AC_HELP_STRING([--with-srilm=PATH], [(optional) path to SRI's LM toolkit])], - [with_srilm=$withval], - [with_srilm=no] - ) - AM_CONDITIONAL([RAND_LM], false) AC_ARG_WITH(randlm, [AC_HELP_STRING([--with-randlm=PATH], [(optional) path to RandLM toolkit])], @@ -58,26 +51,6 @@ AC_ARG_WITH(glc, ) FF_GLC="" -if test "x$with_srilm" != 'xno' -then - SAVE_CPPFLAGS="$CPPFLAGS" - CPPFLAGS="$CPPFLAGS -I${with_srilm}/include" - - AC_CHECK_HEADER(Ngram.h, - [AC_DEFINE([HAVE_SRILM], [], [flag for SRILM])], - [AC_MSG_ERROR([Cannot find SRILM!])]) - - LIB_SRILM="-loolm -ldstruct -lmisc" - # ROOT/lib/i686-m64/liboolm.a - # ROOT/lib/i686-m64/libdstruct.a - # ROOT/lib/i686-m64/libmisc.a - MY_ARCH=`${with_srilm}/sbin/machine-type` - LDFLAGS="$LDFLAGS -L${with_srilm}/lib/${MY_ARCH}" - LIBS="$LIBS $LIB_SRILM" - FMTLIBS="$FMTLIBS liboolm.a libdstruct.a libmisc.a" - AM_CONDITIONAL([SRI_LM], true) -fi - if test "x$with_randlm" != 'xno' then SAVE_CPPFLAGS="$CPPFLAGS" diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 239c8620..95ff6270 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -763,10 +763,6 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { PRWeightFunction<double, EdgeProb, double, ELengthWeightFunction> >(forest); cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; } - if (conf.count("show_partition")) { - const prob_t z = Inside<prob_t, EdgeProb>(forest); - cerr << " Init. partition log(Z): " << log(z) << endl; - } for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; @@ -793,6 +789,11 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,show_features,cur_weights,oracle.show_derivation); } + if (conf.count("show_partition")) { + const prob_t z = Inside<prob_t, EdgeProb>(forest); + cerr << " " << passtr << " partition log(Z): " << log(z) << endl; + } + string fullbp = "beam_prune" + StringSuffixForRescoringPass(pass); string fulldp = "density_prune" + StringSuffixForRescoringPass(pass); maybe_prune(forest,conf,fullbp.c_str(),fulldp.c_str(),passtr,srclen); diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc index aa4e6d85..a842bba8 100644 --- a/decoder/ff_bleu.cc +++ b/decoder/ff_bleu.cc @@ -13,8 +13,6 @@ char const* bleu_usage_verbose="Uses feature id 0! Make sure there are no other #include "ff_bleu.h" #include "tdict.h" -#include "Vocab.h" -#include "Ngram.h" #include "hg.h" #include "stringlib.h" #include "sentence_metadata.h" @@ -25,7 +23,7 @@ using namespace std; class BLEUModelImpl { public: explicit BLEUModelImpl(int order) : - ngram_(TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), + buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), floor_(-100.0), kSTART(TD::Convert("<s>")), kSTOP(TD::Convert("</s>")), @@ -219,7 +217,6 @@ class BLEUModelImpl { } protected: - Ngram ngram_; vector<WordID> buffer_; const int order_; const int state_size_; diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 204b7ce6..dee6f4f9 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -3,8 +3,7 @@ #include <set> #include <cstring> -#include "Vocab.h" -#include "Ngram.h" +#include "klm/lm/model.hh" #include "sentence_metadata.h" #include "lattice.h" @@ -155,51 +154,62 @@ void BasicCSplitFeatures::TraversalFeaturesImpl( pimpl_->TraversalFeaturesImpl(edge, smeta.GetSourceLattice().size(), features); } +namespace { +struct CSVMapper : public lm::ngram::EnumerateVocab { + CSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } + void Add(lm::WordIndex index, const StringPiece &str) { + const WordID cdec_id = TD::Convert(str.as_string()); + if (cdec_id >= out_->size()) + out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); + (*out_)[cdec_id] = index; + } + vector<lm::WordIndex>* out_; + const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +template<class Model> struct ReverseCharLMCSplitFeatureImpl { - ReverseCharLMCSplitFeatureImpl(const string& param) : - order_(5), - vocab_(TD::dict_), - ngram_(vocab_, order_) { - kBOS = vocab_.getIndex("<s>"); - kEOS = vocab_.getIndex("</s>"); - File file(param.c_str(), "r", 0); - assert(file); - cerr << "Reading " << order_ << "-gram LM from " << param << endl; - ngram_.read(file); + ReverseCharLMCSplitFeatureImpl(const string& param) { + CSVMapper vm(&cdec2klm_map_); + lm::ngram::Config conf; + conf.enumerate_vocab = &vm; + cerr << "Reading character LM from " << param << endl; + ngram_ = new Model(param.c_str(), conf); + order_ = ngram_->Order(); + kEOS = MapWord(TD::Convert("</s>")); + assert(kEOS > 0); + } + lm::WordIndex MapWord(const WordID w) const { + if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; + return 0; } double LeftPhonotacticProb(const Lattice& inword, const int start) { const int end = inword.size(); - for (int i = 0; i < order_; ++i) - sc[i] = kBOS; + lm::ngram::State state = ngram_->BeginSentenceState(); int sp = min(end - start, order_ - 1); // cerr << "[" << start << "," << sp << "]\n"; - int ci = (order_ - sp - 1); - int wi = start; + int wi = start + sp - 1; while (sp > 0) { - sc[ci] = inword[wi][0].label; - // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl; - ++wi; - ++ci; + const lm::ngram::State scopy(state); + ngram_->Score(scopy, MapWord(inword[wi][0].label), state); + --wi; --sp; } - // cerr << " END ci=" << ci << endl; - sc[ci] = Vocab_None; - const double startprob = ngram_.wordProb(kEOS, sc); - // cerr << " PROB=" << startprob << endl; + const lm::ngram::State scopy(state); + const double startprob = ngram_->Score(scopy, kEOS, state); return startprob; } private: - const int order_; - Vocab& vocab_; - VocabIndex kBOS; - VocabIndex kEOS; - Ngram ngram_; - VocabIndex sc[80]; + Model* ngram_; + int order_; + vector<lm::WordIndex> cdec2klm_map_; + lm::WordIndex kEOS; }; ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) : - pimpl_(new ReverseCharLMCSplitFeatureImpl(param)), + pimpl_(new ReverseCharLMCSplitFeatureImpl<lm::ngram::ProbingModel>(param)), fid_(FD::Convert("RevCharLM")) {} void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( @@ -217,26 +227,5 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( if (edge.rule_->EWords() != 1) return; const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); features->set_value(fid_, lpp); -#if 0 - WordID neighbor_word = 0; - const WordID word = edge.rule_->e_[1]; - const char* sword = TD::Convert(word); - const int len = strlen(sword); - int cur = 0; - int chars = 0; - while(cur < len) { - cur += UTF8Len(sword[cur]); - ++chars; - } - if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) { - neighbor_word = TD::Convert(string(&sword[1])); - } - if (neighbor_word) { - float nfreq = freq_dict_.LookUp(neighbor_word); - cerr << "COMPARE: " << TD::Convert(word) << " & " << TD::Convert(neighbor_word) << endl; - if (!nfreq) nfreq = 99.0f; - features->set_value(fdoes_deletion_help_, (freq - nfreq)); - } -#endif } diff --git a/decoder/ff_csplit.h b/decoder/ff_csplit.h index c1cfb64b..38c0c5b8 100644 --- a/decoder/ff_csplit.h +++ b/decoder/ff_csplit.h @@ -4,6 +4,7 @@ #include <boost/shared_ptr.hpp> #include "ff.h" +#include "klm/lm/model.hh" class BasicCSplitFeaturesImpl; class BasicCSplitFeatures : public FeatureFunction { @@ -20,7 +21,7 @@ class BasicCSplitFeatures : public FeatureFunction { boost::shared_ptr<BasicCSplitFeaturesImpl> pimpl_; }; -class ReverseCharLMCSplitFeatureImpl; +template <class M> class ReverseCharLMCSplitFeatureImpl; class ReverseCharLMCSplitFeature : public FeatureFunction { public: ReverseCharLMCSplitFeature(const std::string& param); @@ -32,7 +33,7 @@ class ReverseCharLMCSplitFeature : public FeatureFunction { SparseVector<double>* estimated_features, void* out_context) const; private: - boost::shared_ptr<ReverseCharLMCSplitFeatureImpl> pimpl_; + boost::shared_ptr<ReverseCharLMCSplitFeatureImpl<lm::ngram::ProbingModel> > pimpl_; const int fid_; }; diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index 26aca048..f2db8a4b 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -3,6 +3,11 @@ #include "ff_fsa.h" +#ifndef TD__none +// replacing dependency on SRILM +#define TD__none -1 +#endif + #ifndef FSA_FF_DEBUG # define FSA_FF_DEBUG 0 #endif @@ -94,7 +99,7 @@ public: return; } - // bear with me, because this is hard to understand. reminder: ant_contexts and out_state are left-words first (up to M, TD::none padded). if all M words are present, then FSA state follows. otherwise 0 bytes to keep memcmp/hash happy. + // bear with me, because this is hard to understand. reminder: ant_contexts and out_state are left-words first (up to M, TD__none padded). if all M words are present, then FSA state follows. otherwise 0 bytes to keep memcmp/hash happy. //why do we compute heuristic in so many places? well, because that's how we know what state we should score words in once we're full on our left context (because of markov order bound, we know the score will be the same no matter what came before that left context) // these left_* refer to our output (out_state): @@ -163,7 +168,7 @@ public: if (left_out<left_full) { // finally: partial heuristic for unfilled items // fsa.reset(ff.heuristic_start_state()); fsa.scan(left_begin,left_out,&h_accum); ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,ff.heuristic_start_state(),&h_accum); - do { *left_out++=TD::none; } while(left_out<left_full); // none-terminate so left_end(out_state) will know how many words + do { *left_out++=TD__none; } while(left_out<left_full); // none-terminate so left_end(out_state) will know how many words ff.state_zero(out_fsa_state); // so we compare / hash correctly. don't know state yet because left context isn't full } else // or else store final right-state. heuristic was already assigned ff.state_copy(out_fsa_state,fsa.cs); @@ -233,7 +238,7 @@ public: static void test() { WordID w1[1],w1b[1],w2[2]; w1[0]=w2[0]=TD::Convert("hi"); - w2[1]=w1b[0]=TD::none; + w2[1]=w1b[0]=TD__none; assert(left_end(w1,w1+1)==w1+1); assert(left_end(w1b,w1b+1)==w1b); assert(left_end(w2,w2+2)==w2+1); @@ -262,12 +267,12 @@ private: /* state layout: left WordIds, followed by fsa state left words have never been scored. last ones remaining will be scored on FinalTraversalFeatures only. - right state is unknown until we have all M left words (less than M means TD::none will pad out right end). unk right state will be zeroed out for proper hash/equal recombination. + right state is unknown until we have all M left words (less than M means TD__none will pad out right end). unk right state will be zeroed out for proper hash/equal recombination. */ static inline WordID const* left_end(WordID const* left, WordID const* e) { for (;e>left;--e) - if (e[-1]!=TD::none) break; + if (e[-1]!=TD__none) break; //post: [left,e] are the seen left words return e; } diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index a9929253..afa36b96 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -59,8 +59,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight #include "fast_lexical_cast.hpp" #include "tdict.h" -#include "Vocab.h" -#include "Ngram.h" #include "hg.h" #include "stringlib.h" @@ -80,41 +78,9 @@ string LanguageModel::usage(bool param,bool verbose) { } -// NgramShare will keep track of all loaded lms and reuse them. -//TODO: ref counting by shared_ptr? for now, first one to load LM needs to stick around as long as all subsequent users. - #include <boost/shared_ptr.hpp> using namespace boost; -//WARNING: first person to add a pointer to ngram must keep it around until others are done using it. -struct NgramShare -{ -// typedef shared_ptr<Ngram> NP; - typedef Ngram *NP; - map<string,NP> ns; - bool have(string const& file) const - { - return ns.find(file)!=ns.end(); - } - NP get(string const& file) const - { - assert(have(file)); - return ns.find(file)->second; - } - void set(string const& file,NP n) - { - ns[file]=n; - } - void add(string const& file,NP n) - { - assert(!have(file)); - set(file,n); - } -}; - -//TODO: namespace or static? -NgramShare ngs; - namespace NgramCache { struct Cache { map<WordID, Cache> tree; @@ -215,37 +181,28 @@ class LanguageModelImpl : public LanguageModelInterface { state_size_ = OrderToStateSize(order)-1; unigram=(order<=1); floor_ = -100; - kSTART = TD::ss; - kSTOP = TD::se; - kUNKNOWN = TD::unk; - kNONE = TD::none; + kSTART = TD::Convert("<s>"); + kSTOP = TD::Convert("</s>"); + kUNKNOWN = TD::Convert("<unk>"); + kNONE = 0; kSTAR = TD::Convert("<{STAR}>"); } public: - explicit LanguageModelImpl(int order) : ngram_(TD::dict_, order) + explicit LanguageModelImpl(int order) { init(order); } -//TODO: show that unigram special case (0 state) computes what it should. - LanguageModelImpl(int order, const string& f, int load_order=0) : - ngram_(TD::dict_, load_order ? load_order : order) - { - init(order); - File file(f.c_str(), "r", 0); - assert(file); - cerr << "Reading " << order_ << "-gram LM from " << f << endl; - ngram_.read(file, false); - } - virtual ~LanguageModelImpl() { } - Ngram *get_lm() // for make_lm_impl ngs sharing only. + //Ngram *get_lm() // for make_lm_impl ngs sharing only. + void *get_lm() // for make_lm_impl ngs sharing only. { - return &ngram_; + //return &ngram_; + return 0; } @@ -258,17 +215,19 @@ class LanguageModelImpl : public LanguageModelInterface { } virtual double WordProb(WordID word, WordID const* context) { - return ngram_.wordProb(word, (VocabIndex*)context); + return -100; + //return ngram_.wordProb(word, (VocabIndex*)context); } // may be shorter than actual null-terminated length. context must be null terminated. len is just to save effort for subclasses that don't support contextID virtual int ContextSize(WordID const* context,int len) { unsigned ret; - ngram_.contextID((VocabIndex*)context,ret); + //ngram_.contextID((VocabIndex*)context,ret); return ret; } virtual double ContextBOW(WordID const* context,int shortened_len) { - return ngram_.contextBOW((VocabIndex*)context,shortened_len); + //return ngram_.contextBOW((VocabIndex*)context,shortened_len); + return -100; } inline double LookupProbForBufferContents(int i) { @@ -457,7 +416,6 @@ public: } protected: - Ngram ngram_; vector<WordID> buffer_; int order_; int state_size_; @@ -470,8 +428,7 @@ public: bool unigram; }; -struct ClientLMI : public LanguageModelImpl -{ +struct ClientLMI : public LanguageModelImpl { ClientLMI(int order,string const& server) : LanguageModelImpl(order), client_(server) {} @@ -489,37 +446,13 @@ protected: LMClient client_; }; -struct ReuseLMI : public LanguageModelImpl -{ - ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) - {} - double WordProb(int word, WordID const* context) { - return ng->wordProb(word, (VocabIndex*)context); - } - virtual int ContextSize(WordID const* context, int len) { - unsigned ret; - ng->contextID((VocabIndex*)context,ret); - return ret; - } - virtual double ContextBOW(WordID const* context,int shortened_len) { - return ng->contextBOW((VocabIndex*)context,shortened_len); - } -protected: - Ngram *ng; -}; - LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order) { if (f.find("lm://") == 0) { return new ClientLMI(order,f.substr(5)); - } else if (load_order==0 && ngs.have(f)) { - cerr<<"Reusing already loaded Ngram LM: "<<f<<endl; - return new ReuseLMI(order,ngs.get(f)); } else { - LanguageModelImpl *r=new LanguageModelImpl(order,f,load_order); - if (!load_order || !ngs.have(f)) - ngs.add(f,r->get_lm()); - return r; + cerr << "LanguageModel no longer supports non-remote LMs. Please use KLanguageModel!\nPlease see http://cdec-decoder.org/index.php?title=Language_model_notes\n"; + abort(); } } @@ -600,12 +533,12 @@ void LanguageModelFsa::set_ngram_order(int i) { WordID *ss=(WordID*)start.begin(); WordID *hs=(WordID*)h_start.begin(); if (ctxlen_) { // avoid segfault in case of unigram lm (0 state) - set_end_phrase(TD::se); + set_end_phrase(TD::Convert("</s>")); // se is pretty boring in unigram case, just adds constant prob. check that this is what we want - ss[0]=TD::ss; // start-sentence context (length 1) - hs[0]=TD::none; // empty context + ss[0]=TD::Convert("<s>"); // start-sentence context (length 1) + hs[0]=0; // empty context for (int i=1;i<ctxlen_;++i) { - ss[i]=hs[i]=TD::none; // need this so storage is initialized for hashing. + ss[i]=hs[i]=0; // need this so storage is initialized for hashing. //TODO: reevaluate whether state space comes cleared by allocator or not. } } @@ -627,7 +560,7 @@ void LanguageModelFsa::print_state(ostream &o,void const* st) const { for (int i=ctxlen_;i>0;sp=true) { --i; WordID w=wst[i]; - if (w==TD::none) continue; + if (w==0) continue; if (sp) o<<' '; o << TD::Convert(w); } diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h index e682481d..8885efce 100644 --- a/decoder/ff_lm.h +++ b/decoder/ff_lm.h @@ -8,6 +8,9 @@ #include "ff.h" #include "config.h" +// everything in this file is deprecated and may be broken. +// Chris Dyer, Mar 2011 + class LanguageModelInterface { public: double floor_; @@ -29,7 +32,7 @@ class LanguageModelInterface { double p=ContextBOW(context,slen); while (len>slen) { --len; - context[len]=TD::none; + context[len]=0; } return p; } diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index d2df943e..85b7ef44 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -21,8 +21,13 @@ #include "ff_fsa.h" #include "ff_lm.h" +#ifndef TD__none +// replacing dependency on SRILM +#define TD__none -1 +#endif + namespace { -WordID empty_context=TD::none; +WordID empty_context=TD__none; } struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { @@ -40,7 +45,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { } static inline WordID const* left_end(WordID const* left, WordID const* e) { for (;e>left;--e) - if (e[-1]!=TD::none) break; + if (e[-1]!=TD__none) break; //post: [left,e] are the seen left words return e; } @@ -55,7 +60,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { } else { WordID ctx[ngram_order_]; //alloca if you don't have C99 state_copy(ctx,old_st); - ctx[ctxlen_]=TD::none; + ctx[ctxlen_]=TD__none; Featval p=floored(pimpl_->WordProb(w,ctx)); FSALMDBG(de,"p("<<TD::Convert(w)<<"|"<<TD::Convert(ctx,ctx+ctxlen_)<<")="<<p);FSALMDBGnl(de); // states are srilm contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.). @@ -88,7 +93,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> { WP st_end=st+ctxlen_; // may include some null already (or none if full) int nboth=nw+ctxlen_; WordID ctx[nboth+1]; - ctx[nboth]=TD::none; + ctx[nboth]=TD__none; // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0] W ctx_score_end=wordcpy_reverse(ctx,begin,end); wordcpy(ctx_score_end,st,st_end); // st already reversed. diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 20d64b16..74d71b6a 100755 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h @@ -114,7 +114,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> { // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> { ShorterThanPrev(std::string const& param) - : FsaTypedBase<int,ShorterThanPrev>(-1,4,singleton_sentence(TD::se)) // start, h_start, end_phrase + : FsaTypedBase<int,ShorterThanPrev>(-1,4,singleton_sentence(TD::Convert("</s>"))) // start, h_start, end_phrase // h_start estimate state: anything <4 chars is usually shorter than previous { Init(); } static std::string usage(bool param,bool verbose) { diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index f76fc14c..1bcb9502 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -201,6 +201,11 @@ public: return found==values_.end() || !found->second; } + void remove_zeros() { + typename MapType::iterator it = values_.begin(); + for (; it != values_.end(); ++it) + if (!it->second) values_.erase(it); + } T get(int index) const { typename MapType::const_iterator found = values_.find(index); diff --git a/utils/tdict.cc b/utils/tdict.cc index 1f68feae..23a298f8 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -5,93 +5,27 @@ #include <stdlib.h> #include <cstring> #include <sstream> -#include "Ngram.h" #include "dict.h" #include "tdict.h" -#include "Vocab.h" #include "stringlib.h" #include "threadlocal.h" using namespace std; -Vocab TD::dict_(0,TD::max_wordid); -WordID TD::ss=dict_.ssIndex(); -WordID TD::se=dict_.seIndex(); -WordID TD::unk=dict_.unkIndex(); -char const*const TD::ss_str=Vocab_SentStart; -char const*const TD::se_str=Vocab_SentEnd; -char const*const TD::unk_str=Vocab_Unknown; - -// pre+(i-base)+">" for i in [base,e) -inline void pad(std::string const& pre,int base,int e) { - assert(base<=e); - ostringstream o; - for (int i=base;i<e;++i) { - o.str(pre); - o<<(i-base)<<'>'; - WordID id=TD::Convert(o.str()); - assert(id==i); // this fails. why? - } -} - - -namespace { -struct TD_init { - TD_init() { - /* - // disabled for now since it's breaking trunk - assert(TD::Convert(TD::ss_str)==TD::ss); - assert(TD::Convert(TD::se_str)==TD::se); - assert(TD::Convert(TD::unk_str)==TD::unk); - assert(TD::none==Vocab_None); - pad("<FILLER",TD::end(),TD::reserved_begin); - assert(TD::end()==TD::reserved_begin); - int reserved_end=TD::begin(); - pad("<RESERVED",TD::end(),reserved_end); - assert(TD::end()==reserved_end); - */ - } -}; - -TD_init td_init; -} - -unsigned int TD::NumWords() { - return dict_.numWords(); -} -WordID TD::end() { - return dict_.highIndex(); -} +Dict TD::dict_; WordID TD::Convert(const std::string& s) { - return dict_.addWord((VocabString)s.c_str()); + return dict_.Convert(s); } WordID TD::Convert(char const* s) { - return dict_.addWord((VocabString)s); + return dict_.Convert(string(s)); } - -#if TD_ALLOW_UNDEFINED_WORDIDS -# include "static_utoa.h" -char undef_prefix[]="UNDEF_"; -static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]); -THREADLOCAL char undef_buf[]="UNDEF_________________"; -inline char const* undef_token(WordID w) -{ - append_utoa(undef_buf+undefpre_n,w); - return undef_buf; -} -#endif - const char* TD::Convert(WordID w) { -#if TD_ALLOW_UNDEFINED_WORDIDS - if (w>=dict_.highIndex()) return undef_token(w); -#endif - return dict_.getWord((VocabIndex)w); + return dict_.Convert(w).c_str(); } - void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { ids->clear(); for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) diff --git a/utils/tdict.h b/utils/tdict.h index dd7f0237..393146fa 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -6,29 +6,10 @@ #include "wordid.h" #include <assert.h> -class Vocab; +class Dict; struct TD { - /* // disabled for now - static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "<FILLERi>" - static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>' - static inline WordID reserved(int i) { - assert(i>=0 && i<n_reserved); - return (WordID)(reserved_begin+i); - } - static inline WordID begin() { - return reserved(n_reserved); - } - */ - static const WordID max_wordid=0x7fffffff; - static const WordID null=max_wordid-1; - static const WordID none=(WordID)-1; // Vocab_None - this will collide with mixed node/variable id / word space, though. max_wordid will be distinct (still positive) - static char const* const ss_str; //="<s>"; - static char const* const se_str; //="</s>"; - static char const* const unk_str; //="<unk>"; - static WordID ss,se,unk; // x=Convert(x_str) static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far - static Vocab dict_; static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids); static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids); static std::string GetString(const std::vector<WordID>& str); @@ -38,6 +19,8 @@ struct TD { static WordID Convert(const std::string& s); static WordID Convert(char const* s); static const char* Convert(WordID w); + private: + static Dict dict_; }; struct ToTD { |