diff options
Diffstat (limited to 'gi/pf/transliterations.cc')
-rw-r--r-- | gi/pf/transliterations.cc | 198 |
1 files changed, 58 insertions, 140 deletions
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 6e0c2e93..e29334fd 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -2,173 +2,92 @@ #include <iostream> #include <vector> -#include <tr1/unordered_map> -#include "grammar.h" -#include "bottom_up_parser.h" -#include "hg.h" -#include "hg_intersect.h" +#include "boost/shared_ptr.hpp" + #include "filelib.h" #include "ccrp.h" #include "m.h" -#include "lattice.h" -#include "verbose.h" +#include "reachability.h" using namespace std; using namespace std::tr1; -static WordID kX; -static int kMAX_SRC_SIZE = 0; -static vector<vector<WordID> > cur_trg_chunks; - -vector<GrammarIter*> tlttofreelist; - -static void InitTargetChunks(int max_size, const vector<WordID>& trg) { - cur_trg_chunks.clear(); - vector<WordID> tmp; - unordered_set<vector<WordID>, boost::hash<vector<WordID> > > u; - for (int len = 1; len <= max_size; ++len) { - int end = trg.size() + 1; - end -= len; - for (int i = 0; i < end; ++i) { - tmp.clear(); - for (int j = 0; j < len; ++j) - tmp.push_back(trg[i + j]); - if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); - } - } -} - -struct TransliterationGrammarIter : public GrammarIter, public RuleBin { - TransliterationGrammarIter() { tlttofreelist.push_back(this); } - TransliterationGrammarIter(const TRulePtr& inr, int symbol) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - r.reset(new TRule); - } - TRule& rr = *r; - rr.lhs_ = kX; - rr.f_.push_back(symbol); - tlttofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (!r) return 0; - return cur_trg_chunks.size(); - } - virtual TRulePtr GetIthRule(int i) const { - TRulePtr nr(new TRule(*r)); - nr->e_ = cur_trg_chunks[i]; - //cerr << nr->AsString() << endl; - return nr; - } - virtual int Arity() const { - return 0; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - if (symbol <= 0) return NULL; - if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) - return new TransliterationGrammarIter(r, symbol); - else - return NULL; - } - TRulePtr r; -}; - -struct TransliterationGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new TransliterationGrammarIter; - } - virtual bool HasRuleForSpan(int, int, int distance) const { - return (distance < kMAX_SRC_SIZE); - } -}; - -struct TInfo { - TInfo() : initialized(false) {} +struct GraphStructure { + GraphStructure() : initialized(false) {} + boost::shared_ptr<Reachability> r; bool initialized; - Hypergraph lattice; // may be empty if transliteration is not possible - prob_t est_prob; // will be zero if not possible }; struct TransliterationsImpl { TransliterationsImpl() { - kX = TD::Convert("X")*-1; - kMAX_SRC_SIZE = 4; - grammars.push_back(GrammarPtr(new TransliterationGrammar)); - grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); - SetSilent(true); } void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { - if (src >= graphs.size()) graphs.resize(src + 1); - if (graphs[src][trg].initialized) return; - int kMAX_TRG_SIZE = 4; - InitTargetChunks(kMAX_TRG_SIZE, trg_lets); - ExhaustiveBottomUpParser parser("S", grammars); - Lattice lat(src_lets.size()), tlat(trg_lets.size()); - for (unsigned i = 0; i < src_lets.size(); ++i) - lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); - for (unsigned i = 0; i < trg_lets.size(); ++i) - tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); - //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; - //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; - if (!parser.Parse(lat, &graphs[src][trg].lattice)) { - //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; - abort(); - } - if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { - graphs[src][trg].est_prob = prob_t(1e-4); + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + if (graphs[src_len][trg_len].initialized) return; + graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); + +#if 0 + if (HG::Intersect(tlat, &hg)) { + // TODO } else { - graphs[src][trg].lattice.clear(); - //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; - graphs[src][trg].est_prob = prob_t::Zero(); + cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; + hg.clear(); } - for (unsigned i = 0; i < tlttofreelist.size(); ++i) - delete tlttofreelist[i]; - tlttofreelist.clear(); //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; - graphs[src][trg].initialized = true; +#endif + graphs[src_len][trg_len].initialized = true; } - const prob_t& EstimateProbability(WordID src, WordID trg) const { - assert(src < graphs.size()); - const unordered_map<WordID, TInfo>& um = graphs[src]; - const unordered_map<WordID, TInfo>::const_iterator it = um.find(trg); - assert(it != um.end()); - assert(it->second.initialized); - return it->second.est_prob; + void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + graphs[src_len][trg_len].r.reset(); + graphs[src_len][trg_len].initialized = true; } - void Forbid(WordID src, WordID trg) { - if (src >= graphs.size()) graphs.resize(src + 1); - graphs[src][trg].est_prob = prob_t::Zero(); - graphs[src][trg].initialized = true; + prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + assert(src.size() < graphs.size()); + const vector<GraphStructure>& tv = graphs[src.size()]; + assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + // TODO: do prob + return prob_t::Zero(); } void GraphSummary() const { - double tlp = 0; - int tt = 0; + double to = 0; + double tn = 0; + double tt = 0; for (int i = 0; i < graphs.size(); ++i) { - const unordered_map<WordID, TInfo>& um = graphs[i]; - unordered_map<WordID, TInfo>::const_iterator it; - for (it = um.begin(); it != um.end(); ++it) { - if (it->second.lattice.empty()) continue; - //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; - tlp += log(it->second.lattice.NumberOfPaths()); + const vector<GraphStructure>& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } } } - tlp /= tt; - cerr << "E[log paths] = " << tlp << endl; - cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; } - vector<unordered_map<WordID, TInfo> > graphs; - vector<GrammarPtr> grammars; + vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len] }; Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} @@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, Wo pimpl_->Initialize(src, src_lets, trg, trg_lets); } -prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { - return pimpl_->EstimateProbability(src,trg); +prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); } -void Transliterations::Forbid(WordID src, WordID trg) { - pimpl_->Forbid(src, trg); +void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); } void Transliterations::GraphSummary() const { pimpl_->GraphSummary(); } - |