diff options
-rw-r--r-- | gi/pf/Makefile.am | 4 | ||||
-rw-r--r-- | gi/pf/align-lexonly-pyp.cc | 327 | ||||
-rw-r--r-- | gi/pf/base_measures.cc | 47 | ||||
-rw-r--r-- | gi/pf/base_measures.h | 18 | ||||
-rw-r--r-- | gi/pf/conditional_pseg.h | 74 | ||||
-rwxr-xr-x | word-aligner/stemmers/ur.pl | 38 |
6 files changed, 507 insertions, 1 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7c8e89d0..28367e67 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc + itg_SOURCES = itg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..d2630a2b --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,327 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr<MT19937> prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { + vector<WordID> src; + vector<WordID> trg; + vector<LexicalAlignment> a; + Array2D<short> posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector<WordID>& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0] = exp(logp0(rule.e_)); + return prob_t(r.prob(rule.e_, v, l)); + } + + void Increment(const TRule& rule) { + v[0] = exp(logp0(rule.e_)); + if (r.increment(rule.e_, v, l, &*prng).count) { + base *= prob_t(v[0] * l[0]); + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<vector<WordID> > r; + const double u0; + const vector<double> l; + mutable vector<double> v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, + const unsigned words_e, + const unsigned letters_e, + vector<AlignedSentencePair>* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector<vector<WordID> >& letters; // spelling dictionary + vector<AlignedSentencePair>& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; + //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; + //ConditionalTranslationModel<UnigramWordBase> tmodel; + //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; + MConditionalTranslationModel<HierarchicalWordBase> tmodel; + //ConditionalTranslationModel<FixedNgramBase> tmodel; + //ConditionalTranslationModel<CompletelyUniformBase> tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { + for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { + vector<WordID>& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D<bool> a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector<AlignedSentencePair> corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector<vector<WordID> > letters(TD::NumWords()); + set<WordID> letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as<unsigned>(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 97b4e698..7894d3e7 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -6,6 +6,53 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + vector<int> le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x] = atof(TD::Convert(cur)); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc, const vector<WordID>& vtrg, int start_src, int start_trg) const { diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index a4e9ac28..7214aa22 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -72,6 +72,24 @@ struct UnigramWordBase { const UnigramWordModel un; }; +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule); + assert(it != table.end()); + return it->second; + } + + std::tr1::unordered_map<TRule,prob_t,RuleHasher> table; +}; + struct PhraseConditionalUninformativeBase { explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : kUNIFORM_TARGET(1.0 / vocab_e_size) { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index edcdc813..db951d15 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -8,11 +8,85 @@ #include "prob.h" #include "ccrp_nt.h" +#include "mfcr.h" #include "trule.h" #include "base_measures.h" #include "tdict.h" template <typename ConditionalBaseMeasure> +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << -1 << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, MFCR<TRule>(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + } + p0s[0] = rp0(rule).as_float(); + TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p0s[0] = rp0(rule).as_float(); + p = prob_t(it->second.prob(rule, p0s, lambdas)); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); +#if 0 + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } +#endif + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + MFCR<TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; + std::vector<double> lambdas; + mutable std::vector<double> p0s; +}; + +template <typename ConditionalBaseMeasure> struct ConditionalTranslationModel { explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : rp0(rcp0) {} diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl new file mode 100755 index 00000000..3a4f5a45 --- /dev/null +++ b/word-aligner/stemmers/ur.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while(<STDIN>) { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(al|Al)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + push @out, $tw; + } + if ($vocab) { + die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1; + print "$_ @out\n"; + } else { + print "@out\n"; + } +} + |