diff options
Diffstat (limited to 'gi/pf/itg.cc')
-rw-r--r-- | gi/pf/itg.cc | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc new file mode 100644 index 00000000..2c2a86f9 --- /dev/null +++ b/gi/pf/itg.cc @@ -0,0 +1,224 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +ostream& operator<<(ostream& os, const vector<WordID>& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct Model1 { + explicit Model1(const string& fname) : + kNULL(TD::Convert("<eps>")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; + } + + // returns prob 0 if src or trg is not found! + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const map<WordID, prob_t>& cpd = ttable[src]; + const map<WordID, prob_t>::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + vector<map<WordID, prob_t> > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") + ("input,i",po::value<string>(),"Read parallel data from") + ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases") + ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases") + ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") + ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") + ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadParallelCorpus(const string& filename, + vector<vector<WordID> >* f, + vector<vector<WordID> >* e, + set<WordID>* vocab_f, + set<WordID>* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + f->push_back(vector<int>()); + vector<int>& le = e->back(); + vector<int>& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } + if (in != &cin) delete in; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); + const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + const unsigned particles = conf["particles"].as<unsigned>(); + const unsigned samples = conf["samples"].as<unsigned>(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + shared_ptr<MT19937> prng; + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector<vector<WordID> > corpuse, corpusf; + set<WordID> vocabe, vocabf; + cerr << "Reading corpus...\n"; + ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + assert(corpusf.size() == corpuse.size()); + + const int kLHS = -TD::Convert("X"); + Model1 m1(conf["model1"].as<string>()); + Model1 invm1(conf["inverse_model1"].as<string>()); + for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) { + cerr << '.' << flush; + for (int ci = 0; ci < corpusf.size(); ++ci) { + const vector<WordID>& src = corpusf[ci]; + const vector<WordID>& trg = corpuse[ci]; + for (int i = 0; i < src.size(); ++i) { + for (int j = 0; j < trg.size(); ++j) { + const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); + for (int k = 0; k < eff_max_src; ++k) { + const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); + for (int l = 0; l < eff_max_trg; ++l) { + } + } + } + } + } + } +} + |