#include #include #include #include #include #include #include "viterbi.h" #include "hg.h" #include "trule.h" #include "tdict.h" #include "filelib.h" #include "dict.h" #include "sampler.h" #include "ccrp_nt.h" #include "ccrp_onetable.h" using namespace std; using namespace tr1; namespace po = boost::program_options; ostream& operator<<(ostream& os, const vector& p) { os << '['; for (int i = 0; i < p.size(); ++i) os << (i==0 ? "" : " ") << TD::Convert(p[i]); return os << ']'; } double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; } struct Model1 { explicit Model1(const string& fname) : kNULL(TD::Convert("")), kZERO() { LoadModel1(fname); } void LoadModel1(const string& fname) { cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; ReadFile rf(fname); istream& in = *rf.stream(); string line; unsigned lc = 0; while(getline(in, line)) { ++lc; int cur = 0; int start = 0; while(cur < line.size() && line[cur] != ' ') { ++cur; } assert(cur != line.size()); line[cur] = 0; const WordID src = TD::Convert(&line[0]); ++cur; start = cur; while(cur < line.size() && line[cur] != ' ') { ++cur; } assert(cur != line.size()); line[cur] = 0; WordID trg = TD::Convert(&line[start]); const double logprob = strtod(&line[cur + 1], NULL); if (src >= ttable.size()) ttable.resize(src + 1); ttable[src][trg].logeq(logprob); } cerr << " read " << lc << " parameters.\n"; } // returns prob 0 if src or trg is not found! const prob_t& operator()(WordID src, WordID trg) const { if (src == 0) src = kNULL; if (src < ttable.size()) { const map& cpd = ttable[src]; const map::const_iterator it = cpd.find(trg); if (it != cpd.end()) return it->second; } return kZERO; } const WordID kNULL; const prob_t kZERO; vector > ttable; }; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("particles,p",po::value()->default_value(25),"Number of particles") ("input,i",po::value(),"Read parallel data from") ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") ("help,h", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); if (conf->count("config")) { ifstream config((*conf)["config"].as().c_str()); po::store(po::parse_config_file(config, dconfig_options), *conf); } po::notify(*conf); if (conf->count("help") || (conf->count("input") == 0)) { cerr << dcmdline_options << endl; exit(1); } } void ReadParallelCorpus(const string& filename, vector >* f, vector >* e, set* vocab_f, set* vocab_e) { f->clear(); e->clear(); vocab_f->clear(); vocab_e->clear(); istream* in; if (filename == "-") in = &cin; else in = new ifstream(filename.c_str()); assert(*in); string line; const WordID kDIV = TD::Convert("|||"); vector tmp; while(*in) { getline(*in, line); if (line.empty() && !*in) break; e->push_back(vector()); f->push_back(vector()); vector& le = e->back(); vector& lf = f->back(); tmp.clear(); TD::ConvertSentence(line, &tmp); bool isf = true; for (unsigned i = 0; i < tmp.size(); ++i) { const int cur = tmp[i]; if (isf) { if (kDIV == cur) { isf = false; } else { lf.push_back(cur); vocab_f->insert(cur); } } else { assert(cur != kDIV); le.push_back(cur); vocab_e->insert(cur); } } assert(isf == false); } if (in != &cin) delete in; } int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); const unsigned particles = conf["particles"].as(); const unsigned samples = conf["samples"].as(); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; } shared_ptr prng; if (conf.count("random_seed")) prng.reset(new MT19937(conf["random_seed"].as())); else prng.reset(new MT19937); MT19937& rng = *prng; vector > corpuse, corpusf; set vocabe, vocabf; cerr << "Reading corpus...\n"; ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as()); Model1 invm1(conf["inverse_model1"].as()); for (int si = 0; si < conf["samples"].as(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { const vector& src = corpusf[ci]; const vector& trg = corpuse[ci]; for (int i = 0; i < src.size(); ++i) { for (int j = 0; j < trg.size(); ++j) { const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); for (int k = 0; k < eff_max_src; ++k) { const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); for (int l = 0; l < eff_max_trg; ++l) { } } } } } } }