diff options
| author | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 | 
| commit | e26434979adc33bd949566ba7bf02dff64e80a3e (patch) | |
| tree | d1c72495e3af6301bd28e7e66c42de0c7a944d1f /phrasinator | |
| parent | 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff) | |
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'phrasinator')
| -rw-r--r-- | phrasinator/Jamfile | 4 | ||||
| -rw-r--r-- | phrasinator/Makefile.am | 14 | ||||
| -rw-r--r-- | phrasinator/README | 16 | ||||
| -rw-r--r-- | phrasinator/gibbs_train_plm.cc | 309 | ||||
| -rw-r--r-- | phrasinator/gibbs_train_plm.notables.cc | 335 | ||||
| -rwxr-xr-x | phrasinator/train-phrasinator.pl | 89 | 
6 files changed, 0 insertions, 767 deletions
| diff --git a/phrasinator/Jamfile b/phrasinator/Jamfile deleted file mode 100644 index 1fc34f79..00000000 --- a/phrasinator/Jamfile +++ /dev/null @@ -1,4 +0,0 @@ -exe gibbs_train_plm : gibbs_train_plm.cc ..//utils ..//z ..//boost_program_options ; -exe gibbs_train_plm_notables : gibbs_train_plm.notables.cc ..//utils ..//z ..//boost_program_options ; - -alias programs : gibbs_train_plm gibbs_train_plm_notables ; diff --git a/phrasinator/Makefile.am b/phrasinator/Makefile.am deleted file mode 100644 index 3ddd1934..00000000 --- a/phrasinator/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -bin_PROGRAMS = gibbs_train_plm gibbs_train_plm_notables - -#head_bigram_model - -gibbs_train_plm_notables_SOURCES = gibbs_train_plm.notables.cc -gibbs_train_plm_notables_LDADD = $(top_srcdir)/utils/libutils.a -lz - -gibbs_train_plm_SOURCES = gibbs_train_plm.cc -gibbs_train_plm_LDADD = $(top_srcdir)/utils/libutils.a -lz - -#head_bigram_model_SOURCES = head_bigram_model.cc -#head_bigram_model_LDADD = $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -funroll-loops -ffast-math -W -Wall -I$(top_srcdir)/utils diff --git a/phrasinator/README b/phrasinator/README deleted file mode 100644 index fb5b93ef..00000000 --- a/phrasinator/README +++ /dev/null @@ -1,16 +0,0 @@ -The "phrasinator" uses a simple Bayesian nonparametric model to segment -text into chunks. The inferred model is then saved so that it can rapidly -predict segments in new (but related) texts. - -  Input will be a corpus of sentences, e.g.: - -    economists have argued that real interest rates have fallen . - -  The output will be a model that, when run with cdec, will produce -  a segmentation into phrasal units, e.g.: - -    economists have argued that real_interest_rates have fallen . - - -To train a model, run ./train-phrasinator.pl and follow instructions. - diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc deleted file mode 100644 index 7847a460..00000000 --- a/phrasinator/gibbs_train_plm.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include <iostream> -#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "m.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -Dict d; // global dictionary - -string Join(char joiner, const vector<int>& phrase) { -  ostringstream os; -  for (unsigned i = 0; i < phrase.size(); ++i) { -    if (i > 0) os << joiner; -    os << d.Convert(phrase[i]); -  } -  return os.str(); -} - -ostream& operator<<(ostream& os, const vector<int>& phrase) { -  for (unsigned i = 0; i < phrase.size(); ++i) -    os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); -  return os; -} - -struct UnigramLM { -  explicit UnigramLM(const string& fname) { -    ifstream in(fname.c_str()); -    assert(in); -  } - -  double logprob(unsigned word) const { -    assert(word < freqs_.size()); -    return freqs_[word]; -  } - -  vector<double> freqs_; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read file from") -        ("random_seed,S",po::value<uint32_t>(), "Random seed") -        ("write_cdec_grammar,g", po::value<string>(), "Write cdec grammar to this file") -        ("write_cdec_weights,w", po::value<string>(), "Write cdec weights to this file") -        ("poisson_length,p", "Use a Poisson distribution as the length of a phrase in the base distribuion") -        ("no_hyperparameter_inference,N", "Disable hyperparameter inference"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab) { -  c->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    c->push_back(vector<int>()); -    vector<int>& v = c->back(); -    d.ConvertWhitespaceDelimitedLine(line, &v); -    for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]); -  } -  if (in != &cin) delete in; -} - -struct UniphraseLM { -  UniphraseLM(const vector<vector<int> >& corpus, -              const set<int>& vocab, -              const po::variables_map& conf) : -    phrases_(1,1,1,1), -    gen_(1,1,1,1), -    corpus_(corpus), -    uniform_word_(1.0 / vocab.size()), -    gen_p0_(0.5), -    p_end_(0.5), -    use_poisson_(conf.count("poisson_length") > 0) {} - -  double p0(const vector<int>& phrase) const { -    static vector<double> p0s(10000, 0.0); -    assert(phrase.size() < 10000); -    double& p = p0s[phrase.size()]; -    if (p) return p; -    p = exp(log_p0(phrase)); -    if (!p) { -      cerr << "0 prob phrase: " << phrase << "\nAssigning std::numeric_limits<double>::min()\n"; -      p = std::numeric_limits<double>::min(); -    } -    return p; -  } - -  double log_p0(const vector<int>& phrase) const { -    double len_logprob; -    if (use_poisson_) -      len_logprob = Md::log_poisson(phrase.size(), 1.0); -    else -      len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); -    return log(uniform_word_) * phrase.size() + len_logprob; -  } - -  double llh() const { -    double llh = gen_.log_crp_prob(); -    llh += gen_.num_tables(false) * log(gen_p0_) + -           gen_.num_tables(true) * log(1 - gen_p0_); -    double llhr = phrases_.log_crp_prob(); -    for (CCRP<vector<int> >::const_iterator it = phrases_.begin(); it != phrases_.end(); ++it) { -      llhr += phrases_.num_tables(it->first) * log_p0(it->first); -      //llhr += log_p0(it->first); -      if (!isfinite(llh)) { -        cerr << it->first << endl; -        cerr << log_p0(it->first) << endl; -        abort(); -      } -    } -    return llh + llhr; -  } - -  void Sample(unsigned int samples, bool hyp_inf, MT19937* rng) { -    cerr << "Initializing...\n"; -    z_.resize(corpus_.size()); -    int tc = 0; -    for (unsigned i = 0; i < corpus_.size(); ++i) { -      const vector<int>& line = corpus_[i]; -      const int ls = line.size(); -      const int last_pos = ls - 1; -      vector<bool>& z = z_[i]; -      z.resize(ls); -      int prev = 0; -      for (int j = 0; j < ls; ++j) { -        z[j] = rng->next() < 0.5; -        if (j == last_pos) z[j] = true;  // break phrase at the end of the sentence -        if (z[j]) { -          const vector<int> p(line.begin() + prev, line.begin() + j + 1); -          phrases_.increment(p, p0(p), rng); -          //cerr << p << ": " << p0(p) << endl; -          prev = j + 1; -          gen_.increment(false, gen_p0_, rng); -          ++tc; // remove -        } -      } -      ++tc; -      gen_.increment(true, 1.0 - gen_p0_, rng); // end of utterance -    } -    cerr << "TC: " << tc << endl; -    cerr << "Initial LLH: " << llh() << endl; -    cerr << "Sampling...\n"; -    cerr << gen_ << endl; -    for (unsigned s = 1; s < samples; ++s) { -      cerr << '.'; -      if (s % 10 == 0) { -        cerr << " [" << s; -        if (hyp_inf) ResampleHyperparameters(rng); -        cerr << " LLH=" << llh() << "]\n"; -        vector<int> z(z_[0].size(), 0); -        //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j]; -        //SegCorpus::Write(corpus_[0], z, d); -      } -      for (unsigned i = 0; i < corpus_.size(); ++i) { -        const vector<int>& line = corpus_[i]; -        const int ls = line.size(); -        const int last_pos = ls - 1; -        vector<bool>& z = z_[i]; -        int prev = 0; -        for (int j = 0; j < last_pos; ++j) { // don't resample last position -          int next = j+1;  while(!z[next]) { ++next; } -          const vector<int> p1p2(line.begin() + prev, line.begin() + next + 1); -          const vector<int> p1(line.begin() + prev, line.begin() + j + 1); -          const vector<int> p2(line.begin() + j + 1, line.begin() + next + 1); - -          if (z[j]) { -            phrases_.decrement(p1, rng); -            phrases_.decrement(p2, rng); -            gen_.decrement(false, rng); -            gen_.decrement(false, rng); -          } else { -            phrases_.decrement(p1p2, rng); -            gen_.decrement(false, rng); -          } - -          const double d1 = phrases_.prob(p1p2, p0(p1p2)) * gen_.prob(false, gen_p0_); -          double d2 = phrases_.prob(p1, p0(p1)) * gen_.prob(false, gen_p0_); -          phrases_.increment(p1, p0(p1), rng); -          gen_.increment(false, gen_p0_, rng); -          d2 *= phrases_.prob(p2, p0(p2)) * gen_.prob(false, gen_p0_); -          phrases_.decrement(p1, rng); -          gen_.decrement(false, rng); -          z[j] = rng->SelectSample(d1, d2); - -          if (z[j]) { -            phrases_.increment(p1, p0(p1), rng); -            phrases_.increment(p2, p0(p2), rng); -            gen_.increment(false, gen_p0_, rng); -            gen_.increment(false, gen_p0_, rng); -            prev = j + 1; -          } else { -            phrases_.increment(p1p2, p0(p1p2), rng); -            gen_.increment(false, gen_p0_, rng); -          } -        } -      } -    } -//    cerr << endl << endl << gen_ << endl << phrases_ << endl; -    cerr << gen_.prob(false, gen_p0_) << " " << gen_.prob(true, 1 - gen_p0_) << endl; -  } - -  void WriteCdecGrammarForCurrentSample(ostream* os) const { -    CCRP<vector<int> >::const_iterator it = phrases_.begin(); -    for (; it != phrases_.end(); ++it) { -      (*os) << "[X] ||| " << Join(' ', it->first) << " ||| " -                          << Join('_', it->first) << " ||| C=1 P="  -                          << log(phrases_.prob(it->first, p0(it->first))) << endl; -    } -  } - -  double OOVUnigramLogProb() const { -    vector<int> x(1,99999999); -    return log(phrases_.prob(x, p0(x))); -  } - -  void ResampleHyperparameters(MT19937* rng) { -    phrases_.resample_hyperparameters(rng); -    gen_.resample_hyperparameters(rng); -    cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength(); -  } - -  CCRP<vector<int> > phrases_; -  CCRP<bool> gen_; -  vector<vector<bool> > z_;   // z_[i] is there a phrase boundary after the ith word -  const vector<vector<int> >& corpus_; -  const double uniform_word_; -  const double gen_p0_; -  const double p_end_; // in base length distribution, p of the end of a phrase -  const bool use_poisson_; -}; - - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  boost::shared_ptr<MT19937> prng; -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<int> > corpus; -  set<int> vocab; -  ReadCorpus(conf["input"].as<string>(), &corpus, &vocab); -  cerr << "Corpus size: " << corpus.size() << " sentences\n"; -  cerr << "Vocabulary size: " << vocab.size() << " types\n"; - -  UniphraseLM ulm(corpus, vocab, conf); -  ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); -  cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; - -  for (unsigned i = 0; i < corpus.size(); ++i) -//    SegCorpus::Write(corpus[i], shmmlm.z_[i], d); - ; -  if (conf.count("write_cdec_grammar")) { -    string fname = conf["write_cdec_grammar"].as<string>(); -    cerr << "Writing model to " << fname << " ...\n"; -    WriteFile wf(fname); -    ulm.WriteCdecGrammarForCurrentSample(wf.stream()); -  } - -  if (conf.count("write_cdec_weights")) { -    string fname = conf["write_cdec_weights"].as<string>(); -    cerr << "Writing weights to " << fname << " .\n"; -    WriteFile wf(fname); -    ostream& os = *wf.stream(); -    os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n"; -  } - -  return 0; -} - diff --git a/phrasinator/gibbs_train_plm.notables.cc b/phrasinator/gibbs_train_plm.notables.cc deleted file mode 100644 index 4526eaa6..00000000 --- a/phrasinator/gibbs_train_plm.notables.cc +++ /dev/null @@ -1,335 +0,0 @@ -#include <iostream> -#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_nt.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -Dict d; // global dictionary - -string Join(char joiner, const vector<int>& phrase) { -  ostringstream os; -  for (unsigned i = 0; i < phrase.size(); ++i) { -    if (i > 0) os << joiner; -    os << d.Convert(phrase[i]); -  } -  return os.str(); -} - -template <typename BType> -void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d) { -  assert(line.size() == label.size()); -  assert(label.back()); -  unsigned prev = 0; -  unsigned cur = 0; -  while (cur < line.size()) { -    if (label[cur]) { -      if (prev) cout << ' '; -      cout << "{{"; -      for (unsigned i = prev; i <= cur; ++i) -        cout << (i == prev ? "" : " ") << d.Convert(line[i]); -      cout << "}}:" << label[cur]; -      prev = cur + 1; -    } -    ++cur; -  } -  cout << endl; -} - -ostream& operator<<(ostream& os, const vector<int>& phrase) { -  for (unsigned i = 0; i < phrase.size(); ++i) -    os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); -  return os; -} - -struct UnigramLM { -  explicit UnigramLM(const string& fname) { -    ifstream in(fname.c_str()); -    assert(in); -  } - -  double logprob(unsigned word) const { -    assert(word < freqs_.size()); -    return freqs_[word]; -  } - -  vector<double> freqs_; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read file from") -        ("random_seed,S",po::value<uint32_t>(), "Random seed") -        ("write_cdec_grammar,g", po::value<string>(), "Write cdec grammar to this file") -        ("write_cdec_weights,w", po::value<string>(), "Write cdec weights to this file") -        ("poisson_length,p", "Use a Poisson distribution as the length of a phrase in the base distribuion") -        ("no_hyperparameter_inference,N", "Disable hyperparameter inference"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab) { -  c->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    c->push_back(vector<int>()); -    vector<int>& v = c->back(); -    d.ConvertWhitespaceDelimitedLine(line, &v); -    for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]); -  } -  if (in != &cin) delete in; -} - -double log_poisson(unsigned x, const double& lambda) { -  assert(lambda > 0.0); -  return log(lambda) * x - lgamma(x + 1) - lambda; -} - -struct UniphraseLM { -  UniphraseLM(const vector<vector<int> >& corpus, -              const set<int>& vocab, -              const po::variables_map& conf) : -    phrases_(1,1), -    gen_(1,1), -    corpus_(corpus), -    uniform_word_(1.0 / vocab.size()), -    gen_p0_(0.5), -    p_end_(0.5), -    use_poisson_(conf.count("poisson_length") > 0) {} - -  double p0(const vector<int>& phrase) const { -    static vector<double> p0s(10000, 0.0); -    assert(phrase.size() < 10000); -    double& p = p0s[phrase.size()]; -    if (p) return p; -    p = exp(log_p0(phrase)); -    if (!p) { -      cerr << "0 prob phrase: " << phrase << "\nAssigning std::numeric_limits<double>::min()\n"; -      p = std::numeric_limits<double>::min(); -    } -    return p; -  } - -  double log_p0(const vector<int>& phrase) const { -    double len_logprob; -    if (use_poisson_) -      len_logprob = log_poisson(phrase.size(), 1.0); -    else -      len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); -    return log(uniform_word_) * phrase.size() + len_logprob; -  } - -  double llh() const { -    double llh = gen_.log_crp_prob(); -    llh += log(gen_p0_) + log(1 - gen_p0_); -    double llhr = phrases_.log_crp_prob(); -    for (CCRP_NoTable<vector<int> >::const_iterator it = phrases_.begin(); it != phrases_.end(); ++it) { -      llhr += log_p0(it->first); -      //llhr += log_p0(it->first); -      if (!isfinite(llh)) { -        cerr << it->first << endl; -        cerr << log_p0(it->first) << endl; -        abort(); -      } -    } -    return llh + llhr; -  } - -  void Sample(unsigned int samples, bool hyp_inf, MT19937* rng) { -    cerr << "Initializing...\n"; -    z_.resize(corpus_.size()); -    int tc = 0; -    for (unsigned i = 0; i < corpus_.size(); ++i) { -      const vector<int>& line = corpus_[i]; -      const int ls = line.size(); -      const int last_pos = ls - 1; -      vector<bool>& z = z_[i]; -      z.resize(ls); -      int prev = 0; -      for (int j = 0; j < ls; ++j) { -        z[j] = rng->next() < 0.5; -        if (j == last_pos) z[j] = true;  // break phrase at the end of the sentence -        if (z[j]) { -          const vector<int> p(line.begin() + prev, line.begin() + j + 1); -          phrases_.increment(p); -          //cerr << p << ": " << p0(p) << endl; -          prev = j + 1; -          gen_.increment(false); -          ++tc; // remove -        } -      } -      ++tc; -      gen_.increment(true); // end of utterance -    } -    cerr << "TC: " << tc << endl; -    cerr << "Initial LLH: " << llh() << endl; -    cerr << "Sampling...\n"; -    cerr << gen_ << endl; -    for (unsigned s = 1; s < samples; ++s) { -      cerr << '.'; -      if (s % 10 == 0) { -        cerr << " [" << s; -        if (hyp_inf) ResampleHyperparameters(rng); -        cerr << " LLH=" << llh() << "]\n"; -        vector<int> z(z_[0].size(), 0); -        //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j]; -        //SegCorpus::Write(corpus_[0], z, d); -      } -      for (unsigned i = 0; i < corpus_.size(); ++i) { -        const vector<int>& line = corpus_[i]; -        const int ls = line.size(); -        const int last_pos = ls - 1; -        vector<bool>& z = z_[i]; -        int prev = 0; -        for (int j = 0; j < last_pos; ++j) { // don't resample last position -          int next = j+1;  while(!z[next]) { ++next; } -          const vector<int> p1p2(line.begin() + prev, line.begin() + next + 1); -          const vector<int> p1(line.begin() + prev, line.begin() + j + 1); -          const vector<int> p2(line.begin() + j + 1, line.begin() + next + 1); - -          if (z[j]) { -            phrases_.decrement(p1); -            phrases_.decrement(p2); -            gen_.decrement(false); -            gen_.decrement(false); -          } else { -            phrases_.decrement(p1p2); -            gen_.decrement(false); -          } - -          const double d1 = phrases_.prob(p1p2, p0(p1p2)) * gen_.prob(false, gen_p0_); -          double d2 = phrases_.prob(p1, p0(p1)) * gen_.prob(false, gen_p0_); -          phrases_.increment(p1); -          gen_.increment(false); -          d2 *= phrases_.prob(p2, p0(p2)) * gen_.prob(false, gen_p0_); -          phrases_.decrement(p1); -          gen_.decrement(false); -          z[j] = rng->SelectSample(d1, d2); - -          if (z[j]) { -            phrases_.increment(p1); -            phrases_.increment(p2); -            gen_.increment(false); -            gen_.increment(false); -            prev = j + 1; -          } else { -            phrases_.increment(p1p2); -            gen_.increment(false); -          } -        } -      } -    } -//    cerr << endl << endl << gen_ << endl << phrases_ << endl; -    cerr << gen_.prob(false, gen_p0_) << " " << gen_.prob(true, 1 - gen_p0_) << endl; -  } - -  void WriteCdecGrammarForCurrentSample(ostream* os) const { -    CCRP_NoTable<vector<int> >::const_iterator it = phrases_.begin(); -    for (; it != phrases_.end(); ++it) { -      (*os) << "[X] ||| " << Join(' ', it->first) << " ||| " -                          << Join('_', it->first) << " ||| C=1 P="  -                          << log(phrases_.prob(it->first, p0(it->first))) << endl; -    } -  } - -  double OOVUnigramLogProb() const { -    vector<int> x(1,99999999); -    return log(phrases_.prob(x, p0(x))); -  } - -  void ResampleHyperparameters(MT19937* rng) { -    phrases_.resample_hyperparameters(rng); -    gen_.resample_hyperparameters(rng); -    cerr << " " << phrases_.alpha(); -  } - -  CCRP_NoTable<vector<int> > phrases_; -  CCRP_NoTable<bool> gen_; -  vector<vector<bool> > z_;   // z_[i] is there a phrase boundary after the ith word -  const vector<vector<int> >& corpus_; -  const double uniform_word_; -  const double gen_p0_; -  const double p_end_; // in base length distribution, p of the end of a phrase -  const bool use_poisson_; -}; - - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  boost::shared_ptr<MT19937> prng; -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<int> > corpus; -  set<int> vocab; -  ReadCorpus(conf["input"].as<string>(), &corpus, &vocab); -  cerr << "Corpus size: " << corpus.size() << " sentences\n"; -  cerr << "Vocabulary size: " << vocab.size() << " types\n"; - -  UniphraseLM ulm(corpus, vocab, conf); -  ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); -  cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl; - -  for (unsigned i = 0; i < corpus.size(); ++i) -    WriteSeg(corpus[i], ulm.z_[i], d); - -  if (conf.count("write_cdec_grammar")) { -    string fname = conf["write_cdec_grammar"].as<string>(); -    cerr << "Writing model to " << fname << " ...\n"; -    WriteFile wf(fname); -    ulm.WriteCdecGrammarForCurrentSample(wf.stream()); -  } - -  if (conf.count("write_cdec_weights")) { -    string fname = conf["write_cdec_weights"].as<string>(); -    cerr << "Writing weights to " << fname << " .\n"; -    WriteFile wf(fname); -    ostream& os = *wf.stream(); -    os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n"; -  } - -   - -  return 0; -} - diff --git a/phrasinator/train-phrasinator.pl b/phrasinator/train-phrasinator.pl deleted file mode 100755 index c50b8e68..00000000 --- a/phrasinator/train-phrasinator.pl +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/perl -w -use strict; -my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } -use Getopt::Long; -use File::Spec qw (rel2abs); - -my $DECODER = "$script_dir/../decoder/cdec"; -my $TRAINER = "$script_dir/gibbs_train_plm_notables"; - -die "Can't find $TRAINER" unless -f $TRAINER; -die "Can't execute $TRAINER" unless -x $TRAINER; - -if (!GetOptions( -	"decoder=s" => \$DECODER, -)) { usage(); } - -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -if (scalar @ARGV != 2) { usage(); } -my $INFILE = shift @ARGV; -my $OUTDIR = shift @ARGV; -$OUTDIR = File::Spec->rel2abs($OUTDIR); -print STDERR "      Input file: $INFILE\n"; -print STDERR "Output directory: $OUTDIR\n"; -open F, "<$INFILE" or die "Failed to open $INFILE for reading: $!"; -close F; -die "Please remove existing directory $OUTDIR\n" if (-f $OUTDIR || -d $OUTDIR); - -my $CMD = "mkdir $OUTDIR"; -safesystem($CMD) or die "Failed to create directory $OUTDIR\n$!"; - -my $grammar="$OUTDIR/grammar.gz"; -my $weights="$OUTDIR/weights"; -$CMD = "$TRAINER -w $weights -g $grammar -i $INFILE"; -safesystem($CMD) or die "Failed to train model!\n"; -my $cdecini = "$OUTDIR/cdec.ini"; -open C, ">$cdecini" or die "Failed to open $cdecini for writing: $!"; - -print C <<EOINI; -quiet=true -formalism=scfg -grammar=$grammar -add_pass_through_rules=true -weights=$OUTDIR/weights -EOINI - -close C; - -print <<EOT; - -Model trained successfully.  Text can be decoded into phrasal units with -the following command: - -  $DECODER -c $OUTDIR/cdec.ini < FILE.TXT - -EOT -exit(0); - -sub usage { -  print <<EOT; -Usage: $0 [options] INPUT.TXT OUTPUT-DIRECTORY - -  Infers a phrasal segmentation model from the tokenized text in INPUT.TXT -  and writes it to OUTPUT-DIRECTORY/ so that it can be applied to other -  text or have its granularity altered. - -EOT -  exit(1); -} - -sub safesystem { -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    print STDERR "Exit code: $exitcode\n" if $exitcode; -    return ! $exitcode; -  } -} - | 
