summaryrefslogtreecommitdiff
path: root/phrasinator
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit 925087356b853e2099c1b60d8b757d7aa02121a9 (patch)
tree 579925c5c9d3da51f43018a5c6d1c4dfbb72b089 /phrasinator
parent ea79e535d69f6854d01c62e3752971fb6730d8e7 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'phrasinator')
-rw-r--r-- phrasinator/Jamfile 4
-rw-r--r-- phrasinator/Makefile.am 14
-rw-r--r-- phrasinator/README 16
-rw-r--r-- phrasinator/gibbs_train_plm.cc 309
-rw-r--r-- phrasinator/gibbs_train_plm.notables.cc 335
-rwxr-xr-x phrasinator/train-phrasinator.pl 89
6 files changed, 0 insertions, 767 deletions
diff --git a/phrasinator/Jamfile b/phrasinator/Jamfile
deleted file mode 100644
index 1fc34f79..00000000
--- a/phrasinator/Jamfile
+++ /dev/null
@@ -1,4 +0,0 @@
-exe gibbs_train_plm : gibbs_train_plm.cc ..//utils ..//z ..//boost_program_options ;
-exe gibbs_train_plm_notables : gibbs_train_plm.notables.cc ..//utils ..//z ..//boost_program_options ;
-
-alias programs : gibbs_train_plm gibbs_train_plm_notables ;
diff --git a/phrasinator/Makefile.am b/phrasinator/Makefile.am
deleted file mode 100644
index 3ddd1934..00000000
--- a/phrasinator/Makefile.am
+++ /dev/null
@@ -1,14 +0,0 @@
-bin_PROGRAMS = gibbs_train_plm gibbs_train_plm_notables
-
-#head_bigram_model
-
-gibbs_train_plm_notables_SOURCES = gibbs_train_plm.notables.cc
-gibbs_train_plm_notables_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-gibbs_train_plm_SOURCES = gibbs_train_plm.cc
-gibbs_train_plm_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-#head_bigram_model_SOURCES = head_bigram_model.cc
-#head_bigram_model_LDADD = $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -funroll-loops -ffast-math -W -Wall -I$(top_srcdir)/utils
diff --git a/phrasinator/README b/phrasinator/README
deleted file mode 100644
index fb5b93ef..00000000
--- a/phrasinator/README
+++ /dev/null
@@ -1,16 +0,0 @@
-The "phrasinator" uses a simple Bayesian nonparametric model to segment
-text into chunks. The inferred model is then saved so that it can rapidly
-predict segments in new (but related) texts.
-
- Input will be a corpus of sentences, e.g.:
-
- economists have argued that real interest rates have fallen .
-
- The output will be a model that, when run with cdec, will produce
- a segmentation into phrasal units, e.g.:
-
- economists have argued that real_interest_rates have fallen .
-
-
-To train a model, run ./train-phrasinator.pl and follow instructions.
-
diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc
deleted file mode 100644
index 7847a460..00000000
--- a/phrasinator/gibbs_train_plm.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "m.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-Dict d; // global dictionary
-
-string Join(char joiner, const vector<int>& phrase) { // renders a phrase of word ids as one string, with `joiner` between consecutive words
- ostringstream os;
- for (unsigned i = 0; i < phrase.size(); ++i) {
- if (i > 0) os << joiner; // separator goes between words only, never before the first
- os << d.Convert(phrase[i]); // presumably maps the id back to its word via the global Dict d (see dict.h)
- }
- return os.str(); // an empty phrase yields an empty string
-}
-
-ostream& operator<<(ostream& os, const vector<int>& phrase) { // streams a phrase as its words separated by single spaces
- for (unsigned i = 0; i < phrase.size(); ++i)
- os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); // no leading space before the first word
- return os; // returned so calls can be chained
-}
-
-struct UnigramLM { // table of per-word values indexed by word id
- explicit UnigramLM(const string& fname) {
- ifstream in(fname.c_str());
- assert(in); // only checks that the file could be opened
- } // NOTE(review): nothing is read from `in`, so freqs_ stays empty after construction
-
- double logprob(unsigned word) const { // returns the stored value for `word`; asserts the id is in range
- assert(word < freqs_.size());
- return freqs_[word];
- }
-
- vector<double> freqs_; // returned by logprob() as-is; no visible code in this struct fills it
-};
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) { // parses command line (and optional config file) into *conf; prints usage and exits if --help is given or --input is missing
- po::options_description opts("Configuration options"); // options accepted both on the command line and in a config file
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read file from")
- ("random_seed,S",po::value<uint32_t>(), "Random seed")
- ("write_cdec_grammar,g", po::value<string>(), "Write cdec grammar to this file")
- ("write_cdec_weights,w", po::value<string>(), "Write cdec weights to this file")
- ("poisson_length,p", "Use a Poisson distribution as the length of a phrase in the base distribuion")
- ("no_hyperparameter_inference,N", "Disable hyperparameter inference");
- po::options_description clo("Command line options"); // options accepted on the command line only
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts); // config file: shared options only
- dcmdline_options.add(opts).add(clo); // command line: shared options plus --config/--help
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf); // command line is stored first
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str()); // NOTE(review): the stream is not checked for a failed open
- po::store(po::parse_config_file(config, dconfig_options), *conf); // stored second; per Boost docs store() keeps values already present, so command-line values win
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) { // --input is mandatory
- cerr << dcmdline_options << endl;
- exit(1); // exit status is 1 for --help as well as for a missing --input
- }
-}
-
-void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab) { // reads one sentence per line into *c as word ids and adds every id seen to *vocab
- c->clear(); // *c is reset; *vocab is only added to
- istream* in;
- if (filename == "-") // "-" selects standard input
- in = &cin;
- else
- in = new ifstream(filename.c_str()); // heap-allocated; released at the end of the function
- assert(*in); // the only open-failure check, so it vanishes when NDEBUG is defined
- string line;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break; // stop at end of input without adding a spurious final sentence
- c->push_back(vector<int>()); // blank lines inside the input still become empty sentences
- vector<int>& v = c->back();
- d.ConvertWhitespaceDelimitedLine(line, &v); // presumably tokenizes on whitespace and maps words to ids in the global Dict d (see dict.h)
- for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]);
- }
- if (in != &cin) delete in; // never delete the cin alias
-}
-
-struct UniphraseLM { // samples phrase boundaries for a corpus; phrase counts live in phrases_, boundary/end-of-utterance events in gen_
- UniphraseLM(const vector<vector<int> >& corpus,
- const set<int>& vocab,
- const po::variables_map& conf) :
- phrases_(1,1,1,1), // NOTE(review): meaning of the four CCRP constructor arguments is defined in ccrp.h, not visible here
- gen_(1,1,1,1),
- corpus_(corpus), // stored by reference: the corpus must outlive this object
- uniform_word_(1.0 / vocab.size()), // uniform per-word base probability; NOTE(review): becomes infinite if vocab is empty
- gen_p0_(0.5),
- p_end_(0.5),
- use_poisson_(conf.count("poisson_length") > 0) {} // true when --poisson_length was given
-
- double p0(const vector<int>& phrase) const { // base probability of a phrase, memoized by phrase length
- static vector<double> p0s(10000, 0.0); // cache indexed by length, 0.0 meaning "not computed yet"; being function-static it is shared by all instances
- assert(phrase.size() < 10000);
- double& p = p0s[phrase.size()];
- if (p) return p; // cache hit
- p = exp(log_p0(phrase)); // caching by length is valid because log_p0 reads only phrase.size()
- if (!p) { // exp() underflowed to zero
- cerr << "0 prob phrase: " << phrase << "\nAssigning std::numeric_limits<double>::min()\n";
- p = std::numeric_limits<double>::min(); // keep the probability strictly positive
- }
- return p;
- }
-
- double log_p0(const vector<int>& phrase) const { // log base probability: length term plus a uniform choice for every word
- double len_logprob;
- if (use_poisson_)
- len_logprob = Md::log_poisson(phrase.size(), 1.0); // second argument 1.0 is presumably the Poisson rate (see m.h)
- else
- len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); // geometric: size-1 continuations followed by one stop
- return log(uniform_word_) * phrase.size() + len_logprob;
- }
-
- double llh() const { // log-likelihood of the current state: CRP terms of gen_ and phrases_ plus their base-distribution terms
- double llh = gen_.log_crp_prob();
- llh += gen_.num_tables(false) * log(gen_p0_) + // base term weighted by the table count of each gen_ outcome
- gen_.num_tables(true) * log(1 - gen_p0_);
- double llhr = phrases_.log_crp_prob();
- for (CCRP<vector<int> >::const_iterator it = phrases_.begin(); it != phrases_.end(); ++it) {
- llhr += phrases_.num_tables(it->first) * log_p0(it->first); // base probability counted once per table of this phrase
- //llhr += log_p0(it->first);
- if (!isfinite(llh)) { // NOTE(review): tests llh, which does not change inside this loop; llhr may have been intended
- cerr << it->first << endl;
- cerr << log_p0(it->first) << endl;
- abort();
- }
- }
- return llh + llhr;
- }
-
- void Sample(unsigned int samples, bool hyp_inf, MT19937* rng) { // random initial segmentation, then samples-1 sweeps resampling every non-final boundary
- cerr << "Initializing...\n";
- z_.resize(corpus_.size());
- int tc = 0; // number of gen_ events added during initialisation; only printed below
- for (unsigned i = 0; i < corpus_.size(); ++i) {
- const vector<int>& line = corpus_[i];
- const int ls = line.size();
- const int last_pos = ls - 1; // -1 for an empty sentence, so the loop below does not run
- vector<bool>& z = z_[i];
- z.resize(ls);
- int prev = 0; // start index of the phrase currently being built
- for (int j = 0; j < ls; ++j) {
- z[j] = rng->next() < 0.5; // random initial boundary; presumably rng->next() is uniform on [0,1) (see sampler.h)
- if (j == last_pos) z[j] = true; // break phrase at the end of the sentence
- if (z[j]) {
- const vector<int> p(line.begin() + prev, line.begin() + j + 1); // words prev..j inclusive
- phrases_.increment(p, p0(p), rng);
- //cerr << p << ": " << p0(p) << endl;
- prev = j + 1;
- gen_.increment(false, gen_p0_, rng); // one `false` event per phrase
- ++tc; // remove
- }
- }
- ++tc;
- gen_.increment(true, 1.0 - gen_p0_, rng); // end of utterance
- }
- cerr << "TC: " << tc << endl;
- cerr << "Initial LLH: " << llh() << endl;
- cerr << "Sampling...\n";
- cerr << gen_ << endl;
- for (unsigned s = 1; s < samples; ++s) { // s starts at 1, so samples-1 sweeps are performed
- cerr << '.';
- if (s % 10 == 0) { // every 10th sweep: optionally resample hyperparameters, then log the likelihood
- cerr << " [" << s;
- if (hyp_inf) ResampleHyperparameters(rng);
- cerr << " LLH=" << llh() << "]\n";
- vector<int> z(z_[0].size(), 0); // NOTE(review): used only by the commented-out lines below; reads z_[0], so assumes a non-empty corpus
- //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j];
- //SegCorpus::Write(corpus_[0], z, d);
- }
- for (unsigned i = 0; i < corpus_.size(); ++i) {
- const vector<int>& line = corpus_[i];
- const int ls = line.size();
- const int last_pos = ls - 1;
- vector<bool>& z = z_[i];
- int prev = 0; // start of the phrase that contains position j
- for (int j = 0; j < last_pos; ++j) { // don't resample last position
- int next = j+1; while(!z[next]) { ++next; } // next boundary after j; terminates because z[last_pos] is set true at init and never resampled
- const vector<int> p1p2(line.begin() + prev, line.begin() + next + 1); // merged phrase prev..next
- const vector<int> p1(line.begin() + prev, line.begin() + j + 1); // left part prev..j
- const vector<int> p2(line.begin() + j + 1, line.begin() + next + 1); // right part j+1..next
-
- if (z[j]) { // remove the current analysis at j from the counts before scoring both alternatives
- phrases_.decrement(p1, rng);
- phrases_.decrement(p2, rng);
- gen_.decrement(false, rng);
- gen_.decrement(false, rng);
- } else {
- phrases_.decrement(p1p2, rng);
- gen_.decrement(false, rng);
- }
-
- const double d1 = phrases_.prob(p1p2, p0(p1p2)) * gen_.prob(false, gen_p0_); // score of "no boundary at j": one merged phrase
- double d2 = phrases_.prob(p1, p0(p1)) * gen_.prob(false, gen_p0_); // score of "boundary at j": first factor, for p1
- phrases_.increment(p1, p0(p1), rng); // temporarily add p1 so that p2 is scored given p1
- gen_.increment(false, gen_p0_, rng);
- d2 *= phrases_.prob(p2, p0(p2)) * gen_.prob(false, gen_p0_); // second factor, for p2
- phrases_.decrement(p1, rng); // undo the temporary addition
- gen_.decrement(false, rng);
- z[j] = rng->SelectSample(d1, d2); // presumably returns 0 or 1 in proportion to d1:d2 (see sampler.h); non-zero means split
-
- if (z[j]) { // add back the counts for whichever analysis was sampled
- phrases_.increment(p1, p0(p1), rng);
- phrases_.increment(p2, p0(p2), rng);
- gen_.increment(false, gen_p0_, rng);
- gen_.increment(false, gen_p0_, rng);
- prev = j + 1; // the next phrase starts right after the new boundary
- } else {
- phrases_.increment(p1p2, p0(p1p2), rng);
- gen_.increment(false, gen_p0_, rng);
- }
- }
- }
- }
-// cerr << endl << endl << gen_ << endl << phrases_ << endl;
- cerr << gen_.prob(false, gen_p0_) << " " << gen_.prob(true, 1 - gen_p0_) << endl; // final probabilities of the two gen_ outcomes
- }
-
- void WriteCdecGrammarForCurrentSample(ostream* os) const { // writes one "[X] ||| ... ||| ..." rule line per entry in phrases_
- CCRP<vector<int> >::const_iterator it = phrases_.begin();
- for (; it != phrases_.end(); ++it) {
- (*os) << "[X] ||| " << Join(' ', it->first) << " ||| " // first field: words separated by spaces
- << Join('_', it->first) << " ||| C=1 P=" // second field: the same words joined with '_'
- << log(phrases_.prob(it->first, p0(it->first))) << endl; // P feature: log probability of the phrase under phrases_
- }
- }
-
- double OOVUnigramLogProb() const { // log probability of a one-word phrase made of the id 99999999
- vector<int> x(1,99999999); // presumably an id that never occurs in the corpus, standing in for an unknown word
- return log(phrases_.prob(x, p0(x)));
- }
-
- void ResampleHyperparameters(MT19937* rng) { // resamples the hyperparameters of both CRPs and logs those of phrases_
- phrases_.resample_hyperparameters(rng);
- gen_.resample_hyperparameters(rng);
- cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength();
- }
-
- CCRP<vector<int> > phrases_; // counts over phrases
- CCRP<bool> gen_; // false = one event per phrase, true = end of utterance
- vector<vector<bool> > z_; // z_[i] is there a phrase boundary after the ith word
- const vector<vector<int> >& corpus_; // not owned
- const double uniform_word_; // 1 / vocabulary size
- const double gen_p0_; // base probability passed with the `false` outcome of gen_
- const double p_end_; // in base length distribution, p of the end of a phrase
- const bool use_poisson_; // selects the Poisson length term in log_p0 instead of the geometric one
-};
-
-
-int main(int argc, char** argv) { // parse options, read the corpus, run the sampler, then optionally write a cdec grammar and weights file
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf); // exits the process on --help or missing --input
- boost::shared_ptr<MT19937> prng;
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); // explicit seed from --random_seed
- else
- prng.reset(new MT19937); // default-constructed; how it is seeded is defined in sampler.h
- MT19937& rng = *prng;
-
- vector<vector<int> > corpus;
- set<int> vocab;
- ReadCorpus(conf["input"].as<string>(), &corpus, &vocab);
- cerr << "Corpus size: " << corpus.size() << " sentences\n";
- cerr << "Vocabulary size: " << vocab.size() << " types\n";
-
- UniphraseLM ulm(corpus, vocab, conf); // keeps a reference to corpus, which outlives it in this scope
- ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); // hyperparameter inference is on unless -N was given
- cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl;
-
- for (unsigned i = 0; i < corpus.size(); ++i)
-// SegCorpus::Write(corpus[i], shmmlm.z_[i], d);
- ; // the loop body is this empty statement because the write call above is commented out
- if (conf.count("write_cdec_grammar")) {
- string fname = conf["write_cdec_grammar"].as<string>();
- cerr << "Writing model to " << fname << " ...\n";
- WriteFile wf(fname);
- ulm.WriteCdecGrammarForCurrentSample(wf.stream());
- }
-
- if (conf.count("write_cdec_weights")) {
- string fname = conf["write_cdec_weights"].as<string>();
- cerr << "Writing weights to " << fname << " .\n";
- WriteFile wf(fname);
- ostream& os = *wf.stream();
- os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n"; // fixed weights for P and C; PassThrough is set to the OOV log probability
- }
-
- return 0;
-}
-
diff --git a/phrasinator/gibbs_train_plm.notables.cc b/phrasinator/gibbs_train_plm.notables.cc
deleted file mode 100644
index 4526eaa6..00000000
--- a/phrasinator/gibbs_train_plm.notables.cc
+++ /dev/null
@@ -1,335 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp.h"
-#include "ccrp_nt.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-Dict d; // global dictionary
-
-string Join(char joiner, const vector<int>& phrase) { // renders a phrase of word ids as one string, with `joiner` between consecutive words
- ostringstream os;
- for (unsigned i = 0; i < phrase.size(); ++i) {
- if (i > 0) os << joiner; // separator goes between words only, never before the first
- os << d.Convert(phrase[i]); // presumably maps the id back to its word via the global Dict d (see dict.h)
- }
- return os.str(); // an empty phrase yields an empty string
-}
-
-template <typename BType> // BType: element type of the boundary labels; tested for truth and streamed
-void WriteSeg(const vector<int>& line, const vector<BType>& label, const Dict& d) { // prints one sentence to cout as "{{w1 w2}}:label" segments followed by a newline
- assert(line.size() == label.size());
- assert(label.back()); // the last position must be a boundary; NOTE(review): back() on an empty label vector is undefined, so empty sentences are not handled
- unsigned prev = 0; // index of the first word of the segment being collected
- unsigned cur = 0;
- while (cur < line.size()) {
- if (label[cur]) { // a boundary after word `cur` closes the segment prev..cur
- if (prev) cout << ' '; // space before every segment except the first
- cout << "{{";
- for (unsigned i = prev; i <= cur; ++i)
- cout << (i == prev ? "" : " ") << d.Convert(line[i]); // uses the Dict parameter, which shadows the global d
- cout << "}}:" << label[cur]; // the label value is printed after the segment
- prev = cur + 1;
- }
- ++cur;
- }
- cout << endl;
-}
-
-ostream& operator<<(ostream& os, const vector<int>& phrase) { // streams a phrase as its words separated by single spaces
- for (unsigned i = 0; i < phrase.size(); ++i)
- os << (i == 0 ? "" : " ") << d.Convert(phrase[i]); // no leading space before the first word
- return os; // returned so calls can be chained
-}
-
-struct UnigramLM { // table of per-word values indexed by word id
- explicit UnigramLM(const string& fname) {
- ifstream in(fname.c_str());
- assert(in); // only checks that the file could be opened
- } // NOTE(review): nothing is read from `in`, so freqs_ stays empty after construction
-
- double logprob(unsigned word) const { // returns the stored value for `word`; asserts the id is in range
- assert(word < freqs_.size());
- return freqs_[word];
- }
-
- vector<double> freqs_; // returned by logprob() as-is; no visible code in this struct fills it
-};
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) { // parses command line (and optional config file) into *conf; prints usage and exits if --help is given or --input is missing
- po::options_description opts("Configuration options"); // options accepted both on the command line and in a config file
- opts.add_options()
- ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
- ("input,i",po::value<string>(),"Read file from")
- ("random_seed,S",po::value<uint32_t>(), "Random seed")
- ("write_cdec_grammar,g", po::value<string>(), "Write cdec grammar to this file")
- ("write_cdec_weights,w", po::value<string>(), "Write cdec weights to this file")
- ("poisson_length,p", "Use a Poisson distribution as the length of a phrase in the base distribuion")
- ("no_hyperparameter_inference,N", "Disable hyperparameter inference");
- po::options_description clo("Command line options"); // options accepted on the command line only
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts); // config file: shared options only
- dcmdline_options.add(opts).add(clo); // command line: shared options plus --config/--help
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf); // command line is stored first
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str()); // NOTE(review): the stream is not checked for a failed open
- po::store(po::parse_config_file(config, dconfig_options), *conf); // stored second; per Boost docs store() keeps values already present, so command-line values win
- }
- po::notify(*conf);
-
- if (conf->count("help") || (conf->count("input") == 0)) { // --input is mandatory
- cerr << dcmdline_options << endl;
- exit(1); // exit status is 1 for --help as well as for a missing --input
- }
-}
-
-void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab) { // reads one sentence per line into *c as word ids and adds every id seen to *vocab
- c->clear(); // *c is reset; *vocab is only added to
- istream* in;
- if (filename == "-") // "-" selects standard input
- in = &cin;
- else
- in = new ifstream(filename.c_str()); // heap-allocated; released at the end of the function
- assert(*in); // the only open-failure check, so it vanishes when NDEBUG is defined
- string line;
- while(*in) {
- getline(*in, line);
- if (line.empty() && !*in) break; // stop at end of input without adding a spurious final sentence
- c->push_back(vector<int>()); // blank lines inside the input still become empty sentences
- vector<int>& v = c->back();
- d.ConvertWhitespaceDelimitedLine(line, &v); // presumably tokenizes on whitespace and maps words to ids in the global Dict d (see dict.h)
- for (unsigned i = 0; i < v.size(); ++i) vocab->insert(v[i]);
- }
- if (in != &cin) delete in; // never delete the cin alias
-}
-
-double log_poisson(unsigned x, const double& lambda) { // log of the Poisson probability mass at x for rate lambda
- assert(lambda > 0.0); // log(lambda) requires a strictly positive rate
- return log(lambda) * x - lgamma(x + 1) - lambda; // log(lambda^x * e^-lambda / x!), with lgamma(x + 1) = log(x!)
-}
-
-struct UniphraseLM { // samples phrase boundaries for a corpus; phrase counts live in phrases_, boundary/end-of-utterance events in gen_
- UniphraseLM(const vector<vector<int> >& corpus,
- const set<int>& vocab,
- const po::variables_map& conf) :
- phrases_(1,1), // NOTE(review): meaning of the two CCRP_NoTable constructor arguments is defined in ccrp_nt.h, not visible here
- gen_(1,1),
- corpus_(corpus), // stored by reference: the corpus must outlive this object
- uniform_word_(1.0 / vocab.size()), // uniform per-word base probability; NOTE(review): becomes infinite if vocab is empty
- gen_p0_(0.5),
- p_end_(0.5),
- use_poisson_(conf.count("poisson_length") > 0) {} // true when --poisson_length was given
-
- double p0(const vector<int>& phrase) const { // base probability of a phrase, memoized by phrase length
- static vector<double> p0s(10000, 0.0); // cache indexed by length, 0.0 meaning "not computed yet"; being function-static it is shared by all instances
- assert(phrase.size() < 10000);
- double& p = p0s[phrase.size()];
- if (p) return p; // cache hit
- p = exp(log_p0(phrase)); // caching by length is valid because log_p0 reads only phrase.size()
- if (!p) { // exp() underflowed to zero
- cerr << "0 prob phrase: " << phrase << "\nAssigning std::numeric_limits<double>::min()\n";
- p = std::numeric_limits<double>::min(); // keep the probability strictly positive
- }
- return p;
- }
-
- double log_p0(const vector<int>& phrase) const { // log base probability: length term plus a uniform choice for every word
- double len_logprob;
- if (use_poisson_)
- len_logprob = log_poisson(phrase.size(), 1.0); // Poisson length term with rate 1.0
- else
- len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); // geometric: size-1 continuations followed by one stop
- return log(uniform_word_) * phrase.size() + len_logprob;
- }
-
- double llh() const { // log-likelihood of the current state: CRP terms of gen_ and phrases_ plus base-distribution terms
- double llh = gen_.log_crp_prob();
- llh += log(gen_p0_) + log(1 - gen_p0_); // base term added once for each of the two gen_ outcomes
- double llhr = phrases_.log_crp_prob();
- for (CCRP_NoTable<vector<int> >::const_iterator it = phrases_.begin(); it != phrases_.end(); ++it) {
- llhr += log_p0(it->first); // base probability added once per entry of phrases_
- //llhr += log_p0(it->first);
- if (!isfinite(llh)) { // NOTE(review): tests llh, which does not change inside this loop; llhr may have been intended
- cerr << it->first << endl;
- cerr << log_p0(it->first) << endl;
- abort();
- }
- }
- return llh + llhr;
- }
-
- void Sample(unsigned int samples, bool hyp_inf, MT19937* rng) { // random initial segmentation, then samples-1 sweeps resampling every non-final boundary
- cerr << "Initializing...\n";
- z_.resize(corpus_.size());
- int tc = 0; // number of gen_ events added during initialisation; only printed below
- for (unsigned i = 0; i < corpus_.size(); ++i) {
- const vector<int>& line = corpus_[i];
- const int ls = line.size();
- const int last_pos = ls - 1; // -1 for an empty sentence, so the loop below does not run
- vector<bool>& z = z_[i];
- z.resize(ls);
- int prev = 0; // start index of the phrase currently being built
- for (int j = 0; j < ls; ++j) {
- z[j] = rng->next() < 0.5; // random initial boundary; presumably rng->next() is uniform on [0,1) (see sampler.h)
- if (j == last_pos) z[j] = true; // break phrase at the end of the sentence
- if (z[j]) {
- const vector<int> p(line.begin() + prev, line.begin() + j + 1); // words prev..j inclusive
- phrases_.increment(p);
- //cerr << p << ": " << p0(p) << endl;
- prev = j + 1;
- gen_.increment(false); // one `false` event per phrase
- ++tc; // remove
- }
- }
- ++tc;
- gen_.increment(true); // end of utterance
- }
- cerr << "TC: " << tc << endl;
- cerr << "Initial LLH: " << llh() << endl;
- cerr << "Sampling...\n";
- cerr << gen_ << endl;
- for (unsigned s = 1; s < samples; ++s) { // s starts at 1, so samples-1 sweeps are performed
- cerr << '.';
- if (s % 10 == 0) { // every 10th sweep: optionally resample hyperparameters, then log the likelihood
- cerr << " [" << s;
- if (hyp_inf) ResampleHyperparameters(rng);
- cerr << " LLH=" << llh() << "]\n";
- vector<int> z(z_[0].size(), 0); // NOTE(review): used only by the commented-out lines below; reads z_[0], so assumes a non-empty corpus
- //for (int j = 0; j < z.size(); ++j) z[j] = z_[0][j];
- //SegCorpus::Write(corpus_[0], z, d);
- }
- for (unsigned i = 0; i < corpus_.size(); ++i) {
- const vector<int>& line = corpus_[i];
- const int ls = line.size();
- const int last_pos = ls - 1;
- vector<bool>& z = z_[i];
- int prev = 0; // start of the phrase that contains position j
- for (int j = 0; j < last_pos; ++j) { // don't resample last position
- int next = j+1; while(!z[next]) { ++next; } // next boundary after j; terminates because z[last_pos] is set true at init and never resampled
- const vector<int> p1p2(line.begin() + prev, line.begin() + next + 1); // merged phrase prev..next
- const vector<int> p1(line.begin() + prev, line.begin() + j + 1); // left part prev..j
- const vector<int> p2(line.begin() + j + 1, line.begin() + next + 1); // right part j+1..next
-
- if (z[j]) { // remove the current analysis at j from the counts before scoring both alternatives
- phrases_.decrement(p1);
- phrases_.decrement(p2);
- gen_.decrement(false);
- gen_.decrement(false);
- } else {
- phrases_.decrement(p1p2);
- gen_.decrement(false);
- }
-
- const double d1 = phrases_.prob(p1p2, p0(p1p2)) * gen_.prob(false, gen_p0_); // score of "no boundary at j": one merged phrase
- double d2 = phrases_.prob(p1, p0(p1)) * gen_.prob(false, gen_p0_); // score of "boundary at j": first factor, for p1
- phrases_.increment(p1); // temporarily add p1 so that p2 is scored given p1
- gen_.increment(false);
- d2 *= phrases_.prob(p2, p0(p2)) * gen_.prob(false, gen_p0_); // second factor, for p2
- phrases_.decrement(p1); // undo the temporary addition
- gen_.decrement(false);
- z[j] = rng->SelectSample(d1, d2); // presumably returns 0 or 1 in proportion to d1:d2 (see sampler.h); non-zero means split
-
- if (z[j]) { // add back the counts for whichever analysis was sampled
- phrases_.increment(p1);
- phrases_.increment(p2);
- gen_.increment(false);
- gen_.increment(false);
- prev = j + 1; // the next phrase starts right after the new boundary
- } else {
- phrases_.increment(p1p2);
- gen_.increment(false);
- }
- }
- }
- }
-// cerr << endl << endl << gen_ << endl << phrases_ << endl;
- cerr << gen_.prob(false, gen_p0_) << " " << gen_.prob(true, 1 - gen_p0_) << endl; // final probabilities of the two gen_ outcomes
- }
-
- void WriteCdecGrammarForCurrentSample(ostream* os) const { // writes one "[X] ||| ... ||| ..." rule line per entry in phrases_
- CCRP_NoTable<vector<int> >::const_iterator it = phrases_.begin();
- for (; it != phrases_.end(); ++it) {
- (*os) << "[X] ||| " << Join(' ', it->first) << " ||| " // first field: words separated by spaces
- << Join('_', it->first) << " ||| C=1 P=" // second field: the same words joined with '_'
- << log(phrases_.prob(it->first, p0(it->first))) << endl; // P feature: log probability of the phrase under phrases_
- }
- }
-
- double OOVUnigramLogProb() const { // log probability of a one-word phrase made of the id 99999999
- vector<int> x(1,99999999); // presumably an id that never occurs in the corpus, standing in for an unknown word
- return log(phrases_.prob(x, p0(x)));
- }
-
- void ResampleHyperparameters(MT19937* rng) { // resamples the hyperparameters of both CRPs and logs alpha of phrases_
- phrases_.resample_hyperparameters(rng);
- gen_.resample_hyperparameters(rng);
- cerr << " " << phrases_.alpha();
- }
-
- CCRP_NoTable<vector<int> > phrases_; // counts over phrases
- CCRP_NoTable<bool> gen_; // false = one event per phrase, true = end of utterance
- vector<vector<bool> > z_; // z_[i] is there a phrase boundary after the ith word
- const vector<vector<int> >& corpus_; // not owned
- const double uniform_word_; // 1 / vocabulary size
- const double gen_p0_; // base probability passed with the `false` outcome of gen_
- const double p_end_; // in base length distribution, p of the end of a phrase
- const bool use_poisson_; // selects the Poisson length term in log_p0 instead of the geometric one
-};
-
-
-int main(int argc, char** argv) { // parse options, read the corpus, run the sampler, print the segmentation, then optionally write a cdec grammar and weights file
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf); // exits the process on --help or missing --input
- boost::shared_ptr<MT19937> prng;
- if (conf.count("random_seed"))
- prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); // explicit seed from --random_seed
- else
- prng.reset(new MT19937); // default-constructed; how it is seeded is defined in sampler.h
- MT19937& rng = *prng;
-
- vector<vector<int> > corpus;
- set<int> vocab;
- ReadCorpus(conf["input"].as<string>(), &corpus, &vocab);
- cerr << "Corpus size: " << corpus.size() << " sentences\n";
- cerr << "Vocabulary size: " << vocab.size() << " types\n";
-
- UniphraseLM ulm(corpus, vocab, conf); // keeps a reference to corpus, which outlives it in this scope
- ulm.Sample(conf["samples"].as<unsigned>(), conf.count("no_hyperparameter_inference") == 0, &rng); // hyperparameter inference is on unless -N was given
- cerr << "OOV unigram prob: " << ulm.OOVUnigramLogProb() << endl;
-
- for (unsigned i = 0; i < corpus.size(); ++i)
- WriteSeg(corpus[i], ulm.z_[i], d); // final segmentation of every sentence goes to stdout; NOTE(review): WriteSeg calls label.back(), so a blank input line would be a problem
-
- if (conf.count("write_cdec_grammar")) {
- string fname = conf["write_cdec_grammar"].as<string>();
- cerr << "Writing model to " << fname << " ...\n";
- WriteFile wf(fname);
- ulm.WriteCdecGrammarForCurrentSample(wf.stream());
- }
-
- if (conf.count("write_cdec_weights")) {
- string fname = conf["write_cdec_weights"].as<string>();
- cerr << "Writing weights to " << fname << " .\n";
- WriteFile wf(fname);
- ostream& os = *wf.stream();
- os << "# make C smaller to use more phrases\nP 1\nPassThrough " << ulm.OOVUnigramLogProb() << "\nC -3\n"; // fixed weights for P and C; PassThrough is set to the OOV log probability
- }
-
-
-
- return 0;
-}
-
diff --git a/phrasinator/train-phrasinator.pl b/phrasinator/train-phrasinator.pl
deleted file mode 100755
index c50b8e68..00000000
--- a/phrasinator/train-phrasinator.pl
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
-use Getopt::Long;
-use File::Spec qw (rel2abs);
-
-my $DECODER = "$script_dir/../decoder/cdec"; # default decoder path, relative to this script; overridable with --decoder
-my $TRAINER = "$script_dir/gibbs_train_plm_notables"; # trainer binary expected next to this script
-
-die "Can't find $TRAINER" unless -f $TRAINER;
-die "Can't execute $TRAINER" unless -x $TRAINER;
-
-if (!GetOptions(
- "decoder=s" => \$DECODER,
-)) { usage(); } # unknown options print usage and exit
-
-die "Can't find $DECODER" unless -f $DECODER; # checked after option parsing so that --decoder is honoured
-die "Can't execute $DECODER" unless -x $DECODER;
-if (scalar @ARGV != 2) { usage(); } # exactly two positional arguments: input file and output directory
-my $INFILE = shift @ARGV;
-my $OUTDIR = shift @ARGV;
-$OUTDIR = File::Spec->rel2abs($OUTDIR); # absolute path, since it is written into cdec.ini below
-print STDERR " Input file: $INFILE\n";
-print STDERR "Output directory: $OUTDIR\n";
-open F, "<$INFILE" or die "Failed to open $INFILE for reading: $!"; # opened and closed at once: only a readability check
-close F;
-die "Please remove existing directory $OUTDIR\n" if (-f $OUTDIR || -d $OUTDIR); # refuses to reuse an existing file or directory
-
-my $CMD = "mkdir $OUTDIR"; # NOTE(review): paths are interpolated unquoted into shell command strings, so spaces or shell metacharacters in them would break these commands
-safesystem($CMD) or die "Failed to create directory $OUTDIR\n$!";
-
-my $grammar="$OUTDIR/grammar.gz";
-my $weights="$OUTDIR/weights";
-$CMD = "$TRAINER -w $weights -g $grammar -i $INFILE"; # the trainer writes both the weights and the grammar
-safesystem($CMD) or die "Failed to train model!\n";
-my $cdecini = "$OUTDIR/cdec.ini"; # decoder configuration generated from the heredoc below
-open C, ">$cdecini" or die "Failed to open $cdecini for writing: $!";
-
-print C <<EOINI;
-quiet=true
-formalism=scfg
-grammar=$grammar
-add_pass_through_rules=true
-weights=$OUTDIR/weights
-EOINI
-
-close C;
-
-print <<EOT;
-
-Model trained successfully. Text can be decoded into phrasal units with
-the following command:
-
- $DECODER -c $OUTDIR/cdec.ini < FILE.TXT
-
-EOT
-exit(0); # success; the subs below are only reached through calls
-
-sub usage { # prints the usage text to STDOUT and exits with status 1
- print <<EOT;
-Usage: $0 [options] INPUT.TXT OUTPUT-DIRECTORY
-
- Infers a phrasal segmentation model from the tokenized text in INPUT.TXT
- and writes it to OUTPUT-DIRECTORY/ so that it can be applied to other
- text or have its granularity altered.
-
-EOT
- exit(1); # never returns to the caller
-}
-
-sub safesystem { # runs a command via system(); returns true on exit status 0, false on non-zero, and exits the script if it could not run or died from a signal
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) { # the command could not be started
- print STDERR "ERROR: Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) { # low 7 bits non-zero: the child was killed by a signal
- printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without'; # bit 128 reports whether a core dump was produced
- exit(1);
- }
- else {
- my $exitcode = $? >> 8; # the high byte holds the child's exit status
- print STDERR "Exit code: $exitcode\n" if $exitcode; # logged only when non-zero
- return ! $exitcode; # true exactly when the command exited with 0
- }
-}
-