From 34b4752a1eefc002166e95782c2c52747bb08b3a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Mar 2012 15:37:54 -0400 Subject: make c++11 compatible --- decoder/decoder.cc | 31 +++++++++++++++---------------- decoder/earley_composer.cc | 4 +--- decoder/phrasetable_fst.cc | 3 +-- 3 files changed, 17 insertions(+), 21 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 69fbaf85..d4f8f06d 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -57,7 +57,6 @@ static const double kMINUS_EPSILON = -1e-6; // don't be too strict using namespace std; using namespace std::tr1; -using boost::shared_ptr; namespace po = boost::program_options; static bool verbose_feature_functions=true; @@ -101,7 +100,7 @@ inline string str(char const* name,po::variables_map const& conf) { // print just the --long_opt names suitable for bash compgen inline void print_options(std::ostream &out,po::options_description const& opts) { - typedef std::vector< shared_ptr > Ds; + typedef std::vector< boost::shared_ptr > Ds; Ds const& ds=opts.options(); out << '"'; for (unsigned i=0;i make_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { +inline boost::shared_ptr make_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { string ff, param; SplitCommandAndParam(ffp, &ff, ¶m); cerr << pre << "feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; - shared_ptr pf = ff_registry.Create(ff, param); + boost::shared_ptr pf = ff_registry.Create(ff, param); if (!pf) exit(1); int nbyte=pf->NumBytesContext(); if (verbose_feature_functions) @@ -135,13 +134,13 @@ inline shared_ptr make_ff(string const& ffp,bool verbose_featur } #ifdef FSA_RESCORING -inline shared_ptr make_fsa_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { +inline boost::shared_ptr make_fsa_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { string ff, param; SplitCommandAndParam(ffp, &ff, ¶m); cerr << "FSA Feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; - shared_ptr pf = fsa_ff_registry.Create(ff, param); + boost::shared_ptr pf = fsa_ff_registry.Create(ff, param); if (!pf) exit(1); if (verbose_feature_functions) cerr<<"State is "<state_bytes()<<" bytes for "< make_fsa_ff(string const& ffp,bool verbose // passes are carried over into subsequent passes (where they may have different weights). 
struct RescoringPass { RescoringPass() : fid_summary(), density_prune(), beam_prune() {} - shared_ptr models; - shared_ptr inter_conf; + boost::shared_ptr models; + boost::shared_ptr inter_conf; vector ffs; - shared_ptr > weight_vector; + boost::shared_ptr > weight_vector; int fid_summary; // 0 == no summary feature double density_prune; // 0 == don't density prune double beam_prune; // 0 == don't beam prune @@ -293,15 +292,15 @@ struct DecoderImpl { po::variables_map& conf; OracleBleu oracle; string formalism; - shared_ptr translator; - shared_ptr > init_weights; // weights used with initial parse - vector > pffs; + boost::shared_ptr translator; + boost::shared_ptr > init_weights; // weights used with initial parse + vector > pffs; #ifdef FSA_RESCORING CFGOptions cfg_options; - vector > fsa_ffs; + vector > fsa_ffs; vector fsa_names; #endif - shared_ptr > rng; + boost::shared_ptr > rng; int sample_max_trans; bool aligner_mode; bool graphviz; @@ -310,7 +309,7 @@ struct DecoderImpl { bool kbest; bool unique_kbest; bool get_oracle_forest; - shared_ptr extract_file; + boost::shared_ptr extract_file; int combine_size; int sent_id; SparseVector acc_vec; // accumulate gradient @@ -622,7 +621,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } // set up weight vectors since later phases may reuse weights from earlier phases - shared_ptr > prev_weights = init_weights; + boost::shared_ptr > prev_weights = init_weights; for (int pass = 0; pass < rescoring_passes.size(); ++pass) { RescoringPass& rp = rescoring_passes[pass]; if (!rp.weight_vector) { diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index b7af801a..385baf8b 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -16,8 +16,6 @@ #include "tdict.h" #include "hg.h" -using boost::shared_ptr; -namespace po = boost::program_options; using namespace std; using namespace std::tr1; @@ -111,7 +109,7 @@ struct Edge { const Edge* const active_parent; // back pointer, NULL for PREDICT items const Edge* const passive_parent; // back pointer, NULL for SCAN and PREDICT items const TargetPhraseSet* const tps; // translations - shared_ptr > features; // features from CFG rule + boost::shared_ptr > features; // features from CFG rule bool IsPassive() const { // when a rule is completed, this value will be set diff --git a/decoder/phrasetable_fst.cc b/decoder/phrasetable_fst.cc index f421e941..b3bec86b 100644 --- a/decoder/phrasetable_fst.cc +++ b/decoder/phrasetable_fst.cc @@ -9,7 +9,6 @@ #include "filelib.h" #include "tdict.h" -using boost::shared_ptr; using namespace std; TargetPhraseSet::~TargetPhraseSet() {} @@ -46,7 +45,7 @@ class TextFSTNode : public FSTNode { void ClearPassThroughTranslations(); private: vector passthroughs; - shared_ptr data; + boost::shared_ptr data; map ptr; }; -- cgit v1.2.3 From b6eede632af4fa58a6f5325ee0d059c02a898b9f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 24 Mar 2012 23:04:46 -0400 Subject: rename aligner, add support for distinguishing translation / transliteration --- decoder/aligner.cc | 4 +- decoder/ff_wordalign.cc | 1 - mteval/aer_scorer.cc | 6 +-- utils/Makefile.am | 2 +- utils/alignment_io.cc | 97 ++++++++++++++++++++++++++++++++++++++++++++++ utils/alignment_io.h | 42 ++++++++++++++++++++ utils/alignment_pharaoh.cc | 77 ------------------------------------ utils/alignment_pharaoh.h | 14 ------- utils/atools.cc | 8 ++-- 9 files changed, 149 insertions(+), 102 deletions(-) create mode 100644 utils/alignment_io.cc create mode 100644 
utils/alignment_io.h delete mode 100644 utils/alignment_pharaoh.cc delete mode 100644 utils/alignment_pharaoh.h (limited to 'decoder') diff --git a/decoder/aligner.cc b/decoder/aligner.cc index 53e059fb..232e022a 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -11,7 +11,7 @@ #include "sentence_metadata.h" #include "inside_outside.h" #include "viterbi.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" using namespace std; @@ -300,7 +300,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, cerr << grid << endl; } (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; - AlignmentPharaoh::SerializePharaohFormat(grid, out); + AlignmentIO::SerializePharaohFormat(grid, out); } }; diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 9e7c618e..decdf9bc 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -15,7 +15,6 @@ #include "factored_lexicon_helper.h" #include "verbose.h" -#include "alignment_pharaoh.h" #include "stringlib.h" #include "sentence_metadata.h" #include "hg.h" diff --git a/mteval/aer_scorer.cc b/mteval/aer_scorer.cc index edd4390f..ae3192d4 100644 --- a/mteval/aer_scorer.cc +++ b/mteval/aer_scorer.cc @@ -5,7 +5,7 @@ #include #include "tdict.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" using namespace std; @@ -85,7 +85,7 @@ AERScorer::AERScorer(const vector >& refs, const string& src) : s cerr << "AERScorer can only take a single reference!\n"; abort(); } - ref_ = AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); + ref_ = AlignmentIO::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); } static inline bool Safe(const Array2D& a, int i, int j) { @@ -101,7 +101,7 @@ ScoreP AERScorer::ScoreCCandidate(const vector& shyp) const { ScoreP AERScorer::ScoreCandidate(const vector& shyp) const { boost::shared_ptr > hyp = - AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(shyp)); + AlignmentIO::ReadPharaohAlignmentGrid(TD::GetString(shyp)); int m = 0; int r = 0; diff --git a/utils/Makefile.am b/utils/Makefile.am index 3ea21835..2fc6ae21 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -23,7 +23,7 @@ atools_SOURCES = atools.cc noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ - alignment_pharaoh.cc \ + alignment_io.cc \ b64tools.cc \ corpus_tools.cc \ dict.cc \ diff --git a/utils/alignment_io.cc b/utils/alignment_io.cc new file mode 100644 index 00000000..1d923f7f --- /dev/null +++ b/utils/alignment_io.cc @@ -0,0 +1,97 @@ +#include "utils/alignment_io.h" + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignmentIO::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < 
al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentIO::SerializePharaohFormat(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) out << ' '; else need_space = true; + out << i << '-' << j; + } + out << endl; +} + +void AlignmentIO::SerializeTypedAlignment(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) { + const AlignmentType& aij = alignment(i,j); + if (aij != kNONE) { + if (need_space) out << ' '; else need_space = true; + if (aij == kTRANSLATION) {} + else if (aij == kTRANSLITERATION) { + out << 'T' << ':'; + } else { + cerr << "\nUnexpected alignment point type: " << static_cast(aij) << endl; + abort(); + } + out << i << '-' << j; + } + } + out << endl; +} + diff --git a/utils/alignment_io.h b/utils/alignment_io.h new file mode 100644 index 00000000..36bcecd7 --- /dev/null +++ b/utils/alignment_io.h @@ -0,0 +1,42 @@ +#ifndef _ALIGNMENT_IO_H_ +#define _ALIGNMENT_IO_H_ + +#include +#include +#include +#include "array2d.h" + +struct AlignmentIO { + enum AlignmentType { kNONE = 0, kTRANSLATION = 1, kTRANSLITERATION = 2 }; + + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); + static void SerializeTypedAlignment(const Array2D& alignment, std::ostream* out); +}; + +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + os << ' '; + for (int j=0; j - -using namespace std; - -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - size_t pos = al.rfind(" ||| "); - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - if(al[i] != '-') { - cerr << "BAD ALIGNMENT: " << al << endl; - abort(); - } - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignmentPharaoh::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < 
alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h deleted file mode 100644 index d111c8bf..00000000 --- a/utils/alignment_pharaoh.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _PHARAOH_ALIGNMENT_H_ -#define _PHARAOH_ALIGNMENT_H_ - -#include -#include -#include -#include "array2d.h" - -struct AlignmentPharaoh { - static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); -}; - -#endif diff --git a/utils/atools.cc b/utils/atools.cc index ba56dd6c..bce7822e 100644 --- a/utils/atools.cc +++ b/utils/atools.cc @@ -8,7 +8,7 @@ #include #include "filelib.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" namespace po = boost::program_options; using namespace std; @@ -348,9 +348,9 @@ int main(int argc, char **argv) { } if (line1.empty() && !*in1) break; boost::shared_ptr > out(new Array2D); - boost::shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); + boost::shared_ptr > a1 = AlignmentIO::ReadPharaohAlignmentGrid(line1); if (in2) { - boost::shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); + boost::shared_ptr > a2 = AlignmentIO::ReadPharaohAlignmentGrid(line2); cmd.Apply(*a1, *a2, out.get()); } else { Array2D dummy; @@ -358,7 +358,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignmentPharaoh::SerializePharaohFormat(*out, &cout); + AlignmentIO::SerializePharaohFormat(*out, &cout); } } if (cmd.Result() == 2) -- cgit v1.2.3 From bf4a7606151301dba49265e91c289f2caab2b7ec Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 2 Apr 2012 23:48:19 -0400 Subject: fix bug in lattices with OOVs --- decoder/grammar.cc | 24 +++++++++++--------- decoder/grammar.h | 2 -- rst_parser/Makefile.am | 16 +++++++++++++ rst_parser/arc_factored.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++ rst_parser/mst_train.cc | 11 +++++++++ rst_parser/rst.cc | 2 ++ rst_parser/rst.h | 7 ++++++ 7 files changed, 107 insertions(+), 13 deletions(-) create mode 100644 rst_parser/Makefile.am create mode 100644 rst_parser/arc_factored.h create mode 100644 rst_parser/mst_train.cc create mode 100644 rst_parser/rst.cc create mode 100644 rst_parser/rst.h (limited to 'decoder') diff --git a/decoder/grammar.cc b/decoder/grammar.cc index 9e4065a6..714390f0 100644 --- a/decoder/grammar.cc +++ b/decoder/grammar.cc @@ -3,12 +3,14 @@ #include #include #include +#include #include "rule_lexer.h" #include "filelib.h" #include "tdict.h" using namespace std; +using namespace std::tr1; const vector Grammar::NO_RULES; @@ -148,24 +150,24 @@ bool GlueGrammar::HasRuleForSpan(int i, int /* j */, int /* distance */) const { return (i == 0); } -PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level) : - has_rule_(input.size() + 1) { +PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level) { + unordered_set ss; for (int i = 0; i < input.size(); ++i) { const vector& alts = input[i]; for (int k = 0; k < alts.size(); ++k) { const int j = alts[k].dist2next + i; - has_rule_[i].insert(j); const string& src = TD::Convert(alts[k].label); - TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); - pt->a_.push_back(AlignmentPoint(0,0)); - AddRule(pt); - RefineRule(pt, ctf_level); + if 
(ss.count(alts[k].label) == 0) { + TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1")); + pt->a_.push_back(AlignmentPoint(0,0)); + AddRule(pt); + RefineRule(pt, ctf_level); + ss.insert(alts[k].label); + } } } } -bool PassThroughGrammar::HasRuleForSpan(int i, int j, int /* distance */) const { - const set& hr = has_rule_[i]; - if (i == j) { return !hr.empty(); } - return (hr.find(j) != hr.end()); +bool PassThroughGrammar::HasRuleForSpan(int, int, int distance) const { + return (distance < 2); } diff --git a/decoder/grammar.h b/decoder/grammar.h index f5d00817..e6a15a69 100644 --- a/decoder/grammar.h +++ b/decoder/grammar.h @@ -91,8 +91,6 @@ struct GlueGrammar : public TextGrammar { struct PassThroughGrammar : public TextGrammar { PassThroughGrammar(const Lattice& input, const std::string& cat, const unsigned int ctf_level=0); virtual bool HasRuleForSpan(int i, int j, int distance) const; - private: - std::vector > has_rule_; // index by [i][j] }; void RefineRule(TRulePtr pt, const unsigned int ctf_level); diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am new file mode 100644 index 00000000..fef1c1a2 --- /dev/null +++ b/rst_parser/Makefile.am @@ -0,0 +1,16 @@ +bin_PROGRAMS = \ + mst_train + +noinst_PROGRAMS = \ + rst_test + +TESTS = rst_test + +noinst_LIBRARIES = librst.a + +librst_a_SOURCES = rst.cc + +mst_train_SOURCES = mst_train.cc +mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h new file mode 100644 index 00000000..312d7d67 --- /dev/null +++ b/rst_parser/arc_factored.h @@ -0,0 +1,58 @@ +#ifndef _ARC_FACTORED_H_ +#define _ARC_FACTORED_H_ + +#include +#include +#include "array2d.h" +#include "sparse_vector.h" + +class ArcFactoredForest { + public: + explicit ArcFactoredForest(short num_words) : + num_words_(num_words), + root_edges_(num_words), + edges_(num_words, num_words) {} + + struct Edge { + Edge() : features(), edge_prob(prob_t::Zero()) {} + SparseVector features; + prob_t edge_prob; + }; + + template + void Reweight(const V& weights) { + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + if (h != m) { + Edge& e = edges_(h, m); + e.edge_prob.logeq(e.features.dot(weights)); + } + } + if (m) { + Edge& e = root_edges_[m]; + e.edge_prob.logeq(e.features.dot(weights)); + } + } + } + + const Edge& operator()(short h, short m) const { + assert(m > 0); + assert(m <= num_words_); + assert(h >= 0); + assert(h <= num_words_); + return h ? edges_(h - 1, m - 1) : root_edges[m - 1]; + } + Edge& operator()(short h, short m) { + assert(m > 0); + assert(m <= num_words_); + assert(h >= 0); + assert(h <= num_words_); + return h ? 
edges_(h - 1, m - 1) : root_edges[m - 1]; + } + private: + unsigned num_words_; + std::vector root_edges_; + Array2D edges_; +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc new file mode 100644 index 00000000..1bceaff5 --- /dev/null +++ b/rst_parser/mst_train.cc @@ -0,0 +1,11 @@ +#include "arc_factored.h" + +#include + +using namespace std; + +int main(int argc, char** argv) { + ArcFactoredForest af(5); + return 0; +} + diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc new file mode 100644 index 00000000..0ab3e296 --- /dev/null +++ b/rst_parser/rst.cc @@ -0,0 +1,2 @@ +#include "rst.h" + diff --git a/rst_parser/rst.h b/rst_parser/rst.h new file mode 100644 index 00000000..30a1f8a4 --- /dev/null +++ b/rst_parser/rst.h @@ -0,0 +1,7 @@ +#ifndef _RST_H_ +#define _RST_H_ + +struct RandomSpanningTree { +}; + +#endif -- cgit v1.2.3 From 6001b81eba37985d2e7dea6e6ebb488b787789a6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 3 Apr 2012 02:08:33 -0400 Subject: bayes lattice scoring --- decoder/hg_io.cc | 20 +++ decoder/hg_io.h | 1 + gi/pf/Makefile.am | 5 +- gi/pf/bayes_lattice_score.cc | 309 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 gi/pf/bayes_lattice_score.cc (limited to 'decoder') diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 9f0f50fa..d416dbf6 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -401,6 +401,26 @@ string HypergraphIO::AsPLF(const Hypergraph& hg, bool include_global_parentheses return os.str(); } +string HypergraphIO::AsPLF(const Lattice& lat, bool include_global_parentheses) { + static bool first = true; + if (first) { InitEscapes(); first = false; } + if (lat.empty()) return "()"; + ostringstream os; + if (include_global_parentheses) os << '('; + static const string EPS="*EPS*"; + for (int i = 0; i < lat.size(); ++i) { + const vector arcs = lat[i]; + os << '('; + for (int j = 0; j < arcs.size(); ++j) { + os << "('" << Escape(TD::Convert(arcs[j].label)) << "'," + << arcs[j].cost << ',' << arcs[j].dist2next << "),"; + } + os << "),"; + } + if (include_global_parentheses) os << ')'; + return os.str(); +} + namespace PLF { const string chars = "'\\"; diff --git a/decoder/hg_io.h b/decoder/hg_io.h index 44817157..4e502a0c 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -30,6 +30,7 @@ struct HypergraphIO { static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); // return PLF string representation (undefined behavior on non-lattices) static std::string AsPLF(const Hypergraph& hg, bool include_global_parentheses = true); + static std::string AsPLF(const Lattice& lat, bool include_global_parentheses = true); static void PLFtoLattice(const std::string& plf, Lattice* pl); static std::string Escape(const std::string& s); // PLF helper }; diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index d365016b..86f8e07b 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,9 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test bayes_lattice_score noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc hpyp_tm.cc pyp_tm.cc +bayes_lattice_score_SOURCES = bayes_lattice_score.cc +bayes_lattice_score_LDADD = 
libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + pf_test_SOURCES = pf_test.cc pf_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/bayes_lattice_score.cc b/gi/pf/bayes_lattice_score.cc new file mode 100644 index 00000000..70cb8dc2 --- /dev/null +++ b/gi/pf/bayes_lattice_score.cc @@ -0,0 +1,309 @@ +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "hg_io.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +boost::shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(Lattice()); + Lattice& le = e->back(); + LatticeTools::ConvertTextOrPLF(line, & le); + for (unsigned i = 0; i < le.size(); ++i) + for (unsigned j = 0; j < le[i].size(); ++j) + vocab_e->insert(le[i][j].label); + toks += le.size(); + } + return toks; +} + +struct BaseModel { + explicit BaseModel(unsigned tc) : + unif(1.0 / tc), p(prob_t::One()) {} + prob_t prob(const TRule& r) const { + return unif; + } + void increment(const TRule& r, MT19937* rng) { + p *= prob(r); + } + void decrement(const TRule& r, MT19937* rng) { + p /= prob(r); + } + prob_t Likelihood() const { + return p; + } + const prob_t unif; + prob_t p; +}; + +struct UnigramModel { + explicit UnigramModel(unsigned tc) : base(tc), crp(1,1,1,1), glue(1,1,1,1) {} + BaseModel base; + CCRP crp; + CCRP glue; + + prob_t Prob(const TRule& r) const { + if (r.Arity() != 0) { + return glue.prob(r, prob_t(0.5)); + } + return crp.prob(r, base.prob(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + if (r.Arity() != 0) { + glue.increment(r, 0.5, rng); + return 0; + } else { + if (crp.increment(r, base.prob(r), rng)) { + base.increment(r, rng); + return 1; + } + return 0; + } + } + + int Decrement(const TRule& r, MT19937* rng) { + if 
(r.Arity() != 0) { + glue.decrement(r, rng); + return 0; + } else { + if (crp.decrement(r, rng)) { + base.decrement(r, rng); + return -1; + } + return 0; + } + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(crp.log_crp_prob() + glue.log_crp_prob()); + p *= base.Likelihood(); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + crp.resample_hyperparameters(rng); + glue.resample_hyperparameters(rng); + cerr << " d=" << crp.discount() << ", s=" << crp.strength() << "\t STOP d=" << glue.discount() << ", s=" << glue.strength() << endl; + } +}; + +UnigramModel* plm; + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } +// for (unsigned i = 0; i < sampled_deriv->size(); ++i) { +// cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; +// } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void IncrementLatticePath(const Hypergraph& hg, const vector& d, Lattice* pl) { + Lattice& lat = *pl; + for (int i = 0; i < d.size(); ++i) { + const Hypergraph::Edge& edge = hg.edges_[d[i]]; + if (edge.rule_->Arity() != 0) continue; + WordID sym = edge.rule_->e_[0]; + vector& las = lat[edge.i_]; + int dist = edge.j_ - edge.i_; + assert(dist > 0); + for (int j = 0; j < las.size(); ++j) { + if (las[j].dist2next == dist && + las[j].label == sym) { + las[j].cost += 1; + } + } + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + vector grammars(2); + grammars[0].reset(new GlueGrammar("S","X")); + const unsigned samples = conf["samples"].as(); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr 
<< "E-corpus size: " << corpuse.size() << " lattices\t (" << vocabe.size() << " word types)\n"; + UnigramModel lm(vocabe.size()); + vector hgs(corpuse.size()); + vector > derivs(corpuse.size()); + for (int i = 0; i < corpuse.size(); ++i) { + grammars[1].reset(new PassThroughGrammar(corpuse[i], "X")); + ExhaustiveBottomUpParser parser("S", grammars); + bool res = parser.Parse(corpuse[i], &hgs[i]); // exhaustive parse + assert(res); + } + + double csamples = 0; + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + bool record_sample = (SS > (samples * 1 / 3) && (SS % 5 == 3)); + if (record_sample) csamples++; + for (int ci = 0; ci < corpuse.size(); ++ci) { + Lattice& lat = corpuse[ci]; + Hypergraph& hg = hgs[ci]; + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.Arity() != 0) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (record_sample) IncrementLatticePath(hg, derivs[ci], &lat); + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + cerr << lm.crp << endl; + cerr << lm.glue << endl; + for (int i = 0; i < corpuse.size(); ++i) { + for (int j = 0; j < corpuse[i].size(); ++j) + for (int k = 0; k < corpuse[i][j].size(); ++k) { + corpuse[i][j][k].cost /= csamples; + corpuse[i][j][k].cost += 1e-3; + corpuse[i][j][k].cost = log(corpuse[i][j][k].cost); + } + cout << HypergraphIO::AsPLF(corpuse[i]) << endl; + } + return 0; +} + -- cgit v1.2.3