From 0ab9e175f86b3fd02a4a94f350282210aba054e3 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 14:29:42 -0500 Subject: moar --- gi/pf/align-tl.cc | 15 +++++++++------ gi/pf/reachability.cc | 9 +++++---- gi/pf/reachability.h | 8 +++++--- gi/pf/transliterations.cc | 14 ++++++++++---- gi/pf/transliterations.h | 3 ++- 5 files changed, 31 insertions(+), 18 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fe8950b5..fc9b7ca5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") + ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") + ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") + ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -306,12 +310,11 @@ int main(int argc, char** argv) { letters[TD::Convert("NULL")].clear(); // TODO configure this - int max_src_chunk = 4; - int max_trg_chunk = 4; - Transliterations tl(max_src_chunk, max_trg_chunk); - - // TODO CONFIGURE THIS - int min_trans_src = 4; + const int max_src_chunk = conf["max_src_chunk"].as(); + const int max_trg_chunk = conf["max_trg_chunk"].as(); + const double filter_rat = conf["filter_ratio"].as(); + const int min_trans_src = conf["min_transliterated_src_length"].as(); + Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 59bc6ace..c10000f2 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } } a[0][0].clear(); - //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].size() == 0) { - cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); + if (a[srclen][trglen].size() < min_allowed) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 1e22c76a..03967d44 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,17 +18,19 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + // filter_ratio says if the number of outgoing edges from the first cell is less than + // src_max * trg_max * filter_rat^2 then mark as non reachable + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 61e95b82..8ea4ebd2 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -48,10 +48,11 @@ struct BackwardEstimates { }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg) : + TransliterationsImpl(int max_src, int max_trg, double fr) : kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - tot_pairs() { + kFILTER_RATIO(fr), + tot_pairs(), tot_mem() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -63,7 +64,7 @@ struct TransliterationsImpl { if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); const Reachability& r = *gs.r; // init backward estimates @@ -77,6 +78,7 @@ struct TransliterationsImpl { // TODO tot_pairs++; + tot_mem += sizeof(float) * gs.r->nodes; } void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -119,16 +121,20 @@ struct TransliterationsImpl { cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; cerr << " Unique pairs = " << tot_pairs << endl; + cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; } const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; + const double kFILTER_RATIO; unsigned tot_pairs; + size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] vector > bes; // bes[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} +Transliterations::Transliterations(int max_src, int max_trg, double fr) : + pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index e025547e..ea9f9d3f 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -8,7 +8,8 @@ struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be - explicit Transliterations(int max_src, int max_trg); + // see reachability.h for information about filter_ratio + explicit Transliterations(int max_src, int max_trg, double filter_ratio); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3