diff options
Diffstat (limited to 'gi')
| -rw-r--r-- | gi/pf/align-tl.cc | 15 |
| -rw-r--r-- | gi/pf/reachability.cc | 9 |
| -rw-r--r-- | gi/pf/reachability.h | 8 |
| -rw-r--r-- | gi/pf/transliterations.cc | 14 |
| -rw-r--r-- | gi/pf/transliterations.h | 3 |
5 files changed, 31 insertions, 18 deletions
| diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fe8950b5..fc9b7ca5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    opts.add_options()          ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")          ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source") +        ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target") +        ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration") +        ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")          ("random_seed,S",po::value<uint32_t>(), "Random seed");    po::options_description clo("Command line options");    clo.add_options() @@ -306,12 +310,11 @@ int main(int argc, char** argv) {    letters[TD::Convert("NULL")].clear();    // TODO configure this -  int max_src_chunk = 4; -  int max_trg_chunk = 4; -  Transliterations tl(max_src_chunk, max_trg_chunk); - -  // TODO CONFIGURE THIS -  int min_trans_src = 4; +  const int max_src_chunk = conf["max_src_chunk"].as<unsigned>(); +  const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>(); +  const double filter_rat = conf["filter_ratio"].as<double>(); +  const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>(); +  Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);    cerr << "Initializing transliteration graph structures ...\n";    for (int i = 0; i < corpus.size(); ++i) { diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 59bc6ace..c10000f2 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 
+12,7 @@ struct SState {    int prev_trg_covered;  }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {      typedef boost::multi_array<vector<SState>, 2> array_type;      array_type a(boost::extents[srclen + 1][trglen + 1]);      a[0][0].push_back(SState()); @@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras        }      }      a[0][0].clear(); -    //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; -    if (a[srclen][trglen].size() == 0) { -      cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; +    //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; +    size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); +    if (a[srclen][trglen].size() < min_allowed) { +      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";        return;      } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 1e22c76a..03967d44 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,17 +18,19 @@ struct Reachability {    boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")    boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node -  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : +  // filter_ratio says if the number of outgoing edges 
from the first cell is less than +  //    src_max * trg_max * filter_rat^2 then mark as non reachable +  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :        nodes(),        edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),        max_src_delta(boost::extents[srclen][trglen]),        node_addresses(boost::extents[srclen][trglen]),        valid_deltas(boost::extents[srclen][trglen]) { -    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); +    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);    }   private: -  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); +  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);  };  #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 61e95b82..8ea4ebd2 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -48,10 +48,11 @@ struct BackwardEstimates {  };  struct TransliterationsImpl { -  TransliterationsImpl(int max_src, int max_trg) : +  TransliterationsImpl(int max_src, int max_trg, double fr) :        kMAX_SRC_CHUNK(max_src),        kMAX_TRG_CHUNK(max_trg), -      tot_pairs() { +      kFILTER_RATIO(fr), +      tot_pairs(), tot_mem() {    }    void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { @@ -63,7 +64,7 @@ struct TransliterationsImpl {      if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);      GraphStructure& gs = graphs[src_len][trg_len];      if (!gs.r) -      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); +      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);      const Reachability& r = *gs.r;      // init backward estimates @@ -77,6 +78,7 @@ 
struct TransliterationsImpl {      // TODO      tot_pairs++; +    tot_mem += sizeof(float) * gs.r->nodes;    }    void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { @@ -119,16 +121,20 @@ struct TransliterationsImpl {      cerr << "Average out-degree = " << (to / tn) << endl;      cerr << " Unique structures = " << tt << endl;      cerr << "      Unique pairs = " << tot_pairs << endl; +    cerr << "          BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;    }    const int kMAX_SRC_CHUNK;    const int kMAX_TRG_CHUNK; +  const double kFILTER_RATIO;    unsigned tot_pairs; +  size_t tot_mem;    vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]    vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]  }; -Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} +Transliterations::Transliterations(int max_src, int max_trg, double fr) : +    pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}  Transliterations::~Transliterations() { delete pimpl_; }  void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index e025547e..ea9f9d3f 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -8,7 +8,8 @@  struct TransliterationsImpl;  struct Transliterations {    // max_src and max_trg indicate how big the transliteration phrases can be -  explicit Transliterations(int max_src, int max_trg); +  // see reachability.h for information about filter_ratio +  explicit Transliterations(int max_src, int max_trg, double filter_ratio);    ~Transliterations();    void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);    void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& 
trg_lets); | 
