summary | refs | log | tree | commit | diff
path: root/gi/pf
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pf')
-rw-r--r-- gi/pf/align-tl.cc 15
-rw-r--r-- gi/pf/reachability.cc 9
-rw-r--r-- gi/pf/reachability.h 8
-rw-r--r-- gi/pf/transliterations.cc 14
-rw-r--r-- gi/pf/transliterations.h 3
5 files changed, 31 insertions, 18 deletions
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index fe8950b5..fc9b7ca5 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
("input,i",po::value<string>(),"Read parallel data from")
+ ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
+ ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
+ ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration")
+ ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
clo.add_options()
@@ -306,12 +310,11 @@ int main(int argc, char** argv) {
letters[TD::Convert("NULL")].clear();
// TODO configure this
- int max_src_chunk = 4;
- int max_trg_chunk = 4;
- Transliterations tl(max_src_chunk, max_trg_chunk);
-
- // TODO CONFIGURE THIS
- int min_trans_src = 4;
+ const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
+ const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
+ const double filter_rat = conf["filter_ratio"].as<double>();
+ const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>();
+ Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);
cerr << "Initializing transliteration graph structures ...\n";
for (int i = 0; i < corpus.size(); ++i) {
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index 59bc6ace..c10000f2 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -12,7 +12,7 @@ struct SState {
int prev_trg_covered;
};
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {
typedef boost::multi_array<vector<SState>, 2> array_type;
array_type a(boost::extents[srclen + 1][trglen + 1]);
a[0][0].push_back(SState());
@@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
}
}
a[0][0].clear();
- //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
- if (a[srclen][trglen].size() == 0) {
- cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+ //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
+ size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio);
+ if (a[srclen][trglen].size() < min_allowed) {
+ cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";
return;
}
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 1e22c76a..03967d44 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -18,17 +18,19 @@ struct Reachability {
boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
- Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+ // filter_ratio: if the number of incoming edges (back pointers) to the final cell is less than
+ // (src_max + 1) * (trg_max + 1) * filter_ratio^2, then mark the pair as non-reachable
+ Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :
nodes(),
edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
max_src_delta(boost::extents[srclen][trglen]),
node_addresses(boost::extents[srclen][trglen]),
valid_deltas(boost::extents[srclen][trglen]) {
- ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+ ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);
}
private:
- void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
+ void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);
};
#endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 61e95b82..8ea4ebd2 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -48,10 +48,11 @@ struct BackwardEstimates {
};
struct TransliterationsImpl {
- TransliterationsImpl(int max_src, int max_trg) :
+ TransliterationsImpl(int max_src, int max_trg, double fr) :
kMAX_SRC_CHUNK(max_src),
kMAX_TRG_CHUNK(max_trg),
- tot_pairs() {
+ kFILTER_RATIO(fr),
+ tot_pairs(), tot_mem() {
}
void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
@@ -63,7 +64,7 @@ struct TransliterationsImpl {
if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
GraphStructure& gs = graphs[src_len][trg_len];
if (!gs.r)
- gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+ gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
const Reachability& r = *gs.r;
// init backward estimates
@@ -77,6 +78,7 @@ struct TransliterationsImpl {
// TODO
tot_pairs++;
+ tot_mem += sizeof(float) * gs.r->nodes;
}
void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
@@ -119,16 +121,20 @@ struct TransliterationsImpl {
cerr << "Average out-degree = " << (to / tn) << endl;
cerr << " Unique structures = " << tt << endl;
cerr << " Unique pairs = " << tot_pairs << endl;
+ cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
}
const int kMAX_SRC_CHUNK;
const int kMAX_TRG_CHUNK;
+ const double kFILTER_RATIO;
unsigned tot_pairs;
+ size_t tot_mem;
vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
};
-Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double fr) :
+ pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
Transliterations::~Transliterations() { delete pimpl_; }
void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index e025547e..ea9f9d3f 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -8,7 +8,8 @@
struct TransliterationsImpl;
struct Transliterations {
// max_src and max_trg indicate how big the transliteration phrases can be
- explicit Transliterations(int max_src, int max_trg);
+ // see reachability.h for information about filter_ratio
+ explicit Transliterations(int max_src, int max_trg, double filter_ratio);
~Transliterations();
void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);