From 649a4ea65e78193876d7cb5f387ef775362bfea0 Mon Sep 17 00:00:00 2001 From: redpony Date: Thu, 26 Aug 2010 21:49:45 +0000 Subject: some experimental stuff git-svn-id: https://ws10smt.googlecode.com/svn/trunk@625 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/extractor.cc | 13 +++++++++++-- extools/sentence_pair.cc | 21 +++++++++------------ extools/sentence_pair.h | 4 +++- 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'extools') diff --git a/extools/extractor.cc b/extools/extractor.cc index 1eb85f37..1e4154ef 100644 --- a/extools/extractor.cc +++ b/extools/extractor.cc @@ -31,6 +31,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input,i", po::value()->default_value("-"), "Input file") ("default_category,d", po::value(), "Default span type (use X for 'Hiero')") + ("x_cdyer_pos,x", "Extract monolingual POS contexts (cdyer experimental)") ("loose", "Use loose phrase extraction heuristic for base phrases") ("base_phrase,B", "Write base phrases") ("base_phrase_spans", "Write base sentences and phrase spans") @@ -360,8 +361,8 @@ int main(int argc, char** argv) { AnnotatedParallelSentence sentence; vector phrases; vector all_cats; - const int max_base_phrase_size = conf["max_base_phrase_size"].as(); - const bool write_phrase_contexts = conf.count("phrase_context") > 0; + int max_base_phrase_size = conf["max_base_phrase_size"].as(); + bool write_phrase_contexts = conf.count("phrase_context") > 0; const bool write_base_phrases = conf.count("base_phrase") > 0; const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0; const bool loose_phrases = conf.count("loose") > 0; @@ -378,6 +379,7 @@ int main(int argc, char** argv) { const string cs = conf["context_language"].as(); const bool context_s = cs == "source" || cs == "both"; const bool context_t = cs == "target" || cs == "both"; + const bool x_cdyer_pos = conf.count("x_cdyer_pos"); int line = 0; CountCombiner cc(conf["combiner_size"].as()); HadoopStreamingRuleObserver o(&cc, @@ -402,6 +404,13 @@ int main(int argc, char** argv) { if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; } sentence.ParseInputLine(buf); + if (x_cdyer_pos) { + sentence.e = sentence.f; + sentence.AllocateForAlignment(); + for (int i = 0; i < sentence.e.size(); ++i) sentence.Align(i,i); + max_base_phrase_size = 1; + write_phrase_contexts = true; + } phrases.clear(); Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases); if (loose_phrases) diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc index 55609db4..d5ebe48f 100644 --- a/extools/sentence_pair.cc +++ b/extools/sentence_pair.cc @@ -117,6 +117,14 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf, return ch; } +void AnnotatedParallelSentence::Align(const short a, const short b) { + aligned(a,b) = true; + ++f_aligned[a]; + ++e_aligned[b]; + aligns_by_fword[a].push_back(make_pair(a,b)); + // cerr << a << " " << b << endl; +} + void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) { short a, b; ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0); @@ -124,11 +132,7 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl; exit(1); } - aligned(a,b) = true; - ++f_aligned[a]; - ++e_aligned[b]; - aligns_by_fword[a].push_back(make_pair(a,b)); - // cerr << a << " " << b << endl; + Align(a,b); } void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { @@ -191,12 +195,5 @@ void AnnotatedParallelSentence::ParseInputLine(const char* buf) { default: cerr << "Can't happen\n"; abort(); } } - if (state < 2) { - cerr << "Not enough fields: " << buf << endl; - abort(); - } - if (e.empty() || f.empty()) { - cerr << "Sentences must not be empty: " << buf << endl; - } } diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h index b5a7ca93..a05275e7 100644 --- a/extools/sentence_pair.h +++ b/extools/sentence_pair.h @@ -29,11 +29,13 @@ struct AnnotatedParallelSentence { int f_len, e_len; + void Align(const short a, const short b); + void AllocateForAlignment(); + static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d); private: void Reset(); - void AllocateForAlignment(); void ParseAlignmentPoint(const char* buf, int start, int end); void ParseSpanLabel(const char* buf, int start, int end); }; -- cgit v1.2.3