summary refs log tree commit diff
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-26 21:49:45 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-26 21:49:45 +0000
commit 649a4ea65e78193876d7cb5f387ef775362bfea0 (patch)
tree f737bd5bb0fe8a24fdbd82872c4b4851bfc33c09
parent 2484876aaf430b479ee5aa71bda7e812fa61430f (diff)
some experimental stuff
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@625 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- extools/extractor.cc 13
-rw-r--r-- extools/sentence_pair.cc 21
-rw-r--r-- extools/sentence_pair.h 4
3 files changed, 23 insertions, 15 deletions
diff --git a/extools/extractor.cc b/extools/extractor.cc
index 1eb85f37..1e4154ef 100644
--- a/extools/extractor.cc
+++ b/extools/extractor.cc
@@ -31,6 +31,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("input,i", po::value<string>()->default_value("-"), "Input file")
("default_category,d", po::value<string>(), "Default span type (use X for 'Hiero')")
+ ("x_cdyer_pos,x", "Extract monolingual POS contexts (cdyer experimental)")
("loose", "Use loose phrase extraction heuristic for base phrases")
("base_phrase,B", "Write base phrases")
("base_phrase_spans", "Write base sentences and phrase spans")
@@ -360,8 +361,8 @@ int main(int argc, char** argv) {
AnnotatedParallelSentence sentence;
vector<ParallelSpan> phrases;
vector<WordID> all_cats;
- const int max_base_phrase_size = conf["max_base_phrase_size"].as<int>();
- const bool write_phrase_contexts = conf.count("phrase_context") > 0;
+ int max_base_phrase_size = conf["max_base_phrase_size"].as<int>();
+ bool write_phrase_contexts = conf.count("phrase_context") > 0;
const bool write_base_phrases = conf.count("base_phrase") > 0;
const bool write_base_phrase_spans = conf.count("base_phrase_spans") > 0;
const bool loose_phrases = conf.count("loose") > 0;
@@ -378,6 +379,7 @@ int main(int argc, char** argv) {
const string cs = conf["context_language"].as<string>();
const bool context_s = cs == "source" || cs == "both";
const bool context_t = cs == "target" || cs == "both";
+ const bool x_cdyer_pos = conf.count("x_cdyer_pos");
int line = 0;
CountCombiner cc(conf["combiner_size"].as<size_t>());
HadoopStreamingRuleObserver o(&cc,
@@ -402,6 +404,13 @@ int main(int argc, char** argv) {
if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
}
sentence.ParseInputLine(buf);
+ if (x_cdyer_pos) {
+ sentence.e = sentence.f;
+ sentence.AllocateForAlignment();
+ for (int i = 0; i < sentence.e.size(); ++i) sentence.Align(i,i);
+ max_base_phrase_size = 1;
+ write_phrase_contexts = true;
+ }
phrases.clear();
Extract::ExtractBasePhrases(max_base_phrase_size, sentence, &phrases);
if (loose_phrases)
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
index 55609db4..d5ebe48f 100644
--- a/extools/sentence_pair.cc
+++ b/extools/sentence_pair.cc
@@ -117,6 +117,14 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
return ch;
}
+void AnnotatedParallelSentence::Align(const short a, const short b) {
+ aligned(a,b) = true;
+ ++f_aligned[a];
+ ++e_aligned[b];
+ aligns_by_fword[a].push_back(make_pair(a,b));
+ // cerr << a << " " << b << endl;
+}
+
void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
short a, b;
ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);
@@ -124,11 +132,7 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start,
cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;
exit(1);
}
- aligned(a,b) = true;
- ++f_aligned[a];
- ++e_aligned[b];
- aligns_by_fword[a].push_back(make_pair(a,b));
- // cerr << a << " " << b << endl;
+ Align(a,b);
}
void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
@@ -191,12 +195,5 @@ void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
default: cerr << "Can't happen\n"; abort();
}
}
- if (state < 2) {
- cerr << "Not enough fields: " << buf << endl;
- abort();
- }
- if (e.empty() || f.empty()) {
- cerr << "Sentences must not be empty: " << buf << endl;
- }
}
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
index b5a7ca93..a05275e7 100644
--- a/extools/sentence_pair.h
+++ b/extools/sentence_pair.h
@@ -29,11 +29,13 @@ struct AnnotatedParallelSentence {
int f_len, e_len;
+ void Align(const short a, const short b);
+ void AllocateForAlignment();
+
static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d);
private:
void Reset();
- void AllocateForAlignment();
void ParseAlignmentPoint(const char* buf, int start, int end);
void ParseSpanLabel(const char* buf, int start, int end);
};