summaryrefslogtreecommitdiff
path: root/extools/extractor.cc
diff options
context:
space:
mode:
authorolivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-08 21:59:50 +0000
committerolivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-08 21:59:50 +0000
commit11f476baf7d855198413b091cae775bde4ea41ed (patch)
tree2b2f9975b4da2839ba5fa89e3ed46d3868bb0d4e /extools/extractor.cc
parentf2be77ccae455563e167607ee7527abbf8d96e60 (diff)
Adding backoff grammar and BackoffRule feature.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@191 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/extractor.cc')
-rw-r--r--extools/extractor.cc14
1 files changed, 12 insertions, 2 deletions
diff --git a/extools/extractor.cc b/extools/extractor.cc
index 4f9b4dc6..7149bfd7 100644
--- a/extools/extractor.cc
+++ b/extools/extractor.cc
@@ -43,6 +43,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("max_vars,v", po::value<int>()->default_value(2), "Maximum number of nonterminal variables in final phrase size")
("permit_adjacent_nonterminals,A", "Permit adjacent nonterminals in source side of rules")
("no_required_aligned_terminal,n", "Do not require an aligned terminal")
+ ("topics,t", po::value<int>()->default_value(50), "Number of categories assigned during clustering")
+ ("backoff,g","Produce a backoff grammar")
("help,h", "Print this help message and exit");
po::options_description clo("Command line options");
po::options_description dcmdline_options;
@@ -299,6 +301,7 @@ int main(int argc, char** argv) {
WordID default_cat = 0; // 0 means no default- extraction will
// fail if a phrase is extracted without a
// category
+ const bool backoff = (conf.count("backoff") ? true : false);
if (conf.count("default_category")) {
string sdefault_cat = conf["default_category"].as<string>();
default_cat = -TD::Convert(sdefault_cat);
@@ -310,6 +313,7 @@ int main(int argc, char** argv) {
char buf[MAX_LINE_LENGTH];
AnnotatedParallelSentence sentence;
vector<ParallelSpan> phrases;
+ vector<WordID> all_cats;
const int max_base_phrase_size = conf["max_base_phrase_size"].as<int>();
const bool write_phrase_contexts = conf.count("phrase_context") > 0;
const bool write_base_phrases = conf.count("base_phrase") > 0;
@@ -319,12 +323,19 @@ int main(int argc, char** argv) {
const int max_syms = conf["max_syms"].as<int>();
const int max_vars = conf["max_vars"].as<int>();
const int ctx_size = conf["phrase_context_size"].as<int>();
+ const int num_categories = conf["topics"].as<int>();
const bool permit_adjacent_nonterminals = conf.count("permit_adjacent_nonterminals") > 0;
const bool require_aligned_terminal = conf.count("no_required_aligned_terminal") == 0;
int line = 0;
CountCombiner cc(conf["combiner_size"].as<size_t>());
HadoopStreamingRuleObserver o(&cc,
conf.count("bidir") > 0);
+
+ if(backoff) {
+ for (int i=0;i < num_categories;++i)
+ all_cats.push_back(TD::Convert("X"+boost::lexical_cast<string>(i)));
+ }
+
//SimpleRuleWriter o;
while(in) {
++line;
@@ -356,9 +367,8 @@ int main(int argc, char** argv) {
continue;
}
Extract::AnnotatePhrasesWithCategoryTypes(default_cat, sentence.span_types, &phrases);
- Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o);
+ Extract::ExtractConsistentRules(sentence, phrases, max_vars, max_syms, permit_adjacent_nonterminals, require_aligned_terminal, &o, &all_cats);
}
if (!silent) cerr << endl;
return 0;
}
-