summary | refs | log | tree | commit | diff
path: root/decoder
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 21:10:12 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 21:10:12 +0000
commit 9e73380e46ada871ad114f4fd4238b8f2263ef27 (patch)
tree ecd759d7e5c2c87f23ed90596d9c198042dffae9 /decoder
parent 7a87c0b62eaa5ef27b0642fc4cae9ee5c4cd6a8b (diff)
psg for lex trans
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@699 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
-rw-r--r-- decoder/decoder.cc 1
-rw-r--r-- decoder/lextrans.cc 43
-rw-r--r-- decoder/translator.cc 2
3 files changed, 41 insertions, 5 deletions
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index eb983419..2a8043db 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -324,6 +324,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)")
("input,i",po::value<string>()->default_value("-"),"Source file")
("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)")
+ ("per_sentence_grammar_file", po::value<string>(), "Optional (and possibly not implemented) per sentence grammar file enables all per sentence grammars to be stored in a single large file and accessed by offset")
("weights,w",po::value<string>(),"Feature weights file")
("prelm_weights",po::value<string>(),"Feature weights file for prelm_beam_prune. Requires --weights.")
("prelm_copy_weights","use --weights as value for --prelm_weights.")
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index 1921f280..551e77e3 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -1,6 +1,7 @@
#include "lextrans.h"
#include <iostream>
+#include <cstdlib>
#include "filelib.h"
#include "hg.h"
@@ -13,10 +14,14 @@ using namespace std;
struct LexicalTransImpl {
LexicalTransImpl(const boost::program_options::variables_map& conf) :
use_null(conf.count("lextrans_use_null") > 0),
+ psg_file_(),
kXCAT(TD::Convert("X")*-1),
kNULL(TD::Convert("<eps>")),
kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
+ if (conf.count("per_sentence_grammar_file")) {
+ psg_file_ = new ifstream(conf["per_sentence_grammar_file"].as<string>().c_str());
+ }
vector<string> gfiles = conf["grammar"].as<vector<string> >();
assert(gfiles.size() == 1);
ReadFile rf(gfiles.front());
@@ -25,10 +30,10 @@ struct LexicalTransImpl {
istream* in = rf.stream();
int lc = 0;
bool flag = false;
+ string line;
while(*in) {
- string line;
getline(*in, line);
- if (line.empty()) continue;
+ if (!*in) continue;
++lc;
TRulePtr r(TRule::CreateRulePhrasetable(line));
tg->AddRule(r);
@@ -39,7 +44,31 @@ struct LexicalTransImpl {
cerr << "Loaded " << lc << " rules\n";
}
+ void LoadSentenceGrammar(const string& s_offset) {
+ const unsigned long long int offset = strtoull(s_offset.c_str(), NULL, 10);
+ psg_file_->seekg(offset, ios::beg);
+ TextGrammar *tg = new TextGrammar;
+ sup_grammar.reset(tg);
+ const string kEND_MARKER = "###EOS###";
+ string line;
+ while(true) {
+ assert(*psg_file_);
+ getline(*psg_file_, line);
+ if (line == kEND_MARKER) break;
+ TRulePtr r(TRule::CreateRulePhrasetable(line));
+ tg->AddRule(r);
+ }
+ }
+
void BuildTrellis(const Lattice& lattice, const SentenceMetadata& smeta, Hypergraph* forest) {
+ if (psg_file_) {
+ const string offset = smeta.GetSGMLValue("psg");
+ if (offset.size() < 2 || offset[0] != '@') {
+ cerr << "per_sentence_grammar_file given but sentence id=" << smeta.GetSentenceID() << " doesn't have grammar info!\n";
+ abort();
+ }
+ LoadSentenceGrammar(offset.substr(1));
+ }
const int e_len = smeta.GetTargetLength();
assert(e_len > 0);
const int f_len = lattice.size();
@@ -53,8 +82,12 @@ struct LexicalTransImpl {
const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label);
const GrammarIter* gi = grammar->GetRoot()->Extend(src_sym);
if (!gi) {
- cerr << "No translations found for: " << TD::Convert(src_sym) << "\n";
- abort();
+ if (psg_file_)
+ gi = sup_grammar->GetRoot()->Extend(src_sym);
+ if (!gi) {
+ cerr << "No translations found for: " << TD::Convert(src_sym) << "\n";
+ abort();
+ }
}
const RuleBin* rb = gi->GetRules();
assert(rb);
@@ -88,11 +121,13 @@ struct LexicalTransImpl {
private:
const bool use_null;
+ ifstream* psg_file_;
const WordID kXCAT;
const WordID kNULL;
const TRulePtr kBINARY;
const TRulePtr kGOAL_RULE;
GrammarPtr grammar;
+ GrammarPtr sup_grammar;
};
LexicalTrans::LexicalTrans(const boost::program_options::variables_map& conf) :
diff --git a/decoder/translator.cc b/decoder/translator.cc
index 277c3a2d..d1ca125b 100644
--- a/decoder/translator.cc
+++ b/decoder/translator.cc
@@ -43,7 +43,7 @@ void Translator::SentenceComplete() {
// this may be overridden by translators that want to accept
// metadata
void Translator::ProcessMarkupHintsImpl(const map<string, string>& kv) {
- int unprocessed = kv.size() - kv.count("id");
+ int unprocessed = kv.size() - kv.count("id") - kv.count("psg");
if (!SILENT) cerr << "Inside translator process hints\n";
if (unprocessed > 0) {
cerr << "Sentence markup contains unprocessed data:\n";