From 0172721855098ca02b207231a654dffa5e4eb1c9 Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 22 Jun 2010 05:12:27 +0000 Subject: initial checkin git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/build_lexical_translation.cc | 104 +++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 extools/build_lexical_translation.cc (limited to 'extools/build_lexical_translation.cc') diff --git a/extools/build_lexical_translation.cc b/extools/build_lexical_translation.cc new file mode 100644 index 00000000..f609f56a --- /dev/null +++ b/extools/build_lexical_translation.cc @@ -0,0 +1,104 @@ +/* + * Build lexical translation table from alignment file to use for lexical translation probabilities when scoring a grammar + * + * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentence_pair.h" +#include "extract.h" +#include "fdict.h" +#include "tdict.h" + +#include +#include +#include + +using namespace std; +using namespace std::tr1; + +static const size_t MAX_LINE_LENGTH = 64000000; + +int main(int argc, char* argv[]){ /* Reads stdin line by line, hands each non-empty line to sent.ParseInputLine, accumulates counts in word_translation / total_foreign / total_english, then prints one "f,e=count/total_foreign[f]" line per stored pair. argc and argv are not referenced in the visible text. */ + + bool DEBUG = false; + + map > word_translation; + map total_foreign; + map total_english; + + AnnotatedParallelSentence sent; + char* buf = new char[MAX_LINE_LENGTH]; /* NOTE(review): no matching delete[] is visible in this text */ + while(cin) + { + cin.getline(buf, MAX_LINE_LENGTH); + if (buf[0] == 0) continue; /* skip lines that came back empty */ + + sent.ParseInputLine(buf); + + map foreign_aligned; + map english_aligned; /* both maps are declared inside the read loop, so they start empty for every input line */ + + //iterate over the alignment to compute aligned words + + /* NOTE(review): the text between "for(int i =0;i" and "& nullcounts" looks truncated, and template arguments and include targets appear stripped throughout; confirm against the original file before relying on this copy */ for(int i =0;i& nullcounts = word_translation[NULL_]; /* nullcounts aliases the word_translation entry keyed by NULL_ */ + for (int j =0; j < sent.e_len; j++) + { + if (english_aligned.count(sent.e[j])) continue; /* only words with no english_aligned entry for this line are counted under NULL_ */ + ++nullcounts[sent.e[j]]; + ++total_foreign[NULL_]; + ++total_english[sent.e[j]]; + } + + for (int i =0; i < sent.f_len; i++) + { + if (foreign_aligned.count(sent.f[i])) continue; /* likewise: only words with no foreign_aligned entry for this line are paired with NULL_ */ + ++word_translation[sent.f[i]][NULL_]; 
+ ++total_english[NULL_]; + ++total_foreign[sent.f[i]]; + } + + } + + for(map < WordID, map >::iterator it = word_translation.begin(); it != word_translation.end(); ++it) + { + const map& trans = it->second; + for (map::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) { + cout << TD::Convert(it->first) << "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl; /* one output line per (it->first, iit->first) pair: its count over total_foreign[it->first] */ + } + } + + + return 0; +} -- cgit v1.2.3