summaryrefslogtreecommitdiff
path: root/extools/build_lexical_translation.cc
blob: f609f56a38b5c36803d695bd547063049fde048f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * Build lexical translation table from alignment file to use for lexical translation probabilties when scoring a grammar
 *
 * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn
 */
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <utility>
#include <cstdlib>
#include <fstream>
#include <tr1/unordered_map>

#include "sentence_pair.h"
#include "extract.h"
#include "fdict.h"
#include "tdict.h"

#include <boost/functional/hash.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>

using namespace std;
using namespace std::tr1;

static const size_t MAX_LINE_LENGTH = 64000000;

int main(int argc, char* argv[]){

  bool DEBUG = false;

  map <WordID, map<WordID, int> > word_translation;
  map <WordID, int> total_foreign;
  map <WordID, int> total_english;

  AnnotatedParallelSentence sent;
  char* buf = new char[MAX_LINE_LENGTH];
  while(cin) 
    {
      cin.getline(buf, MAX_LINE_LENGTH);
      if (buf[0] == 0) continue;
      
      sent.ParseInputLine(buf);
      
      map <WordID, int> foreign_aligned;
      map <WordID, int> english_aligned;

      //iterate over the alignment to compute aligned words
            
      for(int i =0;i<sent.aligned.width();i++)
	{
	  for (int j=0;j<sent.aligned.height();j++)
	    {
	      if (DEBUG) cout << sent.aligned(i,j) << " ";
	      if( sent.aligned(i,j))
		{
		  if (DEBUG) cout << TD::Convert(sent.f[i])  << " aligned to " << TD::Convert(sent.e[j]);
		  //local counts
		  ++foreign_aligned[sent.f[i]];
		  ++english_aligned[sent.e[j]];

		  //global counts
		  ++word_translation[sent.f[i]][sent.e[j]];
		  ++total_foreign[sent.f[i]];
		  ++total_english[sent.e[j]];
		}
	    }
	  if (DEBUG)  cout << endl;
	}
      if (DEBUG) cout << endl;
      
      static const WordID NULL_ = TD::Convert("NULL");
      //handle unaligned words - align them to null
      map<WordID, int>& nullcounts = word_translation[NULL_];
      for (int j =0; j < sent.e_len; j++)
	{
	  if (english_aligned.count(sent.e[j])) continue;
	  ++nullcounts[sent.e[j]];
	  ++total_foreign[NULL_];
	  ++total_english[sent.e[j]];
	}

      for (int i =0; i < sent.f_len; i++)
	{
	  if (foreign_aligned.count(sent.f[i])) continue;
	  ++word_translation[sent.f[i]][NULL_];
	  ++total_english[NULL_];
	  ++total_foreign[sent.f[i]];
	}
      
    }

  for(map < WordID, map<WordID,int> >::iterator it = word_translation.begin(); it != word_translation.end(); ++it)
    {
    const map<WordID, int>& trans = it->second;
    for (map<WordID,int>::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) {
      cout <<  TD::Convert(it->first) <<  "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl;
    }
  }


  return 0;
}