summaryrefslogtreecommitdiff
path: root/extractor/translation_table.h
blob: 2a37bab7ebd4fc3396548d7c9e9d32033f180bc4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#ifndef _TRANSLATION_TABLE_
#define _TRANSLATION_TABLE_

#include <memory>
#include <string>
#include <unordered_map>

#include <boost/filesystem.hpp>
#include <boost/functional/hash.hpp>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/split_member.hpp>
#include <boost/serialization/utility.hpp>

// NOTE(review): a using-directive at header scope is inherited by every file
// that includes this header. Code in this header and in its includers may rely
// on it for unqualified standard-library names, so audit them before removing.
using namespace std;
// Shorthand for boost::filesystem. Not referenced in the visible part of this
// header -- presumably relied on by includers; confirm before removing.
namespace fs = boost::filesystem;

namespace extractor {

// Hash functor for (int, int) keys; used as the hasher of the pair-keyed
// unordered maps declared in this header.
using PairHash = boost::hash<std::pair<int, int>>;

// Forward declarations of project types that are defined elsewhere.
class DataArray;
class Alignment;

/**
 * Bilexical table with conditional probabilities.
 */
class TranslationTable {
 public:
  // Constructs a translation table from source data, target data and the
  // corresponding alignment.
  TranslationTable(
      shared_ptr<DataArray> source_data_array,
      shared_ptr<DataArray> target_data_array,
      shared_ptr<Alignment> alignment);

  // Creates empty translation table.
  TranslationTable();

  virtual ~TranslationTable();

  // Returns p(e | f).
  virtual double GetTargetGivenSourceScore(const string& source_word,
                                           const string& target_word);

  // Returns p(f | e).
  virtual double GetSourceGivenTargetScore(const string& source_word,
                                           const string& target_word);

  bool operator==(const TranslationTable& other) const;

 private:
  // Increment links count for the given (f, e) word pair.
  void IncrementLinksCount(
      unordered_map<int, int>& source_links_count,
      unordered_map<int, int>& target_links_count,
      unordered_map<pair<int, int>, int, PairHash>& links_count,
      int source_word_id,
      int target_word_id) const;

  friend class boost::serialization::access;

  template<class Archive> void save(Archive& ar, unsigned int) const {
    ar << *source_data_array << *target_data_array;

    int num_entries = translation_probabilities.size();
    ar << num_entries;
    for (auto entry: translation_probabilities) {
      ar << entry;
    }
  }

  template<class Archive> void load(Archive& ar, unsigned int) {
    source_data_array = make_shared<DataArray>();
    ar >> *source_data_array;
    target_data_array = make_shared<DataArray>();
    ar >> *target_data_array;

    int num_entries;
    ar >> num_entries;
    for (size_t i = 0; i < num_entries; ++i) {
      pair<pair<int, int>, pair<double, double>> entry;
      ar >> entry;
      translation_probabilities.insert(entry);
    }
  }

  BOOST_SERIALIZATION_SPLIT_MEMBER();

  shared_ptr<DataArray> source_data_array;
  shared_ptr<DataArray> target_data_array;
  unordered_map<pair<int, int>, pair<double, double>, PairHash>
      translation_probabilities;
};

} // namespace extractor

#endif