summaryrefslogtreecommitdiff
path: root/extools/sentence_pair.h
blob: a05275e78b733b75b38b1fd2d2132a3ebd3d223d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#ifndef _SENTENCE_PAIR_H_
#define _SENTENCE_PAIR_H_

#include <map>
#include <utility>
#include <vector>
#include <boost/tuple/tuple.hpp>
#include "wordid.h"
#include "array2d.h"

// represents a parallel sentence with a word alignment and category
// annotations over subspans (currently in terms of f)
// you should read one using ParseInputLine and then use the public
// member variables to query things about it
struct AnnotatedParallelSentence {
  // read annotated parallel sentence from string
  void ParseInputLine(const char* buf);

  std::vector<WordID> f, e;  // words in f and e

  // word alignment information
  std::vector<int> e_aligned, f_aligned; // counts the number of times column/row x is aligned
  Array2D<bool> aligned;
  std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;

  // span type information
  std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > span_types;
  // span_types(i,j,k,l) is the list of category span (i,j) in source and (k,l) in the target language.

  int f_len, e_len;

  void Align(const short a, const short b);
  void AllocateForAlignment();

  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d);

 private:
  void Reset();
  void ParseAlignmentPoint(const char* buf, int start, int end);
  void ParseSpanLabel(const char* buf, int start, int end);
};

#endif