// path: root/extools/sentence_pair.h
// blob: d78be359f7a17bd27383b6a1a854e1b9b0954de8
#ifndef _SENTENCE_PAIR_H_
#define _SENTENCE_PAIR_H_

#include <utility>
#include <vector>
#include "wordid.h"
#include "array2d.h"

// A parallel sentence pair carrying a word alignment plus category
// annotations over subspans (currently in terms of f).
// Typical use: fill an instance via ParseInputLine, then inspect it
// through the public member variables declared below.
struct AnnotatedParallelSentence {
  // Reads an annotated parallel sentence from the string buf.
  void ParseInputLine(const char* buf);

  // Words in f and in e, respectively.
  std::vector<WordID> f;
  std::vector<WordID> e;

  // ---- word alignment information ----
  // Each of these counts the number of times column/row x is aligned.
  std::vector<int> e_aligned;
  std::vector<int> f_aligned;
  // NOTE(review): presumably the alignment matrix between f and e
  // positions; index order is not visible from this header -- confirm.
  Array2D<bool> aligned;
  // NOTE(review): judging by the name, one list of alignment points per
  // f word -- confirm against the implementation.
  std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;

  // ---- span type information ----
  // span_types(i,j) holds the list of category types for the span (i,j)
  // in the TARGET language.
  // NOTE(review): the struct-level comment says annotations are "in terms
  // of f" while this one says TARGET -- confirm which is current.
  Array2D<std::vector<WordID> > span_types;

  // NOTE(review): presumably the lengths of f and e -- confirm where set.
  int f_len;
  int e_len;

  // NOTE(review): the meaning of the return value and of permit_col is not
  // visible here; a and b are output pointers -- see the implementation.
  static int ReadAlignmentPoint(const char* buf,
                                int start,
                                int end,
                                bool permit_col,
                                short* a,
                                short* b);

 private:
  void Reset();
  void AllocateForAlignment();
  void ParseAlignmentPoint(const char* buf, int start, int end);
  void ParseSpanLabel(const char* buf, int start, int end);
};

#endif