summaryrefslogtreecommitdiff
path: root/decoder/ff_charset.cc
blob: 472de82b81635672cc9191d471dfcbcb8eb66e5d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include "ff_charset.h"

#include "fdict.h"
#include "stringlib.h"

using namespace std;

// Constructs the feature function and stores in fid_ the feature id that
// FD::Convert returns for the name "NonLatinCount".
// The `param` argument is not used by this constructor.
NonLatinCount::NonLatinCount(const string& param)
    : FeatureFunction(),
      fid_(FD::Convert("NonLatinCount")) {
}

// Returns true if `word` contains at least one byte outside the 7-bit ASCII
// range, i.e. at least one character that UTF-8 encodes with more than one
// byte. Returns false for the empty string and for pure-ASCII strings.
//
// Note that "non-Latin" here effectively means "non-ASCII": accented Latin
// letters are multi-byte in UTF-8 and therefore also yield true.
//
// In UTF-8 every byte of a multi-byte sequence has its high bit set, so
// testing each byte against 0x80 gives the same answer as checking the
// encoded length of each lead byte on well-formed input. Scanning bytes
// directly also guarantees forward progress on malformed input (e.g. a stray
// continuation byte), where a length helper could report a zero-length
// character and stall a length-stepping loop.
bool ContainsNonLatin(const std::string& word) {
  for (std::string::size_type pos = 0; pos < word.size(); ++pos) {
    // Cast first: plain char may be signed, which would make bytes >= 0x80
    // compare as negative values.
    if (static_cast<unsigned char>(word[pos]) >= 0x80) return true;
  }
  return false;
}

// Counts the symbols in edge.rule_->e() whose surface string contains
// non-ASCII characters (as decided by ContainsNonLatin) and, if that count is
// non-zero, sets it as the value of feature fid_ in `features`.
//
// Only `edge` and `features` are used; `smeta`, `ant_contexts`,
// `estimated_features` and `context` are ignored by this feature.
//
// Symbols with id <= 0 are skipped (presumably non-terminals/variables rather
// than words -- confirm against the rule representation).
//
// The per-word answer is memoized in is_non_latin_. That member is modified
// from this const method, so it is presumably declared `mutable` in the
// header. NOTE(review): the cache is updated without synchronization --
// confirm instances are not shared across threads.
void NonLatinCount::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                          const Hypergraph::Edge& edge,
                                          const std::vector<const void*>& ant_contexts,
                                          FeatureVector* features,
                                          FeatureVector* estimated_features,
                                          void* context) const {
  const std::vector<WordID>& e = edge.rule_->e();
  int count = 0;
  // size_type index avoids the signed/unsigned comparison against e.size().
  for (std::vector<WordID>::size_type i = 0; i < e.size(); ++i) {
    const WordID word_id = e[i];
    if (word_id <= 0) continue;

    // One tree descent serves both the lookup and, on a miss, the insertion:
    // lower_bound yields either the cached entry or the insertion hint.
    std::map<WordID, bool>::iterator it = is_non_latin_.lower_bound(word_id);
    if (it == is_non_latin_.end() || it->first != word_id) {
      it = is_non_latin_.insert(
          it,
          std::map<WordID, bool>::value_type(
              word_id, ContainsNonLatin(TD::Convert(word_id))));
    }
    if (it->second) ++count;
  }
  // Leave the feature unset when nothing was counted.
  if (count) features->set_value(fid_, count);
}