// path: root/decoder/ff_sample_fsa.h
// blob: 6e42b83b626172df7cc94cd8e9f53108cce0bc69
#ifndef FF_SAMPLE_FSA_H
#define FF_SAMPLE_FSA_H

#include "ff_from_fsa.h"

// example: feature val = -1 * # of target words
// Every call to Scan() adds val_per_target_word to feature fid_; the state
// pointers are never read or written by this feature.
struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
  // Help text built by FeatureFunction::usage_helper from the feature name
  // ("WordPenaltyFsa"), an empty parameter description and a one-line summary.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "WordPenaltyFsa","","-1 per target word"
      ,param,verbose);
  }

  // param is not read by this constructor.
  // Only Init() is called. The original body followed an unconditional
  // `return` with unreachable calls to set_state_bytes(0), start.clear() and
  // h_start.clear(), annotated by the author as "all defaults"; that dead code
  // has been dropped.
  WordPenaltyFsa(std::string const& param) {
    Init();
  }

  // Value added to the feature for each scanned target word.
  // ISO C++ only allows an in-class initializer on a static const data member
  // of integral or enumeration type; for a float, C++11 and later require
  // constexpr. Older dialects keep the original spelling, which g++ accepts as
  // an extension, so the member's name and type are the same either way.
#if __cplusplus >= 201103L
  static constexpr float val_per_target_word=-1;
#else
  static const float val_per_target_word=-1;
#endif

  // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates.  state and next_state may be same memory.
  // This implementation ignores smeta, w, state and next_state: it only adds
  // val_per_target_word to feature fid_.
  void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const {
    features->add_value(fid_,val_per_target_word);
  }
};

// Short name for the FeatureFunctionFromFsa template instantiated with
// WordPenaltyFsa.
// NOTE(review): presumably this wrapper exposes the FSA feature through the
// regular feature-function interface — confirm in ff_from_fsa.h.
typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;


// Example stateful feature: the state is a single int holding the length of
// the previously scanned word; a word longer than that fires the feature.
// Known issue (original author's note): appears to be buggy right now: give it
// a bonus weight (-) and it overstates how many
struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
  typedef FsaFeatureFunctionBase<LongerThanPrev> Base;

  // Help text built by FeatureFunction::usage_helper; this feature takes no
  // parameters (empty parameter description).
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "LongerThanPrev",
      "",
      "stupid example stateful (bigram) feature: -1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
      param,verbose);
  }

  // View a raw state buffer as the int stored in it (writable).
  static inline int &state(void *st) {
    return *static_cast<int*>(st);
  }
  // Read the int stored in a raw, read-only state buffer.
  static inline int state(void const* st) {
    return *static_cast<int const*>(st);
  }
  // Length, in chars, of the string TD::Convert returns for w.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }
  int markov_order() const { return 1; }

  // param is not read by this constructor.
  // The base is constructed with a state size of sizeof(int) and
  // singleton_sentence(TD::se). The original body also carried an `if (0)`
  // block that repeated the sizing of start/h_start by hand, annotated by the
  // author as already done by the base constructor; that dead code has been
  // dropped. Only the initial values are written here.
  LongerThanPrev(std::string const& param) : Base(sizeof(int),singleton_sentence(TD::se)) {
    Init();
    state(start.begin())=3; // sentence begin <s> is considered 3 chars long
    state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
  }

  // Value added to the feature for each word longer than its predecessor.
  // ISO C++ only allows an in-class initializer on a static const data member
  // of integral or enumeration type; for a float, C++11 and later require
  // constexpr. Older dialects keep the original spelling, which g++ accepts as
  // an extension, so the member's name and type are the same either way.
#if __cplusplus >= 201103L
  static constexpr float val_per_target_word=-1;
#else
  static const float val_per_target_word=-1;
#endif

  // Reads the previous word length from `from`, adds val_per_target_word to
  // feature fid_ when w is strictly longer, then stores w's length in
  // next_state. `from` is read before next_state is written.
  void Scan(SentenceMetadata const& smeta,WordID w,void const* from,void *next_state,FeatureVector *features) const {
    int prevlen=state(from);
    int len=wordlen(w);
    if (len>prevlen)
      features->add_value(fid_,val_per_target_word);
    state(next_state)=len;
  }
};

// similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State)
// Here the state is an int holding the previous word's length; a word shorter
// than that fires the feature.
// Known issue (original author's note): also buggy right now: give it a bonus
// weight (-) and it overstates how many
struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
  typedef FsaTypedBase<int,ShorterThanPrev> Base;

  // Help text built by FeatureFunction::usage_helper; this feature takes no
  // parameters (empty parameter description).
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "ShorterThanPrev",
      "",
      "stupid example stateful (bigram) feature: -1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
      param,verbose);
  }

  // Length, in chars, of the string TD::Convert returns for w.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }

  // param is not read by this constructor.
  ShorterThanPrev(std::string const& param)
  : Base(3,4,singleton_sentence(TD::se))
    // start, h_start, end_phrase
    // estimate: anything <4 chars is usually shorter than previous
  {
    Init();
  }

  // Value added to the feature for each word shorter than its predecessor.
  // ISO C++ only allows an in-class initializer on a static const data member
  // of integral or enumeration type; for a float, C++11 and later require
  // constexpr. Older dialects keep the original spelling, which g++ accepts as
  // an extension, so the member's name and type are the same either way.
#if __cplusplus >= 201103L
  static constexpr float val_per_target_word=-1;
#else
  static const float val_per_target_word=-1;
#endif

  // evil anti-google int & len out-param:
  // len receives the length of w; the feature fires when that is strictly
  // less than prevlen.
  void ScanTyped(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const {
    len=wordlen(w);
    if (len<prevlen)
      features->add_value(fid_,val_per_target_word);
  }

  // already provided by FsaTypedScan<ShorterThanPrev>
/*  void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
    ScanTyped(smeta,w,state(st),state(next_state),features);
    } */

};


#endif