1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
#ifndef FF_SAMPLE_FSA_H
#define FF_SAMPLE_FSA_H
// TODO: the FSA interface changed to include ScanPhrase* and Scan*Accum; update these sample features so they compile and work.
#include "ff_from_fsa.h"
// Example stateless feature: contributes a value of 1 for every target word scanned.
struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
  // Help text for this feature, assembled by FeatureFunction::usage_helper from the
  // feature name, its (empty) parameter description and a one-line summary.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "WordPenaltyFsa","","1 per target word",
      param,verbose);
  }

  // The parameter string is accepted but never read.
  WordPenaltyFsa(std::string const& /* param */) {
    Init(); // provided by the base class (not visible in this file)
    // No further setup: the base-class defaults are relied upon. The original author
    // listed them (in code that was never executed, after an early return) as
    //   set_state_bytes(0); start.clear(); h_start.clear();
  }

  // Move from state to next_state after seeing word w. State and next_state may be
  // the same memory; this feature keeps no state, so neither pointer is touched and
  // the word itself is ignored.
  // Returns 1 for every word.
  Featval Scan1(WordID /* w */,void const* /* state */,void* /* next_state */) const {
    return 1;
  }
};
// Convenience name for FeatureFunctionFromFsa instantiated on WordPenaltyFsa.
// NOTE(review): FeatureFunctionFromFsa is presumably the adapter declared in
// ff_from_fsa.h that exposes an FSA feature as a regular feature function - confirm there.
typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
// Example bigram feature: scores 1 whenever a word begins with the same character
// as the word scanned just before it. The state is that single leading character.
struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
  // Uses 1 byte of state; the final (single) symbol "END" is scanned to get the
  // final state cost. The start state holds 'a' and the h_start state holds 0.
  // NOTE(review): unlike the other sample features in this file, this constructor
  // does not call Init() - confirm whether that is intentional.
  SameFirstLetter(std::string const& param)
    : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END"))
  {
    start[0]='a';
    h_start[0]=0;
  }

  int markov_order() const { return 1; }

  // Compares the first character of word w with the character stored in old_state,
  // stores w's first character into new_state, and returns 1 on a match, else 0.
  // The old state is read before the new state is written.
  Featval Scan1(WordID w,void const* old_state,void *new_state) const {
    char const current_initial=TD::Convert(w)[0];
    char const previous_initial=*static_cast<char const*>(old_state);
    *static_cast<char*>(new_state)=current_initial;
    if (current_initial==previous_initial)
      return 1;
    return 0;
  }

  // Writes the state (one character) to the stream.
  void print_state(std::ostream &o,void const* st) const {
    char const stored=*static_cast<char const*>(st);
    o<<stored;
  }

  // Help text for this feature, assembled by FeatureFunction::usage_helper.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "SameFirstLetter",
      "[no args]",
      "1 each time 2 consecutive words start with the same letter",
      param,verbose);
  }
};
// Example stateful (bigram) feature: scores 1 for each target word that is longer
// than the word scanned before it. The state is the previous word's length (an int).
// The original author notes this feature appears to be buggy right now, and suggests
// giving it a bonus (+) weight.
struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
  typedef FsaFeatureFunctionBase<LongerThanPrev> Base;

  // Help text for this feature, assembled by FeatureFunction::usage_helper.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "LongerThanPrev",
      "",
      "stupid example stateful (bigram) feature: 1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
      param,verbose);
  }

  // Views raw state memory as the int it stores (writable form).
  static inline int &state(void *st) {
    return *static_cast<int*>(st);
  }
  // Reads the int stored in raw state memory.
  static inline int state(void const* st) {
    return *static_cast<int const*>(st);
  }

  // Writes the state (the stored length) to the stream. A describe_state(st)
  // returning state(st) would be an alternative; only one of the two is needed.
  void print_state(std::ostream &o,void const* st) const {
    o<<state(st);
  }

  // Length in chars of the string TD::Convert gives for w.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }

  int markov_order() const { return 1; }

  // The parameter string is accepted but never read. No end phrase is passed to the
  // base (a singleton_sentence(TD::se) argument was left commented out by the author).
  LongerThanPrev(std::string const& /* param */) : Base(sizeof(int)) {
    Init(); // provided by the base class (not visible in this file)
    // Per the original author's note, the base constructor has already sized the
    // state buffers, so this is only checked (in builds where assert is active).
    assert(state_bytes()==sizeof(int));
    assert(start.size()==sizeof(int));
    assert(h_start.size()==sizeof(int));
    // NOTE(review): the usage text says sentence begin counts as 3 chars, but the
    // start state is 999999, so the first scanned word is effectively never counted
    // as longer. A disabled branch in the original set start to 3 instead. Confirm
    // which value is intended before changing either the code or the usage text.
    state(start.begin())=999999;
    state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
  }

  // Reads the previous length from `from`, stores w's length into next_state, and
  // returns 1 if w is strictly longer than the previous word, else 0. The previous
  // length is read before next_state is written, so both may be the same memory.
  Featval Scan1(WordID w,void const* from,void *next_state) const {
    int prevlen=state(from);
    int len=wordlen(w);
    state(next_state)=len;
    return len>prevlen ? 1 : 0;
  }
};
// Companion example to the "longer than previous" feature, built on FsaTypedBase
// instead. Per the original author, that base type exposes the stateful type,
// defines markov_order 1, and uses sizeof(State) as the state size. The state here
// is an int holding the previous word's length.
struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
  typedef FsaTypedBase<int,ShorterThanPrev> Base;

  // Help text for this feature, assembled by FeatureFunction::usage_helper.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "ShorterThanPrev",
      "",
      "stupid example stateful (bigram) feature: 1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
      param,verbose);
  }

  // Length in chars of the string TD::Convert gives for w.
  static inline int wordlen(WordID w) {
    return std::strlen(TD::Convert(w));
  }

  // Base arguments, in order: start state (-1), h_start state (4), end phrase.
  // The 4 is an estimate: anything <4 chars is usually shorter than previous.
  ShorterThanPrev(std::string const& param)
    : Base(-1,4,singleton_sentence(TD::se))
  {
    Init();
  }

  // A three-argument ScanT1(WordID w,int prevlen,int &len) was considered by the
  // author as an alternative to the signature below.
  //
  // Stores w's length into the out-parameter len (the next state) and returns 1 if
  // that length is strictly less than prevlen, else 0. The sentence metadata and
  // edge arguments are not used.
  Featval ScanT1(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,int prevlen,int &len) const {
    len=wordlen(w);
    if (len<prevlen)
      return 1;
    return 0;
  }
};
#endif
|