1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
#ifndef FF_SAMPLE_FSA_H
#define FF_SAMPLE_FSA_H
#include "ff_from_fsa.h"
// example: feature val = -1 * # of target words
// Example FSA feature whose Scan() adds val_per_target_word (-1) to the feature
// vector for every word scanned; Scan() neither reads nor writes the state buffers.
struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
  // Returns the usage/help string produced by FeatureFunction::usage_helper for
  // feature name "WordPenaltyFsa" with an empty parameter description.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "WordPenaltyFsa","","-1 per target word"
      ,param,verbose);
  }

  // param is accepted but not read by this feature.
  // No explicit state setup is performed: per the original author's note, the
  // base-class defaults are equivalent to
  //   set_state_bytes(0); start.clear(); h_start.clear();
  WordPenaltyFsa(std::string const& param) {
    Init();
  }

  // Value added to the feature on every scanned word.
  // NOTE(review): an in-class initializer on a non-integral `static const` member
  // is not standard C++ (it needs constexpr in C++11 and later); some compilers
  // accept it as an extension. Left unchanged to preserve the interface.
  static const float val_per_target_word=-1;

  // move from state to next_state after seeing word w, while emitting
  // features->add_value(fid,val), possibly with duplicates. state and next_state
  // may be the same memory. Here smeta, w, state and next_state are all unused:
  // the same value is added for every word.
  void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const {
    features->add_value(fid_,val_per_target_word);
  }
};
// Names the instantiation of the FeatureFunctionFromFsa template over WordPenaltyFsa
// (the template is presumably declared in ff_from_fsa.h — TODO confirm).
typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
// appears to be buggy right now: given a bonus weight (-), it overstates how many (presumably how many words fire the feature -- TODO confirm)
// Example stateful (bigram) FSA feature: the state is a single int holding the
// length of the previously scanned word; Scan() adds val_per_target_word (-1)
// whenever the current word is strictly longer than that.
struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
  typedef FsaFeatureFunctionBase<LongerThanPrev> Base;

  // Returns the usage/help string produced by FeatureFunction::usage_helper for
  // feature name "LongerThanPrev" with an empty parameter description.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "LongerThanPrev",
      "",
      "stupid example stateful (bigram) feature: -1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
      param,verbose);
  }

  // Interpret a raw state buffer as the int it stores (writable view).
  static inline int &state(void *st) {
    return *static_cast<int*>(st);
  }

  // Interpret a raw state buffer as the int it stores (read-only, by value).
  static inline int state(void const* st) {
    return *static_cast<int const*>(st);
  }

  // Length of the string TD::Convert(w) yields, as measured by std::strlen.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }

  int markov_order() const { return 1; }

  // param is accepted but not read by this feature.
  // The Base constructor is given the state size (sizeof(int)) and
  // singleton_sentence(TD::se); per the original author's note it already sizes
  // start and h_start, so only their contents are filled in here.
  LongerThanPrev(std::string const& param) : Base(sizeof(int),singleton_sentence(TD::se)) {
    Init();
    state(start.begin())=3; // usage text: <s> sentence begin is considered 3 chars long
    state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
  }

  // Value added to the feature each time it fires.
  // NOTE(review): an in-class initializer on a non-integral `static const` member
  // is not standard C++ (it needs constexpr in C++11 and later); some compilers
  // accept it as an extension. Left unchanged to preserve the interface.
  static const float val_per_target_word=-1;

  // Reads the previous word length from `from`, fires the feature if w is longer,
  // then stores w's length into next_state. The read happens before the write
  // because from and next_state may refer to the same memory.
  void Scan(SentenceMetadata const& smeta,WordID w,void const* from,void *next_state,FeatureVector *features) const {
    int const prevlen=state(from);
    int const len=wordlen(w);
    if (len>prevlen)
      features->add_value(fid_,val_per_target_word);
    state(next_state)=len;
  }
};
// similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State)
// also appears buggy right now: given a bonus weight (-), it overstates how many (presumably how many words fire the feature -- TODO confirm)
// Example stateful (bigram) FSA feature built on FsaTypedBase with an int state:
// ScanTyped() adds val_per_target_word (-1) whenever the current word is strictly
// shorter than the length carried in from the previous step.
struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
  typedef FsaTypedBase<int,ShorterThanPrev> Base;

  // Returns the usage/help string produced by FeatureFunction::usage_helper for
  // feature name "ShorterThanPrev" with an empty parameter description.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
      "ShorterThanPrev",
      "",
      "stupid example stateful (bigram) feature: -1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
      param,
      verbose);
  }

  // Length of the string TD::Convert(w) yields, as measured by std::strlen.
  static inline int wordlen(WordID w) {
    return std::strlen(TD::Convert(w));
  }

  // param is accepted but not read by this feature.
  // Base arguments, in order: start (3), h_start (4), end_phrase.
  // estimate: anything <4 chars is usually shorter than previous
  ShorterThanPrev(std::string const& param)
    : Base(3,4,singleton_sentence(TD::se))
  {
    Init();
  }

  // Value added to the feature each time it fires.
  static const float val_per_target_word=-1;

  // Typed scan step: prevlen is the incoming state (by value); len is an
  // out-parameter that receives the outgoing state, i.e. the length of w.
  void ScanTyped(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const {
    int const curlen=wordlen(w);
    len=curlen;
    if (prevlen>curlen)
      features->add_value(fid_,val_per_target_word);
  }

  // No untyped Scan() is defined here; per the original author's note it is
  // already provided by FsaTypedScan<ShorterThanPrev>, in the form:
  /* void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
       ScanTyped(smeta,w,state(st),state(next_state),features);
     } */
};
#endif
|