1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
#ifndef FF_SAMPLE_FSA_H
#define FF_SAMPLE_FSA_H
#include "ff_from_fsa.h"
// Example stateless FSA feature: feature value = 1 * (number of target words).
struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
  // Help/usage text assembled by FeatureFunction::usage_helper.
  // The empty second argument is the parameter description: this feature takes none.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
        "WordPenaltyFsa","","1 per target word"
        ,param,verbose);
  }

  // The parameter string is accepted but not used.
  // No explicit state setup is performed: per the original author's note, a
  // state size of 0 bytes and empty start / h_start are already the defaults,
  // so only Init() is called.
  WordPenaltyFsa(std::string const& /* param */) {
    Init();
  }

  // Scan one target word. Original contract note: move from state to
  // next_state after seeing word w; state and next_state may be the same
  // memory. This feature keeps no state, so neither pointer is touched and
  // every word contributes a value of 1.
  Featval Scan1(WordID /* w */,void const* /* state */,void * /* next_state */) const {
    return 1;
  }
};
// WordPenaltyFsa wrapped in the FeatureFunctionFromFsa template
// (presumably declared in ff_from_fsa.h, included above -- confirm).
typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
// Example stateful (bigram) FSA feature: fires 1 for each target word that is
// longer (in chars) than the word before it. The state is one int holding the
// previous word's length.
// Known issue (original author's note): appears to be buggy right now -- give
// it a bonus weight (-) and it overstates how many.
struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
  typedef FsaFeatureFunctionBase<LongerThanPrev> Base;

  // Help/usage text assembled by FeatureFunction::usage_helper; no parameters.
  static std::string usage(bool param,bool verbose) {
    return FeatureFunction::usage_helper(
        "LongerThanPrev",
        "",
        "stupid example stateful (bigram) feature: 1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
        param,verbose);
  }

  // View a raw state buffer as the int it stores (writable form).
  static inline int &state(void *st) {
    return *static_cast<int*>(st);
  }

  // View a raw state buffer as the int it stores (read-only form).
  static inline int state(void const* st) {
    return *static_cast<int const*>(st);
  }

  // Length of the string that TD::Convert yields for w, narrowed to int.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }

  // Bigram feature: the state depends on one previous word.
  int markov_order() const { return 1; }

  // The parameter string is accepted but not used.
  // Base(sizeof(int)) is relied on to size the state and the start / h_start
  // buffers (original note: "all this is done in constructor already"), so
  // only the initial values are written here.
  LongerThanPrev(std::string const& /* param */) : Base(sizeof(int)/* ,singleton_sentence(TD::se) */) {
    Init();
    // NOTE(review): with 999999 as the start state the first word can never
    // count as "longer", whereas the usage text says <s> is considered 3 chars
    // long -- confirm which one is intended.
    state(start.begin())=999999;
    state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
  }

  // Scan one target word: read the previous length from `from`, store this
  // word's length into next_state, and return 1 if the word is strictly
  // longer than the previous one, else 0. The previous length is read before
  // next_state is written.
  Featval Scan1(WordID w,void const* from,void *next_state) const {
    int prevlen=state(from);
    int len=wordlen(w);
    state(next_state)=len;
    return len>prevlen ? 1 : 0;
  }
};
// Another sample bigram feature, this time built on FsaTypedBase. Per the
// original note, the base type exposes the stateful type, defines
// markov_order 1, and uses state size = sizeof(State).
// Known issue (original author's note): also buggy right now -- give it a
// bonus weight (-) and it overstates how many.
struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
  typedef FsaTypedBase<int,ShorterThanPrev> Base;

  // Help/usage text assembled by FeatureFunction::usage_helper; no parameters.
  static std::string usage(bool param, bool verbose) {
    return FeatureFunction::usage_helper(
        "ShorterThanPrev",
        "",
        "stupid example stateful (bigram) feature: 1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
        param,
        verbose);
  }

  // Length of the string that TD::Convert yields for w, as an int.
  static inline int wordlen(WordID w) {
    return static_cast<int>(std::strlen(TD::Convert(w)));
  }

  // The parameter string is accepted but not used.
  // Base(-1, 4): per the original note the arguments are start and h_start
  // (a third, end_phrase, is commented out). The 4 is an estimate: anything
  // <4 chars is usually shorter than previous.
  ShorterThanPrev(std::string const& /* param */)
      : Base(-1, 4 /* ,singleton_sentence(TD::se) */)
  {
    Init();
  }

  /* Alternative the original author considered instead of ScanT:
     Featval ScanT1(WordID w,int prevlen,int &len) const;
  */

  // Scan one target word: write its length to the out-parameter len (the
  // original author calls this an "evil anti-google" int& out-param) and add
  // 1 to feature fid_ when the word is strictly shorter than prevlen.
  void ScanT(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,int prevlen,int &len,FeatureVector *features) const {
    const int cur = wordlen(w);
    len = cur;
    if (cur >= prevlen) {
      return; // not shorter: nothing fires
    }
    features->add_value(fid_, 1);
  }
};
#endif
|