diff options
author | Patrick Simianer <p@simianer.de> | 2011-09-04 23:40:44 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-23 19:13:58 +0200 |
commit | 809361a8e182c5e47ffe569db7671bc8c7224867 (patch) | |
tree | b0c34f494be231962fdf7ec70eb739af64be0004 /dtrain/scfg/features/count/featurecount.cc | |
parent | aa5f96417ff81408b15b54aab35a3c16b845adf8 (diff) |
minor updates, fixes, kbest filtering switch
Diffstat (limited to 'dtrain/scfg/features/count/featurecount.cc')
-rw-r--r-- | dtrain/scfg/features/count/featurecount.cc | 49 |
1 files changed, 49 insertions, 0 deletions
diff --git a/dtrain/scfg/features/count/featurecount.cc b/dtrain/scfg/features/count/featurecount.cc new file mode 100644 index 00000000..db31885c --- /dev/null +++ b/dtrain/scfg/features/count/featurecount.cc @@ -0,0 +1,49 @@ +#include "featurecount.hh" + + +void +FeatureCountMapper::map( HadoopPipes::MapContext &context ) +{ + string line = context.getInputValue(); + + // get features substr + size_t i = 0, c = 0, beg = 0, end = 0; + string::iterator it = line.begin(); + string s; + while ( c != 12 ) { + s = *it; + if ( s == "|" ) c += 1; + if ( beg == 0 && c == 9 ) beg = i+2; + if ( c == 12 ) end = i-beg-3; + it++; + i++; + } + string sub = line.substr( beg, end ); + + // emit feature:1 + vector<string> f_tok; + boost::split( f_tok, sub, boost::is_any_of(" ") ); + vector<string>::iterator f; + for ( f = f_tok.begin(); f != f_tok.end(); f++ ) { + if ( f->find("=1") != string::npos ) context.emit(*f, "1"); + } +} + +void +FeatureCountReducer::reduce( HadoopPipes::ReduceContext &context ) +{ + size_t sum = 0; + while ( context.nextValue() ) sum += HadoopUtils::toInt( context.getInputValue() ); + context.emit( context.getInputKey(), HadoopUtils::toString(sum) ); +} + + +int +main( int argc, char * argv[] ) +{ + HadoopPipes::TemplateFactory2<FeatureCountMapper, + FeatureCountReducer> factory; + + return HadoopPipes::runTask(factory); +} + |