#include <iostream> #include <fstream> #include <cassert> #include <cmath> #include <boost/utility.hpp> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> #include "boost/tuple/tuple.hpp" #include "fdict.h" #include "sparse_vector.h" using namespace std; namespace po = boost::program_options; // useful for EM models parameterized by a bunch of multinomials // this converts event counts (returned from cdec as feature expectations) // into different keys and values (which are lists of all the events, // conditioned on the key) for summing and normalization by a reducer void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts") ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value<string>(), "Configuration file") ("help,h", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); if (conf->count("config")) { ifstream config((*conf)["config"].as<string>().c_str()); po::store(po::parse_config_file(config, dconfig_options), *conf); } po::notify(*conf); if (conf->count("help")) { cerr << dcmdline_options << endl; exit(1); } } struct EventMapper { int Map(int fid) { int& cv = map_[fid]; if (!cv) { cv = GetConditioningVariable(fid); } return cv; } void Clear() { map_.clear(); } protected: virtual int GetConditioningVariable(int fid) const = 0; private: map<int, int> map_; }; struct LexAlignEventMapper : public EventMapper { protected: virtual int GetConditioningVariable(int fid) const { const string& str = FD::Convert(fid); size_t pos = str.rfind("_"); if (pos == string::npos || pos == 0 || pos >= str.size() - 1) { cerr << "Bad feature for EM adapter: " << str << endl; abort(); } return FD::Convert(str.substr(0, pos)); } }; int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); const bool use_b64 = conf["format"].as<string>() == "b64"; const int buffer_size = conf["buffer_size"].as<int>(); const string s_obj = "**OBJ**"; // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; // 0<TAB>**OBJ**=1.1;Feat1=1.0; EventMapper* event_mapper = new LexAlignEventMapper; map<int, SparseVector<double> > counts; size_t total = 0; while(cin) { string line; getline(cin, line); if (line.empty()) continue; int feat; double val; size_t i = line.find("\t"); assert(i != string::npos); ++i; SparseVector<double> g; double obj = 0; if (use_b64) { if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { cerr << "B64 decoder returned error, skipping!\n"; continue; } } else { // text encoding - your counts will not be accurate! while (i < line.size()) { size_t start = i; while (line[i] != '=' && i < line.size()) ++i; if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } string fname = line.substr(start, i - start); if (fname == s_obj) { feat = -1; } else { feat = FD::Convert(line.substr(start, i - start)); } ++i; start = i; while (line[i] != ';' && i < line.size()) ++i; if (i - start == 0) continue; val = atof(line.substr(start, i - start).c_str()); ++i; if (feat == -1) { obj = val; } else { g.set_value(feat, val); } } } //cerr << "OBJ: " << obj << endl; const SparseVector<double>& cg = g; for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { const int cond_var = event_mapper->Map(it->first); SparseVector<double>& cond_counts = counts[cond_var]; int delta = cond_counts.size(); cond_counts.add_value(it->first, it->second); delta = cond_counts.size() - delta; total += delta; } if (total > buffer_size) { for (map<int, SparseVector<double> >::iterator it = counts.begin(); it != counts.end(); ++it) { const SparseVector<double>& cc = it->second; cout << FD::Convert(it->first) << '\t'; if (use_b64) { B64::Encode(0.0, cc, &cout); } else { abort(); } cout << endl; } cout << flush; total = 0; counts.clear(); } } return 0; }