From 38a5bee71f6b49515cd105a9467ff602ff9dee64 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 13 Sep 2011 13:25:46 +0100
Subject: optional support for doing perfect hashing of feature strings to save lots of memory

---
 utils/Makefile.am | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'utils/Makefile.am')

diff --git a/utils/Makefile.am b/utils/Makefile.am
index 94f9be30..c50747bf 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,5 +1,5 @@
-noinst_PROGRAMS = ts
-TESTS = ts
+noinst_PROGRAMS = ts phmt
+TESTS = ts phmt
 
 if HAVE_GTEST
 noinst_PROGRAMS += \
@@ -27,6 +27,11 @@ libutils_a_SOURCES = \
   verbose.cc \
   weights.cc
 
+if HAVE_CMPH
+  libutils_a_SOURCES += perfect_hash.cc
+endif
+
+phmt_SOURCES = phmt.cc
 ts_SOURCES = ts.cc
 dict_test_SOURCES = dict_test.cc
 dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
-- 
cgit v1.2.3

From 4d87d0edc375a9a7bedddb22d075b6484daf0bf6 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 13 Sep 2011 20:16:17 +0100
Subject: tool to reconstruct text weights from a hash function, key file, and (binary) weights file

---
 utils/Makefile.am            |  5 ++++
 utils/reconstruct_weights.cc | 68 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 utils/reconstruct_weights.cc

(limited to 'utils/Makefile.am')

diff --git a/utils/Makefile.am b/utils/Makefile.am
index c50747bf..df667655 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,3 +1,6 @@
+
+bin_PROGRAMS = reconstruct_weights
+
 noinst_PROGRAMS = ts phmt
 TESTS = ts phmt
 
@@ -11,6 +14,8 @@ noinst_PROGRAMS += \
 TESTS += small_vector_test logval_test weights_test dict_test
 endif
 
+reconstruct_weights_SOURCES = reconstruct_weights.cc
+
 noinst_LIBRARIES = libutils.a
 
 libutils_a_SOURCES = \
diff --git a/utils/reconstruct_weights.cc b/utils/reconstruct_weights.cc
new file mode 100644
index 00000000..d32e4f67
--- /dev/null
+++ b/utils/reconstruct_weights.cc
@@ -0,0 +1,68 @@
+#include <iostream>
+#include <vector>
+#include <cassert>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+// Parses argv (plus an optional --config file) into *conf; prints usage and returns false if --help is given or any of -w, -k, -h is missing.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("keys,k",po::value<string>(),"Keys file (list of features with dummy value at start)")
+        ("cmph_perfect_hash_file,h",po::value<string>(),"cmph perfect hash function file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,?", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  // The command line is stored before the config file, so values given on the command line are the ones kept.
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  // All three inputs are mandatory; a request for help is reported to the caller the same way as a missing input.
+  if (conf->count("help") || !conf->count("cmph_perfect_hash_file") || !conf->count("weights") || !conf->count("keys")) {
+    cerr << "Generate a text format weights file. Options -w -k and -h are required.\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+// Writes "<key> <weight>" to stdout for every line of the keys file after the first; exits non-zero on bad usage or when the keys outnumber the weights.
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;  // was `return false`, i.e. exit status 0, which reported success to the caller
+
+  FD::EnableHash(conf["cmph_perfect_hash_file"].as<string>());
+
+  // load weights
+  vector<weight_t> weights;  // NOTE(review): element type was lost in extraction; weight_t assumed -- confirm against weights.h
+  Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+  // The key on (1-based) line lc of the keys file is paired with weights[lc - 1].
+  ReadFile rf(conf["keys"].as<string>());
+  istream& in = *rf.stream();
+  string key;
+  size_t lc = 0;
+  while(getline(in, key)) {
+    ++lc;
+    if (lc == 1) continue;  // first line is the dummy entry described in the --keys help text
+    if (lc > weights.size()) { cerr << "Keys file has more entries than the weights file (line " << lc << ")\n"; return 1; }  // explicit check: an assert vanishes under NDEBUG
+    cout << key << " " << weights[lc - 1] << '\n';
+  }
+
+  return 0;
+}
+
-- 
cgit v1.2.3