summary refs log tree commit diff
path: root/utils
diff options
context:
space:
mode:
author Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
committer Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
commit 7468e8d85e99b4619442c7afaf4a0d92870111bb (patch)
tree a6f17da7c69048c8900260b5490bb9d8611be3bb /utils
parent b6dd5a683db9dda2d634dd2fdb76606819594901 (diff)
parent 1a79175f9a101d46cf27ca921213d5dd9300518f (diff)
Merge with upstream
Diffstat (limited to 'utils')
-rw-r--r-- utils/Makefile.am 2
-rw-r--r-- utils/alias_sampler.h 4
-rw-r--r-- utils/alignment_io.h 4
-rw-r--r-- utils/b64featvector.cc 55
-rw-r--r-- utils/b64featvector.h 12
-rw-r--r-- utils/b64tools.h 4
-rw-r--r-- utils/corpus_tools.h 4
-rw-r--r-- utils/exp_semiring.h 4
-rw-r--r-- utils/fast_sparse_vector.h 4
-rw-r--r-- utils/fdict.h 4
-rw-r--r-- utils/feature_vector.h 4
-rw-r--r-- utils/filelib.h 4
-rw-r--r-- utils/kernel_string_subseq.h 4
-rw-r--r-- utils/m.h 4
-rw-r--r-- utils/murmur_hash3.h 4
-rw-r--r-- utils/perfect_hash.h 4
-rw-r--r-- utils/prob.h 4
-rw-r--r-- utils/small_vector.h 20
-rw-r--r-- utils/small_vector_test.cc 30
-rw-r--r-- utils/sparse_vector.h 4
-rw-r--r-- utils/star.h 4
-rw-r--r-- utils/sv_test.cc 31
-rw-r--r-- utils/tdict.h 4
-rw-r--r-- utils/timing_stats.h 4
-rw-r--r-- utils/verbose.h 4
-rw-r--r-- utils/weights.h 4
-rw-r--r-- utils/wordid.h 4
27 files changed, 190 insertions, 44 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index e0221e64..dd74ddc0 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -22,6 +22,7 @@ libutils_a_SOURCES = \
alias_sampler.h \
alignment_io.h \
array2d.h \
+ b64featvector.h \
b64tools.h \
batched_append.h \
city.h \
@@ -72,6 +73,7 @@ libutils_a_SOURCES = \
fast_lexical_cast.hpp \
intrusive_refcount.hpp \
alignment_io.cc \
+ b64featvector.cc \
b64tools.cc \
corpus_tools.cc \
dict.cc \
diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
index 81541f7a..0f9d3f6d 100644
--- a/utils/alias_sampler.h
+++ b/utils/alias_sampler.h
@@ -1,5 +1,5 @@
-#ifndef _ALIAS_SAMPLER_H_
-#define _ALIAS_SAMPLER_H_
+#ifndef ALIAS_SAMPLER_H_
+#define ALIAS_SAMPLER_H_
#include <vector>
#include <limits>
diff --git a/utils/alignment_io.h b/utils/alignment_io.h
index 63fb916b..ec70688e 100644
--- a/utils/alignment_io.h
+++ b/utils/alignment_io.h
@@ -1,5 +1,5 @@
-#ifndef _ALIGNMENT_IO_H_
-#define _ALIGNMENT_IO_H_
+#ifndef ALIGNMENT_IO_H_
+#define ALIGNMENT_IO_H_
#include <string>
#include <iostream>
diff --git a/utils/b64featvector.cc b/utils/b64featvector.cc
new file mode 100644
index 00000000..c7d08b29
--- /dev/null
+++ b/utils/b64featvector.cc
@@ -0,0 +1,55 @@
+#include "b64featvector.h"
+
+#include <cassert>
+#include <cstring>
+#include <sstream>
+#include <boost/scoped_array.hpp>
+#include "b64tools.h"
+#include "fdict.h"
+
+using namespace std;
+
+// Appends one feature entry to *output: the feature name including its
+// terminating NUL byte, followed by the raw in-memory bytes of the weight
+// (sizeof(weight_t) bytes, no byte-order conversion).
+static inline void EncodeFeatureWeight(const string &featname, weight_t weight,
+                                       ostream *output) {
+  // c_str() always yields a NUL at index size(); data() only guarantees that
+  // from C++11 on, so writing size() + 1 bytes from data() could over-read.
+  output->write(featname.c_str(), featname.size() + 1);
+  output->write(reinterpret_cast<const char *>(&weight), sizeof(weight_t));
+}
+
+// Serializes the non-zero entries of vec and returns them as a base64 string.
+// Each entry is emitted through EncodeFeatureWeight using the feature's name
+// as looked up by FD::Convert.
+string EncodeFeatureVector(const SparseVector<weight_t> &vec) {
+  typedef SparseVector<weight_t>::const_iterator EntryIter;
+
+  // First collect the binary payload for every entry whose weight is non-zero.
+  ostringstream raw;
+  for (EntryIter entry = vec.begin(); entry != vec.end(); ++entry) {
+    if (entry->second == 0) continue;
+    EncodeFeatureWeight(FD::Convert(entry->first), entry->second, &raw);
+  }
+  const string payload(raw.str());
+
+  // Then base64-encode that payload into a second stream and hand it back.
+  ostringstream encoded;
+  B64::b64encode(payload.data(), payload.size(), &encoded);
+  return encoded.str();
+}
+
+// Rebuilds *vec from a base64 string holding a sequence of entries, each a
+// NUL-terminated feature name followed by sizeof(weight_t) raw weight bytes.
+//
+// *vec is always cleared first; an empty input leaves it empty. Parsing stops
+// at the first empty feature name (trailing padding) or at the first entry
+// that does not fit inside the decoded buffer, so malformed input can never
+// cause a read past the end of the buffer.
+void DecodeFeatureVector(const string &data, SparseVector<weight_t> *vec) {
+  vec->clear();
+  if (data.empty()) return;
+  // Decode data. The buffer is value-initialized so that any byte the decoder
+  // leaves untouched reads as NUL instead of indeterminate memory.
+  const size_t b64_len = data.size();
+  const size_t len = b64_len / 4 * 3;
+  boost::scoped_array<char> buf(new char[len]());
+  const bool res =
+      B64::b64decode(reinterpret_cast<const unsigned char *>(data.data()),
+                     b64_len, buf.get(), len);
+  assert(res);
+  if (!res) return;  // With NDEBUG the assert vanishes; never parse a bad buffer.
+  // Apply updates
+  size_t cur = 0;
+  while (cur < len) {
+    const char *name_begin = buf.get() + cur;
+    // Look for the name terminator only within the bytes that remain.
+    const void *name_end = std::memchr(name_begin, '\0', len - cur);
+    if (!name_end) break;  // Unterminated name: truncated or corrupt input
+    const size_t name_len = static_cast<const char *>(name_end) - name_begin;
+    if (name_len == 0) break;  // Encountered trailing \0
+    // name_end lies inside the buffer, so weight_pos <= len and the
+    // subtraction below cannot wrap around.
+    const size_t weight_pos = cur + name_len + 1;
+    if (len - weight_pos < sizeof(weight_t)) break;  // Weight bytes cut short
+    const string feat_name(name_begin, name_len);
+    const int feat_id = FD::Convert(feat_name);
+    // The weight sits at an arbitrary byte offset; copy it out rather than
+    // dereferencing a misaligned weight_t pointer.
+    weight_t feat_delta;
+    std::memcpy(&feat_delta, buf.get() + weight_pos, sizeof(weight_t));
+    (*vec)[feat_id] = feat_delta;
+    cur = weight_pos + sizeof(weight_t);
+  }
+}
diff --git a/utils/b64featvector.h b/utils/b64featvector.h
new file mode 100644
index 00000000..6ac04d44
--- /dev/null
+++ b/utils/b64featvector.h
@@ -0,0 +1,12 @@
+#ifndef B64FEATVECTOR_H_
+#define B64FEATVECTOR_H_
+
+#include <string>
+
+#include "sparse_vector.h"
+#include "weights.h"
+
+// Returns a base64 string encoding every non-zero entry of the vector as a
+// NUL-terminated feature name followed by the raw in-memory bytes of its
+// weight_t value. Because the weight bytes are written as-is, the encoding
+// and decoding sides must agree on the representation of weight_t.
+std::string EncodeFeatureVector(const SparseVector<weight_t> &);
+
+// Clears the output vector, then sets the weight of each feature decoded from
+// a string produced by EncodeFeatureVector. An empty string yields an empty
+// vector.
+void DecodeFeatureVector(const std::string &, SparseVector<weight_t> *);
+
+#endif  // B64FEATVECTOR_H_
diff --git a/utils/b64tools.h b/utils/b64tools.h
index c821fc8f..130a9102 100644
--- a/utils/b64tools.h
+++ b/utils/b64tools.h
@@ -1,5 +1,5 @@
-#ifndef _B64_TOOLS_H_
-#define _B64_TOOLS_H_
+#ifndef B64_TOOLS_H_
+#define B64_TOOLS_H_
namespace B64 {
bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize);
diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h
index f6699d87..3ccaf6ef 100644
--- a/utils/corpus_tools.h
+++ b/utils/corpus_tools.h
@@ -1,5 +1,5 @@
-#ifndef _CORPUS_TOOLS_H_
-#define _CORPUS_TOOLS_H_
+#ifndef CORPUS_TOOLS_H_
+#define CORPUS_TOOLS_H_
#include <string>
#include <set>
diff --git a/utils/exp_semiring.h b/utils/exp_semiring.h
index 26a22071..164286e3 100644
--- a/utils/exp_semiring.h
+++ b/utils/exp_semiring.h
@@ -1,5 +1,5 @@
-#ifndef _EXP_SEMIRING_H_
-#define _EXP_SEMIRING_H_
+#ifndef EXP_SEMIRING_H_
+#define EXP_SEMIRING_H_
#include <iostream>
#include "star.h"
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 6e2a77cd..1e0ab428 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -1,5 +1,5 @@
-#ifndef _FAST_SPARSE_VECTOR_H_
-#define _FAST_SPARSE_VECTOR_H_
+#ifndef FAST_SPARSE_VECTOR_H_
+#define FAST_SPARSE_VECTOR_H_
// FastSparseVector<T> is a integer indexed unordered map that supports very fast
// (mathematical) vector operations when the sizes are very small, and reasonably
diff --git a/utils/fdict.h b/utils/fdict.h
index eb853fb2..94763890 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -1,5 +1,5 @@
-#ifndef _FDICT_H_
-#define _FDICT_H_
+#ifndef FDICT_H_
+#define FDICT_H_
#ifdef HAVE_CONFIG_H
#include "config.h"
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index a7b61a66..bf77b5ac 100644
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
@@ -1,5 +1,5 @@
-#ifndef _FEATURE_VECTOR_H_
-#define _FEATURE_VECTOR_H_
+#ifndef FEATURE_VECTOR_H_
+#define FEATURE_VECTOR_H_
#include <vector>
#include "sparse_vector.h"
diff --git a/utils/filelib.h b/utils/filelib.h
index 4fa69760..90620d05 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -1,5 +1,5 @@
-#ifndef _FILELIB_H_
-#define _FILELIB_H_
+#ifndef FILELIB_H_
+#define FILELIB_H_
#include <cassert>
#include <string>
diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h
index 516e8b89..00ee7da7 100644
--- a/utils/kernel_string_subseq.h
+++ b/utils/kernel_string_subseq.h
@@ -1,5 +1,5 @@
-#ifndef _KERNEL_STRING_SUBSEQ_H_
-#define _KERNEL_STRING_SUBSEQ_H_
+#ifndef KERNEL_STRING_SUBSEQ_H_
+#define KERNEL_STRING_SUBSEQ_H_
#include <vector>
#include <cmath>
diff --git a/utils/m.h b/utils/m.h
index dc881b36..bd82c305 100644
--- a/utils/m.h
+++ b/utils/m.h
@@ -1,5 +1,5 @@
-#ifndef _M_H_
-#define _M_H_
+#ifndef M_H_HEADER_
+#define M_H_HEADER_
#include <cassert>
#include <cmath>
diff --git a/utils/murmur_hash3.h b/utils/murmur_hash3.h
index a125d775..e8a8b10b 100644
--- a/utils/murmur_hash3.h
+++ b/utils/murmur_hash3.h
@@ -2,8 +2,8 @@
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
-#ifndef _MURMURHASH3_H_
-#define _MURMURHASH3_H_
+#ifndef MURMURHASH3_H_
+#define MURMURHASH3_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h
index 29ea48a9..8c12c9f0 100644
--- a/utils/perfect_hash.h
+++ b/utils/perfect_hash.h
@@ -1,5 +1,5 @@
-#ifndef _PERFECT_HASH_MAP_H_
-#define _PERFECT_HASH_MAP_H_
+#ifndef PERFECT_HASH_MAP_H_
+#define PERFECT_HASH_MAP_H_
#include <vector>
#include <boost/utility.hpp>
diff --git a/utils/prob.h b/utils/prob.h
index bc297870..32ba9a86 100644
--- a/utils/prob.h
+++ b/utils/prob.h
@@ -1,5 +1,5 @@
-#ifndef _PROB_H_
-#define _PROB_H_
+#ifndef PROB_H_
+#define PROB_H_
#include "logval.h"
diff --git a/utils/small_vector.h b/utils/small_vector.h
index 280ab72c..f16bc898 100644
--- a/utils/small_vector.h
+++ b/utils/small_vector.h
@@ -1,5 +1,5 @@
-#ifndef _SMALL_VECTOR_H_
-#define _SMALL_VECTOR_H_
+#ifndef SMALL_VECTOR_H_
+#define SMALL_VECTOR_H_
/* REQUIRES that T is POD (can be memcpy). won't work (yet) due to union with SMALL_VECTOR_POD==0 - may be possible to handle movable types that have ctor/dtor, by using explicit allocation, ctor/dtor calls. but for now JUST USE THIS FOR no-meaningful ctor/dtor POD types.
@@ -15,6 +15,7 @@
#include <new>
#include <stdint.h>
#include <boost/functional/hash.hpp>
+#include <boost/serialization/map.hpp>
//sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1
@@ -297,6 +298,21 @@ public:
return hash_range(data_.ptr,data_.ptr+size_);
}
+ // Boost.Serialization save half: writes the element count (size_) and then
+ // every element in index order. The class-version argument is unused.
+ template<class Archive>
+ void save(Archive & ar, const unsigned int) const {
+   ar << size_;
+   for (unsigned idx = 0; idx < size_; ++idx) {
+     ar << (*this)[idx];
+   }
+ }
+ // Boost.Serialization load half: reads the element count as a uint16_t,
+ // resizes this vector to that count, then reads each element in index
+ // order. The class-version argument is unused.
+ template<class Archive>
+ void load(Archive & ar, const unsigned int) {
+   uint16_t stored_size;
+   ar >> stored_size;
+   this->resize(stored_size);
+   for (unsigned idx = 0; idx < size_; ++idx) {
+     ar >> (*this)[idx];
+   }
+ }
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
private:
union StorageType {
T vals[SV_MAX];
diff --git a/utils/small_vector_test.cc b/utils/small_vector_test.cc
index a4eb89ae..9e1a148d 100644
--- a/utils/small_vector_test.cc
+++ b/utils/small_vector_test.cc
@@ -3,6 +3,10 @@
#define BOOST_TEST_MODULE svTest
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <string>
+#include <sstream>
#include <iostream>
#include <vector>
@@ -128,3 +132,29 @@ BOOST_AUTO_TEST_CASE(Small) {
cerr << sizeof(SmallVectorInt) << endl;
cerr << sizeof(vector<int>) << endl;
}
+
+// Round-trips a three-element SmallVectorInt through a Boost text archive and
+// checks that the size and every element survive.
+BOOST_AUTO_TEST_CASE(Serialize) {
+  std::string archived;
+  {
+    // Write phase: build the vector and serialize it into a string.
+    SmallVectorInt original;
+    original.push_back(0);
+    original.push_back(1);
+    original.push_back(-2);
+    ostringstream sink;
+    boost::archive::text_oarchive writer(sink);
+    writer << original;
+    archived = sink.str();
+    cerr << archived;
+  }
+  {
+    // Read phase: deserialize into a fresh vector and compare.
+    istringstream source(archived);
+    boost::archive::text_iarchive reader(source);
+    SmallVectorInt v;
+    reader >> v;
+    BOOST_CHECK_EQUAL(v.size(), 3);
+    BOOST_CHECK_EQUAL(v[0], 0);
+    BOOST_CHECK_EQUAL(v[1], 1);
+    BOOST_CHECK_EQUAL(v[2], -2);
+  }
+}
+
+
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index 049151f7..13601376 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -1,5 +1,5 @@
-#ifndef _SPARSE_VECTOR_H_
-#define _SPARSE_VECTOR_H_
+#ifndef SPARSE_VECTOR_H_
+#define SPARSE_VECTOR_H_
#include "fast_sparse_vector.h"
#define SparseVector FastSparseVector
diff --git a/utils/star.h b/utils/star.h
index 21977dc9..01433d12 100644
--- a/utils/star.h
+++ b/utils/star.h
@@ -1,5 +1,5 @@
-#ifndef _STAR_H_
-#define _STAR_H_
+#ifndef STAR_H_
+#define STAR_H_
// star(x) computes the infinite sum x^0 + x^1 + x^2 + ...
diff --git a/utils/sv_test.cc b/utils/sv_test.cc
index 67df8c57..b006e66d 100644
--- a/utils/sv_test.cc
+++ b/utils/sv_test.cc
@@ -1,7 +1,12 @@
#define BOOST_TEST_MODULE WeightsTest
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <sstream>
+#include <string>
#include "sparse_vector.h"
+#include "fdict.h"
using namespace std;
@@ -33,3 +38,29 @@ BOOST_AUTO_TEST_CASE(Division) {
x /= -1;
BOOST_CHECK(x == y);
}
+
+// Serializes a sparse vector, resets the feature dictionary (registering an
+// unrelated string first), then deserializes and checks that the values are
+// still reachable through their feature names.
+BOOST_AUTO_TEST_CASE(Serialization) {
+  string archive_text;
+  FD::dict_.clear();
+  {
+    // Write phase: two named features go into a text archive.
+    SparseVector<double> features;
+    features.set_value(FD::Convert("Feature1"), 1.0);
+    features.set_value(FD::Convert("Pi"), 3.14);
+    ostringstream sink;
+    boost::archive::text_oarchive writer(sink);
+    writer << features;
+    archive_text = sink.str();
+  }
+  // Start from a fresh dictionary holding one unrelated entry before loading.
+  FD::dict_.clear();
+  FD::Convert("SomeNewString");
+  {
+    // Read phase: load into an empty vector and look values up by name.
+    SparseVector<double> x;
+    istringstream source(archive_text);
+    boost::archive::text_iarchive reader(source);
+    reader >> x;
+    cerr << x << endl;
+    BOOST_CHECK_CLOSE(x.get(FD::Convert("Pi")), 3.14, 1e-9);
+    BOOST_CHECK_CLOSE(x.get(FD::Convert("Feature1")), 1.0, 1e-9);
+  }
+}
+
+
diff --git a/utils/tdict.h b/utils/tdict.h
index bb19ecd5..eed33c3a 100644
--- a/utils/tdict.h
+++ b/utils/tdict.h
@@ -1,5 +1,5 @@
-#ifndef _TDICT_H_
-#define _TDICT_H_
+#ifndef TDICT_H_
+#define TDICT_H_
#include <string>
#include <vector>
diff --git a/utils/timing_stats.h b/utils/timing_stats.h
index 0a9f7656..69a1cf4b 100644
--- a/utils/timing_stats.h
+++ b/utils/timing_stats.h
@@ -1,5 +1,5 @@
-#ifndef _TIMING_STATS_H_
-#define _TIMING_STATS_H_
+#ifndef TIMING_STATS_H_
+#define TIMING_STATS_H_
#include <string>
#include <map>
diff --git a/utils/verbose.h b/utils/verbose.h
index 73476383..e39e23cb 100644
--- a/utils/verbose.h
+++ b/utils/verbose.h
@@ -1,5 +1,5 @@
-#ifndef _VERBOSE_H_
-#define _VERBOSE_H_
+#ifndef VERBOSE_H_
+#define VERBOSE_H_
extern bool SILENT;
diff --git a/utils/weights.h b/utils/weights.h
index 920fdd75..0bd4c2d9 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -1,5 +1,5 @@
-#ifndef _WEIGHTS_H_
-#define _WEIGHTS_H_
+#ifndef WEIGHTS_H_
+#define WEIGHTS_H_
#include <string>
#include <vector>
diff --git a/utils/wordid.h b/utils/wordid.h
index 714dcd0b..3aa6cc23 100644
--- a/utils/wordid.h
+++ b/utils/wordid.h
@@ -1,5 +1,5 @@
-#ifndef _WORD_ID_H_
-#define _WORD_ID_H_
+#ifndef WORD_ID_H_
+#define WORD_ID_H_
#include <limits>