summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2014-11-24 00:16:18 -0500
committerChris Dyer <redpony@gmail.com>2014-11-24 00:16:18 -0500
commit41e89b46b19fedfae33637383dd3f055d0f356b5 (patch)
tree8738aed2092c02c1ae3adf9c5dc5828b67ea8df7 /utils
parentaf1cea51086ce9188904235635a3a7288304fb35 (diff)
parent8c0e4a5c1f168a419b3a236a94815c97164bddbc (diff)
Merge pull request #59 from kho/mrmira
Mr.MIRA compatibility
Diffstat (limited to 'utils')
-rw-r--r--utils/Makefile.am3
-rw-r--r--utils/b64featvector.cc55
-rw-r--r--utils/b64featvector.h12
3 files changed, 69 insertions, 1 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 727fa8a5..64f6d433 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -22,6 +22,7 @@ libutils_a_SOURCES = \
alias_sampler.h \
alignment_io.h \
array2d.h \
+ b64featvector.h \
b64tools.h \
batched_append.h \
city.h \
@@ -70,6 +71,7 @@ libutils_a_SOURCES = \
fast_lexical_cast.hpp \
intrusive_refcount.hpp \
alignment_io.cc \
+ b64featvector.cc \
b64tools.cc \
corpus_tools.cc \
dict.cc \
@@ -117,4 +119,3 @@ stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_U
# do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I. -I$(top_srcdir) -DTEST_DATA=\"$(top_srcdir)/utils/test_data\"
################################################################
-
diff --git a/utils/b64featvector.cc b/utils/b64featvector.cc
new file mode 100644
index 00000000..c7d08b29
--- /dev/null
+++ b/utils/b64featvector.cc
@@ -0,0 +1,55 @@
+#include "b64featvector.h"
+
+#include <sstream>
+#include <boost/scoped_array.hpp>
+#include "b64tools.h"
+#include "fdict.h"
+
+using namespace std;
+
+static inline void EncodeFeatureWeight(const string &featname, weight_t weight,
+ ostream *output) {
+ output->write(featname.data(), featname.size() + 1);
+ output->write(reinterpret_cast<char *>(&weight), sizeof(weight_t));
+}
+
+string EncodeFeatureVector(const SparseVector<weight_t> &vec) {
+ string b64;
+ {
+ ostringstream base64_strm;
+ {
+ ostringstream strm;
+ for (SparseVector<weight_t>::const_iterator it = vec.begin();
+ it != vec.end(); ++it)
+ if (it->second != 0)
+ EncodeFeatureWeight(FD::Convert(it->first), it->second, &strm);
+ string data(strm.str());
+ B64::b64encode(data.data(), data.size(), &base64_strm);
+ }
+ b64 = base64_strm.str();
+ }
+ return b64;
+}
+
+void DecodeFeatureVector(const string &data, SparseVector<weight_t> *vec) {
+ vec->clear();
+ if (data.empty()) return;
+ // Decode data
+ size_t b64_len = data.size(), len = b64_len / 4 * 3;
+ boost::scoped_array<char> buf(new char[len]);
+ bool res =
+ B64::b64decode(reinterpret_cast<const unsigned char *>(data.data()),
+ b64_len, buf.get(), len);
+ assert(res);
+ // Apply updates
+ size_t cur = 0;
+ while (cur < len) {
+ string feat_name(buf.get() + cur);
+ if (feat_name.empty()) break; // Encountered trailing \0
+ int feat_id = FD::Convert(feat_name);
+ weight_t feat_delta =
+ *reinterpret_cast<weight_t *>(buf.get() + cur + feat_name.size() + 1);
+ (*vec)[feat_id] = feat_delta;
+ cur += feat_name.size() + 1 + sizeof(weight_t);
+ }
+}
diff --git a/utils/b64featvector.h b/utils/b64featvector.h
new file mode 100644
index 00000000..6ac04d44
--- /dev/null
+++ b/utils/b64featvector.h
@@ -0,0 +1,12 @@
+#ifndef _B64FEATVECTOR_H_
+#define _B64FEATVECTOR_H_
+
+#include <string>
+
+#include "sparse_vector.h"
+#include "weights.h"
+
+std::string EncodeFeatureVector(const SparseVector<weight_t> &);
+void DecodeFeatureVector(const std::string &, SparseVector<weight_t> *);
+
+#endif // _B64FEATVECTOR_H_