summaryrefslogtreecommitdiff
path: root/decoder/csplit.h
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
committerChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
commit851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree8c68ee77205badc056b8ab5b332e67e3e98017df /decoder/csplit.h
parentdc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'decoder/csplit.h')
-rw-r--r--decoder/csplit.h30
1 files changed, 30 insertions, 0 deletions
diff --git a/decoder/csplit.h b/decoder/csplit.h
new file mode 100644
index 00000000..ce6295c1
--- /dev/null
+++ b/decoder/csplit.h
@@ -0,0 +1,30 @@
+#ifndef _CSPLIT_H_
+#define _CSPLIT_H_
+
+#include "translator.h"
+#include "lattice.h"
+
+// This "translator" takes single words (with NO SPACES) and segments
+// them using the approach described in:
+//
+// C. Dyer. (2009) Using a maximum entropy model to build segmentation
+// lattices for MT. In Proceedings of NAACL HLT 2009.
+// Note: an extra word-space marker (#) is inserted at the left edge of
+// the forest!
+struct CompoundSplitImpl;  // forward declaration only; the definition is not in this header
+struct CompoundSplit : public Translator {
+ CompoundSplit(const boost::program_options::variables_map& conf);  // conf: parsed program options
+ bool Translate(const std::string& input,  // input: a single word containing no spaces
+ SentenceMetadata* smeta,
+ const std::vector<double>& weights,
+ Hypergraph* forest);  // forest: receives the segmentation forest; NOTE(review): meaning of the bool result is not visible here -- confirm in the implementation
+
+ // Given a forest generated by CompoundSplit::Translate, find the
+ // edge representing the unsegmented (full-word) form of the input.
+ static int GetFullWordEdgeIndex(const Hypergraph& forest);
+
+ private:
+ boost::shared_ptr<CompoundSplitImpl> pimpl_;  // handle to the implementation object, whose type is opaque in this header
+};
+
+#endif