summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2010-12-23 20:22:45 -0600
committerChris Dyer <cdyer@cs.cmu.edu>2010-12-23 20:22:45 -0600
commit0f32c8d6fd42a0fff61a438dff41ddbb4a195b7e (patch)
treecc096d0f3865f514e8e70bbe24c7987c196d9698
parentd4907ddee2012dce728bd1a6eb4e6cad452a54b2 (diff)
non-latin character detector
-rw-r--r--decoder/Makefile.am1
-rw-r--r--decoder/cdec_ff.cc2
-rw-r--r--decoder/ff_charset.cc42
-rw-r--r--decoder/ff_charset.h26
4 files changed, 71 insertions, 0 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 633542f0..be04fb31 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -58,6 +58,7 @@ libcdec_a_SOURCES = \
trule.cc \
ff.cc \
ff_wordset.cc \
+ ff_charset.cc \
ff_lm.cc \
ff_klm.cc \
ff_ruleshape.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 686905ad..a12b532f 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -12,6 +12,7 @@
#include "ff_lm_fsa.h"
#include "ff_sample_fsa.h"
#include "ff_register.h"
+#include "ff_charset.h"
#include "ff_wordset.h"
#ifdef HAVE_GLC
@@ -51,6 +52,7 @@ void register_feature_functions() {
ff_registry.Register("KLanguageModel_Sorted", new FFFactory<KLanguageModel<lm::ngram::SortedModel> >());
ff_registry.Register("KLanguageModel_Trie", new FFFactory<KLanguageModel<lm::ngram::TrieModel> >());
ff_registry.Register("KLanguageModel_Probing", new FFFactory<KLanguageModel<lm::ngram::ProbingModel> >());
+ ff_registry.Register("NonLatinCount", new FFFactory<NonLatinCount>);
ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);
ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);
diff --git a/decoder/ff_charset.cc b/decoder/ff_charset.cc
new file mode 100644
index 00000000..33afc1a7
--- /dev/null
+++ b/decoder/ff_charset.cc
@@ -0,0 +1,42 @@
+#include "ff_charset.h"
+
+#include "fdict.h"
+#include "stringlib.h"
+
+using namespace std;
+
+// Constructs the feature function and resolves the id of the feature named
+// "NonLatinCount" once, so that it does not have to be looked up per edge.
+// `param` is accepted but not used by this constructor.
+NonLatinCount::NonLatinCount(const string& param)
+    : FeatureFunction(),
+      fid_(FD::Convert("NonLatinCount")) {
+}
+
+// Returns true if the NUL-terminated string `word` contains at least one
+// character for which UTF8Len() reports a length other than one byte.
+//
+// Assuming UTF8Len() yields the byte length of the UTF-8 sequence introduced
+// by its argument, this means "contains anything outside ASCII"; despite the
+// function's name, multi-byte Latin letters (e.g. accented ones) count too.
+bool ContainsNonLatin(const char* word) {
+  for (const char* p = word; *p; ++p) {
+    const int size = UTF8Len(*p);
+    // A length greater than one is a multi-byte character. A non-positive
+    // length (should UTF8Len() report one for a malformed lead byte) is not
+    // plain ASCII either, and returning here guarantees the scan terminates:
+    // advancing the cursor by such a length would never reach the terminator.
+    if (size != 1) return true;
+  }
+  return false;
+}
+
+// Counts the words in edge.rule_->e() (presumably the rule's target side --
+// TODO confirm) for which ContainsNonLatin() is true and, when that count is
+// non-zero, stores it in `features` under fid_.
+//
+// Only `edge` and `features` are used; smeta, ant_contexts,
+// estimated_features and context are ignored.
+//
+// Per-word results are memoized in is_non_latin_ so each word id is
+// classified at most once.
+// NOTE(review): that cache is mutated from this const method and no locking
+// is visible here -- confirm instances are never shared across threads.
+void NonLatinCount::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                          const Hypergraph::Edge& edge,
+                                          const std::vector<const void*>& ant_contexts,
+                                          FeatureVector* features,
+                                          FeatureVector* estimated_features,
+                                          void* context) const {
+  typedef map<WordID, bool> Cache;
+  const vector<WordID>& e = edge.rule_->e();
+  int count = 0;
+  for (vector<WordID>::const_iterator w = e.begin(); w != e.end(); ++w) {
+    // Non-positive ids are skipped (presumably they are variable
+    // placeholders rather than real words -- TODO confirm).
+    if (*w <= 0) continue;
+    // One lookup instead of find() followed by operator[]: insert() returns
+    // the existing entry if the word is already cached, otherwise it creates
+    // a placeholder that is filled in immediately below.
+    const std::pair<Cache::iterator, bool> slot =
+        is_non_latin_.insert(Cache::value_type(*w, false));
+    if (slot.second)
+      slot.first->second = ContainsNonLatin(TD::Convert(*w));
+    if (slot.first->second)
+      ++count;
+  }
+  if (count) features->set_value(fid_, count);
+}
+
diff --git a/decoder/ff_charset.h b/decoder/ff_charset.h
new file mode 100644
index 00000000..b1ad537e
--- /dev/null
+++ b/decoder/ff_charset.h
@@ -0,0 +1,26 @@
+#ifndef _FFCHARSET_H_
+#define _FFCHARSET_H_
+
+#include <string>
+#include <map>
+#include "ff.h"
+#include "hg.h"
+
+class SentenceMetadata;
+
+// Feature function that, for each hypergraph edge, reports how many words of
+// the edge's rule are flagged by ContainsNonLatin() (see ff_charset.cc). The
+// value is stored under the feature named "NonLatinCount" and is only set
+// when the count is non-zero.
+class NonLatinCount : public FeatureFunction {
+ public:
+ // `param` is the feature's configuration string; the implementation in
+ // ff_charset.cc does not use it.
+ NonLatinCount(const std::string& param);
+ protected:
+ // Adds the per-edge count to `features`. The implementation reads only
+ // `edge` and writes only `features`; the remaining arguments are ignored.
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ FeatureVector* features,
+ FeatureVector* estimated_features,
+ void* context) const;
+ private:
+ // Memoized ContainsNonLatin() result per word id. Declared mutable so the
+ // const TraversalFeaturesImpl() can fill it lazily.
+ // NOTE(review): no synchronization is visible -- confirm single-threaded use.
+ mutable std::map<WordID, bool> is_non_latin_;
+ // Id of the "NonLatinCount" feature, resolved once in the constructor.
+ const int fid_;
+};
+
+#endif