non-latin character detector

author: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-23 20:22:45 -0600
committer: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-23 20:22:45 -0600
commit: 0f32c8d6fd42a0fff61a438dff41ddbb4a195b7e (patch)
tree: cc096d0f3865f514e8e70bbe24c7987c196d9698
parent: d4907ddee2012dce728bd1a6eb4e6cad452a54b2 (diff)
4 files changed, 71 insertions, 0 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 633542f0..be04fb31 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -58,6 +58,7 @@ libcdec_a_SOURCES = \
   trule.cc \
   ff.cc \
   ff_wordset.cc \
+  ff_charset.cc \
   ff_lm.cc \
   ff_klm.cc \
   ff_ruleshape.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 686905ad..a12b532f 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -12,6 +12,7 @@
 #include "ff_lm_fsa.h"
 #include "ff_sample_fsa.h"
 #include "ff_register.h"
+#include "ff_charset.h"
 #include "ff_wordset.h"
 
 #ifdef HAVE_GLC
@@ -51,6 +52,7 @@ void register_feature_functions() {
   ff_registry.Register("KLanguageModel_Sorted", new FFFactory<KLanguageModel<lm::ngram::SortedModel> >());
   ff_registry.Register("KLanguageModel_Trie", new FFFactory<KLanguageModel<lm::ngram::TrieModel> >());
   ff_registry.Register("KLanguageModel_Probing", new FFFactory<KLanguageModel<lm::ngram::ProbingModel> >());
+  ff_registry.Register("NonLatinCount", new FFFactory<NonLatinCount>);
   ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);
   ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
   ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);
diff --git a/decoder/ff_charset.cc b/decoder/ff_charset.cc
new file mode 100644
index 00000000..33afc1a7
--- /dev/null
+++ b/decoder/ff_charset.cc
@@ -0,0 +1,42 @@
+#include "ff_charset.h"
+
+#include "fdict.h"
+#include "stringlib.h"
+
+using namespace std;
+
+NonLatinCount::NonLatinCount(const string& param) : FeatureFunction(), fid_(FD::Convert("NonLatinCount")) {}  // param is not used; fid_ is the id FD::Convert returns for "NonLatinCount"
+
+// Returns true iff the NUL-terminated `word` contains a byte >= 0x80, i.e. any
+// non-ASCII content (part of a multi-byte UTF-8 sequence, or a malformed byte).
+// Each byte is tested directly, so the scan always advances by exactly one and
+// cannot stall, whatever a length helper would report for a bad lead byte.
+bool ContainsNonLatin(const char* word) {
+  for (const char* p = word; *p; ++p)
+    if (static_cast<unsigned char>(*p) >= 0x80) return true;
+  return false;
+}
+
+// Counts the positive word ids in edge.rule_->e() whose string form is flagged
+// by ContainsNonLatin() and, if that count is non-zero, stores it in `features`
+// under fid_. Per-id verdicts are memoized in the mutable is_non_latin_ map.
+void NonLatinCount::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                          const Hypergraph::Edge& edge,
+                                          const std::vector<const void*>& ant_contexts,
+                                          FeatureVector* features,
+                                          FeatureVector* estimated_features,
+                                          void* context) const {
+  const vector<WordID>& rule_words = edge.rule_->e();
+  int non_latin = 0;
+  for (vector<WordID>::const_iterator w = rule_words.begin(); w != rule_words.end(); ++w) {
+    if (*w <= 0) continue;  // ids that are not > 0 are never looked up
+    map<WordID, bool>::iterator hit = is_non_latin_.find(*w);
+    if (hit == is_non_latin_.end()) {  // first sighting: compute and cache
+      const bool verdict = ContainsNonLatin(TD::Convert(*w));
+      hit = is_non_latin_.insert(map<WordID, bool>::value_type(*w, verdict)).first;
+    }
+    if (hit->second) ++non_latin;
+  }
+  if (non_latin) features->set_value(fid_, non_latin);
+}
+
diff --git a/decoder/ff_charset.h b/decoder/ff_charset.h
new file mode 100644
index 00000000..b1ad537e
--- /dev/null
+++ b/decoder/ff_charset.h
@@ -0,0 +1,26 @@
+#ifndef _FFCHARSET_H_
+#define _FFCHARSET_H_
+
+#include <string>
+#include <map>
+#include "ff.h"
+#include "hg.h"
+
+class SentenceMetadata;
+
+class NonLatinCount : public FeatureFunction {  // feature registered as "NonLatinCount"; per-edge count of rule words flagged by ContainsNonLatin() in ff_charset.cc
+ public:
+  NonLatinCount(const std::string& param);  // param is not used by the implementation
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     FeatureVector* features,
+                                     FeatureVector* estimated_features,
+                                     void* context) const;
+ private:
+  mutable std::map<WordID, bool> is_non_latin_;  // word id -> cached ContainsNonLatin() result; mutable because the const traversal method fills it lazily
+  const int fid_;  // feature id, set in the constructor from FD::Convert("NonLatinCount")
+};
+
+#endif
author	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-23 20:22:45 -0600
committer	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-23 20:22:45 -0600
commit	0f32c8d6fd42a0fff61a438dff41ddbb4a195b7e (patch)
tree	cc096d0f3865f514e8e70bbe24c7987c196d9698
parent	d4907ddee2012dce728bd1a6eb4e6cad452a54b2 (diff)