diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-23 20:22:45 -0600 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-23 20:22:45 -0600 |
commit | 0f32c8d6fd42a0fff61a438dff41ddbb4a195b7e (patch) | |
tree | cc096d0f3865f514e8e70bbe24c7987c196d9698 /decoder | |
parent | d4907ddee2012dce728bd1a6eb4e6cad452a54b2 (diff) |
non-latin character detector
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/Makefile.am | 1 | ||||
-rw-r--r-- | decoder/cdec_ff.cc | 2 | ||||
-rw-r--r-- | decoder/ff_charset.cc | 42 | ||||
-rw-r--r-- | decoder/ff_charset.h | 26 |
4 files changed, 71 insertions, 0 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 633542f0..be04fb31 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -58,6 +58,7 @@ libcdec_a_SOURCES = \ trule.cc \ ff.cc \ ff_wordset.cc \ + ff_charset.cc \ ff_lm.cc \ ff_klm.cc \ ff_ruleshape.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 686905ad..a12b532f 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -12,6 +12,7 @@ #include "ff_lm_fsa.h" #include "ff_sample_fsa.h" #include "ff_register.h" +#include "ff_charset.h" #include "ff_wordset.h" #ifdef HAVE_GLC @@ -51,6 +52,7 @@ void register_feature_functions() { ff_registry.Register("KLanguageModel_Sorted", new FFFactory<KLanguageModel<lm::ngram::SortedModel> >()); ff_registry.Register("KLanguageModel_Trie", new FFFactory<KLanguageModel<lm::ngram::TrieModel> >()); ff_registry.Register("KLanguageModel_Probing", new FFFactory<KLanguageModel<lm::ngram::ProbingModel> >()); + ff_registry.Register("NonLatinCount", new FFFactory<NonLatinCount>); ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>); ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>); diff --git a/decoder/ff_charset.cc b/decoder/ff_charset.cc new file mode 100644 index 00000000..33afc1a7 --- /dev/null +++ b/decoder/ff_charset.cc @@ -0,0 +1,42 @@ +#include "ff_charset.h" + +#include "fdict.h" +#include "stringlib.h" + +using namespace std; + +NonLatinCount::NonLatinCount(const string& param) : FeatureFunction(), fid_(FD::Convert("NonLatinCount")) {} + +bool ContainsNonLatin(const char* word) { + int cur = 0; + while(word[cur]) { + const int size = UTF8Len(word[cur]); + if (size > 1) return true; + cur += size; + } + return false; +} + +void NonLatinCount::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + FeatureVector* features, + FeatureVector* estimated_features, + void* context) const { + const vector<WordID>& e = edge.rule_->e(); + int count = 0; + for (int i = 0; i < e.size(); ++i) { + if (e[i] > 0) { + map<WordID, bool>::iterator it = is_non_latin_.find(e[i]); + if (it == is_non_latin_.end()) { + if ((is_non_latin_[e[i]] = ContainsNonLatin(TD::Convert(e[i])))) + ++count; + } else { + if (it->second) + ++count; + } + } + } + if (count) features->set_value(fid_, count); +} + diff --git a/decoder/ff_charset.h b/decoder/ff_charset.h new file mode 100644 index 00000000..b1ad537e --- /dev/null +++ b/decoder/ff_charset.h @@ -0,0 +1,26 @@ +#ifndef _FFCHARSET_H_ +#define _FFCHARSET_H_ + +#include <string> +#include <map> +#include "ff.h" +#include "hg.h" + +class SentenceMetadata; + +class NonLatinCount : public FeatureFunction { + public: + NonLatinCount(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + FeatureVector* features, + FeatureVector* estimated_features, + void* context) const; + private: + mutable std::map<WordID, bool> is_non_latin_; + const int fid_; +}; + +#endif |