From d52db01a2e224869c6ea72a4a234e888c6fd756c Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 1 Dec 2010 05:27:13 +0000 Subject: alternative def of neighborhoods git-svn-id: https://ws10smt.googlecode.com/svn/trunk@739 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec_ff.cc | 3 + decoder/ff_wordalign.cc | 256 +++++++++++- decoder/ff_wordalign.h | 55 +++ decoder/lextrans.cc | 32 +- decoder/trule.cc | 20 +- environment/LocalConfig.pm | 4 + utils/sparse_vector.h | 14 +- word-aligner/aligner.pl | 3 + word-aligner/makefiles/makefile.grammars | 22 +- .../support/generate_word_pair_features.pl | 432 +++++++++++++++++++++ word-aligner/support/make_lex_grammar.pl | 388 +----------------- 11 files changed, 823 insertions(+), 406 deletions(-) create mode 100755 word-aligner/support/generate_word_pair_features.pl diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 3953118c..d6cf4572 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -51,6 +51,8 @@ void register_feature_functions() { ff_registry.Register("RuleShape", new FFFactory); ff_registry.Register("RelativeSentencePosition", new FFFactory); ff_registry.Register("Model2BinaryFeatures", new FFFactory); + ff_registry.Register("LexNullJump", new FFFactory); + ff_registry.Register("NewJump", new FFFactory); ff_registry.Register("MarkovJump", new FFFactory); ff_registry.Register("MarkovJumpFClass", new FFFactory); ff_registry.Register("SourceBigram", new FFFactory); @@ -64,6 +66,7 @@ void register_feature_functions() { ff_registry.Register("OutputIdentity", new FFFactory); ff_registry.Register("InputIdentity", new FFFactory); ff_registry.Register("LexicalTranslationTrigger", new FFFactory); + ff_registry.Register("WordPairFeatures", new FFFactory); ff_registry.Register("WordSet", new FFFactory); #ifdef HAVE_GLC ff_registry.Register("ContextCRF", new FFFactory); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 5f42b438..980c64ad 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -1,10 +1,13 @@ #include "ff_wordalign.h" +#include +#include #include #include #include #include +#include "verbose.h" #include "alignment_pharaoh.h" #include "stringlib.h" #include "sentence_metadata.h" @@ -20,6 +23,8 @@ static const int kNULL_i = 255; // -1 as an unsigned char using namespace std; +// TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature + Model2BinaryFeatures::Model2BinaryFeatures(const string& ) : fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { @@ -195,6 +200,45 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } +LexNullJump::LexNullJump(const string& param) : + FeatureFunction(1), + fid_lex_null_(FD::Convert("JumpLexNull")), + fid_null_lex_(FD::Convert("JumpNullLex")), + fid_null_null_(FD::Convert("JumpNullNull")), + fid_lex_lex_(FD::Convert("JumpLexLex")) {} + +void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* /* estimated_features */, + void* state) const { + char& dpstate = *((char*)state); + if (edge.Arity() == 0) { + // dpstate is 'N' = null or 'L' = lex + if (edge.i_ < 0) { dpstate = 'N'; } else { dpstate = 'L'; } + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + char left = *((char*)ant_states[0]); + char right = *((char*)ant_states[1]); + dpstate = right; + if (left == 'N') { + if (right == 'N') + features->set_value(fid_null_null_, 1.0); + else + features->set_value(fid_null_lex_, 1.0); + } else { // left == 'L' + if (right == 'N') + features->set_value(fid_lex_null_, 1.0); + else + features->set_value(fid_lex_lex_, 1.0); + } + } else { + assert(!"something really unexpected is happening"); + } +} + MarkovJump::MarkovJump(const string& param) : FeatureFunction(1), fid_(FD::Convert("MarkovJump")), @@ -287,6 +331,100 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } +NewJump::NewJump(const string& param) : + FeatureFunction(1) { + cerr << " NewJump"; + vector argv; + int argc = SplitOnWhitespace(param, &argv); + set config; + for (int i = 0; i < argc; ++i) config.insert(argv[i]); + cerr << endl; + use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0; +} + +// do a log transform on the length (of a sentence, a jump, etc) +// this basically means that large distances that are close to each other +// are put into the same bin +int BinnedLogLength(int len) { + int res = static_cast(log(len+1) / log(1.3)); + if (res > 16) res = 16; + return res; +} + +void NewJump::FireFeature(const SentenceMetadata& smeta, + const int prev_src_index, + const int cur_src_index, + SparseVector* features) const { + const int src_len = smeta.GetSourceLength(); + const int raw_jump = cur_src_index - prev_src_index; + char jtype = 0; + int jump_magnitude = raw_jump; + if (raw_jump > 0) { jtype = 'R'; } // Right + else if (raw_jump == 0) { jtype = 'S'; } // Stay + else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left + int effective_length = src_len; + if (use_binned_log_lengths_) { + jump_magnitude = BinnedLogLength(jump_magnitude); + effective_length = BinnedLogLength(src_len); + } + + if (true) { + static map > len2jump2fid; + int& fid = len2jump2fid[src_len][raw_jump]; + if (!fid) { + ostringstream os; + os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude; + fid = FD::Convert(os.str()); + } + features->set_value(fid, 1.0); + } +} + +void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_states, + SparseVector* features, + SparseVector* /* estimated_features */, + void* state) const { + unsigned char& dpstate = *((unsigned char*)state); + const int flen = smeta.GetSourceLength(); + if (edge.Arity() == 0) { + dpstate = static_cast(edge.i_); + if (edge.prev_i_ == 0) { // first target word in sentence + if (edge.i_ >= 0) { // generated from non-Null token? + FireFeature(smeta, + -1, // previous src = beginning of sentence index + edge.i_, // current src + features); + } + } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { // last word + if (edge.i_ >= 0) { // generated from non-Null token? + FireFeature(smeta, + edge.i_, // previous src = last word position + flen, // current src + features); + } + } + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + int left_index = *((unsigned char*)ant_states[0]); + int right_index = *((unsigned char*)ant_states[1]); + if (right_index == -1) + dpstate = static_cast(left_index); + else + dpstate = static_cast(right_index); + if (left_index != kNULL_i && right_index != kNULL_i) { + FireFeature(smeta, + left_index, // previous src index + right_index, // current src index + features); + } + } else { + assert(!"something really unexpected is happening"); + } +} + SourceBigram::SourceBigram(const std::string& param) : FeatureFunction(sizeof(WordID) + sizeof(int)) { } @@ -626,6 +764,122 @@ void InputIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } +WordPairFeatures::WordPairFeatures(const string& param) { + vector argv; + int argc = SplitOnWhitespace(param, &argv); + if (argc != 1) { + cerr << "WordPairFeature /path/to/feature_values.table\n"; + abort(); + } + set all_srcs; + { + ReadFile rf(argv[0]); + istream& in = *rf.stream(); + string buf; + while (in) { + getline(in, buf); + if (buf.empty()) continue; + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = start; + while(end < buf.size() && buf[end] != ' ') ++end; + const WordID src = TD::Convert(buf.substr(start, end - start)); + all_srcs.insert(src); + } + } + if (all_srcs.empty()) { + cerr << "WordPairFeature " << param << " loaded empty file!\n"; + return; + } + fkeys_.reserve(all_srcs.size()); + copy(all_srcs.begin(), all_srcs.end(), back_inserter(fkeys_)); + values_.resize(all_srcs.size()); + if (!SILENT) { cerr << "WordPairFeature: " << all_srcs.size() << " sources\n"; } + ReadFile rf(argv[0]); + istream& in = *rf.stream(); + string buf; + double val = 0; + WordID cur_src = 0; + map > *pv = NULL; + const WordID kBARRIER = TD::Convert("|||"); + while (in) { + getline(in, buf); + if (buf.size() == 0) continue; + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = start; + while(end < buf.size() && buf[end] != ' ') ++end; + const WordID src = TD::Convert(buf.substr(start, end - start)); + if (cur_src != src) { + cur_src = src; + size_t ind = distance(fkeys_.begin(), lower_bound(fkeys_.begin(), fkeys_.end(), cur_src)); + pv = &values_[ind]; + } + end += 1; + start = end; + while(end < buf.size() && buf[end] != ' ') ++end; + WordID x = TD::Convert(buf.substr(start, end - start)); + if (x != kBARRIER) { + cerr << "1 Format error: " << buf << endl; + abort(); + } + start = end + 1; + end = start + 1; + while(end < buf.size() && buf[end] != ' ') ++end; + WordID trg = TD::Convert(buf.substr(start, end - start)); + if (trg == kBARRIER) { + cerr << "2 Format error: " << buf << endl; + abort(); + } + start = end + 1; + end = start + 1; + while(end < buf.size() && buf[end] != ' ') ++end; + WordID x2 = TD::Convert(buf.substr(start, end - start)); + if (x2 != kBARRIER) { + cerr << "3 Format error: " << buf << endl; + abort(); + } + start = end + 1; + + SparseVector& v = (*pv)[trg]; + while(start < buf.size()) { + end = start + 1; + while(end < buf.size() && buf[end] != '=' && buf[end] != ' ') ++end; + if (end == buf.size() || buf[end] != '=') { cerr << "4 Format error: " << buf << endl; abort(); } + const int fid = FD::Convert(buf.substr(start, end - start)); + start = end + 1; + while(start < buf.size() && buf[start] == ' ') ++start; + end = start + 1; + while(end < buf.size() && buf[end] != ' ') ++end; + assert(end > start); + if (end < buf.size()) buf[end] = 0; + val = strtod(&buf.c_str()[start], NULL); + v.set_value(fid, val); + start = end + 1; + } + } +} - +void WordPairFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + if (edge.Arity() == 0) { + assert(edge.rule_->EWords() == 1); + assert(edge.rule_->FWords() == 1); + const WordID trg = edge.rule_->e()[0]; + const WordID src = edge.rule_->f()[0]; + size_t ind = distance(fkeys_.begin(), lower_bound(fkeys_.begin(), fkeys_.end(), src)); + if (ind == fkeys_.size() || fkeys_[ind] != src) { + cerr << "WordPairFeatures no source entries for " << TD::Convert(src) << endl; + abort(); + } + const map >::const_iterator it = values_[ind].find(trg); + // TODO optional strict flag to make sure there are features for all pairs? + if (it != values_[ind].end()) + (*features) += it->second; + } +} diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 0714229c..418c8768 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -103,6 +103,43 @@ class SourceBigram : public FeatureFunction { mutable Class2Class2FID fmap_; }; +class LexNullJump : public FeatureFunction { + public: + LexNullJump(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + const int fid_lex_null_; + const int fid_null_lex_; + const int fid_null_null_; + const int fid_lex_lex_; +}; + +class NewJump : public FeatureFunction { + public: + NewJump(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* out_context) const; + private: + void FireFeature(const SentenceMetadata& smeta, + const int prev_src_index, + const int cur_src_index, + SparseVector* features) const; + + bool use_binned_log_lengths_; + std::string fid_str_; // identifies configuration uniquely +}; + class SourcePOSBigram : public FeatureFunction { public: SourcePOSBigram(const std::string& param); @@ -238,6 +275,24 @@ class BlunsomSynchronousParseHack : public FeatureFunction { mutable std::vector > refs_; }; +// association feature type look up a pair (e,f) in a table and return a vector +// of feature values +class WordPairFeatures : public FeatureFunction { + public: + WordPairFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + + private: + std::vector fkeys_; // parallel to values_ + std::vector > > values_; // fkeys_index -> e -> value +}; + class InputIdentity : public FeatureFunction { public: InputIdentity(const std::string& param); diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc index 4476fe63..35d2d15d 100644 --- a/decoder/lextrans.cc +++ b/decoder/lextrans.cc @@ -76,13 +76,13 @@ struct LexicalTransImpl { // hack to tell the feature function system how big the sentence pair is const int f_start = (use_null ? -1 : 0); int prev_node_id = -1; - set target_vocab; // only set for alignment_only mode - if (align_only_) { - const Lattice& ref = smeta.GetReference(); - for (int i = 0; i < ref.size(); ++i) { - target_vocab.insert(ref[i][0].label); - } + set target_vocab; + const Lattice& ref = smeta.GetReference(); + for (int i = 0; i < ref.size(); ++i) { + target_vocab.insert(ref[i][0].label); } + bool all_sources_to_all_targets_ = true; + set trgs_used; for (int i = 0; i < e_len; ++i) { // for each word in the *target* Hypergraph::Node* node = forest->AddNode(kXCAT); const int new_node_id = node->id_; @@ -101,10 +101,13 @@ struct LexicalTransImpl { assert(rb); for (int k = 0; k < rb->GetNumRules(); ++k) { TRulePtr rule = rb->GetIthRule(k); + const WordID trg_word = rule->e_[0]; if (align_only_) { - if (target_vocab.count(rule->e_[0]) == 0) + if (target_vocab.count(trg_word) == 0) continue; } + if (all_sources_to_all_targets_ && (target_vocab.count(trg_word) > 0)) + trgs_used.insert(trg_word); Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); edge->i_ = j; edge->j_ = j+1; @@ -113,6 +116,21 @@ struct LexicalTransImpl { edge->feature_values_ += edge->rule_->GetFeatureValues(); forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); } + if (all_sources_to_all_targets_) { + for (set::iterator it = target_vocab.begin(); it != target_vocab.end(); ++it) { + if (trgs_used.count(*it)) continue; + const WordID ungenerated_trg_word = *it; + TRulePtr rule; + rule.reset(TRule::CreateLexicalRule(src_sym, ungenerated_trg_word)); + Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); + edge->i_ = j; + edge->j_ = j+1; + edge->prev_i_ = i; + edge->prev_j_ = i+1; + forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); + } + trgs_used.clear(); + } } if (prev_node_id >= 0) { const int comb_node_id = forest->AddNode(kXCAT)->id_; diff --git a/decoder/trule.cc b/decoder/trule.cc index a40c4e14..eedf8f30 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -246,18 +246,18 @@ string TRule::AsString(bool verbose) const { int idx = 0; if (lhs_ && verbose) { os << '[' << TD::Convert(lhs_ * -1) << "] |||"; - for (int i = 0; i < f_.size(); ++i) { - const WordID& w = f_[i]; - if (w < 0) { - int wi = w * -1; - ++idx; - os << " [" << TD::Convert(wi) << ',' << idx << ']'; - } else { - os << ' ' << TD::Convert(w); - } + } + for (int i = 0; i < f_.size(); ++i) { + const WordID& w = f_[i]; + if (w < 0) { + int wi = w * -1; + ++idx; + os << " [" << TD::Convert(wi) << ',' << idx << ']'; + } else { + os << ' ' << TD::Convert(w); } - os << " ||| "; } + os << " ||| "; if (idx > 9) { cerr << "Too many non-terminals!\n partial: " << os.str() << endl; exit(1); diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm index b047d21c..6e29fd05 100644 --- a/environment/LocalConfig.pm +++ b/environment/LocalConfig.pm @@ -36,6 +36,10 @@ my $CCONFIG = { 'HOST_REGEXP' => qr/^(blacklight.psc.edu|bl1.psc.teragrid.org)$/, 'QSubMemFlag' => '-l pmem=', }, + 'LOCAL' => { + 'HOST_REGEXP' => qr/local\.net$/, + 'QSubMemFlag' => '', + }, }; our $senvironment_name; diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index cce6c8a4..f76fc14c 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -361,6 +361,18 @@ public: return *this; } + template + SparseVector &operator+=(const SparseVector &other) { + for (typename SparseVector::MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { +// T v = + (values_[it->first] += it->second); +// if (!v) values_.erase(it->first); + } + return *this; + } + SparseVector &operator-=(const SparseVector &other) { for (typename MapType::const_iterator it = other.values_.begin(); it != other.values_.end(); ++it) @@ -512,8 +524,8 @@ public: values_.swap(other.values_); } -private: MapType values_; +private: #if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP friend class boost::serialization::access; diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index f0733449..81ac4198 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -118,15 +118,18 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz # grammar=$align_dir/grammars/freq_grammar.$direction.gz # per_sentence_grammar_file=$align_dir/grammars/psg.$direction +feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz feature_function=LexicalPairIdentity feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map feature_function=InputIdentity feature_function=OutputIdentity feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first +# the following two are deprecated feature_function=MarkovJump +b feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first feature_function=SourceBigram +# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first EOT close CDEC; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index 21f39ac1..60417ec5 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,7 +1,7 @@ -all: corpus.f-e.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml +all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml clean: - $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* + $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs* SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip @@ -9,6 +9,7 @@ ZCAT = zcat EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl +GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl STEM_F = $(SCRIPT_DIR)/stemmers/$(F_LANG).pl @@ -66,13 +67,22 @@ corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS) $(MERGE_CORPUS) corpus.e corpus.f > $@ corpus.f-e.model1: corpus.f-e $(MODEL1) - $(MODEL1) corpus.f-e > $@ + $(MODEL1) -v -V corpus.f-e > $@ corpus.e-f.model1: corpus.e-f $(MODEL1) - $(MODEL1) corpus.e-f > $@ + $(MODEL1) -v -V corpus.e-f > $@ -corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) - $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) | $(GZIP) -9 > $@ +corpus.f-e.full-model1: corpus.f-e $(MODEL1) + $(MODEL1) -t -999999 -v -V corpus.f-e > $@ + +corpus.e-f.full-model1: corpus.e-f $(MODEL1) + $(MODEL1) -t -999999 -v -V corpus.e-f > $@ + +corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 + $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@ + +wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 + $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@ corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e $(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@ diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl new file mode 100755 index 00000000..b722ee49 --- /dev/null +++ b/word-aligner/support/generate_word_pair_features.pl @@ -0,0 +1,432 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && $sparse_m1 && -f $sparse_m1; + +my %eclass = (); +my %fclass = (); +load_classes($class_e, \%eclass); +load_classes($class_f, \%fclass); + +our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14); +die unless scalar @IDENT_BINS == 15; +our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS; + +my $MIN_MAGNITUDE = 0.001; # minimum value of a feature + +our %cache; +open EF, "<$effile" or die; +open M1, "<$model1" or die; +open IM1, "<$imodel1" or die; +open SM1, "<$sparse_m1" or die; +binmode(EF,":utf8"); +binmode(M1,":utf8"); +binmode(IM1,":utf8"); +binmode(SM1,":utf8"); +binmode(STDOUT,":utf8"); +my %model1; +print STDERR "Reading model1...\n"; +my %sizes = (); +while() { + chomp; + my ($f, $e, $lp) = split /\s+/; + $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp)); + $sizes{$f}++; +} +close M1; + +my $inv_add = 0; +my %invm1; +print STDERR "Reading inverse model1...\n"; +my %esizes=(); +while() { + chomp; + my ($e, $f, $lp) = split /\s+/; + $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp)); +} +close IM1; + +open OE, "<$orthoe" or die; +binmode(OE,":utf8"); +my %oe_dict; +while() { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $oe_dict{$a} = $b; +} +close OE; + +print STDERR "Reading sparse model 1 from $sparse_m1...\n"; +my %s_m1; +while() { + chomp; + my ($f, $e, $lp) = split /\s+/; + die unless defined $e && defined $f; + $s_m1{$f}->{$e} = 1; +} +close SM1; + +open OF, "<$orthof" or die; +binmode(OF,":utf8"); +my %of_dict; +while() { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $of_dict{$a} = $b; +} +close OF; +$of_dict{''} = ''; +$oe_dict{''} = ''; + +my $MIN_FEATURE_COUNT = 0; +my $ADD_PREFIX_ID = 1; +my $ADD_LEN = 1; +my $ADD_SIM = 1; +my $ADD_DICE = 1; +my $ADD_111 = 1; +my $ADD_SPARSE_M1 = 0; # this is a very bad feature +my $SPARSE_111 = 1; # if 1-1-1, then don't include Model1 & Dice features +my $ADD_ID = 1; +my $ADD_PUNC = 1; +my $ADD_NULL = 1; +my $ADD_MODEL1 = 1; +my $ADD_NOMODEL1 = 1; +my $BEAM_RATIO = 50; +my $BIN_ORTHO = 1; +my $BIN_DLEN = 1; +my $BIN_IDENT = 1; +my $BIN_DICE = 1; + +if ($ADD_NULL) { $fclass{''}='NUL'; $eclass{''} ='NUL'; } + +my %fdict; +my %fcounts; +my %ecounts; + +my %sdict; + +while() { + chomp; + my ($f, $e) = split /\s*\|\|\|\s*/; + my @es = split /\s+/, $e; + my @fs = split /\s+/, $f; + for my $ew (@es){ + die "E: Empty word" if $ew eq ''; + $ecounts{$ew}++; + } + push @fs, '' if $ADD_NULL; + my $i = 0; + for my $fw (@fs){ + $i++; + die "F: Empty word\nI=$i FS: @fs" if $fw eq ''; + $fcounts{$fw}++; + } + for my $fw (@fs){ + for my $ew (@es){ + $fdict{$fw}->{$ew}++; + } + } +} + +print STDERR "Extracting word pair features...\n"; +my $specials = 0; +my $fc = 1000000; +my $sids = 1000000; +for my $f (sort keys %fdict) { + my $re = $fdict{$f}; + my $max; + for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { + my $efcount = $re->{$e}; + unless (defined $max) { $max = $efcount; } + my $m1 = $model1{$f}->{$e}; + my $im1 = $invm1{$e}->{$f}; + my $is_null = undef; + if ($f eq '') { + $is_null = 1; + $im1 = 0; # probability not calcuated + } + die "No Model1 probability for $e | $f !" unless defined $m1; + die "No inverse Model1 probability for $f | $e !" unless defined $im1; + my $ident = ($e eq $f); + my $total_eandf = $ecounts{$e} + $fcounts{$f}; + my $dice = 2 * $efcount / $total_eandf; + my @feats; + my $is_111 = ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1); + if ($is_111 && $ADD_111) { + push @feats, "OneOneOne=1"; + } + unless ($is_111 && $SPARSE_111) { + if ($ADD_SPARSE_M1 && defined $s_m1{$f}->{$e}) { + push @feats, "HighM1=1"; + } + if (defined $m1 && $ADD_MODEL1) { + if ($m1 > $MIN_MAGNITUDE) { + push @feats, "Model1=$m1"; + my $m1d = sprintf("%.5g", sqrt($m1 * $dice)); + push @feats, "M1Dice=$m1d" if $m1d > $MIN_MAGNITUDE; + } elsif ($ADD_NOMODEL1) { + push @feats, 'NoModel1=1'; + } + if ($im1 > $MIN_MAGNITUDE) { + push @feats, "InvModel1=$im1" if $im1; + } else { + push @feats, 'NoInvModel1=1'; + } + my $am1 = sprintf("%.5g", sqrt($m1 * $im1)); + push @feats, "AgrModel1=$am1" if $am1 > $MIN_MAGNITUDE; + } + if ($ADD_DICE) { + if ($BIN_DICE) { + push @feats, dicebin($dice) . '=1'; + } else { + push @feats, "Dice=$dice"; + } + } + } + my $oe = $oe_dict{$e}; + die "Can't find orthonorm form for $e" unless defined $oe; + my $of = $of_dict{$f}; + die "Can't find orthonorm form for $f" unless defined $of; + my $len_e = length($oe); + my $len_f = length($of); + if ($ADD_LEN) { + if (!$is_null) { + my $dlen = abs($len_e - $len_f); + if ($BIN_DLEN) { + push @feats, dlenbin($dlen) . '=1'; + } else { + push @feats, "DLen=$dlen"; + } + } + } + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); + my $both_non_numeric = (!$e_num && !$f_num); + + unless ($total_eandf > 20) { + if ($f_num && $e_num) { + my $xf = $of; + $xf =~ s/[.,\N{U+0087}]//g; + my $xe = $oe; + $xe =~ s/[.,\N{U+0087}]//g; + if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; } + } + } + + if ($ADD_SIM) { + my $ld = 0; + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); + } + if ($BIN_ORTHO) { + push @feats, orthobin($ld) . '=1'; + } else { + push @feats, "OrthoSim=$ld"; + } + } + my $f_is_punc = ($f =~ /^[!,\-\/"'`:;&=+?.()\[\]«»]+$/); + if ($ident && $ADD_ID) { + if ($f_is_punc) { push @feats, "IdentPunc=1"; } + else { + if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } + if ($total_eandf < 8) { push @feats, "IdentRare=1"; } + if ($BIN_IDENT) { + push @feats, identbin($len_e) . '=1'; + } else { + push @feats, "Identical=$len_e"; + } + } + } + if ($ADD_PREFIX_ID && !$ident) { + if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { + my $pe = substr $oe, 0, 3; + my $pf = substr $of, 0, 3; + if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } + } + } + if ($ADD_PUNC) { + if ($f_is_punc && $e =~ /[a-z]+/) { + push @feats, "PuncMiss=1"; + } + } + print "$f ||| $e ||| @feats\n"; + } +} + + +sub levenshtein +{ + # $s1 and $s2 are the two strings + # $len1 and $len2 are their respective lengths + # + my ($s1, $s2) = @_; + my ($len1, $len2) = (length $s1, length $s2); + + # If one of the strings is empty, the distance is the length + # of the other string + # + return $len2 if ($len1 == 0); + return $len1 if ($len2 == 0); + + my %mat; + + # Init the distance matrix + # + # The first row to 0..$len1 + # The first column to 0..$len2 + # The rest to 0 + # + # The first row and column are initialized so to denote distance + # from the empty string + # + for (my $i = 0; $i <= $len1; ++$i) + { + for (my $j = 0; $j <= $len2; ++$j) + { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + + $mat{$i}{0} = $i; + } + + # Some char-by-char processing is ahead, so prepare + # array of chars from the strings + # + my @ar1 = split(//, $s1); + my @ar2 = split(//, $s2); + + for (my $i = 1; $i <= $len1; ++$i) + { + for (my $j = 1; $j <= $len2; ++$j) + { + # Set the cost to 1 iff the ith char of $s1 + # equals the jth of $s2 + # + # Denotes a substitution cost. When the char are equal + # there is no need to substitute, so the cost is 0 + # + my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; + + # Cell $mat{$i}{$j} equals the minimum of: + # + # - The cell immediately above plus 1 + # - The cell immediately to the left plus 1 + # - The cell diagonally above and to the left plus the cost + # + # We can either insert a new char, delete a char or + # substitute an existing char (with an associated cost) + # + $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + } + } + + # Finally, the Levenshtein distance equals the rightmost bottom cell + # of the matrix + # + # Note that $mat{$x}{$y} denotes the distance between the substrings + # 1..$x and 1..$y + # + return $mat{$len1}{$len2}; +} + + +# minimal element of a list +# +sub min +{ + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) + { + $min = $i if ($i < $min); + } + + return $min; +} + +sub load_classes { + my ($file, $ref) = @_; + print STDERR "Reading classes from $file...\n"; + open F, "<$file" or die "Can't read $file: $!"; + binmode(F, ":utf8") or die; + while() { + chomp; + my ($word, $class) = split /\s+/; +# print STDERR "'$word' -> $class\n"; + $ref->{$word} = $class; + } + close F; +} + +sub dicebin { + my $x = shift; + if ($x < 0.05) { return 'DiceLT005'; } + elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; } + elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; } + elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; } + elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; } + elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; } + elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; } + elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; } + elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; } + elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; } + elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; } + elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; } + elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; } + elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; } + elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; } + elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; } + elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; } + elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; } + elsif ($x >= 2.3) { return 'DiceGT23'; } +} + +sub orthobin { + my $x = shift; + if ($x < 0.9) { return 'OrthoLT09'; } + elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; } + elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; } + elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; } + elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; } + elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; } + elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; } + elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; } + elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; } + elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; } + elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; } + elsif ($x >= 2.9) { return 'OrthoGT29'; } +} + +sub dlenbin { + my $x = shift; + if ($x == 0) { return 'DLen0'; } + elsif ($x == 1) { return 'DLen1'; } + elsif ($x == 2) { return 'DLen2'; } + elsif ($x == 3) { return 'DLen3'; } + elsif ($x == 4) { return 'DLen4'; } + elsif ($x == 5) { return 'DLen5'; } + elsif ($x == 6) { return 'DLen6'; } + elsif ($x == 7) { return 'DLen7'; } + elsif ($x == 8) { return 'DLen8'; } + elsif ($x == 9) { return 'DLen9'; } + elsif ($x >= 10) { return 'DLenGT10'; } +} + +sub identbin { + my $x = shift; + if ($x == 0) { die; } + if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; } + return $IDENT_BINS[$x]; +} + + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index c96071bf..47d4d945 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -4,27 +4,14 @@ use strict; my $LIMIT_SIZE=30; -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; +my ($effile, $model1, $imodel1) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1; +my $ADD_NULL = 1; -my %eclass = (); -my %fclass = (); -load_classes($class_e, \%eclass); -load_classes($class_f, \%fclass); - -our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14); -die unless scalar @IDENT_BINS == 15; -our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS; - -our %cache; open EF, "<$effile" or die; open M1, "<$model1" or die; open IM1, "<$imodel1" or die; -#open M4, "<$gizaf2e" or die; -#open IM4, "<$gizae2f" or die; -#binmode(M4,":utf8"); -#binmode(IM4,":utf8"); binmode(EF,":utf8"); binmode(M1,":utf8"); binmode(IM1,":utf8"); @@ -35,7 +22,7 @@ my %sizes = (); while() { chomp; my ($f, $e, $lp) = split /\s+/; - $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp)); + $model1{$f}->{$e} = 1; $sizes{$f}++; } close M1; @@ -47,10 +34,10 @@ my %esizes=(); while() { chomp; my ($e, $f, $lp) = split /\s+/; - $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp)); + $invm1{$e}->{$f} = 1; $esizes{$e}++; if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { - $model1{$f}->{$e} = 1e-12; + $model1{$f}->{$e} = 1; $sizes{$f}++; $inv_add++; } @@ -58,72 +45,9 @@ while() { close IM1; print STDERR "Added $inv_add from inverse model1\n"; -open M1, "<$model1" or die; -binmode(M1,":utf8"); -my $dir_add = 0; -print STDERR "Reading model1 (again) for extra inverse translations...\n"; -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { - $invm1{$e}->{$f} = 1e-12; - $esizes{$e}++; - $dir_add++; - } -} -close M1; -print STDERR "Added $dir_add from model 1\n"; print STDERR "Generating grammars...\n"; -open OE, "<$orthoe" or die; -binmode(OE,":utf8"); -my %oe_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $oe_dict{$a} = $b; -} -close OE; -open OF, "<$orthof" or die; -binmode(OF,":utf8"); -my %of_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $of_dict{$a} = $b; -} -close OF; -$of_dict{''} = ''; -$oe_dict{''} = ''; - -my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 1; -my $ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NULL = 1; -my $ADD_MODEL1 = 1; -my $ADD_STEM_ID = 0; -my $ADD_SYM = 0; -my $BEAM_RATIO = 50; -my $BIN_ORTHO = 1; -my $BIN_DLEN = 1; -my $BIN_IDENT = 1; -my $BIN_DICE = 1; -my $ADD_FIDENT = 0; - -if ($ADD_NULL) { $fclass{''}='NUL'; $eclass{''} ='NUL'; } my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - while() { chomp; my ($f, $e) = split /\s*\|\|\|\s*/; @@ -131,14 +55,12 @@ while() { my @fs = split /\s+/, $f; for my $ew (@es){ die "E: Empty word" if $ew eq ''; - $ecounts{$ew}++; } push @fs, '' if $ADD_NULL; my $i = 0; for my $fw (@fs){ $i++; die "F: Empty word\nI=$i FS: @fs" if $fw eq ''; - $fcounts{$fw}++; } for my $fw (@fs){ for my $ew (@es){ @@ -147,7 +69,6 @@ while() { } } -#print STDERR "Loading Giza output...\n"; my %model4; #while() { # my $en = ; chomp $en; @@ -181,305 +102,10 @@ for my $f (sort keys %fdict) { my $m4 = $model4{$f}->{$e}; my $im1 = $invm1{$e}->{$f}; my $is_good_pair = (defined $m1 || defined $m4); - my $is_inv_good_pair = (defined $im1); my $ident = ($e eq $f); if ($ident) { $is_good_pair = 1; } - my $total_eandf = $ecounts{$e} + $fcounts{$f}; - my $dice = 2 * $efcount / $total_eandf; - my @feats; - if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { - $is_good_pair = 1; - if ($ADD_111) { - push @feats, "OneOneOne=1"; - } - } next unless $is_good_pair; - if (defined $m1 && $ADD_MODEL1) { - push @feats, "Model1=$m1"; - my $m1d = sprintf("%.5g", sqrt($m1 * $dice)); - push @feats, "Model1Dice=$m1d"; - } - if ($ADD_MODEL1 && !defined $m1) { push @feats, "NoModel1=1"; } - if (defined $im1 && $ADD_MODEL1) { - push @feats, "InvModel1=$im1"; - } - if (!defined $im1 && $ADD_MODEL1) { - push @feats, "NoInvModel1=1"; - } - if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) { - $fc++; - push @feats, "F$fc=1"; - } - if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } - my $oe = $oe_dict{$e}; - die "Can't find orthonorm form for $e" unless defined $oe; - my $of = $of_dict{$f}; - die "Can't find orthonorm form for $f" unless defined $of; - my $len_e = length($oe); - my $len_f = length($of); - if ($ADD_DICE) { - if ($BIN_DICE) { - push @feats, dicebin($dice) . '=1'; - } else { - push @feats, "Dice=$dice"; - } - } - my $is_null = undef; - if ($ADD_NULL && $f eq '') { - $is_null = 1; - } - if ($ADD_LEN) { - if (!$is_null) { - my $dlen = abs($len_e - $len_f); - if ($BIN_DLEN) { - push @feats, dlenbin($dlen) . '=1'; - } else { - push @feats, "DLen=$dlen"; - } - } - } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); - my $both_non_numeric = (!$e_num && !$f_num); - unless ($total_eandf > 20) { - if ($f_num && $e_num) { - my $xf = $of; - $xf =~ s/[.,]//g; - my $xe = $oe; - $xe =~ s/[.,]//g; - if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; } - } - } - if ($ADD_STEM_ID) { - my $el = 4; - my $fl = 4; - if ($oe =~ /^al|re|co/) { $el++; } - if ($of =~ /^al|re|co/) { $fl++; } - if ($oe =~ /^trans|inter/) { $el+=2; } - if ($of =~ /^trans|inter/) { $fl+=2; } - if ($fl > length($of)) { $fl = length($of); } - if ($el > length($oe)) { $el = length($oe); } - my $sf = substr $of, 0, $fl; - my $se = substr $oe, 0, $el; - my $id = $sdict{$sf}->{$se}; - if (!$id) { - $sids++; - $sdict{$sf}->{$se} = $sids; - $id = $sids; - } - push @feats, "S$id=1"; - } - if ($ADD_SIM) { - my $ld = 0; - my $eff = $len_e; - if ($eff < $len_f) { $eff = $len_f; } - if (!$is_null) { - $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); - } - #if ($ld > 1.5) { $is_good_pair = 1; } - if ($BIN_ORTHO) { - push @feats, orthobin($ld) . '=1'; - } else { - push @feats, "OrthoSim=$ld"; - } - } - if ($ident && $ADD_ID) { - if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } - if ($total_eandf < 8) { push @feats, "IdentRare=1"; } - if ($BIN_IDENT) { - push @feats, identbin($len_e) . '=1'; - } else { - push @feats, "Identical=$len_e"; - } - } - if ($ADD_PREFIX_ID && !$ident) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } - } - } - if ($ADD_PUNC) { - if ($f =~ /^[!,\-\/"'`:;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { - push @feats, "PuncMiss=1"; - } - } - my $is_special = ($is_good_pair && !(defined $m1)); - $specials++ if $is_special; - print STDERR "$f -> $e\n" if $is_special; - print "$f ||| $e ||| @feats\n" if $is_good_pair; + print "$f ||| $e ||| X=0\n" if $is_good_pair; } } -print STDERR "Added $specials special rules that were not in the M1 set\n"; - - -sub levenshtein -{ - # $s1 and $s2 are the two strings - # $len1 and $len2 are their respective lengths - # - my ($s1, $s2) = @_; - my ($len1, $len2) = (length $s1, length $s2); - - # If one of the strings is empty, the distance is the length - # of the other string - # - return $len2 if ($len1 == 0); - return $len1 if ($len2 == 0); - - my %mat; - - # Init the distance matrix - # - # The first row to 0..$len1 - # The first column to 0..$len2 - # The rest to 0 - # - # The first row and column are initialized so to denote distance - # from the empty string - # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { - $mat{$i}{$j} = 0; - $mat{0}{$j} = $j; - } - - $mat{$i}{0} = $i; - } - - # Some char-by-char processing is ahead, so prepare - # array of chars from the strings - # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { - # Set the cost to 1 iff the ith char of $s1 - # equals the jth of $s2 - # - # Denotes a substitution cost. When the char are equal - # there is no need to substitute, so the cost is 0 - # - my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - - # Cell $mat{$i}{$j} equals the minimum of: - # - # - The cell immediately above plus 1 - # - The cell immediately to the left plus 1 - # - The cell diagonally above and to the left plus the cost - # - # We can either insert a new char, delete a char or - # substitute an existing char (with an associated cost) - # - $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, - $mat{$i}{$j-1} + 1, - $mat{$i-1}{$j-1} + $cost]); - } - } - - # Finally, the Levenshtein distance equals the rightmost bottom cell - # of the matrix - # - # Note that $mat{$x}{$y} denotes the distance between the substrings - # 1..$x and 1..$y - # - return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; -} - -sub load_classes { - my ($file, $ref) = @_; - print STDERR "Reading classes from $file...\n"; - open F, "<$file" or die "Can't read $file: $!"; - binmode(F, ":utf8") or die; - while() { - chomp; - my ($word, $class) = split /\s+/; -# print STDERR "'$word' -> $class\n"; - $ref->{$word} = $class; - } - close F; -} - -sub dicebin { - my $x = shift; - if ($x < 0.05) { return 'DiceLT005'; } - elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; } - elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; } - elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; } - elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; } - elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; } - elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; } - elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; } - elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; } - elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; } - elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; } - elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; } - elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; } - elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; } - elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; } - elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; } - elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; } - elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; } - elsif ($x >= 2.3) { return 'DiceGT23'; } -} - -sub orthobin { - my $x = shift; - if ($x < 0.9) { return 'OrthoLT09'; } - elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; } - elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; } - elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; } - elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; } - elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; } - elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; } - elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; } - elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; } - elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; } - elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; } - elsif ($x >= 2.9) { return 'OrthoGT29'; } -} - -sub dlenbin { - my $x = shift; - if ($x == 0) { return 'DLen0'; } - elsif ($x == 1) { return 'DLen1'; } - elsif ($x == 2) { return 'DLen2'; } - elsif ($x == 3) { return 'DLen3'; } - elsif ($x == 4) { return 'DLen4'; } - elsif ($x == 5) { return 'DLen5'; } - elsif ($x == 6) { return 'DLen6'; } - elsif ($x == 7) { return 'DLen7'; } - elsif ($x == 8) { return 'DLen8'; } - elsif ($x == 9) { return 'DLen9'; } - elsif ($x >= 10) { return 'DLenGT10'; } -} - -sub identbin { - my $x = shift; - if ($x == 0) { die; } - if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; } - return $IDENT_BINS[$x]; -} - -- cgit v1.2.3