From c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 1 Feb 2010 17:38:39 -0500 Subject: word aligner cleanup, new features --- decoder/cdec_ff.cc | 1 + decoder/ff_wordalign.cc | 96 ++++++- decoder/ff_wordalign.h | 13 +- tests/system_tests/unsup-align/cdec.ini | 2 +- tests/system_tests/unsup-align/gold.statistics | 2 +- training/cluster-ptrain.pl | 6 +- word-aligner/aligner.pl | 20 +- word-aligner/classify.pl | 27 -- word-aligner/extract_grammar.pl | 11 - word-aligner/extract_vocab.pl | 20 -- word-aligner/extract_weights.pl | 17 -- word-aligner/invert_grammar.pl | 8 - word-aligner/make_lex_grammar.pl | 339 ------------------------ word-aligner/makefiles/makefile.grammars | 15 +- word-aligner/merge_corpus.pl | 18 -- word-aligner/supplement_weights_file.pl | 37 --- word-aligner/support/classify.pl | 27 ++ word-aligner/support/extract_grammar.pl | 11 + word-aligner/support/extract_vocab.pl | 20 ++ word-aligner/support/extract_weights.pl | 17 ++ word-aligner/support/invert_grammar.pl | 8 + word-aligner/support/make_lex_grammar.pl | 339 ++++++++++++++++++++++++ word-aligner/support/merge_corpus.pl | 18 ++ word-aligner/support/supplement_weights_file.pl | 73 +++++ 24 files changed, 641 insertions(+), 504 deletions(-) delete mode 100755 word-aligner/classify.pl delete mode 100755 word-aligner/extract_grammar.pl delete mode 100755 word-aligner/extract_vocab.pl delete mode 100755 word-aligner/extract_weights.pl delete mode 100755 word-aligner/invert_grammar.pl delete mode 100755 word-aligner/make_lex_grammar.pl delete mode 100755 word-aligner/merge_corpus.pl delete mode 100755 word-aligner/supplement_weights_file.pl create mode 100755 word-aligner/support/classify.pl create mode 100755 word-aligner/support/extract_grammar.pl create mode 100755 word-aligner/support/extract_vocab.pl create mode 100755 word-aligner/support/extract_weights.pl create mode 100755 word-aligner/support/invert_grammar.pl create mode 100755 
word-aligner/support/make_lex_grammar.pl create mode 100755 word-aligner/support/merge_corpus.pl create mode 100755 word-aligner/support/supplement_weights_file.pl diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 34499398..b4381dda 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -16,6 +16,7 @@ void register_feature_functions() { global_ff_registry->Register("RelativeSentencePosition", new FFFactory); global_ff_registry->Register("Model2BinaryFeatures", new FFFactory); global_ff_registry->Register("MarkovJump", new FFFactory); + global_ff_registry->Register("MarkovJumpFClass", new FFFactory); global_ff_registry->Register("SourcePOSBigram", new FFFactory); global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory); global_ff_registry->Register("AlignerResults", new FFFactory); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index e3fa91d4..fb90df62 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -1,5 +1,6 @@ #include "ff_wordalign.h" +#include #include #include #include @@ -12,20 +13,20 @@ #include "tdict.h" // Blunsom hack #include "filelib.h" // Blunsom hack -static const size_t MAX_SENTENCE_SIZE = 100; +static const int MAX_SENTENCE_SIZE = 100; using namespace std; Model2BinaryFeatures::Model2BinaryFeatures(const string& param) : fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { - for (int i = 0; i < MAX_SENTENCE_SIZE; ++i) { - for (int j = 0; j < MAX_SENTENCE_SIZE; ++j) { + for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { + for (int j = 0; j < i; ++j) { for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) { int& val = fids_[i][j][k]; val = -1; if (j < i) { ostringstream os; - os << "M2_" << i << '_' << j << ':' << k; + os << "M2_FL:" << i << "_SI:" << j << "_TI:" << k; val = FD::Convert(os.str()); } } @@ -56,8 +57,24 @@ RelativeSentencePosition::RelativeSentencePosition(const string& param) : if (!param.empty()) { cerr << " Loading word classes from " << param << endl; 
condition_on_fclass_ = true; - template_ = "RSP:FC000"; - assert(!"not implemented"); + ReadFile rf(param); + istream& in = *rf.stream(); + set classes; + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + vector v; + TD::ConvertSentence(line, &v); + pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); + for (set::iterator i = classes.begin(); i != classes.end(); ++i) { + ostringstream os; + os << "RelPos_FC:" << TD::Convert(*i); + fids_[*i] = FD::Convert(os.str()); + } + } } else { condition_on_fclass_ = false; } @@ -79,17 +96,22 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme static_cast(edge.prev_i_) / smeta.GetTargetLength()); features->set_value(fid_, val); if (condition_on_fclass_) { - assert(!"not implemented"); + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][edge.i_]; + const int fid = fids_.find(cur_fclass)->second; + features->set_value(fid, val); } // cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; } MarkovJumpFClass::MarkovJumpFClass(const string& param) : - FeatureFunction(1) { + FeatureFunction(1), + fids_(MAX_SENTENCE_SIZE) { cerr << " MarkovJumpFClass" << endl; cerr << "Reading source POS tags from " << param << endl; ReadFile rf(param); istream& in = *rf.stream(); + set classes; while(in) { string line; getline(in, line); @@ -97,8 +119,66 @@ MarkovJumpFClass::MarkovJumpFClass(const string& param) : vector v; TD::ConvertSentence(line, &v); pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); } cerr << " (" << pos_.size() << " lines)\n"; + cerr << " Classes: " << classes.size() << endl; + for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { + map >& cfids = fids_[ss]; + for (set::iterator i = classes.begin(); i != classes.end(); ++i) { + map &fids = 
cfids[*i]; + for (int j = -ss; j <= ss; ++j) { + ostringstream os; + os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; + fids[j] = FD::Convert(os.str()); + } + } + } +} + +void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, + SparseVector* features) const { + const int jumpsize = cur_src_pos - prev_src_pos; + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; + const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; + features->set_value(fid, 1.0); +} + +void MarkovJumpFClass::FinalTraversalFeatures(const void* context, + SparseVector* features) const { + int left_index = *static_cast(context); +// int right_index = cur_flen; + // TODO +} + +void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + unsigned char& dpstate = *((unsigned char*)state); + if (edge.Arity() == 0) { + dpstate = static_cast(edge.i_); + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + int left_index = *((unsigned char*)ant_states[0]); + int right_index = *((unsigned char*)ant_states[1]); + if (right_index == -1) + dpstate = static_cast(left_index); + else + dpstate = static_cast(right_index); +// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; +// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl; +// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; +// features->set_value(fid, 1.0); + FireFeature(smeta, left_index, right_index, features); + } } MarkovJump::MarkovJump(const string& param) : diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 
c5404887..688750de 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -19,7 +19,8 @@ class RelativeSentencePosition : public FeatureFunction { private: const int fid_; bool condition_on_fclass_; - std::string template_; + std::vector > pos_; + std::map fids_; // fclass -> fid }; class Model2BinaryFeatures : public FeatureFunction { @@ -66,10 +67,14 @@ class MarkovJumpFClass : public FeatureFunction { SparseVector* features, SparseVector* estimated_features, void* context) const; - private: - void FireFeature(WordID src, - WordID trg, + + void FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, SparseVector* features) const; + + private: + std::vector > > fids_; // flen -> fclass -> jumpsize -> fid std::vector > pos_; }; diff --git a/tests/system_tests/unsup-align/cdec.ini b/tests/system_tests/unsup-align/cdec.ini index 4016a201..37a37214 100644 --- a/tests/system_tests/unsup-align/cdec.ini +++ b/tests/system_tests/unsup-align/cdec.ini @@ -1,6 +1,6 @@ aligner=true grammar=unsup-align.lex-grammar -cubepruning_pop_limit=1000000 +intersection_strategy=full formalism=lexcrf feature_function=RelativeSentencePosition feature_function=MarkovJump diff --git a/tests/system_tests/unsup-align/gold.statistics b/tests/system_tests/unsup-align/gold.statistics index 2f37c2db..975c9d4e 100644 --- a/tests/system_tests/unsup-align/gold.statistics +++ b/tests/system_tests/unsup-align/gold.statistics @@ -90,7 +90,7 @@ constr_paths 4 +lm_nodes 7 +lm_edges 14 +lm_paths 16 -+lm_trans end thet ++lm_trans thet thet constr_nodes 7 constr_edges 10 constr_paths 4 diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 7643d4e5..03122df9 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -29,10 +29,12 @@ my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; +my $mem_buffers = 20; my $RESTART_IF_NECESSARY; GetOptions("cdec=s" => \$DECODER, "distributed" => \$DISTRIBUTED, "sigma_squared=f" => 
\$sigsq, + "lbfgs_memory_buffers=i" => \$mem_buffers, "max_iteration=i" => \$max_iteration, "means=s" => \$means_file, "optimizer=s" => \$OALG, @@ -133,7 +135,7 @@ while ($iter < $max_iteration) { my $start = time; my $next_iter = $iter + 1; my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M 50 $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; + my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; my $cmd = ""; if ($parallel) { $cmd = $pcmd; } @@ -183,6 +185,8 @@ Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init --means FILE if you want means other than 0 --sigma_squared S variance on prior --pmem MEM Memory required for decoder + --lbfgs_memory_buffers Number of buffers to use + with LBFGS optimizer EOT } diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index d203fc53..7eec0e42 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,15 +6,20 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, "jobs=i" => \$nodes, - "pmem=s" => \$pmem + "pmem=s" => \$pmem, + "mkcls=s" => \$mkcls, ) or usage(); usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + my $in_file = shift @ARGV; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ 
/^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -22,13 +27,13 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; my @stages = qw(nopos relpos markov); my @directions = qw(f-e e-f); -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $corpus = 'c'; my $cwd = getcwd(); @@ -75,7 +80,7 @@ NCLASSES = $num_classes TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary -PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5 +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 export @@ -95,12 +100,16 @@ clean: EOT close TOPLEVEL; +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + sub make_stage { my ($stage, $direction, $prev_stage) = @_; my $stage_dir = "$align_dir/$stage-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; + my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; open CDEC, ">$stage_dir/cdec.ini" or die; print CDEC <) { - chomp; - my ($word, $cat) = split /\s+/; - die "'$word' '$cat'" unless (defined $word && defined $cat); - $dict{$word} = $cat; - $cc++; -} -close C; -print STDERR "Loaded classes for $cc words\n"; - -while() { - chomp; - my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; - print "@cats\n"; -} - diff --git a/word-aligner/extract_grammar.pl b/word-aligner/extract_grammar.pl deleted file mode 100755 index d7275ef5..00000000 --- a/word-aligner/extract_grammar.pl +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $key = shift @ARGV; -die "Usage: $0 KEY\n" unless defined $key; - -while(<>) { - 
my ($k, @rest) = split / \|\|\| /; - print join(' ||| ', @rest) if ($k eq $key); -} - diff --git a/word-aligner/extract_vocab.pl b/word-aligner/extract_vocab.pl deleted file mode 100755 index 070d4202..00000000 --- a/word-aligner/extract_vocab.pl +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -print STDERR "Extracting vocabulary...\n"; -my %dict = (); -my $wc = 0; -while(<>) { - chomp; - my @words = split /\s+/; - for my $word (@words) { $wc++; $dict{$word}++; } -} - -my $tc = 0; -for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { - print "$word\n"; - $tc++; -} - -print STDERR "$tc types / $wc tokens\n"; - diff --git a/word-aligner/extract_weights.pl b/word-aligner/extract_weights.pl deleted file mode 100755 index dfedd12e..00000000 --- a/word-aligner/extract_weights.pl +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/perl -w -use strict; -my %dict=(); -while(<>) { - chomp; - my ($dummy, $a, $b, $wts) = split / \|\|\| /; - my @weights = split /\s+/, $wts; - for my $w (@weights) { - my ($name, $val) = split /=/, $w; - unless ($dict{$name}) { - my $r = (0.5 - rand) / 5; - $r = sprintf ("%0.4f", $r); - print "$name $r\n"; - $dict{$name}= 1; - } - } -} diff --git a/word-aligner/invert_grammar.pl b/word-aligner/invert_grammar.pl deleted file mode 100755 index 3988388d..00000000 --- a/word-aligner/invert_grammar.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -while(<>) { - my ($f, $e, $scores) = split / \|\|\| /; - print "$e ||| $f ||| $scores"; -} - diff --git a/word-aligner/make_lex_grammar.pl b/word-aligner/make_lex_grammar.pl deleted file mode 100755 index bdb2752c..00000000 --- a/word-aligner/make_lex_grammar.pl +++ /dev/null @@ -1,339 +0,0 @@ -#!/usr/bin/perl -w -use utf8; -use strict; - -my $LIMIT_SIZE=30; - -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless 
$effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; - -my %eclass = (); -my %fclass = (); -load_classes($class_e, \%eclass); -load_classes($class_f, \%fclass); - -our %cache; -open EF, "<$effile" or die; -open M1, "<$model1" or die; -open IM1, "<$imodel1" or die; -binmode(EF,":utf8"); -binmode(M1,":utf8"); -binmode(IM1,":utf8"); -binmode(STDOUT,":utf8"); -my %model1; -print STDERR "Reading model1...\n"; -my %sizes = (); -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - $model1{$f}->{$e} = 1; - $sizes{$f}++; -} -close M1; - -my $inv_add = 0; -my %invm1; -print STDERR "Reading inverse model1...\n"; -my %esizes=(); -while() { - chomp; - my ($e, $f, $lp) = split /\s+/; - $invm1{$e}->{$f} = 1; - $esizes{$e}++; - if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { - $model1{$f}->{$e} = 1; - $sizes{$f}++; - $inv_add++; - } -} -close IM1; -print STDERR "Added $inv_add from inverse model1\n"; - -open M1, "<$model1" or die; -binmode(M1,":utf8"); -my $dir_add = 0; -print STDERR "Reading model1 (again) for extra inverse translations...\n"; -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { - $invm1{$e}->{$f} = 1; - $esizes{$e}++; - $dir_add++; - } -} -close M1; -print STDERR "Added $dir_add from model 1\n"; -print STDERR "Generating grammars...\n"; -open OE, "<$orthoe" or die; -binmode(OE,":utf8"); -my %oe_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $oe_dict{$a} = $b; -} -close OE; -open OF, "<$orthof" or die; -binmode(OF,":utf8"); -my %of_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $of_dict{$a} = $b; -} -close OF; -$of_dict{''} = ''; -$oe_dict{''} = ''; - -my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 0; -my $ADD_CLASS_CLASS = 1; -my 
$ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NULL = 0; -my $ADD_STEM_ID = 1; -my $ADD_SYM = 0; -my $BEAM_RATIO = 50; - -my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - -while() { - chomp; - my ($f, $e) = split /\s*\|\|\|\s*/; - my @es = split /\s+/, $e; - my @fs = split /\s+/, $f; - for my $ew (@es){ $ecounts{$ew}++; } - push @fs, '' if $ADD_NULL; - for my $fw (@fs){ $fcounts{$fw}++; } - for my $fw (@fs){ - for my $ew (@es){ - $fdict{$fw}->{$ew}++; - } - } -} - -my $specials = 0; -my $fc = 1000000; -my $sids = 1000000; -for my $f (sort keys %fdict) { - my $re = $fdict{$f}; - my $max; - for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { - my $efcount = $re->{$e}; - unless (defined $max) { $max = $efcount; } - my $m1 = $model1{$f}->{$e}; - my $im1 = $invm1{$e}->{$f}; - my $is_good_pair = (defined $m1); - my $is_inv_good_pair = (defined $im1); - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); - my @feats; - if ($efcount > $MIN_FEATURE_COUNT) { - $fc++; - push @feats, "F$fc=1"; - } - if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } - my $oe = $oe_dict{$e}; - die "Can't find orthonorm form for $e" unless defined $oe; - my $of = $of_dict{$f}; - die "Can't find orthonorm form for $f" unless defined $of; - my $len_e = length($oe); - my $len_f = length($of); - push @feats, "Dice=$dice" if $ADD_DICE; - if ($ADD_CLASS_CLASS) { - my $ce = $eclass{$e} or die "E- no class for: $e"; - my $cf = $fclass{$f} or die "F- no class for: $f"; - push @feats, "C${cf}_${ce}=1"; - } - my $is_null = undef; - if ($ADD_NULL && $f eq '') { - push @feats, "IsNull=1"; - $is_null = 1; - } - if ($ADD_LEN) { - if (!$is_null) { - my $dlen = abs($len_e - $len_f); - push @feats, "DLen=$dlen"; - } - } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); - my $both_non_numeric = (!$e_num && !$f_num); - if 
($ADD_STEM_ID) { - my $el = 4; - my $fl = 4; - if ($oe =~ /^al|re|co/) { $el++; } - if ($of =~ /^al|re|co/) { $fl++; } - if ($oe =~ /^trans|inter/) { $el+=2; } - if ($of =~ /^trans|inter/) { $fl+=2; } - if ($fl > length($of)) { $fl = length($of); } - if ($el > length($oe)) { $el = length($oe); } - my $sf = substr $of, 0, $fl; - my $se = substr $oe, 0, $el; - my $id = $sdict{$sf}->{$se}; - if (!$id) { - $sids++; - $sdict{$sf}->{$se} = $sids; - $id = $sids; - } - push @feats, "S$id=1"; - } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } - } - } - if ($ADD_SIM) { - my $ld = 0; - my $eff = $len_e; - if ($eff < $len_f) { $eff = $len_f; } - if (!$is_null) { - $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); - } - if ($ld > 1.5) { $is_good_pair = 1; } - push @feats, "OrthoSim=$ld"; - } - my $ident = ($e eq $f); - if ($ident) { $is_good_pair = 1; } - if ($ident && $ADD_ID) { push @feats, "Identical=$len_e"; } - if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { - $is_good_pair = 1; - if ($ADD_111) { - push @feats, "OneOneOne=1"; - } - } - if ($ADD_PUNC) { - if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { - push @feats, "PuncMiss=1"; - } - } - my $is_special = ($is_good_pair && !(defined $m1)); - $specials++ if $is_special; - print STDERR "$f -> $e\n" if $is_special; - print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; - print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; - } -} -print STDERR "Added $specials special rules that were not in the M1 set\n"; - - -sub levenshtein -{ - # $s1 and $s2 are the two strings - # $len1 and $len2 are their respective lengths - # - my ($s1, $s2) = @_; - my ($len1, $len2) = (length $s1, length $s2); - - # If one of the strings is empty, the distance is the length - # of the other string - # - return $len2 if ($len1 == 0); - return $len1 if ($len2 == 0); - - my 
%mat; - - # Init the distance matrix - # - # The first row to 0..$len1 - # The first column to 0..$len2 - # The rest to 0 - # - # The first row and column are initialized so to denote distance - # from the empty string - # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { - $mat{$i}{$j} = 0; - $mat{0}{$j} = $j; - } - - $mat{$i}{0} = $i; - } - - # Some char-by-char processing is ahead, so prepare - # array of chars from the strings - # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { - # Set the cost to 1 iff the ith char of $s1 - # equals the jth of $s2 - # - # Denotes a substitution cost. When the char are equal - # there is no need to substitute, so the cost is 0 - # - my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - - # Cell $mat{$i}{$j} equals the minimum of: - # - # - The cell immediately above plus 1 - # - The cell immediately to the left plus 1 - # - The cell diagonally above and to the left plus the cost - # - # We can either insert a new char, delete a char or - # substitute an existing char (with an associated cost) - # - $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, - $mat{$i}{$j-1} + 1, - $mat{$i-1}{$j-1} + $cost]); - } - } - - # Finally, the Levenshtein distance equals the rightmost bottom cell - # of the matrix - # - # Note that $mat{$x}{$y} denotes the distance between the substrings - # 1..$x and 1..$y - # - return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; -} - -sub load_classes { - my ($file, $ref) = @_; - print STDERR "Reading classes from $file...\n"; - open F, "<$file" or die "Can't read $file: $!"; - binmode(F, ":utf8") or die; - while() { - chomp; - my ($word, $class) = split /\s+/; -# print STDERR "'$word' -> $class\n"; - $ref->{$word} = $class; - } - close F; -} - diff --git 
a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index a6167010..b89937c1 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -3,18 +3,19 @@ all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.c clean: $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* +SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat -EXTRACT_WEIGHTS = $(SCRIPT_DIR)/extract_weights.pl -EXTRACT_GRAMMAR = $(SCRIPT_DIR)/extract_grammar.pl -SUPPLEMENT_WEIGHTS = $(SCRIPT_DIR)/supplement_weights_file.pl -EXTRACT_VOCAB = $(SCRIPT_DIR)/extract_vocab.pl +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl -CLASSIFY = $(SCRIPT_DIR)/classify.pl -MAKE_LEX_GRAMMAR = $(SCRIPT_DIR)/make_lex_grammar.pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl MODEL1 = $(TRAINING_DIR)/model1 -MERGE_CORPUS = $(SCRIPT_DIR)/merge_corpus.pl +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl orthonorm-dict.e: corpus.e $(EXTRACT_VOCAB) corpus.e > e.voc diff --git a/word-aligner/merge_corpus.pl b/word-aligner/merge_corpus.pl deleted file mode 100755 index 02827903..00000000 --- a/word-aligner/merge_corpus.pl +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/perl -w -use strict; -die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2; - -my ($a, $b) = @ARGV; -open A, "<$a" or die "Can't read $a: $!"; -open B, "<$b" or die "Can't read $a: $!"; - -while() { - chomp; - my $e = ; - die "Mismatched lines in $a and $b!" 
unless defined $e; - print "$_ ||| $e"; -} - -my $e = ; -die "Mismatched lines in $a and $b!" unless !defined $e; - diff --git a/word-aligner/supplement_weights_file.pl b/word-aligner/supplement_weights_file.pl deleted file mode 100755 index 76f668e2..00000000 --- a/word-aligner/supplement_weights_file.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my ($f_classes) = @ARGV; - -die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; - -print <) { - chomp; - my ($x, $cat) = split /\s+/; - $dcats{$cat} = 1; -} - -my @cats = sort keys %dcats; - -for (my $i=0; $i < scalar @cats; $i++) { - my $c1 = $cats[$i]; - for (my $j=0; $j < scalar @cats; $j++) { - my $c2 = $cats[$j]; - print "SP:${c1}_${c2} 0\n"; - } -} - diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl new file mode 100755 index 00000000..893c7b22 --- /dev/null +++ b/word-aligner/support/classify.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2; + +my ($class, $text) = @ARGV; +open C, "<$class" or die "Can't read $class: $!"; +open T, "<$text" or die "Can't read $text: $!"; + +my %dict = (); +my $cc = 0; +while() { + chomp; + my ($word, $cat) = split /\s+/; + die "'$word' '$cat'" unless (defined $word && defined $cat); + $dict{$word} = $cat; + $cc++; +} +close C; +print STDERR "Loaded classes for $cc words\n"; + +while() { + chomp; + my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; + print "@cats\n"; +} + diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl new file mode 100755 index 00000000..d7275ef5 --- /dev/null +++ b/word-aligner/support/extract_grammar.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; + +my $key = shift @ARGV; +die "Usage: $0 KEY\n" unless defined $key; + +while(<>) { + my ($k, @rest) = split / \|\|\| /; + print join(' ||| ', @rest) if ($k eq $key); +} + diff --git 
a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl new file mode 100755 index 00000000..070d4202 --- /dev/null +++ b/word-aligner/support/extract_vocab.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w +use strict; + +print STDERR "Extracting vocabulary...\n"; +my %dict = (); +my $wc = 0; +while(<>) { + chomp; + my @words = split /\s+/; + for my $word (@words) { $wc++; $dict{$word}++; } +} + +my $tc = 0; +for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { + print "$word\n"; + $tc++; +} + +print STDERR "$tc types / $wc tokens\n"; + diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl new file mode 100755 index 00000000..dfedd12e --- /dev/null +++ b/word-aligner/support/extract_weights.pl @@ -0,0 +1,17 @@ +#!/usr/bin/perl -w +use strict; +my %dict=(); +while(<>) { + chomp; + my ($dummy, $a, $b, $wts) = split / \|\|\| /; + my @weights = split /\s+/, $wts; + for my $w (@weights) { + my ($name, $val) = split /=/, $w; + unless ($dict{$name}) { + my $r = (0.5 - rand) / 5; + $r = sprintf ("%0.4f", $r); + print "$name $r\n"; + $dict{$name}= 1; + } + } +} diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl new file mode 100755 index 00000000..3988388d --- /dev/null +++ b/word-aligner/support/invert_grammar.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + my ($f, $e, $scores) = split / \|\|\| /; + print "$e ||| $f ||| $scores"; +} + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl new file mode 100755 index 00000000..bdb2752c --- /dev/null +++ b/word-aligner/support/make_lex_grammar.pl @@ -0,0 +1,339 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my $LIMIT_SIZE=30; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f 
$effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;

# --- word-aligner/support/make_lex_grammar.pl (continued) ---
# Generates a lexical translation grammar with sparse features from a
# parallel corpus, Model 1 tables (both directions), orthographic
# normalization maps, and word-class files.
# NOTE(review): every filehandle read below (<EF>, <M1>, <IM1>, <OE>, <OF>, <F>)
# was stripped to "while()" by text extraction; restored from the matching
# open/close/binmode statements — confirm against repository history.

my %eclass = ();
my %fclass = ();
load_classes($class_e, \%eclass);
load_classes($class_f, \%fclass);

our %cache;  # file-scoped; may be shared with code outside this excerpt
open EF, "<$effile" or die "Can't read $effile: $!";
open M1, "<$model1" or die "Can't read $model1: $!";
open IM1, "<$imodel1" or die "Can't read $imodel1: $!";
binmode(EF,":utf8");
binmode(M1,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");

# model1{f}{e} = 1 for every (f,e) pair retained from the direct Model 1 table.
my %model1;
print STDERR "Reading model1...\n";
my %sizes = ();
while(<M1>) {
  chomp;
  my ($f, $e, $lp) = split /\s+/;
  $model1{$f}->{$e} = 1;
  $sizes{$f}++;
}
close M1;

# Supplement the direct table with inverse-Model-1 pairs for source words
# whose translation lists are still below $LIMIT_SIZE (defined above).
my $inv_add = 0;
my %invm1;
print STDERR "Reading inverse model1...\n";
my %esizes=();
while(<IM1>) {
  chomp;
  my ($e, $f, $lp) = split /\s+/;
  $invm1{$e}->{$f} = 1;
  $esizes{$e}++;
  if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
    $model1{$f}->{$e} = 1;
    $sizes{$f}++;
    $inv_add++;
  }
}
close IM1;
print STDERR "Added $inv_add from inverse model1\n";

# Second pass over the direct table to symmetrically supplement the inverse.
open M1, "<$model1" or die "Can't read $model1: $!";
binmode(M1,":utf8");
my $dir_add = 0;
print STDERR "Reading model1 (again) for extra inverse translations...\n";
while(<M1>) {
  chomp;
  my ($f, $e, $lp) = split /\s+/;
  if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
    $invm1{$e}->{$f} = 1;
    $esizes{$e}++;
    $dir_add++;
  }
}
close M1;
print STDERR "Added $dir_add from model 1\n";
print STDERR "Generating grammars...\n";

# Orthographic normalization maps: surface form ||| normalized form.
open OE, "<$orthoe" or die "Can't read $orthoe: $!";
binmode(OE,":utf8");
my %oe_dict;
while(<OE>) {
  chomp;
  my ($a, $b) = split / \|\|\| /, $_;
  die "BAD: $_" unless defined $a && defined $b;
  $oe_dict{$a} = $b;
}
close OE;
open OF, "<$orthof" or die "Can't read $orthof: $!";
binmode(OF,":utf8");
my %of_dict;
while(<OF>) {
  chomp;
  my ($a, $b) = split / \|\|\| /, $_;
  die "BAD: $_" unless defined $a && defined $b;
  $of_dict{$a} = $b;
}
close OF;
$of_dict{''} = '';
$oe_dict{''} = '';

# Feature-template switches.
my $MIN_FEATURE_COUNT = 0;
my $ADD_PREFIX_ID = 0;
my $ADD_CLASS_CLASS = 1;
my $ADD_LEN = 1;
my $ADD_SIM = 1;
my $ADD_DICE = 1;
my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 0;
my $ADD_STEM_ID = 1;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;

my %fdict;    # fdict{f}{e} = co-occurrence count
my %fcounts;  # per-source-word counts
my %ecounts;  # per-target-word counts

my %sdict;    # stem-pair -> unique id (for S* features)

# Count co-occurrences over the "f ||| e" corpus.
while(<EF>) {
  chomp;
  my ($f, $e) = split /\s*\|\|\|\s*/;
  my @es = split /\s+/, $e;
  my @fs = split /\s+/, $f;
  for my $ew (@es){ $ecounts{$ew}++; }
  push @fs, '' if $ADD_NULL;
  for my $fw (@fs){ $fcounts{$fw}++; }
  for my $fw (@fs){
    for my $ew (@es){
      $fdict{$fw}->{$ew}++;
    }
  }
}

my $specials = 0;
my $fc = 1000000;   # running id for per-pair F features
my $sids = 1000000; # running id for stem-pair S features
for my $f (sort keys %fdict) {
  my $re = $fdict{$f};
  my $max;
  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
    my $efcount = $re->{$e};
    unless (defined $max) { $max = $efcount; }
    my $m1 = $model1{$f}->{$e};
    my $im1 = $invm1{$e}->{$f};
    my $is_good_pair = (defined $m1);
    my $is_inv_good_pair = (defined $im1);
    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
    my @feats;
    if ($efcount > $MIN_FEATURE_COUNT) {
      $fc++;
      push @feats, "F$fc=1";
    }
    if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
    my $oe = $oe_dict{$e};
    die "Can't find orthonorm form for $e" unless defined $oe;
    my $of = $of_dict{$f};
    die "Can't find orthonorm form for $f" unless defined $of;
    my $len_e = length($oe);
    my $len_f = length($of);
    push @feats, "Dice=$dice" if $ADD_DICE;
    if ($ADD_CLASS_CLASS) {
      my $ce = $eclass{$e} or die "E- no class for: $e";
      my $cf = $fclass{$f} or die "F- no class for: $f";
      push @feats, "C${cf}_${ce}=1";
    }
    my $is_null = undef;
    if ($ADD_NULL && $f eq '') {
      push @feats, "IsNull=1";
      $is_null = 1;
    }
    if ($ADD_LEN) {
      if (!$is_null) {
        my $dlen = abs($len_e - $len_f);
        push @feats, "DLen=$dlen";
      }
    }
    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
    my $both_non_numeric = (!$e_num && !$f_num);
    if ($ADD_STEM_ID) {
      # Crude stemming: take a 4-char prefix, lengthened for common
      # prefixes (al/re/co, trans/inter) so they don't collapse stems.
      my $el = 4;
      my $fl = 4;
      if ($oe =~ /^al|re|co/) { $el++; }
      if ($of =~ /^al|re|co/) { $fl++; }
      if ($oe =~ /^trans|inter/) { $el+=2; }
      if ($of =~ /^trans|inter/) { $fl+=2; }
      if ($fl > length($of)) { $fl = length($of); }
      if ($el > length($oe)) { $el = length($oe); }
      my $sf = substr $of, 0, $fl;
      my $se = substr $oe, 0, $el;
      my $id = $sdict{$sf}->{$se};
      if (!$id) {
        $sids++;
        $sdict{$sf}->{$se} = $sids;
        $id = $sids;
      }
      push @feats, "S$id=1";
    }
    if ($ADD_PREFIX_ID) {
      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
        my $pe = substr $oe, 0, 3;
        my $pf = substr $of, 0, 3;
        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
      }
    }
    if ($ADD_SIM) {
      # Length-normalized orthographic similarity; high similarity also
      # promotes the pair to "good" even if Model 1 didn't license it.
      my $ld = 0;
      my $eff = $len_e;
      if ($eff < $len_f) { $eff = $len_f; }
      if (!$is_null) {
        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
      }
      if ($ld > 1.5) { $is_good_pair = 1; }
      push @feats, "OrthoSim=$ld";
    }
    my $ident = ($e eq $f);
    if ($ident) { $is_good_pair = 1; }
    if ($ident && $ADD_ID) { push @feats, "Identical=$len_e"; }
    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
      $is_good_pair = 1;
      if ($ADD_111) {
        push @feats, "OneOneOne=1";
      }
    }
    if ($ADD_PUNC) {
      if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
        push @feats, "PuncMiss=1";
      }
    }
    my $is_special = ($is_good_pair && !(defined $m1));
    $specials++ if $is_special;
    print STDERR "$f -> $e\n" if $is_special;
    print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair;
    print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair;
  }
}
print STDERR "Added $specials special rules that were not in the M1 set\n";


# Classic dynamic-programming Levenshtein edit distance between two strings.
sub levenshtein
{
  my ($s1, $s2) = @_;
  my ($len1, $len2) = (length $s1, length $s2);

  # If one string is empty, the distance is the other string's length.
  return $len2 if ($len1 == 0);
  return $len1 if ($len2 == 0);

  my %mat;

  # Initialize the distance matrix: first row 0..$len1, first column
  # 0..$len2 (distance from the empty string), everything else 0.
  for (my $i = 0; $i <= $len1; ++$i)
  {
    for (my $j = 0; $j <= $len2; ++$j)
    {
      $mat{$i}{$j} = 0;
      $mat{0}{$j} = $j;
    }

    $mat{$i}{0} = $i;
  }

  # Char-by-char processing ahead; split the strings into char arrays.
  my @ar1 = split(//, $s1);
  my @ar2 = split(//, $s2);

  for (my $i = 1; $i <= $len1; ++$i)
  {
    for (my $j = 1; $j <= $len2; ++$j)
    {
      # Substitution cost: 0 when the chars match, else 1.
      my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

      # Minimum of deletion, insertion, and substitution.
      $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
                          $mat{$i}{$j-1} + 1,
                          $mat{$i-1}{$j-1} + $cost]);
    }
  }

  # The distance between the full strings is the bottom-right cell.
  return $mat{$len1}{$len2};
}


# Minimal element of a list (passed by reference).
sub min
{
  my @list = @{$_[0]};
  my $min = $list[0];

  foreach my $i (@list)
  {
    $min = $i if ($i < $min);
  }

  return $min;
}

# Load a "word class" file into the hashref: each line is "word class".
sub load_classes {
  my ($file, $ref) = @_;
  print STDERR "Reading classes from $file...\n";
  open F, "<$file" or die "Can't read $file: $!";
  binmode(F, ":utf8") or die;
  while(<F>) {
    chomp;
    my ($word, $class) = split /\s+/;
    # print STDERR "'$word' -> $class\n";
    $ref->{$word} = $class;
  }
  close F;
}

# (patch continues: diff --git
a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl (new file mode 100755, index 00000000..02827903)

# --- word-aligner/support/merge_corpus.pl ---
#!/usr/bin/perl -w
use strict;

# Zip two parallel corpus files line-by-line into "f ||| e" format on stdout,
# dying if the files have different numbers of lines.
# NOTE(review): the <A>/<B> filehandle reads were stripped by text extraction
# and have been restored from the surrounding open statements.
die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2;

my ($a, $b) = @ARGV;
open A, "<$a" or die "Can't read $a: $!";
open B, "<$b" or die "Can't read $b: $!";  # bug fix: error message named $a, not $b

while(<A>) {
  chomp;
  my $e = <B>;
  die "Mismatched lines in $a and $b!" unless defined $e;
  print "$_ ||| $e";  # $e keeps its trailing newline, terminating the output line
}

# $b must be exhausted too, or it had more lines than $a.
my $e = <B>;
die "Mismatched lines in $a and $b!" unless !defined $e;

# --- word-aligner/support/supplement_weights_file.pl (new file mode 100755, index 00000000..7f804b90) ---
#!/usr/bin/perl -w
use strict;

# Emit initial (zero) weights for the word aligner's sparse feature templates,
# derived from the source-language word-class file.
my $ADD_FCLASS_JUMP = 1;
my $ADD_MODEL2_BINARY = 0;
my $ADD_FC_RELPOS = 1;

my ($f_classes) = @ARGV;

die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;

# NOTE(review): the original printed a here-doc of base feature weights here,
# then opened and read the class file; that text was swallowed by extraction
# (everything between "print <" and the next ">"). Restore the here-doc
# contents from repository history — TODO confirm.
print <<'EOT';
EOT

my %dcats = ();
open FC, "<$f_classes" or die "Can't read $f_classes: $!";
while(<FC>) {
  chomp;
  my ($x, $cat) = split /\s+/;
  $dcats{$cat} = 1;
}
close FC;

my @cats = sort keys %dcats;

# One SP (source-pattern) weight per ordered class pair.
my $added = 0;
for (my $i=0; $i < scalar @cats; $i++) {
  my $c1 = $cats[$i];
  for (my $j=0; $j < scalar @cats; $j++) {
    my $c2 = $cats[$j];
    print "SP:${c1}_${c2} 0\n";
    $added++;
  }
}

# Per-sentence-length jump features (and optionally Model 2 binary features).
for (my $ss=1; $ss < 100; $ss++) {
  if ($ADD_FCLASS_JUMP) {
    for (my $i=0; $i < scalar @cats; $i++) {
      my $cat = $cats[$i];
      for (my $j = -$ss; $j <= $ss; $j++) {
        print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n";
        $added++;
      }
    }
  }
  if ($ADD_MODEL2_BINARY) {
    # M2_FL:8_SI:3_TI:2=1
    for (my $i = 0; $i < $ss; $i++) {
      for (my $j = 0; $j < 100; $j++) {
        print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n";
        $added++;
      }
    }
  }
}
if ($ADD_FC_RELPOS) {
  #RelPos_FC:11
  for (my $i=0; $i < scalar @cats; $i++) {
    my $cat = $cats[$i];
    print "RelPos_FC:$cat 0\n";
    $added++;
  }
}

print STDERR "Added $added weights\n";
# -- cgit v1.2.3