diff options
author | Chris Dyer <redpony@gmail.com> | 2010-02-01 17:38:39 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2010-02-01 17:38:39 -0500 |
commit | c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch) | |
tree | 3bc1b02c39927a810862136534d5a0e35d7ed4fc | |
parent | da222df300e4f87ad185a7decbf119ad56aa34e0 (diff) |
word aligner cleanup, new features
17 files changed, 201 insertions, 64 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 34499398..b4381dda 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -16,6 +16,7 @@ void register_feature_functions() { global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>); global_ff_registry->Register("MarkovJump", new FFFactory<MarkovJump>); + global_ff_registry->Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>); global_ff_registry->Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>); global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>); global_ff_registry->Register("AlignerResults", new FFFactory<AlignerResults>); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index e3fa91d4..fb90df62 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -1,5 +1,6 @@ #include "ff_wordalign.h" +#include <set> #include <sstream> #include <string> #include <cmath> @@ -12,20 +13,20 @@ #include "tdict.h" // Blunsom hack #include "filelib.h" // Blunsom hack -static const size_t MAX_SENTENCE_SIZE = 100; +static const int MAX_SENTENCE_SIZE = 100; using namespace std; Model2BinaryFeatures::Model2BinaryFeatures(const string& param) : fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { - for (int i = 0; i < MAX_SENTENCE_SIZE; ++i) { - for (int j = 0; j < MAX_SENTENCE_SIZE; ++j) { + for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { + for (int j = 0; j < i; ++j) { for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) { int& val = fids_[i][j][k]; val = -1; if (j < i) { ostringstream os; - os << "M2_" << i << '_' << j << ':' << k; + os << "M2_FL:" << i << "_SI:" << j << "_TI:" << k; val = FD::Convert(os.str()); } } @@ -56,8 +57,24 @@ RelativeSentencePosition::RelativeSentencePosition(const string& param) : if (!param.empty()) { cerr << " Loading word classes from " << param << endl; condition_on_fclass_ = true; - template_ = "RSP:FC000"; - assert(!"not implemented"); + ReadFile rf(param); + istream& in = *rf.stream(); + set<WordID> classes; + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + vector<WordID> v; + TD::ConvertSentence(line, &v); + pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); + for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { + ostringstream os; + os << "RelPos_FC:" << TD::Convert(*i); + fids_[*i] = FD::Convert(os.str()); + } + } } else { condition_on_fclass_ = false; } @@ -79,17 +96,22 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme static_cast<double>(edge.prev_i_) / smeta.GetTargetLength()); features->set_value(fid_, val); if (condition_on_fclass_) { - assert(!"not implemented"); + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][edge.i_]; + const int fid = fids_.find(cur_fclass)->second; + features->set_value(fid, val); } // cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; } MarkovJumpFClass::MarkovJumpFClass(const string& param) : - FeatureFunction(1) { + FeatureFunction(1), + fids_(MAX_SENTENCE_SIZE) { cerr << " MarkovJumpFClass" << endl; cerr << "Reading source POS tags from " << param << endl; ReadFile rf(param); istream& in = *rf.stream(); + set<WordID> classes; while(in) { string line; getline(in, line); @@ -97,8 +119,66 @@ MarkovJumpFClass::MarkovJumpFClass(const string& param) : vector<WordID> v; TD::ConvertSentence(line, &v); pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); } cerr << " (" << pos_.size() << " lines)\n"; + cerr << " Classes: " << classes.size() << endl; + for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { + map<WordID, map<int, int> >& cfids = fids_[ss]; + for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { + map<int, int> &fids = cfids[*i]; + for (int j = -ss; j <= ss; ++j) { + ostringstream os; + os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; + fids[j] = FD::Convert(os.str()); + } + } + } +} + +void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, + SparseVector<double>* features) const { + const int jumpsize = cur_src_pos - prev_src_pos; + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; + const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; + features->set_value(fid, 1.0); +} + +void MarkovJumpFClass::FinalTraversalFeatures(const void* context, + SparseVector<double>* features) const { + int left_index = *static_cast<const unsigned char*>(context); +// int right_index = cur_flen; + // TODO +} + +void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_states, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* state) const { + unsigned char& dpstate = *((unsigned char*)state); + if (edge.Arity() == 0) { + dpstate = static_cast<unsigned int>(edge.i_); + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + int left_index = *((unsigned char*)ant_states[0]); + int right_index = *((unsigned char*)ant_states[1]); + if (right_index == -1) + dpstate = static_cast<unsigned int>(left_index); + else + dpstate = static_cast<unsigned int>(right_index); +// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; +// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl; +// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; +// features->set_value(fid, 1.0); + FireFeature(smeta, left_index, right_index, features); + } } MarkovJump::MarkovJump(const string& param) : diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index c5404887..688750de 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -19,7 +19,8 @@ class RelativeSentencePosition : public FeatureFunction { private: const int fid_; bool condition_on_fclass_; - std::string template_; + std::vector<std::vector<WordID> > pos_; + std::map<WordID, int> fids_; // fclass -> fid }; class Model2BinaryFeatures : public FeatureFunction { @@ -66,10 +67,14 @@ class MarkovJumpFClass : public FeatureFunction { SparseVector<double>* features, SparseVector<double>* estimated_features, void* context) const; - private: - void FireFeature(WordID src, - WordID trg, + + void FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, SparseVector<double>* features) const; + + private: + std::vector<std::map<WordID, std::map<int, int> > > fids_; // flen -> fclass -> jumpsize -> fid std::vector<std::vector<WordID> > pos_; }; diff --git a/tests/system_tests/unsup-align/cdec.ini b/tests/system_tests/unsup-align/cdec.ini index 4016a201..37a37214 100644 --- a/tests/system_tests/unsup-align/cdec.ini +++ b/tests/system_tests/unsup-align/cdec.ini @@ -1,6 +1,6 @@ aligner=true grammar=unsup-align.lex-grammar -cubepruning_pop_limit=1000000 +intersection_strategy=full formalism=lexcrf feature_function=RelativeSentencePosition feature_function=MarkovJump diff --git a/tests/system_tests/unsup-align/gold.statistics b/tests/system_tests/unsup-align/gold.statistics index 2f37c2db..975c9d4e 100644 --- a/tests/system_tests/unsup-align/gold.statistics +++ b/tests/system_tests/unsup-align/gold.statistics @@ -90,7 +90,7 @@ constr_paths 4 +lm_nodes 7 +lm_edges 14 +lm_paths 16 -+lm_trans end thet ++lm_trans thet thet constr_nodes 7 constr_edges 10 constr_paths 4 diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 7643d4e5..03122df9 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -29,10 +29,12 @@ my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; +my $mem_buffers = 20; my $RESTART_IF_NECESSARY; GetOptions("cdec=s" => \$DECODER, "distributed" => \$DISTRIBUTED, "sigma_squared=f" => \$sigsq, + "lbfgs_memory_buffers=i" => \$mem_buffers, "max_iteration=i" => \$max_iteration, "means=s" => \$means_file, "optimizer=s" => \$OALG, @@ -133,7 +135,7 @@ while ($iter < $max_iteration) { my $start = time; my $next_iter = $iter + 1; my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M 50 $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; + my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; my $cmd = ""; if ($parallel) { $cmd = $pcmd; } @@ -183,6 +185,8 @@ Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init --means FILE if you want means other than 0 --sigma_squared S variance on prior --pmem MEM Memory required for decoder + --lbfgs_memory_buffers Number of buffers to use + with LBFGS optimizer EOT } diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index d203fc53..7eec0e42 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,15 +6,20 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, "jobs=i" => \$nodes, - "pmem=s" => \$pmem + "pmem=s" => \$pmem, + "mkcls=s" => \$mkcls, ) or usage(); usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + my $in_file = shift @ARGV; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -22,13 +27,13 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; my @stages = qw(nopos relpos markov); my @directions = qw(f-e e-f); -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $corpus = 'c'; my $cwd = getcwd(); @@ -75,7 +80,7 @@ NCLASSES = $num_classes TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary -PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5 +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 export @@ -95,12 +100,16 @@ clean: EOT close TOPLEVEL; +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + sub make_stage { my ($stage, $direction, $prev_stage) = @_; my $stage_dir = "$align_dir/$stage-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; + my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; open CDEC, ">$stage_dir/cdec.ini" or die; print CDEC <<EOT; formalism=lexcrf @@ -108,10 +117,11 @@ intersection_strategy=full grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz EOT if ($stage =~ /relpos/) { - print CDEC "feature_function=RelativeSentencePosition\n"; + print CDEC "$RELPOS\n"; } elsif ($stage =~ /markov/) { - print CDEC "feature_function=RelativeSentencePosition\n"; + print CDEC "$RELPOS\n"; print CDEC "feature_function=MarkovJump\n"; + print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; } close CDEC; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index a6167010..b89937c1 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -3,18 +3,19 @@ all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.c clean: $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* +SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat -EXTRACT_WEIGHTS = $(SCRIPT_DIR)/extract_weights.pl -EXTRACT_GRAMMAR = $(SCRIPT_DIR)/extract_grammar.pl -SUPPLEMENT_WEIGHTS = $(SCRIPT_DIR)/supplement_weights_file.pl -EXTRACT_VOCAB = $(SCRIPT_DIR)/extract_vocab.pl +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl -CLASSIFY = $(SCRIPT_DIR)/classify.pl -MAKE_LEX_GRAMMAR = $(SCRIPT_DIR)/make_lex_grammar.pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl MODEL1 = $(TRAINING_DIR)/model1 -MERGE_CORPUS = $(SCRIPT_DIR)/merge_corpus.pl +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl orthonorm-dict.e: corpus.e $(EXTRACT_VOCAB) corpus.e > e.voc diff --git a/word-aligner/supplement_weights_file.pl b/word-aligner/supplement_weights_file.pl deleted file mode 100755 index 76f668e2..00000000 --- a/word-aligner/supplement_weights_file.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my ($f_classes) = @ARGV; - -die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; - -print <<EOT; -MarkovJump 0 -RelativeSentencePosition 0 -EOT - -# ! 8 -# " 11 -# 's 18 - -my %dcats = (); -$dcats{'BOS'} = 1; -$dcats{'EOS'} = 1; - -open FC, "<$f_classes" or die; -while(<FC>) { - chomp; - my ($x, $cat) = split /\s+/; - $dcats{$cat} = 1; -} - -my @cats = sort keys %dcats; - -for (my $i=0; $i < scalar @cats; $i++) { - my $c1 = $cats[$i]; - for (my $j=0; $j < scalar @cats; $j++) { - my $c2 = $cats[$j]; - print "SP:${c1}_${c2} 0\n"; - } -} - diff --git a/word-aligner/classify.pl b/word-aligner/support/classify.pl index 893c7b22..893c7b22 100755 --- a/word-aligner/classify.pl +++ b/word-aligner/support/classify.pl diff --git a/word-aligner/extract_grammar.pl b/word-aligner/support/extract_grammar.pl index d7275ef5..d7275ef5 100755 --- a/word-aligner/extract_grammar.pl +++ b/word-aligner/support/extract_grammar.pl diff --git a/word-aligner/extract_vocab.pl b/word-aligner/support/extract_vocab.pl index 070d4202..070d4202 100755 --- a/word-aligner/extract_vocab.pl +++ b/word-aligner/support/extract_vocab.pl diff --git a/word-aligner/extract_weights.pl b/word-aligner/support/extract_weights.pl index dfedd12e..dfedd12e 100755 --- a/word-aligner/extract_weights.pl +++ b/word-aligner/support/extract_weights.pl diff --git a/word-aligner/invert_grammar.pl b/word-aligner/support/invert_grammar.pl index 3988388d..3988388d 100755 --- a/word-aligner/invert_grammar.pl +++ b/word-aligner/support/invert_grammar.pl diff --git a/word-aligner/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index bdb2752c..bdb2752c 100755 --- a/word-aligner/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl diff --git a/word-aligner/merge_corpus.pl b/word-aligner/support/merge_corpus.pl index 02827903..02827903 100755 --- a/word-aligner/merge_corpus.pl +++ b/word-aligner/support/merge_corpus.pl diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl new file mode 100755 index 00000000..7f804b90 --- /dev/null +++ b/word-aligner/support/supplement_weights_file.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +use strict; + +my $ADD_FCLASS_JUMP = 1; +my $ADD_MODEL2_BINARY = 0; +my $ADD_FC_RELPOS = 1; + +my ($f_classes) = @ARGV; + +die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; + +print <<EOT; +MarkovJump 0 +RelativeSentencePosition 0 +EOT + +# ! 8 +# " 11 +# 's 18 + +my %dcats = (); +$dcats{'BOS'} = 1; +$dcats{'EOS'} = 1; + +open FC, "<$f_classes" or die; +while(<FC>) { + chomp; + my ($x, $cat) = split /\s+/; + $dcats{$cat} = 1; +} + +my @cats = sort keys %dcats; + +my $added = 0; +for (my $i=0; $i < scalar @cats; $i++) { + my $c1 = $cats[$i]; + for (my $j=0; $j < scalar @cats; $j++) { + my $c2 = $cats[$j]; + print "SP:${c1}_${c2} 0\n"; + $added++; + } +} + +for (my $ss=1; $ss < 100; $ss++) { + if ($ADD_FCLASS_JUMP) { + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + for (my $j = -$ss; $j <= $ss; $j++) { + print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n"; + $added++; + } + } + } + if ($ADD_MODEL2_BINARY) { + # M2_FL:8_SI:3_TI:2=1 + for (my $i = 0; $i < $ss; $i++) { + for (my $j = 0; $j < 100; $j++) { + print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n"; + $added++; + } + } + } +} +if ($ADD_FC_RELPOS) { + #RelPos_FC:11 + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + print "RelPos_FC:$cat 0\n"; + $added++; + } +} + +print STDERR "Added $added weights\n"; |