324 files changed, 16184 insertions, 5472 deletions
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,48 @@
+mira/kbest_mira
+utils/m_test
+sa-extract/calignment.c
+sa-extract/calignment.so
+sa-extract/cdat.c
+sa-extract/cdat.so
+sa-extract/cfloatlist.c
+sa-extract/cfloatlist.so
+sa-extract/cintlist.c
+sa-extract/cintlist.so
+sa-extract/clex.c
+sa-extract/clex.so
+sa-extract/cn.pyc
+sa-extract/context_model.pyc
+sa-extract/cstrmap.c
+sa-extract/cstrmap.so
+sa-extract/csuf.c
+sa-extract/csuf.so
+sa-extract/cveb.c
+sa-extract/cveb.so
+sa-extract/lcp.c
+sa-extract/lcp.so
+sa-extract/log.pyc
+sa-extract/manager.pyc
+sa-extract/model.pyc
+sa-extract/monitor.pyc
+sa-extract/precomputation.c
+sa-extract/precomputation.so
+sa-extract/rule.c
+sa-extract/rule.so
+sa-extract/rulefactory.c
+sa-extract/rulefactory.so
+sa-extract/sgml.pyc
+sa-extract/sym.c
+sa-extract/sym.so
+training/lbl_model
+training/mpi_flex_optimize
+training/test_ngram
+utils/dict_test
+utils/logval_test
+utils/mfcr_test
+utils/phmt
+utils/small_vector_test
+utils/ts
+utils/weights_test
 pro-train/.deps
 pro-train/mr_pro_map
 pro-train/mr_pro_reduce
@@ -12,6 +57,7 @@ training/mpi_extract_reachable
 klm/lm/build_binary
 extools/extractor_monolingual
 gi/pf/.deps
+gi/pf/learn_cfg
 gi/pf/brat
 gi/pf/cbgi
 gi/pf/dpnaive
@@ -38,8 +84,8 @@ utils/.deps/
 utils/libutils.a
 *swp
 *.o
-vest/sentserver
-vest/sentclient
+dpmert/sentserver
+dpmert/sentclient
 gi/pyp-topics/src/contexts_lexer.cc
 config.guess
 config.sub
@@ -61,12 +107,12 @@ training/mr_em_map_adapter
 training/mr_reduce_to_weights
 training/optimize_test
 training/plftools
-vest/fast_score
-vest/lo_test
-vest/mr_vest_map
-vest/mr_vest_reduce
-vest/scorer_test
-vest/union_forests
+dpmert/fast_score
+dpmert/lo_test
+dpmert/mr_dpmert_map
+dpmert/mr_dpmert_reduce
+dpmert/scorer_test
+dpmert/union_forests
 Makefile
 Makefile.in
 aclocal.m4
@@ -99,12 +145,14 @@ training/Makefile.in
 training/*.o
 training/grammar_convert
 training/model1
-vest/.deps/
-vest/Makefile
-vest/Makefile.in
-vest/mr_vest_generate_mapper_input
-vest/*.o
+dpmert/.deps/
+dpmert/Makefile
+dpmert/Makefile.in
+dpmert/mr_dpmert_generate_mapper_input
+dpmert/*.o
 decoder/logval_test
+dtrain/dtrain
+dtrain/*.o
 extools/build_lexical_translation
 extools/filter_grammar
 extools/score_grammar
@@ -124,7 +172,6 @@ m4/ltoptions.m4
 m4/ltsugar.m4
 m4/ltversion.m4
 m4/lt~obsolete.m4
-vest/mbr_kbest
 extools/featurize_grammar
 extools/filter_score_grammar
 gi/posterior-regularisation/prjava/build/
@@ -143,19 +190,10 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar
 *.ps
 *.toc
 *~
-*/.deps/
-mira/kbest_mira
+gi/pf/align-lexonly
+gi/pf/align-lexonly-pyp
+gi/pf/condnaive
+mteval/scorer_test
 phrasinator/gibbs_train_plm
-training/augment_grammar
-training/mpi_batch_optimize
-training/mpi_em_optimize
-training/test_ngram
-utils/ts
-training/compute_cllh
-dtrain/dtrain
-weights.gz
-dtrain/test/eval/
 phrasinator/gibbs_train_plm_notables
-training/mpi_flex_optimize
-utils/phmt
-dtrain/thesis
+.*
diff --git a/Makefile.am b/Makefile.am
index 6b2ec7b6..e49554d0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
 # warning - the subdirectories in the following list should
 # be kept in topologically sorted order. Also, DO NOT introduce
 # cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dtrain vest pro-train extools gi/pf gi/markov_al
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dtrain dpmert pro-train extools gi/pf gi/markov_al
 
 #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
diff --git a/configure.ac b/configure.ac
index ec519067..dee28083 100644
--- a/configure.ac
+++ b/configure.ac
@@ -9,7 +9,7 @@ esac
 AC_PROG_CC
 AC_PROG_CXX
 AC_LANG_CPLUSPLUS
-BOOST_REQUIRE
+BOOST_REQUIRE([1.44])
 BOOST_PROGRAM_OPTIONS
 AC_ARG_ENABLE(mpi,
             [  --enable-mpi   Build MPI binaries, assumes mpi.h is present ],
@@ -38,7 +38,7 @@ then
   CPPFLAGS="$CPPFLAGS -I${with_cmph}/include"
 
   AC_CHECK_HEADER(cmph.h,
-                 [AC_DEFINE([HAVE_CMPH], [], [flag for cmph perfect hashing library])],
+                 [AC_DEFINE([HAVE_CMPH], [1], [flag for cmph perfect hashing library])],
                  [AC_MSG_ERROR([Cannot find cmph library!])])
 
   LDFLAGS="$LDFLAGS -L${with_cmph}/lib"
@@ -46,6 +46,25 @@ then
   AM_CONDITIONAL([HAVE_CMPH], true)
 fi
 
+AM_CONDITIONAL([HAVE_EIGEN], false)
+AC_ARG_WITH(eigen,
+            [AC_HELP_STRING([--with-eigen=PATH], [(optional) path to Eigen linear algebra library])],
+            [with_eigen=$withval],
+            [with_eigen=no]
+           )
+
+if test "x$with_eigen" != 'xno'
+then
+  SAVE_CPPFLAGS="$CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS -I${with_eigen}"
+
+  AC_CHECK_HEADER(Eigen/Dense,
+                 [AC_DEFINE([HAVE_EIGEN], [1], [flag for Eigen linear algebra library])],
+                 [AC_MSG_ERROR([Cannot find Eigen!])])
+
+  AM_CONDITIONAL([HAVE_EIGEN], true)
+fi
+
 #BOOST_THREADS
 CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
 LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
@@ -53,11 +72,8 @@ LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
 LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
 # $BOOST_THREAD_LIBS"
 
-AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
-   [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
-
 AC_CHECK_HEADER(google/dense_hash_map,
-   [AC_DEFINE([HAVE_SPARSEHASH], [], [flag for google::dense_hash_map])])
+   [AC_DEFINE([HAVE_SPARSEHASH], [1], [flag for google::dense_hash_map])])
 
 AC_PROG_INSTALL
 GTEST_LIB_CHECK(1.0)
@@ -113,4 +129,4 @@ then
   AM_CONDITIONAL([GLC], true)
 fi
 
-AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile dtrain/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
+AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile dtrain/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
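Note on the configure.ac hunk above: it adds an optional Eigen dependency. --with-eigen=PATH puts PATH on the preprocessor include path, checks that the Eigen/Dense header is reachable, and defines HAVE_EIGEN both as a preprocessor symbol and as an automake conditional. Assuming Eigen is unpacked under /opt/eigen-3.0 (a hypothetical path), it would be enabled roughly like:

    ./configure --with-eigen=/opt/eigen-3.0

If Eigen/Dense is not found under the given path, configure aborts with "Cannot find Eigen!" rather than silently building without it.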
diff --git a/decoder/1dev.ur b/decoder/1dev.ur
deleted file mode 100755
index adeaa101..00000000
--- a/decoder/1dev.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly .
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 30eaf04d..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -63,6 +63,7 @@ libcdec_a_SOURCES = \
   ff.cc \
   ff_rules.cc \
   ff_wordset.cc \
+  ff_context.cc \
   ff_charset.cc \
   ff_lm.cc \
   ff_klm.cc \
@@ -75,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \
diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README
deleted file mode 100755
index 7e116a62..00000000
--- a/decoder/apply_fsa_models.README
+++ /dev/null
@@ -1,21 +0,0 @@
-trie root and trie lhs2[lhs-nodeid] -> trie node
-
-trie node edges (adj) - list of w,dest,p. dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0). we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter
-
-we intersect by iterating over adj and scoring w/ fsa. TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable.
-
-predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0). such backpointer=0 items have a queue of prediction-originating items.
-
-reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first. b-next (right state) index in lazy index.
-
-perhaps predictors need to register the # of items it has already mated with. (b-next index)
-
-comb-like (cube) t-next (position in trie node edge list), b-next? or just check chart and don't redup. depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest.
-
-types of chart items:
-
-A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once)
-
-A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use. note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list?
-
-A->a,? - list of all known (b,inside prob) such that A[a,b]. we may also choose to represent this as A->.*,a,a.
diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h
index 6561c70c..6561c70c 100755..100644
--- a/decoder/apply_fsa_models.h
+++ b/decoder/apply_fsa_models.h
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 40fd27e4..9ba59d1b 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -270,7 +270,8 @@ public:
       const Hypergraph::Edge& edge = in.edges_[in_edges[i]];
       const JVector j(edge.tail_nodes_.size(), 0);
       cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal));
-      assert(unique_cands.insert(cand.back()).second);  // these should all be unique!
+      bool is_new = unique_cands.insert(cand.back()).second;
+      assert(is_new);  // these should all be unique!
     }
     // cerr << "  making heap of " << cand.size() << " candidates\n";
     make_heap(cand.begin(), cand.end(), HeapCandCompare());
@@ -378,7 +379,8 @@ public:
       pop_heap(cand.begin(), cand.end(), HeapCandCompare());
       Candidate* item = cand.back();
       cand.pop_back();
-      assert(unique_accepted.insert(item).second);  // these should all be unique!
+      bool is_new = unique_accepted.insert(item).second;
+      assert(is_new);  // these should all be unique!
       // cerr << "POPPED: " << *item << endl;
 
       PushSuccFast2(*item, is_goal, &cand, &unique_accepted);
@@ -419,7 +421,8 @@ public:
       Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal);
       cand.push_back(new_cand);
       push_heap(cand.begin(), cand.end(), HeapCandCompare());
-      assert(cs->insert(new_cand).second);  // insert into uniqueness set, sanity check
+      bool is_new = cs->insert(new_cand).second;
+      assert(is_new);  // insert into uniqueness set, sanity check
     }
   }
 }
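The three apply_models.cc hunks above all make the same mechanical fix: the set insertion used to happen inside assert(), so a build with -DNDEBUG (where assert expands to nothing) would silently drop the insertion itself. A minimal self-contained sketch of the pattern, using std::set purely for illustration:

    #include <cassert>
    #include <set>

    int main() {
      std::set<int> seen;
      // Broken under -DNDEBUG: the insert() call is compiled out with the assert:
      //   assert(seen.insert(42).second);
      // Safe: the side effect is unconditional; only the check disappears in release builds:
      bool is_new = seen.insert(42).second;
      assert(is_new);
      return 0;
    }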
// cerr << "POPPED: " << *item << endl; PushSuccFast2(*item, is_goal, &cand, &unique_accepted); @@ -419,7 +421,8 @@ public: Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); cand.push_back(new_cand); push_heap(cand.begin(), cand.end(), HeapCandCompare()); - assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check + bool is_new = cs->insert(new_cand).second; + assert(is_new); // insert into uniqueness set, sanity check } } } diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini deleted file mode 100755 index f9b15420..00000000 --- a/decoder/cdec-gz.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar.gz -weights=weights.tune.nolm diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini deleted file mode 100755 index 5ebab747..00000000 --- a/decoder/cdec-nolm-tuned.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar -weights=weights.tune.nolm diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 4ce5749e..b516c386 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -1,6 +1,7 @@ #include <boost/shared_ptr.hpp> #include "ff.h" +#include "ff_context.h" #include "ff_spans.h" #include "ff_lm.h" #include "ff_klm.h" @@ -42,6 +43,7 @@ void register_feature_functions() { #endif ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>()); ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>()); + ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>()); ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>()); ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>); ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>); diff --git a/decoder/cfg.cc b/decoder/cfg.cc index cd7e66e9..cd7e66e9 100755..100644 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc diff --git a/decoder/cfg.h b/decoder/cfg.h index 8cb29bb9..8cb29bb9 100755..100644 --- a/decoder/cfg.h +++ b/decoder/cfg.h diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index ae06f8bf..ae06f8bf 100755..100644 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h index 2f40d483..2f40d483 100755..100644 --- a/decoder/cfg_format.h +++ b/decoder/cfg_format.h diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h index 7b59c05c..7b59c05c 100755..100644 --- a/decoder/cfg_options.h +++ b/decoder/cfg_options.h diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc index c61f9f2c..c61f9f2c 100755..100644 --- a/decoder/cfg_test.cc +++ b/decoder/cfg_test.cc diff --git a/decoder/decode.sh b/decoder/decode.sh deleted file mode 100755 index 677e64ad..00000000 --- a/decoder/decode.sh +++ /dev/null @@ -1,10 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -decode() { -if [ "$lm" ] ; then - lmargs0=-F - lmargs1="LanguageModel lm.gz -n LM" -fi -set -x -$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@" -set +x -} diff --git a/decoder/decoder.cc b/decoder/decoder.cc index b93925d1..53c47d21 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -408,6 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int 
argc, char** argv, istream ("show_partition,z", "Compute and show the partition (inside score)") ("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation") ("show_cfg_search_space", "Show the search space as a CFG") + ("show_target_graph", "Output the target hypergraph") ("coarse_to_fine_beam_prune", po::value<double>(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)") ("ctf_beam_widen", po::value<double>()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found") ("ctf_num_widenings", po::value<int>()->default_value(2), "Widen coarse beam this many times before backing off to full parse") @@ -815,6 +816,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { abort(); } + if (conf.count("show_target_graph")) + HypergraphIO::WriteTarget(forest); + for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; const vector<weight_t>& cur_weights = *rp.weight_vector; diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh deleted file mode 100755 index b3ddeb18..00000000 --- a/decoder/do.tests.sh +++ /dev/null @@ -1 +0,0 @@ -for f in *_test; do ./$f; done diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index 48e94a31..b7af801a 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -329,7 +329,10 @@ class EarleyComposerImpl { forest->ReserveNodes(kMAX_NODES); assert(sit != g.end()); Edge* init = new Edge(start_cat_, &sit->second, q_0_); - assert(IncorporateNewEdge(init)); + if (!IncorporateNewEdge(init)) { + cerr << "Failed to create initial edge!\n"; + abort(); + } while (exp_agenda.HasWork() || agenda.HasWork()) { while(exp_agenda.HasWork()) { const Edge* edge = exp_agenda.Next(); diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc new file mode 100644 index 00000000..19f9a413 --- /dev/null +++ b/decoder/ff_context.cc @@ -0,0 +1,99 @@ +#include "ff_context.h" + +#include <sstream> +#include <cassert> +#include <cmath> + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" + +using namespace std; + +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + +RuleContextFeatures::RuleContextFeatures(const std::string& param) { + kSOS = TD::Convert("<s>"); + kEOS = TD::Convert("</s>"); + + // TODO param lets you pass in a string from the cdec.ini file +} + +void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) { + const Lattice& sl = smeta.GetSourceLattice(); + current_input.resize(sl.size()); + for (unsigned i = 0; i < sl.size(); ++i) { + if (sl[i].size() != 1) { + cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl; + abort(); + } + current_input[i] = sl[i][0].label; + } +} + +void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const { + const TRule& rule = *edge.rule_; + + if (rule.Arity() != 0 || // arity = 0, no nonterminals + rule.e_.size() != 1) return; // size = 1, predicted label is a single token + + + // you can see the current label "for free" + const WordID cur_label = rule.e_[0]; + // (if 
you want to see more labels, you have to be very careful, and muck + // about with contexts and ant_contexts) + + // but... you can look at as much of the source as you want! + const int from_src_index = edge.i_; // start of the span in the input being labeled + const int to_src_index = edge.j_; // end of the span in the input + // (note: in the case of tagging the size of the spans being labeled will + // always be 1, but in other formalisms, you can have bigger spans.) + + // this is the current token being labeled: + const WordID cur_input = current_input[from_src_index]; + + // let's get the previous token in the input (may be to the left of the start + // of the sentence!) + WordID prev_input = kSOS; + if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; } + // let's get the next token (may be to the left of the start of the sentence!) + WordID next_input = kEOS; + if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; } + + // now, build a feature string + ostringstream os; + // TD::Convert converts from the internal integer representation of a token + // to the actual token + os << "C1:" << TD::Convert(prev_input) << '_' + << TD::Convert(cur_input) << '|' << TD::Convert(cur_label); + // C1 is just to prevent a name clash + + // pick a value + double fval = 1.0; // can be any real value + + // add it to the feature vector FD::Convert converts the feature string to a + // feature int, Escape makes sure the feature string doesn't have any bad + // symbols that could confuse a parser somewhere + features->add_value(FD::Convert(Escape(os.str())), fval); + // that's it! + + // create more features if you like... +} + diff --git a/decoder/ff_context.h b/decoder/ff_context.h new file mode 100644 index 00000000..0d22b027 --- /dev/null +++ b/decoder/ff_context.h @@ -0,0 +1,23 @@ +#ifndef _FF_CONTEXT_H_ +#define _FF_CONTEXT_H_ + +#include <vector> +#include "ff.h" + +class RuleContextFeatures : public FeatureFunction { + public: + RuleContextFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + std::vector<WordID> current_input; + WordID kSOS, kEOS; +}; + +#endif diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict<float> freq_dict_; set<WordID> bad_words_; }; diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index 04dd1906..d6d79f5e 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -57,6 +57,39 @@ namespace { } } +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) { + vector<string> const& argv=SplitOnWhitespace(in); + *explicit_markers = false; + *order = 3; +#define LMSPEC_NEXTARG if (i==argv.end()) { \ + cerr << "Missing argument for "<<*last<<". 
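ff_context.cc above is deliberately written as a walkthrough feature function: for every arity-0, single-token rule it fires one feature pairing the source token and its left neighbor with the predicted label. Since cdec_ff.cc registers it under the name RuleContextFeatures, it is switched on from a decoder configuration file in the usual way; a minimal sketch, where the grammar and weights file names are hypothetical:

    formalism=scfg
    grammar=tagger.grammar
    feature_function=RuleContextFeatures
    weights=weights.init

The constructor already receives everything after the feature name as param, but as its TODO notes, that string is currently ignored.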
"; goto usage; \ + } else { ++i; } + + for (vector<string>::const_iterator last,i=argv.begin(),e=argv.end();i!=e;++i) { + string const& s=*i; + if (s[0]=='-') { + if (s.size()>2) goto fail; + switch (s[1]) { + case 'x': + *explicit_markers = true; + break; + case 'o': + LMSPEC_NEXTARG; *order=atoi((*i).c_str()); + break; +#undef LMSPEC_NEXTARG + default: + fail: + cerr<<"Unknown option on NgramFeatures "<<s<<" ; "; + goto usage; + } + } + } + return true; +usage: + cerr << "NgramFeatures is incorrect!\n"; + return false; +} + class NgramDetectorImpl { // returns the number of unscored words at the left edge of a span @@ -264,10 +297,10 @@ class NgramDetectorImpl { } public: - explicit NgramDetectorImpl(bool explicit_markers) : + explicit NgramDetectorImpl(bool explicit_markers, unsigned order) : kCDEC_UNK(TD::Convert("<unk>")) , add_sos_eos_(!explicit_markers) { - order_ = 3; + order_ = order; state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID); unscored_size_offset_ = (order_ - 1) * sizeof(WordID); is_complete_offset_ = unscored_size_offset_ + 1; @@ -316,8 +349,10 @@ class NgramDetectorImpl { NgramDetector::NgramDetector(const string& param) { string filename, mapfile, featname; - bool explicit_markers = (param == "-x"); - pimpl_ = new NgramDetectorImpl(explicit_markers); + bool explicit_markers = false; + unsigned order = 3; + ParseArgs(param, &explicit_markers, &order); + pimpl_ = new NgramDetectorImpl(explicit_markers, order); SetStateSize(pimpl_->ReserveStateSize()); } diff --git a/decoder/ff_register.h b/decoder/ff_register.h index 80b1457e..80b1457e 100755..100644 --- a/decoder/ff_register.h +++ b/decoder/ff_register.h diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 74d71b6a..74d71b6a 100755..100644 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include <iostream> #include <map> #include <string> #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template <typename T = float> class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map<WordID,float>::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) 
diff --git a/decoder/ff_register.h b/decoder/ff_register.h
index 80b1457e..80b1457e 100755..100644
--- a/decoder/ff_register.h
+++ b/decoder/ff_register.h
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 74d71b6a..74d71b6a 100755..100644
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+template <typename T = float>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID,float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+  T Max() const { return max_; }
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    while (ifs) {
+      ifs >> word;
+      if (word.size() == 0) continue;
+      if (word[0] == '#') continue;
+      T count = 0;
+      ifs >> count;
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif
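FreqDict is now header-only and parameterized on the count type; Load() additionally tracks the largest value seen, and a miss in LookUp() returns a default-constructed T() rather than a float 0. The file format is unchanged: whitespace-separated word/value pairs, with '#'-initial tokens skipped. A minimal usage sketch (the statistics file name is hypothetical):

    #include "freqdict.h"
    #include "tdict.h"

    FreqDict<float> fd;
    fd.Load("word_stats.txt");                 // lines like:  gato 12.5
    float f = fd.LookUp(TD::Convert("gato"));  // T() == 0.0f for unseen words
    float m = fd.Max();                        // largest value seen while loading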
diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh
deleted file mode 100755
index 66879523..00000000
--- a/decoder/fsa-decode.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-d=$(dirname `readlink -f $0`)/
-. $d/decode.sh
-in=1dev.ur cfg=cdec-fsa decode
diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini
deleted file mode 100755
index 7c7d0347..00000000
--- a/decoder/fsa-hiero.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-formalism=scfg
-scfg_extra_glue_grammar=glue-lda.scfg
-grammar=grammar.hiero
-show_tree_structure=true
-weights=weights.hiero
diff --git a/decoder/fsa.ini b/decoder/fsa.ini
deleted file mode 100755
index 571a2e34..00000000
--- a/decoder/fsa.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-feature_function=ShorterThanPrev
-feature_function=LongerThanPrev
diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc
index 38dbd717..074de4c9 100644
--- a/decoder/fst_translator.cc
+++ b/decoder/fst_translator.cc
@@ -30,7 +30,10 @@ struct FSTTranslatorImpl {
     if (input.find("{\"rules\"") == 0) {
       istringstream is(input);
       Hypergraph src_cfg_hg;
-      assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg));
+      if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) {
+        cerr << "Failed to read HG from JSON.\n";
+        abort();
+      }
       if (add_pass_through_rules) {
         SparseVector<double> feats;
         feats.set_value(FD::Convert("PassThrough"), 1);
diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg
deleted file mode 100755
index 27489817..00000000
--- a/decoder/glue-lda.scfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X0,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X1,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X2,1] ||| [1] ||| GlueTop=1
-[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1
-[S] ||| [X3,1] ||| [1] ||| GlueTop=1
diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero
deleted file mode 100755
index 79adf33a..00000000
--- a/decoder/grammar.hiero
+++ /dev/null
@@ -1,151 +0,0 @@
-[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga . ||| [1] caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro . ||| the black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0
-[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este [X,1] . ||| this [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629
-[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0
-[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces . ||| fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
-[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0
diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h
index b90aca47..b90aca47 100755..100644
--- a/decoder/hg_cfg.h
+++ b/decoder/hg_cfg.h
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index c1c93933..9f0f50fa 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -624,3 +624,30 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
   }
 }
 
+/* Output format:
+ * #vertices
+ * for each vertex in bottom-up topological order:
+ *   #downward_edges
+ *   for each downward edge:
+ *     RHS with [vertex_index] for NTs ||| scores
+ */
+void HypergraphIO::WriteTarget(const Hypergraph& hg) {
+  cout << hg.nodes_.size() << ' ' << hg.edges_.size() << '\n';
+  for (unsigned int i = 0; i < hg.nodes_.size(); ++i) {
+    const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_;
+    cout << edges.size() << '\n';
+    for (unsigned int j = 0; j < edges.size(); ++j) {
+      const Hypergraph::Edge &edge = hg.edges_[edges[j]];
+      const std::vector<WordID> &e = edge.rule_->e();
+      for (std::vector<WordID>::const_iterator word = e.begin(); word != e.end(); ++word) {
+        if (*word <= 0) {
+          cout << '[' << edge.tail_nodes_[-*word] << "] ";
+        } else {
+          cout << TD::Convert(*word) << ' ';
+        }
+      }
+      cout << "||| " << edge.rule_->scores_ << '\n';
+    }
+  }
+}
+
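To make the WriteTarget format comment concrete, here is roughly what it would print for a hypothetical two-node hypergraph: node 0 derived by the terminal rule "el gato" and node 1 by a rule whose target side is "[0] sleeps" (feature names and values invented; scores are the rule's feature vector as rendered by its stream operator):

    2 2
    1
    el gato ||| EgivenF=0.4
    1
    [0] sleeps ||| Glue=1

The first line gives the node and edge counts; each node then lists its incoming-edge count followed by one line per edge.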
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
index 082489d8..44817157 100644
--- a/decoder/hg_io.h
+++ b/decoder/hg_io.h
@@ -23,6 +23,9 @@ struct HypergraphIO {
 
   static void WriteAsCFG(const Hypergraph& hg);
 
+  // Write only the target side information in bottom-up order.
+  static void WriteTarget(const Hypergraph& hg);
+
   // serialization utils
   static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0);
   // return PLF string representation (undefined behavior on non-lattices)
diff --git a/decoder/hg_test.h b/decoder/hg_test.h
index 3da6533c..3da6533c 100755..100644
--- a/decoder/hg_test.h
+++ b/decoder/hg_test.h
diff --git a/decoder/lattice.cc b/decoder/lattice.cc
index e3631e59..89da3cd0 100644
--- a/decoder/lattice.cc
+++ b/decoder/lattice.cc
@@ -46,6 +46,7 @@ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) {
   Lattice& l = *pl;
   vector<WordID> ids;
   TD::ConvertSentence(text, &ids);
+  l.clear();
   l.resize(ids.size());
   for (int i = 0; i < l.size(); ++i)
     l[i].push_back(LatticeArc(ids[i], 0.0, 1));
diff --git a/decoder/nt_span.h b/decoder/nt_span.h
index a918f301..a918f301 100755..100644
--- a/decoder/nt_span.h
+++ b/decoder/nt_span.h
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
index b603e27a..b603e27a 100755..100644
--- a/decoder/oracle_bleu.h
+++ b/decoder/oracle_bleu.h
diff --git a/decoder/perro.sh b/decoder/perro.sh
deleted file mode 100755
index 3e54ac71..00000000
--- a/decoder/perro.sh
+++ /dev/null
@@ -1 +0,0 @@
-$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur
diff --git a/decoder/perro.ur b/decoder/perro.ur
deleted file mode 100755
index 6c5da6d7..00000000
--- a/decoder/perro.ur
+++ /dev/null
@@ -1 +0,0 @@
-eso perro feo
diff --git a/decoder/program_options.h b/decoder/program_options.h
index 87afb320..87afb320 100755..100644
--- a/decoder/program_options.h
+++ b/decoder/program_options.h
diff --git a/decoder/sentences.h b/decoder/sentences.h
index 54b5ffb3..54b5ffb3 100755..100644
--- a/decoder/sentences.h
+++ b/decoder/sentences.h
diff --git a/decoder/short.ur b/decoder/short.ur
deleted file mode 100755
index 48612801..00000000
--- a/decoder/short.ur
+++ /dev/null
@@ -1 +0,0 @@
-krAcy myN pyr kw mxtlf HAdvAt
diff --git a/decoder/trule.cc b/decoder/trule.cc
index 40235542..141b8faa 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -232,16 +232,6 @@ void TRule::ComputeArity() {
   arity_ = 1 - min;
 }
 
-static string AnonymousStrVar(int i) {
-  string res("[v]");
-  if(!(i <= 0 && i >= -8)) {
-    cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl;
-    abort();
-  }
-  res[1] = '1' - i;
-  return res;
-}
-
 string TRule::AsString(bool verbose) const {
   ostringstream os;
   int idx = 0;
@@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const {
     }
   }
   os << " ||| ";
-  if (idx > 9) {
-    cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
-    exit(1);
-  }
   for (int i =0; i<e_.size(); ++i) {
     if (i) os << ' ';
     const WordID& w = e_[i];
     if (w < 1)
-      os << AnonymousStrVar(w);
+      os << '[' << (1-w) << ']';
     else
       os << TD::Convert(w);
   }
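A note on the trule.cc change above: it removes the old nine-nonterminal ceiling on printing rules. A target-side variable stored as w <= 0 now renders directly as [1-w]:

    w =  0  ->  [1]
    w = -1  ->  [2]
    w = -9  ->  [10]   // would previously have aborted in AnonymousStrVar

so both AnonymousStrVar and the idx > 9 exit become dead code.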
diff --git a/decoder/weights-fsa b/decoder/weights-fsa
deleted file mode 100644
index 3cc96c2f..00000000
--- a/decoder/weights-fsa
+++ /dev/null
@@ -1,14 +0,0 @@
-Arity_0 1.70741473606976
-Arity_1 1.12426238048012
-Arity_2 1.14986187839554
-Glue -0.04589037041388
-LanguageModel 1.09051
-LM 1.09051
-PassThrough -3.66226367902928
-PhraseModel_0 -1.94633451863252
-PhraseModel_1 -0.1475347695476
-PhraseModel_2 -1.614818994946
-WordPenalty -3.0
-WordPenaltyFsa -0.56028442964748
-ShorterThanPrev -10
-LongerThanPrev -10
diff --git a/decoder/weights.hiero b/decoder/weights.hiero
deleted file mode 100755
index 6747f059..00000000
--- a/decoder/weights.hiero
+++ /dev/null
@@ -1,10 +0,0 @@
-SameFirstLetter 1
-LongerThanPrev 1
-ShorterThanPrev 1
-GlueTop 0.0
-Glue -1.0
-EgivenF -0.5
-FgivenE -0.5
-LexEgivenF -0.5
-LexFgivenE -0.5
-LM 1
diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am
new file mode 100644
index 00000000..2676fb50
--- /dev/null
+++ b/dpmert/Makefile.am
@@ -0,0 +1,35 @@
+bin_PROGRAMS = \
+  mr_dpmert_map \
+  mr_dpmert_reduce \
+  mr_dpmert_generate_mapper_input \
+  sentserver \
+  sentclient
+
+if HAVE_GTEST
+noinst_PROGRAMS = \
+  lo_test
+TESTS = lo_test
+endif
+
+sentserver_SOURCES = sentserver.c
+sentserver_LDFLAGS = -all-static -pthread
+
+sentclient_SOURCES = sentclient.c
+sentclient_LDFLAGS = -all-static -pthread
+
+mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
+mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+# nbest2hg_SOURCES = nbest2hg.cc
+# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz
+
+mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc
+mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc
+mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/vest/README.shared-mem b/dpmert/README.shared-mem
index 7728efc0..7728efc0 100644
--- a/vest/README.shared-mem
+++ b/dpmert/README.shared-mem
diff --git a/vest/ces.cc b/dpmert/ces.cc
index 4ae6b695..a85454da 100644
--- a/vest/ces.cc
+++ b/dpmert/ces.cc
@@ -4,25 +4,32 @@
 #include <sstream>
 #include <boost/shared_ptr.hpp>
 
-#include "aligner.h"
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
 #include "lattice.h"
-#include "viterbi_envelope.h"
+#include "mert_geometry.h"
 #include "error_surface.h"
+#include "ns.h"
 
 using boost::shared_ptr;
 using namespace std;
 
 const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them
 
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) {
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+                         const ConvexHull& ve,
+                         ErrorSurface* env,
+                         const EvaluationMetric* metric,
+                         const Hypergraph& hg) {
   vector<WordID> prev_trans;
-  const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
+  const vector<shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs();
   env->resize(ienv.size());
-  ScoreP prev_score;
+  SufficientStats prev_score; // defaults to 0
   int j = 0;
   for (int i = 0; i < ienv.size(); ++i) {
-    const Segment& seg = *ienv[i];
+    const MERTPoint& seg = *ienv[i];
     vector<WordID> trans;
+#if 0
     if (type == AER) {
       vector<bool> edges(hg.edges_.size(), false);
       seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi
@@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er
       string tstr = os.str();
       TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
     } else {
+#endif
       seg.ConstructTranslation(&trans);
-    }
-    // cerr << "Scoring: " << TD::GetString(trans) << endl;
+    //}
+    //cerr << "Scoring: " << TD::GetString(trans) << endl;
     if (trans == prev_trans) {
       if (!minimize_segments) {
-        assert(prev_score); // if this fails, it means
-                            // the decoder can generate null translations
         ErrorSegment& out = (*env)[j];
-        out.delta = prev_score->GetZero();
+        out.delta.fields.clear();
        out.x = seg.x;
        ++j;
       }
-      // cerr << "Identical translation, skipping scoring\n";
+      //cerr << "Identical translation, skipping scoring\n";
     } else {
-      ScoreP score = ss.ScoreCandidate(trans);
+      SufficientStats score;
+      ss.Evaluate(trans, &score);
       // cerr << "score= " << score->ComputeScore() << "\n";
-      ScoreP cur_delta_p = score->GetZero();
-      Score* cur_delta = cur_delta_p.get();
-      // just record the score diffs
-      if (!prev_score)
-        prev_score = score->GetZero();
-
-      score->Subtract(*prev_score, cur_delta);
+      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+      const SufficientStats delta = score - prev_score;
+      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+      //string xx; delta.Encode(&xx); cerr << xx << endl;
       prev_trans.swap(trans);
       prev_score = score;
-      if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
+      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {
         ErrorSegment& out = (*env)[j];
-        out.delta = cur_delta_p;
+        out.delta = delta;
         out.x = seg.x;
         ++j;
       }
diff --git a/dpmert/ces.h b/dpmert/ces.h
new file mode 100644
index 00000000..e4fa2080
--- /dev/null
+++ b/dpmert/ces.h
@@ -0,0 +1,16 @@
+#ifndef _CES_H_
+#define _CES_H_
+
+class ConvexHull;
+class Hypergraph;
+class SegmentEvaluator;
+class ErrorSurface;
+class EvaluationMetric;
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+                         const ConvexHull& convex_hull,
+                         ErrorSurface* es,
+                         const EvaluationMetric* metric,
+                         const Hypergraph& hg);
+
+#endif
diff --git a/vest/dist-vest.pl b/dpmert/dpmert.pl
index 11e791c1..52ce0fc0 100755
--- a/vest/dist-vest.pl
+++ b/dpmert/dpmert.pl
@@ -21,9 +21,9 @@ my $bin_dir = $SCRIPT_DIR;
 die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
 my $FAST_SCORE="$bin_dir/../mteval/fast_score";
 die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input";
-my $MAPPER = "$bin_dir/mr_vest_map";
-my $REDUCER = "$bin_dir/mr_vest_reduce";
+my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_dpmert_map";
+my $REDUCER = "$bin_dir/mr_dpmert_reduce";
 my $parallelize = "$bin_dir/parallelize.pl";
 my $libcall = "$bin_dir/libcall.pl";
 my $sentserver = "$bin_dir/sentserver";
@@ -65,8 +65,6 @@ my $oraclen=0;
 my $oracleb=20;
 my $bleu_weight=1;
 my $use_make = 1;  # use make to parallelize line search
-my $dirargs='';
-my $density_prune;
 my $useqsub;
 my $pass_suffix = '';
 my $cpbin=1;
@@ -75,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
 	"decoder=s" => \$decoderOpt,
 	"jobs=i" => \$jobs,
-	"density-prune=f" => \$density_prune,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
 	"dry-run" => \$dryrun,
@@ -87,15 +84,7 @@ if (GetOptions(
 	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
 	"cpbin!" => \$cpbin,
-	"rand-directions=i" => \$rand_directions,
-	"random_directions=i" => \$rand_directions,
-	"bleu_weight=s" => \$bleu_weight,
-	"no-primary!" => \$noprimary,
-	"max-similarity=s" => \$maxsim,
-	"oracle-directions=i" => \$oraclen,
-	"n-oracle=i" => \$oraclen,
-	"oracle-batch=i" => \$oracleb,
-	"directions-args=s" => \$dirargs,
+	"random-directions=i" => \$rand_directions,
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
@@ -107,10 +96,6 @@ if (GetOptions(
 	exit;
 }
 
-if (defined $density_prune) {
-  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
 if ($useqsub) {
   $use_make = 0;
   die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
@@ -151,7 +136,7 @@ if ($metric =~ /^ter$|^aer$/i) {
 my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
 
 unless ($dir){
-	$dir = "vest";
+	$dir = "dpmert";
 }
 unless ($dir =~ /^\//){  # convert relative path to absolute path
 	my $basedir = check_output("pwd");
@@ -212,14 +197,14 @@ if ($dryrun){
 	write_config(*STDERR);
 	exit 0;
 } else {
-	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs
 		die "ERROR: working dir $dir already exists\n\n";
 	} else {
 		-e $dir || mkdir $dir;
 		mkdir "$dir/hgs";
 		modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
 		mkdir "$dir/scripts";
-		my $cmdfile="$dir/rerun-vest.sh";
+		my $cmdfile="$dir/rerun-dpmert.sh";
 		open CMD,'>',$cmdfile;
 		print CMD "cd ",&getcwd,"\n";
 		# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
@@ -308,7 +293,7 @@ while (1){
 	$retries++;
 }
 die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
-	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric");
+	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
 	chomp $dec_score;
 	print STDERR "DECODER SCORE: $dec_score\n";
 
@@ -328,10 +313,7 @@ while (1){
 	print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
 	print STDERR unchecked_output("date");
 	$icc++;
-	my $nop=$noprimary?"--no_primary":"";
-	my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
-	my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
-	$cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
+	$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
 	print STDERR "COMMAND:\n$cmd\n";
 	check_call($cmd);
 	check_call("mkdir -p $dir/splag.$im1");
@@ -360,11 +342,11 @@ while (1){
 		my $mapoutput = $shard;
 		my $client_name = $shard;
 		$client_name =~ s/mapinput.//;
-		$client_name = "vest.$client_name";
+		$client_name = "dpmert.$client_name";
 		$mapoutput =~ s/mapinput/mapoutput/;
 		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
 		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
-		my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+		my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
 		if ($use_make) {
 			my $script_file = "$dir/scripts/map.$shard";
 			open F, ">$script_file" or die "Can't write $script_file: $!";
@@ -424,7 +406,7 @@ while (1){
 	print STDERR "Results for $tol/$til lines\n";
 	print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
 	print STDERR unchecked_output("date");
-	$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
+	$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
 	print STDERR "COMMAND:\n$cmd\n";
 	check_bash_call($cmd);
 	$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
diff --git a/vest/error_surface.cc b/dpmert/error_surface.cc
index 754aa8de..515b67f8 100644
--- a/vest/error_surface.cc
+++ b/dpmert/error_surface.cc
@@ -5,8 +5,7 @@
 
 using namespace std;
 
-ErrorSurface::~ErrorSurface() {
-}
+ErrorSurface::~ErrorSurface() {}
 
 void ErrorSurface::Serialize(std::string* out) const {
   const int segments = this->size();
@@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const {
   for (int i = 0; i < segments; ++i) {
     const ErrorSegment& cur = (*this)[i];
     string senc;
-    cur.delta->Encode(&senc);
-    assert(senc.size() < 256);
+    cur.delta.Encode(&senc);
+    assert(senc.size() < 1024);
     unsigned char len = senc.size();
     os.write((const char*)&cur.x, sizeof(cur.x));
     os.write((const char*)&len, sizeof(len));
@@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const {
   *out = os.str();
 }
 
-void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
+void ErrorSurface::Deserialize(const std::string& in) {
   istringstream is(in, ios::binary);
   int segments;
   is.read((char*)&segments, sizeof(segments));
@@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
     is.read((char*)&len, sizeof(len));
     string senc(len, '\0');
     assert(senc.size() == len);
     is.read((char*)&senc[0], len);
-    cur.delta = SentenceScorer::CreateScoreFromString(type, senc);
+    cur.delta = SufficientStats(senc);
   }
 }
is.read((char*)&senc[0], len); - cur.delta = SentenceScorer::CreateScoreFromString(type, senc); + cur.delta = SufficientStats(senc); } } diff --git a/vest/error_surface.h b/dpmert/error_surface.h index ad728cfa..bb65847b 100644 --- a/vest/error_surface.h +++ b/dpmert/error_surface.h @@ -4,13 +4,13 @@ #include <vector> #include <string> -#include "scorer.h" +#include "ns.h" class Score; struct ErrorSegment { double x; - ScoreP delta; + SufficientStats delta; ErrorSegment() : x(0), delta() {} }; @@ -18,7 +18,7 @@ class ErrorSurface : public std::vector<ErrorSegment> { public: ~ErrorSurface(); void Serialize(std::string* out) const; - void Deserialize(ScoreType type, const std::string& in); + void Deserialize(const std::string& in); }; #endif diff --git a/vest/libcall.pl b/dpmert/libcall.pl index c7d0f128..c7d0f128 100644 --- a/vest/libcall.pl +++ b/dpmert/libcall.pl diff --git a/vest/line_mediator.pl b/dpmert/line_mediator.pl index bc2bb24c..bc2bb24c 100755 --- a/vest/line_mediator.pl +++ b/dpmert/line_mediator.pl diff --git a/vest/line_optimizer.cc b/dpmert/line_optimizer.cc index 7303df8d..49443fbe 100644 --- a/vest/line_optimizer.cc +++ b/dpmert/line_optimizer.cc @@ -4,7 +4,7 @@ #include <algorithm> #include "sparse_vector.h" -#include "scorer.h" +#include "ns.h" using namespace std; @@ -18,6 +18,7 @@ struct IntervalComp { }; double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, const vector<ErrorSurface>& surfaces, const LineOptimizer::ScoreType type, float* best_score, @@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize( } sort(all_ints.begin(), all_ints.end(), IntervalComp()); double last_boundary = all_ints.front()->x; - ScoreP accp = all_ints.front()->delta->GetZero(); - Score *acc=accp.get(); + SufficientStats acc; float& cur_best_score = *best_score; cur_best_score = (type == MAXIMIZE_SCORE ? 
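// The loop below is a sweep-line pass: the interval boundaries of every
// sentence's error surface are merged and sorted by x, and crossing a
// boundary adds that segment's SufficientStats delta to acc, so the
// corpus-level metric can be re-evaluated at each interval instead of
// rescoring from scratch.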
-numeric_limits<float>::max() : numeric_limits<float>::max()); @@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize( for (vector<ErrorIter>::iterator i = all_ints.begin(); i != all_ints.end(); ++i) { const ErrorSegment& seg = **i; - assert(seg.delta); if (seg.x - last_boundary > epsilon) { - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize( } else { pos = last_boundary + (seg.x - last_boundary) / 2; } - // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; } - // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; // cerr << "---- s=" << sco << "\n"; last_boundary = seg.x; } // cerr << "x-boundary=" << seg.x << "\n"; - acc->PlusEquals(*seg.delta); + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; } - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections( RandomUnitVector(features_to_optimize, &out[i], rng); cerr << "Generated " << out.size() << " total axes to optimize along.\n"; } + diff --git a/vest/line_optimizer.h b/dpmert/line_optimizer.h index 99a591f4..83819f41 100644 --- a/vest/line_optimizer.h +++ b/dpmert/line_optimizer.h @@ -7,6 +7,7 @@ #include "error_surface.h" #include "sampler.h" +class EvaluationMetric; class Weights; struct LineOptimizer { @@ -18,6 +19,7 @@ struct LineOptimizer { // merge all the error surfaces together into a global // error surface and find (the middle of) the best segment static double LineOptimize( + const EvaluationMetric* metric, const std::vector<ErrorSurface>& envs, const LineOptimizer::ScoreType type, float* best_score, diff --git a/vest/lo_test.cc b/dpmert/lo_test.cc index f5638600..d9b909b8 100644 --- a/vest/lo_test.cc +++ b/dpmert/lo_test.cc @@ -5,6 +5,8 @@ #include <boost/shared_ptr.hpp> #include <gtest/gtest.h> +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "fdict.h" #include "hg.h" @@ -13,9 +15,8 @@ #include "filelib.h" #include "inside_outside.h" #include "viterbi.h" -#include "viterbi_envelope.h" +#include "mert_geometry.h" #include "line_optimizer.h" -#include "scorer.h" using namespace std; using boost::shared_ptr; @@ -42,23 +43,23 @@ TEST_F(OptTest, TestCheckNaN) { EXPECT_EQ(true, isnan(z)); } -TEST_F(OptTest,TestViterbiEnvelope) { - shared_ptr<Segment> a1(new Segment(-1, 0)); - shared_ptr<Segment> b1(new Segment(1, 0)); - shared_ptr<Segment> a2(new Segment(-1, 1)); - shared_ptr<Segment> b2(new Segment(1, -1)); - vector<shared_ptr<Segment> > sa; sa.push_back(a1); sa.push_back(b1); - vector<shared_ptr<Segment> > sb; sb.push_back(a2); sb.push_back(b2); - ViterbiEnvelope a(sa); +TEST_F(OptTest,TestConvexHull) { + shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); + shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); + shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); + shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); + vector<shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); + vector<shared_ptr<MERTPoint> > sb; 
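+ // ConvexHull is the semiring value here: operator+= takes the union of two
+ // point sets (lazily re-sorted into an upper envelope) and operator*= sums
+ // slopes and intercepts, so the product hull below should reduce to 3 points.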
sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); cerr << a << endl; - ViterbiEnvelope b(sb); - ViterbiEnvelope c = a; + ConvexHull b(sb); + ConvexHull c = a; c *= b; cerr << a << " (*) " << b << " = " << c << endl; EXPECT_EQ(3, c.size()); } -TEST_F(OptTest,TestViterbiEnvelopeInside) { +TEST_F(OptTest,TestConvexHullInside) { const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; Hypergraph hg; istringstream instr(json); @@ -77,10 +78,10 @@ TEST_F(OptTest,TestViterbiEnvelopeInside) { cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; } SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); - ViterbiEnvelopeWeightFunction wf(wts, dir); - ViterbiEnvelope env = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); cerr << env << endl; - const vector<boost::shared_ptr<Segment> >& segs = env.GetSortedSegs(); + const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); dir *= segs[1]->x; wts += dir; hg.Reweight(wts); @@ -141,10 +142,7 @@ TEST_F(OptTest, TestS1) { TD::ConvertSentence(ref22, &refs2[1]); TD::ConvertSentence(ref32, &refs2[2]); TD::ConvertSentence(ref42, &refs2[3]); - ScoreType type = ScoreTypeFromString("ibm_bleu"); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); - ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); - vector<ViterbiEnvelope> envs(2); + vector<ConvexHull> envs(2); RandomNumberGenerator<boost::mt19937> rng; @@ -162,19 +160,22 @@ TEST_F(OptTest, TestS1) { cerr << "Computing Viterbi envelope using inside algorithm...\n"; cerr << "axis: " << axis << endl; clock_t t_start=clock(); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); - envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); vector<ErrorSurface> es(2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); + 
boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); clock_t t_env=clock(); float score; - double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); clock_t t_opt=clock(); cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; EXPECT_FLOAT_EQ(0.48719698, score); @@ -213,19 +214,19 @@ TEST_F(OptTest,TestZeroOrigin) { } SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - vector<ViterbiEnvelope> envs(1); - envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector<ConvexHull> envs(1); + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); - ScoreType type = ScoreTypeFromString("ibm_bleu"); vector<vector<WordID> > mr(4); TD::ConvertSentence("untitled", &mr[0]); TD::ConvertSentence("with no title", &mr[1]); TD::ConvertSentence("without a title", &mr[2]); TD::ConvertSentence("without title", &mr[3]); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); vector<ErrorSurface> es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); } int main(int argc, char **argv) { diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc new file mode 100644 index 00000000..81b25af9 --- /dev/null +++ b/dpmert/mert_geometry.cc @@ -0,0 +1,186 @@ +#include "mert_geometry.h" + +#include <cassert> +#include <limits> + +using namespace std; +using boost::shared_ptr; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, shared_ptr<MERTPoint>(), shared_ptr<MERTPoint>()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Can only create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector<shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ?
"" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const shared_ptr<MERTPoint>& a, const shared_ptr<MERTPoint>& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + shared_ptr<MERTPoint> edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector<shared_ptr<MERTPoint> > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? 
points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector<WordID>* trans) const { + const MERTPoint* cur = this; + vector<vector<WordID> > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector<const vector<WordID>*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include <vector> +#include <iostream> +#include <boost/shared_ptr.hpp> + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); +static const double kPlusInfinity = std::numeric_limits<double>::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr<MERTPoint> p1; + boost::shared_ptr<MERTPoint> p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector<WordID>* trans) const; + void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + 
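+ // (builds a hull directly from an explicit point list, sorting immediately;
+ // the default constructor above is the additive identity -- the empty hull.)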
ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector<boost::shared_ptr<MERTPoint> > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector<double>& ori, + const SparseVector<double>& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector<double> origin; + const SparseVector<double> direction; +}; + +#endif diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..59d4f24f --- /dev/null +++ b/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,78 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -s N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector<string> features; + SparseVector<weight_t> origin; + vector<weight_t> w; + Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string
forest_repository = conf["forest_repository"].as<string>(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<weight_t> > directions; + vector<int> fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/vest/mr_vest_map.cc b/dpmert/mr_dpmert_map.cc index 71dda6d7..f3304f0f 100644 --- a/vest/mr_vest_map.cc +++ b/dpmert/mr_dpmert_map.cc @@ -6,12 +6,13 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" -#include "scorer.h" -#include "viterbi_envelope.h" +#include "mert_geometry.h" #include "inside_outside.h" #include "error_surface.h" #include "b64tools.h" @@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") ("source,s",po::value<string>(), "Source file (ignored, except for AER)") - ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") + ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); @@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as<string>(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as<string>()); @@ -81,28 +82,29 @@ int main(int argc, char** argv) { if (line.empty()) continue; istringstream is(line); int sent_id; - string file, s_origin, s_axis; + string file, s_origin, s_direction; // path-to-file (JSON) sent_id starting-point search-direction - is >> file >> sent_id >> s_origin >> s_axis; + is >> file >> sent_id >> s_origin >> s_direction; SparseVector<double> origin; - assert(ReadSparseVectorString(s_origin, &origin)); - SparseVector<double> axis; - assert(ReadSparseVectorString(s_axis, &axis)); - // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl; +
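+ // Each work item is one line: <forest-file> <sent-id> <origin> <direction>,
+ // where the two vectors use the "name=value;" encoding emitted by
+ // mr_dpmert_generate_mapper_input; together they define the line
+ // origin + x * direction along which the envelope is computed below.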
ReadSparseVectorString(s_origin, &origin); + SparseVector<double> direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; if (last_file != file) { last_file = file; ReadFile rf(file); HypergraphIO::ReadFromJSON(rf.stream(), &hg); } - ViterbiEnvelopeWeightFunction wf(origin, axis); - ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + ErrorSurface es; - ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; es.Serialize(&val); - cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t'; + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; B64::b64encode(val.c_str(), val.size(), &cout); cout << endl << flush; } diff --git a/vest/mr_vest_reduce.cc b/dpmert/mr_dpmert_reduce.cc index 3df52020..31512a03 100644 --- a/vest/mr_vest_reduce.cc +++ b/dpmert/mr_dpmert_reduce.cc @@ -10,6 +10,7 @@ #include "error_surface.h" #include "line_optimizer.h" #include "b64tools.h" +#include "stringlib.h" using namespace std; namespace po = boost::program_options; @@ -17,12 +18,12 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("loss_function,l",po::value<string>(), "Loss function being optimized") + ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("loss_function") == 0; + bool flag = conf->count("evaluation_metric") == 0; if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); @@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as<string>(); - ScoreType type = ScoreTypeFromString(loss_function); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (type == TER || type == AER) { + if (metric->IsErrorMetric()) opt_type = LineOptimizer::MINIMIZE_SCORE; - } - string last_key; + vector<ErrorSurface> esv; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; + string last_key, line, key, val; + while(getline(cin, line)) { size_t ks = line.find("\t"); assert(string::npos != ks); assert(ks > 2); - string key = line.substr(2, ks - 2); - string val = line.substr(ks + 1); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); if (key != last_key) { if (!last_key.empty()) { float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } - last_key = key; + last_key.swap(key); esv.clear(); } if (val.size() % 4 != 0) { @@ -68,13 +66,11 @@ int main(int 
argc, char** argv) { continue; } esv.push_back(ErrorSurface()); - esv.back().Deserialize(type, encoded); + esv.back().Deserialize(encoded); } if (!esv.empty()) { - // cerr << "ESV=" << esv.size() << endl; - // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } return 0; diff --git a/vest/parallelize.pl b/dpmert/parallelize.pl index 7d0365cc..7d0365cc 100755 --- a/vest/parallelize.pl +++ b/dpmert/parallelize.pl diff --git a/vest/sentclient.c b/dpmert/sentclient.c index 91d994ab..91d994ab 100644 --- a/vest/sentclient.c +++ b/dpmert/sentclient.c diff --git a/vest/sentserver.c b/dpmert/sentserver.c index c20b4fa6..c20b4fa6 100644 --- a/vest/sentserver.c +++ b/dpmert/sentserver.c diff --git a/vest/sentserver.h b/dpmert/sentserver.h index cd17a546..cd17a546 100644 --- a/vest/sentserver.h +++ b/dpmert/sentserver.h diff --git a/vest/test_aer/README b/dpmert/test_aer/README index 819b2e32..819b2e32 100644 --- a/vest/test_aer/README +++ b/dpmert/test_aer/README diff --git a/vest/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini index 08187848..08187848 100644 --- a/vest/test_aer/cdec.ini +++ b/dpmert/test_aer/cdec.ini diff --git a/vest/test_aer/corpus.src b/dpmert/test_aer/corpus.src index 31b23971..31b23971 100644 --- a/vest/test_aer/corpus.src +++ b/dpmert/test_aer/corpus.src diff --git a/vest/test_aer/grammar b/dpmert/test_aer/grammar index 9d857824..9d857824 100644 --- a/vest/test_aer/grammar +++ b/dpmert/test_aer/grammar diff --git a/vest/test_aer/ref.0 b/dpmert/test_aer/ref.0 index 734a9c5b..734a9c5b 100644 --- a/vest/test_aer/ref.0 +++ b/dpmert/test_aer/ref.0 diff --git a/vest/test_aer/weights b/dpmert/test_aer/weights index afc9282e..afc9282e 100644 --- a/vest/test_aer/weights +++ b/dpmert/test_aer/weights diff --git a/vest/test_data/0.json.gz b/dpmert/test_data/0.json.gz Binary files differindex 30f8dd77..30f8dd77 100644 --- a/vest/test_data/0.json.gz +++ b/dpmert/test_data/0.json.gz diff --git a/vest/test_data/1.json.gz b/dpmert/test_data/1.json.gz Binary files differindex c82cc179..c82cc179 100644 --- a/vest/test_data/1.json.gz +++ b/dpmert/test_data/1.json.gz diff --git a/vest/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0 index 12c4abe9..12c4abe9 100644 --- a/vest/test_data/c2e.txt.0 +++ b/dpmert/test_data/c2e.txt.0 diff --git a/vest/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1 index 4ac12df1..4ac12df1 100644 --- a/vest/test_data/c2e.txt.1 +++ b/dpmert/test_data/c2e.txt.1 diff --git a/vest/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2 index 2f67b72f..2f67b72f 100644 --- a/vest/test_data/c2e.txt.2 +++ b/dpmert/test_data/c2e.txt.2 diff --git a/vest/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3 index 5483cef6..5483cef6 100644 --- a/vest/test_data/c2e.txt.3 +++ b/dpmert/test_data/c2e.txt.3 diff --git a/vest/test_data/re.txt.0 b/dpmert/test_data/re.txt.0 index 86eff087..86eff087 100644 --- a/vest/test_data/re.txt.0 +++ b/dpmert/test_data/re.txt.0 diff --git a/vest/test_data/re.txt.1 b/dpmert/test_data/re.txt.1 index 2140f198..2140f198 100644 --- a/vest/test_data/re.txt.1 +++ b/dpmert/test_data/re.txt.1 diff --git a/vest/test_data/re.txt.2 b/dpmert/test_data/re.txt.2 index 94e46286..94e46286 100644 --- a/vest/test_data/re.txt.2 +++ b/dpmert/test_data/re.txt.2 diff --git a/vest/test_data/re.txt.3 b/dpmert/test_data/re.txt.3 index f87c3308..f87c3308 100644 --- 
a/vest/test_data/re.txt.3 +++ b/dpmert/test_data/re.txt.3 diff --git a/expLog b/expLog deleted file mode 100644 index 2070ac98..00000000 --- a/expLog +++ /dev/null @@ -1,60 +0,0 @@ -TIME MEASURES AFTER MERGE WITH cdec: -8/July/2011 -commit ed8a6e81d87f6e917ecf - -./runEval -Fri Jul 8 13:28:23 CEST 2011 -Fri Jul 8 13:30:24 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.25, 76.5|43.1|24.3|13.9 (brev=0.993) -0.322487 -Fri Jul 8 13:30:24 CEST 2011 ------------- -Fri Jul 8 15:04:00 CEST 2011 -Fri Jul 8 15:05:58 CEST 2011 -Time required for Cube Pruning execution: 77.61 seconds. ------------- -Fri Jul 8 15:24:39 CEST 2011 -Fri Jul 8 15:26:36 CEST 2011 -Time required for Cube Pruning execution: 79.01 seconds. ------------- - -./runEvalFCP -Fri Jul 8 13:33:17 CEST 2011 -Fri Jul 8 13:35:06 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.39, 76.5|43.1|24.5|14.0 (brev=0.994) -0.323857 -Fri Jul 8 13:35:07 CEST 2011 ------------- -Fri Jul 8 15:08:17 CEST 2011 -Fri Jul 8 15:10:05 CEST 2011 -Time required for Cube Pruning execution: 69.36 seconds. ------------- -Fri Jul 8 15:21:48 CEST 2011 -Fri Jul 8 15:23:35 CEST 2011 -Time required for Cube Pruning execution: 69.71 seconds. ------------- - -./runEvalFCP2 -Fri Jul 8 13:53:38 CEST 2011 -Fri Jul 8 13:55:29 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.49, 76.6|43.2|24.5|14.1 (brev=0.994) -0.324901 -Fri Jul 8 13:55:29 CEST 2011 ------------- -Fri Jul 8 15:12:52 CEST 2011 -Fri Jul 8 15:14:42 CEST 2011 -Time required for Cube Pruning execution: 72.66 seconds. ------------- -Fri Jul 8 15:19:13 CEST 2011 -Fri Jul 8 15:21:03 CEST 2011 -Time required for Cube Pruning execution: 72.06 seconds. 
------------- diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am index 3aab17da..cdca1f97 100644 --- a/gi/clda/src/Makefile.am +++ b/gi/clda/src/Makefile.am @@ -1,14 +1,3 @@ -if HAVE_GTEST -noinst_PROGRAMS = \ - crp_test - -TESTS = crp_test - -crp_test_SOURCES = crp_test.cc -crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) - -endif - bin_PROGRAMS = clda clda_SOURCES = clda.cc diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,26 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc + +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc + +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + +learn_cfg_SOURCES = learn_cfg.cc + +condnaive_SOURCES = condnaive.cc + dpnaive_SOURCES = dpnaive.cc pfdist_SOURCES = pfdist.cc @@ -17,5 +33,6 @@ brat_SOURCES = brat.cc pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..942dcf51 --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,239 @@ +#include <iostream> +#include <queue> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "array2d.h" +#include "sampler.h" +#include "corpus.h" +#include "pyp_tm.h" +#include "quasi_model2.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is 
the bias toward the diagonal?") + ("input,i",po::value<string>(),"Read parallel data from") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +MT19937* prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { + vector<WordID> src; + vector<WordID> trg; + vector<LexicalAlignment> a; + Array2D<short> posterior; +}; + +struct Aligner { + Aligner(const vector<vector<WordID> >& lets, + int num_letters, + const po::variables_map& conf, + vector<AlignedSentencePair>* c) : + corpus(*c), + paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); + } + + vector<AlignedSentencePair>& corpus; + QuasiModel2 paj_model; + const bool infer_paj; + PYPLexicalTranslation model; + const WordID kNULL; + + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); + } + + void InitializeRandom() { + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, asp.trg[j], &*prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); + } + } + cerr << "Corpus initialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ?
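+ // The sampled index is mapped back to its source word (0 means NULL) and its
+ // counts are re-added to the translation and alignment-position models,
+ // completing one collapsed Gibbs update of a_j.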
asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); + } + } + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); + } +}; + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { + for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { + vector<WordID>& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D<bool> a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng = new MT19937(conf["random_seed"].as<uint32_t>()); + else + prng = new MT19937; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector<AlignedSentencePair> corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector<vector<WordID> > letters(TD::NumWords()); + set<WordID> letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + Aligner aligner(letters, letset.size(), conf, &corpus); + aligner.InitializeRandom(); + + const unsigned samples = conf["samples"].as<unsigned>(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } + aligner.ResampleCorpus(); + if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j)
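+ // Posterior counts are accumulated only after a burn-in of samples/5
+ // iterations, and then only every sixth sweep, thinning correlated samples.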
AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + aligner.model.Summary(); + + return 0; +} diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..cbe8c6c8 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,339 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "backward.h" +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities") + ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities") + ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in source") + ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in target") + ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr<MT19937> prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { + vector<WordID> src; + vector<WordID> trg; + vector<LexicalAlignment> a; + Array2D<short> posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector<WordID>& s) const { + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); + } + + void Increment(const TRule& rule) { + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(),
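+ // r.increment() reports whether this draw seated a new table in the MFCR;
+ // only then does the base distribution acquire a customer, so `base` is
+ // updated on new-table events only (Decrement below is symmetric).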
&*prng).count) { + base *= v[0] * l[0]; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<1,vector<WordID> > r; + const double u0; + const vector<prob_t> l; + mutable vector<prob_t> v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, + const unsigned words_e, + const unsigned letters_e, + vector<AlignedSentencePair>* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparameters() { + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + } + + void ResampleCorpus(); + + const vector<vector<WordID> >& letters; // spelling dictionary + vector<AlignedSentencePair>& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; + //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; + //ConditionalTranslationModel<UnigramWordBase> tmodel; + //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; + MConditionalTranslationModel<HierarchicalWordBase> tmodel; + //ConditionalTranslationModel<FixedNgramBase> tmodel; + //ConditionalTranslationModel<CompletelyUniformBase> tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ?
asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { + for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { + vector<WordID>& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D<bool> a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector<AlignedSentencePair> corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector<vector<WordID> > letters(TD::NumWords() + 1); + set<WordID> letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + // TODO configure this + const int max_src_chunk = conf["max_src_chunk"].as<unsigned>(); + const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>(); + const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>(); + const BackwardEstimator
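+ // (BackwardEstimator, defined in backward.cc below, combines the two Model 1
+ // tables to estimate how cheaply the remainder of a sentence pair can be
+ // completed from a given grid cell.)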
be(conf["s2t"].as<string>(), conf["t2s"].as<string>()); + Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); + + cerr << "Initializing transliteration graph structures ...\n"; + for (int i = 0; i < corpus.size(); ++i) { + const vector<int>& src = corpus[i].src; + const vector<int>& trg = corpus[i].trg; + for (int j = 0; j < src.size(); ++j) { + const vector<int>& src_let = letters[src[j]]; + for (int k = 0; k < trg.size(); ++k) { + const vector<int>& trg_let = letters[trg[k]]; + tl.Initialize(src[j], src_let, trg[k], trg_let); + //if (src_let.size() < min_trans_src) + // tl.Forbid(src[j], src_let, trg[k], trg_let); + } + } + } + cerr << endl; + tl.GraphSummary(); + + return 0; +} diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include <queue> +#include <utility> + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, + const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { + delete m1; m1 = NULL; + delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + unsigned src_covered, + unsigned trg_covered, + double s2t_ratio) const { + if (src_covered == src.size() || trg_covered == trg.size()) { + assert(src_covered == src.size()); + assert(trg_covered == trg.size()); + return 0; + } + static const WordID kNULL = TD::Convert("<eps>"); + const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); + // TODO factor in expected length ratio + prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_covered; j < trg.size(); ++j) { + prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); + for (unsigned i = src_covered; i < src.size(); ++i) + p += (*m1)(src[i], trg[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; + assert(!"failed"); + } + p *= uniform_alignment; + e *= p; + } + // TODO factor in expected length ratio + const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); + prob_t inv; + inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); + for (unsigned i = src_covered; i < src.size(); ++i) { + prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); + for (unsigned j = trg_covered; j < trg.size(); ++j) + p += (*m1inv)(trg[j], src[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; + assert(!"failed"); + } + p *= inv_uniform; + inv *= p; + } + return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector<WordID>& src, + const vector<WordID>& trg, + const Reachability& r, + double s2t_ratio, + float* grid) const { + queue<pair<int,int> > q; + q.push(make_pair(0,0)); + Array2D<bool> done(src.size()+1, trg.size()+1, false); + //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; + while(!q.empty()) { + const pair<int,int> n = q.front(); + q.pop(); + if (done(n.first,n.second)) continue; + done(n.first,n.second) = true; + + float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); + if (n.first == 0 && n.second == 0) grid[0] = lp; + //cerr << " " << n.first 
<< "," << n.second << "\t" << lp << endl; + + if (n.first == src.size() || n.second == trg.size()) continue; + const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second]; + for (int i = 0; i < edges.size(); ++i) + q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); + } + //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include <vector> +#include <string> +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { + BackwardEstimator(const std::string& s2t, + const std::string& t2s); + ~BackwardEstimator(); + + void InitializeGrid(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + const Reachability& r, + double src2trg_ratio, + float* grid) const; + + private: + float ComputeBackwardProb(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + unsigned src_covered, + unsigned trg_covered, + double src2trg_ratio) const; + + Model1* m1; + Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_distributions.cc index 8adb37d7..d9761005 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h" #include <iostream> @@ -6,6 +6,79 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + vector<int> le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' 
<< flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x].logeq(atof(TD::Convert(cur))); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc, + const vector<WordID>& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen ~Pois(1) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Unigram + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc, + const vector<WordID>& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, 1)); // elen ~Pois(1) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + void Model1::LoadModel1(const string& fname) { cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; ReadFile rf(fname); @@ -40,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) { // for each position i in e-RHS const WordID trg = vtrg[i + start_trg]; prob_t tp = prob_t::Zero(); @@ -66,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p *= ptrglen; p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -98,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc, prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p1 *= ptrglen; p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -120,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc, } prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1) // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); p2 *= psrclen; p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform for (int i = 0; i < flen; ++i) { // for each position i in E @@ -154,9 +227,9 @@ JumpBase::JumpBase() : p(200) { for (int j = min_jump; j <= max_jump; ++j) { prob_t& cp = cpd[j]; if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); + cp.logeq(Md::log_poisson(1.5-j, 1)); else if (j > 0) - cp.logeq(log_poisson(j, 1)); + cp.logeq(Md::log_poisson(j, 1)); cp.poweq(0.2); z += cp; } diff --git a/gi/pf/base_measures.h b/gi/pf/base_distributions.h index 7ce7e2e6..84dacdf2 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_distributions.h @@ -6,22 +6,15 @@ #include <string> #include <cmath> #include <iostream> +#include <cassert> +#include "unigrams.h" #include "trule.h" #include "prob.h" #include "tdict.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); - return os << ']'; -} +#include "sampler.h" +#include "m.h" +#include "os_phrase.h" struct Model1 { explicit Model1(const std::string& fname) : @@ -49,6 +42,104 @@ struct Model1 { std::vector<std::map<WordID, prob_t> > ttable; }; +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } + return it->second; + } + + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + + std::tr1::unordered_map<TRule,prob_t,RuleHasher> table; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + struct PhraseConditionalBase { explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : model1(m1), @@ -83,7 +174,7 @@ struct PhraseJointBase { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } 
@@ -113,7 +204,7 @@ struct PhraseJointBase_BiDir { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h new file mode 100644 index 00000000..81ddb206 --- /dev/null +++ b/gi/pf/conditional_pseg.h @@ -0,0 +1,275 @@ +#ifndef _CONDITIONAL_PSEG_H_ +#define _CONDITIONAL_PSEG_H_ + +#include <vector> +#include <tr1/unordered_map> +#include <boost/functional/hash.hpp> +#include <iostream> + +#include "m.h" +#include "prob.h" +#include "ccrp_nt.h" +#include "mfcr.h" +#include "trule.h" +#include "base_distributions.h" +#include "tdict.h" + +template <typename ConditionalBaseMeasure> +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; + for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; + } + } + + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits<double>::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it; +#if 1 + for (it = r.begin(); it != r.end(); ++it) { + it->second.resample_hyperparameters(rng); + } +#else + const unsigned nloop = 5; + const unsigned niterations = 10; + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 
100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { + it->second.set_discount(d); + it->second.set_strength(strength); + } +#endif + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; + } + p0s[0] = rp0(rule); + TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule); + } else { + p0s[0] = rp0(rule); + p = it->second.prob(rule, p0s.begin(), lambdas.begin()); + } + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(log_likelihood(d, strength)); + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + MFCR<1, TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; + double d, strength; + std::vector<prob_t> lambdas; + mutable std::vector<prob_t> p0s; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalTranslationModel { + explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + 
DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p.logeq(it->second.logprob(rule, log(rp0(rule)))); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + CCRP_NoTable<TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalParallelSegementationModel { + explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : + tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} + + ConditionalTranslationModel<ConditionalBaseMeasure> tmodel; + + void DecrementRule(const TRule& rule) { + tmodel.DecrementRule(rule); + } + + void IncrementRule(const TRule& rule) { + tmodel.IncrementRule(rule); + } + + void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { + tmodel.IncrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + IncrementAlign(rules[i]->f_.size()); + } + } + + void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { + tmodel.DecrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + DecrementAlign(rules[i]->f_.size()); + } + } + + prob_t RuleProbability(const TRule& rule) const { + return tmodel.RuleProbability(rule); + } + + void IncrementAlign(unsigned span) { + if (aligns.increment(span)) { + // TODO + } + } + + void DecrementAlign(unsigned span) { + if (aligns.decrement(span)) { + // TODO + } + } + + prob_t AlignProbability(unsigned span) const { + prob_t p; + p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(aligns.log_crp_prob()); + p *= base; + p *= tmodel.Likelihood(); + return p; + } + + prob_t base; + CCRP_NoTable<unsigned> aligns; +}; + +#endif + diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc new file mode 100644 index 00000000..3ea88016 --- /dev/null +++ b/gi/pf/condnaive.cc @@ -0,0 +1,298 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") + 
("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr<MT19937> prng; + +struct ModelAndData { + explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : + model(m), + rng(&*prng), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + } + + void InstantiateRule(const pair<short,short>& from, + const pair<short,short>& to, + const vector<int>& sentf, + const vector<int>& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.DecrementRule(x); + model.DecrementAlign(x.f_.size()); + } + } + + void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.IncrementRule(x); + model.IncrementAlign(x.f_.size()); + } + } + + prob_t Likelihood() const { + return model.Likelihood(); + } + + prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { + prob_t p = prob_t::One(); + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + p *= model.RuleProbability(x); + p *= model.AlignProbability(x.f_.size()); + } + return p; + } + + void Sample(); + + ConditionalParallelSegementationModel<PhraseConditionalBase>& model; + MT19937* rng; + const vector<vector<int> >& corpuse, corpusf; + const set<int>& vocabe, vocabf; + unsigned mh_samples, mh_rejects; + const int kX; + vector<vector<pair<short, short> > > derivations; +}; + +void ModelAndData::Sample() { + unsigned MAXK = 
kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; + TRule x; + x.lhs_ = -TD::Convert("X"); + + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], corpusf[i], corpuse[i]); + } + static TRule xx("[X] ||| w n ||| s h ||| X=0"); + const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second; + for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { + cerr << "\t" << it->second << "\t" << it->first << endl; + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector<int>& sentf = corpusf[s]; + const vector<int>& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector<pair<short, short> >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector<pair<short,short> > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair<short,short>(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet<prob_t> ss; + vector<pair<short,short> > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair<short,short>(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair<short,short>(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; + if (deriv.empty()) { deriv = 
newderiv; continue; } + ++mh_samples; + + if (deriv != newderiv) { + const prob_t p_new = Likelihood(); +// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; +// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; + if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { + ++mh_rejects; + DecrementDerivation(newderiv, sentf, sente); + IncrementDerivation(deriv, sentf, sente); + } else { +// cerr << " ACCEPT\n"; + deriv = newderiv; + } + } + } + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + + if (!conf.count("model1")) { + cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + Model1 m1(conf["model1"].as<string>()); + + PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); + ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0); + + ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); + posterior.Sample(); + + TRule r1("[X] ||| x ||| l e ||| X=0"); + TRule r2("[X] ||| A ||| a d ||| X=0"); + TRule r3("[X] ||| n ||| e r ||| X=0"); + TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); + + PhraseConditionalUninformativeBase u0(vocabe.size()); + + cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; + cerr << (u0(r4)) << endl; + + return 0; +} + diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc index a408e7cf..cb6e4ed7 100644 --- a/gi/pf/corpus.cc +++ b/gi/pf/corpus.cc @@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename, istream* in = rf.stream(); assert(*in); string line; + unsigned lc = 0; const WordID kDIV = TD::Convert("|||"); vector<WordID> tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; + while(getline(*in, line)) { + ++lc; e->push_back(vector<int>()); f->push_back(vector<int>()); vector<int>& le = e->back(); @@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename, for (unsigned i = 0; i < tmp.size(); ++i) { const int cur = tmp[i]; if (isf) { - if (kDIV == cur) { isf = false; } else { + if (kDIV == cur) { + isf = false; + } else { lf.push_back(cur); vocab_f->insert(cur); } } else { - assert(cur != kDIV); + if (cur == kDIV) { + cerr << "ERROR in " << lc << ": " << line << endl << endl; + abort(); + } le.push_back(cur); vocab_e->insert(cur); } diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index db1c43c7..469dff5c 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -6,7 +6,7 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl new file mode 100755 index 00000000..d00c2168 --- 
/dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while(<STDIN>) { + chomp; + my ($fsent, $esent, $alsent) = split / \|\|\| /; + die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + + my @fws = split /\s+/, $fsent; + my @ews = split /\s+/, $esent; + my @as = split /\s+/, $alsent; + my %a2b; + my %b2a; + for my $ap (@as) { + my ($a,$b) = split /-/, $ap; + die "BAD INPUT: $_\n" unless defined $a && defined $b; + $a2b{$a}->{$b} = 1; + $b2a{$b}->{$a} = 1; + } + for my $a (keys %a2b) { + my $bref = $a2b{$a}; + next unless scalar keys %$bref < 2; + my $b = (keys %$bref)[0]; + next unless scalar keys %{$b2a{$b}} < 2; + my $f = $fws[$a]; + next unless defined $f; + next unless length($f) > 3; + my $e = $ews[$b]; + next unless defined $e; + next unless length($e) > 3; + + $ef{$f}->{$e}++; + $es{$e}++; + $fs{$f}++; + $tot++; + } +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { + my $logf = log($fs{$f}); + my $esref = $ef{$f}; + for my $e (keys %$esref) { + my $loge = log($es{$e}); + my $ef = $esref->{$e}; + my $logef = log($ef); + my $pmi = $logef - ($loge + $logf); + next if $pmi < $MIN_PMI; + my @flets = split //, $f; + my @elets = split //, $e; + print "@flets ||| @elets\n"; + $num++; + } +} +print STDERR "Extracted $num pairs.\n"; +print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) { return os << ']'; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { + explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : + use_uniform_(fname.size() == 0), + p0null_(p0null), + uniform_((1.0 - p0null) / vocab_size), + probs_(TD::NumWords() + 1) { + if (fname.size() > 0) LoadUnigrams(fname); + probs_[0] = p0null_; + } + +// +// \data\ +// ngram 1=9295 +// +// \1-grams: +// -3.191193 " + + void LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + const prob_t pnon_null(1.0 - p0null_.as_float()); + if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); + } + } + + const prob_t& operator()(const WordID& w) const { + if (!w) return p0null_; + if (use_uniform_) return uniform_; + return probs_[w]; + } + + const bool use_uniform_; + const prob_t p0null_; + const prob_t uniform_; + vector<prob_t> probs_; +}; struct Model1 { explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") ("input,i",po::value<string>(),"Read parallel data from") - ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases") - ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform") + ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform") ("random_seed,S",po::value<uint32_t>(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename, int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); - const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); const unsigned particles = conf["particles"].as<unsigned>(); const unsigned samples = conf["samples"].as<unsigned>(); - + TD::Convert("<s>"); + TD::Convert("</s>"); + TD::Convert("<unk>"); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) { cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); + UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size()); + UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size()); + const prob_t kHALF(0.5); + const string kEMPTY = "NULL"; const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as<string>()); Model1 
invm1(conf["inverse_model1"].as<string>()); for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector<WordID>& src = corpusf[ci]; const vector<WordID>& trg = corpuse[ci]; - for (int i = 0; i < src.size(); ++i) { - for (int j = 0; j < trg.size(); ++j) { - const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); - for (int k = 0; k < eff_max_src; ++k) { - const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); - for (int l = 0; l < eff_max_trg; ++l) { - } - } + const vector<WordID>& src = corpusf[ci]; + for (int i = 0; i <= trg.size(); ++i) { + const WordID e_i = i > 0 ? trg[i-1] : 0; + for (int j = 0; j <= src.size(); ++j) { + const WordID f_j = j > 0 ? src[j-1] : 0; + if (e_i == 0 && f_j == 0) continue; + prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); + cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; + if (e_i && f_j) + cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; } } } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..ed1772bf --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,428 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; +vector<int> nt_vocab; +vector<int> nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs +static bool kHIERARCHICAL_PRIOR = false; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary") + ("hierarchical_prior,h", "Use hierarchical prior") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if 
(conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector<vector<WordID> >* e, + set<WordID>* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + vector<int>& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector<int> grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + if (kALLOW_MIXED) p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + if (kALLOW_MIXED) p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : + base(vocab_size, num_nts), + q0(1,1,1,1), + nts(num_nts, CCRP<TRule>(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); + } + + inline prob_t p0(const TRule& r) const { + if (kHIERARCHICAL_PRIOR) + return q0.prob(r, base(r)); + else + return base(r); + } + + int Increment(const TRule& r, MT19937* rng) { + const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.increment(r, base(r), rng); + return delta; + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.decrement(r, rng); + return delta; + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + if (kHIERARCHICAL_PRIOR) { + prob_t q; q.logeq(q0.log_crp_prob()); + p *= q; + for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) { + prob_t tp = base(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= base(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + if (kHIERARCHICAL_PRIOR) { + q0.resample_hyperparameters(rng); + cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; + } + cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; + } + + const BaseRuleModel base; + CCRP<TRule> q0; + vector<CCRP<TRule> > nts; + //CCRP_OneTable<TRule> x; +}; + +vector<GrammarIter* > tofreelist; + 
+HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside<prob_t, EdgeProb>(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) { + vector<prob_t> node_probs; + Inside<prob_t, EdgeProb>(hg, &node_probs); + queue<unsigned> q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet<prob_t> ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + 
plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as<unsigned>()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector<GrammarPtr> grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as<unsigned>(); + kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as<unsigned>(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector<vector<WordID> > corpuse; + set<WordID> vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector<double> v; v.set_value(kLP, 1.0); + vector<vector<unsigned> > derivs(corpuse.size()); + vector<Lattice> cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector<int>& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector<int>& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector<unsigned>& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... 
"; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i<scalar @vocab; $i++) { + my $most = $d{$vocab[$i]}; + my $least = 1; + + my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF); + if ($nl < 0) { $nl = 0; } + print "$vocab[$i] $nl\n" +} + + diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h index 301aa6d8..10d171fe 100644 --- a/gi/pf/monotonic_pseg.h +++ b/gi/pf/monotonic_pseg.h @@ -6,7 +6,7 @@ #include "prob.h" #include "ccrp_nt.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" template <typename BaseMeasure> struct MonotonicParallelSegementationModel { diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc new file mode 100644 index 00000000..1299f06f --- /dev/null +++ b/gi/pf/ngram_base.cc @@ -0,0 +1,69 @@ +#include "ngram_base.h" + +#include "lm/model.hh" +#include "tdict.h" + +using namespace std; + +namespace { +struct GICSVMapper : public lm::EnumerateVocab { + GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } + void Add(lm::WordIndex index, const StringPiece &str) { + const WordID cdec_id = TD::Convert(str.as_string()); + if (cdec_id >= out_->size()) + out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); + (*out_)[cdec_id] = index; + } + vector<lm::WordIndex>* out_; + const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +struct FixedNgramBaseImpl { + FixedNgramBaseImpl(const string& param) { + GICSVMapper vm(&cdec2klm_map_); + lm::ngram::Config conf; + conf.enumerate_vocab = &vm; + cerr << "Reading character LM from " << param << endl; + model = new lm::ngram::ProbingModel(param.c_str(), conf); + order = model->Order(); + kEOS = MapWord(TD::Convert("</s>")); + assert(kEOS > 0); + } + + lm::WordIndex MapWord(const WordID w) const { + if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; + return 0; + } + + ~FixedNgramBaseImpl() { delete model; } + + prob_t StringProbability(const vector<WordID>& s) const { + lm::ngram::State state = model->BeginSentenceState(); + double prob = 0; + for (unsigned i = 0; i < s.size(); ++i) { + const lm::ngram::State scopy(state); + prob += model->Score(scopy, MapWord(s[i]), state); + } + const lm::ngram::State scopy(state); + prob += model->Score(scopy, kEOS, state); + prob_t p; p.logeq(prob * log(10)); + return p; + } + + lm::ngram::ProbingModel* model; + unsigned order; + vector<lm::WordIndex> cdec2klm_map_; + lm::WordIndex kEOS; +}; + +FixedNgramBase::~FixedNgramBase() { delete impl; } + +FixedNgramBase::FixedNgramBase(const string& lmfname) { + impl = new FixedNgramBaseImpl(lmfname); +} + +prob_t 
FixedNgramBase::StringProbability(const vector<WordID>& s) const { + return impl->StringProbability(s); +} + diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h new file mode 100644 index 00000000..4ea999f3 --- /dev/null +++ b/gi/pf/ngram_base.h @@ -0,0 +1,25 @@ +#ifndef _NGRAM_BASE_H_ +#define _NGRAM_BASE_H_ + +#include <string> +#include <vector> +#include "trule.h" +#include "wordid.h" +#include "prob.h" + +struct FixedNgramBaseImpl; +struct FixedNgramBase { + FixedNgramBase(const std::string& lmfname); + ~FixedNgramBase(); + prob_t StringProbability(const std::vector<WordID>& s) const; + + prob_t operator()(const TRule& rule) const { + return StringProbability(rule.e_); + } + + private: + FixedNgramBaseImpl* impl; + +}; + +#endif diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..fc0af9cb --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include <vector> +#include <iostream> + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector<int>& v) { + os << '[' << v[0]; + if (v.size() == 2) os << ' ' << v[1]; + return os << ']'; +} + +struct Base { + Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} + inline double p0(const vector<int>& x) const { + double p = 0.75; + if (x.size() == 2) p = 0.25; + p *= 1.0 / 3.0; + if (x.size() == 2) p *= 1.0 / 3.0; + return p; + } + double est_deriv_prob(int a, int b, int seg) const { + assert(a > 0 && a < 4); // a \in {1,2,3} + assert(b > 0 && b < 4); // b \in {1,2,3} + assert(seg == 0 || seg == 1); // seg \in {0,1} + if (seg == 0) { + v[0] = a; + v[1] = b; + return crp.prob(v, p0(v)); + } else { + v1[0] = a; + v2[0] = b; + return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); + } + } + double est_marginal_prob(int a, int b) const { + return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); + } + int increment(int a, int b, double* pw = NULL) { + double p1 = est_deriv_prob(a, b, 0); + double p2 = est_deriv_prob(a, b, 1); + //p1 = 0.5; p2 = 0.5; + int seg = rng.SelectSample(p1,p2); + double tmp = 0; + if (!pw) pw = &tmp; + double& w = *pw; + if (seg == 0) { + v[0] = a; + v[1] = b; + w = crp.prob(v, p0(v)) / p1; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + w = crp.prob(v1, p0(v1)) / p2; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + w *= crp.prob(v2, p0(v2)); + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + return seg; + } + void increment(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + } + void decrement(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.decrement(v, &rng)) { + llh -= log(p0(v)); + } + } else { + v1[0] = a; + if (crp.decrement(v1, &rng)) { + llh -= log(p0(v1)); + } + v2[0] = b; + if (crp.decrement(v2, &rng)) { + llh -= log(p0(v2)); + } + } + } + double log_likelihood() const { + return llh + crp.log_crp_prob(); + } + double llh; + mutable vector<int> v, v1, v2; + CCRP<vector<int> > crp; +}; + +int main(int argc, char** argv) { + double tl = 0; + const int ITERS = 1000; + const int PARTICLES = 20; + const int DATAPOINTS = 50; + WordID x = TD::Convert("souvenons"); + WordID y = 
TD::Convert("remember"); + vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src); + vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg); +// Transliterations xx; +// xx.Initialize(x, src, y, trg); +// return 1; + + for (int j = 0; j < ITERS; ++j) { + Base b; + vector<int> segs(DATAPOINTS); + SampleSet<double> ss; + vector<int> sss; + for (int i = 0; i < DATAPOINTS; i++) { + ss.clear(); + sss.clear(); + int x = ((i / 10) % 3) + 1; + int y = (i % 3) + 1; + //double ep = b.est_marginal_prob(x,y); + //cerr << "est p(" << x << "," << y << ") = " << ep << endl; + for (int n = 0; n < PARTICLES; ++n) { + double w; + int seg = b.increment(x,y,&w); + //cerr << seg << " w=" << w << endl; + ss.add(w); + sss.push_back(seg); + b.decrement(x,y,seg); + } + int seg = sss[rng.SelectSample(ss)]; + b.increment(x, y, seg); + //cerr << "Selected: " << seg << endl; + //return 1; + segs[i] = seg; + } + tl += b.log_likelihood(); + } + cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h new file mode 100644 index 00000000..dfe40cb1 --- /dev/null +++ b/gi/pf/os_phrase.h @@ -0,0 +1,15 @@ +#ifndef _OS_PHRASE_H_ +#define _OS_PHRASE_H_ + +#include <iostream> +#include <vector> +#include "tdict.h" + +inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +#endif diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index aae5f798..3d578db2 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -7,7 +7,7 @@ #include <boost/program_options/variables_map.hpp> #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" @@ -315,7 +315,7 @@ struct BackwardEstimate { for (int i = 0; i < src_cov.size(); ++i) if (!src_cov[i]) r.push_back(src_[i]); const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -352,7 +352,7 @@ struct BackwardEstimateSym { if (!src_cov[i]) r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -367,7 +367,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git 
a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 728ec00d..e1a53f5c 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,7 +7,7 @@ #include <boost/program_options/variables_map.hpp> #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "reachability.h" #include "viterbi.h" @@ -77,7 +77,7 @@ struct BackwardEstimateSym { r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -92,7 +92,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..91029688 --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,209 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "tied_resampler.h" + +// A not very memory-efficient implementation of an N-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. 
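+//
+// A sketch of the predictive rule the recursion below implements (the
+// standard Pitman-Yor CRP form; the count symbols here are illustrative,
+// not the actual CCRP API): with c_w customers and t_w tables for word w
+// in context u, totals c and t, discount d and strength s,
+//
+//   P(w | u) = (c_w - d*t_w)/(c + s)  +  (s + d*t)/(c + s) * P(w | pi(u))
+//
+// where pi(u) is the (N-1)-gram backoff context; the recursion bottoms out
+// in the uniform 0-gram case PYPLM<0> defined below.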
+ +// I use templates to handle the recursive formulation of the prior, so +// the order of the model has to be specified here, at compile time: +#define kORDER 3 + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples") + ("train,i",po::value<string>(),"Training data file") + ("test,T",po::value<string>(),"Test data file") + ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") + ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") + ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") + ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("train") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template <unsigned N> struct PYPLM; + +// uniform base distribution (0-gram model) +template<> struct PYPLM<0> { + PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} + void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } + void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } + double prob(WordID, const vector<WordID>&) const { return p0; } + void resample_hyperparameters(MT19937*) {} + double log_likelihood() const { return draws * log(p0); } + const double p0; + int draws; +}; + +// represents an N-gram LM +template <unsigned N> struct PYPLM { + PYPLM(unsigned vs, double da, double db, double ss, double sr) : + backoff(vs, da, db, ss, sr), + tr(da, db, ss, sr, 0.8, 1.0), + lookup(N-1) {} + void increment(WordID w, const vector<WordID>& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); + if (it == p.end()) { + it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first; + tr.Add(&it->second); // add to resampler + } + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector<WordID>& context) const { + const double bo = backoff.prob(w, context); + for
(unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + + double log_likelihood() const { + double llh = backoff.log_likelihood(); + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it; + for (it = p.begin(); it != p.end(); ++it) + llh += it->second.log_crp_prob(); + llh += tr.LogLikelihood(); + return llh; + } + + void resample_hyperparameters(MT19937* rng) { + tr.ResampleHyperparameters(rng); + backoff.resample_hyperparameters(rng); + } + + PYPLM<N-1> backoff; + TiedResampler<CCRP<WordID> > tr; + double discount_a, discount_b, strength_s, strength_r; + double d, strength; + mutable vector<WordID> lookup; // thread-local + unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as<unsigned>(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector<vector<WordID> > corpuse; + set<WordID> vocabe; + const WordID kEOS = TD::Convert("</s>"); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + vector<vector<WordID> > test; + if (conf.count("test")) + CorpusTools::ReadFromFile(conf["test"].as<string>(), &test); + else + test = corpuse; + PYPLM<kORDER> lm(vocabe.size(), + conf["discount_prior_a"].as<double>(), + conf["discount_prior_b"].as<double>(), + conf["strength_prior_s"].as<double>(), + conf["strength_prior_r"].as<double>()); + vector<WordID> ctx(kORDER - 1, TD::Convert("<s>")); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector<WordID>& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + if (SS % 10 == 9) { + cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; + if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + } else { cerr << '.' << flush; } + } + double llh = 0; + unsigned cnt = 0; + unsigned oovs = 0; + for (int ci = 0; ci < test.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector<WordID>& s = test[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? 
s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + if (i < s.size() && vocabe.count(w) == 0) { + cerr << "**OOV "; + ++oovs; + lp = 0; + } + cerr << "p(" << TD::Convert(w) << " |"; + for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) + cerr << ' ' << TD::Convert(ctx[j]); + cerr << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; + cerr << " Count: " << cnt << endl; + cerr << " OOVs: " << oovs << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + + diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..e21f0267 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,131 @@ +#include "pyp_tm.h" + +#include <tr1/unordered_map> +#include <iostream> +#include <queue> + +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" +#include "tied_resampler.h" + +using namespace std; +using namespace std::tr1; + +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict<unsigned> fd_; +}; + +template <typename Base, class Binner = FreqBinner> +struct ConditionalPYPWordModel { + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} + + void Summary() const { + cerr << "Number of conditioning contexts: " << r.size() << endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; + for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + btr.ResampleHyperparameters(rng); + } + + prob_t Prob(const WordID src, const vector<WordID>& trglets) const { + RuleModelHash::const_iterator it = r.find(src); + if (it == r.end()) { + return base(trglets); + } else { + return it->second.prob(trglets, base(trglets)); + } + } + + void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + if (it == r.end()) { + it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; + static const WordID kNULL = TD::Convert("NULL"); + unsigned bin = (src == kNULL ? 
0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); + } + if (it->second.increment(trglets, base(trglets), rng)) + base.Increment(trglets, rng); + } + + void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + assert(it != r.end()); + if (it->second.decrement(trglets, rng)) { + base.Decrement(trglets, rng); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + return p; + } + + unsigned UniqueConditioningContexts() const { + return r.size(); + } + + // TODO tie PYP hyperparameters based on source word frequency bins + Base& base; + const Binner* binner; + BinTiedResampler<CCRP<vector<WordID> > > btr; + typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; + RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets, + const unsigned num_letters) : + letters(lets), + up0(new PYPWordModel(num_letters)), + tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))), + kX(-TD::Convert("X")) {} + +void PYPLexicalTranslation::Summary() const { + tmodel->Summary(); + up0->Summary(); +} + +prob_t PYPLexicalTranslation::Likelihood() const { + prob_t p = up0->Likelihood(); + p *= tmodel->Likelihood(); + return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { + tmodel->ResampleHyperparameters(rng); + up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { + return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { + return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { + tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { + tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..63e7c96d --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,35 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include <vector> +#include "wordid.h" +#include "prob.h" +#include "sampler.h" +#include "freqdict.h" + +struct FreqBinner; +struct PYPWordModel; +template <typename T, class B> struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { + explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, + const unsigned num_letters); + + prob_t Likelihood() const; + + void ResampleHyperparameters(MT19937* rng); + prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) + void Summary() const; + void Increment(WordID src, WordID trg, MT19937* rng); + void Decrement(WordID src, WordID trg, MT19937* rng); + unsigned UniqueConditioningContexts() const; + + private: + const std::vector<std::vector<WordID> >& letters; // spelling dictionary + PYPWordModel* up0; // base distribution (model English word) + ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions + // (model English word | French word) + const WordID kX; +}; + +#endif diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include <iostream> + +using namespace
std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { + cerr << "PYPWordModel: generations=" << r.num_customers() + << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ + << " (on " << it->second.table_counts_.size() << " tables) " + << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..ff366865 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include <iostream> +#include <cmath> +#include <vector> +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP +struct PYPWordModel { + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + + void ResampleHyperparameters(MT19937* rng); + + inline prob_t operator()(const std::vector<WordID>& s) const { + return r.prob(s, p0(s)); + } + + inline void Increment(const std::vector<WordID>& s, MT19937* rng) { + if (r.increment(s, p0(s), rng)) + base *= p0(s); + } + + inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) { + if (r.decrement(s, rng)) + base /= p0(s); + } + + inline prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const; + + private: + inline double logp0(const std::vector<WordID>& s) const { + return Md::log_poisson(s.size(), mean_length) + s.size() * u0; + } + + inline prob_t p0(const std::vector<WordID>& s) const { + prob_t p; p.logeq(logp0(s)); + return p; + } + + prob_t base; // keeps track of the draws from the base distribution + CCRP<std::vector<WordID> > r; + const double u0; // uniform log prob of generating a letter + const double mean_length; // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h new file mode 100644 index 00000000..588c8f84 --- /dev/null +++ b/gi/pf/quasi_model2.h @@ -0,0 +1,166 @@ +#ifndef _QUASI_MODEL2_H_ +#define _QUASI_MODEL2_H_ + +#include <vector> +#include <cmath> +#include <tr1/unordered_map> +#include "boost/functional.hpp" +#include "prob.h" +#include "array2d.h" +#include "slice_sampler.h" +#include "m.h" + +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast<const size_t&>(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + +struct QuasiModel2 { + explicit QuasiModel2(double alpha, double pnull = 0.1) : + alpha_(alpha), + pnull_(pnull), + pnotnull_(1 - pnull) {} + + // a_j = 0 => NULL; src_len does *not* include null + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + if (!a_j) return pnull_; + return pnotnull_ * + 
prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + struct PNullResampler { + PNullResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_pnull) const { + return log(m_.Likelihood(m_.alpha_, proposed_pnull)); + } + }; + + struct AlphaResampler { + AlphaResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_alpha) const { + return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const PNullResampler dr(*this); + const AlphaResampler ar(*this); + for (unsigned i = 0; i < nloop; ++i) { + double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, + 1.0, 0.0, niterations, 100*niterations); + pnull_ = prob_t(pnull); + alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + } + std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" + << pnull_.as_float() << ") = " << Likelihood() << std::endl; + zcache_.clear(); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p; + p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure + assert(!p.is_0()); + prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); + assert(!prob_of_ppnull.is_0()); + p *= prob_of_ppnull; + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } + return p; + } + + private: + static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + return p; + } + + static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) + z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= 
zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector<double>& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + + double alpha_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector<std::vector<std::vector<double> > > zcache_; + typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount; + ObsCount obs_; +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -30,15 +30,17 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } } a[0][0].clear(); - //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].size() == 0) { - cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + if (a[srclen][trglen].empty()) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + nodes = 0; return; } typedef boost::multi_array<bool, 2> rarray_type; rarray_type r(boost::extents[srclen + 1][trglen + 1]); r[srclen][trglen] = true; + nodes = 0; for (int i = srclen; i >= 0; --i) { for (int j = trglen; j >= 0; --j) { vector<SState>& prevs = a[i][j]; @@ -47,6 +49,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; int src_delta = i - prevs[k].prev_src_covered; edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered)); short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; if (src_delta > msd) msd = src_delta; } @@ -56,9 +59,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} + nodes = 0; + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (valid_deltas[i][j].size() > 0) { + node_addresses[i][j] = nodes++; + } else { + node_addresses[i][j] = -1; + } + } + } + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,18 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { - boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + unsigned nodes; + boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? 
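+ // A descriptive sketch of how the members below cooperate (see
+ // ComputeReachability in reachability.cc): node_addresses[i][j] maps each
+ // coverage pair that has at least one outgoing transition to a dense index
+ // in [0, nodes), and holds -1 otherwise, so per-node data (e.g. the
+ // backward estimates built in transliterations.cc) can live in one flat
+ // array of size "nodes"; valid_deltas[i][j] lists the (src_delta,
+ // trg_delta) pairs of the transitions leaving that node.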
boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") + boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { + max_src_delta(boost::extents[srclen][trglen]), + node_addresses(boost::extents[srclen][trglen]), + valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h new file mode 100644 index 00000000..6f45fbce --- /dev/null +++ b/gi/pf/tied_resampler.h @@ -0,0 +1,124 @@ +#ifndef _TIED_RESAMPLER_H_ +#define _TIED_RESAMPLER_H_ + +#include <set> +#include <vector> +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +template <class CRP> +struct TiedResampler { + explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : + d_alpha(da), + d_beta(db), + s_shape(ss), + s_rate(sr), + discount(d), + strength(s) {} + + void Add(CRP* crp) { + crps.insert(crp); + crp->set_discount(discount); + crp->set_strength(strength); + assert(!crp->has_discount_prior()); + assert(!crp->has_strength_prior()); + } + + void Remove(CRP* crp) { + crps.erase(crp); + } + + size_t size() const { + return crps.size(); + } + + double LogLikelihood(double d, double s) const { + if (s <= -d) return -std::numeric_limits<double>::infinity(); + double llh = Md::log_beta_density(d, d_alpha, d_beta) + + Md::log_gamma_density(d + s, s_shape, s_rate); + for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) + llh += (*it)->log_crp_prob(d, s); + return llh; + } + + double LogLikelihood() const { + return LogLikelihood(discount, strength); + } + + struct DiscountResampler { + DiscountResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_discount) const { + return m_.LogLikelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.discount, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } + const DiscountResampler dr(*this); + const AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + discount = slice_sampler1d(dr, discount, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << 
"TiedCRPs(d=" << discount << ",s=" + << strength << ") = " << LogLikelihood(discount, strength) << std::endl; + for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) { + (*it)->set_discount(discount); + (*it)->set_strength(strength); + } + } + private: + std::set<CRP*> crps; + const double d_alpha, d_beta, s_shape, s_rate; + double discount, strength; +}; + +// split according to some criterion +template <class CRP> +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < resamplers.size(); ++i) + llh += resamplers[i].LogLikelihood(); + return llh; + } + + private: + std::vector<TiedResampler<CRP> > resamplers; +}; + +#endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..2200715e --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,334 @@ +#include "transliterations.h" + +#include <iostream> +#include <vector> + +#include "boost/shared_ptr.hpp" + +#include "backward.h" +#include "filelib.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" +#include "m.h" +#include "reachability.h" + +using namespace std; +using namespace std::tr1; + +struct TruncatedConditionalLengthModel { + TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) { + for (unsigned i = 1; i <= max_src_size; ++i) { + prob_t z = prob_t::Zero(); + for (unsigned j = 1; j <= max_trg_size; ++j) + z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); + for (unsigned j = 1; j <= max_trg_size; ++j) + plens[i][j] /= z; + //for (unsigned j = 1; j <= max_trg_size; ++j) + // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; + } + } + + // return p(tlen | slen) for *chunks* not full words + inline const prob_t& operator()(int slen, int tlen) const { + return plens[slen][tlen]; + } + + vector<vector<prob_t> > plens; +}; + +struct CondBaseDist { + CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + + prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st, + const vector<WordID>& trg, unsigned tf, unsigned tt) const { + prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) + assert(!"not impl"); + return p; + } + inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const { + return (*this)(src, 0, src.size(), trg, 0, trg.size()); + } + TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +// p( a l - | A l ) , p( o | A w ) , ... 
+struct TransliterationChunkConditionalModel { + explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : + d(0.0), + strength(1.0), + rp0(pp0) { + } + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule.f_, rule.e_); + } else { + p = it->second.prob(rule, rp0(rule.f_, rule.e_)); + } + return p; + } + + double LogLikelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits<double>::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = //Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(aa); + return llh; + } + + struct AlphaResampler { + AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} + const TransliterationChunkConditionalModel& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it; + //const unsigned nloop = 5; + const unsigned niterations = 10; + //DiscountResampler dr(*this); + AlphaResampler ar(*this); +#if 0 + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } +#endif + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { +#if 0 + it->second.set_discount(d); +#endif + 
it->second.set_alpha(strength); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(LogLikelihood(d, strength)); + return p; + } + + const CondBaseDist& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + CCRP_NoTable<TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; + double d, strength; +}; + +struct GraphStructure { + GraphStructure() : r() {} + // leak memory - these are basically static + const Reachability* r; + bool IsReachable() const { return r->nodes > 0; } +}; + +struct ProbabilityEstimates { + ProbabilityEstimates() : gs(), backward() {} + explicit ProbabilityEstimates(const GraphStructure& g) : + gs(&g), backward() { + if (g.r->nodes > 0) + backward = new float[g.r->nodes]; + } + // leak memory, these are static + + // returns an estimate of the marginal probability + double MarginalEstimate() const { + if (!backward) return 0; + return backward[0]; + } + + // returns a backward estimate + double Backward(int src_covered, int trg_covered) const { + if (!backward) return 0; + int ind = gs->r->node_addresses[src_covered][trg_covered]; + if (ind < 0) return 0; + return backward[ind]; + } + + prob_t estp; + float* backward; + private: + const GraphStructure* gs; +}; + +struct TransliterationsImpl { + TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : + cp0(max_src, max_trg, sr), + tccm(cp0), + be(b), + kMAX_SRC_CHUNK(max_src), + kMAX_TRG_CHUNK(max_trg), + kS2T_RATIO(sr), + tot_pairs(), tot_mem() { + } + const CondBaseDist cp0; + TransliterationChunkConditionalModel tccm; + const BackwardEstimator& be; + + void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + + // init graph structure + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + GraphStructure& gs = graphs[src_len][trg_len]; + if (!gs.r) { + double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); + if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { + cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; + gs.r = new Reachability(src_len, trg_len, 0, 0); + } else { + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + } + } + + const Reachability& r = *gs.r; + + // init backward estimates + if (src >= ests.size()) ests.resize(src + 1); + unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg); + if (it != ests[src].end()) return; // already initialized + + it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; + ProbabilityEstimates& est = it->second; + if (!gs.r->nodes) return; // not derivable subject to length constraints + + be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); + cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; + tot_pairs++; + tot_mem += sizeof(float) * gs.r->nodes; + } + + void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + // TODO + } + + prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + assert(src.size() < graphs.size()); + const vector<GraphStructure>& tv = graphs[src.size()]; +
assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + if (gs.r->nodes == 0) + return prob_t::Zero(); + const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t); + assert(it != ests[s].end()); + return it->second.estp; + } + + void GraphSummary() const { + double to = 0; + double tn = 0; + double tt = 0; + for (int i = 0; i < graphs.size(); ++i) { + const vector<GraphStructure>& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; + tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } + } + } + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; + cerr << " Unique pairs = " << tot_pairs << endl; + cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; + } + + const int kMAX_SRC_CHUNK; + const int kMAX_TRG_CHUNK; + const double kS2T_RATIO; + unsigned tot_pairs; + size_t tot_mem; + vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len] + vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg] +}; + +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : + pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); +} + +void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); +} + +void Transliterations::GraphSummary() const { + pimpl_->GraphSummary(); +} + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..49d14684 --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,24 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include <vector> +#include "wordid.h" +#include "prob.h" + +struct BackwardEstimator; +struct TransliterationsImpl; +struct Transliterations { + // max_src and max_trg indicate how big the transliteration phrases can be + // see reachability.h for information about filter_ratio + explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); + ~Transliterations(); + void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); + void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); + void GraphSummary() const; + prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const; + private: + TransliterationsImpl* pimpl_; +}; + +#endif + diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include <string> +#include <cmath> + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) 
{ + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector<WordID> w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h new file mode 100644 index 00000000..1660d1ed --- /dev/null +++ b/gi/pf/unigrams.h @@ -0,0 +1,69 @@ +#ifndef _UNIGRAMS_H_ +#define _UNIGRAMS_H_ + +#include <vector> +#include <string> +#include <tr1/unordered_map> +#include <boost/functional.hpp> + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +struct UnigramModel { + explicit UnigramModel(const std::string& fname, unsigned vocab_size) : + use_uniform_(fname.size() == 0), + uniform_(1.0 / vocab_size), + probs_() { + if (fname.size() > 0) { + probs_.resize(TD::NumWords() + 1); + LoadUnigrams(fname); + } + } + + const prob_t& operator()(const WordID& w) const { + assert(w); + if (use_uniform_) return uniform_; + return probs_[w]; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + std::vector<prob_t> probs_; +}; + + +// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' +struct UnigramWordModel { + explicit UnigramWordModel(const std::string& fname) : + use_uniform_(false), + uniform_(1.0), + probs_() { + LoadUnigrams(fname); + } + + explicit UnigramWordModel(const unsigned vocab_size) : + use_uniform_(true), + uniform_(1.0 / vocab_size), + probs_() {} + + const prob_t& operator()(const std::vector<WordID>& s) const { + if (use_uniform_) return uniform_; + const VectorProbHash::const_iterator it = probs_.find(s); + assert(it != probs_.end()); + return it->second; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + typedef 
std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash; + VectorProbHash probs_; +}; + +#endif diff --git a/graehl/NOTES b/graehl/NOTES deleted file mode 100755 index 77e99fee..00000000 --- a/graehl/NOTES +++ /dev/null @@ -1,18 +0,0 @@ -BUG: tune is bad - urdu conf=baseline tuning (16 dev bleu score???) - -conf=baseline force=1 ./tune.sh - - decode is good. - - UPDATE: maybe tuning is fine; chris never gave me a dev-corpus-filtered grammar and so a bleu of 16 may be what we always got; i just never checked. this means i need to redo tuned-first-pass experiments - -valgrind is ok - - dist-vest? - - (changes made to scoring? plusequals? shared_ptr? small_vector?) - - scorer_test is good - - - line_optimer fast_score scorer diff --git a/graehl/NOTES.beam b/graehl/NOTES.beam deleted file mode 100755 index a48d1ab7..00000000 --- a/graehl/NOTES.beam +++ /dev/null @@ -1,29 +0,0 @@ -(graehl, comments on code) - -passive chart: completion of actual translation rules (X or S NT in Hiero), have -rule features. Hyperedge inserted with copy of rule feature vector -(non-sparse). Inefficient; should be postponed on intermediate parses with -global pruning; just keep pointer to rules and models must provide an interface -to build a (sparse) feat. vector on demand later for the stuff we keep. - -multithreading: none. list of hyperarcs for refinement would need to be -segregated into subforest blocks and have own output lists for later merging. -e.g. bottom up count number of tail-reachable nodes under each hypernode, then -assign to workers. - -ngram caching: trie, no locks, for example. for threading, LRU hashing w/ locks per bucket is probably better, or per-thread caches. probably cache is reset per sentence? - -randlm worth using? guess not. - -actually get all 0-state models in 1st pass parse and prune passive edges per span. - -allocate cube pruning budget per prev pass - -(has been tried in ISI decoder) models with nonstandard state comparison, -typically (partially) greedy forest scoring, some part of the state is excluded -from equality/hashing. Within virtual ff interface, would just add equals, hash -to vtable (rather than the faster raw state compare). If this is done often, -then add a nonvirtual flag to interface instead, saying whether to use the -virt. methods or not. or: simple flag by user of ApplyModels (per feature?) -saying whether to be 100% greedy or 0% - no halfway e.g. item name uses bigram -context, but score using 5gram state. diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley deleted file mode 100755 index 0953708c..00000000 --- a/graehl/NOTES.earley +++ /dev/null @@ -1,111 +0,0 @@ -1. fsts (modify target string produced) are quite feasible. augment fsa ff to not just emit features, but also new target words. composition or intersection is no problem (can always bunch into a single FSA/FST lazily by wrapping) - -2. sparse fsas (many transitions have -inf score) aren't efficiently supported presently (they are in early_composer where the fsa is a phrase table); the fsa ff interface doesn't even provide a way to query the non-0 transitions (you have to emit a -inf feature). if sparse fsas were expected often and we wanted exact search, then a similar index of the tcfg as in earley_composer would make sense. however, with prob. beam search, we prune out bad-scoring stuff anyway - -3. binarization of rhs isn't usually considered necessary in earley, but i liked the idea of optimal binarization making the most sharing possible. 
however, this means what would have just been a scan is now a scan+complete. - -4. prefix (forward) + inside cost. this is phrased in a way so that prefix includes inside. but there's no reason not to think of it in exclusive terms (outside,inside) where prefix=outside*inside when using viterbi. on the other hand, we usually use the outside*inside as the beam score. and furthermore, it looks like when summing over all derivations, there would be some difficulty calculating, as the total inside wouldn't be known at first. - -(a,i) r => (+=a*r,r) would become (o,i) r => (+=[(o*i*r)/r],r) = (+=o*i,r) -(_,b'') (a,b) => (+=a*b'',+=b*b'') would become (_,b'') (o,b) => (?????) - -==== - - -the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa. - -Chris' paper http://www.ling.umd.edu/~redpony/forest-reordering.pdf - apparently (figure 5) already contains the exact concept we're going for, albeit with only inside scores. http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz describes a nice way of computing sums over derivations given a string by keeping a tuple of ("forward","inner") scores while Earley parsing. I'm not sure yet if this is applicable (because we'll already have the exact outside scores from the -LM forest already, and plan on using cost pushing toward the top so we don't have to explicitly consider them). - -normally in earley, one word is consumed at a time, left to right. completions happen from shortest to longest, then (repeated) predictions, and finally scans. i'm sure this has the usual obvious extension to parsing lattices (proceed in some topological order). - -but because the wfsa (ngram lm) has cycles and forgets the length of the string (at some point), it's slightly more complicated than lattice parsing the tcfg - there's no topological order over the wfsa states and so you can't finish all the items [x,j] for j from left->right. best first with monotonic total scores (admissable heuristics) is an easy way to avoid generating the whole space; otherwise I can't think of a fixed order that would allow for alternative beaming. as we discussed, arbitrary predicates filtering candidate items can be added if exact best-first is too slow - -if the wfsa were just a single string t[0...n-1], then any time you have an item [i,j]X->a.b that means that there's some derivation in the tCFG of S =>* t[0...i-1]Xc => t[0....i-1]abc =>* t[0...j-1]bc , for a FSA, the analog is S =>* W(0,i)Xc => W(0,i)abc =>* W(0,i)W(i,j)bc where W(a,b) means any string in the wfsa language with a as the start state and b as the final state. - - -http://www.isi.edu/natural-language/teaching/cs544/cs544-huang-3-Earley.pdf - -http://www.isi.edu/~lhuang/dp-final.pdf (variation on stolcke 1995 prefix cost) - -http://acl.ldc.upenn.edu/P/P07/P07-1019.pdf - phrase based lazy priority queue "cube growing" descendants (p149) - - - - - -http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz - -http://www.icsi.berkeley.edu/~stolcke/papers/cl95/node10.html#SECTION00042000000000000000 - -a) An (unconstrained) Earley path, or simply path, is a sequence of Earley -states linked by prediction, scanning, or completion. 
For the purpose of -this definition, we allow scanning to operate in “generation mode,” i.e., all -states with terminals to the right of the dot can be scanned, not just those -matching the input. (For completed states, the predecessor state is defined -to be the complete state from the same state set contributing to the -completion.) -b) A path is said to be constrained by, or generate a string x if the terminals -immediately to the left of the dot in all scanned states, in sequence, form -the string x. -c) A path is complete if the last state on it matches the first, except that the -dot has moved to the end of the RHS. -d) We say that a path starts with nonterminal X if the first state on it is a -predicted statewith X on the LHS. -e) The length of a path is defined as the number of scanned states on it. - -Note that the definition of path length is somewhat counter-intuitive, but is motivated -by the fact that only scanned states correspond directly to input symbols. Thus, -the length of a path is always the same as the length of the input string it generates. - -A constrained path starting with the initial state contains a sequence of states from -state set 0 derived by repeated prediction, followed by a single state from set 1 produced -by scanning the first symbol, followed by a sequence of states produced by completion, -followed by a sequence of predicted states, followed by a state scanning the second -symbol, and so on. The significance of Earley paths is that they are in a one-to-one -correspondence with left-most derivations. - - -============= - -The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all -constrained paths of length that end in state X[k]->x.y - -b) The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all -paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] - -(forward,inner) [i.e. (outside,inside)?] - unchanged by scan (rule cost is paid up front when predicting) - -if X[k,i] -> x.Yz (a,b) and rule Y -> r (p) -then Y[i,i] -> .r (a',p) with a' += a*p - -if Y[j,i]->y. (a'',b'') and X[k,j]->r.Ys (a,b) -then X[k,i]->rY.s (a',b') with a' += a*b'', b' += b*b'' - -(this is summing over all derivations) - - -========== - -is forward cost viterbi fine? i.e. can i have items whose names ignore the lhs NT (look up predictions that i finish lazily / graph structured?) -====== - -1) A -> x . * (trie) - -this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? - -vs. just - -2) * -> x . y - -here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. - -vs. - -3) * -> x . * - -with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. - -====== - --LM forest may have many in-edges per V. (many rules per NT lhs). so instead of generating all successors for scan/predict, i wanted to have them in sorted (admissable) -LM cost order and postpone once the prefix+rule part is more expensive than something else in the agenda. 
question: how many such postponed successor things diff --git a/graehl/NOTES.lm.phrase b/graehl/NOTES.lm.phrase deleted file mode 100755 index e87cc6fb..00000000 --- a/graehl/NOTES.lm.phrase +++ /dev/null @@ -1,180 +0,0 @@ -possibly the most direct solution is to print every individual probability from LM (to global fstream?). since the difference happens even w/o shortening, disable shortening to remove the possible effect of floor(p+bo) vs floor(p)+bo disagreeing - -+LM forest (nodes/edges): 2163/11293 - +LM forest (paths): 7.14685e+14 - +LM forest Viterbi logp: -490.21 - +LM forest Viterbi: karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . - +LM forest derivation: ({<0,28>[1] ||| (final r->l(( karachi|<s> ) start=[ <s>]->{karachi (} r->l(</s>|. sea) end=[sea .]->{</s>} LanguageModelFsa=-5.74766; }({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); }({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); }({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); }({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); }({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); }({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); }({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)}({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)}({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)}) ) ) ) ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)}({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)}) ) ) ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)}) ) ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)}({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)}({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)}) ) ) ) ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)}({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)}({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)}) ({<14,15>4 ||| 
r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)}) ) ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)}({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)}) ) ) ) ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. sea the|)}({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)}({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)}({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)}({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)}) ) ) ) ) ) ) - +LM forest features: Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 -Output kbest to - -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 ||| -490.21 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi|<s> ) start=[ <s>]->{karachi (} r->l(</s>|. 
sea) end=[sea .]->{</s>} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. 
sea the|)} - ({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)} - ({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found the dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.828;LmFsa=-446.508;PassThrough=5;PhraseModel_0=12.697;PhraseModel_1=11.6391;PhraseModel_2=11.5728;WordPenalty=-13.4631;Unigram=-463.765;UnigramFsa=-463.765 ||| -490.295 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi|<s> ) start=[ <s>]->{karachi (} r->l(</s>|. sea) end=[sea .]->{</s>} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found the : [sea .]] r->l(the found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-3.6217 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 
h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found the : [ from]] r->l(the found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found the : [sea .]] LanguageModelFsa=-4.84745 h=-5.31983); r->l(. sea the|)} - ({<21,27>found [1] from ||| [the dead : []] r->l(the found|) r->l(dead|the found) rule-phrase[from] r->l(from|) = [found the : [ from]] LanguageModelFsa=-5.34421 h=-5.31983); r->l(found|) r->l(from|)} - ({<22,26>the dead body of [1] ||| r->l(dead the|) rule-phrase[body of] r->l(of body|dead the) [a person : []] r->l(person a|of body) = [the dead : []] LanguageModelFsa=-3.7205 h=-4.97373); r->l(of body dead the|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY killed 4 people including a woman while dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-445.419;LmFsa=-445.099;PassThrough=5;PhraseModel_0=12.5687;PhraseModel_1=12.5781;PhraseModel_2=9.61571;WordPenalty=-12.5945;Unigram=-461.303;UnigramFsa=-461.303 ||| -490.646 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi|<s> ) start=[ <s>]->{karachi (} r->l(</s>|. 
sea) end=[sea .]->{</s>} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [while dead : [sea .]] r->l(dead while|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.71074 h=-5.83552); } - ({<0,19>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY killed : [a woman]] r->l(killed gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-103.345 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,19>[2] killed [1] ||| [gyY] r->l(killed gyY|) [4 people : [a woman]] r->l(people 4|killed gyY) = [gyY killed : [a woman]] LanguageModelFsa=-2.98475 h=-103.345); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ({<19,28>while [1] ||| [dead body : [sea .]] r->l(dead while|) r->l(body|dead while) = [while dead : [sea .]] LanguageModelFsa=-1.20144 h=-6.25144); r->l(while|)} - ({<20,28>[1] . ||| [dead body : [the sea]] r->l(body dead|) rule-phrase[.] 
r->l(.|sea the) = [dead body : [sea .]] LanguageModelFsa=-0.45297 h=-4.63222); r->l(.|)} - ({<20,26>[1] the sea ||| [dead body : [ from]] r->l(body dead|) rule-phrase[the sea] r->l(sea the|from ) = [dead body : [the sea]] LanguageModelFsa=-4.39448 h=-4.63222); r->l(sea the|)} - ({<21,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : [ from]] r->l(person a|of body) = [dead body : [ from]] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<21,24>a [1] from ||| [person] r->l(person a|) rule-phrase[from] r->l(from|) = [a person : [ from]] LanguageModelFsa=-2.33299 h=-4.90016); r->l(a|) r->l(from|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) - ) -) diff --git a/graehl/NOTES.partial.binarize b/graehl/NOTES.partial.binarize deleted file mode 100755 index a9985891..00000000 --- a/graehl/NOTES.partial.binarize +++ /dev/null @@ -1,21 +0,0 @@ -Earley doesn't require binarized rules. - -But a (partially) binarized grammar may lead to smaller (exhaustive or heuristic) charts. The tradeoff is mostly more reduce steps (the # of NTs should be similar to or less than the usual dotted-item binarization). - -Optionally collapse a rule rhs to unary as well (normal binarization would stop when an rhs is binary), if the rule to collapse it exists or is frequent enough. - -Greedy binarization schemes: - -1) (repeatedly) for the most frequent rhs bigram "X a" create a binary rule "V -> X a" and replace "X a" in all rules' rhs with V. stop if the most frequent bigram has count lower than some threshold (e.g. 3), because each instance of it saves one symbol, but the new rule has 3 symbols. - -2) (repeatedly) for each rule, pick the most frequent bigram in its rhs and binarize it (2a for that rule only, 2b everywhere that bigram occurs). again, some frequency threshold. optionally allow collapsing an rhs to unary. this fails to use some substitutions that are available "for free" based on actions taken at earlier rules w/ no frequent bigrams in common with this one. - -3) (DeNero) (for complete binarization only?) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created. If no prefix or suffix of r already exists as a virtual rule, then choose k=floor(n/2). To amend this to consider frequency of rhs, use the frequency of rhs-prefixes/suffixes to decide where to split? - -4?) Song, Chin-Yew Lin - seems to require collecting stats from a large parsed corpus - interesting idea: make rules that don't match fail early (that's 1 way you get a speedup), and pick V1 -> ... based on some kind of expected utility. - -5) l2r, r2l. yawn. - -1) seems the most sensible. don't just keep a count for each bigram, keep a set of left and right adjacent partially overlapping bigrams (i.e. the words left and right). for "a b" if "c" and "d" occur to the right, then "b c" and "b d" would be the right adjacent bigrams. when replacing a bigram, follow the left and right adjacencies to decrement the count of those bigrams, and add a (bidirectional) link to the new bigram. (a sketch of this scheme, minus the adjacency bookkeeping, follows this note.) - -Further, partial-1) can be followed by complete-3) or 5) - although i see no reason not to just continue 1) until the grammar is binary if you want a full binarization.
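
(editorial sketch referenced above - a rough C++ rendering of greedy scheme 1), with hypothetical Rule/Symbol types; it recomputes bigram counts each pass instead of maintaining the adjacency links described in the note, and assumes threshold >= 2 so the freshly added "V -> X a" rules are not themselves re-replaced:)

#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

typedef std::string Symbol;
struct Rule { Symbol lhs; std::vector<Symbol> rhs; };

// repeatedly replace the most frequent rhs bigram "X a" with a fresh
// virtual NT V and add the binary rule V -> X a. each replacement saves
// one symbol while the new rule costs 3, hence a stopping threshold like 3.
void GreedyBinarize(std::vector<Rule> &rules, size_t threshold) {
  for (int virt = 0; ; ++virt) {
    typedef std::pair<Symbol, Symbol> Bigram;
    // count every adjacent rhs pair
    std::map<Bigram, size_t> counts;
    for (size_t r = 0; r < rules.size(); ++r)
      for (size_t i = 0; i + 1 < rules[r].rhs.size(); ++i)
        ++counts[Bigram(rules[r].rhs[i], rules[r].rhs[i + 1])];
    // pick the most frequent bigram
    Bigram best;
    size_t best_count = 0;
    for (std::map<Bigram, size_t>::const_iterator i = counts.begin(); i != counts.end(); ++i)
      if (i->second > best_count) { best = i->first; best_count = i->second; }
    if (best_count < threshold) return;
    std::ostringstream name;
    name << "V" << virt;  // fresh virtual nonterminal
    const Symbol v(name.str());
    // rewrite every (left-to-right, non-overlapping) occurrence of "X a" as V
    for (size_t r = 0; r < rules.size(); ++r) {
      std::vector<Symbol> out;
      for (size_t i = 0; i < rules[r].rhs.size(); ++i) {
        if (i + 1 < rules[r].rhs.size() && rules[r].rhs[i] == best.first && rules[r].rhs[i + 1] == best.second) {
          out.push_back(v);
          ++i;  // consumed both symbols of the bigram
        } else {
          out.push_back(rules[r].rhs[i]);
        }
      }
      rules[r].rhs.swap(out);
    }
    Rule vr;  // the new binary rule V -> X a
    vr.lhs = v;
    vr.rhs.push_back(best.first);
    vr.rhs.push_back(best.second);
    rules.push_back(vr);
  }
}
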
diff --git a/graehl/NOTES.wfsa b/graehl/NOTES.wfsa deleted file mode 100755 index b74dc810..00000000 --- a/graehl/NOTES.wfsa +++ /dev/null @@ -1,16 +0,0 @@ -left-to-right finite-state models (with heuristic) that depend only on the target string. - -http://github.com/jganitkevitch/cdec.git has some progress toward this: - -earley_generator.*: make a trie of earley dotted items (from first pass finite parse projected to target side?) and rules for each earley deduction step (is the predict step actually making a hyperedge? or is it marked "active" and so doesn't appear in the result?) - -ff_ltor.*: interface for l2r models; needless scoring of "complete" action (only heuristic changes there and heuristics can just be precomputed for all dot-items -ff_lm.*: ugly clone of regular LM model with l2r interface - -apply_models.*: ApplyLeftToRightModelSet - -l2r features: - -multiple feature ids from single model? - -declare markov bound for bottom-up scoring (inside items) wrapper, and "backoff start" state (i.e. empty context, not <s> context) diff --git a/klm/lm/bhiksha.cc b/klm/lm/bhiksha.cc index bf86fd4b..cdeafb47 100644 --- a/klm/lm/bhiksha.cc +++ b/klm/lm/bhiksha.cc @@ -1,5 +1,6 @@ #include "lm/bhiksha.hh" #include "lm/config.hh" +#include "util/file.hh" #include <limits> @@ -12,12 +13,12 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_ const uint8_t kArrayBhikshaVersion = 0; +// TODO: put this in binary file header instead when I change the binary file format again. void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) { uint8_t version; uint8_t configured_bits; - if (read(fd, &version, 1) != 1 || read(fd, &configured_bits, 1) != 1) { - UTIL_THROW(util::ErrnoException, "Could not read from binary file"); - } + util::ReadOrThrow(fd, &version, 1); + util::ReadOrThrow(fd, &configured_bits, 1); if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion); config.pointer_bhiksha_bits = configured_bits; } diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh index 3df43dda..5182ee2e 100644 --- a/klm/lm/bhiksha.hh +++ b/klm/lm/bhiksha.hh @@ -13,7 +13,7 @@ #ifndef LM_BHIKSHA__ #define LM_BHIKSHA__ -#include <inttypes.h> +#include <stdint.h> #include <assert.h> #include "lm/model_type.hh" diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc index 27cada13..4796f6d1 100644 --- a/klm/lm/binary_format.cc +++ b/klm/lm/binary_format.cc @@ -1,19 +1,15 @@ #include "lm/binary_format.hh" #include "lm/lm_exception.hh" +#include "util/file.hh" #include "util/file_piece.hh" +#include <cstddef> +#include <cstring> #include <limits> #include <string> -#include <fcntl.h> -#include <errno.h> -#include <stdlib.h> -#include <string.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> +#include <stdint.h> namespace lm { namespace ngram { @@ -24,14 +20,16 @@ const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; const long int kMagicVersion = 5; -// Test values. -struct Sanity { +// Old binary files built on 32-bit machines have this header. +// TODO: eliminate with next binary release. 
+struct OldSanity { char magic[sizeof(kMagicBytes)]; float zero_f, one_f, minus_half_f; WordIndex one_word_index, max_word_index; uint64_t one_uint64; void SetToReference() { + std::memset(this, 0, sizeof(OldSanity)); std::memcpy(magic, kMagicBytes, sizeof(magic)); zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; one_word_index = 1; @@ -40,27 +38,35 @@ struct Sanity { } }; -const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; -std::size_t TotalHeaderSize(unsigned char order) { - return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); -} +// Test values aligned to 8 bytes. +struct Sanity { + char magic[ALIGN8(sizeof(kMagicBytes))]; + float zero_f, one_f, minus_half_f; + WordIndex one_word_index, max_word_index, padding_to_8; + uint64_t one_uint64; -void ReadLoop(int fd, void *to_void, std::size_t size) { - uint8_t *to = static_cast<uint8_t*>(to_void); - while (size) { - ssize_t ret = read(fd, to, size); - if (ret == -1) UTIL_THROW(util::ErrnoException, "Failed to read from binary file"); - if (ret == 0) UTIL_THROW(util::ErrnoException, "Binary file too short"); - to += ret; - size -= ret; + void SetToReference() { + std::memset(this, 0, sizeof(Sanity)); + std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes)); + zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; + one_word_index = 1; + max_word_index = std::numeric_limits<WordIndex>::max(); + padding_to_8 = 0; + one_uint64 = 1; } +}; + +const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; + +std::size_t TotalHeaderSize(unsigned char order) { + return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); } void WriteHeader(void *to, const Parameters &params) { Sanity header = Sanity(); header.SetToReference(); - memcpy(to, &header, sizeof(Sanity)); + std::memcpy(to, &header, sizeof(Sanity)); char *out = reinterpret_cast<char*>(to) + sizeof(Sanity); *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed; @@ -74,14 +80,6 @@ void WriteHeader(void *to, const Parameters &params) { } // namespace -void SeekOrThrow(int fd, off_t off) { - if ((off_t)-1 == lseek(fd, off, SEEK_SET)) UTIL_THROW(util::ErrnoException, "Seek failed"); -} - -void AdvanceOrThrow(int fd, off_t off) { - if ((off_t)-1 == lseek(fd, off, SEEK_CUR)) UTIL_THROW(util::ErrnoException, "Seek failed"); -} - uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) { if (config.write_mmap) { std::size_t total = TotalHeaderSize(order) + memory_size; @@ -89,7 +87,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_ strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order)); return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order); } else { - backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED); + util::MapAnonymous(memory_size, backing.vocab); return reinterpret_cast<uint8_t*>(backing.vocab.get()); } } @@ -98,42 +96,58 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; if (config.write_mmap) { // Grow
the file to accomodate the search, using zeros. - if (-1 == ftruncate(backing.file.get(), adjusted_vocab + memory_size)) - UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (adjusted_vocab + memory_size) << " failed"); + try { + util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); + } catch (util::ErrnoException &e) { + e << " for file " << config.write_mmap; + throw e; + } + if (config.write_method == Config::WRITE_AFTER) { + util::MapAnonymous(memory_size, backing.search); + return reinterpret_cast<uint8_t*>(backing.search.get()); + } + // mmap it now. // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. - off_t page_size = sysconf(_SC_PAGE_SIZE); - off_t alignment_cruft = adjusted_vocab % page_size; + std::size_t page_size = util::SizePage(); + std::size_t alignment_cruft = adjusted_vocab % page_size; backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); - return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft; } else { - backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED); + util::MapAnonymous(memory_size, backing.search); return reinterpret_cast<uint8_t*>(backing.search.get()); } } -void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) { - if (config.write_mmap) { - if (msync(backing.search.get(), backing.search.size(), MS_SYNC) || msync(backing.vocab.get(), backing.vocab.size(), MS_SYNC)) - UTIL_THROW(util::ErrnoException, "msync failed for " << config.write_mmap); - // header and vocab share the same mmap. The header is written here because we know the counts. - Parameters params; - params.counts = counts; - params.fixed.order = counts.size(); - params.fixed.probing_multiplier = config.probing_multiplier; - params.fixed.model_type = model_type; - params.fixed.has_vocabulary = config.include_vocab; - params.fixed.search_version = search_version; - WriteHeader(backing.vocab.get(), params); +void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) { + if (!config.write_mmap) return; + util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); + switch (config.write_method) { + case Config::WRITE_MMAP: + util::SyncOrThrow(backing.search.get(), backing.search.size()); + break; + case Config::WRITE_AFTER: + util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad); + util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size()); + util::FSyncOrThrow(backing.file.get()); + break; } + // header and vocab share the same mmap. The header is written here because we know the counts. 
+ Parameters params = Parameters(); + params.counts = counts; + params.fixed.order = counts.size(); + params.fixed.probing_multiplier = config.probing_multiplier; + params.fixed.model_type = model_type; + params.fixed.has_vocabulary = config.include_vocab; + params.fixed.search_version = search_version; + WriteHeader(backing.vocab.get(), params); } namespace detail { bool IsBinaryFormat(int fd) { - const off_t size = util::SizeFile(fd); - if (size == util::kBadSize || (size <= static_cast<off_t>(sizeof(Sanity)))) return false; + const uint64_t size = util::SizeFile(fd); + if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false; // Try reading the header. util::scoped_memory memory; try { @@ -154,19 +168,23 @@ bool IsBinaryFormat(int fd) { if ((end_ptr != begin_version) && version != kMagicVersion) { UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary"); } + + OldSanity old_sanity = OldSanity(); + old_sanity.SetToReference(); + UTIL_THROW_IF(!memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable."); UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture"); } return false; } void ReadHeader(int fd, Parameters &out) { - SeekOrThrow(fd, sizeof(Sanity)); - ReadLoop(fd, &out.fixed, sizeof(out.fixed)); + util::SeekOrThrow(fd, sizeof(Sanity)); + util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed)); if (out.fixed.probing_multiplier < 1.0) UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); out.counts.resize(static_cast<std::size_t>(out.fixed.order)); - ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); + if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); } void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) { @@ -179,11 +197,11 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet } void SeekPastHeader(int fd, const Parameters &params) { - SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); + util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing) { - const off_t file_size = util::SizeFile(backing.file.get()); + const uint64_t file_size = util::SizeFile(backing.file.get()); // The header is smaller than a page, so we have to map the whole header as well. std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size; if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map) @@ -194,9 +212,8 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t if (config.enumerate_vocab && !params.fixed.has_vocabulary) UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them.
You may need to rebuild the binary file with an updated version of build_binary."); - if (config.enumerate_vocab) { - SeekOrThrow(backing.file.get(), total_map); - } + // Seek to vocabulary words + util::SeekOrThrow(backing.file.get(), total_map); return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size()); } diff --git a/klm/lm/binary_format.hh b/klm/lm/binary_format.hh index e9df0892..dd795f62 100644 --- a/klm/lm/binary_format.hh +++ b/klm/lm/binary_format.hh @@ -12,7 +12,7 @@ #include <cstddef> #include <vector> -#include <inttypes.h> +#include <stdint.h> namespace lm { namespace ngram { @@ -33,10 +33,8 @@ struct FixedWidthParameters { unsigned int search_version; }; -inline std::size_t Align8(std::size_t in) { - std::size_t off = in % 8; - return off ? (in + 8 - off) : in; -} +// This is a macro instead of an inline function so constants can be assigned using it. +#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) // Parameters stored in the header of a binary file. struct Parameters { @@ -53,10 +51,6 @@ struct Backing { util::scoped_memory search; }; -void SeekOrThrow(int fd, off_t off); -// Seek forward -void AdvanceOrThrow(int fd, off_t off); - // Create just enough of a binary file to write vocabulary to it. uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing); // Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin. @@ -64,7 +58,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t // Write header to binary file. This is done last to prevent incomplete files // from loading. -void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing); +void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing); namespace detail { diff --git a/klm/lm/blank.hh b/klm/lm/blank.hh index 2fb64cd0..4da81209 100644 --- a/klm/lm/blank.hh +++ b/klm/lm/blank.hh @@ -3,7 +3,7 @@ #include <limits> -#include <inttypes.h> +#include <stdint.h> #include <math.h> namespace lm { diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index fdb62a71..8cbb69d0 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -8,18 +8,24 @@ #include <math.h> #include <stdlib.h> -#include <unistd.h> + +#ifdef WIN32 +#include "util/getopt.hh" +#endif namespace lm { namespace ngram { namespace { void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" " Default is -100. 
The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have <s> and </s>.\n" -"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-w mmap|after determines how writing is done.\n" +" mmap maps the binary file and writes to it. Default for trie.\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n\n" "type is either probing or trie. Default is probing.\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" @@ -55,7 +61,7 @@ uint8_t ParseBitCount(const char *from) { unsigned long val = ParseUInt(from); if (val > 25) { util::ParseNumberException e(from); - e << " bit counts are limited to 256."; + e << " bit counts are limited to 25."; } return val; } @@ -87,7 +93,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { prefix = 'G'; divide = 1 << 30; } - long int length = std::max<long int>(2, lrint(ceil(log10(max_length / divide)))); + long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide)))); std::cout << "Memory estimate:\ntype "; // right align bytes. for (long int i = 0; i < length - 2; ++i) std::cout << ' '; @@ -112,10 +118,10 @@ int main(int argc, char *argv[]) { using namespace lm::ngram; try { - bool quantize = false, set_backoff_bits = false, bhiksha = false; + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false; lm::ngram::Config config; int opt; - while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -129,6 +135,7 @@ int main(int argc, char *argv[]) { case 'a': config.pointer_bhiksha_bits = ParseBitCount(optarg); bhiksha = true; + break; case 'u': config.unknown_missing_logprob = ParseFloat(optarg); break; @@ -141,6 +148,16 @@ int main(int argc, char *argv[]) { case 'm': config.building_memory = ParseUInt(optarg) * 1048576; break; + case 'w': + set_write_method = true; + if (!strcmp(optarg, "mmap")) { + config.write_method = Config::WRITE_MMAP; + } else if (!strcmp(optarg, "after")) { + config.write_method = Config::WRITE_AFTER; + } else { + Usage(argv[0]); + } + break; case 's': config.sentence_marker_missing = lm::SILENT; break; @@ -166,9 +183,11 @@ int main(int argc, char *argv[]) { const char *from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); ProbingModel(from_file, config); } else if (!strcmp(model_type, "trie")) { + if (!set_write_method) config.write_method = Config::WRITE_MMAP; if (quantize) { if (bhiksha) { QuantArrayTrieModel(from_file, config); @@ -191,7 +210,9 @@ int main(int argc, char *argv[]) { } catch (const std::exception &e) { std::cerr << e.what() << std::endl; + std::cerr << "ERROR" << std::endl; return 1; } + std::cerr << "SUCCESS" << std::endl; return 0; } diff --git a/klm/lm/config.cc b/klm/lm/config.cc index 297589a4..dbe762b3 100644 --- a/klm/lm/config.cc +++ b/klm/lm/config.cc @@ -17,6 +17,7 @@ Config::Config() : temporary_directory_prefix(NULL), arpa_complain(ALL), write_mmap(NULL), + write_method(WRITE_AFTER), include_vocab(true), 
prob_bits(8), backoff_bits(8), diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 8564661b..01b75632 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -70,9 +70,17 @@ struct Config { // to NULL to disable. const char *write_mmap; + typedef enum { + WRITE_MMAP, // Map the file directly. + WRITE_AFTER // Write after we're done. + } WriteMethod; + WriteMethod write_method; + // Include the vocab in the binary file? Only effective if write_mmap != NULL. bool include_vocab; + + // Quantization options. Only effective for QuantTrieModel. One value is // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used // to quantize (and one of the remaining backoffs will be 0). diff --git a/klm/lm/left.hh b/klm/lm/left.hh index 41f71f84..a07f9803 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -112,7 +112,7 @@ inline size_t hash_value(const ChartState &state) { size_t hashes[2]; hashes[0] = hash_value(state.left); hashes[1] = hash_value(state.right); - return util::MurmurHashNative(hashes, sizeof(size_t), state.full); + return util::MurmurHashNative(hashes, sizeof(size_t) * 2, state.full); } template <class M> class RuleScore { diff --git a/klm/lm/left_test.cc b/klm/lm/left_test.cc index 8bb91cb3..c85e5efa 100644 --- a/klm/lm/left_test.cc +++ b/klm/lm/left_test.cc @@ -142,7 +142,7 @@ template <class M> float TreeMiddle(const M &m, const std::vector<WordIndex> &wo template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vector<WordIndex> &out) { out.clear(); - for (util::PieceIterator<' '> i(str); i; ++i) { + for (util::TokenIter<util::SingleCharacter, true> i(str, ' '); i; ++i) { out.push_back(m.GetVocabulary().Index(*i)); } } @@ -326,10 +326,17 @@ template <class M> void FullGrow(const M &m) { } } +const char *FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} + template <class M> void Everything() { Config config; config.messages = NULL; - M m("test.arpa", config); + M m(FileLocation(), config); Short(m); Charge(m); diff --git a/klm/lm/model.cc b/klm/lm/model.cc index e4c1ec1d..478ebed1 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -46,7 +46,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) { SetupMemory(start, params.counts, config); - vocab_.LoadedBinary(fd, config.enumerate_vocab); + vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); } @@ -82,13 +82,18 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT search_.unigram.Unknown().backoff = 0.0; search_.unigram.Unknown().prob = config.unknown_missing_logprob; } - FinishFile(config, kModelType, kVersion, counts, backing_); + FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_); } catch (util::Exception &e) { e << " Byte: " << f.Offset(); throw; } } +template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) { + util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); + Search::UpdateConfigFromBinary(fd, counts, config); +} + template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const
State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { @@ -114,7 +119,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, } float backoff; // i is the order of the backoff we're looking for. - const Middle *mid_iter = search_.MiddleBegin() + start - 2; + typename Search::MiddleIter mid_iter = search_.MiddleBegin() + start - 2; for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++mid_iter) { if (!search_.LookupMiddleNoProb(*mid_iter, *i, backoff, node)) break; ret.prob += backoff; @@ -134,7 +139,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT search_.LookupUnigram(*context_rbegin, out_state.backoff[0], node, ignored); out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; float *backoff_out = out_state.backoff + 1; - const typename Search::Middle *mid = search_.MiddleBegin(); + typename Search::MiddleIter mid(search_.MiddleBegin()); for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++mid) { if (!search_.LookupMiddleNoProb(*mid, *i, *backoff_out, node)) { std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); @@ -161,7 +166,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, // If this function is called, then it does depend on left words. ret.independent_left = false; ret.extend_left = extend_pointer; - const typename Search::Middle *mid_iter = search_.MiddleBegin() + extend_length - 1; + typename Search::MiddleIter mid_iter(search_.MiddleBegin() + extend_length - 1); const WordIndex *i = add_rbegin; for (; ; ++i, ++backoff_out, ++mid_iter) { if (i == add_rend) { @@ -230,7 +235,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, // Ok start by looking up the bigram. const WordIndex *hist_iter = context_rbegin; - const typename Search::Middle *mid_iter = search_.MiddleBegin(); + typename Search::MiddleIter mid_iter(search_.MiddleBegin()); for (; ; ++mid_iter, ++hist_iter, ++backoff_out) { if (hist_iter == context_rend) { // Ran out of history. Typically no backoff, but this could be a blank. diff --git a/klm/lm/model.hh b/klm/lm/model.hh index c278acd6..6ea62a78 100644 --- a/klm/lm/model.hh +++ b/klm/lm/model.hh @@ -90,7 +90,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod * TrieModel. To classify binary files, call RecognizeBinary in * lm/binary_format.hh. */ - GenericModel(const char *file, const Config &config = Config()); + explicit GenericModel(const char *file, const Config &config = Config()); /* Score p(new_word | in_state) and incorporate new_word into out_state. 
* Note that in_state and out_state must be different references: @@ -137,14 +137,9 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod unsigned char &next_use) const; private: - friend void LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to); + friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to); - static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) { - AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); - Search::UpdateConfigFromBinary(fd, counts, config); - } - - float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const; + static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config); FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; diff --git a/klm/lm/model_test.cc b/klm/lm/model_test.cc index 2654071f..461704d4 100644 --- a/klm/lm/model_test.cc +++ b/klm/lm/model_test.cc @@ -19,6 +19,20 @@ std::ostream &operator<<(std::ostream &o, const State &state) { namespace { +const char *TestLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} +const char *TestNoUnkLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 3) { + return "test_nounk.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[2]; + +} + #define StartTest(word, ngram, score, indep_left) \ ret = model.FullScore( \ state, \ @@ -307,7 +321,7 @@ template <class ModelT> void LoadingTest() { { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; - ModelT m("test.arpa", config); + ModelT m(TestLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); Everything(m); @@ -315,7 +329,7 @@ template <class ModelT> void LoadingTest() { { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; - ModelT m("test_nounk.arpa", config); + ModelT m(TestNoUnkLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); NoUnkCheck(m); @@ -346,7 +360,7 @@ template <class ModelT> void BinaryTest() { config.enumerate_vocab = &enumerate; { - ModelT copy_model("test.arpa", config); + ModelT copy_model(TestLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); Everything(copy_model); @@ -370,14 +384,14 @@ template <class ModelT> void BinaryTest() { config.messages = NULL; enumerate.Clear(); { - ModelT copy_model("test_nounk.arpa", config); + ModelT copy_model(TestNoUnkLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); NoUnkCheck(copy_model); } config.write_mmap = NULL; { - ModelT binary("test_nounk.binary", config); + ModelT binary(TestNoUnkLocation(), config); enumerate.Check(binary.GetVocabulary()); NoUnkCheck(binary); } diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc index d9db4aa2..8f7a0e1c 100644 --- a/klm/lm/ngram_query.cc +++ b/klm/lm/ngram_query.cc @@ -1,87 +1,4 @@ -#include "lm/enumerate_vocab.hh" -#include "lm/model.hh" - -#include <cstdlib> -#include <fstream> -#include <iostream> -#include <string> - -#include <ctype.h> - -#include <sys/resource.h> -#include <sys/time.h> - -float 
FloatSec(const struct timeval &tv) { - return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0); -} - -void PrintUsage(const char *message) { - struct rusage usage; - if (getrusage(RUSAGE_SELF, &usage)) { - perror("getrusage"); - return; - } - std::cerr << message; - std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; - - // Linux doesn't set memory usage :-(. - std::ifstream status("/proc/self/status", std::ios::in); - std::string line; - while (getline(status, line)) { - if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { - std::cerr << "rss " << (line.c_str() + 7) << '\n'; - break; - } - } -} - -template <class Model> void Query(const Model &model, bool sentence_context) { - PrintUsage("Loading statistics:\n"); - typename Model::State state, out; - lm::FullScoreReturn ret; - std::string word; - - while (std::cin) { - state = sentence_context ? model.BeginSentenceState() : model.NullContextState(); - float total = 0.0; - bool got = false; - unsigned int oov = 0; - while (std::cin >> word) { - got = true; - lm::WordIndex vocab = model.GetVocabulary().Index(word); - if (vocab == 0) ++oov; - ret = model.FullScore(state, vocab, out); - total += ret.prob; - std::cout << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t'; - state = out; - char c; - while (true) { - c = std::cin.get(); - if (!std::cin) break; - if (c == '\n') break; - if (!isspace(c)) { - std::cin.unget(); - break; - } - } - if (c == '\n') break; - } - if (!got && !std::cin) break; - if (sentence_context) { - ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); - total += ret.prob; - std::cout << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t'; - } - std::cout << "Total: " << total << " OOV: " << oov << '\n'; - } - PrintUsage("After queries:\n"); -} - -template <class Model> void Query(const char *name) { - lm::ngram::Config config; - Model model(name, config); - Query(model); -} +#include "lm/ngram_query.hh" int main(int argc, char *argv[]) { if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { @@ -89,34 +6,40 @@ int main(int argc, char *argv[]) { std::cerr << "Input is wrapped in <s> and </s> unless null is passed." 
<< std::endl; return 1; } - bool sentence_context = (argc == 2); - lm::ngram::ModelType model_type; - if (lm::ngram::RecognizeBinary(argv[1], model_type)) { - switch(model_type) { - case lm::ngram::HASH_PROBING: - Query<lm::ngram::ProbingModel>(argv[1], sentence_context); - break; - case lm::ngram::TRIE_SORTED: - Query<lm::ngram::TrieModel>(argv[1], sentence_context); - break; - case lm::ngram::QUANT_TRIE_SORTED: - Query<lm::ngram::QuantTrieModel>(argv[1], sentence_context); - break; - case lm::ngram::ARRAY_TRIE_SORTED: - Query<lm::ngram::ArrayTrieModel>(argv[1], sentence_context); - break; - case lm::ngram::QUANT_ARRAY_TRIE_SORTED: - Query<lm::ngram::QuantArrayTrieModel>(argv[1], sentence_context); - break; - case lm::ngram::HASH_SORTED: - default: - std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; - abort(); + try { + bool sentence_context = (argc == 2); + using namespace lm::ngram; + ModelType model_type; + if (RecognizeBinary(argv[1], model_type)) { + switch(model_type) { + case HASH_PROBING: + Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout); + break; + case TRIE_SORTED: + Query<TrieModel>(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_TRIE_SORTED: + Query<QuantTrieModel>(argv[1], sentence_context, std::cin, std::cout); + break; + case ARRAY_TRIE_SORTED: + Query<ArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_ARRAY_TRIE_SORTED: + Query<QuantArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout); + break; + case HASH_SORTED: + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); + } + } else { + Query<ProbingModel>(argv[1], sentence_context, std::cin, std::cout); } - } else { - Query<lm::ngram::ProbingModel>(argv[1], sentence_context); - } - PrintUsage("Total time including destruction:\n"); + PrintUsage("Total time including destruction:\n"); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } return 0; } diff --git a/klm/lm/ngram_query.hh b/klm/lm/ngram_query.hh new file mode 100644 index 00000000..4990df22 --- /dev/null +++ b/klm/lm/ngram_query.hh @@ -0,0 +1,103 @@ +#ifndef LM_NGRAM_QUERY__ +#define LM_NGRAM_QUERY__ + +#include "lm/enumerate_vocab.hh" +#include "lm/model.hh" + +#include <cstdlib> +#include <fstream> +#include <iostream> +#include <string> + +#include <ctype.h> +#if !defined(_WIN32) && !defined(_WIN64) +#include <sys/resource.h> +#include <sys/time.h> +#endif + +namespace lm { +namespace ngram { + +#if !defined(_WIN32) && !defined(_WIN64) +float FloatSec(const struct timeval &tv) { + return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0); +} +#endif + +void PrintUsage(const char *message) { +#if !defined(_WIN32) && !defined(_WIN64) + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { + perror("getrusage"); + return; + } + std::cerr << message; + std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; + + // Linux doesn't set memory usage :-(. 
+ std::ifstream status("/proc/self/status", std::ios::in); + std::string line; + while (getline(status, line)) { + if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { + std::cerr << "rss " << (line.c_str() + 7) << '\n'; + break; + } + } +#endif +} + +template <class Model> void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { + PrintUsage("Loading statistics:\n"); + typename Model::State state, out; + lm::FullScoreReturn ret; + std::string word; + + while (in_stream) { + state = sentence_context ? model.BeginSentenceState() : model.NullContextState(); + float total = 0.0; + bool got = false; + unsigned int oov = 0; + while (in_stream >> word) { + got = true; + lm::WordIndex vocab = model.GetVocabulary().Index(word); + if (vocab == 0) ++oov; + ret = model.FullScore(state, vocab, out); + total += ret.prob; + out_stream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t'; + state = out; + char c; + while (true) { + c = in_stream.get(); + if (!in_stream) break; + if (c == '\n') break; + if (!isspace(c)) { + in_stream.unget(); + break; + } + } + if (c == '\n') break; + } + if (!got && !in_stream) break; + if (sentence_context) { + ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); + total += ret.prob; + out_stream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t'; + } + out_stream << "Total: " << total << " OOV: " << oov << '\n'; + } + PrintUsage("After queries:\n"); +} + +template <class M> void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { + Config config; +// config.load_method = util::LAZY; + M model(file, config); + Query(model, sentence_context, in_stream, out_stream); +} + +} // namespace ngram +} // namespace lm + +#endif // LM_NGRAM_QUERY__ + + diff --git a/klm/lm/quantize.cc b/klm/lm/quantize.cc index 98a5d048..a8e0cb21 100644 --- a/klm/lm/quantize.cc +++ b/klm/lm/quantize.cc @@ -1,31 +1,30 @@ +/* Quantize into bins of equal size as described in + * M. Federico and N. Bertoldi. 2006. How many bits are needed + * to store probabilities for phrase-based translation? In Proc. + * of the Workshop on Statistical Machine Translation, pages + * 94–101, New York City, June. Association for Computa- + * tional Linguistics. + */ + #include "lm/quantize.hh" #include "lm/binary_format.hh" #include "lm/lm_exception.hh" +#include "util/file.hh" #include <algorithm> #include <numeric> -#include <unistd.h> - namespace lm { namespace ngram { -/* Quantize into bins of equal size as described in - * M. Federico and N. Bertoldi. 2006. How many bits are needed - * to store probabilities for phrase-based translation? In Proc. - * of the Workshop on Statistical Machine Translation, pages - * 94–101, New York City, June. Association for Computa- - * tional Linguistics. 
- */ - namespace { -void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) { - std::sort(values, values_end); - const float *start = values, *finish; +void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) { + std::sort(values.begin(), values.end()); + std::vector<float>::const_iterator start = values.begin(), finish; for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) { - finish = values + (((values_end - values) * static_cast<uint64_t>(i + 1)) / bins); + finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins); if (finish == start) { // zero length bucket. *centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity(); @@ -41,10 +40,11 @@ const char kSeparatelyQuantizeVersion = 2; void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) { char version; - if (read(fd, &version, 1) != 1 || read(fd, &config.prob_bits, 1) != 1 || read(fd, &config.backoff_bits, 1) != 1) - UTIL_THROW(util::ErrnoException, "Failed to read header for quantization."); + util::ReadOrThrow(fd, &version, 1); + util::ReadOrThrow(fd, &config.prob_bits, 1); + util::ReadOrThrow(fd, &config.backoff_bits, 1); if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); - AdvanceOrThrow(fd, -3); + util::AdvanceOrThrow(fd, -3); } void SeparatelyQuantize::SetupMemory(void *start, const Config &config) { @@ -66,12 +66,12 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vec float *centers = start_ + TableStart(order) + ProbTableLength(); *(centers++) = kNoExtensionBackoff; *(centers++) = kExtensionBackoff; - MakeBins(&*backoff.begin(), &*backoff.end(), centers, (1ULL << backoff_bits_) - 2); + MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2); } void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) { float *centers = start_ + TableStart(order); - MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_)); + MakeBins(prob, centers, (1ULL << prob_bits_)); } void SeparatelyQuantize::FinishedLoading(const Config &config) { diff --git a/klm/lm/quantize.hh b/klm/lm/quantize.hh index 4cf4236e..6d130a57 100644 --- a/klm/lm/quantize.hh +++ b/klm/lm/quantize.hh @@ -9,7 +9,7 @@ #include <algorithm> #include <vector> -#include <inttypes.h> +#include <stdint.h> #include <iostream> diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index dce73f77..05f761be 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -8,7 +8,7 @@ #include <ctype.h> #include <string.h> -#include <inttypes.h> +#include <stdint.h> namespace lm { diff --git a/klm/lm/return.hh b/klm/lm/return.hh index 15571960..1b55091b 100644 --- a/klm/lm/return.hh +++ b/klm/lm/return.hh @@ -1,7 +1,7 @@ #ifndef LM_RETURN__ #define LM_RETURN__ -#include <inttypes.h> +#include <stdint.h> namespace lm { /* Structure returned by scoring routines. */ diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc index 247832b0..1d6fb5be 100644 --- a/klm/lm/search_hashed.cc +++ b/klm/lm/search_hashed.cc @@ -30,7 +30,7 @@ template <class Middle> class ActivateLowerMiddle { // TODO: somehow get text of n-gram for this error message. 
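Stepping back to the quantize.cc change above: the rewritten MakeBins implements the Federico and Bertoldi scheme cited in the relocated header comment. Sort the values, cut them into runs of (nearly) equal population, and keep one center per run. A minimal sketch of that binning, assuming the center of a non-empty slice is its mean (MakeBinsSketch is a hypothetical stand-in; the hunk itself only shows the zero-length-bucket branch):

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <numeric>
    #include <stdint.h>
    #include <vector>

    // Equal-population quantization: sort, slice into `bins` runs of nearly
    // equal length, and represent each run by one center value.
    std::vector<float> MakeBinsSketch(std::vector<float> values, uint32_t bins) {
      std::sort(values.begin(), values.end());
      std::vector<float> centers(bins);
      std::vector<float>::const_iterator start = values.begin(), finish;
      for (uint32_t i = 0; i < bins; ++i, start = finish) {
        std::size_t cut = (values.size() * static_cast<uint64_t>(i + 1)) / bins;
        finish = values.begin() + cut;
        if (finish == start) {
          // Zero-length bucket, as in the hunk: reuse the previous center.
          centers[i] = i ? centers[i - 1] : -std::numeric_limits<float>::infinity();
        } else {
          centers[i] = std::accumulate(start, finish, 0.0f) /
                       static_cast<float>(finish - start);
        }
      }
      return centers;
    }

A query then stores only a bin index, so each probability or backoff costs about log2(bins) bits instead of a full 32-bit float.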
if (!modify_.UnsafeMutableFind(hash, i)) UTIL_THROW(FormatLoadException, "The context of every " << n << "-gram should appear as a " << (n-1) << "-gram"); - SetExtension(i->MutableValue().backoff); + SetExtension(i->value.backoff); } private: @@ -65,7 +65,7 @@ template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsign blank.prob -= unigrams[vocab_ids[1]].backoff; SetExtension(unigrams[vocab_ids[1]].backoff); // Bigram including a unigram's backoff - middle[0].Insert(Middle::Packing::Make(keys[0], blank)); + middle[0].Insert(detail::ProbBackoffEntry::Make(keys[0], blank)); fix = 1; } else { for (unsigned int i = 3; i < fix + 2; ++i) backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]); @@ -74,22 +74,24 @@ template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsign for (; fix <= n - 3; ++fix) { typename Middle::MutableIterator gotit; if (middle[fix - 1].UnsafeMutableFind(backoff_hash, gotit)) { - float &backoff = gotit->MutableValue().backoff; + float &backoff = gotit->value.backoff; SetExtension(backoff); blank.prob -= backoff; } - middle[fix].Insert(Middle::Packing::Make(keys[fix], blank)); + middle[fix].Insert(detail::ProbBackoffEntry::Make(keys[fix], blank)); backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[fix + 2]); } } template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) { + assert(n >= 2); ReadNGramHeader(f, n); - // vocab ids of words in reverse order + // Both vocab_ids and keys are non-empty because n >= 2. + // vocab ids of words in reverse order. std::vector<WordIndex> vocab_ids(n); std::vector<uint64_t> keys(n-1); - typename Store::Packing::Value value; + typename Store::Entry::Value value; typename Middle::MutableIterator found; for (size_t i = 0; i < count; ++i) { ReadNGram(f, n, vocab, &*vocab_ids.begin(), value, warn); @@ -100,7 +102,7 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams( } // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0. util::SetSign(value.prob); - store.Insert(Store::Packing::Make(keys[n-2], value)); + store.Insert(Store::Entry::Make(keys[n-2], value)); // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb. int lower; util::FloatEnc fix_prob; @@ -113,9 +115,9 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams( } if (middle[lower].UnsafeMutableFind(keys[lower], found)) { // Turn off sign bit to indicate that it extends left. - fix_prob.f = found->MutableValue().prob; + fix_prob.f = found->value.prob; fix_prob.i &= ~util::kSignBit; - found->MutableValue().prob = fix_prob.f; + found->value.prob = fix_prob.f; // We don't need to recurse further down because this entry already set the bits for lower entries. break; } @@ -147,7 +149,7 @@ template <class MiddleT, class LongestT> uint8_t *TemplateHashedSearch<MiddleT, template <class MiddleT, class LongestT> template <class Voc> void TemplateHashedSearch<MiddleT, LongestT>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing) { // TODO: fix sorted. 
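On the FloatEnc/kSignBit bookkeeping in ReadNGrams above: it works because every stored value is a log probability, hence never positive, so the float's own sign bit is redundant and can carry the "does not extend left" flag. That is also why +0.0 entries need the explicit util::SetSign. A toy demonstration of the union trick, with hypothetical sketch names:

    #include <iostream>
    #include <stdint.h>

    // Log probs are <= 0, so the sign bit is free to act as a one-bit flag.
    union FloatEncSketch { float f; uint32_t i; };
    static const uint32_t kSignBitSketch = 0x80000000u;

    int main() {
      FloatEncSketch enc;
      enc.f = -2.5f;               // flag set (sign bit on), magnitude 2.5
      enc.i &= ~kSignBitSketch;    // clear the flag
      std::cout << enc.f << '\n';  // prints 2.5: the magnitude is untouched
      enc.i |= kSignBitSketch;     // set it again
      std::cout << enc.f << '\n';  // prints -2.5
      return 0;
    }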
- SetupMemory(GrowForSearch(config, 0, Size(counts, config), backing), counts, config); + SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config); PositiveProbWarn warn(config.positive_log_probability); diff --git a/klm/lm/search_hashed.hh b/klm/lm/search_hashed.hh index e289fd11..4352c72d 100644 --- a/klm/lm/search_hashed.hh +++ b/klm/lm/search_hashed.hh @@ -8,7 +8,6 @@ #include "lm/weights.hh" #include "util/bit_packing.hh" -#include "util/key_value_packing.hh" #include "util/probing_hash_table.hh" #include <algorithm> @@ -92,8 +91,10 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing); - const Middle *MiddleBegin() const { return &*middle_.begin(); } - const Middle *MiddleEnd() const { return &*middle_.end(); } + typedef typename std::vector<Middle>::const_iterator MiddleIter; + + MiddleIter MiddleBegin() const { return middle_.begin(); } + MiddleIter MiddleEnd() const { return middle_.end(); } Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const { util::FloatEnc val; @@ -105,7 +106,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has std::cerr << "Extend pointer " << extend_pointer << " should have been found for length " << (unsigned) extend_length << std::endl; abort(); } - val.f = found->GetValue().prob; + val.f = found->value.prob; } val.i |= util::kSignBit; prob = val.f; @@ -117,12 +118,12 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has typename Middle::ConstIterator found; if (!middle.Find(node, found)) return false; util::FloatEnc enc; - enc.f = found->GetValue().prob; + enc.f = found->value.prob; ret.independent_left = (enc.i & util::kSignBit); ret.extend_left = node; enc.i |= util::kSignBit; ret.prob = enc.f; - backoff = found->GetValue().backoff; + backoff = found->value.backoff; return true; } @@ -132,7 +133,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has node = CombineWordHash(node, word); typename Middle::ConstIterator found; if (!middle.Find(node, found)) return false; - backoff = found->GetValue().backoff; + backoff = found->value.backoff; return true; } @@ -141,7 +142,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has node = CombineWordHash(node, word); typename Longest::ConstIterator found; if (!longest.Find(node, found)) return false; - prob = found->GetValue().prob; + prob = found->value.prob; return true; } @@ -160,14 +161,50 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has std::vector<Middle> middle_; }; -// std::identity is an SGI extension :-( -struct IdentityHash : public std::unary_function<uint64_t, size_t> { - size_t operator()(uint64_t arg) const { return static_cast<size_t>(arg); } +/* These look like perfect candidates for a template, right? Ancient gcc (4.1 + * on RedHat stale linux) doesn't pack templates correctly. ProbBackoffEntry + * is a multiple of 8 bytes anyway. ProbEntry is 12 bytes so it's set to pack. 
+ */ +struct ProbBackoffEntry { + uint64_t key; + ProbBackoff value; + typedef uint64_t Key; + typedef ProbBackoff Value; + uint64_t GetKey() const { + return key; + } + static ProbBackoffEntry Make(uint64_t key, ProbBackoff value) { + ProbBackoffEntry ret; + ret.key = key; + ret.value = value; + return ret; + } }; +#pragma pack(push) +#pragma pack(4) +struct ProbEntry { + uint64_t key; + Prob value; + typedef uint64_t Key; + typedef Prob Value; + uint64_t GetKey() const { + return key; + } + static ProbEntry Make(uint64_t key, Prob value) { + ProbEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; + +#pragma pack(pop) + + struct ProbingHashedSearch : public TemplateHashedSearch< - util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, ProbBackoff>, IdentityHash>, - util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, Prob>, IdentityHash> > { + util::ProbingHashTable<ProbBackoffEntry, util::IdentityHash>, + util::ProbingHashTable<ProbEntry, util::IdentityHash> > { static const ModelType kModelType = HASH_PROBING; }; diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 4bd3f4ee..ffadfa94 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -13,6 +13,7 @@ #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/ersatz_progress.hh" +#include "util/mmap.hh" #include "util/proxy_iterator.hh" #include "util/scoped.hh" #include "util/sized_iterator.hh" @@ -20,14 +21,15 @@ #include <algorithm> #include <cstring> #include <cstdio> +#include <cstdlib> #include <queue> #include <limits> #include <numeric> #include <vector> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/stat.h> +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#endif namespace lm { namespace ngram { @@ -195,7 +197,7 @@ class SRISucks { void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { for (unsigned char i = 0; i < kMaxOrder - 1; ++i) { - it_[i] = &*values_[i].begin(); + it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); } messages_[0].Apply(it_, unigram_file); BackoffMessages *messages = messages_ + 1; @@ -227,8 +229,8 @@ class SRISucks { class FindBlanks { public: - FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) - : counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {} + FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) + : counts_(order), unigrams_(unigrams), sri_(messages) {} float UnigramProb(WordIndex index) const { return unigrams_[index].prob; @@ -248,7 +250,7 @@ class FindBlanks { } void Longest(const void * /*data*/) { - ++*longest_counts_; + ++counts_.back(); } // Unigrams wrote one past. 
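The comment above ProbBackoffEntry explains why these entry types are written out by hand rather than as one template: ancient gcc mis-packs template-dependent types, and ProbEntry only reaches 12 bytes under #pragma pack(4). The effect is easy to check in isolation; a sketch with hypothetical names (on a typical x86_64 ABI the packed struct is 12 bytes, the default-aligned one 16):

    #include <iostream>
    #include <stdint.h>

    #pragma pack(push)
    #pragma pack(4)
    struct PackedEntry {    // same shape as ProbEntry: 8-byte key, 4-byte payload
      uint64_t key;
      float prob;
    };
    #pragma pack(pop)

    struct UnpackedEntry {  // identical members, default alignment
      uint64_t key;
      float prob;
    };

    int main() {
      std::cout << sizeof(PackedEntry) << ' ' << sizeof(UnpackedEntry) << '\n';
      // Typically prints "12 16"; four bytes saved per n-gram entry add up.
      return 0;
    }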
@@ -256,8 +258,12 @@ class FindBlanks { --counts_[0]; } + const std::vector<uint64_t> &Counts() const { + return counts_; + } + private: - uint64_t *const counts_, *const longest_counts_; + std::vector<uint64_t> counts_; const ProbBackoff *unigrams_; @@ -375,7 +381,7 @@ template <class Doing> class BlankManager { template <class Doing> void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) { util::ErsatzProgress progress(progress_out, message, unigram_count + 1); - unsigned int unigram = 0; + WordIndex unigram = 0; std::priority_queue<Gram> grams; grams.push(Gram(&unigram, 1)); for (unsigned char i = 2; i <= total_order; ++i) { @@ -461,42 +467,33 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c } // namespace -template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { +template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { RecordReader inputs[kMaxOrder - 1]; RecordReader contexts[kMaxOrder - 1]; for (unsigned char i = 2; i <= counts.size(); ++i) { - std::stringstream assembled; - assembled << file_prefix << static_cast<unsigned int>(i) << "_merged"; - inputs[i-2].Init(assembled.str(), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); - util::RemoveOrThrow(assembled.str().c_str()); - assembled << kContextSuffix; - contexts[i-2].Init(assembled.str(), (i-1) * sizeof(WordIndex)); - util::RemoveOrThrow(assembled.str().c_str()); + inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? 
sizeof(Prob) : sizeof(ProbBackoff))); + contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex)); } SRISucks sri; - std::vector<uint64_t> fixed_counts(counts.size()); + std::vector<uint64_t> fixed_counts; + util::scoped_FILE unigram_file; + util::scoped_fd unigram_fd(files.StealUnigram()); { - std::string temp(file_prefix); temp += "unigrams"; - util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str())); util::scoped_memory unigrams; - MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); - FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); + MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); + FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); + fixed_counts = finder.Counts(); } + unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); } SanityCheckCounts(counts, fixed_counts); counts = fixed_counts; - util::scoped_FILE unigram_file; - { - std::string name(file_prefix + "unigrams"); - unigram_file.reset(OpenOrThrow(name.c_str(), "r+")); - util::RemoveOrThrow(name.c_str()); - } sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config); @@ -587,42 +584,19 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBin longest.LoadedBinary(); } -namespace { -bool IsDirectory(const char *path) { - struct stat info; - if (0 != stat(path, &info)) return false; - return S_ISDIR(info.st_mode); -} -} // namespace - template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) { - std::string temporary_directory; + std::string temporary_prefix; if (config.temporary_directory_prefix) { - temporary_directory = config.temporary_directory_prefix; - if (!temporary_directory.empty() && temporary_directory[temporary_directory.size() - 1] != '/' && IsDirectory(temporary_directory.c_str())) - temporary_directory += '/'; + temporary_prefix = config.temporary_directory_prefix; } else if (config.write_mmap) { - temporary_directory = config.write_mmap; + temporary_prefix = config.write_mmap; } else { - temporary_directory = file; - } - // Null on end is kludge to ensure null termination. - temporary_directory += "_trie_tmp_XXXXXX"; - temporary_directory += '\0'; - if (!mkdtemp(&temporary_directory[0])) { - UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str()); + temporary_prefix = file; } - // Chop off null kludge. - temporary_directory.resize(strlen(temporary_directory.c_str())); - // Add directory delimiter. Assumes a real operating system. - temporary_directory += '/'; // At least 1MB sorting memory. 
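Note what the temporary-file rework in this hunk buys: the old code built a named directory with mkdtemp and had to remove every file plus the directory afterwards (leaking them on a crash), while SortedFiles now passes around FILE* handles whose names were unlinked at creation. A minimal POSIX sketch of that create-then-unlink idiom (AnonymousTemp is hypothetical; TempMaker in the util/file.cc hunk further down wraps the same pattern, plus a Windows variant):

    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>
    #include <stdlib.h>
    #include <unistd.h>

    // Create an anonymous temporary FILE*: the name is deleted immediately,
    // so the OS reclaims the storage when the stream closes, even on crash.
    std::FILE *AnonymousTemp(const std::string &prefix) {
      std::vector<char> tmpl(prefix.begin(), prefix.end());
      const char xs[] = "XXXXXX";                   // mkstemp template suffix
      tmpl.insert(tmpl.end(), xs, xs + sizeof(xs)); // includes the '\0'
      int fd = mkstemp(&tmpl[0]);
      if (fd == -1) throw std::runtime_error("mkstemp failed");
      unlink(&tmpl[0]);                             // drop the name, keep the fd
      std::FILE *ret = fdopen(fd, "r+b");
      if (!ret) { close(fd); throw std::runtime_error("fdopen failed"); }
      return ret;
    }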
- ARPAToSortedFiles(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_directory.c_str(), vocab); + SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab); - BuildTrie(temporary_directory, counts, config, *this, quant_, vocab, backing); - if (rmdir(temporary_directory.c_str()) && config.messages) { - *config.messages << "Failed to delete " << temporary_directory << std::endl; - } + BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); } template class TrieSearch<DontQuantize, DontBhiksha>; diff --git a/klm/lm/search_trie.hh b/klm/lm/search_trie.hh index 33ae8cff..5155ca02 100644 --- a/klm/lm/search_trie.hh +++ b/klm/lm/search_trie.hh @@ -7,6 +7,7 @@ #include "lm/trie.hh" #include "lm/weights.hh" +#include "util/file.hh" #include "util/file_piece.hh" #include <vector> @@ -20,7 +21,8 @@ class SortedVocabulary; namespace trie { template <class Quant, class Bhiksha> class TrieSearch; -template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); +class SortedFiles; +template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); template <class Quant, class Bhiksha> class TrieSearch { public: @@ -40,7 +42,7 @@ template <class Quant, class Bhiksha> class TrieSearch { static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) { Quant::UpdateConfigFromBinary(fd, counts, config); - AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); + util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); Bhiksha::UpdateConfigFromBinary(fd, config); } @@ -60,6 +62,8 @@ template <class Quant, class Bhiksha> class TrieSearch { void LoadedBinary(); + typedef const Middle *MiddleIter; + const Middle *MiddleBegin() const { return middle_begin_; } const Middle *MiddleEnd() const { return middle_end_; } @@ -108,7 +112,7 @@ template <class Quant, class Bhiksha> class TrieSearch { } private: - friend void BuildTrie<Quant, Bhiksha>(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); + friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); // Middles are managed manually so we can delay construction and they don't have to be copyable. 
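"Managed manually," as the comment above says, means the middles live in raw storage and are constructed in place, so Middle needs neither a default constructor nor copyability; FreeMiddles below tears them down by hand. That is the standard placement-new idiom, sketched here with a hypothetical type:

    #include <cstdlib>
    #include <new>

    struct Widget {             // stand-in for Middle: no default constructor
      explicit Widget(int n) : n_(n) {}
      int n_;
    };

    int main() {
      // Allocate raw bytes, construct later, destroy and free explicitly.
      void *storage = std::malloc(3 * sizeof(Widget));
      if (!storage) return 1;
      Widget *w = static_cast<Widget*>(storage);
      for (int i = 0; i < 3; ++i) new (w + i) Widget(i);  // placement new
      for (int i = 0; i < 3; ++i) (w + i)->~Widget();     // manual destruction
      std::free(storage);
      return 0;
    }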
void FreeMiddles() { diff --git a/klm/lm/trie.hh b/klm/lm/trie.hh index 06cc96ac..ebe9910f 100644 --- a/klm/lm/trie.hh +++ b/klm/lm/trie.hh @@ -1,7 +1,7 @@ #ifndef LM_TRIE__ #define LM_TRIE__ -#include <inttypes.h> +#include <stdint.h> #include <cstddef> diff --git a/klm/lm/trie_sort.cc b/klm/lm/trie_sort.cc index bb126f18..b80fed02 100644 --- a/klm/lm/trie_sort.cc +++ b/klm/lm/trie_sort.cc @@ -14,6 +14,7 @@ #include <algorithm> #include <cstring> #include <cstdio> +#include <cstdlib> #include <deque> #include <limits> #include <vector> @@ -22,14 +23,6 @@ namespace lm { namespace ngram { namespace trie { -const char *kContextSuffix = "_contexts"; - -FILE *OpenOrThrow(const char *name, const char *mode) { - FILE *ret = fopen(name, mode); - if (!ret) UTIL_THROW(util::ErrnoException, "Could not open " << name << " for " << mode); - return ret; -} - void WriteOrThrow(FILE *to, const void *data, size_t size) { assert(size); if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); @@ -78,28 +71,29 @@ class PartialViewProxy { typedef util::ProxyIterator<PartialViewProxy> PartialIter; -std::string DiskFlush(const void *mem_begin, const void *mem_end, const std::string &file_prefix, std::size_t batch, unsigned char order) { - std::stringstream assembled; - assembled << file_prefix << static_cast<unsigned int>(order) << '_' << batch; - std::string ret(assembled.str()); - util::scoped_fd out(util::CreateOrThrow(ret.c_str())); - util::WriteOrThrow(out.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); - return ret; +FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) { + util::scoped_fd file(maker.Make()); + util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); + return util::FDOpenOrThrow(file); } -void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_file_name, std::size_t entry_size, unsigned char order) { +FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) { const size_t context_size = sizeof(WordIndex) * (order - 1); // Sort just the contexts using the same memory. PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size)); - std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1))); +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1))); - std::string name(ngram_file_name + kContextSuffix); - util::scoped_FILE out(OpenOrThrow(name.c_str(), "w")); + util::scoped_FILE out(maker.MakeFile()); // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. 
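The loop continuing below writes and deduplicates in a single pass, which is only safe because the contexts were just sorted: duplicates are adjacent. The core of that pattern, sketched over raw fixed-size records (UniqueWrite is a hypothetical helper; error handling elided for brevity):

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    // Copy fixed-size records to `out`, skipping adjacent duplicates.
    // Precondition: records are sorted, so equal records neighbor each other.
    void UniqueWrite(const unsigned char *begin, const unsigned char *end,
                     std::size_t record_size, std::FILE *out) {
      const unsigned char *previous = NULL;
      for (const unsigned char *i = begin; i != end; i += record_size) {
        if (previous && !std::memcmp(previous, i, record_size)) continue;
        std::fwrite(i, record_size, 1, out);
        previous = i;
      }
    }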
- if (context_begin == context_end) return; + if (context_begin == context_end) return out.release(); PartialIter i(context_begin); WriteOrThrow(out.get(), i->Data(), context_size); const void *previous = i->Data(); @@ -110,6 +104,7 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil previous = i->Data(); } } + return out.release(); } struct ThrowCombine { @@ -125,14 +120,12 @@ struct FirstCombine { } }; -template <class Combine> void MergeSortedFiles(const std::string &first_name, const std::string &second_name, const std::string &out, std::size_t weights_size, unsigned char order, const Combine &combine = ThrowCombine()) { +template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) { std::size_t entry_size = sizeof(WordIndex) * order + weights_size; RecordReader first, second; - first.Init(first_name.c_str(), entry_size); - util::RemoveOrThrow(first_name.c_str()); - second.Init(second_name.c_str(), entry_size); - util::RemoveOrThrow(second_name.c_str()); - util::scoped_FILE out_file(OpenOrThrow(out.c_str(), "w")); + first.Init(first_file, entry_size); + second.Init(second_file, entry_size); + util::scoped_FILE out_file(maker.MakeFile()); EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { @@ -149,67 +142,14 @@ template <class Combine> void MergeSortedFiles(const std::string &first_name, co for (RecordReader &remains = (first ? first : second); remains; ++remains) { WriteOrThrow(out_file.get(), remains.Data(), entry_size); } -} - -void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) { - ReadNGramHeader(f, order); - const size_t count = counts[order - 1]; - // Size of weights. Does it include backoff? - const size_t words_size = sizeof(WordIndex) * order; - const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); - const size_t entry_size = words_size + weights_size; - const size_t batch_size = std::min(count, mem.size() / entry_size); - uint8_t *const begin = reinterpret_cast<uint8_t*>(mem.get()); - std::deque<std::string> files; - for (std::size_t batch = 0, done = 0; done < count; ++batch) { - uint8_t *out = begin; - uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; - if (order == counts.size()) { - for (; out != out_end; out += entry_size) { - ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn); - } - } else { - for (; out != out_end; out += entry_size) { - ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn); - } - } - // Sort full records by full n-gram. - util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); - // parallel_sort uses too much RAM - std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order))); - files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order)); - WriteContextFile(begin, out_end, files.back(), entry_size, order); - - done += (out_end - begin) / entry_size; - } - - // All individual files created. Merge them. 
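MergeSortedFiles above is a plain two-way merge over sorted record files, parameterized by what to do when both inputs hold the same key: duplicate n-grams are a format error (ThrowCombine), while duplicate contexts just keep one copy (FirstCombine). The control flow, sketched over sorted int vectors with hypothetical names:

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    struct ThrowOnDuplicate {   // analogue of ThrowCombine
      int operator()(int, int) const { throw std::runtime_error("duplicate entry"); }
    };
    struct KeepFirst {          // analogue of FirstCombine
      int operator()(int first, int) const { return first; }
    };

    template <class Combine>
    std::vector<int> MergeSorted(const std::vector<int> &a,
                                 const std::vector<int> &b, Combine combine) {
      std::vector<int> out;
      std::size_t i = 0, j = 0;
      while (i < a.size() && j < b.size()) {
        if (a[i] < b[j]) out.push_back(a[i++]);
        else if (b[j] < a[i]) out.push_back(b[j++]);
        else { out.push_back(combine(a[i], b[j])); ++i; ++j; }
      }
      out.insert(out.end(), a.begin() + i, a.end());
      out.insert(out.end(), b.begin() + j, b.end());
      return out;
    }

ConvertToSorted applies this pairwise until one file per order remains, which is why merged results are appended at the back of the deque while the two merged inputs are popped from the front.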
- - std::size_t merge_count = 0; - while (files.size() > 1) { - std::stringstream assembled; - assembled << file_prefix << static_cast<unsigned int>(order) << "_merge_" << (merge_count++); - files.push_back(assembled.str()); - MergeSortedFiles(files[0], files[1], files.back(), weights_size, order, ThrowCombine()); - MergeSortedFiles(files[0] + kContextSuffix, files[1] + kContextSuffix, files.back() + kContextSuffix, 0, order - 1, FirstCombine()); - files.pop_front(); - files.pop_front(); - } - if (!files.empty()) { - std::stringstream assembled; - assembled << file_prefix << static_cast<unsigned int>(order) << "_merged"; - std::string merged_name(assembled.str()); - if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str()); - std::string context_name = files[0] + kContextSuffix; - merged_name += kContextSuffix; - if (std::rename(context_name.c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << context_name << " to " << merged_name.c_str()); - } + return out_file.release(); } } // namespace -void RecordReader::Init(const std::string &name, std::size_t entry_size) { - file_.reset(OpenOrThrow(name.c_str(), "r+")); +void RecordReader::Init(FILE *file, std::size_t entry_size) { + rewind(file); + file_ = file; data_.reset(malloc(entry_size)); UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); remains_ = true; @@ -219,20 +159,29 @@ void RecordReader::Init(const std::string &name, std::size_t entry_size) { void RecordReader::Overwrite(const void *start, std::size_t amount) { long internal = (uint8_t*)start - (uint8_t*)data_.get(); - UTIL_THROW_IF(fseek(file_.get(), internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); - WriteOrThrow(file_.get(), start, amount); + UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); + WriteOrThrow(file_, start, amount); long forward = entry_size_ - internal - amount; - if (forward) UTIL_THROW_IF(fseek(file_.get(), forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); +#if !defined(_WIN32) && !defined(_WIN64) + if (forward) +#endif + UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); } -void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { +void RecordReader::Rewind() { + rewind(file_); + remains_ = true; + ++*this; +} + +SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { + util::TempMaker maker(file_prefix); PositiveProbWarn warn(config.positive_log_probability); + unigram_.reset(maker.Make()); { - std::string unigram_name = file_prefix + "unigrams"; - util::scoped_fd unigram_file; // In case <unk> appears. 
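RecordReader::Init above now receives an already-open FILE* instead of a file name: it rewinds the handle and then streams fixed-size records until EOF. A stripped-down sketch of that reader (Records is a hypothetical class; the real one also distinguishes I/O errors from EOF via feof and supports in-place Overwrite):

    #include <cstdio>
    #include <vector>

    // Pull fixed-size records from a rewound FILE*, one per increment.
    class Records {
     public:
      Records(std::FILE *file, std::size_t entry_size)
          : file_(file), data_(entry_size), remains_(true) {
        std::rewind(file_);
        ++*this;                    // prime the first record
      }
      Records &operator++() {
        if (std::fread(&data_[0], data_.size(), 1, file_) != 1) remains_ = false;
        return *this;
      }
      operator bool() const { return remains_; }
      const void *Data() const { return &data_[0]; }
     private:
      std::FILE *file_;
      std::vector<unsigned char> data_;
      bool remains_;
    };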
- size_t file_out = (counts[0] + 1) * sizeof(ProbBackoff); - util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), file_out, unigram_file), file_out); + size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); + util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out); Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn); CheckSpecials(config, vocab); if (!vocab.SawUnk()) ++counts[0]; @@ -246,16 +195,96 @@ void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uin buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); buffer = std::min<size_t>(buffer, buffer_use); - util::scoped_memory mem; - mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED); + util::scoped_malloc mem; + mem.reset(malloc(buffer)); if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); for (unsigned char order = 2; order <= counts.size(); ++order) { - ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn); + ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer); } ReadEnd(f); } +namespace { +class Closer { + public: + explicit Closer(std::deque<FILE*> &files) : files_(files) {} + + ~Closer() { + for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); ++i) { + util::scoped_FILE deleter(*i); + } + } + + void PopFront() { + util::scoped_FILE deleter(files_.front()); + files_.pop_front(); + } + private: + std::deque<FILE*> &files_; +}; +} // namespace + +void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { + ReadNGramHeader(f, order); + const size_t count = counts[order - 1]; + // Size of weights. Does it include backoff? + const size_t words_size = sizeof(WordIndex) * order; + const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); + const size_t entry_size = words_size + weights_size; + const size_t batch_size = std::min(count, mem_size / entry_size); + uint8_t *const begin = reinterpret_cast<uint8_t*>(mem); + + std::deque<FILE*> files, contexts; + Closer files_closer(files), contexts_closer(contexts); + + for (std::size_t batch = 0, done = 0; done < count; ++batch) { + uint8_t *out = begin; + uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; + if (order == counts.size()) { + for (; out != out_end; out += entry_size) { + ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn); + } + } else { + for (; out != out_end; out += entry_size) { + ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn); + } + } + // Sort full records by full n-gram. + util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); + // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies. 
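The SizedProxy/NGramIter machinery referenced above lets std::sort permute whole fixed-size byte records in place; the comment records that Windows' std::sort mishandles those proxies, hence the stable_sort fallback. For illustration only, a simpler out-of-place alternative: sort record indices with a comparator over the raw bytes, then gather. Both names are hypothetical, memcmp stands in for the real field-wise EntryCompare, and this spends the extra memory the in-place scheme exists to avoid:

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    struct RecordLess {
      const unsigned char *base;
      std::size_t size;
      RecordLess(const unsigned char *b, std::size_t s) : base(b), size(s) {}
      bool operator()(std::size_t a, std::size_t b) const {
        return std::memcmp(base + a * size, base + b * size, size) < 0;
      }
    };

    // Sort fixed-size records out of place: order indices, then gather.
    std::vector<unsigned char> SortRecords(const unsigned char *base,
                                           std::size_t count, std::size_t size) {
      std::vector<std::size_t> order(count);
      for (std::size_t i = 0; i < count; ++i) order[i] = i;
      std::sort(order.begin(), order.end(), RecordLess(base, size));
      std::vector<unsigned char> out(count * size);
      for (std::size_t i = 0; i < count; ++i)
        std::memcpy(&out[i * size], base + order[i] * size, size);
      return out;
    }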
+#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order))); + files.push_back(DiskFlush(begin, out_end, maker)); + contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order)); + + done += (out_end - begin) / entry_size; + } + + // All individual files created. Merge them. + + while (files.size() > 1) { + files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine())); + files_closer.PopFront(); + files_closer.PopFront(); + contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine())); + contexts_closer.PopFront(); + contexts_closer.PopFront(); + } + + if (!files.empty()) { + // Steal from closers. + full_[order - 2].reset(files.front()); + files.pop_front(); + context_[order - 2].reset(contexts.front()); + contexts.pop_front(); + } +} + } // namespace trie } // namespace ngram } // namespace lm diff --git a/klm/lm/trie_sort.hh b/klm/lm/trie_sort.hh index a6916483..3036319d 100644 --- a/klm/lm/trie_sort.hh +++ b/klm/lm/trie_sort.hh @@ -1,6 +1,9 @@ +// Step of trie builder: create sorted files. + #ifndef LM_TRIE_SORT__ #define LM_TRIE_SORT__ +#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/file.hh" @@ -11,20 +14,21 @@ #include <string> #include <vector> -#include <inttypes.h> +#include <stdint.h> -namespace util { class FilePiece; } +namespace util { +class FilePiece; +class TempMaker; +} // namespace util -// Step of trie builder: create sorted files. namespace lm { +class PositiveProbWarn; namespace ngram { class SortedVocabulary; class Config; namespace trie { -extern const char *kContextSuffix; -FILE *OpenOrThrow(const char *name, const char *mode); void WriteOrThrow(FILE *to, const void *data, size_t size); class EntryCompare : public std::binary_function<const void*, const void*, bool> { @@ -49,15 +53,15 @@ class RecordReader { public: RecordReader() : remains_(true) {} - void Init(const std::string &name, std::size_t entry_size); + void Init(FILE *file, std::size_t entry_size); void *Data() { return data_.get(); } const void *Data() const { return data_.get(); } RecordReader &operator++() { - std::size_t ret = fread(data_.get(), entry_size_, 1, file_.get()); + std::size_t ret = fread(data_.get(), entry_size_, 1, file_); if (!ret) { - UTIL_THROW_IF(!feof(file_.get()), util::ErrnoException, "Error reading temporary file"); + UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); remains_ = false; } return *this; @@ -65,27 +69,46 @@ class RecordReader { operator bool() const { return remains_; } - void Rewind() { - rewind(file_.get()); - remains_ = true; - ++*this; - } + void Rewind(); std::size_t EntrySize() const { return entry_size_; } void Overwrite(const void *start, std::size_t amount); private: + FILE *file_; + util::scoped_malloc data_; bool remains_; std::size_t entry_size_; - - util::scoped_FILE file_; }; -void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); +class SortedFiles { + public: + // Build from ARPA + SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); + + int StealUnigram() { + return unigram_.release(); + } + + FILE *Full(unsigned char order) { + return full_[order - 2].get(); 
+ } + + FILE *Context(unsigned char of_order) { + return context_[of_order - 2].get(); + } + + private: + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + + util::scoped_fd unigram_; + + util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1]; +}; } // namespace trie } // namespace ngram diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc index ffec41ca..9fd698bb 100644 --- a/klm/lm/vocab.cc +++ b/klm/lm/vocab.cc @@ -6,12 +6,15 @@ #include "lm/config.hh" #include "lm/weights.hh" #include "util/exception.hh" +#include "util/file.hh" #include "util/joint_sort.hh" #include "util/murmur_hash.hh" #include "util/probing_hash_table.hh" #include <string> +#include <string.h> + namespace lm { namespace ngram { @@ -29,23 +32,30 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5); // Sadly some LMs have <UNK>. const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5); -WordIndex ReadWords(int fd, EnumerateVocab *enumerate) { - if (!enumerate) return std::numeric_limits<WordIndex>::max(); +void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) { + // Check that we're at the right place by reading <unk> which is always first. + char check_unk[6]; + util::ReadOrThrow(fd, check_unk, 6); + UTIL_THROW_IF( + memcmp(check_unk, "<unk>", 6), + FormatLoadException, + "Vocabulary words are in the wrong place. This could be because the binary file was built with stale gcc and old kenlm. Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types. New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure."); + if (!enumerate) return; + enumerate->Add(0, "<unk>"); + + // Read all the words after unk. const std::size_t kInitialRead = 16384; std::string buf; buf.reserve(kInitialRead + 100); buf.resize(kInitialRead); - WordIndex index = 0; + WordIndex index = 1; // Read <unk> already. while (true) { - ssize_t got = read(fd, &buf[0], kInitialRead); - UTIL_THROW_IF(got == -1, util::ErrnoException, "Reading vocabulary words"); - if (got == 0) return index; + std::size_t got = util::ReadOrEOF(fd, &buf[0], kInitialRead); + if (got == 0) break; buf.resize(got); while (buf[buf.size() - 1]) { char next_char; - ssize_t ret = read(fd, &next_char, 1); - UTIL_THROW_IF(ret == -1, util::ErrnoException, "Reading vocabulary words"); - UTIL_THROW_IF(ret == 0, FormatLoadException, "Missing null terminator on a vocab word."); + util::ReadOrThrow(fd, &next_char, 1); buf.push_back(next_char); } // Ok now we have null terminated strings. @@ -55,6 +65,8 @@ WordIndex ReadWords(int fd, EnumerateVocab *enumerate) { i += length + 1 /* null byte */; } } + + UTIL_THROW_IF(expected_count != index, FormatLoadException, "The binary file has the wrong number of words at the end. 
This could be caused by a truncated binary file."); } } // namespace @@ -69,8 +81,7 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { } void WriteWordsWrapper::Write(int fd) { - if ((off_t)-1 == lseek(fd, 0, SEEK_END)) - UTIL_THROW(util::ErrnoException, "Failed to seek in binary to vocab words"); + util::SeekEnd(fd); util::WriteOrThrow(fd, buffer_.data(), buffer_.size()); } @@ -114,8 +125,10 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { if (enumerate_) { - util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); - util::JointSort(begin_, end_, values); + if (!strings_to_enumerate_.empty()) { + util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); + util::JointSort(begin_, end_, values); + } for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) { // <unk> strikes again: +1 here. enumerate_->Add(i + 1, strings_to_enumerate_[i]); @@ -131,11 +144,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { bound_ = end_ - begin_ + 1; } -void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { +void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1); - ReadWords(fd, to); SetSpecial(Index("<s>"), Index("</s>"), 0); bound_ = end_ - begin_ + 1; + if (have_words) ReadWords(fd, to, bound_); } namespace { @@ -153,12 +166,12 @@ struct ProbingVocabularyHeader { ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) { - return Align8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); + return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); } void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) { header_ = static_cast<detail::ProbingVocabularyHeader*>(start); - lookup_ = Lookup(static_cast<uint8_t*>(start) + Align8(sizeof(detail::ProbingVocabularyHeader)), allocated); + lookup_ = Lookup(static_cast<uint8_t*>(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated); bound_ = 1; saw_unk_ = false; } @@ -178,7 +191,7 @@ WordIndex ProbingVocabulary::Insert(const StringPiece &str) { return 0; } else { if (enumerate_) enumerate_->Add(bound_, str); - lookup_.Insert(Lookup::Packing::Make(hashed, bound_)); + lookup_.Insert(ProbingVocabuaryEntry::Make(hashed, bound_)); return bound_++; } } @@ -190,12 +203,12 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) { SetSpecial(Index("<s>"), Index("</s>"), 0); } -void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { +void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". 
Please rerun build_binary using the same version of the code."); lookup_.LoadedBinary(); - ReadWords(fd, to); bound_ = header_->bound; SetSpecial(Index("<s>"), Index("</s>"), 0); + if (have_words) ReadWords(fd, to, bound_); } void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index 3c3414fb..06fdefe4 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -4,7 +4,6 @@ #include "lm/enumerate_vocab.hh" #include "lm/lm_exception.hh" #include "lm/virtual_interface.hh" -#include "util/key_value_packing.hh" #include "util/probing_hash_table.hh" #include "util/sorted_uniform.hh" #include "util/string_piece.hh" @@ -83,7 +82,7 @@ class SortedVocabulary : public base::Vocabulary { bool SawUnk() const { return saw_unk_; } - void LoadedBinary(int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); private: uint64_t *begin_, *end_; @@ -100,6 +99,26 @@ class SortedVocabulary : public base::Vocabulary { std::vector<std::string> strings_to_enumerate_; }; +#pragma pack(push) +#pragma pack(4) +struct ProbingVocabuaryEntry { + uint64_t key; + WordIndex value; + + typedef uint64_t Key; + uint64_t GetKey() const { + return key; + } + + static ProbingVocabuaryEntry Make(uint64_t key, WordIndex value) { + ProbingVocabuaryEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; +#pragma pack(pop) + // Vocabulary storing a map from uint64_t to WordIndex. class ProbingVocabulary : public base::Vocabulary { public: @@ -107,7 +126,7 @@ class ProbingVocabulary : public base::Vocabulary { WordIndex Index(const StringPiece &str) const { Lookup::ConstIterator i; - return lookup_.Find(detail::HashForVocab(str), i) ? i->GetValue() : 0; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; } static size_t Size(std::size_t entries, const Config &config); @@ -124,17 +143,14 @@ class ProbingVocabulary : public base::Vocabulary { void FinishedLoading(ProbBackoff *reorder_vocab); + std::size_t UnkCountChangePadding() const { return 0; } + bool SawUnk() const { return saw_unk_; } - void LoadedBinary(int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); private: - // std::identity is an SGI extension :-( - struct IdentityHash : public std::unary_function<uint64_t, std::size_t> { - std::size_t operator()(uint64_t arg) const { return static_cast<std::size_t>(arg); } - }; - - typedef util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, WordIndex>, IdentityHash> Lookup; + typedef util::ProbingHashTable<ProbingVocabuaryEntry, util::IdentityHash> Lookup; Lookup lookup_; diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 33266b94..73a5cb22 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -1,33 +1,37 @@ #ifndef UTIL_BIT_PACKING__ #define UTIL_BIT_PACKING__ -/* Bit-level packing routines */ +/* Bit-level packing routines + * + * WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast. This is + * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on + * x86 but my target audience is large language models for which 64-bit is + * necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. 
+ * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ #include <assert.h> #ifdef __APPLE__ #include <architecture/byte_order.h> #elif __linux__ #include <endian.h> -#else +#elif !defined(_WIN32) && !defined(_WIN64) #include <arpa/nameser_compat.h> #endif -#include <inttypes.h> - -namespace util { +#include <stdint.h> -/* WARNING WARNING WARNING: - * The write functions assume that memory is zero initially. This makes them - * faster and is the appropriate case for mmapped language model construction. - * These routines assume that unaligned access to uint64_t is fast and that - * storage is little endian. This is the case on x86_64. I'm not sure how - * fast unaligned 64-bit access is on x86 but my target audience is large - * language models for which 64-bit is necessary. - * - * Call the BitPackingSanity function to sanity check. Calling once suffices, - * but it may be called multiple times when that's inconvenient. - */ +#include <string.h> +namespace util { // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. #if BYTE_ORDER == LITTLE_ENDIAN @@ -43,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { #endif inline uint64_t ReadOff(const void *base, uint64_t bit_off) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + return value64; +#else return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)); +#endif } /* Pack integers up to 57 bits using their least significant digits. @@ -57,18 +68,41 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui * Assumes the memory is zero initially. */ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } /* Same caveats as above, but for a 25 bit limit. 
*/ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + return (value32 >> BitPackShift(bit_off & 7, length)) & mask; +#else return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; +#endif } inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } typedef union { float f; uint32_t i; } FloatEnc; diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 96951495..c4f8c04c 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -66,7 +66,7 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) { ErrnoException::ErrnoException() throw() : errno_(errno) { char buf[200]; buf[0] = 0; -#ifdef sun +#if defined(sun) || defined(_WIN32) || defined(_WIN64) const char *add = strerror(errno); #else const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); diff --git a/klm/util/file.cc b/klm/util/file.cc index d707568e..176737fa 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -9,8 +9,12 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> -#include <unistd.h> -#include <inttypes.h> +#include <stdint.h> + +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#include <io.h> +#endif namespace util { @@ -30,33 +34,71 @@ scoped_FILE::~scoped_FILE() { int OpenReadOrThrow(const char *name) { int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); +#else UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); +#endif return ret; } int CreateOrThrow(const char *name) { int ret; - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name); +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif return ret; } -off_t SizeFile(int fd) { +uint64_t SizeFile(int fd) { +#if defined(_WIN32) || defined(_WIN64) + __int64 ret = _filelengthi64(fd); + return (ret == -1) ? 
kBadSize : ret; +#else struct stat sb; if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; +#endif +} + +void ResizeOrThrow(int fd, uint64_t to) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(_chsize_s(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#else + UTIL_THROW_IF(ftruncate(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast<uint8_t*>(to_void); while (amount) { ssize_t ret = read(fd, to, amount); - if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); - if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); amount -= ret; to += ret; } } +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast<uint8_t*>(to_void); + std::size_t remaining = amount; + while (remaining) { + ssize_t ret = read(fd, to, remaining); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed."); + if (!ret) return amount - remaining; + remaining -= ret; + to += ret; + } + return amount; +} + void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast<const uint8_t*>(data_void); while (size) { @@ -67,8 +109,172 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) { } } -void RemoveOrThrow(const char *name) { - UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name); +void FSyncOrThrow(int fd) { +// Apparently windows doesn't have fsync? +#if !defined(_WIN32) && !defined(_WIN64) + UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed."); +#endif +} + +namespace { +void InternalSeek(int fd, off_t off, int whence) { + UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +} +} // namespace + +void SeekOrThrow(int fd, uint64_t off) { + InternalSeek(fd, off, SEEK_SET); +} + +void AdvanceOrThrow(int fd, int64_t off) { + InternalSeek(fd, off, SEEK_CUR); +} + +void SeekEnd(int fd) { + InternalSeek(fd, 0, SEEK_END); +} + +std::FILE *FDOpenOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "r+b"); + if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen"); + file.release(); + return ret; +} + +TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { + base_ += "XXXXXX"; +} + +// Sigh. Windows temporary file creation is full of race conditions. +#if defined(_WIN32) || defined(_WIN64) +/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright + (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. */ + +/* This has been modified from the original version to rename the function and + * set the Windows temporary flag. 
*/ + +static const char letters[] = +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +/* Generate a temporary file name based on TMPL. TMPL must match the + rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed + does not exist at the time of the call to mkstemp. TMPL is + overwritten with the result. */ +int +mkstemp_and_unlink(char *tmpl) +{ + int len; + char *XXXXXX; + static unsigned long long value; + unsigned long long random_time_bits; + unsigned int count; + int fd = -1; + int save_errno = errno; + + /* A lower bound on the number of temporary files to attempt to + generate. The maximum total number of temporary file names that + can exist for a given template is 62**6. It should never be + necessary to try all these combinations. Instead if a reasonable + number of names is tried (we define reasonable as 62**3) fail to + give the system administrator the chance to remove the problems. */ +#define ATTEMPTS_MIN (62 * 62 * 62) + + /* The number of times to attempt to generate a temporary file. To + conform to POSIX, this must be no smaller than TMP_MAX. */ +#if ATTEMPTS_MIN < TMP_MAX + unsigned int attempts = TMP_MAX; +#else + unsigned int attempts = ATTEMPTS_MIN; +#endif + + len = strlen (tmpl); + if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) + { + errno = EINVAL; + return -1; + } + +/* This is where the Xs start. */ + XXXXXX = &tmpl[len - 6]; + + /* Get some more or less random data. */ + { + SYSTEMTIME stNow; + FILETIME ftNow; + + // get system time + GetSystemTime(&stNow); + stNow.wMilliseconds = 500; + if (!SystemTimeToFileTime(&stNow, &ftNow)) + { + errno = -1; + return -1; + } + + random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) + | (unsigned long long)ftNow.dwLowDateTime); + } + value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); + + for (count = 0; count < attempts; value += 7777, ++count) + { + unsigned long long v = value; + + /* Fill in the random bits. */ + XXXXXX[0] = letters[v % 62]; + v /= 62; + XXXXXX[1] = letters[v % 62]; + v /= 62; + XXXXXX[2] = letters[v % 62]; + v /= 62; + XXXXXX[3] = letters[v % 62]; + v /= 62; + XXXXXX[4] = letters[v % 62]; + v /= 62; + XXXXXX[5] = letters[v % 62]; + + /* Modified for windows and to unlink */ + // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); + fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE); + if (fd >= 0) + { + errno = save_errno; + return fd; + } + else if (errno != EEXIST) + return -1; + } + + /* We got out of the loop because we ran out of combinations to try. 
*/ + errno = EEXIST; + return -1; +} +#else +int +mkstemp_and_unlink(char *tmpl) { + int ret = mkstemp(tmpl); + if (ret == -1) return -1; + UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); + return ret; +} +#endif + +int TempMaker::Make() const { + std::string copy(base_); + copy.push_back(0); + int ret; + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); + return ret; +} + +std::FILE *TempMaker::MakeFile() const { + util::scoped_fd file(Make()); + return FDOpenOrThrow(file); } } // namespace util diff --git a/klm/util/file.hh b/klm/util/file.hh index d6cca41d..72c8ea76 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -1,8 +1,11 @@ #ifndef UTIL_FILE__ #define UTIL_FILE__ +#include <cstddef> #include <cstdio> -#include <unistd.h> +#include <string> + +#include <stdint.h> namespace util { @@ -52,22 +55,52 @@ class scoped_FILE { file_ = to; } + std::FILE *release() { + std::FILE *ret = file_; + file_ = NULL; + return ret; + } + private: std::FILE *file_; }; +// Open for read only. int OpenReadOrThrow(const char *name); - +// Create file if it doesn't exist, truncate if it does. Opened for write. int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. -const off_t kBadSize = -1; -off_t SizeFile(int fd); +const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); + +void ResizeOrThrow(int fd, uint64_t to); void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); + void WriteOrThrow(int fd, const void *data_void, std::size_t size); -void RemoveOrThrow(const char *name); +void FSyncOrThrow(int fd); + +// Seeking +void SeekOrThrow(int fd, uint64_t off); +void AdvanceOrThrow(int fd, int64_t off); +void SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); + +class TempMaker { + public: + explicit TempMaker(const std::string &prefix); + + int Make() const; + + std::FILE *MakeFile() const; + + private: + std::string base_; +}; } // namespace util diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b57582a0..081e662b 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,6 +2,10 @@ #include "util/exception.hh" #include "util/file.hh" +#include "util/mmap.hh" +#ifdef WIN32 +#include <io.h> +#endif // WIN32 #include <iostream> #include <string> @@ -11,14 +15,8 @@ #include <ctype.h> #include <fcntl.h> #include <stdlib.h> -#include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> -#include <unistd.h> - -#ifdef HAVE_ZLIB -#include <zlib.h> -#endif namespace util { @@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } -GZException::GZException(void *file) { #ifdef HAVE_ZLIB +GZException::GZException(gzFile file) { int num; - *this << gzerror(file, &num) << " from zlib"; -#endif // HAVE_ZLIB + *this << gzerror( file, &num) << " from zlib"; } +#endif // HAVE_ZLIB // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
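// Indices 9-13 ('\t', '\n', '\v', '\f', '\r') and 32 (' ') hold 1 and every
// other byte holds 0, so classifying a character is a single unconditional
// table lookup: kSpaces[static_cast<unsigned char>(c)].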
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } @@ -63,7 +61,7 @@ FilePiece::~FilePiece() { } StringPiece FilePiece::ReadLine(char delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (*i == delim) { @@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber<unsigned long int>(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { #ifdef HAVE_ZLIB gz_file_ = NULL; #endif file_name_ = name; - default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2); + default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; @@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t namespace { void ParseNumber(const char *begin, char *&end, float &out) { -#ifdef sun +#if defined(sun) || defined(WIN32) out = static_cast<float>(strtod(begin, &end)); #else out = strtof(begin, &end); @@ -171,7 +169,7 @@ template <class T> T FilePiece::ReadNumber() { } const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (delim[static_cast<unsigned char>(*i)]) return i; @@ -190,7 +188,7 @@ void FilePiece::Shift() { progress_.Finished(); throw EndOfFileException(); } - off_t desired_begin = position_ - data_.begin() + mapped_offset_; + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. @@ -201,18 +199,18 @@ void FilePiece::Shift() { } } -void FilePiece::MMapShift(off_t desired_begin) { +void FilePiece::MMapShift(uint64_t desired_begin) { // Use mmap. 
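// mmap offsets must be page-aligned, so round desired_begin down to a page
// boundary and remember the remainder. For example, with a 4096-byte page,
// desired_begin = 70000 yields ignore = 368 and a mapped offset of 69632 (17 * 4096).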
- off_t ignore = desired_begin % page_; + uint64_t ignore = desired_begin % page_; // Duplicate request for Shift means give more data. if (position_ == data_.begin() + ignore) { default_map_size_ *= 2; } // Local version so that in case of failure it doesn't overwrite the class variable. - off_t mapped_offset = desired_begin - ignore; + uint64_t mapped_offset = desired_begin - ignore; - off_t mapped_size; - if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) { + uint64_t mapped_size; + if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) { at_end_ = true; mapped_size = total_size_ - mapped_offset; } else { @@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) { // Forcibly clear the existing mmap first. data_.reset(); - data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED - // Populate where available on linux -#ifdef MAP_POPULATE - | MAP_POPULATE -#endif - , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); - if (data_.get() == MAP_FAILED) { + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &e) { if (desired_begin) { - if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); + SeekOrThrow(*file_, desired_begin); } // The mmap was scheduled to end the file, but now we're going to read it. at_end_ = false; @@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() { #endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index a627f38c..af93d8aa 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -8,9 +8,14 @@ #include "util/mmap.hh" #include "util/string_piece.hh" +#include <cstddef> #include <string> -#include <cstddef> +#include <stdint.h> + +#ifdef HAVE_ZLIB +#include <zlib.h> +#endif namespace util { @@ -22,7 +27,9 @@ class ParseNumberException : public Exception { class GZException : public Exception { public: - explicit GZException(void *file); +#ifdef HAVE_ZLIB + explicit GZException(gzFile file); +#endif GZException() throw() {} ~GZException() throw() {} }; @@ -33,9 +40,9 @@ extern const bool kSpaces[256]; class FilePiece { public: // 32 MB default. - explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); // Takes ownership of fd. name is used for messages. - explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); ~FilePiece(); @@ -70,14 +77,14 @@ class FilePiece { } } - off_t Offset() const { + uint64_t Offset() const { return position_ - data_.begin() + mapped_offset_; } const std::string &FileName() const { return file_name_; } private: - void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template <class T> T ReadNumber(); @@ -91,7 +98,7 @@ class FilePiece { void Shift(); // Backends to Shift(). 
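// MMapShift() maps the next window of the file directly; ReadShift() is the
// fallback that read()s into a heap buffer, used when mmap fails or when the
// input is compressed or otherwise unseekable.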
- void MMapShift(off_t desired_begin); + void MMapShift(uint64_t desired_begin); void TransitionToRead(); void ReadShift(); @@ -99,11 +106,11 @@ class FilePiece { const char *position_, *last_space_, *position_end_; scoped_fd file_; - const off_t total_size_; - const off_t page_; + const uint64_t total_size_; + const uint64_t page_; - size_t default_map_size_; - off_t mapped_offset_; + std::size_t default_map_size_; + uint64_t mapped_offset_; // Order matters: file_ should always be destroyed after this. scoped_memory data_; @@ -116,7 +123,7 @@ class FilePiece { std::string file_name_; #ifdef HAVE_ZLIB - void *gz_file_; + gzFile gz_file_; #endif // HAVE_ZLIB }; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index dc9ec7e7..f912e18a 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,3 +1,4 @@ +// Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/scoped.hh" @@ -14,10 +15,18 @@ namespace util { namespace { +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,9 +44,13 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; - FILE *catter = popen("cat file_piece.cc", "r"); + FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); @@ -58,10 +71,15 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { // gzip file BOOST_AUTO_TEST_CASE(PlainZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); - BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz")); - FilePiece test("file_piece.cc.gz", NULL, 1); + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -77,12 +95,15 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { // the test. 
#ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; - FILE * catter = popen("gzip <file_piece.cc", "r"); + FILE * catter = popen(command.c_str(), "r"); BOOST_REQUIRE(catter); - FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); diff --git a/klm/util/getopt.c b/klm/util/getopt.c new file mode 100644 index 00000000..992c96b0 --- /dev/null +++ b/klm/util/getopt.c @@ -0,0 +1,78 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifndef __GNUC__ + +#include "getopt.hh" +#include <stdio.h> +#include <string.h> + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */ diff --git a/klm/util/getopt.hh b/klm/util/getopt.hh new file mode 100644 index 00000000..6ad97732 --- /dev/null +++ b/klm/util/getopt.hh @@ -0,0 +1,33 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifdef __GNUC__ +#include <getopt.h> +#endif +#ifndef __GNUC__ + +#ifndef _WINGETOPT_H_ +#define _WINGETOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); + +#ifdef __cplusplus +} +#endif + +#endif /* _GETOPT_H_ */ +#endif /* __GNUC__ */ + diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh deleted file mode 100644 index b84a5aad..00000000 --- a/klm/util/key_value_packing.hh +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef UTIL_KEY_VALUE_PACKING__ -#define UTIL_KEY_VALUE_PACKING__ - -/* Why such a general interface? I'm planning on doing bit-level packing. 
*/ - -#include <algorithm> -#include <cstddef> -#include <cstring> - -#include <inttypes.h> - -namespace util { - -template <class Key, class Value> struct Entry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); } -}; - -// And now for a brief interlude to specialize std::swap. -} // namespace util -namespace std { -template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std -namespace util { - -template <class KeyT, class ValueT> class AlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - public: - static const std::size_t kBytes = sizeof(Entry<Key, Value>); - static const std::size_t kBits = kBytes * 8; - - typedef Entry<Key, Value> * MutableIterator; - typedef const Entry<Key, Value> * ConstIterator; - typedef const Entry<Key, Value> & ConstReference; - - static MutableIterator FromVoid(void *start) { - return reinterpret_cast<MutableIterator>(start); - } - - static Entry<Key, Value> Make(const Key &key, const Value &value) { - Entry<Key, Value> ret; - ret.Set(key, value); - return ret; - } -}; - -template <class KeyT, class ValueT> class ByteAlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - private: -#pragma pack(push) -#pragma pack(1) - struct RawEntry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); } - }; -#pragma pack(pop) - - friend void std::swap<>(RawEntry&, RawEntry&); - - public: - typedef RawEntry *MutableIterator; - typedef const RawEntry *ConstIterator; - typedef RawEntry &ConstReference; - - static const std::size_t kBytes = sizeof(RawEntry); - static const std::size_t kBits = kBytes * 8; - - static MutableIterator FromVoid(void *start) { - return MutableIterator(reinterpret_cast<RawEntry*>(start)); - } - - static RawEntry Make(const Key &key, const Value &value) { - RawEntry ret; - ret.Set(key, value); - return ret; - } -}; - -} // namespace util -namespace std { -template <class Key, class Value> void swap( - typename util::ByteAlignedPacking<Key, Value>::RawEntry &first, - typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std - -#endif // UTIL_KEY_VALUE_PACKING__ diff --git a/klm/util/key_value_packing_test.cc b/klm/util/key_value_packing_test.cc deleted file mode 100644 index a0d33fd7..00000000 --- a/klm/util/key_value_packing_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include "util/key_value_packing.hh" - -#include <boost/random/mersenne_twister.hpp> -#include <boost/random/uniform_int.hpp> -#include <boost/random/variate_generator.hpp> -#include 
<boost/scoped_array.hpp> -#define BOOST_TEST_MODULE KeyValueStoreTest -#include <boost/test/unit_test.hpp> - -#include <limits> -#include <stdlib.h> - -namespace util { -namespace { - -BOOST_AUTO_TEST_CASE(basic_in_out) { - typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; - void *backing = malloc(Packing::kBytes * 2); - Packing::MutableIterator i(Packing::FromVoid(backing)); - i->SetKey(10); - BOOST_CHECK_EQUAL(10, i->GetKey()); - i->SetValue(3); - BOOST_CHECK_EQUAL(3, i->GetValue()); - ++i; - i->SetKey(5); - BOOST_CHECK_EQUAL(5, i->GetKey()); - i->SetValue(42); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - Packing::ConstIterator c(i); - BOOST_CHECK_EQUAL(5, c->GetKey()); - --c; - BOOST_CHECK_EQUAL(10, c->GetKey()); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - BOOST_CHECK_EQUAL(5, i->GetKey()); - free(backing); -} - -BOOST_AUTO_TEST_CASE(simple_sort) { - typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; - char foo[Packing::kBytes * 4]; - Packing::MutableIterator begin(Packing::FromVoid(foo)); - Packing::MutableIterator i = begin; - i->SetKey(0); ++i; - i->SetKey(2); ++i; - i->SetKey(3); ++i; - i->SetKey(1); ++i; - std::sort(begin, i); - BOOST_CHECK_EQUAL(0, begin[0].GetKey()); - BOOST_CHECK_EQUAL(1, begin[1].GetKey()); - BOOST_CHECK_EQUAL(2, begin[2].GetKey()); - BOOST_CHECK_EQUAL(3, begin[3].GetKey()); -} - -BOOST_AUTO_TEST_CASE(big_sort) { - typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; - boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]); - Packing::MutableIterator begin(Packing::FromVoid(memory.get())); - - boost::mt19937 rng; - boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max()); - boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range); - - for (size_t i = 0; i < 1000; ++i) { - (begin + i)->SetKey(gen()); - } - std::sort(begin, begin + 1000); - for (size_t i = 0; i < 999; ++i) { - BOOST_CHECK(begin[i] < begin[i+1]); - } -} - -} // namespace -} // namespace util diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 279bafa8..3b1c58b8 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -1,23 +1,63 @@ +/* Memory mapping wrappers. + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ +#include "util/mmap.hh" + #include "util/exception.hh" #include "util/file.hh" -#include "util/mmap.hh" #include <iostream> #include <assert.h> #include <fcntl.h> #include <sys/types.h> -#include <sys/mman.h> +#include <sys/stat.h> #include <stdlib.h> -#include <unistd.h> + +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#include <io.h> +#else +#include <sys/mman.h> +#endif namespace util { +long SizePage() { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +void SyncOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); +#else + UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); +#endif +} + +void UnmapOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); +#else + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); +#endif +} + scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { - // Thanks Denis Filimonov for pointing out NFS likes msync first. 
- if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) { - std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl; + try { + // Thanks Denis Filimonov for pointing out NFS likes msync first. + SyncOrThrow(data_, size_); + UnmapOrThrow(data_, size_); + } catch (const util::ErrnoException &e) { + std::cerr << e.what(); abort(); } } @@ -52,29 +92,40 @@ void scoped_memory::call_realloc(std::size_t size) { } } -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset) { +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { #ifdef MAP_POPULATE // Linux specific if (prefault) { flags |= MAP_POPULATE; } #endif +#if defined(_WIN32) || defined(_WIN64) + int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; + int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; + uint64_t total_size = size + offset; + HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL); + UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); + LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size); + CloseHandle(hMapping); + UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); +#else int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; void *ret = mmap(NULL, size, protect, flags, fd, offset); - if (ret == MAP_FAILED) { - UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset); - } + UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); +#endif return ret; } const int kFileFlags = -#ifdef MAP_FILE +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores flags on windows +#elif defined(MAP_FILE) MAP_FILE | MAP_SHARED #else MAP_SHARED #endif ; -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { switch (method) { case LAZY: out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); @@ -91,30 +142,38 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m case READ: out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); - if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed."); + SeekOrThrow(fd, offset); ReadOrThrow(fd, out.get(), size); break; } } -void *MapAnonymous(std::size_t size) { - return MapOrThrow(size, true, -#ifdef MAP_ANONYMOUS - MAP_ANONYMOUS // Linux +// Allocates zeroed memory in to. 
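// On Windows the zeroing comes from calloc; POSIX builds use an anonymous
// private mapping (MAP_ANONYMOUS on Linux, MAP_ANON on BSD), which the
// kernel also returns zero-filled.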
+void MapAnonymous(std::size_t size, util::scoped_memory &to) { + to.reset(); +#if defined(_WIN32) || defined(_WIN64) + to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED); #else - MAP_ANON // BSD + to.reset(MapOrThrow(size, true, +# if defined(MAP_ANONYMOUS) + MAP_ANONYMOUS | MAP_PRIVATE // Linux +# else + MAP_ANON | MAP_PRIVATE // BSD +# endif + , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED); #endif - | MAP_PRIVATE, false, -1, 0); +} + +void *MapZeroedWrite(int fd, std::size_t size) { + ResizeOrThrow(fd, 0); + ResizeOrThrow(fd, size); + return MapOrThrow(size, true, kFileFlags, false, fd, 0); } void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { - file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); - if (-1 == file.get()) - UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing"); - if (-1 == ftruncate(file.get(), size)) - UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); + file.reset(CreateOrThrow(name)); try { - return MapOrThrow(size, true, kFileFlags, false, file.get(), 0); + return MapZeroedWrite(file.get(), size); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh index b0eb6672..b218c4d1 100644 --- a/klm/util/mmap.hh +++ b/klm/util/mmap.hh @@ -4,13 +4,15 @@ #include <cstddef> -#include <inttypes.h> +#include <stdint.h> #include <sys/types.h> namespace util { class scoped_fd; +long SizePage(); + // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. class scoped_mmap { public: @@ -94,15 +96,19 @@ typedef enum { extern const int kFileFlags; // Wrapper around mmap to check it worked and hide some platform macros. -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0); +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out); +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); -void *MapAnonymous(std::size_t size); +void MapAnonymous(std::size_t size, scoped_memory &to); // Open file name with mmap of size bytes, all of which are initially zero. 
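// A minimal usage sketch (illustrative only; the file name is hypothetical):
//   util::scoped_fd fd;
//   uint64_t *counts = static_cast<uint64_t*>(
//       util::MapZeroedWrite("counts.bin", 1000 * sizeof(uint64_t), fd));
//   counts[0] = 1;  // writes through to counts.bin, which starts all zero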
+void *MapZeroedWrite(int fd, std::size_t size); void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); +// msync wrapper +void SyncOrThrow(void *start, size_t length); + } // namespace util #endif // UTIL_MMAP__ diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index ef5783fe..6accc21a 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -7,9 +7,11 @@ * placed in namespace util * add MurmurHashNative * default option = 0 for seed + * ARM port from NICT */ #include "util/murmur_hash.hh" +#include <string.h> namespace util { @@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) uint64_t h = seed ^ (len * m); +#if defined(__arm) || defined(__arm__) + const size_t ksize = sizeof(uint64_t); + const unsigned char * data = (const unsigned char *)key; + const unsigned char * end = data + (std::size_t)(len/8) * ksize; +#else const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); +#endif while(data != end) { +#if defined(__arm) || defined(__arm__) + uint64_t k; + memcpy(&k, data, ksize); + data += ksize; +#else uint64_t k = *data++; +#endif k *= m; k ^= k >> r; @@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) unsigned int h1 = seed ^ len; unsigned int h2 = 0; +#if defined(__arm) || defined(__arm__) + size_t ksize = sizeof(unsigned int); + const unsigned char * data = (const unsigned char *)key; +#else const unsigned int * data = (const unsigned int *)key; +#endif + unsigned int k1, k2; while(len >= 8) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; + memcpy(&k2, data, ksize); + data += ksize; +#else + k1 = *data++; + k2 = *data++; +#endif + k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; - unsigned int k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; @@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) if(len >= 4) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; +#else + k1 = *data++; +#endif k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index 78fe583f..638aaeb2 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -1,7 +1,7 @@ #ifndef UTIL_MURMUR_HASH__ #define UTIL_MURMUR_HASH__ #include <cstddef> -#include <inttypes.h> +#include <stdint.h> namespace util { diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 8122d69c..f466cebc 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -18,27 +18,33 @@ class ProbingSizeException : public Exception { ~ProbingSizeException() throw() {} }; +// std::identity is an SGI extension :-( +struct IdentityHash { + template <class T> T operator()(T arg) const { return arg; } +}; + /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number - * of elements, else an infinite loop happens. + * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. 
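 * For example, Size(10, 1.2) reserves max(10 + 1, size_t(1.2 * 10)) = 12
 * buckets, i.e. 12 * sizeof(Entry) bytes; keeping the entry count strictly
 * below the bucket count guarantees an empty slot, so probing terminates.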
*/ -template <class PackingT, class HashT, class EqualT = std::equal_to<typename PackingT::Key> > class ProbingHashTable { +template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable { public: - typedef PackingT Packing; - typedef typename Packing::Key Key; - typedef typename Packing::MutableIterator MutableIterator; - typedef typename Packing::ConstIterator ConstIterator; - + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; + public: static std::size_t Size(std::size_t entries, float multiplier) { - return std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))) * Packing::kBytes; + std::size_t buckets = std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))); + return buckets * sizeof(Entry); } // Must be assigned to later. @@ -49,9 +55,9 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac {} ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) - : begin_(Packing::FromVoid(start)), - buckets_(allocated / Packing::kBytes), - end_(begin_ + (allocated / Packing::kBytes)), + : begin_(reinterpret_cast<MutableIterator>(start)), + buckets_(allocated / sizeof(Entry)), + end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), @@ -62,11 +68,10 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac {} template <class T> MutableIterator Insert(const T &t) { - if (++entries_ >= buckets_) - UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); #ifdef DEBUG assert(initialized_); #endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } @@ -84,7 +89,7 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; - } + } } template <class Key> bool Find(const Key key, ConstIterator &out) const { diff --git a/klm/util/probing_hash_table_test.cc b/klm/util/probing_hash_table_test.cc index ff2f5af3..ef68e5f2 100644 --- a/klm/util/probing_hash_table_test.cc +++ b/klm/util/probing_hash_table_test.cc @@ -1,6 +1,6 @@ #include "util/probing_hash_table.hh" -#include "util/key_value_packing.hh" +#include <stdint.h> #define BOOST_TEST_MODULE ProbingHashTableTest #include <boost/test/unit_test.hpp> @@ -9,17 +9,34 @@ namespace util { namespace { -typedef AlignedPacking<char, uint64_t> Packing; -typedef ProbingHashTable<Packing, boost::hash<char> > Table; +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable<Entry, boost::hash<unsigned char> > Table; BOOST_AUTO_TEST_CASE(simple) { char mem[Table::Size(10, 1.2)]; memset(mem, 0, sizeof(mem)); Table table(mem, sizeof(mem)); - Packing::ConstIterator i = Packing::ConstIterator(); + const Entry *i = NULL; BOOST_CHECK(!table.Find(2, i)); - table.Insert(Packing::Make(3, 328920)); + Entry to_ins; + 
to_ins.key = 3; + to_ins.value = 328920; + table.Insert(to_ins); BOOST_REQUIRE(table.Find(3, i)); BOOST_CHECK_EQUAL(3, i->GetKey()); BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue()); diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh index 47dfc245..aabcc531 100644 --- a/klm/util/sized_iterator.hh +++ b/klm/util/sized_iterator.hh @@ -6,7 +6,7 @@ #include <functional> #include <string> -#include <inttypes.h> +#include <stdint.h> #include <string.h> namespace util { diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index 0d6ecbbd..7700d9e6 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -5,7 +5,7 @@ #include <cstddef> #include <assert.h> -#include <inttypes.h> +#include <stdint.h> namespace util { @@ -122,99 +122,6 @@ template <class Iterator, class Accessor> Iterator BinaryBelow( return begin - 1; } -// To use this template, you need to define a Pivot function to match Key. -template <class PackingT> class SortedUniformMap { - public: - typedef PackingT Packing; - typedef typename Packing::ConstIterator ConstIterator; - typedef typename Packing::MutableIterator MutableIterator; - - struct Accessor { - public: - typedef typename Packing::Key Key; - const Key &operator()(const ConstIterator &i) const { return i->GetKey(); } - Key &operator()(const MutableIterator &i) const { return i->GetKey(); } - }; - - // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { - return sizeof(uint64_t) + entries * Packing::kBytes; - } - - SortedUniformMap() -#ifdef DEBUG - : initialized_(false), loaded_(false) -#endif - {} - - SortedUniformMap(void *start, std::size_t /*allocated*/) : - begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)), - end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start)) -#ifdef DEBUG - , initialized_(true), loaded_(false) -#endif - {} - - void LoadedBinary() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - // Restore the size. - end_ = begin_ + *size_ptr_; - } - - // Caller responsible for not exceeding specified size. Do not call after FinishedInserting. - template <class T> void Insert(const T &t) { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); -#endif - *end_ = t; - ++end_; - } - - void FinishedInserting() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - std::sort(begin_, end_); - *size_ptr_ = (end_ - begin_); - } - - // Don't use this to change the key. - template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind<MutableIterator, Accessor, Pivot64>(begin_, end_, key, out); - } - - // Do not call before FinishedInserting. 
- template <class Key> bool Find(const Key key, ConstIterator &out) const { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind<ConstIterator, Accessor, Pivot64>(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out); - } - - ConstIterator begin() const { return begin_; } - ConstIterator end() const { return end_; } - - private: - typename Packing::MutableIterator begin_, end_; - uint64_t *size_ptr_; -#ifdef DEBUG - bool initialized_; - bool loaded_; -#endif -}; - } // namespace util #endif // UTIL_SORTED_UNIFORM__ diff --git a/klm/util/sorted_uniform_test.cc b/klm/util/sorted_uniform_test.cc index 4aa4c8aa..d9f6fad1 100644 --- a/klm/util/sorted_uniform_test.cc +++ b/klm/util/sorted_uniform_test.cc @@ -1,12 +1,11 @@ #include "util/sorted_uniform.hh" -#include "util/key_value_packing.hh" - #include <boost/random/mersenne_twister.hpp> #include <boost/random/uniform_int.hpp> #include <boost/random/variate_generator.hpp> #include <boost/scoped_array.hpp> #include <boost/unordered_map.hpp> + #define BOOST_TEST_MODULE SortedUniformTest #include <boost/test/unit_test.hpp> @@ -17,74 +16,86 @@ namespace util { namespace { -template <class Map, class Key, class Value> void Check(const Map &map, const boost::unordered_map<Key, Value> &reference, const Key key) { +template <class KeyT, class ValueT> struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + return value; + } + + bool operator<(const Entry<Key,Value> &other) const { + return key < other.key; + } +}; + +template <class KeyT> struct Accessor { + typedef KeyT Key; + template <class Value> Key operator()(const Entry<Key, Value> *entry) const { + return entry->GetKey(); + } +}; + +template <class Key, class Value> void Check(const Entry<Key, Value> *begin, const Entry<Key, Value> *end, const boost::unordered_map<Key, Value> &reference, const Key key) { typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key); - typename Map::ConstIterator i = typename Map::ConstIterator(); + typedef const Entry<Key, Value> *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind<It, Accessor<Key>, Pivot64>(Accessor<Key>(), begin, end, key, i); if (ref == reference.end()) { - BOOST_CHECK(!map.Find(key, i)); + BOOST_CHECK(!ret); } else { - // g++ can't tell that require will crash and burn. 
- BOOST_REQUIRE(map.Find(key, i)); + BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(ref->second, i->GetValue()); } } -typedef SortedUniformMap<AlignedPacking<uint64_t, uint32_t> > TestMap; - BOOST_AUTO_TEST_CASE(empty) { - char buf[TestMap::Size(0)]; - TestMap map(buf, TestMap::Size(0)); - map.FinishedInserting(); - TestMap::ConstIterator i; - BOOST_CHECK(!map.Find(42, i)); -} - -BOOST_AUTO_TEST_CASE(one) { - char buf[TestMap::Size(1)]; - TestMap map(buf, sizeof(buf)); - Entry<uint64_t, uint32_t> e; - e.Set(42,2); - map.Insert(e); - map.FinishedInserting(); - TestMap::ConstIterator i = TestMap::ConstIterator(); - BOOST_REQUIRE(map.Find(42, i)); - BOOST_CHECK(i == map.begin()); - BOOST_CHECK(!map.Find(43, i)); - BOOST_CHECK(!map.Find(41, i)); + typedef const Entry<uint64_t, float> T; + const T *i; + bool ret = SortedUniformFind<const T*, Accessor<uint64_t>, Pivot64>(Accessor<uint64_t>(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); } template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) { typedef unsigned char Value; - typedef SortedUniformMap<AlignedPacking<Key, unsigned char> > Map; - boost::scoped_array<char> buffer(new char[Map::Size(entries)]); - Map map(buffer.get(), entries); boost::mt19937 rng; boost::uniform_int<Key> range_key(0, upper); boost::uniform_int<Value> range_value(0, 255); boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key); boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value); + typedef Entry<Key, Value> Ent; + std::vector<Ent> backing; boost::unordered_map<Key, unsigned char> reference; - Entry<Key, unsigned char> ent; + Ent ent; for (size_t i = 0; i < entries; ++i) { Key key = gen_key(); unsigned char value = gen_value(); if (reference.insert(std::make_pair(key, value)).second) { - ent.Set(key, value); - map.Insert(Entry<Key, unsigned char>(ent)); + ent.key = key; + ent.value = value; + backing.push_back(ent); } } - map.FinishedInserting(); + std::sort(backing.begin(), backing.end()); // Random queries. for (size_t i = 0; i < queries; ++i) { const Key key = gen_key(); - Check<Map, Key, unsigned char>(map, reference, key); + Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, key); } typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin(); for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { - Check<Map, Key, unsigned char>(map, reference, it->second); + Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, it->second); } } diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index 413bda0b..c7e1c863 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -1,6 +1,7 @@ #ifndef UTIL_TOKENIZE_PIECE__ #define UTIL_TOKENIZE_PIECE__ +#include "util/exception.hh" #include "util/string_piece.hh" #include <boost/iterator/iterator_facade.hpp> @@ -8,63 +9,25 @@ #include <algorithm> #include <iostream> -/* Usage: - * - * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { - * std::cout << *i << "\n"; - * } - * - */ - namespace util { -// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. 
-template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> { +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { public: - // Default construct is end, which is also accessed by kEndPieceIterator; - PieceIterator() {} - - explicit PieceIterator(const StringPiece &str) - : after_(str) { - increment(); - } + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; - bool operator!() const { - return after_.data() == 0; - } - operator bool() const { - return after_.data() != 0; - } +class SingleCharacter { + public: + explicit SingleCharacter(char delim) : delim_(delim) {} - static PieceIterator<d> end() { - return PieceIterator<d>(); + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); } private: - friend class boost::iterator_core_access; - - void increment() { - const char *start = after_.data(); - for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} - if (start == after_.data() + after_.size()) { - // End condition. - after_.clear(); - return; - } - const char *finish = start; - for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} - current_ = StringPiece(start, finish - start); - after_ = StringPiece(finish, after_.data() + after_.size() - finish); - } - - bool equal(const PieceIterator &other) const { - return after_.data() == other.after_.data(); - } - - const StringPiece &dereference() const { return current_; } - - StringPiece current_; - StringPiece after_; + char delim_; }; class MultiCharacter { @@ -95,7 +58,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it public: TokenIter() {} - TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { increment(); } @@ -130,6 +93,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it } const StringPiece &dereference() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc index e07ebcf5..d856018f 100644 --- a/klm/util/tokenize_piece_test.cc +++ b/klm/util/tokenize_piece_test.cc @@ -9,53 +9,7 @@ namespace util { namespace { -BOOST_AUTO_TEST_CASE(simple) { - PieceIterator<' '> it("single spaced words."); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("single"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("words."), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_delimiter) { - const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; - PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("first"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("second"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("third"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_entries) { - const char str[] = "\0split\0\0 \0me\0 "; - PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - const char first[] = "\0split\0\0"; - 
BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); - ++it; - BOOST_REQUIRE(it); - const char second[] = "\0me\0"; - BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); - ++it; - BOOST_CHECK(!it); -} - -/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { +BOOST_AUTO_TEST_CASE(pipe_pipe_none) { const char str[] = "nodelimit at all"; TokenIter<MultiCharacter> it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); @@ -79,7 +33,7 @@ BOOST_AUTO_TEST_CASE(remove_empty) { const char str[] = "|||"; TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||")); BOOST_CHECK(!it); -}*/ +} BOOST_AUTO_TEST_CASE(remove_empty_keep) { const char str[] = " |||"; diff --git a/mteval/Makefile.am b/mteval/Makefile.am index 95845090..e7126675 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -10,7 +10,7 @@ endif noinst_LIBRARIES = libmteval.a -libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc +libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc ns.cc ns_ter.cc ns_ext.cc ns_comb.cc ns_docscorer.cc fast_score_SOURCES = fast_score.cc fast_score_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a -lz diff --git a/mteval/fast_score.cc b/mteval/fast_score.cc index 5ee264a6..a271ccc5 100644 --- a/mteval/fast_score.cc +++ b/mteval/fast_score.cc @@ -4,9 +4,11 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "stringlib.h" #include "filelib.h" #include "tdict.h" -#include "scorer.h" +#include "ns.h" +#include "ns_docscorer.h" using namespace std; namespace po = boost::program_options; @@ -14,8 +16,8 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") - ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("reference,r",po::value<vector<string> >(), "[1 or more required] Reference translation(s) in tokenized text files") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") ("in_file,i", po::value<string>()->default_value("-"), "Input file") ("help,h", "Help"); po::options_description dcmdline_options; @@ -35,24 +37,29 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as<string>(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); + string loss_function = UppercaseString(conf["evaluation_metric"].as<string>()); + if (loss_function == "COMBI") { + cerr << "WARNING: 'combi' metric is no longer supported, switching to 'COMB:TER=-0.5;IBM_BLEU=0.5'\n"; + loss_function = "COMB:TER=-0.5;IBM_BLEU=0.5"; + } else if (loss_function == "BLEU") { + cerr << "WARNING: 'BLEU' is ambiguous, assuming 'IBM_BLEU'\n"; + loss_function = "IBM_BLEU"; + } + EvaluationMetric* metric = EvaluationMetric::Instance(loss_function); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; ReadFile rf(conf["in_file"].as<string>()); - ScoreP acc; + SufficientStats acc; 
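// Corpus-level scoring accumulates sufficient statistics rather than scores:
// each segment contributes a vector of counts (for BLEU: clipped n-gram
// matches, hypothesis n-gram totals, and lengths), the vectors are summed,
// and the metric is evaluated once over the total.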
istream& in = *rf.stream(); int lc = 0; - while(in) { - string line; - getline(in, line); - if (line.empty() && !in) break; + string line; + while(getline(in, line)) { vector<WordID> sent; TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); + SufficientStats t; + ds[lc]->Evaluate(sent, &t); + acc += t; ++lc; } assert(lc > 0); @@ -63,9 +70,8 @@ int main(int argc, char** argv) { if (lc != ds.size()) cerr << "Fewer sentences in hyp (" << lc << ") than refs (" << ds.size() << "): scoring partial set!\n"; - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); + float score = metric->ComputeScore(acc); + const string details = metric->DetailedScore(acc); cerr << details << endl; cout << score << endl; return 0; diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 64a6a8bf..2bd31566 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -5,7 +5,7 @@ #include "prob.h" #include "tdict.h" -#include "scorer.h" +#include "ns.h" #include "filelib.h" #include "stringlib.h" @@ -17,7 +17,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function") + ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric") ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from") ("output_list,L", "Show reranked list as output") ("help,h", "Help"); @@ -75,13 +75,15 @@ bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, pro int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as<string>(); + const string smetric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(smetric); + + const bool is_loss = (UppercaseString(smetric) == "TER"); const bool output_list = conf.count("output_list") > 0; const string file = conf["input"].as<string>(); const double mbr_scale = conf["scale"].as<double>(); cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - ScoreType type = ScoreTypeFromString(metric); vector<pair<vector<WordID>, prob_t> > list; ReadFile rf(file); string sent_id; @@ -99,15 +101,17 @@ int main(int argc, char** argv) { vector<double> mbr_scores(output_list ? 
list.size() : 0); double mbr_loss = numeric_limits<double>::max(); for (int i = 0 ; i < list.size(); ++i) { - vector<vector<WordID> > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); + const vector<vector<WordID> > refs(1, list[i].first); + boost::shared_ptr<SegmentEvaluator> segeval = metric-> + CreateSegmentEvaluator(refs); + double wl_acc = 0; for (int j = 0; j < list.size(); ++j) { if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; + SufficientStats ss; + segeval->Evaluate(list[j].first, &ss); + double loss = 1.0 - metric->ComputeScore(ss); + if (is_loss) loss = 1.0 - loss; double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; diff --git a/mteval/ns.cc b/mteval/ns.cc new file mode 100644 index 00000000..788f809a --- /dev/null +++ b/mteval/ns.cc @@ -0,0 +1,290 @@ +#include "ns.h" +#include "ns_ter.h" +#include "ns_ext.h" +#include "ns_comb.h" + +#include <cstdio> +#include <cassert> +#include <cmath> +#include <cstdlib> +#include <iostream> +#include <sstream> + +#include "tdict.h" +#include "stringlib.h" + +using namespace std; +using boost::shared_ptr; + +map<string, EvaluationMetric*> EvaluationMetric::instances_; + +SegmentEvaluator::~SegmentEvaluator() {} +EvaluationMetric::~EvaluationMetric() {} + +bool EvaluationMetric::IsErrorMetric() const { + return false; +} + +struct DefaultSegmentEvaluator : public SegmentEvaluator { + DefaultSegmentEvaluator(const vector<vector<WordID> >& refs, const EvaluationMetric* em) : refs_(refs), em_(em) {} + void Evaluate(const vector<WordID>& hyp, SufficientStats* out) const { + em_->ComputeSufficientStatistics(hyp, refs_, out); + out->id_ = em_->MetricId(); + } + const vector<vector<WordID> > refs_; + const EvaluationMetric* em_; +}; + +shared_ptr<SegmentEvaluator> EvaluationMetric::CreateSegmentEvaluator(const vector<vector<WordID> >& refs) const { + return shared_ptr<SegmentEvaluator>(new DefaultSegmentEvaluator(refs, this)); +} + +#define MAX_SS_VECTOR_SIZE 50 +unsigned EvaluationMetric::SufficientStatisticsVectorSize() const { + return MAX_SS_VECTOR_SIZE; +} + +void EvaluationMetric::ComputeSufficientStatistics(const vector<WordID>&, + const vector<vector<WordID> >&, + SufficientStats*) const { + cerr << "Base class ComputeSufficientStatistics should not be called.\n"; + abort(); +} + +string EvaluationMetric::DetailedScore(const SufficientStats& stats) const { + ostringstream os; + os << MetricId() << "=" << ComputeScore(stats); + return os.str(); +} + +enum BleuType { IBM, Koehn, NIST }; +template <unsigned int N = 4u, BleuType BrevityType = IBM> +struct BleuSegmentEvaluator : public SegmentEvaluator { + BleuSegmentEvaluator(const vector<vector<WordID> >& refs, const EvaluationMetric* em) : evaluation_metric(em) { + assert(refs.size() > 0); + float tot = 0; + int smallest = 9999999; + for (vector<vector<WordID> >::const_iterator ci = refs.begin(); + ci != refs.end(); ++ci) { + lengths_.push_back(ci->size()); + tot += lengths_.back(); + if (lengths_.back() < smallest) smallest = lengths_.back(); + CountRef(*ci); + } + if (BrevityType == Koehn) + lengths_[0] = tot / refs.size(); + if (BrevityType == NIST) + lengths_[0] = smallest; + } + + void Evaluate(const vector<WordID>& hyp, SufficientStats* out) const { + 
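// Sufficient statistics layout (2N + 2 fields): fields[0..N-1] hold the
// clipped correct n-gram counts, fields[N..2N-1] the hypothesis n-gram
// totals, fields[2N] the hypothesis length, and fields[2N+1] the reference
// length used for the brevity penalty.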
out->fields.resize(N + N + 2);
+    out->id_ = evaluation_metric->MetricId();
+    for (unsigned i = 0; i < N+N+2; ++i) out->fields[i] = 0;
+
+    ComputeNgramStats(hyp, &out->fields[0], &out->fields[N], true);
+    float& hyp_len = out->fields[2*N];
+    float& ref_len = out->fields[2*N + 1];
+    hyp_len = hyp.size();
+    ref_len = lengths_[0];
+    if (lengths_.size() > 1 && BrevityType == IBM) {
+      float bestd = 2000000;
+      float hl = hyp.size();
+      float bl = -1;
+      for (vector<float>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) {
+        if (fabs(*ci - hl) < bestd) {
+          bestd = fabs(*ci - hl);
+          bl = *ci;
+        }
+      }
+      ref_len = bl;
+    }
+  }
+
+  struct NGramCompare {
+    int operator() (const vector<WordID>& a, const vector<WordID>& b) {
+      const size_t as = a.size();
+      const size_t bs = b.size();
+      const size_t s = (as < bs ? as : bs);
+      for (size_t i = 0; i < s; ++i) {
+        int d = a[i] - b[i];
+        if (d < 0) return true;
+        if (d > 0) return false;
+      }
+      return as < bs;
+    }
+  };
+  typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap;
+
+  void CountRef(const vector<WordID>& ref) {
+    NGramCountMap tc;
+    vector<WordID> ngram(N);
+    int s = ref.size();
+    for (int j=0; j<s; ++j) {
+      int remaining = s-j;
+      int k = (N < remaining ? N : remaining);
+      ngram.clear();
+      for (int i=1; i<=k; ++i) {
+        ngram.push_back(ref[j + i - 1]);
+        tc[ngram].first++;
+      }
+    }
+    for (typename NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) {
+      pair<int,int>& p = ngrams_[i->first];
+      if (p.first < i->second.first)
+        p = i->second;
+    }
+  }
+
+  void ComputeNgramStats(const vector<WordID>& sent,
+                         float* correct,  // N elements reserved
+                         float* hyp,      // N elements reserved
+                         bool clip_counts = true) const {
+    // clear clipping stats
+    for (typename NGramCountMap::iterator it = ngrams_.begin(); it != ngrams_.end(); ++it)
+      it->second.second = 0;
+
+    vector<WordID> ngram(N);
+    *correct *= 0;
+    *hyp *= 0;
+    int s = sent.size();
+    for (int j=0; j<s; ++j) {
+      int remaining = s-j;
+      int k = (N < remaining ? N : remaining);
+      ngram.clear();
+      for (int i=1; i<=k; ++i) {
+        ngram.push_back(sent[j + i - 1]);
+        pair<int,int>& p = ngrams_[ngram];
+        if(clip_counts){
+          if (p.second < p.first) {
+            ++p.second;
+            correct[i-1]++;
+          }
+        } else {
+          ++p.second;
+          correct[i-1]++;
+        }
+        // if this n-gram does not occur in any reference, no longer n-gram
+        // starting here can match either; just count the remaining attempts:
+        if (!p.first) {
+          for (; i<=k; ++i)
+            hyp[i-1]++;
+        } else {
+          hyp[i-1]++;
+        }
+      }
+    }
+  }
+
+  const EvaluationMetric* evaluation_metric;
+  vector<float> lengths_;
+  mutable NGramCountMap ngrams_;
+};
+
+template <unsigned int N = 4u, BleuType BrevityType = IBM>
+struct BleuMetric : public EvaluationMetric {
+  BleuMetric() : EvaluationMetric(BrevityType == IBM ? "IBM_BLEU" : (BrevityType == Koehn ? 
"KOEHN_BLEU" : "NIST_BLEU")) {} + unsigned SufficientStatisticsVectorSize() const { return N*2 + 2; } + shared_ptr<SegmentEvaluator> CreateSegmentEvaluator(const vector<vector<WordID> >& refs) const { + return shared_ptr<SegmentEvaluator>(new BleuSegmentEvaluator<N,BrevityType>(refs, this)); + } + float ComputeBreakdown(const SufficientStats& stats, float* bp, vector<float>* out) const { + if (out) { out->clear(); } + float log_bleu = 0; + int count = 0; + for (int i = 0; i < N; ++i) { + if (stats.fields[i+N] > 0) { + float cor_count = stats.fields[i]; // correct_ngram_hit_counts[i]; + // smooth bleu + if (!cor_count) { cor_count = 0.01; } + float lprec = log(cor_count) - log(stats.fields[i+N]); // log(hyp_ngram_counts[i]); + if (out) out->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= count; + float lbp = 0.0; + const float& hyp_len = stats.fields[2*N]; + const float& ref_len = stats.fields[2*N + 1]; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + if (bp) *bp = exp(lbp); + return exp(log_bleu); + } + string DetailedScore(const SufficientStats& stats) const { + char buf[2000]; + vector<float> precs(N); + float bp; + float bleu = ComputeBreakdown(stats, &bp, &precs); + sprintf(buf, "%s = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + MetricId().c_str(), + bleu*100.0, + precs[0]*100.0, + precs[1]*100.0, + precs[2]*100.0, + precs[3]*100.0, + bp); + return buf; + } + float ComputeScore(const SufficientStats& stats) const { + return ComputeBreakdown(stats, NULL, NULL); + } +}; + +EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) { + static bool is_first = true; + if (is_first) { + instances_["NULL"] = NULL; + is_first = false; + } + const string metric_id = UppercaseString(imetric_id); + + map<string, EvaluationMetric*>::iterator it = instances_.find(metric_id); + if (it == instances_.end()) { + EvaluationMetric* m = NULL; + if (metric_id == "IBM_BLEU") { + m = new BleuMetric<4, IBM>; + } else if (metric_id == "NIST_BLEU") { + m = new BleuMetric<4, NIST>; + } else if (metric_id == "KOEHN_BLEU") { + m = new BleuMetric<4, Koehn>; + } else if (metric_id == "TER") { + m = new TERMetric; + } else if (metric_id == "METEOR") { + m = new ExternalMetric("METEOR", "java -Xmx1536m -jar /Users/cdyer/software/meteor/meteor-1.3.jar - - -mira -lower -t tune -l en"); + } else if (metric_id.find("COMB:") == 0) { + m = new CombinationMetric(metric_id); + } else { + cerr << "Implement please: " << metric_id << endl; + abort(); + } + if (m->MetricId() != metric_id) { + cerr << "Registry error: " << metric_id << " vs. 
" << m->MetricId() << endl; + abort(); + } + return instances_[metric_id] = m; + } else { + return it->second; + } +} + +SufficientStats::SufficientStats(const string& encoded) { + istringstream is(encoded); + is >> id_; + float val; + while(is >> val) + fields.push_back(val); +} + +void SufficientStats::Encode(string* out) const { + ostringstream os; + if (id_.size() > 0) + os << id_; + else + os << "NULL"; + for (unsigned i = 0; i < fields.size(); ++i) + os << ' ' << fields[i]; + *out = os.str(); +} + diff --git a/mteval/ns.h b/mteval/ns.h new file mode 100644 index 00000000..4e4c6975 --- /dev/null +++ b/mteval/ns.h @@ -0,0 +1,115 @@ +#ifndef _NS_H_ +#define _NS_H_ + +#include <string> +#include <vector> +#include <map> +#include <boost/shared_ptr.hpp> +#include "wordid.h" +#include <iostream> + +class SufficientStats { + public: + SufficientStats() : id_() {} + explicit SufficientStats(const std::string& encoded); + SufficientStats(const std::string& mid, const std::vector<float>& f) : + id_(mid), fields(f) {} + + SufficientStats& operator+=(const SufficientStats& delta) { + if (id_.empty() && delta.id_.size()) id_ = delta.id_; + if (fields.size() != delta.fields.size()) + fields.resize(std::max(fields.size(), delta.fields.size())); + for (unsigned i = 0; i < delta.fields.size(); ++i) + fields[i] += delta.fields[i]; + return *this; + } + SufficientStats& operator-=(const SufficientStats& delta) { + if (id_.empty() && delta.id_.size()) id_ = delta.id_; + if (fields.size() != delta.fields.size()) + fields.resize(std::max(fields.size(), delta.fields.size())); + for (unsigned i = 0; i < delta.fields.size(); ++i) + fields[i] -= delta.fields[i]; + return *this; + } + SufficientStats& operator*=(const double& scalar) { + for (unsigned i = 0; i < fields.size(); ++i) + fields[i] *= scalar; + return *this; + } + SufficientStats& operator/=(const double& scalar) { + for (unsigned i = 0; i < fields.size(); ++i) + fields[i] /= scalar; + return *this; + } + bool operator==(const SufficientStats& other) const { + return other.fields == fields; + } + bool IsAdditiveIdentity() const { + for (unsigned i = 0; i < fields.size(); ++i) + if (fields[i]) return false; + return true; + } + size_t size() const { return fields.size(); } + float operator[](size_t i) const { + if (i < fields.size()) return fields[i]; + return 0; + } + void Encode(std::string* out) const; + + std::string id_; + std::vector<float> fields; +}; + +inline const SufficientStats operator+(const SufficientStats& a, const SufficientStats& b) { + SufficientStats res(a); + return res += b; +} + +inline const SufficientStats operator-(const SufficientStats& a, const SufficientStats& b) { + SufficientStats res(a); + return res -= b; +} + +struct SegmentEvaluator { + virtual ~SegmentEvaluator(); + virtual void Evaluate(const std::vector<WordID>& hyp, SufficientStats* out) const = 0; +}; + +// Instructions for implementing a new metric +// To Instance(), add something that creates the metric +// Implement ComputeScore(const SufficientStats& stats) const; +// Implement ONE of the following: +// 1) void ComputeSufficientStatistics(const std::vector<std::vector<WordID> >& refs, SufficientStats* out) const; +// 2) a new SegmentEvaluator class AND CreateSegmentEvaluator(const std::vector<std::vector<WordID> >& refs) const; +// [The later (#2) is only used when it is necessary to precompute per-segment data from a set of refs] +// OPTIONAL: Override SufficientStatisticsVectorSize() if it is easy to do so +class EvaluationMetric { + public: + static 
EvaluationMetric* Instance(const std::string& metric_id = "IBM_BLEU"); + + protected: + EvaluationMetric(const std::string& id) : name_(id) {} + virtual ~EvaluationMetric(); + + public: + const std::string& MetricId() const { return name_; } + + // returns true for metrics like WER and TER where lower scores are better + // false for metrics like BLEU and METEOR where higher scores are better + virtual bool IsErrorMetric() const; + + virtual unsigned SufficientStatisticsVectorSize() const; + virtual float ComputeScore(const SufficientStats& stats) const = 0; + virtual std::string DetailedScore(const SufficientStats& stats) const; + virtual boost::shared_ptr<SegmentEvaluator> CreateSegmentEvaluator(const std::vector<std::vector<WordID> >& refs) const; + virtual void ComputeSufficientStatistics(const std::vector<WordID>& hyp, + const std::vector<std::vector<WordID> >& refs, + SufficientStats* out) const; + + private: + static std::map<std::string, EvaluationMetric*> instances_; + const std::string name_; +}; + +#endif + diff --git a/mteval/ns_comb.cc b/mteval/ns_comb.cc new file mode 100644 index 00000000..41c634cd --- /dev/null +++ b/mteval/ns_comb.cc @@ -0,0 +1,87 @@ +#include "ns_comb.h" + +#include <iostream> + +#include "stringlib.h" + +using namespace std; + +// e.g. COMB:IBM_BLEU=0.5;TER=0.5 +CombinationMetric::CombinationMetric(const std::string& cmd) : + EvaluationMetric(cmd), + total_size() { + if (cmd.find("COMB:") != 0 || cmd.size() < 9) { + cerr << "Error in combination metric specifier: " << cmd << endl; + exit(1); + } + string mix = cmd.substr(5); + vector<string> comps; + Tokenize(cmd.substr(5), ';', &comps); + if(comps.size() < 2) { + cerr << "Error in combination metric specifier: " << cmd << endl; + exit(1); + } + vector<string> cwpairs; + for (unsigned i = 0; i < comps.size(); ++i) { + Tokenize(comps[i], '=', &cwpairs); + if (cwpairs.size() != 2) { cerr << "Error in combination metric specifier: " << cmd << endl; exit(1); } + metrics.push_back(EvaluationMetric::Instance(cwpairs[0])); + coeffs.push_back(atof(cwpairs[1].c_str())); + offsets.push_back(total_size); + total_size += metrics.back()->SufficientStatisticsVectorSize(); + cerr << (i > 0 ? 
" + " : "( ") << coeffs.back() << " * " << cwpairs[0]; + } + cerr << " )\n"; +} + +struct CombinationSegmentEvaluator : public SegmentEvaluator { + CombinationSegmentEvaluator(const string& id, + const vector<vector<WordID> >& refs, + const vector<EvaluationMetric*>& metrics, + const vector<unsigned>& offsets, + const unsigned ts) : id_(id), offsets_(offsets), total_size_(ts), component_evaluators_(metrics.size()) { + for (unsigned i = 0; i < metrics.size(); ++i) + component_evaluators_[i] = metrics[i]->CreateSegmentEvaluator(refs); + } + virtual void Evaluate(const std::vector<WordID>& hyp, SufficientStats* out) const { + out->id_ = id_; + out->fields.resize(total_size_); + for (unsigned i = 0; i < component_evaluators_.size(); ++i) { + SufficientStats t; + component_evaluators_[i]->Evaluate(hyp, &t); + for (unsigned j = 0; j < t.fields.size(); ++j) { + unsigned op = j + offsets_[i]; + assert(op < out->fields.size()); + out->fields[op] = t[j]; + } + } + } + const string& id_; + const vector<unsigned>& offsets_; + const unsigned total_size_; + vector<boost::shared_ptr<SegmentEvaluator> > component_evaluators_; +}; + +boost::shared_ptr<SegmentEvaluator> CombinationMetric::CreateSegmentEvaluator(const std::vector<std::vector<WordID> >& refs) const { + boost::shared_ptr<SegmentEvaluator> res; + res.reset(new CombinationSegmentEvaluator(MetricId(), refs, metrics, offsets, total_size)); + return res; +} + +float CombinationMetric::ComputeScore(const SufficientStats& stats) const { + float tot = 0; + for (unsigned i = 0; i < metrics.size(); ++i) { + SufficientStats t; + unsigned next = total_size; + if (i + 1 < offsets.size()) next = offsets[i+1]; + for (unsigned j = offsets[i]; j < next; ++j) + t.fields.push_back(stats[j]); + tot += metrics[i]->ComputeScore(t) * coeffs[i]; + } + return tot; +} + +unsigned CombinationMetric::SufficientStatisticsVectorSize() const { + return total_size; +} + diff --git a/mteval/ns_comb.h b/mteval/ns_comb.h new file mode 100644 index 00000000..140e7e6a --- /dev/null +++ b/mteval/ns_comb.h @@ -0,0 +1,19 @@ +#ifndef _NS_COMB_H_ +#define _NS_COMB_H_ + +#include "ns.h" + +class CombinationMetric : public EvaluationMetric { + public: + CombinationMetric(const std::string& cmd); + virtual boost::shared_ptr<SegmentEvaluator> CreateSegmentEvaluator(const std::vector<std::vector<WordID> >& refs) const; + virtual float ComputeScore(const SufficientStats& stats) const; + virtual unsigned SufficientStatisticsVectorSize() const; + private: + std::vector<EvaluationMetric*> metrics; + std::vector<float> coeffs; + std::vector<unsigned> offsets; + unsigned total_size; +}; + +#endif diff --git a/mteval/ns_docscorer.cc b/mteval/ns_docscorer.cc new file mode 100644 index 00000000..28a2fd09 --- /dev/null +++ b/mteval/ns_docscorer.cc @@ -0,0 +1,60 @@ +#include "ns_docscorer.h" + +#include <iostream> +#include <cstring> + +#include "tdict.h" +#include "filelib.h" +#include "ns.h" + +using namespace std; + +DocumentScorer::~DocumentScorer() {} + +void DocumentScorer::Init(const EvaluationMetric* metric, + const vector<string>& ref_files, + const string& src_file, + bool verbose) { + scorers_.clear(); + cerr << "Loading references (" << ref_files.size() << " files)\n"; + assert(src_file.empty()); + std::vector<ReadFile> ifs(ref_files.begin(),ref_files.end()); + for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); + char buf[64000]; + bool expect_eof = false; + int line=0; + while (ifs[0].get()) { + vector<vector<WordID> > refs(ref_files.size()); + for (int i=0; i < 
ref_files.size(); ++i) { + istream &in=ifs[i].get(); + if (in.eof()) break; + in.getline(buf, 64000); + refs[i].clear(); + if (strlen(buf) == 0) { + if (in.eof()) { + if (!expect_eof) { + assert(i == 0); + expect_eof = true; + } + break; + } + } else { + TD::ConvertSentence(buf, &refs[i]); + assert(!refs[i].empty()); + } + assert(!expect_eof); + } + if (!expect_eof) { + string src_line; + //if (srcrf) { + // getline(srcrf.get(), src_line); + // map<string,string> dummy; + // ProcessAndStripSGML(&src_line, &dummy); + //} + scorers_.push_back(metric->CreateSegmentEvaluator(refs)); + ++line; + } + } + cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; +} + diff --git a/mteval/ns_docscorer.h b/mteval/ns_docscorer.h new file mode 100644 index 00000000..170ac627 --- /dev/null +++ b/mteval/ns_docscorer.h @@ -0,0 +1,31 @@ +#ifndef _NS_DOC_SCORER_H_ +#define _NS_DOC_SCORER_H_ + +#include <vector> +#include <string> +#include <boost/shared_ptr.hpp> + +struct EvaluationMetric; +struct SegmentEvaluator; +class DocumentScorer { + public: + ~DocumentScorer(); + DocumentScorer() { } + DocumentScorer(const EvaluationMetric* metric, + const std::vector<std::string>& ref_files, + const std::string& src_file = "", + bool verbose=false) { + Init(metric,ref_files,src_file,verbose); + } + void Init(const EvaluationMetric* metric, + const std::vector<std::string>& ref_files, + const std::string& src_file = "", + bool verbose=false); + + int size() const { return scorers_.size(); } + const SegmentEvaluator* operator[](size_t i) const { return scorers_[i].get(); } + private: + std::vector<boost::shared_ptr<SegmentEvaluator> > scorers_; +}; + +#endif diff --git a/mteval/ns_ext.cc b/mteval/ns_ext.cc new file mode 100644 index 00000000..956708af --- /dev/null +++ b/mteval/ns_ext.cc @@ -0,0 +1,130 @@ +#include "ns_ext.h" + +#include <cstdio> // popen +#include <cstdlib> +#include <cstring> +#include <unistd.h> +#include <sstream> +#include <iostream> +#include <cassert> + +#include "stringlib.h" +#include "tdict.h" + +using namespace std; + +struct NScoreServer { + NScoreServer(const std::string& cmd); + ~NScoreServer(); + + float ComputeScore(const std::vector<float>& fields); + void Evaluate(const std::vector<std::vector<WordID> >& refs, const std::vector<WordID>& hyp, std::vector<float>* fields); + + private: + void RequestResponse(const std::string& request, std::string* response); + int p2c[2]; + int c2p[2]; +}; + +NScoreServer::NScoreServer(const string& cmd) { + cerr << "Invoking " << cmd << " ..." << endl; + if (pipe(p2c) < 0) { perror("pipe"); exit(1); } + if (pipe(c2p) < 0) { perror("pipe"); exit(1); } + pid_t cpid = fork(); + if (cpid < 0) { perror("fork"); exit(1); } + if (cpid == 0) { // child + close(p2c[1]); + close(c2p[0]); + dup2(p2c[0], 0); + close(p2c[0]); + dup2(c2p[1], 1); + close(c2p[1]); + cerr << "Exec'ing from child " << cmd << endl; + vector<string> vargs; + SplitOnWhitespace(cmd, &vargs); + const char** cargv = static_cast<const char**>(malloc(sizeof(const char*) * vargs.size())); + for (unsigned i = 1; i < vargs.size(); ++i) cargv[i-1] = vargs[i].c_str(); + cargv[vargs.size() - 1] = NULL; + execvp(vargs[0].c_str(), (char* const*)cargv); + } else { // parent + close(c2p[1]); + close(p2c[0]); + } + string dummy; + RequestResponse("SCORE ||| Reference initialization string . 
||| Testing initialization string .", &dummy);
+  assert(dummy.size() > 0);
+  cerr << "Connection established.\n";
+}
+
+NScoreServer::~NScoreServer() {
+  // TODO close stuff, join stuff
+}
+
+float NScoreServer::ComputeScore(const vector<float>& fields) {
+  ostringstream os;
+  os << "EVAL |||";
+  for (unsigned i = 0; i < fields.size(); ++i)
+    os << ' ' << fields[i];
+  string sres;
+  RequestResponse(os.str(), &sres);
+  return strtod(sres.c_str(), NULL);
+}
+
+void NScoreServer::Evaluate(const vector<vector<WordID> >& refs, const vector<WordID>& hyp, vector<float>* fields) {
+  ostringstream os;
+  os << "SCORE";
+  for (unsigned i = 0; i < refs.size(); ++i) {
+    os << " |||";
+    for (unsigned j = 0; j < refs[i].size(); ++j) {
+      os << ' ' << TD::Convert(refs[i][j]);
+    }
+  }
+  os << " |||";
+  for (unsigned i = 0; i < hyp.size(); ++i) {
+    os << ' ' << TD::Convert(hyp[i]);
+  }
+  string sres;
+  RequestResponse(os.str(), &sres);
+  istringstream is(sres);
+  float val;
+  fields->clear();
+  while(is >> val)
+    fields->push_back(val);
+}
+
+#define MAX_BUF 16000
+
+void NScoreServer::RequestResponse(const string& request, string* response) {
+//  cerr << "@SERVER: " << request << endl;
+  string x = request + "\n";
+  write(p2c[1], x.c_str(), x.size());
+  char buf[MAX_BUF];
+  size_t n = read(c2p[0], buf, MAX_BUF);
+  while (n < MAX_BUF && buf[n-1] != '\n')
+    n += read(c2p[0], &buf[n], MAX_BUF - n);
+
+  buf[n-1] = 0;
+  if (n < 2) {
+    cerr << "Malformed response: " << buf << endl;
+  }
+  *response = Trim(buf, " \t\n");
+//  cerr << "@RESPONSE: '" << *response << "'\n";
+}
+
+void ExternalMetric::ComputeSufficientStatistics(const std::vector<WordID>& hyp,
+                                                 const std::vector<std::vector<WordID> >& refs,
+                                                 SufficientStats* out) const {
+  eval_server->Evaluate(refs, hyp, &out->fields);
+}
+
+float ExternalMetric::ComputeScore(const SufficientStats& stats) const {
+  return eval_server->ComputeScore(stats.fields);
+}
+
+ExternalMetric::ExternalMetric(const string& metric_name, const std::string& command) :
+    EvaluationMetric(metric_name),
+    eval_server(new NScoreServer(command)) {}
+
+ExternalMetric::~ExternalMetric() {
+  delete eval_server;
+}
diff --git a/mteval/ns_ext.h b/mteval/ns_ext.h
new file mode 100644
index 00000000..78badb2e
--- /dev/null
+++ b/mteval/ns_ext.h
@@ -0,0 +1,21 @@
+#ifndef _NS_EXTERNAL_SCORER_H_
+#define _NS_EXTERNAL_SCORER_H_
+
+#include "ns.h"
+
+struct NScoreServer;
+class ExternalMetric : public EvaluationMetric {
+ public:
+  ExternalMetric(const std::string& metricid, const std::string& command);
+  ~ExternalMetric();
+
+  virtual void ComputeSufficientStatistics(const std::vector<WordID>& hyp,
+                                           const std::vector<std::vector<WordID> >& refs,
+                                           SufficientStats* out) const;
+  virtual float ComputeScore(const SufficientStats& stats) const;
+
+ protected:
+  NScoreServer* eval_server;
+};
+
+#endif
diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc
new file mode 100644
index 00000000..0e1008db
--- /dev/null
+++ b/mteval/ns_ter.cc
@@ -0,0 +1,492 @@
+#include "ns_ter.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <tr1/unordered_map>
+#include <set>
+#include <boost/functional/hash.hpp>
+#include "tdict.h"
+
+static const bool ter_use_average_ref_len = true;
+static const int ter_short_circuit_long_sentences = -1;
+
+static const unsigned kINSERTIONS = 0;
+static const unsigned kDELETIONS = 1;
+static const unsigned kSUBSTITUTIONS = 2;
+static const unsigned kSHIFTS = 3;
+static const unsigned kREF_WORDCOUNT = 4;
+static const unsigned kDUMMY_LAST_ENTRY = 5;
+
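+// The five constants above index the TER sufficient statistics vector; the
+// score computed from them in ComputeScore() below is
+//   TER = (insertions + deletions + substitutions + shifts) / ref_wordcount.
+// A worked example with made-up counts: 1 insertion, 0 deletions, 2
+// substitutions and 1 shift against a 10-word reference give
+// TER = (1+0+2+1)/10 = 0.40 (lower is better; IsErrorMetric() returns true).
+//
+// This file is also a complete instance of the recipe sketched in ns.h for
+// adding a metric: TERMetric derives from EvaluationMetric, implements
+// ComputeScore() and ComputeSufficientStatistics(), and overrides
+// SufficientStatisticsVectorSize() and DetailedScore().  A hypothetical
+// minimal metric (not part of this commit) would look like:
+//
+//   struct LenRatioMetric : public EvaluationMetric {
+//     LenRatioMetric() : EvaluationMetric("LEN_RATIO") {}
+//     unsigned SufficientStatisticsVectorSize() const { return 2; }
+//     void ComputeSufficientStatistics(const std::vector<WordID>& hyp,
+//                                      const std::vector<std::vector<WordID> >& refs,
+//                                      SufficientStats* out) const {
+//       out->fields.resize(2);
+//       out->fields[0] = hyp.size();
+//       out->fields[1] = refs.empty() ? 0 : refs[0].size();
+//     }
+//     float ComputeScore(const SufficientStats& stats) const {
+//       return stats[1] ? stats[0] / stats[1] : 0.0f;  // hyp/ref length ratio
+//     }
+//   };
+//
+// together with an extra branch in EvaluationMetric::Instance() in ns.cc that
+// constructs it for metric_id == "LEN_RATIO".
+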
+using namespace std; +using namespace std::tr1; + +bool TERMetric::IsErrorMetric() const { + return true; +} + +namespace NewScorer { + +struct COSTS { + static const float substitution; + static const float deletion; + static const float insertion; + static const float shift; +}; +const float COSTS::substitution = 1.0f; +const float COSTS::deletion = 1.0f; +const float COSTS::insertion = 1.0f; +const float COSTS::shift = 1.0f; + +static const int MAX_SHIFT_SIZE = 10; +static const int MAX_SHIFT_DIST = 50; + +struct Shift { + unsigned int d_; + Shift() : d_() {} + Shift(int b, int e, int m) : d_() { + begin(b); + end(e); + moveto(m); + } + inline int begin() const { + return d_ & 0x3ff; + } + inline int end() const { + return (d_ >> 10) & 0x3ff; + } + inline int moveto() const { + int m = (d_ >> 20) & 0x7ff; + if (m > 1024) { m -= 1024; m *= -1; } + return m; + } + inline void begin(int b) { + d_ &= 0xfffffc00u; + d_ |= (b & 0x3ff); + } + inline void end(int e) { + d_ &= 0xfff003ffu; + d_ |= (e & 0x3ff) << 10; + } + inline void moveto(int m) { + bool neg = (m < 0); + if (neg) { m *= -1; m += 1024; } + d_ &= 0xfffff; + d_ |= (m & 0x7ff) << 20; + } +}; + +class TERScorerImpl { + + public: + enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; + + explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) { + for (unsigned i = 0; i < ref.size(); ++i) + rwexists_.insert(ref[i]); + } + + float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const { + return CalculateAllShifts(hyp, subs, ins, dels, shifts); + } + + inline int GetRefLength() const { + return ref_.size(); + } + + private: + const vector<WordID>& ref_; + set<WordID> rwexists_; + + typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap; + mutable NgramToIntsMap nmap_; + + static float MinimumEditDistance( + const vector<WordID>& hyp, + const vector<WordID>& ref, + vector<TransType>* path) { + vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH)); + vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0)); + for (int i = 0; i <= hyp.size(); ++i) + cmat[i][0] = i; + for (int j = 0; j <= ref.size(); ++j) + cmat[0][j] = j; + for (int i = 1; i <= hyp.size(); ++i) { + const WordID& hw = hyp[i-1]; + for (int j = 1; j <= ref.size(); ++j) { + const WordID& rw = ref[j-1]; + float& cur_c = cmat[i][j]; + TransType& cur_b = bmat[i][j]; + + if (rw == hw) { + cur_c = cmat[i-1][j-1]; + cur_b = MATCH; + } else { + cur_c = cmat[i-1][j-1] + COSTS::substitution; + cur_b = SUBSTITUTION; + } + float cwoi = cmat[i-1][j]; + if (cur_c > cwoi + COSTS::insertion) { + cur_c = cwoi + COSTS::insertion; + cur_b = INSERTION; + } + float cwod = cmat[i][j-1]; + if (cur_c > cwod + COSTS::deletion) { + cur_c = cwod + COSTS::deletion; + cur_b = DELETION; + } + } + } + + // trace back along the best path and record the transition types + path->clear(); + int i = hyp.size(); + int j = ref.size(); + while (i > 0 || j > 0) { + if (j == 0) { + --i; + path->push_back(INSERTION); + } else if (i == 0) { + --j; + path->push_back(DELETION); + } else { + TransType t = bmat[i][j]; + path->push_back(t); + switch (t) { + case SUBSTITUTION: + case MATCH: + --i; --j; break; + case INSERTION: + --i; break; + case DELETION: + --j; break; + } + } + } + reverse(path->begin(), path->end()); + return cmat[hyp.size()][ref.size()]; + } + + void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const { + nmap->clear(); + set<WordID> 
exists_both; + for (int i = 0; i < hyp.size(); ++i) + if (rwexists_.find(hyp[i]) != rwexists_.end()) + exists_both.insert(hyp[i]); + for (int start=0; start<ref_.size(); ++start) { + if (exists_both.find(ref_[start]) == exists_both.end()) continue; + vector<WordID> cp; + int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start)); + for (int len=0; len<mlen; ++len) { + if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break; + cp.push_back(ref_[start + len]); + (*nmap)[cp].insert(start); + } + } + } + + static void PerformShift(const vector<WordID>& in, + int start, int end, int moveto, vector<WordID>* out) { + // cerr << "ps: " << start << " " << end << " " << moveto << endl; + out->clear(); + if (moveto == -1) { + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto < start) { + for (int i = 0; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto > end) { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < in.size(); ++i) + out->push_back(in[i]); + } else { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = (end + (moveto - start))+1; i < in.size(); ++i) + out->push_back(in[i]); + } + if (out->size() != in.size()) { + cerr << "ps: " << start << " " << end << " " << moveto << endl; + cerr << "in=" << TD::GetString(in) << endl; + cerr << "out=" << TD::GetString(*out) << endl; + } + assert(out->size() == in.size()); + // cerr << "ps: " << TD::GetString(*out) << endl; + } + + void GetAllPossibleShifts(const vector<WordID>& hyp, + const vector<int>& ralign, + const vector<bool>& herr, + const vector<bool>& rerr, + const int min_size, + vector<vector<Shift> >* shifts) const { + for (int start = 0; start < hyp.size(); ++start) { + vector<WordID> cp(1, hyp[start]); + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) continue; + bool ok = false; + int moveto; + for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { + moveto = *i; + int rm = ralign[moveto]; + ok = (start != rm && + (rm - start) < MAX_SHIFT_DIST && + (start - rm - 1) < MAX_SHIFT_DIST); + if (ok) break; + } + if (!ok) continue; + cp.clear(); + for (int end = start + min_size - 1; + ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { + cp.push_back(hyp[end]); + vector<Shift>& sshifts = (*shifts)[end - start]; + ok = false; + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) break; + bool any_herr = false; + for (int i = start; i <= end && !any_herr; ++i) + any_herr = herr[i]; + if (!any_herr) { + ok = true; + continue; + } + for (set<int>::iterator mi = niter->second.begin(); + mi != niter->second.end(); ++mi) { + int moveto = *mi; + int rm = ralign[moveto]; + if (! 
((rm != start) && + ((rm < start) || (rm > end)) && + (rm - start <= MAX_SHIFT_DIST) && + ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; + ok = true; + bool any_rerr = false; + for (int i = 0; (i <= end - start) && (!any_rerr); ++i) + any_rerr = rerr[moveto+i]; + if (!any_rerr) continue; + for (int roff = 0; roff <= (end - start); ++roff) { + int rmr = ralign[moveto+roff]; + if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) + sshifts.push_back(Shift(start, end, moveto + roff)); + } + } + } + } + } + + bool CalculateBestShift(const vector<WordID>& cur, + const vector<WordID>& hyp, + float curerr, + const vector<TransType>& path, + vector<WordID>* new_hyp, + float* newerr, + vector<TransType>* new_path) const { + vector<bool> herr, rerr; + vector<int> ralign; + int hpos = -1; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case MATCH: + ++hpos; + herr.push_back(false); + rerr.push_back(false); + ralign.push_back(hpos); + break; + case SUBSTITUTION: + ++hpos; + herr.push_back(true); + rerr.push_back(true); + ralign.push_back(hpos); + break; + case INSERTION: + ++hpos; + herr.push_back(true); + break; + case DELETION: + rerr.push_back(true); + ralign.push_back(hpos); + break; + } + } +#if 0 + cerr << "RALIGN: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << ralign[i] << " "; + cerr << endl; + cerr << "RERR: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << (bool)rerr[i] << " "; + cerr << endl; + cerr << "HERR: "; + for (int i = 0; i < herr.size(); ++i) + cerr << (bool)herr[i] << " "; + cerr << endl; +#endif + + vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1); + GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); + float cur_best_shift_cost = 0; + *newerr = curerr; + vector<TransType> cur_best_path; + vector<WordID> cur_best_hyp; + + bool res = false; + for (int i = shifts.size() - 1; i >=0; --i) { + float curfix = curerr - (cur_best_shift_cost + *newerr); + float maxfix = 2.0f * (1 + i) - COSTS::shift; + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; + for (int j = 0; j < shifts[i].size(); ++j) { + const Shift& s = shifts[i][j]; + curfix = curerr - (cur_best_shift_cost + *newerr); + maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; + vector<WordID> shifted(cur.size()); + PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); + vector<TransType> try_path; + float try_cost = MinimumEditDistance(shifted, ref_, &try_path); + float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); + if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { + *newerr = try_cost; + cur_best_shift_cost = COSTS::shift; + new_path->swap(try_path); + new_hyp->swap(shifted); + res = true; + // cerr << "Found better shift " << s.begin() << "..." 
<< s.end() << " moveto " << s.moveto() << endl; + } + } + } + + return res; + } + + static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) { + *subs = *ins = *dels = 0; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case SUBSTITUTION: + ++(*subs); + case MATCH: + break; + case INSERTION: + ++(*ins); break; + case DELETION: + ++(*dels); break; + } + } + } + + float CalculateAllShifts(const vector<WordID>& hyp, + int* subs, int* ins, int* dels, int* shifts) const { + BuildWordMatches(hyp, &nmap_); + vector<TransType> path; + float med_cost = MinimumEditDistance(hyp, ref_, &path); + float edits = 0; + vector<WordID> cur = hyp; + *shifts = 0; + if (ter_short_circuit_long_sentences < 0 || + ref_.size() < ter_short_circuit_long_sentences) { + while (true) { + vector<WordID> new_hyp; + vector<TransType> new_path; + float new_med_cost; + if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) + break; + edits += COSTS::shift; + ++(*shifts); + med_cost = new_med_cost; + path.swap(new_path); + cur.swap(new_hyp); + } + } + GetPathStats(path, subs, ins, dels); + return med_cost + edits; + } +}; + +#if 0 +void TERScore::ScoreDetails(std::string* details) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", + ComputeScore() * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + *details = buf; +} +#endif + +} // namespace NewScorer + +void TERMetric::ComputeSufficientStatistics(const vector<WordID>& hyp, + const vector<vector<WordID> >& refs, + SufficientStats* out) const { + out->fields.resize(kDUMMY_LAST_ENTRY); + float best_score = numeric_limits<float>::max(); + unsigned avg_len = 0; + for (int i = 0; i < refs.size(); ++i) + avg_len += refs[i].size(); + avg_len /= refs.size(); + + for (int i = 0; i < refs.size(); ++i) { + int subs, ins, dels, shifts; + NewScorer::TERScorerImpl ter(refs[i]); + float score = ter.Calculate(hyp, &subs, &ins, &dels, &shifts); + // cerr << "Component TER cost: " << score << endl; + if (score < best_score) { + out->fields[kINSERTIONS] = ins; + out->fields[kDELETIONS] = dels; + out->fields[kSUBSTITUTIONS] = subs; + out->fields[kSHIFTS] = shifts; + if (ter_use_average_ref_len) { + out->fields[kREF_WORDCOUNT] = avg_len; + } else { + out->fields[kREF_WORDCOUNT] = refs[i].size(); + } + + best_score = score; + } + } +} + +unsigned TERMetric::SufficientStatisticsVectorSize() const { + return kDUMMY_LAST_ENTRY; +} + +float TERMetric::ComputeScore(const SufficientStats& stats) const { + float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); + return edits / static_cast<float>(stats[kREF_WORDCOUNT]); +} + +string TERMetric::DetailedScore(const SufficientStats& stats) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3.f|%3.f|%3.f|%3.f (len=%3.f)", + ComputeScore(stats) * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + return buf; +} + diff --git a/mteval/ns_ter.h b/mteval/ns_ter.h new file mode 100644 index 00000000..c5c25413 --- /dev/null +++ b/mteval/ns_ter.h @@ -0,0 +1,21 @@ +#ifndef _NS_TER_H_ +#define _NS_TER_H_ + +#include "ns.h" + +class TERMetric : public EvaluationMetric { + friend class EvaluationMetric; + protected: + TERMetric() : EvaluationMetric("TER") {} + + public: + virtual bool IsErrorMetric() const; + virtual unsigned SufficientStatisticsVectorSize() 
const; + virtual std::string DetailedScore(const SufficientStats& stats) const; + virtual void ComputeSufficientStatistics(const std::vector<WordID>& hyp, + const std::vector<std::vector<WordID> >& refs, + SufficientStats* out) const; + virtual float ComputeScore(const SufficientStats& stats) const; +}; + +#endif diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc index a07a8c4b..73159557 100644 --- a/mteval/scorer_test.cc +++ b/mteval/scorer_test.cc @@ -3,9 +3,11 @@ #include <valarray> #include <gtest/gtest.h> +#include "ns.h" #include "tdict.h" #include "scorer.h" #include "aer_scorer.h" +#include "kernel_string_subseq.h" using namespace std; @@ -175,6 +177,52 @@ TEST_F(ScorerTest, AERTest) { EXPECT_EQ(d2, details); } +TEST_F(ScorerTest, Kernel) { + for (int i = 1; i < 10; ++i) { + const float l = (i / 10.0); + float f = ssk<4>(refs0[0], hyp1, l) + + ssk<4>(refs0[1], hyp1, l) + + ssk<4>(refs0[2], hyp1, l) + + ssk<4>(refs0[3], hyp1, l); + float f2= ssk<4>(refs1[0], hyp2, l) + + ssk<4>(refs1[1], hyp2, l) + + ssk<4>(refs1[2], hyp2, l) + + ssk<4>(refs1[3], hyp2, l); + f /= 4; + f2 /= 4; + float f3= ssk<4>(refs0[0], hyp2, l) + + ssk<4>(refs0[1], hyp2, l) + + ssk<4>(refs0[2], hyp2, l) + + ssk<4>(refs0[3], hyp2, l); + float f4= ssk<4>(refs1[0], hyp1, l) + + ssk<4>(refs1[1], hyp1, l) + + ssk<4>(refs1[2], hyp1, l) + + ssk<4>(refs1[3], hyp1, l); + f3 += f4; + f3 /= 8; + cerr << "LAMBDA=" << l << "\t" << f << " " << f2 << "\tf=" << ((f + f2)/2 - f3) << " (bad=" << f3 << ")" << endl; + } +} + +TEST_F(ScorerTest, NewScoreAPI) { + //EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + //EvaluationMetric* metric = EvaluationMetric::Instance("METEOR"); + EvaluationMetric* metric = EvaluationMetric::Instance("COMB:IBM_BLEU=0.5;TER=-0.5"); + boost::shared_ptr<SegmentEvaluator> e1 = metric->CreateSegmentEvaluator(refs0); + boost::shared_ptr<SegmentEvaluator> e2 = metric->CreateSegmentEvaluator(refs1); + SufficientStats stats1; + e1->Evaluate(hyp1, &stats1); + SufficientStats stats2; + e2->Evaluate(hyp2, &stats2); + stats1 += stats2; + string ss; + stats1.Encode(&ss); + cerr << "SS: " << ss << endl; + cerr << metric->ComputeScore(stats1) << endl; + //SufficientStats statse("IBM_BLEU 53 32 18 11 65 63 61 59 65 72"); + //cerr << metric->ComputeScore(statse) << endl; +} + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 29b3d7ea..3b99e1b6 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -8,6 +8,7 @@ #include "dict.h" #include "sampler.h" #include "ccrp.h" +#include "m.h" using namespace std; using namespace std::tr1; @@ -95,11 +96,6 @@ void ReadCorpus(const string& filename, vector<vector<int> >* c, set<int>* vocab if (in != &cin) delete in; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - struct UniphraseLM { UniphraseLM(const vector<vector<int> >& corpus, const set<int>& vocab, @@ -128,7 +124,7 @@ struct UniphraseLM { double log_p0(const vector<int>& phrase) const { double len_logprob; if (use_poisson_) - len_logprob = log_poisson(phrase.size(), 1.0); + len_logprob = Md::log_poisson(phrase.size(), 1.0); else len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); return log(uniform_word_) * phrase.size() + len_logprob; @@ -256,7 +252,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { 
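// d = PYP discount, s = PYP strength (formerly printed as "concentration")
   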
phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " d=" << phrases_.discount() << ",c=" << phrases_.concentration(); + cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength(); } CCRP<vector<int> > phrases_; diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 5db053de..31258fa6 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -12,7 +12,7 @@ use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../vest"; +my $VEST_DIR="$SCRIPT_DIR/../dpmert"; require "$VEST_DIR/libcall.pl"; # Default settings @@ -288,7 +288,7 @@ while (1){ $retries++; } die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; @@ -338,7 +338,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 0a9b75d7..52b67f32 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -13,11 +13,12 @@ #include "filelib.h" #include "stringlib.h" #include "weights.h" -#include "scorer.h" #include "inside_outside.h" #include "hg_io.h" #include "kbest.h" #include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" // This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) @@ -80,7 +81,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)") ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)") - ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract") ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -109,9 +110,12 @@ struct HypInfo { HypInfo(const vector<WordID>& h, const SparseVector<weight_t>& feats) : hyp(h), g_(-100.0f), x(feats) {} // lazy evaluation - double g(const SentenceScorer& scorer) const { - if (g_ == -100.0f) - g_ = scorer.ScoreCandidate(hyp)->ComputeScore(); + double g(const SegmentEvaluator& scorer, const EvaluationMetric* metric) const { + if (g_ == -100.0f) { + SufficientStats 
ss; + scorer.Evaluate(hyp, &ss); + g_ = metric->ComputeScore(ss); + } return g_; } vector<WordID> hyp; @@ -233,15 +237,21 @@ struct DiffOrder { } }; -void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) { +void Sample(const unsigned gamma, + const unsigned xi, + const vector<HypInfo>& J_i, + const SegmentEvaluator& scorer, + const EvaluationMetric* metric, + vector<TrainingInstance>* pv) { + const bool invert_score = metric->IsErrorMetric(); vector<TrainingInstance> v1, v2; float avg_diff = 0; for (unsigned i = 0; i < gamma; ++i) { const size_t a = rng->inclusive(0, J_i.size() - 1)(); const size_t b = rng->inclusive(0, J_i.size() - 1)(); if (a == b) continue; - float ga = J_i[a].g(scorer); - float gb = J_i[b].g(scorer); + float ga = J_i[a].g(scorer, metric); + float gb = J_i[b].g(scorer, metric); bool positive = gb < ga; if (invert_score) positive = !positive; const float gdiff = fabs(ga - gb); @@ -288,11 +298,12 @@ int main(int argc, char** argv) { rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); else rng.reset(new MT19937); - const string loss_function = conf["loss_function"].as<string>(); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as<string>()); @@ -335,7 +346,7 @@ int main(int argc, char** argv) { Dedup(&J_i); WriteKBest(kbest_file, J_i); - Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); + Sample(gamma, xi, J_i, *ds[sent_id], metric, &v); for (unsigned i = 0; i < v.size(); ++i) { const TrainingInstance& vi = v[i]; cout << vi.y << "\t" << vi.x << endl; diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl deleted file mode 100755 index 88bc9682..00000000 --- a/rescore/cdec_kbest_to_zmert.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $feature_file; -my $hyp_file; -my $help; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "feature_file|f=s" => \$feature_file, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$feature_file" or die "Can't read $feature_file: $!"; -my %weights; -my @all_feats; -while(<W>) { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - push @all_feats, $fname; - $weights{$fname} = 1; -} -close W; - -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - my @afeats = split /\s+/, $feats; - my $tot = 0; - my %fvaldict; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - $fvaldict{$fname} = $fval; - my $weight = $weights{$fname}; - warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight; - $weights{$fname} = 1; - } - my @trans; - for my $feat (@all_feats) { - my $v = $fvaldict{$feat}; - if (!defined $v) { 
$v = '0.0'; } - push @trans, $v; - } - print "$id ||| $hyp ||| @trans\n"; -} -close HYP; - -sub usage { - print <<EOT; -Usage: $0 -f feature-file.txt/weights.txt -h hyp.nbest.txt - Puts a cdec k-best list into Joshua/ZMERT format -EOT -} - diff --git a/rescore/example/README b/rescore/example/README deleted file mode 100644 index 92b657ca..00000000 --- a/rescore/example/README +++ /dev/null @@ -1,4 +0,0 @@ -Rescoring example: - - ../rescore_with_cdec_model.pl -c cdec.ini -s source.txt -h hyp.txt -w weights -f RescoringModel - diff --git a/rescore/example/cdec.ini b/rescore/example/cdec.ini deleted file mode 100644 index 29a1ece3..00000000 --- a/rescore/example/cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -grammar=small.scfg diff --git a/rescore/example/hyp.txt b/rescore/example/hyp.txt deleted file mode 100644 index c4757f6c..00000000 --- a/rescore/example/hyp.txt +++ /dev/null @@ -1,5 +0,0 @@ -0 ||| A B C ||| F1=1 F2=1 -0 ||| A b c ||| F1=1 F3=1 -0 ||| A C ||| F4=1 -1 ||| X Y ||| F5=1 -1 ||| XY ||| F6=1 diff --git a/rescore/example/small.scfg b/rescore/example/small.scfg deleted file mode 100644 index 402a585a..00000000 --- a/rescore/example/small.scfg +++ /dev/null @@ -1,9 +0,0 @@ -[X] ||| a b c ||| A B C ||| fe=0.2 -[X] ||| a b ||| A B ||| fe=0.8 -[X] ||| c ||| C ||| fe=0.3 -[X] ||| c ||| c ||| fe=1.3 -[X] ||| a b c ||| A B c ||| fe=0.8 -[X] ||| a b c ||| A C ||| fe=2 -[X] ||| x ||| X ||| fe=0.2 -[X] ||| y ||| Y ||| fe=0.5 -[X] ||| x y ||| XY ||| fe=0.8 diff --git a/rescore/example/source.txt b/rescore/example/source.txt deleted file mode 100644 index e8d4eda2..00000000 --- a/rescore/example/source.txt +++ /dev/null @@ -1,2 +0,0 @@ -a b c -x y diff --git a/rescore/example/weights b/rescore/example/weights deleted file mode 100644 index a22d36f1..00000000 --- a/rescore/example/weights +++ /dev/null @@ -1 +0,0 @@ -fe -0.8 diff --git a/rescore/generate_zmert_params_from_weights.pl b/rescore/generate_zmert_params_from_weights.pl deleted file mode 100755 index a9287896..00000000 --- a/rescore/generate_zmert_params_from_weights.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w - -my %defaults; -$defaults{'LanguageModel'} = "Opt\t0\t10\t0\t2.5"; -$defaults{'EgivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexEGivenF'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'LexFGivenE'} = "Opt\t-5\t0.5\t-3\t0.5"; -$defaults{'PassThrough'} = "Opt\t-Inf\t+Inf\t-10\t0"; -$defaults{'WordPenalty'} = "Opt\t-Inf\t2\t-5\t0"; -my $DEFAULT = "Opt\t-Inf\t+Inf\t-1\t+1"; - -while(<>) { - next if /^#/; - chomp; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - my ($a,$b) = split /\s+/; - next unless ($a && $b); - my $line = $DEFAULT; - if ($defaults{$a}) { $line = $defaults{$a}; } - print "$a\t|||\t$b\t$line\n"; -} - -print "normalization = none\n"; - diff --git a/rescore/rerank.pl b/rescore/rerank.pl deleted file mode 100755 index 4a0c5750..00000000 --- a/rescore/rerank.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $weights_file; -my $hyp_file; -my $help; -my $kbest; # flag to extract reranked list - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "weights_file|w=s" => \$weights_file, - "hypothesis_file|h=s" => \$hyp_file, - "kbest" => \$kbest, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$weights_file" or die "Can't read $weights_file: $!"; -my %weights; -while(<W>) { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - $weights{$fname} = 
$w; -} -close W; - -my $cur = undef; -my %hyps = (); -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - if ($cur ne $id) { - extract_1best($cur, \%hyps); - $cur = $id; - %hyps = (); - } - my @afeats = split /\s+/, $feats; - my $tot = 0; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - my $weight = $weights{$fname}; - die "Unweighted feature '$fname'" unless defined $weight; - $tot += ($weight * $fval); - } - $hyps{"$hyp ||| $feats"} = $tot; -} -extract_1best($cur, \%hyps) if defined $cur; -close HYP; - -sub extract_1best { - my ($id, $rh) = @_; - my %hyps = %$rh; - if ($kbest) { - for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) { - print "$id ||| $hyp\n"; - } - } else { - my $best_score = undef; - my $best_hyp = undef; - for my $hyp (keys %hyps) { - if (!defined $best_score || $hyps{$hyp} > $best_score) { - $best_score = $hyps{$hyp}; - $best_hyp = $hyp; - } - } - $best_hyp =~ s/ \|\|\|.*$//; - print "$best_hyp\n"; - } -} - -sub usage { - print <<EOT; -Usage: $0 -w weights.txt -h hyp.nbest.txt [--kbest] - Reranks n-best lists with new weights, extracting the new 1/k-best entries. -EOT -} - diff --git a/rescore/rescore_inv_model1.pl b/rescore/rescore_inv_model1.pl deleted file mode 100755 index 780452f5..00000000 --- a/rescore/rescore_inv_model1.pl +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $model_file; -my $src_file; -my $hyp_file; -my $help; -my $reverse_model; -my $feature_name='M1SrcGivenTrg'; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "model_file|m=s" => \$model_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) { - usage(); - exit; -} - -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -print STDERR "Reading Model 1 probabilities from $model_file...\n"; -open M, "<$model_file" or die "Couldn't read $model_file: $!"; -binmode M, ":utf8"; -my %m1; -while(<M>){ - chomp; - my ($e,$f,$lp) = split /\s+/; - die unless defined $e; - die unless defined $f; - die unless defined $lp; - $m1{$f}->{$e} = $lp; -} -close M; - -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -print STDERR "Rescoring...\n"; - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = (); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - my %cache = (); - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - my $score = $cache{$hyps[$i]}; - if (!defined $score) { - if 
($reverse_model) { - die "not implemented"; - } else { - $score = m1_prob($src, $hyps[$i]); - } - $cache{$hyps[$i]} = $score; - } - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } - -} - -sub m1_prob { - my ($fsent, $esent) = @_; - die unless defined $fsent; - die unless defined $esent; - my @fwords = split /\s+/, $fsent; - my @ewords = split /\s+/, $esent; - push @ewords, "<eps>"; - my $tp = 0; - for my $f (@fwords) { - my $m1f = $m1{$f}; - if (!defined $m1f) { $m1f = {}; } - my $tfp = 0; - for my $e (@ewords) { - my $lp = $m1f->{$e}; - if (!defined $lp) { $lp = -100; } - #print "P($f|$e) = $lp\n"; - my $prob = exp($lp); - #if ($prob > $tfp) { $tfp = $prob; } - $tfp += $prob; - } - $tp += log($tfp); - $tp -= log(scalar @ewords); # uniform probability of each generating word - } - return $tp; -} - -sub usage { - print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n Use training/model1 to generate the required parameter file\n"; -} - - diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl deleted file mode 100755 index cdd8c217..00000000 --- a/rescore/rescore_with_cdec_model.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; - -my $decoder = "$SCRIPT_DIR/../decoder/cdec"; -my $help; -my $cdec_ini; -my $src_file; -my $hyp_file; -my $reverse_model; -my $weights_file; -my $feature_name='NewModel'; - -sub catch_pipe { - my $signame = shift; - die "$0 received SIGPIPE: did the decoder die?\n"; -} -$SIG{PIPE} = \&catch_pipe; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "config|c=s" => \$cdec_ini, - "weights|w=s" => \$weights_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "reverse" => \$reverse_model, # if true translate hyp -> src - "decoder=s" => \$decoder, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { - usage(); - exit; -} -die "Can't find $decoder" unless -f $decoder; -die "Can't run $decoder" unless -x $decoder; -my $weights = ''; -if (defined $weights_file) { - die "Can't read $weights_file" unless -f $weights_file; - $weights = "-w $weights_file"; -} -my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob"; -print STDERR "DECODER COMMAND: $decoder_command\n"; -my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) - or die "Couldn't run $decoder: $!"; -sleep 1; - -die "Can't find $cdec_ini" unless -f $cdec_ini; -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(<SRC>){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -binmode(CDEC_IN, ":utf8"); -binmode(CDEC_OUT, ":utf8"); - -my $cur = undef; -my @hyps = (); -my @feats = (); -while(<HYP>) { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - 
rescore($cur, $source[$cur], \@hyps, \@feats);
-    $cur = $id;
-    @hyps = ();
-    @feats = ();
-  }
-  push @hyps, $hyp;
-  push @feats, $feats;
-}
-rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur;
-
-close CDEC_IN;
-close CDEC_OUT;
-close HYP;
-waitpid($cdec_pid, 0);
-my $status = $? >> 8;
-if ($status != 0) {
-  print STDERR "Decoder returned bad status!\n";
-}
-
-sub rescore {
-  my ($id, $src, $rh, $rf) = @_;
-  my @hyps = @$rh;
-  my @feats = @$rf;
-  my $nhyps = scalar @hyps;
-  print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n";
-  for (my $i=0; $i < $nhyps; $i++) {
-    if ($reverse_model) {
-      print CDEC_OUT "<seg id=\"$id\">$hyps[$i] ||| $src</seg>\n";
-    } else {
-      print CDEC_OUT "<seg id=\"$id\">$src ||| $hyps[$i]</seg>\n";
-    }
-    my $score = <CDEC_IN>;
-    chomp $score;
-    my @words = split /\s+/, $hyps[$i];
-    print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n";
-  }
-}
-
-sub usage {
-  print <<EOT;
-Usage: $0 -c cdec.ini [-w cdec_weights.txt] -s source.txt -h hypothesis.nbest.txt [-f FeatureName]
-EOT
-  exit 0
-}
-
diff --git a/sa-extract/Makefile b/sa-extract/Makefile
new file mode 100644
index 00000000..7b39ae4d
--- /dev/null
+++ b/sa-extract/Makefile
@@ -0,0 +1,18 @@
+PYVER=python2.7
+PYDIR=/usr/local/Cellar/python/2.7.2
+PYINCLUDE=$(PYDIR)/include/$(PYVER)
+CYTHON=/usr/local/share/python/cython
+PYTHON=$(PYDIR)/bin/python
+
+%.c: %.pyx
+	$(CYTHON) $< -o $@
+
+%.o: %.cc
+	g++ -O6 -g -fPIC -c $<
+
+all: cstrmap.c strmap.cc rule.c sym.c cdat.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c
+	$(PYTHON) setup.py build
+
+clean:
+	rm -f cdat.c cstrmap.c sym.c rule.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c *.so *.o *.cxx *~ *.pyc
+	rm -rf build
diff --git a/sa-extract/README b/sa-extract/README
new file mode 100644
index 00000000..e4022c7e
--- /dev/null
+++ b/sa-extract/README
@@ -0,0 +1,62 @@
+SUFFIX-ARRAY-EXTRACT README
+  Feb 1, 2012
+
+Written by Adam Lopez, repackaged by Chris Dyer.
+
+Originally based on parts of Hiero, by David Chiang, but these dependencies
+have been removed or rewritten.
+
+
+BUILD INSTRUCTIONS
+==============================================================================
+
+Requirements:
+  Python 2.7 or later (http://www.python.org)
+  Cython 0.14.1 or later (http://cython.org/)
+
+- Edit Makefile to set the location of Python/Cython then do:
+
+   make
+
+
+COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT
+==============================================================================
+- Run sa-compile.pl to compile the training data and generate an extract.ini
+  file (which is written to STDOUT):
+
+   sa-compile.pl -b bitext_name=source.fr,target.en \
+        -a alignment_name=alignment.txt > extract.ini
+
+  The training data should be in two parallel text files (source.fr,target.en)
+  and the alignments are expected in "0-0 1-2 2-1 ..." format produced by
+  most alignment toolkits.  The text files should NOT be escaped for non-XML
+  characters.
+
+
+EXTRACTION OF PER-SENTENCE GRAMMARS
+==============================================================================
+The most common use-case we support is extraction of "per-sentence" grammars
+for each segment in a test set.  You may run the extractor on a test set, but
+it will try to interpret tags as SGML markup, so we provide a script that does
+escaping: ./escape-testset.pl.
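+
+(The escaping is assumed to be standard XML/SGML escaping of reserved
+characters; e.g. a raw source line "3 < 5 & x > 2" would be passed to the
+extractor as "3 &lt; 5 &amp; x &gt; 2".  See escape-testset.pl for the exact
+set of characters it handles.)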
+ +- Example: + + cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini + + +EXTRACTION OF COMPLETE TEST-SET GRAMMARS +============================================================================== +Edit the generated extract.ini file a change per_sentence_grammar +to False. Then, run extraction as normal. + +Note: extracting a single grammar for an entire test set will consume more +memory during extraction and (probably) during decoding. + + +EXAMPLE +============================================================================== +- See example/ and the README therein. + + diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd new file mode 100644 index 00000000..a7d3001f --- /dev/null +++ b/sa-extract/calignment.pxd @@ -0,0 +1,10 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class Alignment: + + cdef cintlist.CIntList links + cdef cintlist.CIntList sent_index + cdef int link(self, int i, int j) + cdef _unlink(self, int link, int* f, int* e) + cdef int* _get_sent_links(self, int sent_id, int* num_links) diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx new file mode 100644 index 00000000..976fcd66 --- /dev/null +++ b/sa-extract/calignment.pyx @@ -0,0 +1,128 @@ +import log +import gzip +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free + +# Note: Callison-Burch uses short instead of int. +# We have the space for our corpus, so this is not a problem; +# May need to revisit if things get really tight, though. +cdef class Alignment: + + + cdef int link(self, int i, int j): + '''Integerizes an alignment link pair''' + return i*65536 + j + + + def unlink(self, link): + '''De-integerizes an alignment link pair''' + return (link/65536, link%65536) + + + cdef _unlink(self, int link, int* f, int* e): + f[0] = link/65536 + e[0] = link%65536 + + + def get_sent_links(self, int sent_id): + cdef cintlist.CIntList sent_links + cdef int* arr + cdef int arr_len + + sent_links = cintlist.CIntList() + arr = self._get_sent_links(sent_id, &arr_len) + sent_links._extend_arr(arr, arr_len*2) + free(arr) + return sent_links + + + cdef int* _get_sent_links(self, int sent_id, int* num_links): + cdef int* sent_links + cdef int i, start, end + + start = self.sent_index.arr[sent_id] + end = self.sent_index.arr[sent_id+1] + num_links[0] = end - start + sent_links = <int*> malloc(2*num_links[0]*sizeof(int)) + for i from 0 <= i < num_links[0]: + self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1) + return sent_links + + + def __cinit__(self, filename, from_binary=False): + self.links = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + log.writeln("Reading alignment from file %s" % filename) + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def read_text(self, filename): + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + for line in f: + self.sent_index.append(len(self.links)) + pairs = line.split() + for pair in pairs: + (i, j) = map(int, pair.split('-')) + self.links.append(self.link(i, j)) + self.sent_index.append(len(self.links)) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.links.read_handle(f) + self.sent_index.read_handle(f) + fclose(f) + + + def write_text(self, filename): + f = open(filename, "w") + sent_num = 0 + for i, link in 
enumerate(self.links): + while i >= self.sent_index[sent_num]: + f.write("\n") + sent_num = sent_num + 1 + f.write("%d-%d " % self.unlink(link)) + f.write("\n") + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.links.write_handle(f) + self.sent_index.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + sent_num = 1 + for link in self.links: + f.write("%d " % link) + f.write("\n") + for i in self.sent_index: + f.write("%d " % i) + f.write("\n") + + + def alignment(self, i): + '''Return all (e,f) pairs for sentence i''' + cdef int j, start, end + result = [] + start = self.sent_index.arr[i] + end = self.sent_index.arr[i+1] + for j from start <= j < end: + result.append(self.unlink(self.links.arr[j])) + return result diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd new file mode 100644 index 00000000..b686f611 --- /dev/null +++ b/sa-extract/cdat.pxd @@ -0,0 +1,12 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class DataArray: + cdef word2id + cdef id2word + cdef cintlist.CIntList data + cdef cintlist.CIntList sent_id + cdef cintlist.CIntList sent_index + cdef use_sent_id + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx new file mode 100644 index 00000000..57d3ad63 --- /dev/null +++ b/sa-extract/cdat.pyx @@ -0,0 +1,178 @@ +# cdat.pyx +# Defines "data arrays" that can be directly written to/read from disk in binary format +# In particular, the array itself is written/read directly as a glob of binary data +# Adam Lopez <alopez@cs.umd.edu> + +import sys +import gzip +import log +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class DataArray: + + def __init__(self, filename=None, from_binary=False, use_sent_id=False): + self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1} + self.id2word = ["END_OF_FILE", "END_OF_LINE"] + self.data = cintlist.CIntList(1000,1000) + self.sent_id = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + self.use_sent_id = use_sent_id + + if filename is not None: + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __len__(self): + return len(self.data) + + + def getSentId(self, i): + return self.sent_id.arr[i] + + + def getSent(self, i): + cdef int j, start, stop + sent = [] + start = self.sent_index.arr[i] + stop = self.sent_index.arr[i+1] + for i from start <= i < stop: + sent.append(self.id2word[self.data.arr[i]]) + return sent + + + def getSentPos(self, loc): + return loc - self.sent_index.arr[self.sent_id.arr[loc]] + + + def get_id(self, word): + if not word in self.word2id: + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + return self.word2id[word] + + + def get_word(self, id): + return self.id2word[id] + + + def write_text(self, filename): + f = open(filename, "w") + for w_id in self.data: + if w_id > 1: + f.write("%s " % self.get_word(w_id)) + if w_id == 1: + f.write("\n") + f.close() + + + def read_text(self, filename): + cdef int word_count + + if filename[-2:] == "gz": + file = gzip.GzipFile(filename) + else: + file = open(filename) + word_count = 0 + for line_num, line in enumerate(file): + self.sent_index.append(word_count) + for word in line.split(): + self.data.append(self.get_id(word)) 
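+ # Note on the layout this loop builds: the corpus becomes one flat int
+ # array. Each token is mapped to an id by get_id, every sentence is
+ # terminated by id 1 ("END_OF_LINE"), the array ends with id 0
+ # ("END_OF_FILE"), and sent_index records each sentence's start offset,
+ # so the words of sentence i are data[sent_index[i]:sent_index[i+1]-1].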
+ if self.use_sent_id:
+ self.sent_id.append(line_num)
+ word_count = word_count + 1
+ self.data.append(1)
+ if self.use_sent_id:
+ self.sent_id.append(line_num)
+ word_count = word_count + 1
+ self.data.append(0)
+ self.sent_index.append(word_count)
+
+ def read_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.read_handle(f)
+ fclose(f)
+
+
+ cdef void read_handle(self, FILE* f):
+ cdef int num_words
+ cdef int word_len
+ cdef char* c_word
+ cdef bytes py_word
+ self.data.read_handle(f)
+ self.sent_index.read_handle(f)
+ self.sent_id.read_handle(f)
+ fread(&(num_words), sizeof(int), 1, f)
+ for i in xrange(num_words):
+ fread(&(word_len), sizeof(int), 1, f)
+ c_word = <char*> malloc (word_len * sizeof(char))
+ fread(c_word, sizeof(char), word_len, f)
+ py_word = c_word
+ free(c_word)
+ self.word2id[py_word] = len(self.id2word)
+ self.id2word.append(py_word)
+ if len(self.sent_id) == 0:
+ self.use_sent_id = False
+ else:
+ self.use_sent_id = True
+
+
+ cdef void write_handle(self, FILE* f):
+ cdef int word_len
+ cdef int num_words
+ cdef char* c_word
+
+ self.data.write_handle(f)
+ self.sent_index.write_handle(f)
+ self.sent_id.write_handle(f)
+ num_words = len(self.id2word) - 2
+ fwrite(&(num_words), sizeof(int), 1, f)
+ for word in self.id2word[2:]:
+ c_word = word
+ word_len = strlen(c_word) + 1
+ fwrite(&(word_len), sizeof(int), 1, f)
+ fwrite(c_word, sizeof(char), word_len, f)
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.write_handle(f)
+ fclose(f)
+
+
+ def write_enhanced_handle(self, f):
+ for i in self.data:
+ f.write("%d " %i)
+ f.write("\n")
+ for i in self.sent_index:
+ f.write("%d " %i)
+ f.write("\n")
+ for i in self.sent_id:
+ f.write("%d " %i)
+ f.write("\n")
+ for word in self.id2word:
+ f.write("%s %d " % (word, self.word2id[word]))
+ f.write("\n")
+
+
+
+ def write_enhanced(self, filename):
+ f = open(filename, "w")
+ self.write_enhanced_handle(f)
+ f.close()
+
+
+
+
diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd
new file mode 100644
index 00000000..026f2739
--- /dev/null
+++ b/sa-extract/cfloatlist.pxd
@@ -0,0 +1,10 @@
+from libc.stdio cimport FILE
+
+cdef class CFloatList:
+ cdef int size
+ cdef int increment
+ cdef int len
+ cdef float* arr
+ cdef void write_handle(self, FILE* f)
+ cdef void read_handle(self, FILE* f)
+ cdef void set(self, int i, float v)
diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx
new file mode 100644
index 00000000..18a0ef2a
--- /dev/null
+++ b/sa-extract/cfloatlist.pyx
@@ -0,0 +1,93 @@
+# cfloatlist.pyx
+# defines float arrays in C, with some convenience methods
+# for reading arrays as globs directly from disk.
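+# Binary layout (written by write_handle, read by read_handle below): one
+# int giving the length, followed by that many raw C floats, so the files
+# are only portable between machines with matching endianness and type sizes.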
+# Adam Lopez <alopez@cs.umd.edu>
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, strcpy, strlen
+
+cdef class CFloatList:
+
+ def __cinit__(self, size=0, increment=1, initial_len=0):
+ if initial_len > size:
+ size = initial_len
+ self.arr = <float*> malloc(size*sizeof(float))
+ memset(self.arr, 0, initial_len*sizeof(float))
+
+
+ def __init__(self, size=0, increment=1, initial_len=0):
+ self.size = size
+ if initial_len > size:
+ self.size = initial_len
+ self.increment = increment
+ self.len = initial_len
+
+
+ def __dealloc__(self):
+ free(self.arr)
+
+
+ def __getitem__(self, i):
+ j = i
+ if i<0:
+ j = self.len + i
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
+ return self.arr[j]
+
+
+ cdef void set(self, int i, float v):
+ j = i
+ if i<0:
+ j = self.len + i
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
+ self.arr[j] = v
+
+ def __setitem__(self, i, val):
+ self.set(i, val)
+
+ def __len__(self):
+ return self.len
+
+
+ def append(self, float val):
+ if self.len == self.size:
+ self.size = self.size + self.increment
+ self.arr = <float*> realloc(self.arr, self.size*sizeof(float))
+ self.arr[self.len] = val
+ self.len = self.len + 1
+
+
+ cdef void write_handle(self, FILE* f):
+ fwrite(&(self.len), sizeof(int), 1, f)
+ fwrite(self.arr, sizeof(float), self.len, f)
+
+
+ def write(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.write_handle(f)
+ fclose(f)
+
+
+ cdef void read_handle(self, FILE* f):
+ free(self.arr)
+ fread(&(self.len), sizeof(int), 1, f)
+ self.arr = <float*> malloc(self.len * sizeof(float))
+ self.size = self.len
+ fread(self.arr, sizeof(float), self.len, f)
+
+
+ def read(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.read_handle(f)
+ fclose(f)
+
+
diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd
new file mode 100644
index 00000000..8a3a655c
--- /dev/null
+++ b/sa-extract/cintlist.pxd
@@ -0,0 +1,15 @@
+from libc.stdio cimport FILE
+
+cdef class CIntList:
+ cdef int size
+ cdef int increment
+ cdef int len
+ cdef int* arr
+ cdef void write_handle(self, FILE* f)
+ cdef void read_handle(self, FILE* f)
+ cdef void _append(self, int val)
+ cdef void _extend(self, CIntList other)
+ cdef void _extend_arr(self, int* other, int other_len)
+ cdef void _clear(self)
+ cdef void set(self, int i, int val)
+
diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx
new file mode 100644
index 00000000..9d0a058e
--- /dev/null
+++ b/sa-extract/cintlist.pyx
@@ -0,0 +1,196 @@
+# cintlist.pyx
+# defines int arrays in C, with some convenience methods
+# for reading arrays as globs directly from disk.
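+# Usage sketch from Python (all of these operate on the underlying C array):
+#   lst = CIntList(); lst.append(7); lst.extend(other_cintlist)
+#   lst.write("ints.bin"); fresh = CIntList(); fresh.read("ints.bin")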
+# Adam Lopez <alopez@cs.umd.edu> + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef class CIntList: + + def __cinit__(self, size=0, increment=1, initial_len=0): + if initial_len > size: + size = initial_len + self.arr = <int*> malloc(size*sizeof(int)) + memset(self.arr, 0, initial_len*sizeof(int)) + + def __str__(self): + ret = "CIntList[" + for idx in xrange(self.size): + if idx>0: + ret += "," + ret += str(self.arr[idx]) + ret += "]" + ret += "len=" + ret += self.len + return ret + + def index(self, val): + for i in xrange(self.len): + if self.arr[i] == val: + return i + + return IndexError + + def partition(self,start,end): + pivot = self.arr[end] + bottom = start-1 + top = end + done = 0 + while not done: + while not done: + bottom += 1 + if bottom == top: + done = 1 + break + if self.arr[bottom] > pivot: + self.arr[top] = self.arr[bottom] + break + while not done: + top -= 1 + if top == bottom: + done = 1 + break + if self.arr[top] < pivot: + self.arr[bottom] = self.arr[top] + break + self.arr[top] = pivot + return top + + def _doquicksort(self,start,end): + if start < end: + split = self.partition(start,end) + self._doquicksort(start,split-1) + self._doquicksort(split+1,end) + else: + return + + def sort(self): + self._doquicksort(0,self.len-1) + + def reset(self): + self.len = 0 + + def __init__(self, size=0, increment=1, initial_len=0): + self.size = size + if initial_len > size: + self.size = initial_len + self.increment = increment + self.len = initial_len + + + def __dealloc__(self): + free(self.arr) + + + def __getitem__(self, index): + cdef int i, j, k + + if type(index) == int: + j = index + if j < 0: + j = self.len + j + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len)) + return self.arr[j] + elif type(index) == slice: + i = index.start + j = index.stop + if i < 0: + i = self.len + i + if j < 0: + j = self.len + j + if i < 0 or i >= self.len or j < 0 or j > self.len: + raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len)) + result = () + for k from i <= k < j: + result = result + (self.arr[k],) + return result + else: + raise IndexError("Illegal key type %s for CIntList" % (type(index))) + + cdef void set(self, int i, int val): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length IntList" % (i, self.len)) + if type(val) != int: + raise TypeError + self.arr[j] = val + + + def __setitem__(self, i, val): + self.set(i, val) + + def __len__(self): + return self.len + + def getSize(self): + return self.size + + def append(self, int val): + self._append(val) + + cdef void _append(self, int val): + if self.len == self.size: + self.size = self.size + self.increment + self.arr = <int*> realloc(self.arr, self.size*sizeof(int)) + self.arr[self.len] = val + self.len = self.len + 1 + + def extend(self, other): + self._extend(other) + + + cdef void _extend(self, CIntList other): + self._extend_arr(other.arr, other.len) + + + cdef void _extend_arr(self, int* other, int other_len): + if self.size < self.len + other_len: + self.size = self.len + other_len + self.arr = <int*> realloc(self.arr, self.size*sizeof(int)) + memcpy(self.arr+self.len, other, other_len*sizeof(int)) + self.len = self.len + other_len + + + cdef void _clear(self): + free(self.arr) + self.len = 0 + self.size = 0 + self.arr = <int*> malloc(0) + + + cdef 
void write_handle(self, FILE* f): + fwrite(&(self.len), sizeof(int), 1, f) + fwrite(self.arr, sizeof(int), self.len, f) + + + def write(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + (self.arr) + fread(&(self.len), sizeof(int), 1, f) + self.arr = <int*> malloc(self.len * sizeof(int)) + self.size = self.len + fread(self.arr, sizeof(int), self.len, f) + + + def read(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx new file mode 100644 index 00000000..fa30caad --- /dev/null +++ b/sa-extract/clex.pyx @@ -0,0 +1,460 @@ +# clex.pyx +# defines bilexical dictionaries in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez <alopez@cs.umd.edu> + +import gzip +import sys +import context_model + +cimport cintlist +cimport cfloatlist +cimport calignment +cimport csuf +cimport cdat + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef struct _node: + _node* smaller + _node* bigger + int key + int val + +cdef _node* new_node(int key): + cdef _node* n + n = <_node*> malloc(sizeof(_node)) + n.smaller = NULL + n.bigger = NULL + n.key = key + n.val = 0 + return n + + +cdef del_node(_node* n): + if n.smaller != NULL: + del_node(n.smaller) + if n.bigger != NULL: + del_node(n.bigger) + free(n) + + +cdef int* get_val(_node* n, int key): + if key == n.key: + return &n.val + elif key < n.key: + if n.smaller == NULL: + n.smaller = new_node(key) + return &(n.smaller.val) + return get_val(n.smaller, key) + else: + if n.bigger == NULL: + n.bigger = new_node(key) + return &(n.bigger.val) + return get_val(n.bigger, key) + + +cdef class CLex: + + cdef cfloatlist.CFloatList col1 + cdef cfloatlist.CFloatList col2 + cdef cintlist.CIntList f_index + cdef cintlist.CIntList e_index + cdef id2eword, id2fword, eword2id, fword2id + + def __init__(self, filename, from_binary=False, + from_data=False, earray=None, fsarray=None): + self.id2eword = [] + self.id2fword = [] + self.eword2id = {} + self.fword2id = {} + self.e_index = cintlist.CIntList() + self.f_index = cintlist.CIntList() + self.col1 = cfloatlist.CFloatList() + self.col2 = cfloatlist.CFloatList() + if from_binary: + self.read_binary(filename) + else: + if from_data: + self.compute_from_data(fsarray, earray, filename) + else: + self.read_text(filename) + '''print "self.eword2id" + print "=============" + for x in self.eword2id: + print x + print "self.fword2id" + print "=============" + for x in self.fword2id: + print x + print "-------------"''' + + + cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, calignment.Alignment aa): + cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs + cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned + cdef _node** dict + cdef int *fmargin, *emargin, *count + cdef bytes word + cdef int null_word + + null_word = 0 + for word in fsa.darray.id2word: # I miss list comprehensions + self.id2fword.append(word) + self.id2fword[null_word] = "NULL" + for id, word in enumerate(self.id2fword): + self.fword2id[word] = id + + for word in eda.id2word: + self.id2eword.append(word) + 
self.id2eword[null_word] = "NULL" + for id, word in enumerate(self.id2eword): + self.eword2id[word] = id + + num_pairs = 0 + + V_E = len(eda.id2word) + V_F = len(fsa.darray.id2word) + fmargin = <int*> malloc(V_F*sizeof(int)) + emargin = <int*> malloc(V_E*sizeof(int)) + memset(fmargin, 0, V_F*sizeof(int)) + memset(emargin, 0, V_E*sizeof(int)) + + dict = <_node**> malloc(V_F*sizeof(_node*)) + memset(dict, 0, V_F*sizeof(_node*)) + + num_sents = len(fsa.darray.sent_index) + for sent_id from 0 <= sent_id < num_sents-1: + + fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id] + I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1 + faligned = <int*> malloc(I*sizeof(int)) + memset(faligned, 0, I*sizeof(int)) + + esent = eda.data.arr + eda.sent_index.arr[sent_id] + J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1 + ealigned = <int*> malloc(J*sizeof(int)) + memset(ealigned, 0, J*sizeof(int)) + + links = aa._get_sent_links(sent_id, &num_links) + + for l from 0 <= l < num_links: + i = links[l*2] + j = links[l*2+1] + if i >= I or j >= J: + sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1)) + assert i < I + assert j < J + f_i = fsent[i] + e_j = esent[j] + fmargin[f_i] = fmargin[f_i]+1 + emargin[e_j] = emargin[e_j]+1 + if dict[f_i] == NULL: + dict[f_i] = new_node(e_j) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + # add count + faligned[i] = 1 + ealigned[j] = 1 + for i from 0 <= i < I: + if faligned[i] == 0: + f_i = fsent[i] + fmargin[f_i] = fmargin[f_i] + 1 + emargin[null_word] = emargin[null_word] + 1 + if dict[f_i] == NULL: + dict[f_i] = new_node(null_word) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], null_word) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + for j from 0 <= j < J: + if ealigned[j] == 0: + e_j = esent[j] + fmargin[null_word] = fmargin[null_word] + 1 + emargin[e_j] = emargin[e_j] + 1 + if dict[null_word] == NULL: + dict[null_word] = new_node(e_j) + dict[null_word].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[null_word], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + free(links) + free(faligned) + free(ealigned) + self.f_index = cintlist.CIntList(initial_len=V_F) + self.e_index = cintlist.CIntList(initial_len=num_pairs) + self.col1 = cfloatlist.CFloatList(initial_len=num_pairs) + self.col2 = cfloatlist.CFloatList(initial_len=num_pairs) + + num_pairs = 0 + for i from 0 <= i < V_F: + #self.f_index[i] = num_pairs + self.f_index.set(i, num_pairs) + if dict[i] != NULL: + self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin) + del_node(dict[i]) + free(fmargin) + free(emargin) + free(dict) + return + + + cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin): + cdef int loc + if n.smaller != NULL: + self._add_node(n.smaller, num_pairs, fmargin, emargin) + loc = num_pairs[0] + self.e_index.set(loc, n.key) + self.col1.set(loc, float(n.val)/fmargin) + self.col2.set(loc, float(n.val)/float(emargin[n.key])) + num_pairs[0] = loc + 1 + if n.bigger != NULL: + self._add_node(n.bigger, num_pairs, fmargin, emargin) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.f_index.write_handle(f) + self.e_index.write_handle(f) + 
self.col1.write_handle(f) + self.col2.write_handle(f) + self.write_wordlist(self.id2fword, f) + self.write_wordlist(self.id2eword, f) + fclose(f) + + + cdef write_wordlist(self, wordlist, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + num_words = len(wordlist) + fwrite(&(num_words), sizeof(int), 1, f) + for word in wordlist: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + cdef read_wordlist(self, word2id, id2word, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + + fread(&(num_words), sizeof(int), 1, f) + for i from 0 <= i < num_words: + fread(&(word_len), sizeof(int), 1, f) + c_word = <char*> malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + word2id[py_word] = len(id2word) + id2word.append(py_word) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.f_index.read_handle(f) + self.e_index.read_handle(f) + self.col1.read_handle(f) + self.col2.read_handle(f) + self.read_wordlist(self.fword2id, self.id2fword, f) + self.read_wordlist(self.eword2id, self.id2eword, f) + fclose(f) + + + def get_e_id(self, eword): + if eword not in self.eword2id: + e_id = len(self.id2eword) + self.id2eword.append(eword) + self.eword2id[eword] = e_id + return self.eword2id[eword] + + + def get_f_id(self, fword): + if fword not in self.fword2id: + f_id = len(self.id2fword) + self.id2fword.append(fword) + self.fword2id[fword] = f_id + return self.fword2id[fword] + + + def read_text(self, filename): + cdef i, j, w, e_id, f_id, n_f, n_e, N + cdef cintlist.CIntList fcount + + fcount = cintlist.CIntList() + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + + # first loop merely establishes size of array objects + sys.stderr.write("Initial read...\n") + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + while f_id >= len(fcount): + fcount.append(0) + fcount.arr[f_id] = fcount.arr[f_id] + 1 + + # Allocate space for dictionary in arrays + N = 0 + n_f = len(fcount) + self.f_index = cintlist.CIntList(initial_len=n_f+1) + for i from 0 <= i < n_f: + self.f_index.arr[i] = N + N = N + fcount.arr[i] + fcount.arr[i] = 0 + self.f_index.arr[n_f] = N + self.e_index = cintlist.CIntList(initial_len=N) + self.col1 = cfloatlist.CFloatList(initial_len=N) + self.col2 = cfloatlist.CFloatList(initial_len=N) + + # Re-read file, placing words into buckets + sys.stderr.write("Bucket sort...\n") + f.seek(0) + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + index = self.f_index.arr[f_id] + fcount.arr[f_id] + fcount.arr[f_id] = fcount.arr[f_id] + 1 + self.e_index.arr[index] = int(e_id) + self.col1[index] = float(score1) + self.col2[index] = float(score2) + f.close() + + sys.stderr.write("Final sort...\n") + # Sort buckets by eword + for b from 0 <= b < n_f: + i = self.f_index.arr[b] + j = self.f_index.arr[b+1] + self.qsort(i,j, "") + + + cdef swap(self, int i, int j): + cdef int itmp + cdef float ftmp + + if i == j: + return + + itmp = self.e_index.arr[i] + self.e_index.arr[i] = self.e_index.arr[j] + self.e_index.arr[j] = itmp + + ftmp = self.col1.arr[i] + self.col1.arr[i] = self.col1.arr[j] + self.col1.arr[j] = ftmp + + ftmp = 
self.col2.arr[i] + self.col2.arr[i] = self.col2.arr[j] + self.col2.arr[j] = ftmp + + + cdef qsort(self, int i, int j, pad): + cdef int pval, p + + if i > j: + raise Exception("Sort error in CLex") + if i == j: #empty interval + return + if i == j-1: # singleton interval + return + + p = (i+j)/2 + pval = self.e_index.arr[p] + self.swap(i, p) + p = i + for k from i+1 <= k < j: + if pval >= self.e_index.arr[k]: + self.swap(p+1, k) + self.swap(p, p+1) + p = p + 1 + self.qsort(i,p, pad+" ") + self.qsort(p+1,j, pad+" ") + + + def write_enhanced(self, filename): + f = open(filename, "w") + for i in self.f_index: + f.write("%d " % i) + f.write("\n") + for i, s1, s2 in zip(self.e_index, self.col1, self.col2): + f.write("%d %f %f " % (i, s1, s2)) + f.write("\n") + for i, w in enumerate(self.id2fword): + f.write("%d %s " % (i, w)) + f.write("\n") + for i, w in enumerate(self.id2eword): + f.write("%d %s " % (i, w)) + f.write("\n") + f.close() + + + def get_score(self, fword, eword, col): + cdef e_id, f_id, low, high, midpoint, val + #print "get_score fword=",fword,", eword=",eword,", col=",col + + if eword not in self.eword2id: + return None + if fword not in self.fword2id: + return None + f_id = self.fword2id[fword] + e_id = self.eword2id[eword] + low = self.f_index.arr[f_id] + high = self.f_index.arr[f_id+1] + while high - low > 0: + midpoint = (low+high)/2 + val = self.e_index.arr[midpoint] + if val == e_id: + if col == 0: + return self.col1.arr[midpoint] + if col == 1: + return self.col2.arr[midpoint] + if val > e_id: + high = midpoint + if val < e_id: + low = midpoint + 1 + return None + + + def write_text(self, filename): + """Note: does not guarantee writing the dictionary in the original order""" + cdef i, N, e_id, f_id + + f = open(filename, "w") + N = len(self.e_index) + f_id = 0 + for i from 0 <= i < N: + while self.f_index.arr[f_id+1] == i: + f_id = f_id + 1 + e_id = self.e_index.arr[i] + score1 = self.col1.arr[i] + score2 = self.col2.arr[i] + f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2)) + f.close() + + diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd new file mode 100644 index 00000000..3aaaa2a3 --- /dev/null +++ b/sa-extract/cmath.pxd @@ -0,0 +1,2 @@ +cdef extern from "math.h": + double log(double) diff --git a/sa-extract/cn.py b/sa-extract/cn.py new file mode 100644 index 00000000..e534783f --- /dev/null +++ b/sa-extract/cn.py @@ -0,0 +1,164 @@ +# cn.py +# Chris Dyer <redpony@umd.edu> +# Copyright (c) 2006 University of Maryland. + +# vim:tabstop=4:autoindent:expandtab + +import sys +import math +import sym +import log +import sgml + +epsilon = sym.fromstring('*EPS*'); + +class CNStats(object): + def __init__(self): + self.read = 0 + self.colls = 0 + self.words = 0 + + def collect(self, cn): + self.read += 1 + self.colls += cn.get_length() + for col in cn.columns: + self.words += len(col) + + def __str__(self): + return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read)) + +class ConfusionNet(object): + def __init__(self, sent): + object.__init__(self) + if (len(sent.words) == 0): + self.columns = () + return # empty line, it happens + line = sent.words[0] + if (line.startswith("(((")): + if (len(sent.words) > 1): + log.write("Bad sentence: %s\n" % (line)) + assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets! 
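+ # Note (format illustration): such a line is a Python literal with one
+ # tuple per column, and each column is a tuple of (word, cost(s), span)
+ # triples, e.g. ((('ein',0.5,1),('eine',0.5,1)),) for a one-column net;
+ # <s> and </s> columns are wrapped around it below, then eval() parses it.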
+ line = "((('<s>',1.0,1),),"+line[1:len(line)-1]+"(('</s>',1.0,1),))" + cols = eval(line) + res = [] + for col in cols: + x = [] + for alt in col: + costs = alt[1] + if (type(costs) != type((1,2))): + costs=(float(costs),) + j=[] + for c in costs: + j.append(float(c)) + cost = tuple(j) + spanlen = 1 + if (len(alt) == 3): + spanlen = alt[2] + x.append((sym.fromstring(alt[0],terminal=True), None, spanlen)) + res.append(tuple(x)) + self.columns = tuple(res) + else: # convert a string of input into a CN + res = []; + res.append(((sym.fromstring('<s>',terminal=True), None, 1), )) + for word in sent.words: + res.append(((sym.fromstring(word,terminal=True), None, 1), )); # (alt=word, cost=0.0) + res.append(((sym.fromstring('</s>',terminal=True), None, 1), )) + self.columns = tuple(res) + + def is_epsilon(self, position): + x = self.columns[position[0]][position[1]][0] + return x == epsilon + + def compute_epsilon_run_length(self, cn_path): + if (len(cn_path) == 0): + return 0 + x = len(cn_path) - 1 + res = 0 + ''' -1 denotes a non-terminal ''' + while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])): + res += 1 + x -= 1 + return res + + def compute_cn_cost(self, cn_path): + c = None + for (col, row) in cn_path: + if (col >= 0): + if c is None: + c = self.columns[col][row][1].clone() + else: + c += self.columns[col][row][1] + return c + + def get_column(self, col): + return self.columns[col] + + def get_length(self): + return len(self.columns) + + def __str__(self): + r = "conf net: %d\n" % (len(self.columns),) + i = 0 + for col in self.columns: + r += "%d -- " % i + i += 1 + for alternative in col: + r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2]) + r += "\n" + return r + + def listdown(_columns, col = 0): + # output all the possible sentences out of the self lattice + # will be used by the "dumb" adaptation of lattice decoding with suffix array + result = [] + for entry in _columns[col]: + if col+entry[2]+1<=len(_columns) : + for suffix in self.listdown(_columns,col+entry[2]): + result.append(entry[0]+' '+suffix) + #result.append(entry[0]+' '+suffix) + else: + result.append(entry[0]) + #result.append(entry[0]) + return result + + def next(self,_columns,curr_idx, min_dist=1): + # can be used only when prev_id is defined + result = [] + #print "curr_idx=%i\n" % curr_idx + if curr_idx+min_dist >= len(_columns): + return result + for alt_idx in xrange(len(_columns[curr_idx])): + alt = _columns[curr_idx][alt_idx] + #print "checking %i alternative : " % alt_idx + #print "%s %f %i\n" % (alt[0],alt[1],alt[2]) + #print alt + if alt[2]<min_dist: + #print "recursive next(%i, %i, %i)\n" % (curr_idx,alt_idx,min_dist-alt[2]) + result.extend(self.next(_columns,curr_idx+alt[2],min_dist-alt[2])) + elif curr_idx+alt[2]<len(_columns): + #print "adding because the skip %i doesn't go beyong the length\n" % alt[2] + result.append(curr_idx+alt[2]) + return set(result) + + + + +#file = open(sys.argv[1], "rb") +#sent = sgml.process_sgml_line(file.read()) +#print sent +#cn = ConfusionNet(sent) +#print cn +#results = cn.listdown() +#for result in results: +# print sym.tostring(result) +#print cn.next(0); +#print cn.next(1); +#print cn.next(2); +#print cn.next(3); +#print cn +#cn = ConfusionNet() +#k = 0 +#while (cn.read(file)): +# print cn + +#print cn.stats diff --git a/sa-extract/compile_bin.py b/sa-extract/compile_bin.py new file mode 100755 index 00000000..0196e552 --- /dev/null +++ b/sa-extract/compile_bin.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +'''This 
program compiles/decompiles binary data objects used +by the decoder''' + +import sys +import cdat +import calignment +import csuf +import clex +import precomputation +#import parse +import monitor +import optparse + +def main(argv=None): + '''Call this from the command-line to create a + pre-computed binary data array for later use''' + if argv is None: + argv = sys.argv + + parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+ + "\n\nNote: -d,-s,-a, and -p are mutually exclusive") + parser.add_option("-d", "--data-array", + action="store_true", default=False, + dest="da", help="Compile file into data array (default)") + parser.add_option("-s", "--suffix-array", + action="store_true", default=False, + dest="sa", help="Compile file into suffix array") + parser.add_option("-a", "--alignment", + action="store_true", default=False, + dest="a", help="Compile file into alignment") + parser.add_option("-l", "--lexical", + action="store_true", default=False, + dest="l", help="Compile file into lex file") + parser.add_option("-x", "--compute_lexical", action="store", nargs=2, + dest="lex_args", help="Compute lex file from data", + metavar="<f file> <e file>") + parser.add_option("-p", "--parse", + action="store_true", default=False, + dest="p", help="Compile file into parse") + parser.add_option("-b", "--binary-infile", + action="store_true", default=False, + dest="bin", help="Input file is binary (default: text)") + parser.add_option("-t", "--text-outfile", + action="store_true", default=False, + dest="text", help="Output file is text (default: binary)") + parser.add_option("-e", "--enhanced-outfile", + action="store_true", default=False, + dest="enhanced", help="Output file is enhanced text (default: binary)") + parser.add_option("-r", action="store", nargs=7, + dest="precomp_args", help="Precompute collocations (Hiero only)", + metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>") + (options, args) = parser.parse_args() + + filetype_opts = [options.da, options.sa, options.a, options.p] + + if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2: + parser.print_help() + sys.exit(1) + + (infilename, outfilename) = args + if options.bin: + bin = " binary" + else: + bin = "" + + start_time = monitor.cpu() + if options.precomp_args: + if options.bin: + obj = precomputation.Precomputation(infilename, from_binary=True) + else: + keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"]) + precomp_opts = {} + sys.stderr.write("Precomputing statistics for list %s\n" % infilename) + for pair in options.precomp_args: + (key, val) = pair.split("=") + if key in keys: + keys.remove(key) + if key != "sa": + val = int(val) + precomp_opts[key] = val + else: + sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key) + return 1 + sa = csuf.SuffixArray(precomp_opts["sa"], True) + obj = precomputation.Precomputation(infilename, sa, + precompute_rank=precomp_opts["rank1"], + precompute_secondary_rank=precomp_opts["rank2"], + max_length=precomp_opts["max-len"], + max_nonterminals=precomp_opts["max-nt"], + train_max_initial_size=precomp_opts["max-size"], + train_min_gap_size=precomp_opts["min-gap"]) + elif options.sa: + sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin)) + obj = csuf.SuffixArray(infilename, options.bin) + elif options.a: + sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, 
bin)) + obj = calignment.Alignment(infilename, options.bin) + elif options.p: + sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin)) + obj = parse.ParseArray(infilename, options.bin) + elif options.l: + sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin)) + obj = clex.CLex(infilename, options.bin) + elif options.lex_args: + ffile = options.lex_args[0] + efile = options.lex_args[1] + sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile)) + fsarray = csuf.SuffixArray(ffile, True) + earray = cdat.DataArray(efile, True) + aarray = calignment.Alignment(infilename, True) + obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray) + else: + sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin)) + obj = cdat.DataArray(infilename, options.bin) + + sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time)) + start_time = monitor.cpu() + if options.text: + sys.stderr.write("Writing text file %s...\n" % outfilename) + obj.write_text(outfilename) + elif options.enhanced: + sys.stderr.write("Writing enhanced text file %s...\n" % outfilename) + obj.write_enhanced(outfilename) + else: + sys.stderr.write("Writing binary file %s...\n" % outfilename) + obj.write_binary(outfilename) + sys.stderr.write("Finished.\n") + sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time)) + + mem_use = float(monitor.memory()) + metric = "B" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "KB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "MB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "GB" + sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric)) + + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py new file mode 100644 index 00000000..8cb6c174 --- /dev/null +++ b/sa-extract/context_model.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +import sys +import model +import sym +import log +import math + +class ContextModel(model.Model): + '''A ContextModel is one that is computed using information + from the Context object''' + + def __init__(self, context_manager, default=0.0): + model.Model.__init__(self) + self.wordless = 0 + self.initial = None + self.default = default + self.context_manager = context_manager + self.id = self.context_manager.add_model(self) + + '''The next feature is true if the model depends in + some way on the entire input sentence; that is, if + it cannot be scored when created, but must be scored + no earlier than during the input method (note that + this is less strict than stateful)''' + self.contextual = True + ''' It may seem somewhat counterintuitive that a + ContextModel can be non-contextual, but a good + example is the rule probabilites; although these + are computed using the Context object, they don't + really depend in any way on context''' + + + '''inherited from model.Model, called once for each input sentence''' + def input(self, fwords, meta): + # all ContextModels must make this call + self.context_manager.input(self, fwords, meta) + + + '''This function will be called via the input method + only for contextual models''' + def compute_contextual_score(self, r): + return 0.0 + + '''This function is only called on rule creation for + contextless models''' + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return 0.0 + + '''Stateless models should not need to + override this function, unless 
they define + something for model.TO_GOAL''' + def transition (self, r, antstates, i, j, j1=None): + return (None, 0.0) + + def estimate(self, r): + return r.getscore("context", self.id) + + def transition(self, r, antstates, i, j, j1=None): + return (None, r.getscore("context", self.id)) + + def finaltransition(self, state): + return 0.0 + + def rescore(self, ewords, score): + return score + + + +'''p(e|f)''' +class EgivenF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fcount) + return -math.log10(prob) + +class CountEF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1.0 + float(paircount)) + +class SampleCountF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1.0 + float(fsample_count)) + + + +class EgivenFCoherent(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fsample_count) + #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob + if (prob == 0.0): return 99.0 + return -math.log10(prob) + + + +class CoherenceProb(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(fcount)/float(fsample_count) + return -math.log10(prob) + + + +class MaxLexEgivenF(ContextModel): + + def __init__(self, context_manager, ttable, col=0): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + fwords.append("NULL") + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + for e in ewords: + maxScore = 0.0 + for f in fwords: + score = self.ttable.get_score(f, e, self.col) + #print "score(MaxLexEgivenF) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class MaxLexFgivenE(ContextModel): + + def __init__(self, context_manager, ttable, col=1): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + ewords.append("NULL") + for f in fwords: + maxScore = 0.0 + for e in ewords: + score = 
self.ttable.get_score(f, e, self.col) + #print "score(MaxLexFgivenE) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class IsSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount==1) + + +class IsSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount==1) + +class IsNotSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount>1) + + +class IsNotSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount>1) + + +class IsFEGreaterThanZero(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 0.01) + + diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd new file mode 100644 index 00000000..53becbc6 --- /dev/null +++ b/sa-extract/cstrmap.pxd @@ -0,0 +1,12 @@ +cdef extern from "strmap.h": + ctypedef struct StrMap + StrMap* stringmap_new() + void stringmap_delete(StrMap *vocab) + int stringmap_index(StrMap *vocab, char *s) + char* stringmap_word(StrMap *vocab, int i) + +cdef class StringMap: + cdef StrMap *vocab + cdef char *word(self, int i) + cdef int index(self, char *s) + diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx new file mode 100644 index 00000000..d3883ea5 --- /dev/null +++ b/sa-extract/cstrmap.pyx @@ -0,0 +1,14 @@ + +cdef class StringMap: + def __cinit__(self): + self.vocab = stringmap_new() + + def __dealloc__(self): + stringmap_delete(self.vocab) + + cdef char *word(self, int i): + return stringmap_word(self.vocab, i) + + cdef int index(self, char *s): + return stringmap_index(self.vocab, s) + diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd new file mode 100644 index 00000000..f44167dd --- /dev/null +++ b/sa-extract/csuf.pxd @@ -0,0 +1,11 @@ +cimport cdat +cimport cintlist + +cdef class SuffixArray: + cdef cdat.DataArray darray + cdef cintlist.CIntList sa + cdef cintlist.CIntList ha + cdef __lookup_helper(self, int word_id, int offset, int low, int high) + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint) + cdef __search_low(self, int word_id, int offset, int low, int high) + cdef __search_high(self, word_id, offset, low, high) diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx new file mode 100644 index 00000000..64c44788 --- /dev/null +++ b/sa-extract/csuf.pyx @@ -0,0 +1,321 @@ +# csuf.pyx +# Defines suffix arrays that can be directly written to/read from disk in binary format +# Adam Lopez <alopez@cs.umd.edu> + +import sys +import log +import cdat +import cintlist +import monitor + +from 
libc.stdio cimport FILE, fclose, fopen + +cdef class SuffixArray: + + def __init__(self, filename, from_binary=False): + self.darray = cdat.DataArray() + self.sa = cintlist.CIntList() + self.ha = cintlist.CIntList() + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __getitem__(self, i): + return self.sa.arr[i] + + + def getSentId(self, i): + return self.darray.getSentId(i) + + + def getSent(self, i): + return self.darray.getSent(i) + + + def getSentPos(self, loc): + return self.darray.getSentPos(loc) + + def read_text(self, filename): + '''Constructs suffix array using the algorithm + of Larsson & Sadahkane (1999)''' + cdef int V, N, i, j, h, a_i, n, current_run, skip + cdef cintlist.CIntList isa, word_count + + self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True) + N = len(self.darray) + V = len(self.darray.id2word) + + self.sa = cintlist.CIntList(initial_len=N) + self.ha = cintlist.CIntList(initial_len=V+1) + + isa = cintlist.CIntList(initial_len=N) + word_count = cintlist.CIntList(initial_len=V+1) + + '''Step 1: bucket sort data''' + sort_start_time = monitor.cpu() + start_time = sort_start_time + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + n = 0 + for i from 0 <= i < V+1: + self.ha.arr[i] = n + n = n + word_count.arr[i] + word_count.arr[i] = 0 + + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i + isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + '''Determine size of initial runs''' + current_run = 0 + for i from 0 <= i < V+1: + if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1: + current_run = current_run + 1 + else: + if current_run > 0: + self.sa.arr[self.ha.arr[i] - current_run] = -current_run + current_run = 0 + + sys.stderr.write(" Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 2: prefix-doubling sort''' + h = 1 + while self.sa.arr[0] != -N: + sort_start_time = monitor.cpu() + sys.stderr.write(" Refining, sort depth = %d\n" % (h,)) + i = 0 + skip = 0 + while i < N: + if self.sa.arr[i] < 0: + #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1)) + skip = skip + self.sa.arr[i] + i = i - self.sa.arr[i] + else: + if skip < 0: + self.sa.arr[i+skip] = skip + skip = 0 + j = isa.arr[self.sa.arr[i]] + #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]])) + self.q3sort(i, j, h, isa) + i = j+1 + if skip < 0: + self.sa.arr[i+skip] = skip + h = h * 2 + sys.stderr.write(" Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 3: read off suffix array from inverse suffix array''' + sys.stderr.write(" Finalizing sort...\n") + for i from 0 <= i < N: + j = isa.arr[i] + self.sa.arr[j] = i + sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time)) + + def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""): + '''This is a ternary quicksort. It divides the array into + three partitions: items less than the pivot, items equal + to pivot, and items greater than pivot. 
The first and last + of these partitions are then recursively sorted''' + cdef int k, midpoint, pval, phead, ptail, tmp + + if j-i < -1: + raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j)) + if j-i == -1: # recursive base case -- empty interval + return + if (j-i == 0): # recursive base case -- singleton interval + isa.arr[self.sa.arr[i]] = i + self.sa.arr[i] = -1 + return + + # NOTE: choosing the first item as a pivot value resulted in + # stack overflow for some very large buckets. I think there + # is a natural bias towards order due the way the word ids are + # assigned; thus this resulted in the range to the left of the + # pivot being nearly empty. Therefore, choose the middle item. + # If the method of assigning word_id's is changed, this method + # may need to be reconsidered as well. + midpoint = (i+j)/2 + pval = isa.arr[self.sa.arr[midpoint] + h] + if i != midpoint: + tmp = self.sa.arr[midpoint] + self.sa.arr[midpoint] = self.sa.arr[i] + self.sa.arr[i] = tmp + phead = i + ptail = i + + # find the three partitions. phead marks the first element + # of the middle partition, and ptail marks the last element + for k from i+1 <= k < j+1: + if isa.arr[self.sa.arr[k] + h] < pval: + if k > ptail+1: + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = tmp + else: # k == ptail+1 + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = tmp + phead = phead + 1 + ptail = ptail + 1 + else: + if isa.arr[self.sa.arr[k] + h] == pval: + if k > ptail+1: + tmp = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = self.sa.arr[k] + self.sa.arr[k] = tmp + ptail = ptail + 1 + + # recursively sort smaller suffixes + self.q3sort(i, phead-1, h, isa, pad+" ") + + # update suffixes with pivot value + # corresponds to update_group function in Larsson & Sadakane + for k from phead <= k < ptail+1: + isa.arr[self.sa.arr[k]] = ptail + if phead == ptail: + self.sa.arr[phead] = -1 + + # recursively sort larger suffixes + self.q3sort(ptail+1, j, h, isa, pad+" ") + + + def write_text(self, filename): + self.darray.write_text(filename) + + + def read_binary(self, filename): + cdef FILE *f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.darray.read_handle(f) + self.sa.read_handle(f) + self.ha.read_handle(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.darray.write_handle(f) + self.sa.write_handle(f) + self.ha.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + self.darray.write_enhanced_handle(f) + for a_i in self.sa: + f.write("%d " % a_i) + f.write("\n") + for w_i in self.ha: + f.write("%d " % w_i) + f.write("\n") + f.close() + + + cdef __search_high(self, word_id, offset, low, high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_high(word_id, offset, midpoint+1, high) + else: + return self.__search_high(word_id, offset, low, midpoint) + + + cdef __search_low(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_low(word_id, offset, low, midpoint) + else: + return 
self.__search_low(word_id, offset, midpoint+1, high) + + + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint): + return (self.__search_low(word_id, offset, low, midpoint), + self.__search_high(word_id, offset, midpoint, high)) + + + cdef __lookup_helper(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if offset == 0: + return (self.ha.arr[word_id], self.ha.arr[word_id+1]) + if low >= high: + return None + + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__get_range(word_id, offset, low, high, midpoint) + if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id: + return self.__lookup_helper(word_id, offset, low, midpoint) + else: + return self.__lookup_helper(word_id, offset, midpoint+1, high) + + + def lookup(self, word, offset, int low, int high): + if low == -1: + low = 0 + if high == -1: + high = len(self.sa) + if word in self.darray.word2id: + word_id = self.darray.word2id[word] + return self.__lookup_helper(word_id, offset, low, high) + else: + return None + + + + def print_sa(self, isa): + '''Slow; Use only in case of emergency''' + cdef int i, j, k, N + cdef cintlist.CIntList tmp_sa + + N = len(self.sa) + for i from 0 <= i < N: + sys.stderr.write("%2d " % i) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % self.darray.data.arr[i]) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % isa.arr[i]) + sys.stderr.write("\n\n\n") + + # Recover partially sorted array + tmp_sa = cintlist.CIntList(initial_len=N) + for i from 0 <= i < N: + j = isa.arr[i] + tmp_sa.arr[j] = i + for i from 0 <= i < N: + if self.sa.arr[i] > 0: + tmp_sa.arr[i] = self.sa.arr[i] + + for i from 0 <= i < N: + j = tmp_sa.arr[i] + sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i])) + for k from j <= k < N: + sys.stderr.write("%2d " % self.darray.data.arr[k]) + sys.stderr.write("\n") + sys.stderr.write("\n") + + + + + diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd new file mode 100644 index 00000000..8967f8e3 --- /dev/null +++ b/sa-extract/cveb.pxd @@ -0,0 +1,15 @@ +cdef struct _VEB: + int top_universe_size + int num_bottom_bits + int max_val + int min_val + int size + void* top + void** bottom + + +cdef class VEB: + cdef _VEB* veb + cdef int _findsucc(self, int i) + cdef int _insert(self, int i) + cdef int _first(self) diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx new file mode 100644 index 00000000..ca87becc --- /dev/null +++ b/sa-extract/cveb.pyx @@ -0,0 +1,390 @@ +#!/usr/bin/env python2.4 +'''This module implements a partial stratified tree (van Emde Boas, 1977). +Only insert findsucc, __iter__, and __contains__ are implemented. +Delete is currently not supported. +There is very little error-checking in this code -- it is designed +to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007), +which doesn't cover all of the possible ways that you could misuse it +(e.g. trying to insert a key larger than the universe size) +Other notes -- this code is really rather ugly C code masquerading as +Pyrex/Python. 
Virtual function calls are bypassed by hand in several +places for the sake of efficiency, and other Python niceties are +removed for the same reason.''' + +from libc.stdlib cimport malloc, free +from libc.math cimport log, ceil +from libc.string cimport memset + +cdef int MIN_BOTTOM_SIZE +cdef int MIN_BOTTOM_BITS + +MIN_BOTTOM_SIZE = 32 +MIN_BOTTOM_BITS = 5 + +cdef int lower_mask[32] +cdef int i, mask + +for i from 0 <= i < MIN_BOTTOM_SIZE: + mask = (mask << 1) + 1 + lower_mask[i] = mask + + +cdef struct _BitSet: + long bitset + int min_val + int max_val + int size + + +cdef _BitSet* new_BitSet(): + cdef _BitSet* b + + b = <_BitSet*> malloc(sizeof(_BitSet)) + b.bitset = 0 + b.min_val = -1 + b.max_val = -1 + b.size = 0 + return b + + +cdef int bitset_findsucc(_BitSet* b, int i): + cdef int bitset, mask + cdef int low, high, mid + + if b.max_val == -1 or i >= b.max_val: + return -1 + if i < b.min_val: + return b.min_val + + bitset = b.bitset & ~lower_mask[i] + low = i+1 + high = b.max_val+1 + while low < high-1: + mid = (high + low)/2 + mask = ~(lower_mask[high-1] ^ lower_mask[mid-1]) + if bitset & mask == 0: + low = mid + else: + bitset = bitset & mask + high = mid + return low + + +cdef int bitset_insert(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + b.bitset = b.bitset | val + if b.size == 0: + b.min_val = i + b.max_val = i + else: + if i < b.min_val: + b.min_val = i + if i > b.max_val: + b.max_val = i + b.size = b.size + 1 + return 1 + return 0 + + +cdef int bitset_contains(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + return 0 + else: + return 1 + + +cdef class BitSetIterator: + cdef _BitSet* b + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = bitset_findsucc(self.b, ret_val) + return ret_val + + + +# This is a Python wrapper class to give access to the +# (entirely C-implemented) _BitSet struct. 
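+# A hypothetical usage sketch (all names are defined in this file; the
+# commented results assume a freshly created, empty BitSet):
+#
+#   b = BitSet()    # holds integers in [0, MIN_BOTTOM_SIZE), i.e. [0, 32)
+#   b.insert(3)     # returns 1 for a newly inserted element, 0 if present
+#   b.insert(17)
+#   b.findsucc(3)   # returns 17, the smallest element strictly greater than 3
+#   17 in b         # True, via bitset_contains()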
+# Very slow; use only for debugging +cdef class BitSet: + + cdef _BitSet* b + + def __cinit__(self): + self.b = new_BitSet() + + def __dealloc__(self): + free(self.b) + + def __iter__(self): + cdef BitSetIterator it + it = BitSetIterator() + it.b = self.b + it.next_val = self.b.min_val + return it + + def insert(self, i): + return bitset_insert(self.b, i) + + def findsucc(self, i): + return bitset_findsucc(self.b, i) + + def __str__(self): + return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")" + + def min(self): + return self.b.min_val + + def max(self): + return self.b.max_val + + def __len__(self): + return self.b.size + + def __contains__(self, i): + return bool(bitset_contains(self.b, i)) + + +def dec2bin(i): + cdef d + result = "" + for d from 0 <= d < MIN_BOTTOM_SIZE: + if i & lower_mask[0] == 0: + result = "0"+result + else: + result = "1"+result + i = i >> 1 + return result + + +cdef _VEB* new_VEB(int n): + cdef _VEB* veb + cdef int num_bits, num_top_bits, i + + veb = <_VEB*> malloc(sizeof(_VEB)) + + num_bits = int(ceil(log(n) / log(2))) + veb.num_bottom_bits = num_bits/2 + if veb.num_bottom_bits < MIN_BOTTOM_BITS: + veb.num_bottom_bits = MIN_BOTTOM_BITS + veb.top_universe_size = (n >> veb.num_bottom_bits) + 1 + + veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*)) + memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*)) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + veb.top = new_VEB(veb.top_universe_size) + else: + veb.top = new_BitSet() + + veb.max_val = -1 + veb.min_val = -1 + veb.size = 0 + return veb + + +cdef int VEB_insert(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, tmp + + if veb.size == 0: + veb.min_val = i + veb.max_val = i + elif i == veb.min_val or i == veb.max_val: + return 0 + else: + if i < veb.min_val: + tmp = i + i = veb.min_val + veb.min_val = tmp + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + VEB_insert(subv, a) + else: + subb = <_BitSet*> veb.top + bitset_insert(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits) + else: + veb.bottom[a] = new_BitSet() + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + if VEB_insert(subv, b) == 0: + return 0 + else: + subb = <_BitSet*> veb.bottom[a] + if bitset_insert(subb, b) == 0: + return 0 + + if i > veb.max_val: + veb.max_val = i + veb.size = veb.size + 1 + return 1 + + +cdef del_VEB(_VEB* veb): + cdef int i + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = (<_VEB*> veb.top).min_val + else: + i = (<_BitSet*> veb.top).min_val + + while i != -1: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + del_VEB(<_VEB*> veb.bottom[i]) + else: + free(<_BitSet*> veb.bottom[i]) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = VEB_findsucc(<_VEB*> veb.top, i) + else: + i = bitset_findsucc(<_BitSet*> veb.top, i) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + del_VEB(<_VEB*> veb.top) + else: + free(<_BitSet*> veb.top) + free(veb.bottom) + free(veb) + + +cdef int VEB_findsucc(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, j, c, found + + if veb.max_val == -1 or i>=veb.max_val: + return -1 + if i < veb.min_val: + return veb.min_val + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + found = 0 + if veb.bottom[a] != NULL: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = 
<_VEB*> veb.bottom[a] + if subv.max_val > b: + j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b) + found = 1 + else: + subb = <_BitSet*> veb.bottom[a] + if subb.max_val > b: + j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b) + found = 1 + if found==0: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + c = VEB_findsucc(subv, a) + else: + subb = <_BitSet*> veb.top + c = bitset_findsucc(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subv.min_val + else: + subb = <_BitSet*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subb.min_val + return j + + +cdef int VEB_contains(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b + + if veb.size == 0 or i < veb.min_val or i > veb.max_val: + return 0 + + if veb.min_val == i: + return 1 + else: + if veb.size == 1: + return 0 + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + return 0 + else: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + return VEB_contains(subv, b) + else: + subb = <_BitSet*> veb.bottom[a] + return bitset_contains(subb, b) + + +cdef class VEBIterator: + cdef _VEB* v + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = VEB_findsucc(self.v, ret_val) + return ret_val + + +cdef class VEB: + + def __init__(self, size): + pass + + def __cinit__(self, int size): + self.veb = new_VEB(size) + + def __dealloc__(self): + del_VEB(self.veb) + + def __iter__(self): + cdef VEBIterator it + it = VEBIterator() + it.v = self.veb + it.next_val = self.veb.min_val + return it + + def insert(self, i): + return VEB_insert(self.veb, i) + + cdef int _insert(self, int i): + return VEB_insert(self.veb, i) + + def findsucc(self, i): + return VEB_findsucc(self.veb, i) + + cdef int _first(self): + return self.veb.min_val + + cdef int _findsucc(self, int i): + return VEB_findsucc(self.veb, i) + + def __len__(self): + return self.veb.size + + def __contains__(self, i): + return VEB_contains(self.veb, i) + + + + + diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl new file mode 100755 index 00000000..02fd7445 --- /dev/null +++ b/sa-extract/escape-testset.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w + +use utf8; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +my @fh = (); +if (scalar @ARGV == 0) { + push @fh, \*STDIN; +} else { + for my $file (@ARGV) { + my $f; + open $f, "<$file" or die "Can't read $file: $!\n"; + binmode $f, ":utf8"; + push @fh, $f; + } +} + +my $id = -1; +for my $f (@fh) { + while(<$f>) { + chomp; + die "Empty line in test set" if /^\s*$/; + die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i; + $id++; + s/&/\&/g; + s/</\</g; + s/>/\>/g; + print "<seg id=\"$id\"> $_ </seg>\n"; + } +} + + diff --git a/sa-extract/example/README b/sa-extract/example/README new file mode 100644 index 00000000..f6eac52b --- /dev/null +++ b/sa-extract/example/README @@ -0,0 +1,8 @@ +Commands to compile a corpus and extract some grammars +====================================================== + +# compile +../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini +# extract +cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini + diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz Binary files differnew file mode 100644 index 
00000000..741de7e4 --- /dev/null +++ b/sa-extract/example/corpus.align.gz diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz Binary files differnew file mode 100644 index 00000000..0d66470a --- /dev/null +++ b/sa-extract/example/corpus.de.gz diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz Binary files differnew file mode 100644 index 00000000..28cb5c58 --- /dev/null +++ b/sa-extract/example/corpus.en.gz diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de new file mode 100644 index 00000000..8923329f --- /dev/null +++ b/sa-extract/example/test.de @@ -0,0 +1,10 @@ +dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen . +taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt . +die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln . +malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen . +indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden . +die who steht vor einer wichtigen entscheidung . +sie muss dringend den handel progressiv liberalisieren . +eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) . +dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte . +robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete . diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en new file mode 100644 index 00000000..e50edcac --- /dev/null +++ b/sa-extract/example/test.ref.en @@ -0,0 +1,10 @@ +this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform . +taiwan is also set to play a constructive role -- if mainland china plays along . +the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules . +malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions . +indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda . +the wto is at a crossroads . +it sorely needs to liberalize trade progressively . +strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) . +this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative . +however , robert zoellick , president bush 's trade representative , needs allies . 
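(For reference: escape-testset.pl above wraps each input line in a numbered <seg> tag after entity-escaping &, < and >, so the first line of test.de would reach extractor.py as

<seg id="0"> dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen . </seg>

with ids counting up from 0.)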
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini new file mode 100644 index 00000000..56913245 --- /dev/null +++ b/sa-extract/extract.ini @@ -0,0 +1,116 @@ +# This .ini file extracts grammars to a file using +# the pattern matching infrastructure. +# +# Does not do any decoding. +# +# Variables can be set using sa-system.pl +# +# Usage: decoder.py -c <this ini file> [-x <grammar file>] +# +# If the -x option is used, grammar will be written to the +# specified file, otherwise it is written to $PWD/grammar.out +# +# NOTE: all information about rules is cached, so use generous +# memory limits (rules themselves are not cached.) + +import os +import manager +import clex +import context_model +import rulefactory +import calignment +import sys + +out_grammar_file = "grammar.out" +if opts.extra: + out_grammar_file = opts.extra + +# *** these variables written by sa-system.pl. Do not modify *** +lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz" +f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin" +e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin" +a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin" +lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin" +max_len = 5 +max_nt = 2 +max_size=10 +min_gap=1 +rank1 = 100 +rank2 = 10 +precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin" + +# check for path errors +if not os.path.exists(f_sa_file): + raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file) +if not os.path.exists(e_file): + raise Exception("Cannot find compiled target language array file %s" % e_file) +if not os.path.exists(a_file): + raise Exception("Cannot find compiled alignment file %s" % a_file) +if not os.path.exists(lex_file): + raise Exception("Cannot find compiled lexical weights file %s" % lex_file) +if not os.path.exists(precompute_file): + log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file) + precompute_file = None + +### Output options +mark_phrases = False # show derivation as SGML markup in output +mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!) + +# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose +log.level = 1 +log.file = sys.stderr + +# pattern-matching stuff +class PhonyGrammar: # saves us the cost of keeping the rules around + def add(self, thing): + pass + +local_grammar = PhonyGrammar() +xcat="X" + +cm = manager.ContextManager( + f_sa_file, + e_file, + sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!) + rulefactory=rulefactory.HieroCachingRuleFactory( + alignment=calignment.Alignment( # compiled alignment object (REQUIRED) + a_file, + from_binary=True + ), + category="["+xcat+"]", # name of generic nonterminal used by Hiero + grammar=local_grammar, # do not change for extraction + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=15, # maximum span of a grammar rule in TEST DATA + max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. 
If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=precompute_file, # filename of file containing precomputed collocations + precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). + precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate a complete grammar for each input sentence + rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars) + train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower) + ), + from_binary=True + ) + +# lexical weighting tables +tt = clex.CLex(lex_file, from_binary=True) + +# Only include features that depend on rule identity here +add_model(context_model.EgivenFCoherent(cm), 0.125) +add_model(context_model.SampleCountF(cm), 0.125) +add_model(context_model.CountEF(cm), 0.125) +add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) +add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) +add_model(context_model.IsSingletonF(cm), 0.125) +add_model(context_model.IsSingletonFE(cm), 0.125) + +# grammars, search parameters and all that other stuff are irrelevant + diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py new file mode 100755 index 00000000..9d66ebf0 --- /dev/null +++ b/sa-extract/extractor.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# vim:expandtab:shiftwidth=4 + +import sys, gc, monitor, sgml +import optparse +import model +import log +import cn + +models = [] + +def add_model(m,w=0.0): + models.append(m) + +def extract_grammar(input): + confnet = cn.ConfusionNet(input) + meta = input.meta + for m in models: + m.input(confnet.columns, meta) + +if __name__ == "__main__": + optparser = optparse.OptionParser() + optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override") + (opts,args) = optparser.parse_args() + + if opts.config is None: + raise ValueError, "You must specify a configuration file." 
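+ # NOTE: the file passed via -c (e.g. extract.ini above) is not parsed
+ # as an INI file; execfile() below runs it as Python in this module's
+ # namespace, which is how its imports, variable assignments, and
+ # add_model() calls take effect.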
+ else: + if log.level >= 1: + log.write("Reading configuration from %s\n" % opts.config) + execfile(opts.config) + + if len(args) >= 1 and args[0] != "-": + input_file = file(args[0], "r") + else: + input_file = sys.stdin + + if len(args) >= 2 and args[1] != "-": + output_file = file(args[1], "w") + else: + output_file = sys.stdout + + gc.collect() + if log.level >= 1: + log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) + log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) + + sents = sgml.read_raw(input_file) + for sent in sents: + mark = sent.getmark() + if mark is not None: + (tag, attrs) = mark + if tag == "seg": + sent.unmark() + dattrs = sgml.attrs_to_dict(attrs) + sent.meta = attrs + extract_grammar(sent) + diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx new file mode 100644 index 00000000..a992d3ee --- /dev/null +++ b/sa-extract/lcp.pyx @@ -0,0 +1,113 @@ +#!/usr/bin/env python2.4 + +'''Compute LCP array for a suffix array using the Kasai et al. algorithm''' +'''Can also be used to compute statistics such +as k most frequent n-grams''' + +import sys + +cimport cintlist +cimport csuf +cimport cdat +cimport cveb + +cdef class LCP: + + cdef csuf.SuffixArray sa + cdef cintlist.CIntList lcp + + def __init__(self, sa): + self._construct(sa) + + cdef _construct(self, csuf.SuffixArray sa): + cdef int i, k, j, h, n + cdef cintlist.CIntList rank + + sys.stderr.write("Constructing LCP array\n") + self.sa = sa + n = self.sa.sa.len + self.lcp = cintlist.CIntList(initial_len=n) + + rank = cintlist.CIntList(initial_len=n) + for i from 0 <= i < n: + rank.arr[sa.sa.arr[i]] = i + + h = 0 + for i from 0 <= i < n: + k = rank.arr[i] + if k == 0: + self.lcp.arr[k] = -1 + else: + j = sa.sa.arr[k-1] + while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: + h = h+1 + self.lcp.arr[k] = h + if h > 0: + h = h-1 + sys.stderr.write("LCP array completed\n") + + + def compute_stats(self, max_n): + self._compute_stats(max_n) + + cdef _compute_stats(self, int max_n): + '''Note: the output of this function is not exact. In + particular, the frequency associated with each word is + not guaranteed to be correct. 
This is due to a bit of + laxness in the design; the function is intended only to + obtain a list of the most frequent words; for this + purpose it is perfectly fine''' + cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid + cdef cintlist.CIntList run_start + cdef cintlist.CIntList ngram_start + cdef cveb.VEB veb + + N = self.sa.sa.len + + ngram_starts = [] + for n from 0 <= n < max_n: + ngram_starts.append(cintlist.CIntList(initial_len=N)) + + run_start = cintlist.CIntList(initial_len=max_n) + veb = cveb.VEB(N) + + for i from 0 <= i < N: + h = self.lcp.arr[i] + if h < 0: + h = 0 + for n from h <= n < max_n: + rs = run_start.arr[n] + run_start.arr[n] = i + freq = i - rs + if freq > 1000: # arbitrary, but see note below + veb._insert(freq) + ngram_start = ngram_starts[n] + while ngram_start.arr[freq] > 0: + freq = freq + 1 # cheating a bit, should be ok for sparse histogram + ngram_start.arr[freq] = rs + i = veb.veb.min_val + while i != -1: + ii = veb._findsucc(i) + for n from 0 <= n < max_n: + ngram_start = ngram_starts[n] + iii = i + rs = ngram_start.arr[iii] + while (ii==-1 or iii < ii) and rs != 0: + j = self.sa.sa.arr[rs] + valid = 1 + for k from 0 <= k < n+1: + if self.sa.darray.data.arr[j+k] < 2: + valid = 0 + if valid: + ngram = "" + for k from 0 <= k < n+1: + ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " " + print i, n+1, ngram + iii = iii + 1 + rs = ngram_start.arr[iii] + i = ii + + + + + diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py new file mode 100755 index 00000000..9df6e82a --- /dev/null +++ b/sa-extract/lcp_ops.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +import lcp +import sys +import optparse +import csuf + +'''Run various computations using the LCP array''' +def main(): + + optparser = optparse.OptionParser() +# optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true") + optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true") + optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store") + optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true") + optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store") + (opts,args) = optparser.parse_args() + + if len(args) < 1: + print >>sys.stderr, "Usage: lcp.py [opts] <sa file>" + sys.exit(1) + + safile = args[0] + sa = csuf.SuffixArray(safile, from_binary=True) + +# if opts.sa_check: +# check_sufarray(sa) + + l = lcp.LCP(sa) + + if opts.lcp_check: + print >>sys.stderr, "Checking LCP Array..." 
+ l.check() + print >>sys.stderr, "Check finished" + + if opts.stats > 0: + l.compute_stats(opts.stats) + +# if opts.uni_check: +# if lcp is None: +# lcp = LCP(sa) +# unigram_stats(sa, lcp) +# +# if opts.long_lcp: +# if lcp is None: +# lcp = LCP(sa, opts.long_lcp) + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/sa-extract/log.py b/sa-extract/log.py new file mode 100644 index 00000000..d4f96cb4 --- /dev/null +++ b/sa-extract/log.py @@ -0,0 +1,18 @@ +import sys + +level = 1 +file = sys.stderr + +def writeln(s="", l=0): + if level >= l: + file.write("%s\n" % s) + file.flush() + +def write(s, l=0): + if level >= l: + file.write(s) + file.flush() + + + + diff --git a/sa-extract/manager.py b/sa-extract/manager.py new file mode 100644 index 00000000..767192c1 --- /dev/null +++ b/sa-extract/manager.py @@ -0,0 +1,100 @@ +import csuf +import cdat + +class Sampler(object): + '''A Sampler implements a logic for choosing + samples from a population range''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def sample(self, phrase_location): + return cintlist.CIntList() + + + +class Extractor(object): + '''Extractor is responsible for extracting rules + from a given context; once a sentence id/location + is found for the source fwords, extractor is + responsible for producing any matching rule(s). + Optionally, extractor may return an empty list''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def extract(self, fwords, loc): + return [] + + + +class RuleFactory(object): + '''RuleFactory is a class that manages the + generation of translation rules, using the Context + and (optionally) any of its contained classes or + data. The RuleFactory is responsible for handling + any caching (i.e. 
when presented with an input + sentence, it may look up a rule from its cache + rather than extracting a new rule)''' + + def __init__(self): + self.num_lookups = 0 + self.num_extractions = 0 + self.num_rules = 0 + self.time = 0.0 + + + def registerContext(self, context_manager): + self.context_manager = context_manager + + + def input(self, fwords): + '''Manages the process of enumerating + rules for a given input sentence, and + looking them up with calls to Context, + Sampler, and Extractor''' + return [] + + +class ContextManager(object): + + def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False): + # NOTE: Extractor does not have a default value because + # the only nontrivial extractor right now depends on an + # alignment file + + self.fsarray = csuf.SuffixArray(ffile, from_binary) + self.edarray = cdat.DataArray(efile, from_binary) + + self.factory = rulefactory + self.factory.registerContext(self) + + self.sampler = sampler + self.sampler.registerContext(self) + + self.models = [] + self.owner = None + + + def add_model(self, model): + if self.owner is None: + self.owner = model + model_id = len(self.models) + self.models.append(model) + return model_id + + + def input(self, model, fwords, meta): + if model != self.owner: + return + self.fwords = fwords + self.factory.input(self.fwords, meta) + + + diff --git a/sa-extract/model.py b/sa-extract/model.py new file mode 100644 index 00000000..66c51051 --- /dev/null +++ b/sa-extract/model.py @@ -0,0 +1,12 @@ + +class Model(object): + def __init__(self, name=None): + object.__init__(self) + if name is None: + self.name = self.__class__.__name__ + else: + self.name = name + + def input(self, fwords, meta): + pass + diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py new file mode 100644 index 00000000..eb0bed57 --- /dev/null +++ b/sa-extract/monitor.py @@ -0,0 +1,48 @@ +import os, resource + +def cpu(): + return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+ + resource.getrusage(resource.RUSAGE_SELF).ru_stime) + +# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222 + + +_proc_status = '/proc/%d/status' % os.getpid() + +_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, + 'KB': 1024.0, 'MB': 1024.0*1024.0} + +def _VmB(VmKey): + '''Private. + ''' + global _proc_status, _scale + # get pseudo file /proc/<pid>/status + try: + t = open(_proc_status) + v = t.read() + t.close() + except: + return 0.0 # non-Linux? + # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' + i = v.index(VmKey) + v = v[i:].split(None, 3) # whitespace + if len(v) < 3: + return 0.0 # invalid format? + # convert Vm value to bytes + return float(v[1]) * _scale[v[2]] + +def memory(since=0.0): + '''Return memory usage in bytes. + ''' + return _VmB('VmSize:') - since + +def resident(since=0.0): + '''Return resident memory usage in bytes. + ''' + return _VmB('VmRSS:') - since + + +def stacksize(since=0.0): + '''Return stack size in bytes.
+ ''' + return _VmB('VmStk:') - since diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd new file mode 100644 index 00000000..c75d5eef --- /dev/null +++ b/sa-extract/precomputation.pxd @@ -0,0 +1,13 @@ +from libc.stdio cimport FILE + +cdef class Precomputation: + cdef int precompute_rank + cdef int precompute_secondary_rank + cdef int max_length + cdef int max_nonterminals + cdef int train_max_initial_size + cdef int train_min_gap_size + cdef precomputed_index + cdef precomputed_collocations + cdef read_map(self, FILE* f) + cdef write_map(self, m, FILE* f) diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx new file mode 100644 index 00000000..ce4c21aa --- /dev/null +++ b/sa-extract/precomputation.pyx @@ -0,0 +1,478 @@ +# precomputes a set of collocations by advancing over the text. +# warning: nasty C code + +import log +import monitor + +cimport csuf +cimport cdat +cimport cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef struct _Trie_Node # forward decl + +cdef struct _Trie_Edge: + int val + _Trie_Node* node + _Trie_Edge* bigger + _Trie_Edge* smaller + +cdef struct _Trie_Node: + _Trie_Edge* root + int* arr + int arr_len + +cdef _Trie_Node* new_trie_node(): + cdef _Trie_Node* node + node = <_Trie_Node*> malloc(sizeof(_Trie_Node)) + node.root = NULL + node.arr_len = 0 + node.arr = <int*> malloc(sizeof(0*sizeof(int))) + return node + +cdef _Trie_Edge* new_trie_edge(int val): + cdef _Trie_Edge* edge + edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge)) + edge.node = new_trie_node() + edge.bigger = NULL + edge.smaller = NULL + edge.val = val + return edge + +cdef free_trie_node(_Trie_Node* node): + if node != NULL: + free_trie_edge(node.root) + free(node.arr) + +cdef free_trie_edge(_Trie_Edge* edge): + if edge != NULL: + free_trie_node(edge.node) + free_trie_edge(edge.bigger) + free_trie_edge(edge.smaller) + +cdef _Trie_Node* trie_find(_Trie_Node* node, int val): + cdef _Trie_Edge* cur + cur = node.root + while cur != NULL and cur.val != val: + if val > cur.val: + cur = cur.bigger + elif val < cur.val: + cur = cur.smaller + if cur == NULL: + return NULL + else: + return cur.node + +cdef trie_node_data_append(_Trie_Node* node, int val): + cdef int new_len + new_len = node.arr_len + 1 + node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) + node.arr[node.arr_len] = val + node.arr_len = new_len + +cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): + cdef int new_len + new_len = node.arr_len + num_vals + node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) + memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) + node.arr_len = new_len + + +cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): + cdef _Trie_Edge** cur + cur = &node.root + while cur[0] != NULL and cur[0].val != val: + if val > cur[0].val: + cur = &cur[0].bigger + elif val < cur[0].val: + cur = &cur[0].smaller + if cur[0] == NULL: + cur[0] = new_trie_edge(val) + return cur[0].node + +cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros): + cdef cintlist.CIntList arr + + if include_zeros or node.arr_len > 0: + arr = cintlist.CIntList() + free(arr.arr) + arr.arr = <int*> malloc(node.arr_len * sizeof(int)) + memcpy(arr.arr, node.arr, node.arr_len * sizeof(int)) + arr.len = node.arr_len + arr.size = node.arr_len + result[prefix] = arr + trie_edge_to_map(node.root, result, prefix, include_zeros) + +cdef 
trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros): + if edge != NULL: + trie_edge_to_map(edge.smaller, result, prefix, include_zeros) + trie_edge_to_map(edge.bigger, result, prefix, include_zeros) + prefix = prefix + (edge.val,) + trie_node_to_map(edge.node, result, prefix, include_zeros) + +cdef class TrieMap: + + cdef _Trie_Node** root + cdef int V + + def __init__(self, alphabet_size): + self.V = alphabet_size + self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) + memset(self.root, 0, self.V * sizeof(_Trie_Node*)) + + + def __dealloc__(self): + cdef int i + for i from 0 <= i < self.V: + if self.root[i] != NULL: + free_trie_node(self.root[i]) + free(self.root) + + + def insert(self, pattern): + cdef int* p + cdef int i, l + l = len(pattern) + p = <int*> malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + self._insert(p,l) + free(p) + + + cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + if self.root[pattern[0]] == NULL: + self.root[pattern[0]] = new_trie_node() + node = self.root[pattern[0]] + for i from 1 <= i < pattern_len: + node = trie_insert(node, pattern[i]) + return node + + def contains(self, pattern): + cdef int* p + cdef int i, l + cdef _Trie_Node* node + l = len(pattern) + p = <int*> malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + node = self._contains(p,l) + free(p) + if node == NULL: + return False + else: + return True + + cdef _Trie_Node* _contains(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + node = self.root[pattern[0]] + i = 1 + while node != NULL and i < pattern_len: + node = trie_find(node, pattern[i]) + i = i+1 + return node + + def toMap(self, flag): + cdef int i, include_zeros + + if flag: + include_zeros=1 + else: + include_zeros=0 + result = {} + for i from 0 <= i < self.V: + if self.root[i] != NULL: + trie_node_to_map(self.root[i], result, (i,), include_zeros) + return result + + +cdef class Precomputation: + +# Defined in .pxd file, here for reference: +# cdef int precompute_rank +# cdef int precompute_secondary_rank +# cdef int max_length +# cdef int max_nonterminals +# cdef int train_max_initial_size +# cdef int train_min_gap_size +# cdef precomputed_index +# cdef precomputed_collocations + + def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, + max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False): + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.train_max_initial_size = train_max_initial_size + self.train_min_gap_size = train_min_gap_size + if from_binary: + self.read_binary(filename) + else: + self.precompute(filename, sa) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + fread(&(self.precompute_rank), sizeof(int), 1, f) + fread(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fread(&(self.max_length), sizeof(int), 1, f) + fread(&(self.max_nonterminals), sizeof(int), 1, f) + fread(&(self.train_max_initial_size), sizeof(int), 1, f) + fread(&(self.train_min_gap_size), sizeof(int), 1, f) + self.precomputed_index = self.read_map(f) + self.precomputed_collocations = self.read_map(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + 
cdef char* cfilename = bfilename + + f = fopen(cfilename, "w") + fwrite(&(self.precompute_rank), sizeof(int), 1, f) + fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fwrite(&(self.max_length), sizeof(int), 1, f) + fwrite(&(self.max_nonterminals), sizeof(int), 1, f) + fwrite(&(self.train_max_initial_size), sizeof(int), 1, f) + fwrite(&(self.train_min_gap_size), sizeof(int), 1, f) + self.write_map(self.precomputed_index, f) + self.write_map(self.precomputed_collocations, f) + fclose(f) + + + cdef write_map(self, m, FILE* f): + cdef int i, N + cdef cintlist.CIntList arr + + N = len(m) + fwrite(&(N), sizeof(int), 1, f) + for pattern, val in m.iteritems(): + N = len(pattern) + fwrite(&(N), sizeof(int), 1, f) + for word_id in pattern: + i = word_id + fwrite(&(i), sizeof(int), 1, f) + arr = val + arr.write_handle(f) + + + cdef read_map(self, FILE* f): + cdef int i, j, k, word_id, N + cdef cintlist.CIntList arr + + m = {} + fread(&(N), sizeof(int), 1, f) + for j from 0 <= j < N: + fread(&(i), sizeof(int), 1, f) + key = () + for k from 0 <= k < i: + fread(&(word_id), sizeof(int), 1, f) + key = key + (word_id,) + arr = cintlist.CIntList() + arr.read_handle(f) + m[key] = arr + return m + + + def precompute(self, filename, sa): + cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank + cdef csuf.SuffixArray sarray + cdef cdat.DataArray darray + cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank + cdef TrieMap frequent_patterns, super_frequent_patterns, collocations + cdef _Trie_Node* node + + sarray = sa + darray = sarray.darray + data = darray.data + + frequent_patterns = TrieMap(len(darray.id2word)) + super_frequent_patterns = TrieMap(len(darray.id2word)) + collocations = TrieMap(len(darray.id2word)) + + I_set = set() + J_set = set() + J2_set = set() + IJ_set = set() + pattern_rank = {} + + log.writeln("Precomputing frequent intersections\n", 1) + start_time = monitor.cpu() + + max_pattern_len = 0 + if filename is not None: + precompute_file = open(filename) + for rank, line in enumerate(precompute_file): + if rank >= self.precompute_rank: + break + phrase_words = line.split()[2:] + phrase = () + for word in phrase_words: + phrase = phrase + (darray.word2id[word],) + max_pattern_len = max(max_pattern_len, len(phrase)) + frequent_patterns.insert(phrase) + I_set.add(phrase) + pattern_rank[phrase] = rank + if rank < self.precompute_secondary_rank: + super_frequent_patterns.insert(phrase) + J_set.add(phrase) + precompute_file.close() + + queue = cintlist.CIntList(increment=1000) + + log.writeln(" Computing inverted indexes...", 1) + N = len(data) + for i from 0 <= i < N: + sa_word_id = data.arr[i] + if sa_word_id == 1: + queue._append(-1) + else: + for l from 1 <= l <= max_pattern_len: + node = frequent_patterns._contains(data.arr+i, l) + if node == NULL: + break + queue._append(i) + queue._append(l) + trie_node_data_append(node, i) + + log.writeln(" Computing collocations...", 1) + N = len(queue) + ptr1 = 0 + sent_count = 0 + while ptr1 < N: # main loop + i1 = queue.arr[ptr1] + if i1 > -1: + l1 = queue.arr[ptr1+1] + ptr2 = ptr1 + 2 + while ptr2 < N: + i2 = queue.arr[ptr2] + if i2 == -1 or i2 - i1 >= self.train_max_initial_size: + break + l2 = queue.arr[ptr2+1] + if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, 
data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + if super_frequent_patterns._contains(data.arr+i2, l2) != NULL: + if super_frequent_patterns._contains(data.arr+i1, l1) != NULL: + is_super = 1 + else: + is_super = 0 + ptr3 = ptr2 + 2 + while ptr3 < N: + i3 = queue.arr[ptr3] + if i3 == -1 or i3 - i1 >= self.train_max_initial_size: + break + l3 = queue.arr[ptr3+1] + if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length: + if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + node = trie_insert(node, -1) + for i from i3 <= i < i3+l3: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + trie_node_data_append(node, i3) + ptr3 = ptr3 + 2 + ptr2 = ptr2 + 2 + ptr1 = ptr1 + 2 + else: + sent_count = sent_count + 1 + if sent_count % 10000 == 0: + log.writeln(" %d sentences" % sent_count) + ptr1 = ptr1 + 1 + + self.precomputed_collocations = collocations.toMap(False) + self.precomputed_index = frequent_patterns.toMap(True) + + x = 0 + for pattern1 in J_set: + for pattern2 in J_set: + if len(pattern1) + len(pattern2) + 1 < self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + J2_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in I_set: + x = x+1 + if len(pattern1) + len(pattern2) + 1 <= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in J2_set: + x = x+2 + if len(pattern1) + len(pattern2) + 1<= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + combined_pattern = pattern2 + (-1,) + pattern1 + IJ_set.add(combined_pattern) + + N = len(pattern_rank) + cost_by_rank = cintlist.CIntList(initial_len=N) + count_by_rank = cintlist.CIntList(initial_len=N) + for pattern, arr in self.precomputed_collocations.iteritems(): + if pattern not in IJ_set: + s = "" + for word_id in pattern: + if word_id == -1: + s = s + "X " + else: + s = s + darray.id2word[word_id] + " " + log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1) + else: + chunk = () + max_rank = 0 + arity = 0 + for word_id in pattern: + if word_id == -1: + max_rank = max(max_rank, pattern_rank[chunk]) + arity = arity + 1 + chunk = () + else: + chunk = chunk + (word_id,) + max_rank = max(max_rank, pattern_rank[chunk]) + cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr)) + count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1)) + + cumul_cost = 0 + cumul_count = 0 + for i from 0 <= i < N: + cumul_cost = cumul_cost + cost_by_rank.arr[i] + cumul_count = cumul_count + count_by_rank.arr[i] + log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost)) + + num_found_patterns = len(self.precomputed_collocations) + for pattern in IJ_set: + if pattern not in self.precomputed_collocations: + self.precomputed_collocations[pattern] = cintlist.CIntList() + + stop_time = monitor.cpu() + log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x)) + log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index)) + 
log.writeln("Precomputation took %f seconds" % (stop_time - start_time)) + log.writeln("Detailed statistics:") + + + + + + + diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd new file mode 100644 index 00000000..c9c84e5c --- /dev/null +++ b/sa-extract/rule.pxd @@ -0,0 +1,13 @@ +cdef class Phrase: + cdef int *syms + cdef int n, *varpos, n_vars + cdef public int chunkpos(self, int k) + cdef public int chunklen(self, int k) + +cdef class Rule: + cdef public int lhs + cdef readonly Phrase f, e + cdef float *cscores + cdef int n_scores + cdef public owner, word_alignments + diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx new file mode 100644 index 00000000..7cd3efda --- /dev/null +++ b/sa-extract/rule.pyx @@ -0,0 +1,286 @@ +from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol +from libc.string cimport strsep, strcpy, strlen + +cdef extern from "strutil.h": + char *strstrsep(char **stringp, char *delim) + char *strip(char *s) + char **split(char *s, char *delim, int *pn) + +import sys + +import sym +cimport sym +cdef sym.Alphabet alphabet +alphabet = sym.alphabet + +global span_limit +span_limit = None + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = <char *>malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = <char *>realloc(buf, size*sizeof(char)) + bufsize = size + +cdef class Phrase: + def __cinit__(self, words): + cdef int i, j, n, n_vars + cdef char **toks + cdef bytes bwords + cdef char* cwords + + n_vars = 0 + if type(words) is str: + ensurebufsize(len(words)+1) + bwords = words + cwords = bwords + strcpy(buf, cwords) + toks = split(buf, NULL, &n) + self.syms = <int *>malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = alphabet.fromstring(toks[i], 0) + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + + else: + n = len(words) + self.syms = <int *>malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = words[i] + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + self.n = n + self.n_vars = n_vars + self.varpos = <int *>malloc(n_vars*sizeof(int)) + j = 0 + for i from 0 <= i < n: + if alphabet.isvar(self.syms[i]): + self.varpos[j] = i + j = j + 1 + + def __dealloc__(self): + free(self.syms) + free(self.varpos) + + def __str__(self): + strs = [] + cdef int i, s + for i from 0 <= i < self.n: + s = self.syms[i] + strs.append(alphabet.tostring(s)) + return " ".join(strs) + + def instantiable(self, i, j, n): + return span_limit is None or (j-i) <= span_limit + + def handle(self): + """return a hashable representation that normalizes the ordering + of the nonterminal indices""" + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(s) + return tuple(norm) + + def strhandle(self): + strs = [] + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(alphabet.tostring(s)) + return " ".join(norm) + + def arity(self): + return self.n_vars + + def getvarpos(self, i): + if 0 <= i < self.n_vars: + return self.varpos[i] + else: + raise IndexError + + def getvar(self, i): + if 0 <= i < self.n_vars: + return self.syms[self.varpos[i]] + else: + raise IndexError + + cdef int chunkpos(self, int k): + if k == 0: + return 0 + else: + return self.varpos[k-1]+1 + + cdef int chunklen(self, int k): + if self.n_vars == 0: + return self.n + elif k == 0: + return 
self.varpos[0] + elif k == self.n_vars: + return self.n-self.varpos[k-1]-1 + else: + return self.varpos[k]-self.varpos[k-1]-1 + + def clen(self, k): + return self.chunklen(k) + + def getchunk(self, ci): + cdef int start, stop + start = self.chunkpos(ci) + stop = start+self.chunklen(ci) + chunk = [] + for i from start <= i < stop: + chunk.append(self.syms[i]) + return chunk + + def __cmp__(self, other): + cdef Phrase otherp + cdef int i + otherp = other + for i from 0 <= i < min(self.n, otherp.n): + if self.syms[i] < otherp.syms[i]: + return -1 + elif self.syms[i] > otherp.syms[i]: + return 1 + if self.n < otherp.n: + return -1 + elif self.n > otherp.n: + return 1 + else: + return 0 + + def __hash__(self): + cdef int i + cdef unsigned h + h = 0 + for i from 0 <= i < self.n: + if self.syms[i] > 0: + h = (h << 1) + self.syms[i] + else: + h = (h << 1) + -self.syms[i] + return h + + def __len__(self): + return self.n + + def __getitem__(self, i): + return self.syms[i] + + def __iter__(self): + cdef int i + l = [] + for i from 0 <= i < self.n: + l.append(self.syms[i]) + return iter(l) + + def subst(self, start, children): + cdef int i + for i from 0 <= i < self.n: + if alphabet.isvar(self.syms[i]): + start = start + children[alphabet.getindex(self.syms[i])-1] + else: + start = start + (self.syms[i],) + return start + +cdef class Rule: + def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + cdef int i, n + cdef char *rest + + self.word_alignments = word_alignments + if scores is None: + self.cscores = NULL + self.n_scores = 0 + else: + n = len(scores) + self.cscores = <float *>malloc(n*sizeof(float)) + self.n_scores = n + for i from 0 <= i < n: + self.cscores[i] = scores[i] + + def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + if not sym.isvar(lhs): + sys.stderr.write("error: lhs=%d\n" % lhs) + self.lhs = lhs + self.f = f + self.e = e + self.word_alignments = word_alignments + + def __dealloc__(self): + if self.cscores != NULL: + free(self.cscores) + + def __str__(self): + return self.to_line() + + def __hash__(self): + return hash((self.lhs, self.f, self.e)) + + def __cmp__(self, Rule other): + return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, other.word_alignments)) + + def __iadd__(self, Rule other): + if self.n_scores != other.n_scores: + raise ValueError + for i from 0 <= i < self.n_scores: + self.cscores[i] = self.cscores[i] + other.cscores[i] + return self + + def fmerge(self, Phrase f): + # f compares equal to self.f; adopt it so equal rules share one Phrase object + if self.f == f: + self.f = f + + def arity(self): + return self.f.arity() + + def to_line(self): + scorestrs = [] + for i from 0 <= i < self.n_scores: + scorestrs.append(str(self.cscores[i])) + fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)] + if self.word_alignments is not None: + alignstr = [] + for i from 0 <= i < len(self.word_alignments): + alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536)) + #for s,t in self.word_alignments: + #alignstr.append("%d-%d" % (s,t)) + fields.append(" ".join(alignstr)) + + return " ||| ".join(fields) + + property scores: + def __get__(self): + s = [None]*self.n_scores + for i from 0 <= i < self.n_scores: + s[i] = self.cscores[i] + return s + + def __set__(self, s): + if self.cscores != NULL: + free(self.cscores) + self.cscores = <float *>malloc(len(s)*sizeof(float)) + self.n_scores = len(s) + for i from 0 <= i < self.n_scores: + self.cscores[i] = s[i] + +def rule_copy(r): + r1 = Rule(r.lhs, r.f, r.e,
r.owner, r.scores) + r1.word_alignments = r.word_alignments + return r1 + diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx new file mode 100644 index 00000000..20ea80d2 --- /dev/null +++ b/sa-extract/rulefactory.pyx @@ -0,0 +1,2360 @@ +# Pyrex implementation of the algorithms described in +# Lopez, EMNLP-CoNLL 2007 +# Much faster than the Python numbers reported there. +# Note to reader: this code is closer to C than Python +import sys +import sym +import log +import rule +import monitor +import cintlist +import csuf +import cdat +import cveb +import precomputation +import gc +import cn +import sgml + +cimport cmath +cimport csuf +cimport cdat +cimport cintlist +cimport rule +cimport cveb +cimport precomputation +cimport calignment + +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy +from libc.math cimport fmod, ceil, floor + +cdef int PRECOMPUTE +cdef int MERGE +cdef int BAEZA_YATES + +PRECOMPUTE = 0 +MERGE = 1 +BAEZA_YATES = 2 + +#cdef int node_count +#node_count = 0 + +cdef class TrieNode: + cdef public children + #cdef int id + + def __init__(self): + self.children = {} + #self.id = node_count + #node_count += 1 + + +cdef class ExtendedTrieNode(TrieNode): + cdef public phrase + cdef public phrase_location + cdef public suffix_link + + def __init__(self, phrase=None, phrase_location=None, suffix_link=None): + TrieNode.__init__(self) + self.phrase = phrase + self.phrase_location = phrase_location + self.suffix_link = suffix_link + + +cdef class TrieTable: + cdef public int extended + cdef public int count + cdef public root + def __cinit__(self, extended=False): + self.count = 0 + self.extended = extended + if extended: + self.root = ExtendedTrieNode() + else: + self.root = TrieNode() + +# linked list structure for storing matches in BaselineRuleFactory +cdef struct match_node: + int* match + match_node* next + +cdef class BaselineRuleFactory: + + cdef grammar, context_manager + cdef int max_terminals, max_nonterminals + cdef int max_initial_size, train_max_initial_size + cdef int min_gap_size, train_min_gap_size + cdef int category + cdef int visit + cdef float intersect_time, extract_time + cdef ruleFile, timingFile + cdef int* last_visit1 + cdef int* last_visit2 + cdef match_node** intersector1 + cdef match_node** intersector2 + cdef csuf.SuffixArray sa + cdef cintlist.CIntList sent_id + + def input(self, fwords): + flen = len(fwords) + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + + pyro_phrase_count = 0 + hiero_phrase_count = 0 + + frontier = [] + for i in xrange(len(fwords)): + frontier.append((i, (), False)) + + while len(frontier) > 0: + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for i, prefix, is_shadow_path in frontier: + + word_id = fwords[i][0][0] + #print "word_id = %i" % word_id + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + + #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) + self.lookup(hiero_phrase) + if hiero_phrase.arity() == 0: + pyro_phrase_count = pyro_phrase_count + 1 + else: + hiero_phrase_count = hiero_phrase_count + 1 + + if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): + new_frontier.append((i+1, phrase, is_shadow_path)) + if hiero_phrase.arity() < self.max_nonterminals: + xcat = sym.setindex(self.category, hiero_phrase.arity()+1) + for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): + new_frontier.append((j, 
phrase+(xcat,), is_shadow_path)) + log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) + frontier = new_frontier + stop_time = monitor.cpu() + log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) + + + def lookup(self, phrase): + cdef int j, g, start, stop, sent_id, num_ranges, arity + cdef match_node** cur_intersector + cdef match_node** next_intersector + cdef match_node** tmp_intersector + cdef match_node* node + cdef match_node* cur_node + cdef match_node* prev_node + cdef match_node** node_ptr + cdef int* cur_visit + cdef int* next_visit + cdef int* tmp_visit + cdef int* chunklen + + #print "\n\nLOOKUP\n\n" + ranges = [] + sizes = [] + arity = phrase.arity() + chunklen = <int *> malloc(arity*sizeof(int)) + for i from 0 <= i < arity+1: + chunk = phrase.getchunk(i) + chunklen[i] = len(chunk) + sa_range = None + phr = () + for offset, word_id in enumerate(chunk): + word = sym.tostring(word_id) + sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) + if sa_range is None: + #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) + return + #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) + ranges.append(sa_range) + sizes.append(sa_range[1]-sa_range[0]) + if phrase.arity() == 0: + return + + cur_intersector = self.intersector1 + next_intersector = self.intersector2 + cur_visit = self.last_visit1 + next_visit = self.last_visit2 + + num_ranges = len(ranges) + for i from 0 <= i < num_ranges: + sa_range = ranges[i] + start_time = monitor.cpu() + self.visit = self.visit + 1 + intersect_count = 0 + + start = sa_range[0] + stop = sa_range[1] + for j from start <= j < stop: + g = self.sa.sa.arr[j] + sent_id = self.sent_id.arr[g] + if i==0: + if next_visit[sent_id] != self.visit: + # clear intersector + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = <match_node*> malloc(sizeof(match_node)) + node_ptr[0].match = <int *> malloc(sizeof(int)) + node_ptr[0].match[0] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + else: + if cur_visit[sent_id] == self.visit-1: + cur_node = cur_intersector[sent_id] + while cur_node != NULL: + if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: + if next_visit[sent_id] != self.visit: + # clear intersector -- note that we only do this if we've got something to put there + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = <match_node*> malloc(sizeof(match_node)) + node_ptr[0].match = <int *> malloc((i+1) * sizeof(int)) + memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) + node_ptr[0].match[i] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + cur_node = cur_node.next + tmp_intersector = cur_intersector + cur_intersector = next_intersector + next_intersector = tmp_intersector + + tmp_visit = cur_visit + cur_visit = next_visit + next_visit = tmp_visit + + intersect_time 
= monitor.cpu() - start_time + if i > 0: + log.writeln("INT %d %d %d %d %f baseline" % + (arity, prev_intersect_count, sa_range[1]-sa_range[0], + intersect_count, intersect_time)) + if intersect_count == 0: + free(chunklen) + return None + prev_intersect_count = intersect_count + free(chunklen) + + + + def __init__(self, max_terminals=5, max_nonterminals=2, + max_initial_size=10, train_max_initial_size=10, + min_gap_size=1, train_min_gap_size=2, + category='[PHRASE]', grammar=None, + ruleFile=None, timingFile=None): + self.grammar = grammar + self.max_terminals = max_terminals + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + self.ruleFile = ruleFile + self.timingFile = timingFile + self.visit = 0 + + + def registerContext(self, context_manager): + cdef int num_sents + self.context_manager = context_manager + self.sa = context_manager.fsarray + self.sent_id = self.sa.darray.sent_id + + num_sents = len(self.sa.darray.sent_index) + self.last_visit1 = <int *> malloc(num_sents * sizeof(int)) + memset(self.last_visit1, 0, num_sents * sizeof(int)) + + self.last_visit2 = <int *> malloc(num_sents * sizeof(int)) + memset(self.last_visit2, 0, num_sents * sizeof(int)) + + self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*)) + memset(self.intersector1, 0, num_sents * sizeof(match_node*)) + + self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*)) + memset(self.intersector2, 0, num_sents * sizeof(match_node*)) + + +# encodes information needed to find a (hierarchical) phrase +# in the text. If the phrase is contiguous, that's just a range +# in the suffix array; if discontiguous, it is the set of +# actual locations (packed into an array) +cdef class PhraseLocation: + cdef int sa_low + cdef int sa_high + cdef int arr_low + cdef int arr_high + cdef cintlist.CIntList arr + cdef int num_subpatterns + + # returns true if sent_id is contained (this default implementation always says yes) + cdef int contains(self, int sent_id): + return 1 + + def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): + self.sa_low = sa_low + self.sa_high = sa_high + self.arr_low = arr_low + self.arr_high = arr_high + self.arr = arr + self.num_subpatterns = num_subpatterns + + + +cdef class Sampler: + '''A Sampler implements the logic for choosing + samples from a population range''' + + cdef int sampleSize + cdef context_manager + cdef cintlist.CIntList sa + + def __init__(self, sampleSize=0): + self.sampleSize = sampleSize + if sampleSize > 0: + log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) + else: + log.writeln("Sampling strategy: no sampling", 1) + + def registerContext(self, context_manager): + self.context_manager = context_manager + self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa + + + def sample(self, PhraseLocation phrase_location): + '''Returns a sample of the locations for + the phrase. If there are fewer than self.sampleSize + locations, return all of them; otherwise, return + up to self.sampleSize locations. In the latter case, + we choose to sample UNIFORMLY -- that is, the locations + are chosen at uniform intervals over the entire set, rather + than randomly. 
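
An illustrative pure-Python rendering of this uniform-interval strategy, before the docstring and Cython implementation continue below. It is a sketch only, not part of the module; the function name and the plain list-of-ints input are assumptions, and the rounding mirrors the fmod/ceil/floor choice in the Cython loop.

    import math

    def uniform_sample(locations, sample_size):
        # Deterministically pick up to sample_size entries at uniform
        # intervals over a list of locations; no randomness is involved,
        # so repeated runs give identical samples.
        if sample_size < 0 or len(locations) <= sample_size:
            return list(locations)
        sample = []
        stepsize = float(len(locations)) / float(sample_size)
        i = 0.0
        while i < len(locations) and len(sample) < sample_size:
            # round to the nearest index, as the fmod/ceil/floor logic below does
            val = int(math.ceil(i)) if math.fmod(i, 1.0) > 0.5 else int(math.floor(i))
            val = min(val, len(locations) - 1)  # guard: ceil() can land one past the end
            sample.append(locations[val])
            i += stepsize
        return sample

    # uniform_sample(list(range(100)), 5) -> [0, 20, 40, 60, 80]
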
This makes the algorithm deterministic, which + is good for things like MERT''' + cdef cintlist.CIntList sample + cdef double i, stepsize + cdef int num_locations, val, j + + sample = cintlist.CIntList() + if phrase_location.arr is None: + num_locations = phrase_location.sa_high - phrase_location.sa_low + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.sa_low + while i < phrase_location.sa_high and sample.len < self.sampleSize: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + sample._append(self.sa.arr[val]) + i = i + stepsize + else: + num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample = phrase_location.arr + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.arr_low + while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) + sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) + i = i + stepsize + return sample + + +cdef long nGramCount(PhraseLocation loc): + return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns + + +# struct used to encapsulate a single matching +cdef struct Matching: + int* arr + int start + int end + int sent_id + int size + + +cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): + m.arr = arr + m.start = start + m.end = start + step + m.sent_id = sent_id_arr[arr[start]] + m.size = step + + +cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, + int offset_by_one, int num_subpatterns, int* result_len): + cdef int i, new_len + + new_len = result_len[0] + num_subpatterns + arr = <int*> realloc(arr, new_len*sizeof(int)) + + for i from 0 <= i < loc1.size: + arr[result_len[0]+i] = loc1.arr[loc1.start+i] + if num_subpatterns > loc1.size: + arr[new_len-1] = loc2.arr[loc2.end-1] + result_len[0] = new_len + return arr + + +cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): + cdef int new_len + + new_len = arr_len[0] + appendix_len + arr = <int*> realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) + arr_len[0] = new_len + return arr + + +#cdef matching2str(Matching* m): +# cdef int i +# cdef result + +# result = "(" +# for i from m.start <= i < m.end: +# result = result + str(m.arr[i]) + " " +# result = result + ")" +# return result + + +cdef int median(int low, int high, int step): + return low + (((high - low)/step)/2)*step + + +cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): + # Returns (minus, plus) indices for the portion of the array + # in which all matchings have the same first index as the one + # starting at loc + loc_plus[0] = loc + step + while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: + loc_plus[0] = loc_plus[0] + step + loc_minus[0] = loc + while loc_minus[0]-step >= low and 
arr[loc_minus[0]-step] == arr[loc]: + loc_minus[0] = loc_minus[0] - step + + +cdef class HieroCachingRuleFactory: + '''This RuleFactory implements a caching + method using TrieTable, which makes phrase + generation somewhat speedier -- phrases only + need to be extracted once (however, it is + quite possible they need to be scored + for each input sentence, for contextual models)''' + + cdef rules, grammar, context_manager + + cdef int max_chunks + cdef int max_target_chunks + cdef int max_length + cdef int max_target_length + cdef int max_nonterminals + cdef int max_initial_size + cdef int train_max_initial_size + cdef int min_gap_size + cdef int train_min_gap_size + cdef int category + + cdef cacheBetweenSents + cdef precomputed_index + cdef precomputed_collocations + cdef precompute_file + cdef max_rank + cdef int precompute_rank, precompute_secondary_rank + cdef useBaezaYates + cdef use_index + cdef use_collocations + cdef float by_slack_factor + + cdef per_sentence_grammar + cdef rule_filehandler + cdef rule_file + cdef pruned_rule_file + cdef extract_file + cdef sample_file + cdef search_file + cdef timingFile + cdef log_int_stats + cdef prev_norm_prefix + cdef float intersect_time, extract_time + cdef csuf.SuffixArray fsa + cdef cdat.DataArray fda + cdef cdat.DataArray eda + + cdef calignment.Alignment alignment + cdef cintlist.CIntList eid2symid + cdef cintlist.CIntList fid2symid + cdef int tight_phrases + cdef int require_aligned_terminal + cdef int require_aligned_chunks + + cdef cintlist.CIntList findexes + cdef cintlist.CIntList findexes1 + + cdef int excluded_sent_id # exclude a sentence id + + def __init__(self, + alignment=None, # compiled alignment object (REQUIRED) + by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much + category="[PHRASE]", # name of generic nonterminal used by Hiero + cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory + extract_file=None, # print raw extracted rules to this file + grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) + log_int_stats=False, # prints timing data on intersections to stderr + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=10, # maximum span of a grammar rule in TEST DATA + max_length=5, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=None, # filename of file containing precomputed collocations + precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
+ precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + pruned_rule_file=None, # if specified, pruned grammars will be written to this filename + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate grammar files for each input segment + rule_file=None, # UNpruned grammars will be written to this filename + sample_file=None, # Sampling statistics will be written to this filename + search_file=None, # lookup statistics will be written to this filename + train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) + timingFile=None, # timing statistics will be written to this filename + useBaezaYates=True, # True to require use of double-binary alg, False otherwise + use_collocations=True, # True to enable use of precomputed collocations + use_index=True # True to enable use of precomputed inverted indices + ): + '''Note: we make a distinction between the min_gap_size + and max_initial_size used in test and train. The latter + are represented by train_min_gap_size and train_max_initial_size, + respectively. This is because Chiang's model does not require + them to be the same, so we don't require it either.''' + self.rules = TrieTable(True) # cache + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar = grammar + if alignment is None: + raise Exception("Must specify an alignment object") + self.alignment = alignment + + self.excluded_sent_id = -1 + + # grammar parameters and settings + # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + + if max_chunks is None: + self.max_chunks = self.max_nonterminals + 1 + else: + self.max_chunks = max_chunks + + if max_target_chunks is None: + self.max_target_chunks = self.max_nonterminals + 1 + else: + self.max_target_chunks = max_target_chunks + + if max_target_length is None: + self.max_target_length = max_initial_size + else: + self.max_target_length = max_target_length + + # algorithmic parameters and settings + self.cacheBetweenSents = not per_sentence_grammar + self.precomputed_collocations = {} + self.precomputed_index = {} + self.use_index = use_index + self.use_collocations = use_collocations + self.max_rank = {} + self.precompute_file = precompute_file + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.useBaezaYates = useBaezaYates + self.by_slack_factor = by_slack_factor + if tight_phrases: + self.tight_phrases = 1 + else: + self.tight_phrases = 0 + + if require_aligned_chunks: + # one condition is a stronger version of the other: requiring aligned chunks implies requiring an aligned terminal. 
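
An aside on the useBaezaYates and by_slack_factor parameters above: intersectHelper, later in this file, chooses between a linear merge and the Baeza-Yates double-binary search, falling back to merging when slack * |Q| * log2(|D|) exceeds |D|. A toy model of that dispatch over plain sorted integer lists -- an illustrative sketch only, since the real helpers intersect matchings under gap and span constraints; the function name and int-list inputs (assumed distinct) are assumptions.

    import bisect, math

    def intersect_sorted(query, data, slack=1.0):
        # Toy model of the merge vs. double-binary choice: intersect two
        # sorted int lists, binary-searching the smaller inside the larger
        # unless the cost estimate says a linear merge is cheaper.
        if not query or not data:
            return []
        if len(query) > len(data):
            query, data = data, query
        # same test as the Cython code: merge when slack * |Q| * log2(|D|) > |D|
        if slack * len(query) * math.log(len(data), 2) > len(data):
            result, i, j = [], 0, 0
            while i < len(query) and j < len(data):  # linear merge, O(|Q| + |D|)
                if query[i] == data[j]:
                    result.append(query[i])
                    i += 1
                    j += 1
                elif query[i] < data[j]:
                    i += 1
                else:
                    j += 1
            return result
        result = []
        for q in query:  # binary search each query item, O(|Q| log |D|)
            k = bisect.bisect_left(data, q)
            if k < len(data) and data[k] == q:
                result.append(q)
        return result
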
+ self.require_aligned_chunks = 1 + self.require_aligned_terminal = 1 + elif require_aligned_terminal: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 1 + else: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 0 + + + self.per_sentence_grammar = per_sentence_grammar + if not self.per_sentence_grammar: + self.rule_filehandler = open(rule_file, "w") + # diagnostics + #if rule_file is None: + # self.rule_file = None + self.rule_file = rule_file + if extract_file is None: + self.extract_file = None + else: + self.extract_file = open(extract_file, "w") + if sample_file is None: + self.sample_file = None + else: + self.sample_file = open(sample_file, "w") + if search_file is None: + self.search_file = None + else: + self.search_file = open(search_file, "w") + self.pruned_rule_file = pruned_rule_file + self.timingFile = timingFile + self.log_int_stats = log_int_stats + self.prev_norm_prefix = () + + self.findexes = cintlist.CIntList(initial_len=10) + self.findexes1 = cintlist.CIntList(initial_len=10) + + def registerContext(self, context_manager): + '''This gives the RuleFactory access to the Context object. + Here we also use it to precompute the most expensive intersections + in the corpus quickly.''' + self.context_manager = context_manager + self.fsa = context_manager.fsarray + self.fda = self.fsa.darray + self.eda = context_manager.edarray + self.fid2symid = self.set_idmap(self.fda) + self.eid2symid = self.set_idmap(self.eda) + self.precompute() + + cdef set_idmap(self, cdat.DataArray darray): + cdef int word_id, new_word_id, N + cdef cintlist.CIntList idmap + + N = len(darray.id2word) + idmap = cintlist.CIntList(initial_len=N) + for word_id from 0 <= word_id < N: + new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) + idmap.arr[word_id] = new_word_id + return idmap + + + def pattern2phrase(self, pattern): + # pattern is a tuple, which we must convert to a hiero rule.Phrase + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + return rule.Phrase(result) + + def pattern2phrase_plus(self, pattern): + # returns a list containing both the pattern, and pattern + # suffixed/prefixed with the NT category. + patterns = [] + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + patterns.append(rule.Phrase(result)) + patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) + patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) + return patterns + + def precompute(self): + cdef precomputation.Precomputation pre + + if self.precompute_file is not None: + start_time = monitor.cpu() + log.write("Reading precomputed data from file %s... 
" % self.precompute_file, 1) + pre = precomputation.Precomputation(self.precompute_file, from_binary=True) + # check parameters of precomputation -- some are critical and some are not + if pre.max_nonterminals != self.max_nonterminals: + log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) + if pre.max_length != self.max_length: + log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) + if pre.train_max_initial_size != self.train_max_initial_size: + log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) + raise Exception("Parameter mismatch with precomputed data") + if pre.train_min_gap_size != self.train_min_gap_size: + log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) + raise Exception("Parameter mismatch with precomputed data") + log.writeln("done.", 1) + if self.use_index: + log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) + for pattern, arr in pre.precomputed_index.iteritems(): + phrases = self.pattern2phrase_plus(pattern) + for phrase in phrases: + self.precomputed_index[phrase] = arr + log.writeln("done.", 1) + if self.use_collocations: + log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) + for pattern, arr in pre.precomputed_collocations.iteritems(): + phrase = self.pattern2phrase(pattern) + self.precomputed_collocations[phrase] = arr + log.writeln("done.", 1) + stop_time = monitor.cpu() + log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) + + + def getPrecomputedCollocation(self, phrase): + if phrase in self.precomputed_collocations: + arr = self.precomputed_collocations[phrase] + return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) + return None + + + cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus + cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high + cdef int med_result_len, low_result_len, high_result_len + cdef long comparison + cdef int* result + cdef int* low_result + cdef int* med_result + cdef int* high_result + cdef Matching loc1, loc2 + + result = <int*> malloc(0*sizeof(int*)) +# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) + + d_first = 0 + if high1 - low1 > high2 - low2: +# log.writeln("%sD first" % (pad), 5) + d_first = 1 +# else: +# log.writeln("%sQ first" % (pad), 5) + +# '''First, check to see if we are at any of the +# recursive base cases''' +# +# '''Case 1: one of the sets is empty''' + if low1 >= high1 or low2 >= high2: +# log.writeln("%sRETURN: set is empty" % (pad), 5) + return result + +# '''Case 2: sets are non-overlapping''' + assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: +# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) +# 
log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) + return result + + assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) +# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) + return result + + # Case 3: query set and data set do not meet size mismatch constraints; + # We use mergesort instead in this case + qsetsize = (high1-low1) / step1 + dsetsize = (high2-low2) / step2 + if d_first: + tmp = qsetsize + qsetsize = dsetsize + dsetsize = tmp + + if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: + free(result) + return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) + + # binary search. There are two flavors, depending on + # whether the queryset or dataset is first + if d_first: + med2 = median(low2, high2, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + + search_low = low1 + search_high = high1 + while search_low < search_high: + med1 = median(search_low, search_high, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_low = med1_plus + elif comparison == 1: + search_high = med1_minus + else: + break + else: + med1 = median(low1, high1, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + + search_low = low2 + search_high = high2 + while search_low < search_high: + med2 = median(search_low, search_high, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_high = med2 + elif comparison == 1: + search_low = med2 + step2 + else: + break + + med_result_len = 0 + med_result = <int*> malloc(0*sizeof(int*)) + if search_high > search_low: +# '''Then there is a match for the median element of Q''' +# +# '''What we want to find is the group of all bindings in the first set +# s.t. their first element == the first element of med1. Then we +# want to store the bindings for all of those elements. 
We can +# subsequently throw all of them away.''' + med2_minus = med2 + med2_plus = med2 + step2 + i1 = med1_minus + while i1 < med1_plus: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + while med2_minus-step2 >= low2: + assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: + med2_minus = med2_minus - step2 + else: + break + i2 = med2_minus + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: + pass + med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) + if comparison == -1: + break + i2 = i2 + step2 + if i2 > med2_plus: + med2_plus = i2 + i1 = i1 + step1 + + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + else: + # No match; need to figure out the point of division in D and Q + med2_minus = med2 + med2_plus = med2 + if d_first: + med2_minus = med2_minus + step2 + if comparison == -1: + med1_minus = med1_plus + if comparison == 1: + med1_plus = med1_minus + else: + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + if comparison == 1: + med2_minus = med2_minus + step2 + med2_plus = med2_plus + step2 + + low_result_len = 0 + low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) + high_result_len = 0 + high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) + + result = extend_arr(result, result_len, low_result, low_result_len) + result = extend_arr(result, result_len, med_result, med_result_len) + result = extend_arr(result, result_len, high_result, high_result_len) + free(low_result) + free(med_result) + free(high_result) + + return result + + + + cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, + Matching* loc2, int offset_by_one, int len_last): +# '''Compares a *set* of bindings, all with the same first element, +# to a single binding. 
Returns -1 if all comparisons == -1, 1 if all +# comparisons == 1, and 0 otherwise.''' + cdef int i1, comparison, prev_comparison + cdef Matching l1_stack + cdef Matching* loc1 + + loc1 = &l1_stack + + i1 = i1_minus + while i1 < i1_plus: + assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) + comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) + if comparison == 0: + prev_comparison = 0 + break + elif i1 == i1_minus: + prev_comparison = comparison + else: + if comparison != prev_comparison: + prev_comparison = 0 + break + i1 = i1 + step1 + return prev_comparison + + + cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): + cdef int i + + if loc1.sent_id > loc2.sent_id: + return 1 + if loc2.sent_id > loc1.sent_id: + return -1 + + if loc1.size == 1 and loc2.size == 1: + if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: + return 1 + + elif offset_by_one: + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: + return -1 + + else: + if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: + return 1 + if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: + return -1 + + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: + return -1 + + if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: + return -1 + return 0 + + + cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2 + cdef long comparison + cdef int* result + cdef Matching loc1, loc2 +# cdef int i + +# pad = " " +# log.writeln("->mergeHelper", 5) + + result_len[0] = 0 + result = <int*> malloc(0*sizeof(int)) + + i1 = low1 + i2 = low2 +# if log.level==5: +# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) +# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) +# log.write("[") +# for i from low1 <= i < high1: +# log.write("%d, " % arr1.arr[i],5) +# log.writeln("]") +# log.write("[") +# for i from low2 <= i < high2: +# log.write("%d, " % arr2.arr[i],5) +# log.writeln("]") + while i1 < high1 and i2 < high2: + +# '''First, pop all unneeded loc2's off the stack''' + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) +# if log.level==5: +# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) +# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) + i2 = i2 + step2 + else: + break + +# '''Next: process all loc1's with the same starting val''' + j1 = i1 + while i1 < high1 and arr1[j1] == arr1[i1]: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + j2 = i2 + while j2 < high2: + assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: +# if log.level==5: +# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + result = append_combined_matching(result, &loc1, &loc2, offset_by_one, 
num_subpatterns, result_len) + if comparison == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + pass + if comparison == -1: +# if log.level==5: +# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + break + else: + j2 = j2 + step2 +# if log.level==5: +# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) + i1 = i1 + step1 + +# log.writeln("<-mergeHelper", 5) + return result + + + cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): + cdef int i, j + cdef cveb.VEB veb + cdef cintlist.CIntList result + + if phrase in self.precomputed_index: + loc.arr = self.precomputed_index[phrase] + else: + loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) + veb = cveb.VEB(arr.len) + for i from loc.sa_low <= i < loc.sa_high: + veb._insert(arr.arr[i]) + i = veb.veb.min_val + for j from 0 <= j < loc.sa_high-loc.sa_low: + loc.arr.arr[j] = i + i = veb._findsucc(i) + loc.arr_low = 0 + loc.arr_high = loc.arr.len + + + cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, + PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): + + cdef cintlist.CIntList arr1, arr2, result + cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len + cdef int* result_ptr + cdef csuf.SuffixArray suf + + result_len = 0 + + if sym.isvar(suffix[0]): + offset_by_one = 1 + else: + offset_by_one = 0 + + len_last = len(suffix.getchunk(suffix.arity())) + + if prefix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, prefix_loc, prefix) + arr1 = prefix_loc.arr + low1 = prefix_loc.arr_low + high1 = prefix_loc.arr_high + step1 = prefix_loc.num_subpatterns + + if suffix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, suffix_loc, suffix) + arr2 = suffix_loc.arr + low2 = suffix_loc.arr_low + high2 = suffix_loc.arr_high + step2 = suffix_loc.num_subpatterns + + num_subpatterns = prefix.arity()+1 + + if algorithm == MERGE: + result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + else: + result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + + if result_len == 0: + free(result_ptr) + return None + else: + result = cintlist.CIntList() + free(result.arr) + result.arr = result_ptr + result.len = result_len + result.size = result_len + return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) + + cdef loc2str(self, PhraseLocation loc): + cdef int i, j + result = "{" + i = 0 + while i < loc.arr_high: + result = result + "(" + for j from i <= j < i + loc.num_subpatterns: + result = result + ("%d " %loc.arr[j]) + result = result + ")" + i = i + loc.num_subpatterns + result = result + "}" + return result + +# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): +# cdef i +# if loc1 is None and type1=="pre": +# return +# if loc1 is None: +# if loc2 is None or loc2.arr_high == 0: +# return +# if loc2 is None: +# if loc1.arr_high == 0: +# return +# if loc1.arr_high != loc2.arr_high: +# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) +# #log.writeln(" %s" % self.loc2str(loc2)) +# if loc1.arr_high == 0: +# return +# elif loc1.num_subpatterns != loc2.num_subpatterns: +# 
log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) +# for i from 0 <= i < loc1.arr_high: +# if loc1.arr[i] != loc2.arr[i]: +# log.writeln("ERROR 3") +# + cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): + cdef rule.Phrase prefix, suffix + cdef PhraseLocation prefix_loc, suffix_loc, result + + start_time = monitor.cpu() + prefix = prefix_node.phrase + suffix = suffix_node.phrase + prefix_loc = prefix_node.phrase_location + suffix_loc = suffix_node.phrase_location + + result = self.getPrecomputedCollocation(phrase) + if result is not None: + intersect_method = "precomputed" + + if result is None: + if self.useBaezaYates: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) + intersect_method="double binary" + else: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) + intersect_method="merge" + stop_time = monitor.cpu() + intersect_time = stop_time - start_time + if self.log_int_stats: + if intersect_method == "precomputed": + sort1 = "none" + sort2 = "none" + else: + if prefix in self.precomputed_index: + sort1 = "index" + else: + sort1 = "veb" + if suffix in self.precomputed_index: + sort2 = "index" + else: + sort2 = "veb" + result_len=0 + if result is not None: + result_len = len(result.arr)/result.num_subpatterns + rank = 0 +# if phrase in self.max_rank: +# rank = self.max_rank[phrase] +# else: +# rank = self.precompute_rank + 10 + log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % + (len(prefix)+1 - prefix.arity(), prefix.arity(), + nGramCount(prefix_node.phrase_location), + nGramCount(suffix_node.phrase_location), + result_len, intersect_time, rank, intersect_method, sort1, sort2)) + return result + + def advance(self, frontier, res, fwords): + nf = [] + for (toskip, (i, alt, pathlen)) in frontier: + spanlen = fwords[i][alt][2] + if (toskip == 0): + #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) + res.append((i, alt, pathlen)) + ni = i + spanlen + #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) + if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): + for na in xrange(len(fwords[ni])): + nf.append((toskip - 1, (ni, na, pathlen + 1))) + if (len(nf) > 0): + return self.advance(nf, res, fwords) + else: + return res + + def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): + frontier = [] + if (i+spanlen+skip >= len(next_states)): + return frontier + #print "get_all_nodes_isteps_away from %i" % (i) + key = tuple([i,spanlen]) + reachable = [] + if (key in reachable_buffer): + reachable = reachable_buffer[key] + else: + reachable = self.reachable(fwords, i, spanlen) + reachable_buffer[key] = reachable + #print "reachable(from=%i,dist=%i) = " % (i,spanlen) + #print reachable + for nextreachable in reachable: + for next_id in next_states[nextreachable]: + jump = self.shortest(fwords,i,next_id) + #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) + #if (next_id - (i+spanlen)) < skip: + if jump < skip: + continue + #if next_id-(i-pathlen) < self.max_initial_size: + if pathlen+jump <= self.max_initial_size: + for alt_id in xrange(len(fwords[next_id])): + if (fwords[next_id][alt_id][0] != cn.epsilon): + #frontier.append((next_id,alt_id,next_id-(i-pathlen))); + #print "finding the shortest from %i to %i" % (i, next_id) + newel = 
(next_id,alt_id,pathlen+jump) + if newel not in frontier: + frontier.append((next_id,alt_id,pathlen+jump)) + #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "next_id = %s is aborted\n" % next_id + #print "returning frontier" + #print frontier + return frontier + + def reachable(self, fwords, ifrom, dist): + #print "inside reachable(%i,%i)" % (ifrom,dist) + ret = [] + if (ifrom >= len(fwords)): + return ret + for alt_id in xrange(len(fwords[ifrom])): + if (fwords[ifrom][alt_id][0] == cn.epsilon): + ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) + else: + if (dist==0): + if (ifrom not in ret): + ret.append(ifrom) + else: + for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): + if (ifromchild not in ret): + ret.append(ifromchild) + + return ret + + def shortest(self, fwords, ifrom, ito): + min = 1000 + #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) + if (ifrom > ito): + return min + if (ifrom == ito): + return 0 + for alt_id in xrange(len(fwords[ifrom])): + currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) + if (fwords[ifrom][alt_id][0] != cn.epsilon): + currmin += 1 + if (currmin<min): + min = currmin + return min + + def get_next_states(self, _columns, curr_idx, min_dist=2): + result = [] + candidate = [[curr_idx,0]] + + while len(candidate) > 0: + curr = candidate.pop() + if curr[0] >= len(_columns): + continue + if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: + result.append(curr[0]); + curr_col = _columns[curr[0]] + for alt in curr_col: + next_id = curr[0]+alt[2] + jump = 1 + if (alt[0] == cn.epsilon): + jump = 0 + if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: + candidate.append([next_id,curr[1]+jump]) + return sorted(result); + + def input(self, fwords, meta): + '''When this function is called on the RuleFactory, + it looks up all of the rules that can be used to translate + the input sentence''' + cdef int i, j, k, flen, arity, num_subpatterns, num_samples + cdef float start_time + cdef PhraseLocation phrase_location + cdef cintlist.CIntList sample, chunklen + cdef Matching matching + cdef rule.Phrase hiero_phrase + + #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1 + #print fwords + flen = len(fwords) + #print "length = %i" % flen + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + nodes_isteps_away_buffer = {} + hit = 0 + reachable_buffer = {} + #print "id = ",meta + #print "rule_file = ",self.rule_file + dattrs = sgml.attrs_to_dict(meta) + id = dattrs.get('id', 'NOID') + if self.per_sentence_grammar: + self.rule_filehandler = open(self.rule_file+'.'+id, 'w') + self.excluded_sent_id = int(dattrs.get('exclude', '-1')) + + #print "max_initial_size = %i" % self.max_initial_size + + if not self.cacheBetweenSents: + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar.root = [None, {}] + + frontier = [] + for i in xrange(len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != cn.epsilon: + frontier.append((i, i, alt, 0, self.rules.root, (), False)) + + xroot = None + x1 = sym.setindex(self.category, 1) + if x1 in self.rules.root.children: + xroot = self.rules.root.children[x1] + else: + xroot = ExtendedTrieNode(suffix_link=self.rules.root, 
phrase_location=PhraseLocation()) + self.rules.root.children[x1] = xroot + + for i in xrange(self.min_gap_size, len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != cn.epsilon: + frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) + '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + if len(prefix)>0: + print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + else: + print k, i, alt, pathlen, node, prefix, is_shadow_path''' + + #for wid in xrange(1000): + # print "%i = %s" % (wid, sym.tostring(wid)) + next_states = [] + for i in xrange(len(fwords)): + next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) + #print "next state of %i" % i + #print next_states[i] + + while len(frontier) > 0: + #print "frontier = %i" % len(frontier) + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + #print "looking at: " + #if len(prefix)>0: + # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + #else: + # print k, i, alt, pathlen, node, prefix, is_shadow_path + word_id = fwords[i][alt][0] + spanlen = fwords[i][alt][2] + #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) + # to prevent .. [X] </S> + #print "prefix = ",prefix + #if word_id == 2 and len(prefix)>=2: + #print "at the end: %s" % (prefix[len(prefix)-1]) + #if prefix[len(prefix)-1]<0: + #print "break" + #continue + #print "continuing" + #if pathlen + spanlen > self.max_initial_size: + #continue + # TODO get rid of k -- pathlen is replacing it + if word_id == cn.epsilon: + #print "skipping because word_id is epsilon" + if i+spanlen >= len(fwords): + continue + for nualt in xrange(0,len(fwords[i+spanlen])): + frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) + continue + + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + arity = hiero_phrase.arity() + + #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) + if self.search_file is not None: + self.search_file.write("%s\n" % hiero_phrase) + + lookup_required = False + if word_id in node.children: + if node.children[word_id] is None: + #print "Path dead-ends at this node\n" + continue + else: + #print "Path continues at this node\n" + node = node.children[word_id] + else: + if node.suffix_link is None: + #print "Current node is root; lookup required\n" + lookup_required = True + else: + if word_id in node.suffix_link.children: + if node.suffix_link.children[word_id] is None: + #print "Suffix link reports path is dead end\n" + node.children[word_id] = None + continue + else: + #print "Suffix link indicates lookup is required\n" + lookup_required = True + else: + #print "ERROR: We never get here\n" + raise Exception("Keyword trie error") + #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) + #print "checking whether lookup_required\n" + if lookup_required: + new_node = None + if is_shadow_path: + #print "Extending shadow path for %s \n" + # on the shadow path we don't do any search, we just use info from suffix link + new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, + suffix_link=node.suffix_link.children[word_id], + phrase=hiero_phrase) + else: + if arity > 0: + #print "Intersecting for %s because of arity > 0\n" % hiero_phrase + phrase_location = self.intersect(node, node.suffix_link.children[word_id], 
hiero_phrase) + else: + #print "Suffix array search for %s" % hiero_phrase + phrase_location = node.phrase_location + sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high) + if sa_range is not None: + phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) + else: + phrase_location = None + + if phrase_location is None: + node.children[word_id] = None + #print "Search failed\n" + continue + #print "Search succeeded\n" + suffix_link = self.rules.root + if node.suffix_link is not None: + suffix_link = node.suffix_link.children[word_id] + new_node = ExtendedTrieNode(phrase_location=phrase_location, + suffix_link=suffix_link, + phrase=hiero_phrase) + node.children[word_id] = new_node + node = new_node + #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) + + '''Automatically add a trailing X node, if allowed -- + This should happen before we get to extraction (so that + the node will exist if needed)''' + if arity < self.max_nonterminals: + xcat_index = arity+1 + xcat = sym.setindex(self.category, xcat_index) + suffix_link_xcat_index = xcat_index + if is_shadow_path: + suffix_link_xcat_index = xcat_index-1 + suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) + node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, + suffix_link=node.suffix_link.children[suffix_link_xcat], + phrase= rule.Phrase(phrase + (xcat,))) + #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) + + # sample from range + if not is_shadow_path: + #print "is_not_shadow_path" + sample = self.context_manager.sampler.sample(node.phrase_location) + #print "node.phrase_location %s" % str(node.phrase_location) + #print "sample.len = %i" % len(sample) + num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns + chunklen = cintlist.CIntList(initial_len=num_subpatterns) + for j from 0 <= j < num_subpatterns: + chunklen.arr[j] = hiero_phrase.chunklen(j) + extracts = [] + j = 0 + extract_start = monitor.cpu() + '''orig_tight_phrases = self.tight_phrases + orig_require_aligned_terminal = self.require_aligned_terminal + orig_require_aligned_chunks = self.require_aligned_chunks + if k==0 or i==len(fwords)-1: + self.tight_phrases = 0 + self.require_aligned_terminal = 0 + self.require_aligned_chunks = 0''' + while j < sample.len: + extract = [] + + assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) + '''print "tight_phrase = " + print self.tight_phrases + print "require_aligned_terminal = " + print self.require_aligned_terminal + print "require_aligned_chunks = " + print self.require_aligned_chunks''' + + extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) + extracts.extend(extract) + j = j + num_subpatterns + '''self.tight_phrases = orig_tight_phrases + self.require_aligned_terminal = orig_require_aligned_terminal + self.require_aligned_chunks = orig_require_aligned_chunks''' + num_samples = sample.len/num_subpatterns + extract_stop = monitor.cpu() + self.extract_time = self.extract_time + extract_stop - extract_start + #print "extract.size = %i" % len(extracts) + if len(extracts) > 0: + fphrases = {} + fals = {} + fcount = {} + for f, e, count, als in extracts: + fcount.setdefault(f, 0.0) + fcount[f] = fcount[f] + count + fphrases.setdefault(f, {}) + fphrases[f].setdefault(e, {}) + #fphrases[f][e] = fphrases[f][e] + 
count + fphrases[f][e].setdefault(als,0.0) + fphrases[f][e][als] = fphrases[f][e][als] + count + #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] + #fals[str(f)+" ||| "+str(e)] = als + for f, elist in fphrases.iteritems(): + #print "f = '%s'" % f + #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']): + # print "rejected" + # continue + f_margin = fcount[f] + for e, alslist in elist.iteritems(): + alignment = None + count = 0 + for als, currcount in alslist.iteritems(): + #print "als = ",als,", count = ",currcount + if currcount > count: + alignment = als + count = currcount + #alignment = fals[str(f)+" ||| "+str(e)] + #print "selected = ",alignment," with count = ",count + scores = [] + for m in self.context_manager.models: + scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) + r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) + self.grammar.add(r) + if self.rule_filehandler is not None: + self.rule_filehandler.write("%s\n" % r.to_line()) + #print "adding a rule = %s" % r + + #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: + if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: + #to prevent [X] </s> + #print "lexicalized" + for alt_id in xrange(len(fwords[i+spanlen])): + #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): + #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) + #print "alt_id = %d\n" % alt_id + new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) + #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) + #print "end lexicalized" + num_subpatterns = arity + if not is_shadow_path: + num_subpatterns = num_subpatterns + 1 + #to avoid <s> X ... 
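
The aggregation loop above pools extracted (f, e, count, alignment) tuples, sums counts per source phrase, and keeps the most frequently observed word alignment for each (f, e) pair; the modal alignment's count is what gets passed on to scoring. The same bookkeeping in isolation, as a minimal pure-Python sketch (the tuple layout and function name are assumptions, not the module's API):

    from collections import defaultdict

    def aggregate_extracts(extracts):
        # extracts: iterable of (f, e, count, als) tuples
        fcount = defaultdict(float)  # total extracted mass per source phrase f
        fphrases = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        for f, e, count, als in extracts:
            fcount[f] += count
            fphrases[f][e][als] += count
        rules = []
        for f, elist in fphrases.items():
            for e, alslist in elist.items():
                best_als = max(alslist, key=alslist.get)  # modal alignment
                # as in the loop above, the modal alignment's count is the
                # count handed to the scoring models
                rules.append((f, e, alslist[best_als], best_als))
        return rules
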
we want <s> next to a lexicalized item + #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: + if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: + #print "masuk kondisi" + xcat = sym.setindex(self.category, arity+1) + xnode = node.children[xcat] + #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states) + # I put spanlen=1 below + key = tuple([self.min_gap_size, i, 1, pathlen]) + frontier_nodes = [] + if (key in nodes_isteps_away_buffer): + frontier_nodes = nodes_isteps_away_buffer[key] + else: + frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer) + nodes_isteps_away_buffer[key] = frontier_nodes + + #print "new frontier:\n" + for (i, alt, pathlen) in frontier_nodes: + #if (pathlen+fwords[i][alt][2] <= self.max_initial_size): + new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path)) + #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path + #print "all end\n"; + #else: + #print "no new frontier1\n"; + #else : + #print "no new frontier2\n" + if self.log_int_stats: + log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) + frontier = new_frontier + + stop_time = monitor.cpu() + log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time)) + #log.writeln(" Intersect time = %f seconds" % self.intersect_time) + gc.collect() + log.writeln(" Extract time = %f seconds" % self.extract_time) + if self.pruned_rule_file: + self.grammar.dump(self.pruned_rule_file) + if self.per_sentence_grammar: + self.rule_filehandler.close() + +# else: +# self.rule_filehandler.write("###EOS_"+ id +"\n") + + + cdef int find_fixpoint(self, + int f_low, f_high, + int* f_links_low, int* f_links_high, + int* e_links_low, int* e_links_high, + int e_in_low, int e_in_high, + int* e_low, int* e_high, + int* f_back_low, int* f_back_high, + int f_sent_len, int e_sent_len, + int max_f_len, int max_e_len, + int min_fx_size, int min_ex_size, + int max_new_x, + int allow_low_x, int allow_high_x, + int allow_arbitrary_x, int write_log): + cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x + + e_low[0] = e_in_low + e_high[0] = e_in_high + self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high) + if e_low[0] == -1: + # low-priority corner case: if phrase w is unaligned, + # but we don't require aligned terminals, then returning + # an error here might prevent extraction of allowed + # rule X -> X_1 w X_2 / X_1 X_2. This is probably + # not worth the bother, though. 
+ #print "find_fixpoint0" + return 0 + elif e_in_low != -1 and e_low[0] != e_in_low: + if e_in_low - e_low[0] < min_ex_size: + e_low[0] = e_in_low - min_ex_size + if e_low[0] < 0: + #print "find_fixpoint1" + return 0 + + if e_high[0] - e_low[0] > max_e_len: + #print "find_fixpoint2" + return 0 + elif e_in_high != -1 and e_high[0] != e_in_high: + if e_high[0] - e_in_high < min_ex_size: + e_high[0] = e_in_high + min_ex_size + if e_high[0] > e_sent_len: + #print "find_fixpoint3" + return 0 + + f_back_low[0] = -1 + f_back_high[0] = -1 + f_low_prev = f_low + f_high_prev = f_high + new_x = 0 + new_low_x = 0 + new_high_x = 0 + + while True: + + if f_back_low[0] == -1: + self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + else: + self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) + self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + + if f_back_low[0] > f_low: + f_back_low[0] = f_low + + if f_back_high[0] < f_high: + f_back_high[0] = f_high + + if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: + return 1 + + if allow_low_x == 0 and f_back_low[0] < f_low: +# log.writeln(" FAIL: f phrase is not tight") + #print " FAIL: f phrase is not tight" + return 0 + + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: f back projection is too wide") + #print " FAIL: f back projection is too wide" + return 0 + + if allow_high_x == 0 and f_back_high[0] > f_high: +# log.writeln(" FAIL: extension on high side not allowed") + #print " FAIL: extension on high side not allowed" + return 0 + + if f_low != f_back_low[0]: + if new_low_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on low side violates max # of gaps") + #print " FAIL: extension required on low side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_low_x = 1 + if f_low - f_back_low[0] < min_fx_size: + f_back_low[0] = f_low - min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on low side violates max initial length") + #print " FAIL: extension required on low side violates max initial length" + return 0 + if f_back_low[0] < 0: +# log.writeln(" FAIL: extension required on low side violates sentence boundary") + #print " FAIL: extension required on low side violates sentence boundary" + return 0 + + if f_high != f_back_high[0]: + if new_high_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on high side violates max # of gaps") + #print " FAIL: extension required on high side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_high_x = 1 + if f_back_high[0] - f_high < min_fx_size: + f_back_high[0] = f_high + min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on high side violates max initial length") + #print " FAIL: extension required on high side violates max initial length" + return 0 + if f_back_high[0] > f_sent_len: +# log.writeln(" FAIL: extension required on high side violates sentence boundary") + #print " FAIL: extension required on high side violates sentence boundary" + return 0 + + e_low_prev = e_low[0] + e_high_prev = e_high[0] + + self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) + self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) + if e_low[0] == e_low_prev and e_high[0] == e_high_prev: + return 1 + if 
allow_arbitrary_x == 0: +# log.writeln(" FAIL: arbitrary expansion not permitted") + #print " FAIL: arbitrary expansion not permitted" + return 0 + if e_high[0] - e_low[0] > max_e_len: +# log.writeln(" FAIL: re-projection violates sentence max phrase length") + #print " FAIL: re-projection violates sentence max phrase length" + return 0 + f_low_prev = f_back_low[0] + f_high_prev = f_back_high[0] + + + cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, + int* out_low, int* out_high): + cdef int i + for i from in_low <= i < in_high: + if in_links_low[i] != -1: + if out_low[0] == -1 or in_links_low[i] < out_low[0]: + out_low[0] = in_links_low[i] + if out_high[0] == -1 or in_links_high[i] > out_high[0]: + out_high[0] = in_links_high[i] + + + cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): + cdef int new_len + new_len = arr_len[0] + data_len + arr = <int*> realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], data, data_len*sizeof(int)) + arr_len[0] = new_len + return arr + + + cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, + int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, + int sent_id, int e_sent_len, int e_sent_start): + cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high + cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks + cdef cintlist.CIntList ephr_arr + cdef result + + #print "inside extract_phrases" + #print "f_low=%d, f_high=%d" % (f_low,f_high) + result = [] + len1 = 0 + e_gaps1 = <int*> malloc(0) + ephr_arr = cintlist.CIntList() + + e_gap_order = <int*> malloc(num_gaps*sizeof(int)) + if num_gaps > 0: + e_gap_order[0] = 0 + for i from 1 <= i < num_gaps: + for j from 0 <= j < i: + if e_gap_low[i] < e_gap_low[j]: + for k from j <= k < i: + e_gap_order[k+1] = e_gap_order[k] + e_gap_order[j] = i + break + else: + e_gap_order[i] = i + + e_x_low = e_low + e_x_high = e_high + if self.tight_phrases == 0: + while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: + e_x_low = e_x_low - 1 + while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: + e_x_high = e_x_high + 1 + + for i from e_x_low <= i <= e_low: + e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) + + for i from 0 <= i < num_gaps: + e_gaps2 = <int*> malloc(0) + len2 = 0 + + j = e_gap_order[i] + e_x_gap_low = e_gap_low[j] + e_x_gap_high = e_gap_high[j] + if self.tight_phrases == 0: + while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: + e_x_gap_low = e_x_gap_low - 1 + while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: + e_x_gap_high = e_x_gap_high + 1 + + k = 0 + step = 1+(i*2) + while k < len1: + for m from e_x_gap_low <= m <= e_gap_low[j]: + if m >= e_gaps1[k+step-1]: + for n from e_gap_high[j] <= n <= e_x_gap_high: + if n-m >= 1: # extractor.py doesn't restrict target-side gap length + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) + k = k + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = 1+(num_gaps*2) + e_gaps2 = <int*> malloc(0) + len2 = 0 + for i from e_high <= i <= e_x_high: + j = 0 + while j < len1: + if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) + 
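
An aside on find_projection, defined just above: stripped of the C pointer plumbing, it simply widens the target span to cover every alignment link inside a source span, a min/max sweep over link bounds. A pure-Python paraphrase, with -1 meaning "unset" as in the arrays the extractor fills (the function shape here is an assumption for the sketch):

    def find_projection(in_low, in_high, links_low, links_high, out_low, out_high):
        # links_low[i] / links_high[i] hold the lowest / one-past-highest target
        # positions linked to source word i, or -1 for unaligned words.
        # Returns the widened (out_low, out_high) bounds.
        for i in range(in_low, in_high):
            if links_low[i] != -1:
                if out_low == -1 or links_low[i] < out_low:
                    out_low = links_low[i]
                if out_high == -1 or links_high[i] > out_high:
                    out_high = links_high[i]
        return out_low, out_high

    # e.g. with source words 1 and 2 linked to target spans [0,2) and [3,4):
    # find_projection(1, 3, [-1, 0, 3], [-1, 2, 4], -1, -1) -> (0, 4)
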
e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1) + j = j + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = (num_gaps+1)*2 + i = 0 + + while i < len1: + ephr_arr._clear() + num_chunks = 0 + indexes = [] + for j from 0 <= j < num_gaps+1: + if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: + num_chunks = num_chunks + 1 + for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: + indexes.append(k) + ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) + if j < num_gaps: + indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) + ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) + i = i + step + if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: + result.append((rule.Phrase(ephr_arr),indexes)) + + free(e_gaps1) + free(e_gap_order) + return result + + cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): + #print "create_alignments" + #s = "sent_links = " + #i = 0 + #while (i < num_links*2): + # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) + # i += 2 + #print s + #print findexes + #print eindexes + + ret = cintlist.CIntList() + for i in xrange(len(findexes)): + s = findexes[i] + if (s<0): + continue + idx = 0 + while (idx < num_links*2): + if (sent_links[idx] == s): + j = eindexes.index(sent_links[idx+1]) + ret.append(i*65536+j) + idx += 2 + return ret + + cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): + cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high + cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start + cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints + cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high + cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len + cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len + cdef float pair_count + cdef float available_mass + cdef extracts, phrase_list + cdef cintlist.CIntList fphr_arr + cdef rule.Phrase fphr + cdef reason_for_failure + + fphr_arr = cintlist.CIntList() + phrase_len = phrase.n + extracts = [] + sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) + + e_sent_start = self.eda.sent_index.arr[matching.sent_id] + e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] + e_sent_len = e_sent_end - e_sent_start - 1 + f_sent_start = self.fda.sent_index.arr[matching.sent_id] + f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] + f_sent_len = f_sent_end - f_sent_start - 1 + available_mass = 1.0 + if matching.sent_id == self.excluded_sent_id: + available_mass = 0.0 + + self.findexes1.reset() + sofar = 0 + for i in xrange(num_chunks): + for j in xrange(chunklen[i]): + self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); + sofar += 1 + if (i+1<num_chunks): + self.findexes1.append(phrase[sofar]) + sofar += 1 + + + e_links_low = <int*> malloc(e_sent_len*sizeof(int)) + e_links_high = <int*> malloc(e_sent_len*sizeof(int)) + f_links_low = <int*> malloc(f_sent_len*sizeof(int)) + f_links_high = <int*> malloc(f_sent_len*sizeof(int)) + f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) + f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) + e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) + e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) + memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) + memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) + memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) + 
memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) + + reason_for_failure = "" + + for i from 0 <= i < e_sent_len: + e_links_low[i] = -1 + e_links_high[i] = -1 + for i from 0 <= i < f_sent_len: + f_links_low[i] = -1 + f_links_high[i] = -1 + + # this is really inefficient -- might be good to + # somehow replace with binary search just for the f + # links that we care about (but then how to look up + # when we want to check something on the e side?) + i = 0 + while i < num_links*2: + f_i = sent_links[i] + e_i = sent_links[i+1] + if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: + f_links_low[f_i] = e_i + if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: + f_links_high[f_i] = e_i + 1 + if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: + e_links_low[e_i] = f_i + if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: + e_links_high[e_i] = f_i + 1 + i = i + 2 + + als = [] + for x in xrange(matching.start,matching.end): + al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) + als.append(al) + # check all source-side alignment constraints + met_constraints = 1 + if self.require_aligned_terminal: + num_aligned_chunks = 0 + for i from 0 <= i < num_chunks: + for j from 0 <= j < chunklen[i]: + if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: + num_aligned_chunks = num_aligned_chunks + 1 + break + if num_aligned_chunks == 0: + reason_for_failure = "No aligned terminals" + met_constraints = 0 + if self.require_aligned_chunks and num_aligned_chunks < num_chunks: + reason_for_failure = "Unaligned chunk" + met_constraints = 0 + + if met_constraints and self.tight_phrases: + # outside edge constraints are checked later + for i from 0 <= i < num_chunks-1: + if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + + f_low = matching.arr[matching.start] - f_sent_start + f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start + if met_constraints: + + if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + self.train_min_gap_size, 0, + self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): + gap_error = 0 + num_gaps = 0 + + if f_back_low < f_low: + f_gap_low[0] = f_back_low + f_gap_high[0] = f_low + num_gaps = 1 + gap_start = 0 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of preceding subphrase are not tight" + else: + gap_start = 1 + if self.tight_phrases and f_links_low[f_low] == -1: + # this is not a hard error. 
we can't extract this phrase + # but we still might be able to extract a superphrase + met_constraints = 0 + + for i from 0 <= i < matching.size - 1: + f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start + f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start + num_gaps = num_gaps + 1 + + if f_high < f_back_high: + f_gap_low[gap_start+num_gaps] = f_high + f_gap_high[gap_start+num_gaps] = f_back_high + num_gaps = num_gaps + 1 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of following subphrase are not tight" + else: + if self.tight_phrases and f_links_low[f_high-1] == -1: + met_constraints = 0 + + if gap_error == 0: + e_word_count = e_high - e_low + for i from 0 <= i < num_gaps: # check integrity of subphrases + if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, + f_gap_low+gap_start+i, f_gap_high+gap_start+i, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) == 0: + gap_error = 1 + reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) + break + + if gap_error == 0: + i = 1 + self.findexes.reset() + if f_back_low < f_low: + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + + fphr = rule.Phrase(fphr_arr) + if met_constraints: + phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, + f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) + for (phrase2,eindexes) in phrase_list: + als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als1))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) + #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) + + if (num_gaps < self.max_nonterminals and + phrase_len < self.max_length and + f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): + if (f_back_low == f_low and + f_low >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): + f_x_low = 
f_low-self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_back_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 1, 0, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and + self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, + e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2,eindexes in phrase_list: + als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als2))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + + if (f_back_high == f_high and + f_sent_len - f_high >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): + f_x_high = f_high+self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_back_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 0, 1, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, + f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + if f_back_low < f_low: + 
fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als3))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + if (num_gaps < self.max_nonterminals - 1 and + phrase_len+1 < self.max_length and + f_back_high == f_high and + f_back_low == f_low and + f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and + f_low >= self.train_min_gap_size and + f_high <= f_sent_len - self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): + + met_constraints = 1 + f_x_low = f_low-self.train_min_gap_size + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0: + met_constraints = 0 + + f_x_high = f_high+self.train_min_gap_size + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 2, 1, 1, 1, 1) and + ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and + self.find_fixpoint(f_x_low, f_low, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, + f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + 
fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als4))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + else: + reason_for_failure = "Unable to extract basic phrase" + + free(sent_links) + free(f_links_low) + free(f_links_high) + free(e_links_low) + free(e_links_high) + free(f_gap_low) + free(f_gap_high) + free(e_gap_low) + free(e_gap_high) + + if self.sample_file is not None: + self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) + + #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) + + + return extracts + diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl new file mode 100755 index 00000000..1cae83a7 --- /dev/null +++ b/sa-extract/sa-compile.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; + +my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); } + +my $rootdir = `dirname $0`; chomp $rootdir; +my $compile = "$rootdir/compile_bin.py"; +my $lcp = "$rootdir/lcp_ops.py"; +die "Can't find $compile" unless -f $compile; +die "Can't execute $compile" unless -x $compile; + +sub print_help; +sub cleanup; + +my $alignment; +my $bitext; +my $catalog; +my $dryrun = 0; +my $group; +my $help = 0; +my $ini = "$rootdir/extract.ini"; +my $lm; +my $precomp; +my $no_ini = 0; +my $remove; +my $type; +my $local_only = 1; +my $output; + +# Process command-line options +if (GetOptions( + "alignment=s" => \$alignment, + "bitext=s" => \$bitext, + "help" => \$help, + "ini=s" => \$ini, + "output=s" => \$output, + "precomp-options=s" => \$precomp, + "no-ini" => \$no_ini, +) == 0 || $help == 1 || @ARGV > 0){ + print_help; + die "\n"; +} + +open(INI, $ini) or die "Can't read $ini: $!"; + +$bitext || die "You must specify a bitext with -b\n"; +$alignment || die "You must specify an alignment with -a\n"; + +my $top_dir; +if (defined $output) { + $top_dir = $output; +} else { + $top_dir = "$cwd/sa-compiled"; +} + +my $type_dir = "$top_dir"; + +my $bitext_name; +my $bitext_f_file; +my $bitext_e_file; +my $bitext_dir; +if ($bitext){ + if ($bitext =~ /(.*)=(.*),(.*)/){ + $bitext_name = $1; + $bitext_f_file = $2; + $bitext_e_file = $3; + -e $bitext_f_file || die "Could not find file $bitext_f_file\n"; + -e $bitext_e_file || die "Could not find file $bitext_e_file\n"; + } else { + $bitext_name = $bitext; + } + + $bitext_dir = "$type_dir/bitext/$bitext_name"; + if ($bitext_f_file){ + if (-e $bitext_dir) { + die "Bitext $bitext_name already exists\n"; + } + } else { + unless (-e $bitext_dir){ + die "No bitext 
$bitext_name. You must specify bitext files with -b\n";
+    }
+  }
+}
+
+my $max_nt = 2;
+my $max_len = 5;
+my $max_size = 15;
+my $min_gap = 1;
+my $rank1 = 100;
+my $rank2 = 10;
+my $precomp_file;
+if ($precomp){
+  unless ($bitext_name){
+    die "You must specify a bitext with -b if using -p\n";
+  }
+  my @precomp_args = split(/,/, $precomp);
+  my $precomp_arg;
+  for $precomp_arg (@precomp_args){
+    if ($precomp_arg =~ /(.*)=(.*)/){
+      my $key = $1;
+      my $value = $2;
+      unless ($value =~ /^\d+$/){
+        die "Value for -p option must be a positive integer, found $value\n";
+      }
+      if ($key eq "max-len"){ $max_len = $value; }
+      elsif ($key eq "max-nt"){ $max_nt = $value; }
+      elsif ($key eq "max-size"){ $max_size = $value; }
+      elsif ($key eq "min-gap"){ $min_gap = $value; }
+      elsif ($key eq "rank1"){ $rank1 = $value; }
+      elsif ($key eq "rank2"){ $rank2 = $value; }
+      else {
+        die "Unknown option $key given for -p\n";
+      }
+    } else {
+      die "When using -p, you must specify key-value pairs using syntax: <key1>=<value1>,...,<keyN>=<valueN>\n";
+    }
+  }
+}
+my $precomp_compile_needed = 0;
+if ($bitext_name){
+  $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin";
+  unless (-e $precomp_file){
+    $precomp_compile_needed = 1;
+  }
+}
+
+my $alignment_name;
+my $alignment_file;
+my $alignment_dir;
+if ($alignment){
+  $bitext || die "Specified alignment $alignment without specifying bitext using -b\n";
+  if ($alignment =~ /(.*)=(.*)/){
+    $alignment_name = $1;
+    $alignment_file = $2;
+    -e $alignment_file || die "Could not find file $alignment_file\n";
+  } else {
+    $alignment_name = $alignment;
+  }
+
+  $alignment_dir = "$bitext_dir/a/$alignment_name";
+  if ($alignment_file){
+    if (-e $alignment_dir){
+      die "Alignment $alignment_name already exists for bitext $bitext_name\n";
+    }
+  } else {
+    # alignment given by name only: it must already have been deployed
+    unless (-e $alignment_dir){
+      die "No alignment $alignment_name for bitext $bitext_name\n";
+    }
+  }
+}
+
+if ($bitext_name){
+  print STDERR " Bitext = $bitext_name";
+  if ($bitext_f_file){
+    print STDERR " from files $bitext_f_file and $bitext_e_file";
+  }
+  print STDERR "\n";
+} else {
+  print STDERR " No bitext\n";
+}
+if ($precomp_compile_needed){
+  print STDERR " Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n";
+}
+if ($alignment_name){
+  print STDERR " Alignment = $alignment_name";
+  if ($alignment_file){
+    print STDERR " from file $alignment_file";
+  }
+  print STDERR "\n";
+} else {
+  print STDERR " No alignment\n";
+}
+
+my $script;
+my $compile_dir;
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+if ($bitext_e_file || $precomp_compile_needed || $alignment_file){
+  my $compiled_e_file;
+  my $compiled_f_file;
+
+  $compile_dir = $top_dir;
+  my $compile_top_dir = "$compile_dir";
+
+  my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name";
+  if ($bitext_e_file){
+    `mkdir -p $compile_bitext_dir`;
+    print STDERR "\nCompiling bitext (f side)...\n";
+    `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`;
+    die "Command failed: $!" unless $? == 0;
+    print STDERR "\nCompiling bitext (e side)...\n";
+    `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`;
+    die "Command failed: $!" unless $?
== 0; + + $compiled_f_file = "$compile_bitext_dir/f.sa.bin"; + $compiled_e_file = "$compile_bitext_dir/e.bin"; + } else { # bitext already compiled + $compiled_f_file = "$bitext_dir/f.sa.bin"; + $compiled_e_file = "$bitext_dir/e.bin"; + } + + if ($precomp_compile_needed){ + `mkdir -p $compile_bitext_dir`; + my $top_stats_file = "$compile_bitext_dir/f.top.$rank1"; + my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; + my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file"; + print STDERR "$cmd\n"; + `$cmd`; + die "Command failed: $cmd" unless $? == 0; + `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`; + die "Command failed: $!" unless $? == 0; + } + + if ($alignment_file){ + my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name"; + `mkdir -p $compile_alignment_dir`; + print STDERR "\nCompiling alignment...\n"; + my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + + print STDERR "\nCompiling lexical weights file...\n"; + $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + } + + chdir $compile_dir; + print STDERR "Compiling done: $compile_dir\n"; + } + + unless ($no_ini){ + my $line; + while($line=<INI>){ + $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/; + $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/; + $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/; + $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/; + $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/; + + $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/; + $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/; + $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/; + $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/; + $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/; + $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/; + + print $line; + } + } + +exit(0); + +sub cleanup { + die "Cleanup.\n"; +} + +sub print_help +{ + my $name = `basename $0`; chomp $name; + print << "Help"; + +usage: $name [options] + + Manage compilation of SA-Hiero files and creation of ini files. + In the default usage, the command deploys a set of files needed + to create a system, and writes an ini for the system on stdout. + +options: + + -a, --alignment <name>[=<filename>] + Name of an alignment of a bitext (which must be specified + with -b unless using the -c flag). If used with -r, the + alignment is removed from the deployment. If used with -c, + only alignments with this name are listed. If a filename is + given, then the file will be deployed using the name. + + -b, --bitext <name>[=<f file>,<e file>] + Name of a bitext for a particular system type (which must be + specified with -t unless using the -c flag). If used with -r, + the bitext is removed from the deployment. If used with -c, + only bitexts with the name are listed. If a filename is given, + then the file will be deployed using the name. + + -h, --help + Prints this message. 
+
+  -i, --ini <filename>
+    Use a specific ini file as the template for a system, rather than
+    the default ini file.
+
+  -p, --precomp-options <key1>=<value1>[,<key2>=<value2>,...,<keyN>=<valueN>]
+    Set parameters of the grammar. This must be set by $name because
+    many parameters involve precomputation. There are six keys that can
+    be set:
+      max-len: maximum number of symbols (T and NT) in a grammar rule
+      max-nt: maximum number of nonterminals in a grammar rule
+      max-size: maximum span of a grammar rule extracted from training
+      min-gap: minimum gap spanned by a nonterminal in training
+      rank1: number of frequent words to precompute collocations for.
+      rank2: number of super-frequent words to precompute triple
+        collocations for.
+    All values must be positive integers. If not specified, defaults are:
+      max-len = 5
+      max-nt = 2 (>2 not supported)
+      max-size = 15
+      min-gap = 1
+      rank1 = 100 (>300 not recommended)
+      rank2 = 10 (>10 not recommended)
+
+  -n, --no-ini
+    Do not generate an ini file on stdout. If this option is used, then
+    the requirement to specify a full system is relaxed. Therefore, this
+    option can be used when the sole objective is deployment of files.
+
+  -o, --output <directory>
+    Write the compiled model to this directory (default: sa-compiled
+    under the current working directory).
+
+Help
+}
diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg
new file mode 100644
index 00000000..8f696136
--- /dev/null
+++ b/sa-extract/setup.cfg
@@ -0,0 +1,2 @@
+[build_ext]
+inplace=1
diff --git a/sa-extract/setup.py b/sa-extract/setup.py
new file mode 100644
index 00000000..cdcbfb54
--- /dev/null
+++ b/sa-extract/setup.py
@@ -0,0 +1,45 @@
+from distutils.core import setup, Extension
+from distutils.util import get_platform
+import os.path
+
+cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc'])
+setup(name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module])
+
+rule_module = Extension('rule',
+                        sources = ['rule.c', 'strutil.c'])
+setup(name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module])
+
+sym_module = Extension('sym',
+                       sources = ['sym.c'])
+setup(name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module])
+
+cdat_module = Extension('cdat', sources = ['cdat.c'])
+setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module])
+
+cintlist_module = Extension('cintlist', sources = ['cintlist.c'])
+setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module])
+
+cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c'])
+setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module])
+
+calignment_module = Extension('calignment', sources = ['calignment.c'])
+setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module])
+
+csuf_module = Extension('csuf', sources = ['csuf.c'])
+setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module])
+
+clex_module = Extension('clex', sources = ['clex.c'])
+setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module])
+
+factory_module = Extension('rulefactory', sources = ['rulefactory.c'])
+setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module])
+
+cveb_module = Extension('cveb', sources = ['cveb.c'])
+setup(name = "CVEB",
version = '1.0', description = 'C impl. of van Emde Boas tree', ext_modules = [cveb_module]) + +lcp_module = Extension('lcp', sources = ['lcp.c']) +setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module]) + +precomp_module = Extension('precomputation', sources = ['precomputation.c']) +setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module]) + diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py new file mode 100644 index 00000000..2db8b5dc --- /dev/null +++ b/sa-extract/sgml.py @@ -0,0 +1,194 @@ +import sys, sgmllib, xml.sax.saxutils, sym + +def attrs_to_str(d): + if len(d) == 0: + return "" + l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d] + return " ".join(l) + +def attrs_to_dict(a): + d = {} + for (name, value) in a: + if d.has_key(name.lower()): + raise ValueError, "duplicate attribute names" + d[name.lower()] = value + return d + +class Sentence(object): + def __init__(self, words=None, meta=None): + if words is not None: + self.words = list(words) + else: + self.words = [] + if meta is not None: + self.meta = meta + else: + self.meta = [] + + def copy(self): + return Sentence(self.words, list(self.meta)) + + def mark(self, tag, attrs): + self.meta.append((tag, attrs, 0, len(self.words))) + + def getmark(self): + if len(self.meta) > 0: + (tag, attrs, i, j) = self.meta[-1] + if i == 0 and j == len(self.words): + return (tag, attrs) + else: + return None + else: + return None + + def unmark(self): + mark = self.getmark() + if mark is not None: + self.meta = self.meta[:-1] + return mark + + def __cmp__(self, other): + return cmp((self.words, self.meta), (other.words, other.meta)) + + def __str__(self): + def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)): + if i1==i2<=j1==j2: + return 0 + elif i2<=i1<=j1<=j2: + return -1 + elif i1<=i2<=j2<=j1: + return 1 + else: + return cmp((i1,j1),(i2,j2)) # don't care + # this guarantees that equal spans will come out nested + # we want the later spans to be outer + # this relies on stable sort + open = [[] for i in xrange(len(self.words)+1)] + # there seems to be a bug still with empty spans + empty = [[] for i in xrange(len(self.words)+1)] + close = [[] for j in xrange(len(self.words)+1)] + for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans): + if i == j: + # do we want these to nest? 
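+        # (empty spans are rendered as "<tag></tag>" and emitted between
+        # the tags that close at position i and those that open there,
+        # so they never nest inside neighboring spans)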
+ empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag)) + else: + open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs))) + close[j].append("</%s>\n" % tag) + + result = [] + if len(empty[0]) > 0: + result.extend(empty[0]) + for i in xrange(len(self.words)): + if i > 0: + result.append(" ") + result.extend(reversed(open[i])) + result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i]))) + result.extend(close[i+1]) + if len(empty[i+1]) > 0: + result.extend(empty[i+1]) + + return "".join(result) + + def __add__(self, other): + if type(other) in (list, tuple): + return Sentence(self.words + list(other), self.meta) + else: + othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta] + return Sentence(self.words + other.words, self.meta+othermeta) + +def read_raw(f): + """Read a raw file into a list of Sentences.""" + if type(f) is str: + f = file(f, "r") + i = 0 + line = f.readline() + while line != "": + sent = process_sgml_line(line, i) + mark = sent.getmark() + if mark is None: + sent.mark('seg', [('id',str(i))]) + else: + (tag, attrs) = mark + if tag == "seg" and not attrs_to_dict(attrs).has_key('id'): + x = ('id',str(i)) + attrs.append(x) + sent.mark('seg', attrs) + if tag != "seg": + sent.mark('seg', [('id',str(i))]) + yield sent + i += 1 + line = f.readline() + +def process_sgml_line(line, id=None): + p = DatasetParser(None) + p.pos = 0 + p.words = [] + p.meta = [] + p.feed(line) + p.close() + sent = Sentence(p.words, p.meta) + return sent + +class DatasetParser(sgmllib.SGMLParser): + def __init__(self, set): + sgmllib.SGMLParser.__init__(self) + self.words = None + self.mystack = [] + self.string = None + self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"} + def close(self): + self.convert() + sgmllib.SGMLParser.close(self) + + def handle_starttag(self, tag, method, attrs): + thing = method(attrs) + self.mystack.append(thing) + + def handle_endtag(self, tag, method): + thing = self.mystack.pop() + method(thing) + + def unknown_starttag(self, tag, attrs): + thing = self.start(tag, attrs) + self.mystack.append(thing) + + def unknown_endtag(self, tag): + thing = self.mystack.pop() + self.end(tag, thing) + + def start(self, tag, attrs): + self.convert() + if self.words is not None: + return (tag, attrs, self.pos, None) + else: + return None + + def convert(self): + if self.words is not None and self.string is not None: + words = self.string.split() + self.pos += len(words) + self.words.extend(words) + self.string = None + + def end(self, tag, thing): + self.convert() + if self.words is not None: + (tag, attrs, i, j) = thing + self.meta.append((tag, attrs, i, self.pos)) + + def handle_data(self, s): + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + + def handle_entityref(self, ref): + # s=self.convert_entityref(ref) # if python 2.5 + s=self.convref[ref] + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc new file mode 100644 index 00000000..5040477e --- /dev/null +++ b/sa-extract/strmap.cc @@ -0,0 +1,232 @@ +#include "strmap.h" + +#include <vector> +#include <string> +#include <tr1/unordered_map> +#include <stdint.h> + +using namespace std; +using namespace std::tr1; + +#undef HAVE_64_BITS + +#if INTPTR_MAX == INT32_MAX +# define HAVE_64_BITS 0 +#elif INTPTR_MAX >= INT64_MAX +# define HAVE_64_BITS 1 +#else +# error "couldn't tell if HAVE_64_BITS 
from INTPTR_MAX INT32_MAX INT64_MAX" +#endif + +typedef uintptr_t MurmurInt; + +// MurmurHash2, by Austin Appleby + +static const uint32_t DEFAULT_SEED=2654435769U; + +#if HAVE_64_BITS +//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); + +inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) +{ + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return (uint32_t) MurmurHash64(key,len,seed); +} + +inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return MurmurHash64(key,len,seed); +} + +#else +// 32-bit + +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
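+  // (the 13/15 shift-xor avalanche below is the final mix from
+  // Appleby's reference MurmurHash2 implementation)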
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { + return MurmurHash32(key,len,seed); +} + +// 64-bit hash for 32-bit platforms + +inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = seed ^ len; + uint32_t h2 = 0; + + const uint32_t * data = (const uint32_t *)key; + + while(len >= 8) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +#endif +//32bit + +struct MurmurHasher { + size_t operator()(const string& s) const { + return MurmurHash(s.c_str(), s.size()); + } +}; + +struct StrMap { + StrMap() { keys_.reserve(10000); keys_.push_back("<bad0>"); map_[keys_[0]] = 0; } + unordered_map<string, int, MurmurHasher> map_; + vector<string> keys_; +}; + +StrMap* stringmap_new() { + return new StrMap; +} + +void stringmap_delete(StrMap *vocab) { + delete vocab; +} + +int stringmap_index(StrMap *vocab, char *s) { + int& cell = vocab->map_[s]; + if (!cell) { + cell = vocab->keys_.size(); + vocab->keys_.push_back(s); + } + return cell; +} + +char* stringmap_word(StrMap *vocab, int i) { + return const_cast<char *>(vocab->keys_[i].c_str()); +} + diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h new file mode 100644 index 00000000..a218a4c0 --- /dev/null +++ b/sa-extract/strmap.h @@ -0,0 +1,22 @@ +#ifndef _STRMAP_H_ +#define _STRMAP_H_ + +#ifdef __cplusplus + extern "C" { +#else + typedef struct StrMap StrMap; /* dummy type to stand in for class */ +#endif + +struct StrMap; + +StrMap* stringmap_new(); +void stringmap_delete(StrMap *vocab); +int stringmap_index(StrMap *vocab, char *s); +char* stringmap_word(StrMap *vocab, int i); + +#ifdef __cplusplus + } +#endif + + +#endif diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c new file mode 100644 index 00000000..456de87a --- /dev/null +++ b/sa-extract/strutil.c @@ -0,0 +1,63 @@ +#include <string.h> +#include <stdlib.h> + +/* Like strsep(3) except that the delimiter is a string, not a set of characters. +*/ +char *strstrsep(char **stringp, const char *delim) { + char *match, *save; + save = *stringp; + if (*stringp == NULL) + return NULL; + match = strstr(*stringp, delim); + if (match == NULL) { + *stringp = NULL; + return save; + } + *match = '\0'; + *stringp = match + strlen(delim); + return save; +} + +static char **words = NULL; +static int max_words; +char **split(char *s, const char *delim, int *pn) { + int i; + char *tok, *rest; + + if (words == NULL) { + max_words = 10; + words = malloc(max_words*sizeof(char *)); + } + i = 0; + rest = s; + while ((tok = (delim ? 
strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) { + if (!delim && !*tok) // empty token + continue; + while (i+1 >= max_words) { + max_words *= 2; + words = realloc(words, max_words*sizeof(char *)); + } + words[i] = tok; + i++; + } + words[i] = NULL; + if (pn != NULL) + *pn = i; + return words; +} + +inline int isspace(char c) { + return (c == ' ' || c == '\t' || c == '\n'); +} + +char *strip(char *s) { + int n; + while (isspace(*s) && *s != '\0') + s++; + n = strlen(s); + while (n > 0 && isspace(s[n-1])) { + s[n-1] = '\0'; + n--; + } + return s; +} diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h new file mode 100644 index 00000000..94a77033 --- /dev/null +++ b/sa-extract/strutil.h @@ -0,0 +1,8 @@ +#ifndef STRUTIL_H +#define STRUTIL_H + +char *strstrsep(char **stringp, const char *delim); +char *strip(char *s); +char **split(char *s, const char *delim, int *pn); + +#endif diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd new file mode 100644 index 00000000..d0650f46 --- /dev/null +++ b/sa-extract/sym.pxd @@ -0,0 +1,17 @@ +cimport cstrmap + +cdef class Alphabet: + cdef readonly cstrmap.StringMap terminals, nonterminals + cdef int first_nonterminal, last_nonterminal + cdef int isvar(self, int sym) + cdef int isword(self, int sym) + cdef int getindex(self, int sym) + cdef int setindex(self, int sym, int ind) + cdef int clearindex(self, int sym) + cdef int match(self, int sym1, int sym2) + cdef char* tocat(self, int sym) + cdef int fromcat(self, char *s) + cdef char* tostring(self, int sym) + cdef int fromstring(self, char *s, int terminal) + + diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx new file mode 100644 index 00000000..264cfcd9 --- /dev/null +++ b/sa-extract/sym.pyx @@ -0,0 +1,155 @@ +from libc.string cimport strrchr, strstr, strcpy, strlen +from libc.stdlib cimport malloc, realloc, strtol + +cdef int index_shift, index_mask, n_index +index_shift = 3 +n_index = 1<<index_shift +index_mask = (1<<index_shift)-1 +cdef id2sym +id2sym = {} + +cdef class Alphabet: + def __cinit__(self): + self.terminals = cstrmap.StringMap() + self.nonterminals = cstrmap.StringMap() + + def __init__(self): + self.first_nonterminal = -1 + + def __dealloc__(self): + pass + + cdef int isvar(self, int sym): + return sym < 0 + + cdef int isword(self, int sym): + return sym >= 0 + + cdef int getindex(self, int sym): + return -sym & index_mask + + cdef int setindex(self, int sym, int ind): + return -(-sym & ~index_mask | ind) + + cdef int clearindex(self, int sym): + return -(-sym& ~index_mask) + + cdef int match(self, int sym1, int sym2): + return self.clearindex(sym1) == self.clearindex(sym2); + + cdef char* tocat(self, int sym): + return self.nonterminals.word((-sym >> index_shift)-1) + + cdef int fromcat(self, char *s): + cdef int i + i = self.nonterminals.index(s) + if self.first_nonterminal == -1: + self.first_nonterminal = i + if i > self.last_nonterminal: + self.last_nonterminal = i + return -(i+1 << index_shift) + + cdef char* tostring(self, int sym): + cdef int ind + if self.isvar(sym): + if sym in id2sym: + return id2sym[sym] + + ind = self.getindex(sym) + if ind > 0: + id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind) + else: + id2sym[sym] = "[%s]" % self.tocat(sym) + return id2sym[sym] + + else: + return self.terminals.word(sym) + + cdef int fromstring(self, char *s, int terminal): + """Warning: this method is allowed to alter s.""" + cdef char *comma + cdef int n + n = strlen(s) + cdef char *sep + sep = strstr(s,"_SEP_") + if n >= 3 and s[0] == c'[' and s[n-1] == c']' and 
sep == NULL: + if terminal: + s1 = "\\"+s + return self.terminals.index(s1) + s[n-1] = c'\0' + s = s + 1 + comma = strrchr(s, c',') + if comma != NULL: + comma[0] = c'\0' + return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10)) + else: + return self.fromcat(s) + else: + return self.terminals.index(s) + +# Expose Python functions as top-level functions for backward compatibility + +alphabet = Alphabet() + +cdef Alphabet calphabet +calphabet = alphabet + +def isvar(int sym): + return calphabet.isvar(sym) + +def isword(int sym): + return calphabet.isword(sym) + +def getindex(int sym): + return calphabet.getindex(sym) + +def setindex(int sym, int ind): + return calphabet.setindex(sym, ind) + +def clearindex(int sym): + return calphabet.clearindex(sym) + +def match(int sym1, int sym2): + return calphabet.match(sym1, sym2) != 0 + +def totag(int sym): + return calphabet.tocat(sym) + +def fromtag(s): + s = s.upper() + return calphabet.fromcat(s) + +def tostring(sym): + if type(sym) is str: + return sym + else: + return calphabet.tostring(sym) + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = <char *>malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = <char *>realloc(buf, size*sizeof(char)) + bufsize = size + +def fromstring(s, terminal=False): + cdef bytes bs + cdef char* cs + if terminal: + return calphabet.fromstring(s, 1) + else: + bs = s + cs = bs + ensurebufsize(len(s)+1) + strcpy(buf, cs) + return calphabet.fromstring(buf, 0) + +def nonterminals(): + cdef i + l = [] + for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal: + l.append(-(i+1 << index_shift)) + return l diff --git a/training/Makefile.am b/training/Makefile.am index 2a11ae52..991ac210 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,12 +1,12 @@ bin_PROGRAMS = \ model1 \ + lbl_model \ test_ngram \ mr_em_map_adapter \ mr_em_adapted_reduce \ mr_reduce_to_weights \ mr_optimize_reduce \ grammar_convert \ - atools \ plftools \ collapse_weights \ mpi_extract_reachable \ @@ -47,12 +47,12 @@ augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/lib test_ngram_SOURCES = test_ngram.cc test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz - model1_SOURCES = model1.cc ttables.cc model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +lbl_model_SOURCES = lbl_model.cc optimize.cc +lbl_model_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz + grammar_convert_SOURCES = grammar_convert.cc grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/training/em_utils.h b/training/em_utils.h deleted file mode 100644 index 37762978..00000000 --- a/training/em_utils.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _EM_UTILS_H_ -#define _EM_UTILS_H_ - -#include "config.h" -#ifdef HAVE_BOOST_DIGAMMA -#include <boost/math/special_functions/digamma.hpp> -using boost::math::digamma; -#else -#warning Using Mark Johnsons digamma() -#include <cmath> -inline double digamma(double x) { - double result = 0, xx, xx2, xx4; - assert(x > 0); - for ( ; x < 7; ++x) - result -= 1/x; - x -= 1.0/2.0; - xx = 1.0/x; - xx2 = xx*xx; - xx4 = xx2*xx2; - result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4; - 
return result; -} -#endif -#endif diff --git a/training/lbl_model.cc b/training/lbl_model.cc new file mode 100644 index 00000000..a46ce33c --- /dev/null +++ b/training/lbl_model.cc @@ -0,0 +1,421 @@ +#include <iostream> + +#include "config.h" +#ifndef HAVE_EIGEN + int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } +#else + +#include <cstdlib> +#include <algorithm> +#include <cmath> +#include <set> +#include <cstring> // memset +#include <ctime> + +#ifdef HAVE_MPI +#include <boost/mpi/timer.hpp> +#include <boost/mpi.hpp> +#include <boost/archive/text_oarchive.hpp> +namespace mpi = boost::mpi; +#endif +#include <boost/math/special_functions/fpclassify.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +#include <Eigen/Dense> + +#include "corpus_tools.h" +#include "optimize.h" +#include "array2d.h" +#include "m.h" +#include "lattice.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" + +namespace po = boost::program_options; +using namespace std; + +#define kDIMENSIONS 10 +typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector; +typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector; +typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix; +vector<RVector> r_src, r_trg; + +#if HAVE_MPI +namespace boost { +namespace serialization { + +template<class Archive> +void serialize(Archive & ar, RVector & v, const unsigned int version) { + for (unsigned i = 0; i < kDIMENSIONS; ++i) + ar & v[i]; +} + +} // namespace serialization +} // namespace boost +#endif + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input,i",po::value<string>(),"Input file") + ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") + ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") + ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD") + ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)") + ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)") + ("random_seed,s", po::value<unsigned>(), "Random seed") + ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") + ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (argc < 2 || conf->count("help")) { + cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void Normalize(RVector* v) { + double norm = v->norm(); + assert(norm > 0.0f); + *v /= norm; +} + +void Flatten(const TMatrix& 
m, vector<double>* v) { + unsigned c = 0; + v->resize(kDIMENSIONS * kDIMENSIONS); + for (unsigned i = 0; i < kDIMENSIONS; ++i) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isfinite(m(i, j))); + (*v)[c++] = m(i,j); + } +} + +void Unflatten(const vector<double>& v, TMatrix* m) { + unsigned c = 0; + for (unsigned i = 0; i < kDIMENSIONS; ++i) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isfinite(v[c])); + (*m)(i, j) = v[c++]; + } +} + +double ApplyRegularization(const double C, + const vector<double>& weights, + vector<double>* g) { + assert(weights.size() == g->size()); + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + } + return reg; +} + +void LoadEmbeddings(const string& filename, vector<RVector>* pv) { + vector<RVector>& v = *pv; + cerr << "Reading embeddings from " << filename << " ...\n"; + ReadFile rf(filename); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + size_t cur = line.find(' '); + if (cur == string::npos || cur == 0) { + cerr << "Parse error reading line " << lc << ":\n" << line << endl; + abort(); + } + WordID w = TD::Convert(line.substr(0, cur)); + if (w >= v.size()) continue; + RVector& curv = v[w]; + line[cur] = 0; + size_t start = cur + 1; + cur = start + 1; + size_t c = 0; + while(cur < line.size()) { + if (line[cur] == ' ') { + line[cur] = 0; + curv[c++] = strtod(&line[start], NULL); + start = cur + 1; + cur = start; + if (c == kDIMENSIONS) break; + } + ++cur; + } + if (c < kDIMENSIONS && cur != start) { + if (cur < line.size()) line[cur] = 0; + curv[c++] = strtod(&line[start], NULL); + } + if (c != kDIMENSIONS) { + static bool first = true; + if (first) { + cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; + first = false; + } + for (; c < kDIMENSIONS; ++c) curv[c] = rand(); + } + if (c == kDIMENSIONS && cur != line.size()) { + static bool first = true; + if (first) { + cerr << " embedding file contains more dimensions than configured with, truncating.\n"; + first = false; + } + } + } +} + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + std::cerr << "**MPI enabled.\n"; + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + std::cerr << "**MPI disabled.\n"; + const int rank = 0; + const int size = 1; +#endif + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + const string fname = conf["input"].as<string>(); + const double reg_strength = conf["regularization_strength"].as<double>(); + const bool has_l2 = reg_strength; + assert(reg_strength >= 0.0f); + const int ITERATIONS = conf["iterations"].as<unsigned>(); + const double eta = conf["eta"].as<double>(); + const double diagonal_tension = conf["diagonal_tension"].as<double>(); + bool SGD = false; + if (diagonal_tension < 0.0) { + cerr << "Invalid value for diagonal_tension: must be >= 0\n"; + return 1; + } + string testset; + if (conf.count("testset")) testset = conf["testset"].as<string>(); + + unsigned lc = 0; + vector<double> unnormed_a_i; + bool flag = false; + vector<vector<WordID> > srcs, trgs; + vector<WordID> vocab_e; + { + set<WordID> svocab_e, svocab_f; + CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); + copy(svocab_e.begin(), svocab_e.end(), 
back_inserter(vocab_e)); + } + cerr << "Number of target word types: " << vocab_e.size() << endl; + const double num_examples = lc; + + boost::shared_ptr<LBFGSOptimizer> lbfgs; + if (rank == 0) + lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); + r_trg.resize(TD::NumWords() + 1); + r_src.resize(TD::NumWords() + 1); + vector<set<unsigned> > trg_pos(TD::NumWords() + 1); + + if (conf.count("random_seed")) { + srand(conf["random_seed"].as<unsigned>()); + } else { + unsigned seed = time(NULL) + rank * 100; + cerr << "Random seed: " << seed << endl; + srand(seed); + } + + TMatrix t = TMatrix::Zero(); + if (rank == 0) { + t = TMatrix::Random() / 50.0; + for (unsigned i = 1; i < r_trg.size(); ++i) { + r_trg[i] = RVector::Random(); + r_src[i] = RVector::Random(); + } + if (conf.count("source_embeddings")) + LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src); + if (conf.count("target_embeddings")) + LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg); + } + + // do optimization + TMatrix g = TMatrix::Zero(); + vector<TMatrix> exp_src; + vector<double> z_src; + vector<double> flat_g, flat_t, rcv_grad; + Flatten(t, &flat_t); + bool converged = false; +#if HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); + mpi::broadcast(world, r_trg, 0); + mpi::broadcast(world, r_src, 0); +#endif + cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; + for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { + if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; + Unflatten(flat_t, &t); + double likelihood = 0; + double denom = 0.0; + lc = 0; + flag = false; + g *= 0; + for (unsigned i = 0; i < srcs.size(); ++i) { + const vector<WordID>& src = srcs[i]; + const vector<WordID>& trg = trgs[i]; + ++lc; + if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } + if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } + denom += trg.size(); + + exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); + z_src.clear(); z_src.resize(src.size(), 0.0); + Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero()); + Array2D<double> z_refs(src.size(), trg.size(), 0.0); + for (unsigned j = 0; j < trg.size(); ++j) + trg_pos[trg[j]].insert(j); + + for (unsigned i = 0; i < src.size(); ++i) { + const RVector& r_s = r_src[src[i]]; + const RTVector pred = r_s.transpose() * t; + TMatrix& exp_m = exp_src[i]; + double& z = z_src[i]; + for (unsigned k = 0; k < vocab_e.size(); ++k) { + const WordID v_k = vocab_e[k]; + const RVector& r_t = r_trg[v_k]; + const double dot_prod = pred * r_t; + const double u = exp(dot_prod); + z += u; + const TMatrix v = r_s * r_t.transpose() * u; + exp_m += v; + set<unsigned>& ref_locs = trg_pos[v_k]; + if (!ref_locs.empty()) { + for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { + TMatrix& exp_ref_ij = exp_refs(i, *it); + double& z_ref_ij = z_refs(i, *it); + z_ref_ij += u; + exp_ref_ij += v; + } + } + } + } + for (unsigned j = 0; j < trg.size(); ++j) + trg_pos[trg[j]].clear(); + + // model expectations for a single target generation with + // uniform alignment prior + // TODO: when using a non-uniform alignment, m_exp will be + // a function of j (below) + double m_z = 0; + TMatrix m_exp = TMatrix::Zero(); + for (unsigned i = 0; i < src.size(); ++i) { + m_exp += exp_src[i]; + m_z += z_src[i]; + } + m_exp /= m_z; + + Array2D<bool> al(src.size(), trg.size(), false); + for (unsigned j = 0; j < trg.size(); ++j) { + double ref_z = 0; + TMatrix ref_exp = 
TMatrix::Zero(); + int max_i = 0; + double max_s = -9999999; + for (unsigned i = 0; i < src.size(); ++i) { + ref_exp += exp_refs(i, j); + ref_z += z_refs(i, j); + if (log(z_refs(i, j)) > max_s) { + max_s = log(z_refs(i, j)); + max_i = i; + } + // TODO handle alignment prob + } + if (ref_z <= 0) { + cerr << "TRG=" << TD::Convert(trg[j]) << endl; + cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; + cerr << " REF_EXP=\n" << ref_exp << endl; + cerr << " M_EXP=\n" << m_exp << endl; + abort(); + } + al(max_i, j) = true; + ref_exp /= ref_z; + g += m_exp - ref_exp; + likelihood += log(ref_z) - log(m_z); + if (SGD) { + t -= g * eta / num_examples; + g *= 0; + } + } + + if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } + } + if (flag && rank == 0) { cerr << endl; } + + double obj = 0; + if (!SGD) { + Flatten(g, &flat_g); + obj = -likelihood; +#if HAVE_MPI + rcv_grad.resize(flat_g.size(), 0.0); + mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0); + swap(flat_g, rcv_grad); + rcv_grad.clear(); + + double to = 0; + mpi::reduce(world, obj, to, plus<double>(), 0); + obj = to; + double tlh = 0; + mpi::reduce(world, likelihood, tlh, plus<double>(), 0); + likelihood = tlh; + double td = 0; + mpi::reduce(world, denom, td, plus<double>(), 0); + denom = td; +#endif + } + + if (rank == 0) { + double gn = 0; + for (unsigned i = 0; i < flat_g.size(); ++i) + gn += flat_g[i]*flat_g[i]; + const double base2_likelihood = likelihood / log(2); + cerr << " log_e likelihood: " << likelihood << endl; + cerr << " log_2 likelihood: " << base2_likelihood << endl; + cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; + cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + cerr << " gradient norm: " << sqrt(gn) << endl; + if (!SGD) { + if (has_l2) { + const double r = ApplyRegularization(reg_strength, + flat_t, + &flat_g); + obj += r; + cerr << " regularization: " << r << endl; + } + lbfgs->Optimize(obj, flat_g, &flat_t); + converged = (lbfgs->HasConverged()); + } + } +#ifdef HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); + mpi::broadcast(world, converged, 0); +#endif + } + if (rank == 0) + cerr << "TRANSLATION MATRIX:" << endl << t << endl; + return 0; +} + +#endif + diff --git a/training/model1.cc b/training/model1.cc index b9590ece..73104304 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -4,12 +4,12 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "m.h" #include "lattice.h" #include "stringlib.h" #include "filelib.h" #include "ttables.h" #include "tdict.h" -#include "em_utils.h" namespace po = boost::program_options; using namespace std; @@ -20,7 +20,12 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("iterations,i",po::value<unsigned>()->default_value(5),"Number of iterations of EM training") ("beam_threshold,t",po::value<double>()->default_value(-4),"log_10 of beam threshold (-10000 to include everything, 0 max)") ("no_null_word,N","Do not generate from the null token") + ("write_alignments,A", "Write alignments instead of parameters") + ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal") + ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") + ("prob_align_null", po::value<double>()->default_value(0.08), "When 
--favor_diagonal is set, what's the probability of a null alignment?") ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights") + ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model") ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); po::options_description clo("Command line options"); @@ -56,7 +61,14 @@ int main(int argc, char** argv) { const WordID kNULL = TD::Convert("<eps>"); const bool add_viterbi = (conf.count("no_add_viterbi") == 0); const bool variational_bayes = (conf.count("variational_bayes") > 0); + const bool write_alignments = (conf.count("write_alignments") > 0); + const double diagonal_tension = conf["diagonal_tension"].as<double>(); + const double prob_align_null = conf["prob_align_null"].as<double>(); + string testset; + if (conf.count("testset")) testset = conf["testset"].as<string>(); + const double prob_align_not_null = 1.0 - prob_align_null; const double alpha = conf["alpha"].as<double>(); + const bool favor_diagonal = conf.count("favor_diagonal"); if (variational_bayes && alpha <= 0.0) { cerr << "--alpha must be > 0\n"; return 1; @@ -64,6 +76,9 @@ int main(int argc, char** argv) { TTable tt; TTable::Word2Word2Double was_viterbi; + double tot_len_ratio = 0; + double mean_srclen_multiplier = 0; + vector<double> unnormed_a_i; for (int iter = 0; iter < ITERATIONS; ++iter) { const bool final_iteration = (iter == (ITERATIONS - 1)); cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl; @@ -74,13 +89,13 @@ int main(int argc, char** argv) { int lc = 0; bool flag = false; string line; + string ssrc, strg; while(true) { getline(in, line); if (!in) break; ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - string ssrc, strg; ParseTranslatorInput(line, &ssrc, &strg); Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); @@ -90,34 +105,60 @@ int main(int argc, char** argv) { assert(src.size() > 0); assert(trg.size() > 0); } + if (src.size() > unnormed_a_i.size()) + unnormed_a_i.resize(src.size()); + if (iter == 0) + tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size()); denom += trg.size(); vector<double> probs(src.size() + 1); - const double src_logprob = -log(src.size() + 1); + bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) if (use_null) { - probs[0] = tt.prob(kNULL, f_j); + if (favor_diagonal) prob_a_i = prob_align_null; + probs[0] = tt.prob(kNULL, f_j) * prob_a_i; sum += probs[0]; } + double az = 0; + if (favor_diagonal) { + for (int ta = 0; ta < src.size(); ++ta) { + unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az += unnormed_a_i[ta]; + } + az /= prob_align_not_null; + } for (int i = 1; i <= src.size(); ++i) { - probs[i] = tt.prob(src[i-1][0].label, f_j); + if (favor_diagonal) + prob_a_i = unnormed_a_i[i-1] / az; + probs[i] = tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } if (final_iteration) { - if (add_viterbi) { + if (add_viterbi || 
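
The favor_diagonal option replaces Model 1's uniform alignment prior with p(a_j = i) proportional to exp(-lambda * |i/n - j/m|), where lambda is diagonal_tension, n and m are the source and target lengths, and a fixed mass prob_align_null is reserved for the null word; az normalizes the remaining 1 - prob_align_null over the source positions. A self-contained sketch of the same distribution:

    #include <cmath>
    #include <vector>

    // p[0] = p_null (null word); p[i] for i = 1..n shares 1 - p_null,
    // weighted by how close source position i-1 lies to the diagonal for target j.
    std::vector<double> DiagonalPrior(int n, int m, int j, double lambda, double p_null) {
      std::vector<double> p(n + 1);
      p[0] = p_null;
      double z = 0;
      for (int i = 0; i < n; ++i)
        z += (p[i + 1] = exp(-lambda * fabs(double(i) / n - double(j) / m)));
      for (int i = 1; i <= n; ++i)
        p[i] *= (1.0 - p_null) / z;
      return p;
    }
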
write_alignments) { WordID max_i = 0; double max_p = -1; + int max_index = -1; if (use_null) { max_i = kNULL; + max_index = 0; max_p = probs[0]; } for (int i = 1; i <= src.size(); ++i) { if (probs[i] > max_p) { + max_index = i; max_p = probs[i]; max_i = src[i-1][0].label; } } + if (write_alignments) { + if (max_index > 0) { + if (first_al) first_al = false; else cout << ' '; + cout << (max_index - 1) << "-" << j; + } + } was_viterbi[max_i][f_j] = 1.0; } } else { @@ -126,14 +167,19 @@ int main(int argc, char** argv) { for (int i = 1; i <= src.size(); ++i) tt.Increment(src[i-1][0].label, f_j, probs[i] / sum); } - likelihood += log(sum) + src_logprob; + likelihood += log(sum); } + if (write_alignments && final_iteration) cout << endl; } // log(e) = 1.0 double base2_likelihood = likelihood / log(2); if (flag) { cerr << endl; } + if (iter == 0) { + mean_srclen_multiplier = tot_len_ratio / lc; + cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; + } cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; @@ -145,6 +191,55 @@ int main(int argc, char** argv) { tt.Normalize(); } } + if (testset.size()) { + ReadFile rf(testset); + istream& in = *rf.stream(); + int lc = 0; + double tlp = 0; + string ssrc, strg, line; + while (getline(in, line)) { + ++lc; + ParseTranslatorInput(line, &ssrc, &strg); + Lattice src, trg; + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); + double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); + if (src.size() > unnormed_a_i.size()) + unnormed_a_i.resize(src.size()); + + // compute likelihood + for (int j = 0; j < trg.size(); ++j) { + const WordID& f_j = trg[j][0].label; + double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) + if (use_null) { + if (favor_diagonal) prob_a_i = prob_align_null; + sum += tt.prob(kNULL, f_j) * prob_a_i; + } + double az = 0; + if (favor_diagonal) { + for (int ta = 0; ta < src.size(); ++ta) { + unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az += unnormed_a_i[ta]; + } + az /= prob_align_not_null; + } + for (int i = 1; i <= src.size(); ++i) { + if (favor_diagonal) + prob_a_i = unnormed_a_i[i-1] / az; + sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i; + } + log_prob += log(sum); + } + tlp += log_prob; + cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl; + } + cerr << "TOTAL LOG PROB " << tlp << endl; + } + + if (write_alignments) return 0; + for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) { const TTable::Word2Double& cpd = ei->second; const TTable::Word2Double& vit = was_viterbi[ei->first]; diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc index 00746532..a9197208 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/mpi_flex_optimize.cc @@ -205,7 +205,7 @@ int main(int argc, char** argv) { const int size = 1; const int rank = 0; #endif - if (size > 0) SetSilent(true); // turn off verbose decoder output + if (size > 1) SetSilent(true); // turn off verbose decoder output register_feature_functions(); MT19937* rng = NULL; @@ -272,6 +272,7 @@ int main(int argc, char** argv) { int iter = -1; bool converged = false; + vector<double> gg; while (!converged) { #ifdef HAVE_MPI mpi::timer timer; @@ 
-343,7 +344,7 @@ int main(int argc, char** argv) { double obj = 0; #ifdef HAVE_MPI - // TODO obj + reduce(world, local_obj, obj, std::plus<double>(), 0); reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0); #else obj = local_obj; @@ -354,13 +355,14 @@ int main(int argc, char** argv) { // g /= (size_per_proc * size); if (!o) o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers)); - vector<double> gg(FD::NumFeats()); + gg.clear(); + gg.resize(FD::NumFeats()); if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); } for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it) if (it->first) { gg[it->first] = it->second; } g.clear(); double r = ApplyRegularizationTerms(regularization_strength, - time_series_strength * (iter == 0 ? 0.0 : 1.0), + time_series_strength, // * (iter == 0 ? 0.0 : 1.0), cur_weights, prev_weights, &gg); @@ -375,10 +377,9 @@ int main(int argc, char** argv) { o->Optimize(obj, gg, &cur_weights); } #ifdef HAVE_MPI - // broadcast(world, x, 0); + broadcast(world, cur_weights, 0); broadcast(world, converged, 0); world.barrier(); - if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; } #endif } prev_weights = cur_weights; diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc index d4c16a2f..f65b5440 100644 --- a/training/mr_em_adapted_reduce.cc +++ b/training/mr_em_adapted_reduce.cc @@ -10,7 +10,7 @@ #include "fdict.h" #include "weights.h" #include "sparse_vector.h" -#include "em_utils.h" +#include "m.h" using namespace std; namespace po = boost::program_options; @@ -63,11 +63,11 @@ void Maximize(const bool use_vb, assert(tot > 0.0); double ltot = log(tot); if (use_vb) - ltot = digamma(tot + total_event_types * alpha); + ltot = Md::digamma(tot + total_event_types * alpha); for (SparseVector<double>::const_iterator it = counts.begin(); it != counts.end(); ++it) { if (use_vb) { - pc->set_value(it->first, NoZero(digamma(it->second + alpha) - ltot)); + pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); } else { pc->set_value(it->first, NoZero(log(it->second) - ltot)); } diff --git a/training/ttables.h b/training/ttables.h index 50d85a68..bf3351d2 100644 --- a/training/ttables.h +++ b/training/ttables.h @@ -4,9 +4,9 @@ #include <iostream> #include <tr1/unordered_map> +#include "m.h" #include "wordid.h" #include "tdict.h" -#include "em_utils.h" class TTable { public: @@ -39,7 +39,7 @@ class TTable { for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) tot += it->second + alpha; for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second = exp(digamma(it->second + alpha) - digamma(tot)); + it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot)); } counts.clear(); } diff --git a/utils/Makefile.am b/utils/Makefile.am index df667655..3ea21835 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,26 +1,31 @@ -bin_PROGRAMS = reconstruct_weights +bin_PROGRAMS = reconstruct_weights atools -noinst_PROGRAMS = ts phmt -TESTS = ts phmt +noinst_PROGRAMS = ts phmt mfcr_test +TESTS = ts phmt mfcr_test if HAVE_GTEST noinst_PROGRAMS += \ + crp_test \ dict_test \ + m_test \ weights_test \ logval_test \ small_vector_test -TESTS += small_vector_test logval_test weights_test dict_test +TESTS += crp_test small_vector_test logval_test weights_test dict_test m_test endif reconstruct_weights_SOURCES = reconstruct_weights.cc +atools_SOURCES = atools.cc + noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ 
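
The two digamma call sites just above implement the variational Bayes M-step: the maximum-likelihood estimate c/tot is replaced by exp(psi(c + alpha) - psi(tot)), which discounts small counts under a symmetric Dirichlet(alpha) prior. A worked check, calling boost::math::digamma directly rather than Md::digamma:

    #include <boost/math/special_functions/digamma.hpp>
    #include <cmath>
    #include <iostream>

    int main() {
      using boost::math::digamma;
      const double alpha = 0.01, c1 = 3, c2 = 1;
      const double tot = c1 + alpha + c2 + alpha;
      // prints roughly 0.714 and 0.162: the VB estimates sum to less than 1,
      // unlike the ML estimates 0.75 and 0.25, reflecting posterior uncertainty
      std::cout << exp(digamma(c1 + alpha) - digamma(tot)) << ' '
                << exp(digamma(c2 + alpha) - digamma(tot)) << std::endl;
    }
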
alignment_pharaoh.cc \ b64tools.cc \ + corpus_tools.cc \ dict.cc \ tdict.cc \ fdict.cc \ @@ -38,10 +43,16 @@ endif phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc +m_test_SOURCES = m_test.cc +m_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) dict_test_SOURCES = dict_test.cc dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +mfcr_test_SOURCES = mfcr_test.cc +mfcr_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) weights_test_SOURCES = weights_test.cc weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +crp_test_SOURCES = crp_test.cc +crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) logval_test_SOURCES = logval_test.cc logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) small_vector_test_SOURCES = small_vector_test.cc diff --git a/utils/agenda.h b/utils/agenda.h deleted file mode 100755 index d4f13696..00000000 --- a/utils/agenda.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef AGENDA_H -#define AGENDA_H - -#define DBG_AGENDA(x) x -/* - a priority queue where you expect to queue the same item at different - priorities several times before finally popping it. higher priority = better. - so in best first you'd be using negative cost or e^-cost (probabilities, in - other words). - - this means you have a way to look up a key and see its location in the queue, - so its priority can be adjusted (or, simpler implementation: so when you pop, - you see if you've already popped before at a lower cost, and skip the - subsequent pops). - - it's assumed that you'll never queue an item @ a better priority after it has - already been popped. that is, the agenda will track already completed items. - maybe in the future i will let you recompute a cheaper way to reach things - after first-pop also, it's assumed that we're always improving prios of - existing items, never making them worse (even though technically this is - possible and sensible if it hasn't been popped yet). - - simple binary max heap for now. there are better practical options w/ - superior cache locaility. movements in the heap need to update a record for - that key of where the key went. i do this by creating canonical key pointers - out of boost object pools (if the key were lightweight e.g. an int, then it - would make sense to use the hash lookup too - - since i'm doing key hashing to start with, i also allow you to attach some - arbitrary data (value) payload beyond key+priority. - - hash map from key to done (has been popped) -> set where doneness is marked in key item? - - a slightly different way to make an adjustable heap would be to use - tree-structured parent/children links intrusively (or mapped by key) in the - key, rather than indices in a compact binary-tree heap - - */ - -#include "best.h" -#include "intern_pool.h" -#include "d_ary_heap.h" -#include "lvalue_pmap.h" -#include <vector> -#include <functional> - -/* -template <class P> -struct priority_traits { - typedef typename P::priority_type priority_type; -}; -*/ - -typedef best_t agenda_best_t; -typedef unsigned agenda_location_t; - -PMAP_MEMBER_INDIRECT(LocationMap,agenda_location_t,location) -PMAP_MEMBER_INDIRECT(PriorityMap,agenda_best_t,priority) - -struct Less { - typedef bool result_type; - template <class A,class B> - bool operator()(A const& a,B const& b) const { return a<b; } -}; - -// LocMap and PrioMap are boost property maps put(locmap,key,size_t), Better(get(priomap,k1),get(priomap,k2)) means k1 should be above k2 (be popped first). 
Locmap and PrioMap may have state; the rest are assumed stateless functors -// make sure the (default) location is not -1 for anything you add, or else an assertion may trigger -template <class Item,class Better=Less, /* intern_pool args */ class KeyF=get_key<Item>,class HashKey=boost::hash<typename KeyF::result_type>,class EqKey=std::equal_to<typename KeyF::result_type>, class Pool=boost::object_pool<Item> > -struct Agenda : intern_pool<Item,KeyF,HashKey,EqKey,Pool> { - typedef intern_pool<Item,KeyF,HashKey,EqKey,Pool> Intern; // inherited because I want to use construct() - /* this is less generic than it could be, because I want to use a single hash mapping to intern to canonical mutable object pointers, where the property maps are just lvalue accessors */ - typedef typename KeyF::result_type Key; - typedef Item * Handle; - typedef LocationMap<Handle> LocMap; - typedef PriorityMap<Handle> PrioMap; - LocMap locmap; - PrioMap priomap; // note: priomap[item] is set by caller before giving us the item; then tracks best (for canonicalized item) thereafter - - Better better; - //NOT NEEDED: initialize function object state (there is none) - - typedef Item *ItemC; //canonicalized pointer - typedef Item *ItemP; - static const std::size_t heap_arity=4; // might be fastest possible (depends on key size probably - cache locality is bad w/ arity=2) - typedef std::vector<ItemC> HeapStorage; - typedef d_ary_heap_indirect<Handle,heap_arity,LocMap,PrioMap,Better,HeapStorage,agenda_location_t> Heap; - Heap q; - - // please don't call q.push etc. directly. - void add(ItemP i) { - bool fresh=interneq(i); - DBG_AGENDA(assert(fresh && !q.contains(i))); - q.push(i); - } - bool improve(ItemP i) { - ItemP c=i; - bool fresh=interneq(c); - if (fresh) { - add(c); - return true; - } - DBG_AGENDA(assert(q.contains(c))); - return q.maybe_improve(priomap[i]); - } - inline bool empty() { - return q.empty(); - } - // no need to destroy the canon. item because we want to remember the best cost and reject more expensive ways of using it). - ItemC pop() { - ItemC r=q.top(); - q.pop(); - return r; - } - void pop_discard() { - q.pop(); - } - - ItemC top() { - DBG_AGENDA(assert(!empty())); - return q.top(); - } - - agenda_best_t best() const { - return q.best(); //TODO: cache/track the global best? - } - - agenda_best_t second_best() const { - return q.second_best(); - } - - // add only if worse than queue current best, otherwise evaluate immediately (e.g. for early stopping w/ expensive to compute additional cost). 
return true if postponed (added) - bool postpone(ItemP i) { - if (better(priomap[i],best())) return false; - return improve(i); - } - - Agenda(unsigned reserve=1000000,LocMap const& lm=LocMap(),PrioMap const& pm=PrioMap(),EqKey const& eq=EqKey(),Better const& better=Better()) : locmap(lm), priomap(pm), better(better), q(priomap,locmap,better,reserve) { } -}; - -#endif diff --git a/training/atools.cc b/utils/atools.cc index 42579627..c0a91731 100644 --- a/training/atools.cc +++ b/utils/atools.cc @@ -8,7 +8,6 @@ #include <boost/shared_ptr.hpp> #include "filelib.h" -#include "aligner.h" #include "alignment_pharaoh.h" namespace po = boost::program_options; @@ -79,7 +78,7 @@ struct FMeasureCommand : public Command { struct DisplayCommand : public Command { string Name() const { return "display"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D<bool>& in, const Array2D<bool>¬_used, Array2D<bool>* x) { + void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) { *x = in; cout << *x << endl; } @@ -88,7 +87,7 @@ struct DisplayCommand : public Command { struct ConvertCommand : public Command { string Name() const { return "convert"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D<bool>& in, const Array2D<bool>¬_used, Array2D<bool>* x) { + void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) { *x = in; } }; @@ -96,7 +95,7 @@ struct ConvertCommand : public Command { struct InvertCommand : public Command { string Name() const { return "invert"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D<bool>& in, const Array2D<bool>¬_used, Array2D<bool>* x) { + void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) { Array2D<bool>& res = *x; res.resize(in.height(), in.width()); for (int i = 0; i < in.height(); ++i) @@ -268,15 +267,15 @@ map<string, boost::shared_ptr<Command> > commands; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); ostringstream os; - os << "[REQ] Operation to perform:"; + os << "Operation to perform:"; for (map<string, boost::shared_ptr<Command> >::iterator it = commands.begin(); it != commands.end(); ++it) { os << ' ' << it->first; } string cstr = os.str(); opts.add_options() - ("input_1,i", po::value<string>(), "[REQ] Alignment 1 file, - for STDIN") - ("input_2,j", po::value<string>(), "[OPT] Alignment 2 file, - for STDIN") + ("input_1,i", po::value<string>(), "[REQUIRED] Alignment 1 file, - for STDIN") + ("input_2,j", po::value<string>(), "Alignment 2 file, - for STDIN") ("command,c", po::value<string>()->default_value("convert"), cstr.c_str()) ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); diff --git a/utils/batched_append.h b/utils/batched_append.h index fe4a12fc..fe4a12fc 100755..100644 --- a/utils/batched_append.h +++ b/utils/batched_append.h diff --git a/utils/best.h b/utils/best.h deleted file mode 100755 index ed15e0be..00000000 --- a/utils/best.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef UTILS__BEST_H -#define UTILS__BEST_H - -#include "max_plus.h" - -typedef MaxPlus<double> best_t; - -inline bool better(best_t const& a,best_t const& b) { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. -} - -inline bool operator <(best_t const& a,best_t const& b) { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. 
-} -struct BetterP { - inline bool operator ()(best_t const& a,best_t const& b) const { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. - } -}; - -inline void maybe_improve(best_t &a,best_t const& b) { - if (a.v_>b.v_) - a.v_=b.v_; -} - -template <class O> -inline void maybe_improve(best_t &a,O const& b) { - if (a.v_>b.v_) - a.v_=b.v_; -} - -#endif diff --git a/phrasinator/ccrp.h b/utils/ccrp.h index 9acf12ab..4a8b80e7 100644 --- a/phrasinator/ccrp.h +++ b/utils/ccrp.h @@ -11,41 +11,59 @@ #include <boost/functional/hash.hpp> #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (Pitman-Yor parameters) with table tracking. template <typename Dish, typename DishHash = boost::hash<Dish> > class CCRP { public: - CCRP(double disc, double conc) : - num_tables_(), - num_customers_(), - discount_(disc), - concentration_(conc), - discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()), - discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} + CCRP(double disc, double strength) : + num_tables_(), + num_customers_(), + discount_(disc), + strength_(strength), + discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()), + discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), + strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()), + strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { + check_hyperparameters(); + } - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) : - num_tables_(), - num_customers_(), - discount_(d), - concentration_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + num_tables_(), + num_customers_(), + discount_(d), + strength_(c), + discount_prior_strength_(d_strength), + discount_prior_beta_(d_beta), + strength_prior_shape_(c_shape), + strength_prior_rate_(c_rate) { + check_hyperparameters(); + } + + void check_hyperparameters() { + if (discount_ < 0.0 || discount_ >= 1.0) { + std::cerr << "Bad discount: " << discount_ << std::endl; + abort(); + } + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; + abort(); + } + } double discount() const { return discount_; } - double concentration() const { return concentration_; } + double strength() const { return strength_; } + void set_discount(double d) { discount_ = d; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); + return !std::isnan(discount_prior_strength_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_strength_prior() const { + return !std::isnan(strength_prior_shape_); } void clear() { @@ -75,12 +93,13 @@ class CCRP { } // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { + template <typename T> + int increment(const Dish& dish, const T& p0, MT19937* rng) { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const 
double p_empty = (concentration_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + const T p_empty = T(strength_ + num_tables_ * discount_) * p0; + const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } if (share_table) { @@ -144,53 +163,38 @@ class CCRP { } } - double prob(const Dish& dish, const double& p0) const { + template <typename T> + T prob(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const T r = T(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / T(num_customers_ + strength_); } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); + return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / + T(num_customers_ + strength_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; + return log_crp_prob(discount_, strength_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& strength) const { double lp = 0.0; if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -199,8 +203,16 @@ class CCRP { lp += lgamma(*ti - discount) - r; } } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map<Dish, DishLocations, 
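
The templated prob() above computes the Pitman-Yor predictive distribution P(dish) = (c_w - d*t_w + (theta + d*T) * P0(dish)) / (N + theta), where c_w and t_w are the dish's customer and table counts, T and N the totals across dishes, d the discount, and theta the strength. A compact numeric restatement of that formula, separate from the class itself:

    #include <iostream>

    // c_w customers and t_w tables serve the dish; T tables and N customers
    // in total; d = discount, theta = strength, p0 = base-measure probability
    double PYPProb(double c_w, double t_w, double T, double N,
                   double d, double theta, double p0) {
      return (c_w - d * t_w + (theta + d * T) * p0) / (N + theta);
    }

    int main() {
      // 4 customers of one dish at a single table, nothing else seated:
      // (4 - 0.5*1 + (1 + 0.5*1)*0.25) / (4 + 1) = 0.775
      std::cout << PYPProb(4, 1, 1, 4, 0.5, 1.0, 0.25) << std::endl;
    }
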
DishHash>::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } } else { - assert(!"not implemented yet"); + assert(!"discount less than 0 detected!"); } } assert(std::isfinite(lp)); @@ -208,20 +220,23 @@ class CCRP { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_strength_prior()); + if (num_customers() == 0) return; DiscountResampler dr(*this); - ConcentrationResampler cr(*this); + StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(), std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits<double>::min(), + double min_discount = std::numeric_limits<double>::min(); + if (strength_ < 0.0) min_discount -= strength_; + discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); } @@ -229,15 +244,15 @@ class CCRP { DiscountResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.strength_); } }; - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + struct StrengthResampler { + StrengthResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_strength) const { + return crp_.log_crp_prob(crp_.discount_, proposed_strength); } }; @@ -249,7 +264,7 @@ class CCRP { }; void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -274,15 +289,15 @@ class CCRP { std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_; double discount_; - double concentration_; + double strength_; // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; + double discount_prior_strength_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on strength_ (NaN if no prior) + double strength_prior_shape_; + double strength_prior_rate_; }; template <typename T,typename H> diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 
63b6f4c2..6efbfc78 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -11,6 +11,7 @@ #include <boost/functional/hash.hpp> #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (1 parameter) template <typename Dish, typename DishHash = boost::hash<Dish> > @@ -18,20 +19,21 @@ class CCRP_NoTable { public: explicit CCRP_NoTable(double conc) : num_customers_(), - concentration_(conc), - concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} + alpha_(conc), + alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : num_customers_(), - concentration_(c), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_(c), + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} - double concentration() const { return concentration_; } + double alpha() const { return alpha_; } + void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -71,38 +73,31 @@ class CCRP_NoTable { return table_diff; } - double prob(const Dish& dish, const double& p0) const { + template <typename F> + F prob(const Dish& dish, const F& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * concentration_) / (num_customers_ + concentration_); + return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { const unsigned at_table = num_customers(dish); - return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_); } double log_crp_prob() const { - return log_crp_prob(concentration_); - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; + return log_crp_prob(alpha_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& concentration) const { + double log_crp_prob(const double& alpha) const { double lp = 0.0; - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { - lp += lgamma(concentration) - lgamma(concentration + num_customers_) + - custs_.size() * log(concentration); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + custs_.size() * log(alpha); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -114,10 +109,10 @@ class CCRP_NoTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_concentration_prior()); + assert(has_alpha_prior()); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - concentration_ = 
slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); } } @@ -125,13 +120,13 @@ class CCRP_NoTable { struct ConcentrationResampler { ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} const CCRP_NoTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; int cc = 0; for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -153,11 +148,11 @@ class CCRP_NoTable { return custs_.end(); } - double concentration_; + double alpha_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template <typename T,typename H> diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index a868af9a..1fe01b0e 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -21,33 +21,33 @@ class CCRP_OneTable { num_tables_(), num_customers_(), discount_(disc), - concentration_(conc), + alpha_(conc), discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()), discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} + alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {} CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - concentration_(c), + alpha_(c), discount_prior_alpha_(d_alpha), discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} double discount() const { return discount_; } - double concentration() const { return concentration_; } - void set_concentration(double c) { concentration_ = c; } + double alpha() const { return alpha_; } + void set_alpha(double c) { alpha_ = c; } void set_discount(double d) { discount_ = d; } bool has_discount_prior() const { return !std::isnan(discount_prior_alpha_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -108,17 +108,29 @@ class CCRP_OneTable { double prob(const Dish& dish, const double& p0) const { const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_counts_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / (num_customers_ + alpha_); } else { return (it->second - discount_ + r * p0) / - (num_customers_ + concentration_); + (num_customers_ + alpha_); + } + } + + template <typename T> + T probT(const Dish& dish, const T& p0) 
const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + const T r(num_tables_ * discount_ + alpha_); + if (it == dish_counts_.end()) { + return r * p0 / T(num_customers_ + alpha_); + } else { + return (T(it->second - discount_) + r * p0) / + T(num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); + return log_crp_prob(discount_, alpha_); } static double log_beta_density(const double& x, const double& alpha, const double& beta) { @@ -140,19 +152,19 @@ class CCRP_OneTable { // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& alpha) const { double lp = 0.0; if (has_discount_prior()) lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) + - lgamma(alpha / discount); assert(std::isfinite(lp)); for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { @@ -168,12 +180,12 @@ class CCRP_OneTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -181,7 +193,7 @@ class CCRP_OneTable { 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); } @@ -189,20 +201,20 @@ class CCRP_OneTable { DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.alpha_); } }; struct ConcentrationResampler { ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << 
"PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { (*out) << " " << it->first << " = " << it->second << std::endl; } @@ -221,15 +233,15 @@ class CCRP_OneTable { DishMapType dish_counts_; double discount_; - double concentration_; + double alpha_; // optional beta prior on discount_ (NaN if no prior) double discount_prior_alpha_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template <typename T,typename H> diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc new file mode 100644 index 00000000..d17785af --- /dev/null +++ b/utils/corpus_tools.cc @@ -0,0 +1,66 @@ +#include "corpus_tools.h" + +#include <iostream> + +#include "tdict.h" +#include "filelib.h" +#include "verbose.h" + +using namespace std; + +void CorpusTools::ReadFromFile(const string& filename, + vector<vector<WordID> >* src, + set<WordID>* src_vocab, + vector<vector<WordID> >* trg, + set<WordID>* trg_vocab, + int rank, + int size) { + assert(rank >= 0); + assert(size > 0); + assert(rank < size); + if (src) src->clear(); + if (src_vocab) src_vocab->clear(); + if (trg) trg->clear(); + if (trg_vocab) trg_vocab->clear(); + const int expected_fields = 1 + (trg == NULL ? 0 : 1); + if (!SILENT) cerr << "Reading from " << filename << " ...\n"; + ReadFile rf(filename); + istream& in = *rf.stream(); + string line; + int lc = 0; + static const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + while(getline(in, line)) { + const bool skip = (lc % size != rank); + ++lc; + TD::ConvertSentence(line, &tmp); + vector<WordID>* d = NULL; + if (!skip) { + src->push_back(vector<WordID>()); + d = &src->back(); + } + set<WordID>* v = src_vocab; + int s = 0; + for (unsigned i = 0; i < tmp.size(); ++i) { + if (tmp[i] == kDIV) { + ++s; + if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } + assert(trg); + if (!skip) { + trg->push_back(vector<WordID>()); + d = &trg->back(); + } + v = trg_vocab; + } else { + if (d) d->push_back(tmp[i]); + if (v) v->insert(tmp[i]); + } + } + ++s; + if (expected_fields != s) { + cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort(); + } + } +} + + diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h new file mode 100644 index 00000000..97bdaa94 --- /dev/null +++ b/utils/corpus_tools.h @@ -0,0 +1,19 @@ +#ifndef _CORPUS_TOOLS_H_ +#define _CORPUS_TOOLS_H_ + +#include <string> +#include <set> +#include <vector> +#include "wordid.h" + +struct CorpusTools { + static void ReadFromFile(const std::string& filename, + std::vector<std::vector<WordID> >* src, + std::set<WordID>* src_vocab = NULL, + std::vector<std::vector<WordID> >* trg = NULL, + std::set<WordID>* trg_vocab = NULL, + int rank = 0, + int size = 1); +}; + +#endif diff --git a/gi/clda/src/crp_test.cc b/utils/crp_test.cc index 561cd4dd..561cd4dd 100644 --- a/gi/clda/src/crp_test.cc +++ b/utils/crp_test.cc diff --git a/utils/d_ary_heap.h b/utils/d_ary_heap.h deleted file mode 100644 index 1270638a..00000000 --- a/utils/d_ary_heap.h +++ /dev/null @@ -1,568 +0,0 @@ -#ifndef D_ARY_HEAP_H -#define D_ARY_HEAP_H - -#include "show.h" -#define DDARY(x) - -#define D_ARY_PUSH_GRAEHL 0 // untested -#define D_ARY_POP_GRAEHL 0 // 
untested -#define D_ARY_DOWN_GRAEHL 0 // untested -#define D_ARY_UP_GRAEHL 0 // untested -#define D_ARY_APPEND_ALWAYS_PUSH 1 // heapify (0) is untested. otherwise switch between push and heapify depending on size (cache effects, existing items vs. # appended ones) - -#define D_ARY_TRACK_OUT_OF_HEAP 0 // shouldn't need to track, because in contains() false positives looking up stale or random loc map values are impossible - we just check key. note: if you enable this, you must init location to D_ARY_HEAP_NULL_INDEX yourself until it's been added or popped -#define D_ARY_VERIFY_HEAP 1 -// This is a very expensive test so it should be disabled even when NDEBUG is not defined - -# undef D_ARY_HEAP_NULL_INDEX -# define D_ARY_HEAP_NULL_INDEX (-1) // you may init location to this. - -/* adapted from boost/graph/detail/d_ary_heap.hpp - - local modifications: - - clear, heapify, append range/container, Size type template arg, reserve constructor arg - - hole+move rather than swap. note: swap would be more efficient for heavyweight keys, until move ctors exist - - don't set locmap to -1 when removing from heap (waste of time) - - // unlike arity=2 case, you don't gain anything by having indices start at 1, with 0-based child indices - // root @1, A=2, children indices m={0,1}: parent(i)=i/2, child(i,m)=2*i+m - // root @0: parent(i)=(i-1)/A child(i,n)=i*A+n+1 - can't improve on this except child(i,m)=i*A+m - (integer division, a/b=floor(a/b), so (i-1)/A = ceil(i/A)-1, or greatest int less than (i/A)) - - actually, no need to adjust child index, since child is called only once and inline - - e.g. for A=3 gorn address in tree -> index - - () = root -> 0 - (1) -> 1 - (2) -> 2 - (3) (A) -> 3 - (1,1) -> (1*A+1) = 4 - (1,2) -> (1*A+2) = 5 - (1,3) -> (1*A+3) = 6 - (2,1) -> (2*A+1) = 7 - etc. - -//TODO: block-align siblings! assume data[0] is 16 or 32-byte aligned ... then we want root @ index (blocksize-1). see http://www.lamarca.org/anthony/pubs/heaps.pdf pg8. for pow2(e.g. 4)-ary heap, it may be reasonable to use root @index A-1. however, suppose the key size is not padded to a power of 2 (e.g. 12 bytes), then we would need internal gaps at times. would want to use compile const template based inlineable alignment math for this? possibly use a container like vector that lets you specify padding relative to some address multiple for v[0]. - - optimal D: see http://www.lamarca.org/anthony/pubs/heaps.pdf pg 9. depedns on relative cost of swap,compare, but in all cases except swap=free, 2 is worse than 3-4. for expensive swap (3x compare), 4 still as good as 5. so just use 4. boost benchmarking djikstra agrees; 4 is best. - - cache-aligned 4-heap speedup over regular 2-heap is 10-80% (for huge heaps, the speedup is more) - - splay/skew heaps are worse than 2heap or aligned 4heap in practice. - - //TODO: switch from heapify (Floyd's method) to repeated push past some size limit (in bytes) due to cache effect - - #define D_ARY_BYTES_OUT_OF_CACHE 0x1000000 - - //TODO: assuming locmap is an lvalue pmap, we can be more efficient. on the other hand, if it's an intrusive property map to an interned mutable object, there's no difference in performance, and that's what i'm going to do in my first uses. plus, if keys are indices and the map is a vector, it's barely any overhead. - - */ - -// -//======================================================================= -// Copyright 2009 Trustees of Indiana University -// Authors: Jeremiah J. 
Willcock, Andrew Lumsdaine -// -// Distributed under the Boost Software License, Version 1.0. (See -// accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -//======================================================================= -// - -#include <vector> -#include <cstddef> -#include <algorithm> -#include <utility> -#include <cassert> -#include <boost/static_assert.hpp> -#include <boost/shared_array.hpp> -#include <boost/property_map/property_map.hpp> - - - // D-ary heap using an indirect compare operator (use identity_property_map - // as DistanceMap to get a direct compare operator). This heap appears to be - // commonly used for Dijkstra's algorithm for its good practical performance - // on some platforms; asymptotically, it's not optimal; it has an O(lg N) decrease-key - // operation, which is (amortized) constant time on a relaxed heap or fibonacci heap. The - // implementation is mostly based on the binary heap page on Wikipedia and - // online sources that state that the operations are the same for d-ary - // heaps. This code is not based on the old Boost d-ary heap code. - // - // - d_ary_heap_indirect is a model of UpdatableQueue as is needed for - // dijkstra_shortest_paths. - // - // - Value must model Assignable. - // - Arity must be at least 2 (optimal value appears to be 4, both in my and - // third-party experiments). - // - IndexInHeapMap must be a ReadWritePropertyMap from Value to - // Container::size_type (to store the index of each stored value within the - // heap for decrease-key aka update). - // - DistanceMap must be a ReadablePropertyMap from Value to something - // (typedef'ed as distance_type). - // - Compare must be a BinaryPredicate used as a less-than operator on - // distance_type. - // - Container must be a random-access, contiguous container (in practice, - // the operations used probably require that it is std::vector<Value>). - // - template <typename Value, - std::size_t Arity, - typename IndexInHeapPropertyMap, - typename DistanceMap, - typename Better = std::less<Value>, - typename Container = std::vector<Value>, - typename Size = typename Container::size_type, - typename Equal = std::equal_to<Value> > - class d_ary_heap_indirect { - BOOST_STATIC_ASSERT (Arity >= 2); - public: - typedef Container container_type; - typedef Size size_type; - typedef Value value_type; - typedef typename Container::const_iterator const_iterator; - typedef const_iterator iterator; - // The distances being compared using better and that are stored in the - // distance map - typedef typename boost::property_traits<DistanceMap>::value_type distance_type; - d_ary_heap_indirect(DistanceMap const& distance, - IndexInHeapPropertyMap const& index_in_heap, - const Better& better = Better(), - size_type container_reserve = 100000, - Equal const& equal = Equal() - ) - : better(better), data(), distance(distance), - index_in_heap(index_in_heap),equal(equal) { - data.reserve(container_reserve); - } - /* Implicit copy constructor */ - /* Implicit assignment operator */ - - template <class C> - void append_heapify(C const& c) { - data.reserve(data.size()+c.size()); - append_heapify(c.begin(),c.end()); - } - - template <class I> - void append_heapify(I begin,I end) { - data.insert(data.end(),begin,end); - heapify(); - } - - template <class C> - void append_push(C const& c) { - data.reserve(data.size()+c.size()); - append_push(c.begin(),c.end()); - } - - // past some threshold, this should be faster than append_heapify. 
also, if there are many existing elements it will be faster. - template <class I> - void append_push(I begin,I end) { - for (;begin!=end;++begin) - push(*begin); - } - - template <class C> - void append(C const& c) { - if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=c.size()/2) - append_push(c); - else - append_heapify(c); - } - - // past some threshold, this should be faster than append_heapify. also, if there are many existing elements it will be faster. - template <class I> - void append(I begin,I end) { - if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=0x10000) - append_push(begin,end); - else - append_heapify(begin,end); - } - - // could allow mutation of data directly, e.g. push_back 1 at a time - but then they could forget to heapify() - - //from bottom of heap tree up, turn that subtree into a heap by adjusting the root down - // for n=size, array elements indexed by floor(n/2) + 1, floor(n/2) + 2, ... , n are all leaves for the tree, thus each is an one-element heap already - // warning: this is many fewer instructions but, at some point (when heap doesn't fit in Lx cache) it will become slower than repeated push(). - void heapify() { - for (size_type i=parent(data.size()-1);i>0;--i) // starting from parent of last node, ending at first child of root (i==1) - preserve_heap_property_down(i); - } - - void reserve(size_type s) { - data.reserve(s); - } - - size_type size() const { - return data.size(); - } - - bool empty() const { - return data.empty(); - } - - const_iterator begin() const { - return data.begin(); - } - - const_iterator end() const { - return data.end(); - } - - void clear() { -#if D_ARY_TRACK_OUT_OF_HEAP - using boost::put; - for (typename Container::iterator i=data.begin(),e=data.end();i!=e;++i) - put(index_in_heap,*i,(size_type)D_ARY_HEAP_NULL_INDEX); -#endif - data.clear(); - } - - void push(const Value& v) { - if (D_ARY_PUSH_GRAEHL) { - size_type i = data.size(); - data.push_back(Value()); // (hoping default construct is cheap, construct-copy inline) - preserve_heap_property_up(v,i); // we don't have to recopy v, or init index_in_heap - } else { - size_type index = data.size(); - data.push_back(v); - using boost::put; - put(index_in_heap, v, index); - preserve_heap_property_up(index); - } - verify_heap(); - } - - Value& top() { - return data[0]; - } - - const Value& top() const { - return data[0]; - } - - void pop() { - using boost::put; - if(D_ARY_TRACK_OUT_OF_HEAP) - put(index_in_heap, data[0], (size_type)D_ARY_HEAP_NULL_INDEX); - if (data.size() != 1) { - if (D_ARY_POP_GRAEHL) { - preserve_heap_property_down(data.back(),0,data.size()-1); - data.pop_back(); - } else { - data[0] = data.back(); - put(index_in_heap, data[0], 0); - data.pop_back(); - preserve_heap_property_down(); - } - verify_heap(); - } else { - data.pop_back(); - } - } - - // This function assumes the key has been improved - // (distance has become smaller, so it may need to rise toward top(). - // i.e. decrease-key in a min-heap - void update(const Value& v) { - using boost::get; - size_type index = get(index_in_heap, v); - preserve_heap_property_up(v,index); - verify_heap(); - } - - // return true if improved. - bool maybe_improve(const Value& v,distance_type dbetter) { - using boost::get; - if (better(dbetter,get(distance,v))) { - preserve_heap_property_up_dist(v,dbetter); - return true; - } - return false; - } - - distance_type best(distance_type null=0) const { - return empty() ? 
null : get(distance,data[0]); - } - distance_type second_best(distance_type null=0) const { - if (data.size()<2) return null; - int m=std::min(data.size(),Arity+1); -// if (m>=Arity) m=Arity+1; - distance_type b=get(distance,data[1]); - for (int i=2;i<m;++i) { - distance_type d=get(distance,data[i]); - if (better(d,b)) - b=d; - } - return b; - } - - -#include "warning_push.h" -#pragma GCC diagnostic ignored "-Wtype-limits" - // because maybe size_type is signed or unsigned - inline bool contains(const Value &v,size_type i) const { - if (D_ARY_TRACK_OUT_OF_HEAP) - return i != (size_type)D_ARY_HEAP_NULL_INDEX; - size_type sz=data.size(); - SHOWM2(DDARY,"d_ary_heap contains",i,data.size()); - return i>=0 && i<sz && equal(v,data[i]); // note: size_type may be signed (don't recommend it, though) - thus i>=0 check to catch uninit. data - } -#include "warning_pop.h" - - inline bool contains(const Value& v) const { - using boost::get; - return contains(v,get(index_in_heap, v)); - } - - void push_or_update(const Value& v) { /* insert if not present, else update */ - using boost::get; - size_type index = get(index_in_heap, v); - if (D_ARY_PUSH_GRAEHL) { - if (contains(v,index)) - preserve_heap_property_up(v,index); - else - push(v); - } else { - if (!contains(v,index)) { - index = data.size(); - data.push_back(v); - using boost::put; - put(index_in_heap, v, index); - } - preserve_heap_property_up(index); - } - verify_heap(); - } - - private: - Better better; - Container data; - DistanceMap distance; - IndexInHeapPropertyMap index_in_heap; - Equal equal; - - // Get the parent of a given node in the heap - static inline size_type parent(size_type index) { - return (index - 1) / Arity; - } - - // Get the child_idx'th child of a given node; 0 <= child_idx < Arity - static inline size_type child(size_type index, std::size_t child_idx) { - return index * Arity + child_idx + 1; - } - - // Swap two elements in the heap by index, updating index_in_heap - inline void swap_heap_elements(size_type index_a, size_type index_b) { - using std::swap; - Value value_a = data[index_a]; - Value value_b = data[index_b]; - data[index_a] = value_b; - data[index_b] = value_a; - using boost::put; - put(index_in_heap, value_a, index_b); - put(index_in_heap, value_b, index_a); - } - - inline void move_heap_element(Value const& v,size_type ito) { - using boost::put; - put(index_in_heap,v,ito); - data[ito]=v; //todo: move assign? - } - - // Verify that the array forms a heap; commented out by default - void verify_heap() const { - // This is a very expensive test so it should be disabled even when - // NDEBUG is not defined -#if D_ARY_VERIFY_HEAP - using boost::get; - for (size_t i = 1; i < data.size(); ++i) { - if (better(get(distance,data[i]), get(distance,data[parent(i)]))) { - assert (!"Element is smaller than its parent"); - } - } -#endif - } - - // we have a copy of the key, so we don't need to do that stupid find # of levels to move then move. 
we act as though data[index]=currently_being_moved, but in fact it's an uninitialized "hole", which we fill at the very end - inline void preserve_heap_property_up(Value const& currently_being_moved,size_type index) { - using boost::get; - preserve_heap_property_up(currently_being_moved,index,get(distance,currently_being_moved)); - } - - inline void preserve_heap_property_up_set_dist(Value const& currently_being_moved,distance_type dbetter) { - using boost::get; - using boost::put; - put(distance,currently_being_moved,dbetter); - preserve_heap_property_up(currently_being_moved,get(index_in_heap,currently_being_moved),dbetter); - verify_heap(); - } - - void preserve_heap_property_up(Value const& currently_being_moved,size_type index,distance_type currently_being_moved_dist) { - using boost::put; - using boost::get; - if (D_ARY_UP_GRAEHL) { - for (;;) { - if (index == 0) break; // Stop at root - size_type parent_index = parent(index); - Value const& parent_value = data[parent_index]; - if (better(currently_being_moved_dist, get(distance, parent_value))) { - move_heap_element(parent_value,index); - index = parent_index; - } else { - break; // Heap property satisfied - } - } - //finish "swap chain" by filling hole w/ currently_being_moved - move_heap_element(currently_being_moved,index); // note: it's ok not to return early on index==0 at start, even if self-assignment isn't supported by Value - because currently_being_moved is a copy. - } else { - put(index_in_heap,currently_being_moved,index); - put(distance,currently_being_moved,currently_being_moved_dist); - preserve_heap_property_up(index); - } - } - - // Starting at a node, move up the tree swapping elements to preserve the - // heap property. doesn't actually use swap; uses hole - void preserve_heap_property_up(size_type index) { - using boost::get; - if (index == 0) return; // Do nothing on root - if (D_ARY_UP_GRAEHL) { - Value copyi=data[index]; - preserve_heap_property_up(copyi,index); - return; - } - size_type orig_index = index; - size_type num_levels_moved = 0; - // The first loop just saves swaps that need to be done in order to avoid - // aliasing issues in its search; there is a second loop that does the - // necessary swap operations - Value currently_being_moved = data[index]; - distance_type currently_being_moved_dist = - get(distance, currently_being_moved); - for (;;) { - if (index == 0) break; // Stop at root - size_type parent_index = parent(index); - Value parent_value = data[parent_index]; - if (better(currently_being_moved_dist, get(distance, parent_value))) { - ++num_levels_moved; - index = parent_index; - continue; - } else { - break; // Heap property satisfied - } - } - // Actually do the moves -- move num_levels_moved elements down in the - // tree, then put currently_being_moved at the top - index = orig_index; - using boost::put; - for (size_type i = 0; i < num_levels_moved; ++i) { - size_type parent_index = parent(index); - Value parent_value = data[parent_index]; - put(index_in_heap, parent_value, index); - data[index] = parent_value; - index = parent_index; - } - data[index] = currently_being_moved; - put(index_in_heap, currently_being_moved, index); - verify_heap(); - } - - - // From the root, swap elements (each one with its smallest child) if there - // are any parent-child pairs that violate the heap property. v is placed at data[i], but then pushed down (note: data[i] won't be read explicitly; it will instead be overwritten by percolation). 
this also means that v must be a copy of data[i] if it was already at i. - // e.g. v=data.back(), i=0, sz=data.size()-1 for pop(), implicitly swapping data[i], data.back(), and doing data.pop_back(), then adjusting from 0 down w/ swaps. updates index_in_heap for v. - inline void preserve_heap_property_down(Value const& currently_being_moved,size_type i,size_type heap_size) { - using boost::get; - distance_type currently_being_moved_dist=get(distance,currently_being_moved); - Value* data_ptr = &data[0]; - size_type index = 0; // hole at index - currently_being_moved to be put here when we find the final hole spot - for (;;) { - size_type first_child_index = child(index, 0); - if (first_child_index >= heap_size) break; /* No children */ - Value* child_base_ptr = data_ptr + first_child_index; // using index of first_child_index+smallest_child_index because we hope optimizer will be smart enough to const-unroll a loop below if we do this. i think the optimizer would have gotten it even without our help (i.e. store root-relative index) - - // begin find best child index/distance - size_type smallest_child_index = 0; // don't add to base first_child_index every time we update which is smallest. - distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]); -#undef D_ARY_MAYBE_IMPROVE_CHILD_I -#define D_ARY_MAYBE_IMPROVE_CHILD_I \ - distance_type i_dist = get(distance, child_base_ptr[i]); \ - if (better(i_dist, smallest_child_dist)) { \ - smallest_child_index = i; \ - smallest_child_dist = i_dist; \ - } - if (first_child_index + Arity <= heap_size) { - // avoid repeated heap_size boundcheck (should test if this is really a speedup - instruction cache tradeoff - could use upperbound = min(Arity,heap_size-first_child_index) instead. but this optimizes to a fixed number of iterations (compile time known) so probably worth it - for (size_t i = 1; i < Arity; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } else { - for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } - //end: know best child - - if (better(smallest_child_dist, currently_being_moved_dist)) { - // instead of swapping, move. - move_heap_element(child_base_ptr[smallest_child_index],index); // move up - index=first_child_index+smallest_child_index; // descend - hole is now here - } else { - move_heap_element(currently_being_moved,index); // finish "swap chain" by filling hole - break; - } - } - verify_heap(); - } - - inline void preserve_heap_property_down(size_type i) { - preserve_heap_property_down(data[i],i,data.size()); - } - - void preserve_heap_property_down() { - using boost::get; - if (data.empty()) return; - if (D_ARY_DOWN_GRAEHL) { // this *should* be more efficient because i avoid swaps. - Value copy0=data[0]; - preserve_heap_property_down(copy0,0,data.size()); - return; - } - size_type index = 0; - Value currently_being_moved = data[0]; - distance_type currently_being_moved_dist = - get(distance, currently_being_moved); - size_type heap_size = data.size(); - Value* data_ptr = &data[0]; - for (;;) { - size_type first_child_index = child(index, 0); - if (first_child_index >= heap_size) break; /* No children */ - Value* child_base_ptr = data_ptr + first_child_index; - size_type smallest_child_index = 0; - distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]); - if (first_child_index + Arity <= heap_size) { - for (size_t i = 1; i < Arity; ++i) { // can be unrolled completely. 
- - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } else { - for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } - if (better(smallest_child_dist, currently_being_moved_dist)) { - swap_heap_elements(smallest_child_index + first_child_index, index); - index = smallest_child_index + first_child_index; - continue; - } else { - break; // Heap property satisfied - } - } - verify_heap(); - } - - }; - -#endif diff --git a/utils/fast_lexical_cast.hpp b/utils/fast_lexical_cast.hpp index ae49c934..ae49c934 100755..100644 --- a/utils/fast_lexical_cast.hpp +++ b/utils/fast_lexical_cast.hpp diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 8fe6cb3d..2c49948c 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -178,6 +178,12 @@ class FastSparseVector { T l2norm() const { return sqrt(l2norm_sq()); } + T pnorm(const double p) const { + T sum = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + sum += pow(fabs(it->second), p); + return pow(sum, 1.0 / p); + } // if values are binary, gives |A intersect B|/|A union B| template<typename S> S tanimoto_coef(const FastSparseVector<S> &vec) const { @@ -373,7 +379,7 @@ class FastSparseVector { } ar & eff_size; while (it != this->end()) { - const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second); + const std::pair<std::string, T> wire_pair(FD::Convert(it->first), it->second); ar & wire_pair; ++it; } diff --git a/utils/fdict.h b/utils/fdict.h index f0871b9a..0a2a9456 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -10,7 +10,7 @@ #ifdef HAVE_CMPH #include "perfect_hash.h" -#include "string_to.h" +#include <sstream> #endif struct FD { @@ -49,7 +49,9 @@ struct FD { #ifdef HAVE_CMPH if (hash_) { static std::string tls; - tls = to_string(w); + std::ostringstream os; + os << w; + tls = os.str(); return tls; } #endif diff --git a/utils/feature_vector.h b/utils/feature_vector.h index a7b61a66..a7b61a66 100755..100644 --- a/utils/feature_vector.h +++ b/utils/feature_vector.h diff --git a/utils/ftoa.h b/utils/ftoa.h deleted file mode 100755 index 3dba528d..00000000 --- a/utils/ftoa.h +++ /dev/null @@ -1,403 +0,0 @@ -#ifndef FTOA_H -#define FTOA_H - - -//TODO: for fractional digits/non-sci, determine the right amount of left padding (more if the whole number is indeed <1, to keep the significant digits), less if sci notation and/or mantissa has sig. digits (don't want N before . and N after!) - -#ifndef FTOA_ROUNDTRIP -# define FTOA_ROUNDTRIP 1 -#endif - -#ifndef FTOA_DEBUG -# define FTOA_DEBUG 0 -#endif - -#ifndef FTOA_USE_SPRINTF -#define FTOA_USE_SPRINTF 0 -#endif - -#if FTOA_DEBUG -# define FTOAassert(x) assert(x) -# define DBFTOA(x) std::cerr<<"\nFTOA " <<__func__<<"("<<__LINE__<<"): " #x "="<<x<<"\n" -# define DBFTOA2(x0,x1) std::cerr<<"\nFTOA " <<__func__<<"("<<__LINE__<<"): " #x0 "="<<x0<<" " #x1 "="<<x1 <<"\n" -#else -# define FTOAassert(x) -# define DBFTOA(x) -# define DBFTOA2(x0,x1) -#endif - -/* DECIMAL_FOR_WHOLE ; ftos(123) - 0 ; 123 - 1 ; 123 - 2 ; 123. 
- ; ftos(0) is always just "0" (not "0.0") - ; ftos(.01) - 0 ; .01 - 1 ; 0.01 - 2 ; 0.01 - -*/ - -#ifndef DECIMAL_FOR_WHOLE -# define DECIMAL_FOR_WHOLE 1 -#endif - -#include <limits> -#include <stdint.h> -#include <iostream> -#include <cmath> -#include <assert.h> -#include <cstdio> -#include "utoa.h" -#include "nan.h" - -template <class Float> -struct ftoa_traits { -}; - -//eP10, -// sigd decimal places normally printed, roundtripd needed so that round-trip float->string->float is identity - -#define DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,used,P10) \ -template <> \ -struct ftoa_traits<FLOATT> { \ - typedef INTT int_t; \ - typedef u ## INTT uint_t; \ - typedef FLOATT float_t; \ - enum { digits10=std::numeric_limits<INTT>::digits10, chars_block=P10, usedig=used, sigdig=sigd, roundtripdig=roundtripd, bufsize=roundtripdig+7 }; \ - static const double pow10_block = 1e ## P10; \ - static const float_t small_f = small; \ - static const float_t large_f = large; \ - static inline int sprintf(char *buf,double f) { return std::sprintf(buf,"%." #used "g",f); } \ - static inline int sprintf_sci(char *buf,double f) { return std::sprintf(buf,"%." #used "e",f); } \ - static inline int sprintf_nonsci(char *buf,double f) { return std::sprintf(buf,"%." #used "f",f); } \ - static inline uint_t fracblock(double frac) { FTOAassert(frac>=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)f;FTOAassert(i<pow10_block);return i; } \ - static inline uint_t rounded_fracblock(double frac) { FTOAassert(frac>=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)(f+.5);FTOAassert(i<pow10_block);return i; } \ - static inline float_t mantexp10(float_t f,int &exp) { float_t e=std::log10(f); float_t ef=std::floor(e); exp=ef; return f/std::pow((float_t)10,ef); } \ - static inline bool use_sci_abs(float_t fa) { return fa<small || fa>large; } \ - static inline bool use_sci(float_t f) { return use_sci_abs(std::fabs(f)); } \ -}; -//TODO: decide on computations in double (would hurt long double) or in native float type - any advantage? more precision is usually better. - -//10^22 = 0x1.0f0cf064dd592p73 is the largest exactly representable power of 10 in the binary64 format. but round down to 18 so int64_t can hold it. - -#if FTOA_ROUNDTRIP -#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,roundtripd,roundtripd) -#else -#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,sigd,sigd) -#endif - -DEFINE_FTOA_TRAITS_ROUNDTRIP(double,int64_t,15,17,1e-5,1e8) -//i've heard that 1e10 is fine for float. but we only have 1e9 (9 decimal places) in int32. -DEFINE_FTOA_TRAITS_ROUNDTRIP(float,int32_t,6,9,1e-3,1e8) - - -template <class F> -inline void ftoa_error(F f,char const* msg="") { - using namespace std; - cerr<<"ftoa error: "<<msg<<" f="<<f<<endl; - assert(!"ftoa error"); -} - -// all of the below prepend and return new cursor. null terminate yourself (like itoa/utoa) - -//possibly empty string for ~0 (no sci notation fallback). left padded with the right number of 0s (tricky). [ret,p) are the digits. -template <class F> -char *prepend_pos_frac_digits(char *p,F f) { - FTOAassert(f<1 && f >0); - typedef ftoa_traits<F> FT; - //repeat if very small??? nah, require sci notation to take care of it. 
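
The digit-block conversion used throughout this header reduces fraction printing to one multiply and one integer cast. A minimal standalone sketch of the same idea (my own names, fixed 9-digit block; not part of the header):

#include <cstdint>
#include <cstdio>

int main() {
  // scale the fraction into an integer "block" of 9 digits, rounded --
  // the same idea as rounded_fracblock with pow10_block = 1e9
  double f = 0.125;
  uint64_t block = (uint64_t)(f * 1e9 + 0.5);
  std::printf("0.%09llu\n", (unsigned long long)block);  // prints 0.125000000
  return 0;
}
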
- typename FT::uint_t i=FT::rounded_fracblock(f); - DBFTOA2(f,i); - if (i>0) { - unsigned n_skipped; - char *d=utoa_drop_trailing_0(p,i,n_skipped); - char *b=p-FT::chars_block+n_skipped; - FTOAassert(b<=d); - left_pad(b,d,'0'); - return b; - } else { - return p; - } -} - -template <class F> -char *append_pos_frac_digits(char *p,F f) { // '0' right-padded, nul terminated, return position of nul. [p,ret) are the digits - if (f==0) { - *p++='0'; - return p; - } - FTOAassert(f<1 && f >0); - typedef ftoa_traits<F> FT; - //repeat if very small??? nah, require sci notation to take care of it. - typename FT::uint_t i=FT::rounded_fracblock(f); - DBFTOA2(f,i); - if (i>0) { - char *e=p+FT::chars_block; - utoa_left_pad(p,e,i,'0'); - *e=0; - return e; - } else { - *p=0; - return p; - } -} - -template <class F> -inline char *prepend_pos_frac(char *p,F f) { - FTOAassert(f<1 && f>=0); - if (f==0) { - *--p='0'; - return p; - } - p=prepend_pos_frac_digits(p,f); - *--p='.'; - if (DECIMAL_FOR_WHOLE>0) - *--p='0'; - return p; -} - -template <class F> -inline char *append_pos_frac(char *p,F f) { - DBFTOA(f); - if (DECIMAL_FOR_WHOLE>0) - *p++='0'; - *p++='.'; - return append_pos_frac_digits(p,f); -} - -template <class F> -inline char *prepend_frac(char *p,F f,bool positive_sign=false) { - FTOAassert(f<1 && f>-1); - if (f==0) - *--p='0'; - else if (f<0) { - p=prepend_pos_frac(p,-f); - *--p='-'; - } else { - p=prepend_pos_frac(p,f); - if (positive_sign) - *--p='+'; - } - return p; -} - - -template <class F> -inline char *append_sign(char *p,F f,bool positive_sign=false) { - if (f<0) { - *p++='-'; - } else if (positive_sign) - *p++='+'; - return p; -} - -template <class F> -inline char *append_frac(char *p,F f,bool positive_sign=false) { - FTOAassert(f<1 && f>-1); - if (f==0) { - *p++='0'; - return p; - } else if (f<0) { - *p++='-'; - return append_pos_frac(p,-f); - } - if (positive_sign) { - *p++='+'; - return append_pos_frac(p,f); - } - -} - - -//append_frac, append_pos_sci, append_sci. notice these are all composed according to a pattern (but reversing order of composition in pre vs app). or can implement with copy through buffer - -/* will switch to sci notation if integer part is too big for the int type. but for very small values, will simply display 0 (i.e. //TODO: find out log10 and leftpad 0s then convert rest) */ -template <class F> -char *prepend_pos_nonsci(char *p,F f) { - typedef ftoa_traits<F> FT; - typedef typename FT::uint_t uint_t; - DBFTOA(f); - FTOAassert(f>0); - if (f>std::numeric_limits<uint_t>::max()) - return prepend_pos_sci(p,f); - //which is faster - modf is weird and returns negative frac part if f is negative. while we could deal with this using fabs, we instead only handle positive here (put - sign in front and negate, then call us) - ? 
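
Either split yields the same fraction for the positive inputs handled here; a tiny standalone check of the two alternatives weighed in the comment above (illustrative only):

#include <cmath>
#include <cstdio>

int main() {
  double f = 1234.5678;
  double ip;
  double frac1 = std::modf(f, &ip);              // library split
  unsigned long long u = (unsigned long long)f;
  double frac2 = f - u;                          // cast split (the live branch)
  std::printf("%g %g\n", frac1, frac2);          // both print 0.5678
  return 0;
}
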
-#if 0 - F intpart; - F frac=std::modf(f,&intpart); - uint_t u=intpart; -#else - uint_t u=f; - F frac=f-u; -#endif - DBFTOA2(u,frac); - if (frac == 0) { - if (DECIMAL_FOR_WHOLE>1) - *--p='.'; - } else { - p=prepend_pos_frac_digits(p,frac); - *--p='.'; - } - if (u==0) { - if (DECIMAL_FOR_WHOLE>0) - *--p='0'; - } else - p=utoa(p,u); - return p; -} - -// modify p; return true if handled -template <class F> -inline bool prepend_0_etc(char *&p,F f,bool positive_sign=false) { - if (f==0) { - *--p='0'; - return true; - } - if (is_nan(f)) { - p-=3; - p[0]='N';p[1]='A';p[2]='N'; - return true; - } - if (is_pos_inf(f)) { - p-=3; - p[0]='I';p[1]='N';p[2]='F'; - if (positive_sign) - *--p='+'; - return true; - } - if (is_neg_inf(f)) { - p-=4; - p[0]='-';p[1]='I';p[2]='N';p[3]='F'; - return true; - } - return false; -} - -template <class F> -inline char *prepend_nonsci(char *p,F f,bool positive_sign=false) { - if (prepend_0_etc(p,f,positive_sign)) return p; - if (f<0) { - p=prepend_pos_nonsci(p,-f); - *--p='-'; - } else { - p=prepend_pos_nonsci(p,f); - if (positive_sign) - *--p='+'; - } - return p; -} - -template <class F> -inline char *prepend_pos_sci(char *p,F f,bool positive_sign_exp=false) { - FTOAassert(f>0); - typedef ftoa_traits<F> FT; - int e10; - F mant=FT::mantexp10(f,e10); - DBFTOA(f); - DBFTOA2(mant,e10); - FTOAassert(mant<10.00001); - if (mant>=10.) { - ++e10; - mant*=.1; - } else if (mant < 1.) { - --e10; - mant*=10; - } - p=itoa(p,e10,positive_sign_exp); - *--p='e'; - return prepend_pos_nonsci(p,mant); -} - -template <class F> -inline char *prepend_sci(char *p,F f,bool positive_sign_mant=false,bool positive_sign_exp=false) { - if (prepend_0_etc(p,f,positive_sign_mant)) return p; - if (f==0) - *--p='0'; - else if (f<0) { - p=prepend_pos_sci(p,-f,positive_sign_exp); - *--p='-'; - } else { - p=prepend_pos_sci(p,f,positive_sign_exp); - if (positive_sign_mant) - *--p='+'; - } - return p; -} - -template <class F> -inline char *append_nonsci(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits<F>::sprintf_nonsci(p,f); -} - -template <class F> -inline char *append_sci(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits<F>::sprintf_sci(p,f); -} - -template <class F> -inline char *append_ftoa(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits<F>::sprintf(p,f); -} - -template <class F> -inline char *prepend_ftoa(char *p,F f) -{ - typedef ftoa_traits<F> FT; - return FT::use_sci(f) ? prepend_sci(p,f) : prepend_nonsci(p,f); -} - -template <class F> -inline std::string ftos_append(F f) { - typedef ftoa_traits<F> FT; - char buf[FT::bufsize]; - return std::string(buf,append_ftoa(buf,f)); -} - -template <class F> -inline std::string ftos_prepend(F f) { - typedef ftoa_traits<F> FT; - char buf[FT::bufsize]; - char *end=buf+FT::bufsize; - return std::string(prepend_ftoa(end,f),end); -} - - -template <class F> -inline std::string ftos(F f) { -#if 0 - // trust RVO? no extra copies? - return FTOA_USE_SPRINTF ? ftos_append(f) : ftos_prepend(f); -#else - typedef ftoa_traits<F> FT; - char buf[FT::bufsize]; - if (FTOA_USE_SPRINTF) { - return std::string(buf,append_ftoa(buf,f)); - } else { - char *end=buf+FT::bufsize; - return std::string(prepend_ftoa(end,f),end); - } -#endif -} - -namespace { - const int ftoa_bufsize=30; - char ftoa_outbuf[ftoa_bufsize]; -} - -// not even THREADLOCAL - don't use. 
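
The warning is warranted: ftoa_outbuf is one process-wide buffer, so concurrent callers (or even two live uses in one expression) clobber each other's results. A hedged sketch of a safer alternative using a C++11 thread_local buffer and the printf fallback (ftoa_threadsafe is my name, not part of this header):

#include <cstdio>

inline const char* ftoa_threadsafe(float f) {
  static thread_local char buf[32];  // one buffer per thread, not per process
  std::snprintf(buf, sizeof(buf), "%.9g", f);  // 9 digits round-trips a float
  return buf;
}
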
-inline char *static_ftoa(float f) -{ - if (FTOA_USE_SPRINTF) { - append_ftoa(ftoa_outbuf,f); - return ftoa_outbuf; - } else { - char *end=ftoa_outbuf+ftoa_bufsize; - return prepend_ftoa(end,f); - } -} - - -#endif diff --git a/utils/hash.h b/utils/hash.h index 2290bc34..2290bc34 100755..100644 --- a/utils/hash.h +++ b/utils/hash.h diff --git a/utils/have_64_bits.h b/utils/have_64_bits.h index d1e6064f..d1e6064f 100755..100644 --- a/utils/have_64_bits.h +++ b/utils/have_64_bits.h diff --git a/utils/indices_after.h b/utils/indices_after.h index 62683f39..62683f39 100755..100644 --- a/utils/indices_after.h +++ b/utils/indices_after.h diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h deleted file mode 100755 index 4b6a9e4a..00000000 --- a/utils/int_or_pointer.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INT_OR_POINTER_H -#define INT_OR_POINTER_H - -// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. - -// check lsb for expected tag? -#ifndef IOP_CHECK_LSB -# define IOP_CHECK_LSB 1 -#endif -#if IOP_CHECK_LSB -# define iop_assert(x) assert(x) -#else -# define iop_assert(x) -#endif - -#include <assert.h> -#include <iostream> - -template <class Pointed=void,class Int=size_t> -struct IntOrPointer { - typedef Pointed pointed_type; - typedef Int integer_type; - typedef Pointed *value_type; - typedef IntOrPointer<Pointed,Int> self_type; - IntOrPointer(int j) { *this=j; } - IntOrPointer(size_t j) { *this=j; } - IntOrPointer(value_type v) { *this=v; } - bool is_integer() const { return i&1; } - bool is_pointer() const { return !(i&1); } - value_type & pointer() { return p; } - const value_type & pointer() const { iop_assert(is_pointer()); return p; } - integer_type integer() const { iop_assert(is_integer()); return i >> 1; } - void set_integer(Int j) { i=2*j+1; } - void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } - void operator=(unsigned j) { i = 2*(integer_type)j+1; } - void operator=(int j) { i = 2*(integer_type)j+1; } - template <class C> - void operator=(C j) { i = 2*(integer_type)j+1; } - void operator=(value_type v) { p=v; } - IntOrPointer() {} - IntOrPointer(const self_type &s) : p(s.p) {} - void operator=(const self_type &s) { p=s.p; } - template <class C> - bool operator ==(C* v) const { return p==v; } - template <class C> - bool operator ==(const C* v) const { return p==v; } - template <class C> - bool operator ==(C j) const { return integer() == j; } - bool operator ==(self_type s) const { return p==s.p; } - bool operator !=(self_type s) const { return p!=s.p; } - template <class O> void print(O&o) const - { - if (is_integer()) - o << integer(); - else { - o << "0x" << std::hex << (size_t)pointer() << std::dec; - } - } - friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { - s.print(o); return o; - } -protected: - union { - value_type p; // must be even (guaranteed unless you're pointing at packed chars) - integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type - }; -}; - - -#endif diff --git a/utils/intern_pool.h b/utils/intern_pool.h deleted file mode 100755 index 7c739add..00000000 --- a/utils/intern_pool.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INTERN_POOL_H -#define INTERN_POOL_H - -#define DEBUG_INTERN_POOL(x) x - -/* to "intern" a string in lisp is to make a symbol from it (a pointer to a canonical copy whose pointer can be equality-compared/hashed directly with other interned 
things). we take an Item that has a key part and some mutable parts (that aren't in its identity), and we hash-by-value the key part to map to a canonical on-heap Item - and we use a boost object pool to allocate them */ - -//FIXME: actually store function object state (assumed stateless so far) - -#include <boost/pool/object_pool.hpp> -#include "hash.h" -//#include "null_traits.h" -#include <functional> - -template <class I> -struct get_key { // default accessor for I = like pair<key,val> - typedef typename I::first_type const& result_type; - typedef I const& argument_type; - result_type operator()(I const& i) const { - return i.first; - } -}; - -// Arg type should be the non-pointer version. this saves me from using boost type traits to remove_pointer. f may be binary or unary -template <class KeyF,class F,class Arg=typename KeyF::argument_type> -struct compose_indirect { - typedef Arg *argument_type; // we also accept Arg & - KeyF kf; - F f; - typedef typename F::result_type result_type; - result_type operator()(Arg const& p) const { - return f(kf(p)); - } - result_type operator()(Arg & p) const { - return f(kf(p)); - } - result_type operator()(Arg * p) const { - return f(kf(*p)); - } - template <class V> - result_type operator()(V const& v) const { - return f(kf(*v)); - } - - result_type operator()(Arg const& a1,Arg const& a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg & a1,Arg & a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg * a1,Arg * a2) const { - return f(kf(*a1),kf(*a2)); - } - template <class V,class W> - result_type operator()(V const& v,W const&w) const { - return f(kf(*v),kf(*w)); - } - - -}; - -template <class KeyF,class F,class Arg=typename KeyF::argument_type> -struct equal_indirect { - typedef Arg *argument_type; // we also accept Arg & - KeyF kf; - F f; - typedef bool result_type; - - result_type operator()(Arg const& a1,Arg const& a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg & a1,Arg & a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg * a1,Arg * a2) const { - return a1==a2||(a1&&a2&&f(kf(*a1),kf(*a2))); - } - template <class V,class W> - result_type operator()(V const& v,W const&w) const { - return v==w||(v&&w&&f(kf(*v),kf(*w))); - } - - -}; - -/* - -template <class F> -struct indirect_function { - F f; - explicit indirect_function(F const& f=F()) : f(f) {} - typedef typename F::result_type result_type; - template <class V> - result_type operator()(V *p) const { - return f(*p); - } -}; -*/ - -template <class Item,class KeyF=get_key<Item>,class HashKey=boost::hash<typename KeyF::result_type>,class EqKey=std::equal_to<typename KeyF::result_type>, class Pool=boost::object_pool<Item> > -struct intern_pool : Pool { - KeyF key; - typedef typename KeyF::result_type Key; - typedef Item *Handle; - typedef compose_indirect<KeyF,HashKey,Item> HashDeep; - typedef equal_indirect<KeyF,EqKey,Item> EqDeep; - typedef HASH_SET<Handle,HashDeep,EqDeep> Canonical; - typedef typename Canonical::iterator CFind; - typedef std::pair<CFind,bool> CInsert; - Canonical canonical; - bool interneq(Handle &i) { // returns true if i is newly interned, false if it already existed - CInsert i_new=canonical.insert(i); - i=*i_new.first; - return i_new.second; - } -// inherited: Handle construct(...) 
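
To make intern()/interneq() below concrete, here is the same idea in miniature over std::string: value-equal inputs map to one canonical address, so later equality checks become pointer compares. (Illustrative sketch only; the real class keys on a sub-part of Item and allocates from a boost::object_pool.)

#include <string>
#include <unordered_set>

const std::string* intern(const std::string& s) {
  static std::unordered_set<std::string> canonical;
  // pointers to set elements stay valid across rehashing
  return &*canonical.insert(s).first;
}
// usage: intern(a) == intern(b) exactly when a == b
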
- Handle construct_fresh() { return Pool::construct(); } - Handle intern(Handle i) { // (maybe invalidating i, returning a valid canonical handle (pointer) - CInsert i_new=canonical.insert(i); - if (i_new.second) - return i; - else { - free(i); - return *i_new->first; - } - } - void destroy_interned(Handle i) { - DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end())); - canonical.erase(i); - destroy(i); - } - bool destroy_fresh(Handle i) { - DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end()||*canonical.find(i)!=i)); // i is a constructed item not yet interned. - destroy(i); - } - void destroy_both(Handle i) { // i must have come from this pool. may be interned, or not. destroy both the noninterned and interned. - if (!destroy_if_interned(i)) destroy(i); - } - // destroy intern(i) if it exists. return true if it existed AND its address was i. otherwise return false (whether or not a value-equal item existed and was destroyed) - bool destroy_if_interned(Handle i) { - CFind f=canonical.find(i); - if (f!=canonical.end()) { - Handle interned=*f; - canonical.erase(f); - destroy(f); - if (f==i) return true; - } - return false; - } - - intern_pool() { - HASH_MAP_EMPTY(canonical,(Handle)0); - } -}; - - - -#endif diff --git a/utils/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp index 4a4b0187..4a4b0187 100755..100644 --- a/utils/intrusive_refcount.hpp +++ b/utils/intrusive_refcount.hpp diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h new file mode 100644 index 00000000..516e8b89 --- /dev/null +++ b/utils/kernel_string_subseq.h @@ -0,0 +1,51 @@ +#ifndef _KERNEL_STRING_SUBSEQ_H_ +#define _KERNEL_STRING_SUBSEQ_H_ + +#include <vector> +#include <cmath> +#include <boost/multi_array.hpp> + +template <unsigned N, typename T> +float ssk(const T* s, const size_t s_size, const T* t, const size_t t_size, const float lambda) { + assert(N > 0); + boost::multi_array<float, 3> kp(boost::extents[N + 1][s_size + 1][t_size + 1]); + const float l2 = lambda * lambda; + for (unsigned j = 0; j < s_size; ++j) + for (unsigned k = 0; k < t_size; ++k) + kp[0][j][k] = 1.0f; + for (unsigned i = 0; i < N; ++i) { + for (unsigned j = 0; j < s_size; ++j) { + float kpp = 0.0f; + for (unsigned k = 0; k < t_size; ++k) { + kpp = lambda * (kpp + lambda * (s[j]==t[k]) * kp[i][j][k]); + kp[i + 1][j + 1][k + 1] = lambda * kp[i + 1][j][k + 1] + kpp; + } + } + } + float kn = 0.0f; + for (int i = 0; i < N; ++i) + for (int j = 0; j < s_size; ++j) + for (int k = 0; k < t_size; ++k) + kn += l2 * (s[j] == t[k]) * kp[i][j][k]; + return kn; +} + +template <unsigned N, typename T> +float ssk(const std::vector<T>& s, const std::vector<T>& t, const float lambda) { + float kst = ssk<N, T>(&s[0], s.size(), &t[0], t.size(), lambda); + if (!kst) return 0.0f; + float kss = ssk<N, T>(&s[0], s.size(), &s[0], s.size(), lambda); + float ktt = ssk<N, T>(&t[0], t.size(), &t[0], t.size(), lambda); + return kst / std::sqrt(kss * ktt); +} + +template <unsigned N> +float ssk(const std::string& s, const std::string& t, const float lambda) { + float kst = ssk<N, char>(&s[0], s.size(), &t[0], t.size(), lambda); + if (!kst) return 0.0f; + float kss = ssk<N, char>(&s[0], s.size(), &s[0], s.size(), lambda); + float ktt = ssk<N, char>(&t[0], t.size(), &t[0], t.size(), lambda); + return kst / std::sqrt(kss * ktt); +} + +#endif diff --git a/utils/lvalue_pmap.h b/utils/lvalue_pmap.h deleted file mode 100755 index 5b9403c0..00000000 --- a/utils/lvalue_pmap.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef LVALUE_PMAP_H -#define 
LVALUE_PMAP_H - -#include <boost/property_map/property_map.hpp> - -// i checked: boost provides get and put given [] - but it's not being found by ADL so instead i define them myself - -// lvalue property map pmapname<P> that is: P p; valtype &v=p->name; -#define PMAP_MEMBER_INDIRECT(pmapname,valtype,name) template <class P> struct pmapname { \ - typedef P key_type; \ - typedef valtype value_type; \ - typedef value_type & reference; \ - typedef boost::lvalue_property_map_tag category; \ - reference operator[](key_type p) const { return p->name; } \ - typedef pmapname<P> self_type; \ - friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \ - friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; } \ -}; - -#define PMAP_MEMBER_INDIRECT_2(pmapname,name) template <class P,class R> struct pmapname { \ - typedef P key_type; \ - typedef R value_type; \ - typedef value_type & reference; \ - typedef boost::lvalue_property_map_tag category; \ - reference operator[](key_type p) const { return p->name; } \ - typedef pmapname<P,R> self_type; \ - friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \ - friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; } \ -}; - -#endif diff --git a/utils/m.h b/utils/m.h new file mode 100644 index 00000000..dc881b36 --- /dev/null +++ b/utils/m.h @@ -0,0 +1,140 @@ +#ifndef _M_H_ +#define _M_H_ + +#include <cassert> +#include <cmath> +#include <boost/math/special_functions/digamma.hpp> +#include <boost/math/constants/constants.hpp> + +// TODO right now I sometimes assert that x is in the support of the distributions +// should be configurable to return -inf instead + +template <typename F> +struct M { + // support [0, 1, 2 ...) + static inline F log_poisson(unsigned x, const F& lambda) { + assert(lambda > 0.0); + return std::log(lambda) * x - lgamma(x + 1) - lambda; + } + + // support [0, 1, 2 ...) + static inline F log_geometric(unsigned x, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return std::log(1 - p) * x + std::log(p); + } + + // log of the binomial coefficient + static inline F log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); + } + + // http://en.wikipedia.org/wiki/Negative_binomial_distribution + // support [0, 1, 2 ...) 
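
A quick sanity check on the conventions so far (natural-log probabilities, lgamma for factorials): the Poisson form above agrees with a direct evaluation of the pmf. A standalone sketch:

#include <cmath>
#include <cassert>

// x*log(lambda) - lgamma(x+1) - lambda, exactly as in M::log_poisson above
double log_poisson(unsigned x, double lambda) {
  return std::log(lambda) * x - lgamma(x + 1) - lambda;
}

int main() {
  // P(X=2 | lambda=3) = 3^2 e^{-3} / 2! = 4.5 e^{-3}
  double p = std::exp(log_poisson(2, 3.0));
  assert(std::fabs(p - 4.5 * std::exp(-3.0)) < 1e-12);
  return 0;
}
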
+ static inline F log_negative_binom(unsigned x, unsigned r, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1u, x) + r * std::log(F(1) - p) + x * std::log(p); + } + + // this is the Beta function, *not* the beta probability density + // http://mathworld.wolfram.com/BetaFunction.html + static inline F log_beta_fn(const F& x, const F& y) { + return lgamma(x) + lgamma(y) - lgamma(x + y); + } + + // support x >= 0.0 + static F log_gamma_density(const F& x, const F& shape, const F& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + return (shape-1)*std::log(x) - shape*std::log(rate) - x/rate - lgamma(shape); + } + + // this is the Beta *density* p(x ; alpha, beta) + // support x \in (0,1) + static inline F log_beta_density(const F& x, const F& alpha, const F& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta); + } + + // support x \in R + static inline F log_laplace_density(const F& x, const F& mu, const F& b) { + assert(b > 0.0); + return -std::log(2*b) - std::fabs(x - mu) / b; + } + + // support x \in R + // this is NOT the "log normal" density, it is the log of the "normal density at x" + static inline F log_gaussian_density(const F& x, const F& mu, const F& var) { + assert(var > 0.0); + return -0.5 * std::log(var * 2 * boost::math::constants::pi<F>()) - (x - mu)*(x - mu) / (2 * var); + } + + // (x1,x2) \in R^2 + // parameterized in terms of two means, a two "variances", a correlation < 1 + static inline F log_bivariate_gaussian_density(const F& x1, const F& x2, + const F& mu1, const F& mu2, + const F& var1, const F& var2, + const F& cor) { + assert(var1 > 0); + assert(var2 > 0); + assert(std::fabs(cor) < 1.0); + const F cor2 = cor*cor; + const F var1var22 = var1 * var2; + const F Z = 0.5 * std::log(var1var22 * (1 - cor2)) + std::log(2 * boost::math::constants::pi<F>()); + return -Z -1.0 / (2 * (1 - cor2)) * ((x1 - mu1)*(x1-mu1) / var1 + (x2-mu2)*(x2-mu2) / var2 - 2*cor*(x1 - mu1)*(x2-mu2) / std::sqrt(var1var22)); + } + + // support x \in [a,b] + static inline F log_triangle_density(const F& x, const F& a, const F& b, const F& c) { + assert(a < b); + assert(a <= c); + assert(c <= b); + assert(x >= a); + assert(x <= b); + if (x <= c) + return std::log(2) + std::log(x - a) - std::log(b - a) - std::log(c - a); + else + return std::log(2) + std::log(b - x) - std::log(b - a) - std::log(b - c); + } + + // note: this has been adapted so that 0 is in the support of the distribution + // support [0, 1, 2 ...) 
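
A property worth testing in the bivariate form above: with cor = 0 it must factor into the sum of two univariate log normals. A standalone sketch of that check, with the formulas copied out as free functions:

#include <cmath>
#include <cassert>

static const double PI = 3.141592653589793;

double log_gauss(double x, double mu, double var) {
  return -0.5 * std::log(var * 2 * PI) - (x - mu) * (x - mu) / (2 * var);
}

// the bivariate log density above, specialized to cor = 0
double log_bigauss0(double x1, double x2, double mu1, double mu2,
                    double var1, double var2) {
  double Z = 0.5 * std::log(var1 * var2) + std::log(2 * PI);
  return -Z - 0.5 * ((x1 - mu1) * (x1 - mu1) / var1 +
                     (x2 - mu2) * (x2 - mu2) / var2);
}

int main() {
  double lhs = log_bigauss0(0.3, -1.2, 0.0, 0.5, 1.0, 2.0);
  double rhs = log_gauss(0.3, 0.0, 1.0) + log_gauss(-1.2, 0.5, 2.0);
  assert(std::fabs(lhs - rhs) < 1e-12);
  return 0;
}
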
+ static inline F log_yule_simon(unsigned x, const F& rho) { + assert(rho > 0.0); + return std::log(rho) + log_beta_fn(x + 1, rho + 1); + } + + // see http://www.gatsby.ucl.ac.uk/~ywteh/research/compling/hpylm.pdf + // when y=1, sometimes written x^{\overline{n}} or x^{(n)} "Pochhammer symbol" + static inline F log_generalized_factorial(const F& x, const F& n, const F& y = 1.0) { + assert(x > 0.0); + assert(y >= 0.0); + assert(n > 0.0); + if (!n) return 0.0; + if (y == F(1)) { + return lgamma(x + n) - lgamma(x); + } else if (y) { + return n * std::log(y) + lgamma(x/y + n) - lgamma(x/y); + } else { // y == 0.0 + return n * std::log(x); + } + } + + // digamma is the first derivative of the log-gamma function + static inline F digamma(const F& x) { + return boost::math::digamma(x); + } + +}; + +typedef M<double> Md; +typedef M<double> Mf; + +#endif diff --git a/utils/m_test.cc b/utils/m_test.cc new file mode 100644 index 00000000..c4d6a166 --- /dev/null +++ b/utils/m_test.cc @@ -0,0 +1,91 @@ +#include "m.h" + +#include <iostream> +#include <gtest/gtest.h> +#include <cassert> + +using namespace std; + +class MTest : public testing::Test { + public: + MTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(MTest, Densities) { + double px1 = Md::log_gaussian_density(1.0, 0.0, 1.0); + double px2 = Md::log_gaussian_density(-1.0, 0.0, 1.0); + double py1 = Md::log_laplace_density(1.0, 0.0, 1.0); + double py2 = Md::log_laplace_density(1.0, 0.0, 1.0); + double pz1 = Md::log_triangle_density(1.0, -2.0, 2.0, 0.0); + double pz2 = Md::log_triangle_density(1.0, -2.0, 2.0, 0.0); + cerr << px1 << " " << py1 << " " << pz2 << endl; + EXPECT_FLOAT_EQ(px1, px2); + EXPECT_FLOAT_EQ(py1, py2); + EXPECT_FLOAT_EQ(pz1, pz2); + double b1 = Md::log_bivariate_gaussian_density(1.0, -1.0, 0.0, 0.0, 1.0, 1.0, -0.8); + double b2 = Md::log_bivariate_gaussian_density(-1.0, 1.0, 0.0, 0.0, 1.0, 1.0, -0.8); + cerr << b1 << " " << b2 << endl; +} + +TEST_F(MTest, Poisson) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_poisson(i, 0.99); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, YuleSimon) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_yule_simon(i, 1.0); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, LogGeometric) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_geometric(i, 0.5); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, GeneralizedFactorial) { + for (double i = 0.3; i < 10000; i += 0.4) { + double a = Md::log_generalized_factorial(1.0, i); + double b = lgamma(1.0 + i); + EXPECT_FLOAT_EQ(a,b); + } + double gf_3_6 = 3.0 * 4.0 * 5.0 * 6.0 * 7.0 * 8.0; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.0, 6.0), std::log(gf_3_6)); + double gf_314_6 = 3.14 * 4.14 * 5.14 * 6.14 * 7.14 * 8.14; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.14, 6.0), std::log(gf_314_6)); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/max_plus.h b/utils/max_plus.h deleted file mode 
100755 index 2e56f85e..00000000 --- a/utils/max_plus.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef MAX_PLUS_H_ -#define MAX_PLUS_H_ - -#define MAX_PLUS_ORDER 0 -#define MAX_PLUS_DEBUG(x) - -// max-plus algebra. ordering a > b really means that (i.e. default a<b sorting will do worst (closest to 0) first. so get used to passing predicates like std::greater<MaxPlus<T> > around -// x+y := max{x,y} -// x*y := x+y -// 0 := -inf -// 1 := 0 -// additive inverse does not, but mult. does. (inverse()) and x/y := x-y = x+y.inverse() -//WARNING: default order is reversed, on purpose, i.e. a<b means a "better than" b, i.e. log(p_a)>log(p_b). sorry. defaults in libs are to order ascending, but we want best first. - -#include <boost/functional/hash.hpp> -#include <iostream> -#include <cstdlib> -#include <cmath> -#include <cassert> -#include <limits> -#include "semiring.h" -#include "show.h" -//#include "logval.h" - -template <class T> -class MaxPlus { - public: - void print(std::ostream &o) const { - o<<v_; - } - PRINT_SELF(MaxPlus<T>) - template <class O> - void operator=(O const& o) { - v_=o.v_; - } - template <class O> - MaxPlus(O const& o) : v_(o.v_) { } - - typedef MaxPlus<T> Self; - MaxPlus() : v_(LOGVAL_LOG0) {} - explicit MaxPlus(double x) : v_(std::log(x)) {} - MaxPlus(init_1) : v_(0) { } - MaxPlus(init_0) : v_(LOGVAL_LOG0) { } - MaxPlus(int x) : v_(std::log(x)) {} - MaxPlus(unsigned x) : v_(std::log(x)) { } - MaxPlus(double lnx,bool sign) : v_(lnx) { MAX_PLUS_DEBUG(assert(!sign)); } - MaxPlus(double lnx,init_lnx) : v_(lnx) {} - static Self exp(T lnx) { return MaxPlus(lnx,false); } - - // maybe the below are faster than == 1 and == 0. i don't know. - bool is_1() const { return v_==0; } - bool is_0() const { return v_==LOGVAL_LOG0; } - - static Self One() { return Self(init_1()); } - static Self Zero() { return Self(init_0()); } - static Self e() { return Self(1,false); } - void logeq(const T& v) { v_ = v; } - bool signbit() const { return false; } - - Self& logpluseq(const Self& a) { - if (a.is_0()) return *this; - if (a.v_ < v_) { - v_ = v_ + log1p(std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(std::exp(v_ - a.v_)); - } - return *this; - } - - Self& besteq(const Self& a) { - if (a.v_ < v_) - v_=a.v_; - return *this; - } - - Self& operator+=(const Self& a) { - if (a.v_ < v_) - v_=a.v_; - return *this; - } - - Self& operator*=(const Self& a) { - v_ += a.v_; - return *this; - } - - Self& operator/=(const Self& a) { - v_ -= a.v_; - return *this; - } - - // Self(fabs(log(x)),x.s_) - friend Self abslog(Self x) { - if (x.v_<0) x.v_=-x.v_; - return x; - } - - Self& poweq(const T& power) { - v_ *= power; - return *this; - } - - Self inverse() const { - return Self(-v_,false); - } - - Self pow(const T& power) const { - Self res = *this; - res.poweq(power); - return res; - } - - Self root(const T& root) const { - return pow(1/root); - } - -// copy elision - as opposed to explicit copy of Self const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r. 
todo: return std::move(o1) - C++0x - friend inline Self operator+(Self a,Self const& b) { - a+=b; - return a; - } - friend inline Self operator*(Self a,Self const& b) { - a*=b; - return a; - } - friend inline Self operator/(Self a,Self const& b) { - a/=b; - return a; - } - friend inline T log(Self const& a) { - return a.v_; - } - friend inline T pow(Self const& a,T const& e) { - return a.pow(e); - } - - // intentionally not defining an operator < or operator > - because you may want to default (for library convenience) a<b means a better than b (i.e. gt) - inline bool lt(Self const& o) const { - return v_ < o.v_; - } - inline bool gt(Self const& o) const { - return o.v_ > v_; - } - friend inline bool operator==(Self const& lhs, Self const& rhs) { - return lhs.v_ == rhs.v_; - } - friend inline bool operator!=(Self const& lhs, Self const& rhs) { - return lhs.v_ != rhs.v_; - } - std::size_t hash() const { - using namespace boost; - return hash_value(v_); - } - friend inline std::size_t hash_value(Self const& x) { - return x.hash(); - } - -/* - operator T() const { - return std::exp(v_); - } -*/ - T as_float() const { - return std::exp(v_); - } - - T v_; -}; - -template <class T> -struct semiring_traits<MaxPlus<T> > : default_semiring_traits<MaxPlus<T> > { - static const bool has_logplus=true; - static const bool has_besteq=true; -#if MAX_PLUS_ORDER - static const bool have_order=true; -#endif -}; - -#if MAX_PLUS_ORDER -template <class T> -bool operator<(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) { - return (lhs.v_ < rhs.v_); -} - -template <class T> -bool operator<=(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) { - return (lhs.v_ <= rhs.v_); -} - -template <class T> -bool operator>(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) { - return (lhs.v_ > rhs.v_); -} - -template <class T> -bool operator>=(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) { - return (lhs.v_ >= rhs.v_); -} -#endif - -#endif diff --git a/utils/maybe_update_bound.h b/utils/maybe_update_bound.h deleted file mode 100755 index d57215d0..00000000 --- a/utils/maybe_update_bound.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef MAYBE_UPDATE_BOUND_H -#define MAYBE_UPDATE_BOUND_H - -template <class To,class From> -inline void maybe_increase_max(To &to,const From &from) { - if (to<from) - to=from; -} - -template <class To,class From> -inline void maybe_decrease_min(To &to,const From &from) { - if (from<to) - to=from; -} - - -#endif diff --git a/utils/mfcr.h b/utils/mfcr.h new file mode 100644 index 00000000..886f01ef --- /dev/null +++ b/utils/mfcr.h @@ -0,0 +1,366 @@ +#ifndef _MFCR_H_ +#define _MFCR_H_ + +#include <algorithm> +#include <numeric> +#include <cassert> +#include <cmath> +#include <list> +#include <iostream> +#include <vector> +#include <iterator> +#include <tr1/unordered_map> +#include <boost/functional/hash.hpp> +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +struct TableCount { + TableCount() : count(), floor() {} + TableCount(int c, int f) : count(c), floor(f) { + assert(f >= 0); + } + int count; // count or delta (may be 0, <0, or >0) + unsigned char floor; // from which floor? +}; + +std::ostream& operator<<(std::ostream& o, const TableCount& tc) { + return o << "[c=" << tc.count << " floor=" << static_cast<unsigned int>(tc.floor) << ']'; +} + +// Multi-Floor Chinese Restaurant as proposed by Wood & Teh (AISTATS, 2009) to simulate +// graphical Pitman-Yor processes. 
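
For orientation before the class: in the ordinary single-floor case, the predictive probability of a dish reduces to the standard Pitman-Yor form, which prob() below generalizes by replacing p0 with a floor-marginalized base probability. A reference sketch (my own free function, not part of this header):

// Pitman-Yor predictive probability: c_w customers at t_w tables serve
// this dish; c customers and t tables in total; discount d, strength s,
// base probability p0 of the dish.
double pyp_prob(int c_w, int t_w, int c, int t, double d, double s, double p0) {
  return (c_w - d * t_w + (s + d * t) * p0) / (c + s);
}
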
+// http://jmlr.csail.mit.edu/proceedings/papers/v5/wood09a/wood09a.pdf +// +// Implementation is based on Blunsom, Cohn, Goldwater, & Johnson (ACL 2009) and code +// referenced therein. +// http://www.aclweb.org/anthology/P/P09/P09-2085.pdf +// +template <unsigned Floors, typename Dish, typename DishHash = boost::hash<Dish> > +class MFCR { + public: + + MFCR(double d, double strength) : + num_tables_(), + num_customers_(), + discount_(d), + strength_(strength), + discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()), + discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()), + strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()), + strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); } + + MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : + num_tables_(), + num_customers_(), + discount_(d), + strength_(strength), + discount_prior_strength_(discount_strength), + discount_prior_beta_(discount_beta), + strength_prior_shape_(strength_shape), + strength_prior_rate_(strength_rate) { check_hyperparameters(); } + + void check_hyperparameters() { + if (discount_ < 0.0 || discount_ >= 1.0) { + std::cerr << "Bad discount: " << discount_ << std::endl; + abort(); + } + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; + abort(); + } + } + + double discount() const { return discount_; } + double strength() const { return strength_; } + void set_discount(double d) { discount_ = d; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_strength_); + } + + bool has_strength_prior() const { + return !std::isnan(strength_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_locs_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->second.table_counts_.size(); + } + + // this is not terribly efficient but it should not typically be necessary to execute this query + unsigned num_tables(const Dish& dish, const unsigned floor) const { + const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + unsigned c = 0; + for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + if (i->floor == floor) ++c; + } + return c; + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->total_dish_count_; + } + + // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor + template <class InputIterator, class InputIterator2> + TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + // marg_p0 = marginal probability of opening a new table on any floor with label dish + typedef typename 
std::iterator_traits<InputIterator>::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); + int floor = -1; + bool share_table = false; + if (loc.total_dish_count_) { + const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0; + const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + // this can be done with doubles since P0 (which may be tiny) is not involved + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= ti->count - discount_; + if (r <= 0.0) { + ++ti->count; + floor = ti->floor; + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { // sit at currently empty table -- must sample what floor + if (Floors == 1) { + floor = 0; + } else { + F r = F(rng->next()) * marg_p0; + for (unsigned i = 0; i < Floors; ++i) { + r -= (*p0s) * (*lambdas); + ++p0s; + ++lambdas; + if (r <= F(0.0)) { + floor = i; + break; + } + } + } + assert(floor >= 0); + loc.table_counts_.push_back(TableCount(1, floor)); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? TableCount(0, floor) : TableCount(1, floor)); + } + + // returns first = -1 or 0, indicating whether a table was closed, and on what floor (second) + TableCount decrement(const Dish& dish, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + assert(loc.total_dish_count_); + int floor = -1; + int delta = 0; + if (loc.total_dish_count_ == 1) { + floor = loc.table_counts_.front().floor; + dish_locs_.erase(dish); + --num_tables_; + --num_customers_; + delta = -1; + } else { + // sample customer to remove UNIFORMLY. that is, do NOT use the d + // here. if you do, it will introduce (unwanted) bias! 
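+      // (uniform removal is what keeps increment/decrement consistent with
+      // the CRP's exchangeability over customers; weighting by table size
+      // minus discount would skew which tables get emptied)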
+ double r = rng->next() * loc.total_dish_count_; + --loc.total_dish_count_; + --num_customers_; + for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= ti->count; + if (r <= 0.0) { + floor = ti->floor; + if ((--ti->count) == 0) { + --num_tables_; + delta = -1; + loc.table_counts_.erase(ti); + } + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } + return TableCount(delta, floor); + } + + template <class InputIterator, class InputIterator2> + typename std::iterator_traits<InputIterator>::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const { + typedef typename std::iterator_traits<InputIterator>::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); + const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish); + const F r = F(num_tables_ * discount_ + strength_); + if (it == dish_locs_.end()) { + return r * marg_p0 / F(num_customers_ + strength_); + } else { + return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) / + F(num_customers_ + strength_); + } + } + + double log_crp_prob() const { + return log_crp_prob(discount_, strength_); + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include draws from G_w's + double log_crp_prob(const double& discount, const double& strength) const { + double lp = 0.0; + if (has_discount_prior()) + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + for (std::list<TableCount>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { + lp += lgamma(ti->count - discount) - r; + } + } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } + } else { + assert(!"discount less than 0 detected!"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_discount_prior() || has_strength_prior()); + DiscountResampler dr(*this); + StrengthResampler sr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + double min_discount 
= std::numeric_limits<double>::min();
+ if (strength_ < 0.0) min_discount -= strength_;
+ discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+ }
+ strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ }
+
+ struct DiscountResampler {
+ DiscountResampler(const MFCR& crp) : crp_(crp) {}
+ const MFCR& crp_;
+ double operator()(const double& proposed_d) const {
+ return crp_.log_crp_prob(proposed_d, crp_.strength_);
+ }
+ };
+
+ struct StrengthResampler {
+ StrengthResampler(const MFCR& crp) : crp_(crp) {}
+ const MFCR& crp_;
+ double operator()(const double& proposed_strength) const {
+ return crp_.log_crp_prob(crp_.discount_, proposed_strength);
+ }
+ };
+
+ struct DishLocations {
+ DishLocations() : total_dish_count_() {}
+ unsigned total_dish_count_; // customers at all tables with this dish
+ std::list<TableCount> table_counts_; // list<> gives O(1) deletion and insertion, which we want
+ // .size() is the number of tables for this dish
+ };
+
+ void Print(std::ostream* out) const {
+ (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
+ for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+ it != dish_locs_.end(); ++it) {
+ (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
+ for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin();
+ i != it->second.table_counts_.end(); ++i) {
+ (*out) << " " << *i;
+ }
+ (*out) << std::endl;
+ }
+ }
+
+ typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
+ const_iterator begin() const {
+ return dish_locs_.begin();
+ }
+ const_iterator end() const {
+ return dish_locs_.end();
+ }
+
+ unsigned num_tables_;
+ unsigned num_customers_;
+ std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
+
+ double discount_;
+ double strength_;
+
+ // optional beta prior on discount_ (NaN if no prior)
+ double discount_prior_strength_;
+ double discount_prior_beta_;
+
+ // optional gamma prior on strength_ (NaN if no prior)
+ double strength_prior_shape_;
+ double strength_prior_rate_;
+};
+
+template <unsigned N,typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const MFCR<N,T,H>& c) {
+ c.Print(&o);
+ return o;
+}
+
+#endif
diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc
new file mode 100644
index 00000000..cc886335
--- /dev/null
+++ b/utils/mfcr_test.cc
@@ -0,0 +1,72 @@
+#include "mfcr.h"
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "sampler.h"
+
+using namespace std;
+
+void test_exch(MT19937* rng) {
+ MFCR<2, int> crp(0.5, 3.0);
+ vector<double> lambdas(2);
+ vector<double> p0s(2);
+ lambdas[0] = 0.2;
+ lambdas[1] = 0.8;
+ p0s[0] = 1.0;
+ p0s[1] = 1.0;
+
+ double tot = 0;
+ double tot2 = 0;
+ double xt = 0;
+ int cust = 10;
+ vector<int> hist(cust + 1, 0), hist2(cust + 1, 0);
+ for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+ const int samples = 100000;
+ const bool simulate = true;
+ for (int k = 0; k < samples; ++k) {
+ if (!simulate) {
+ crp.clear();
+ for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+ } else {
+ int da = rng->next() * cust;
+ bool a = rng->next() < 0.45;
+ if (a)
{ + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } + for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } + xt += 1.0; + } else { + for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } + } + } + int c = crp.num_tables(1); + ++hist[c]; + tot += c; + int c2 = crp.num_tables(1,0); // tables on floor 0 with dish 1 + ++hist2[c2]; + tot2 += c2; + } + cerr << cust << " = " << crp.num_customers() << endl; + cerr << "P(a) = " << (xt / samples) << endl; + cerr << "E[num tables] = " << (tot / samples) << endl; + double error = fabs((tot / samples) - 6.894); + cerr << " error = " << error << endl; + for (int i = 1; i <= cust; ++i) + cerr << i << ' ' << (hist[i]) << endl; + cerr << "E[num tables on floor 0] = " << (tot2 / samples) << endl; + double error2 = fabs((tot2 / samples) - 1.379); + cerr << " error2 = " << error2 << endl; + for (int i = 1; i <= cust; ++i) + cerr << i << ' ' << (hist2[i]) << endl; + assert(error < 0.05); // these can fail with very low probability + assert(error2 < 0.05); +}; + +int main(int argc, char** argv) { + MT19937 rng; + test_exch(&rng); + return 0; +} + diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h index 6063d524..6063d524 100755..100644 --- a/utils/murmur_hash.h +++ b/utils/murmur_hash.h diff --git a/utils/named_enum.h b/utils/named_enum.h index 675ec868..675ec868 100755..100644 --- a/utils/named_enum.h +++ b/utils/named_enum.h diff --git a/utils/nan.h b/utils/nan.h deleted file mode 100755 index 257364d5..00000000 --- a/utils/nan.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef NAN_H -#define NAN_H -//TODO: switch to C99 isnan isfinite isinf etc. (faster) - -#include <limits> - -template <bool> struct nan_static_assert; -template <> struct nan_static_assert<true> { }; - -// is_iec559 i.e. 
only IEEE 754 float has x != x <=> x is nan -template<typename T> -inline bool is_nan(T x) { -// static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_quiet_NaN>)); - return std::numeric_limits<T>::has_quiet_NaN && (x != x); -} - -template <typename T> -inline bool is_inf(T x) { -// static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>)); - return x == std::numeric_limits<T>::infinity() || x == -std::numeric_limits<T>::infinity(); -} - -template <typename T> -inline bool is_pos_inf(T x) { -// static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>)); - return x == std::numeric_limits<T>::infinity(); -} - -template <typename T> -inline bool is_neg_inf(T x) { -// static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>)); - return x == -std::numeric_limits<T>::infinity(); -} - -//c99 isfinite macro shoudl be much faster -template <typename T> -inline bool is_finite(T x) { - return !is_nan(x) && !is_inf(x); -} - - -#endif diff --git a/utils/null_deleter.h b/utils/null_deleter.h index 082ab453..082ab453 100755..100644 --- a/utils/null_deleter.h +++ b/utils/null_deleter.h diff --git a/utils/null_traits.h b/utils/null_traits.h index fac857d9..fac857d9 100755..100644 --- a/utils/null_traits.h +++ b/utils/null_traits.h diff --git a/utils/sampler.h b/utils/sampler.h index cae660d2..bdbc01b0 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -48,7 +48,7 @@ struct RandomNumberGenerator { template <typename F> size_t SelectSample(const F& a, const F& b, double T = 1.0) { if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; + if (F(this->next()) > (a / (a + b))) return 1; else return 0; } else { assert(!"not implemented"); } diff --git a/utils/semiring.h b/utils/semiring.h index 5007994c..5007994c 100755..100644 --- a/utils/semiring.h +++ b/utils/semiring.h diff --git a/utils/show.h b/utils/show.h index 95cad253..95cad253 100755..100644 --- a/utils/show.h +++ b/utils/show.h diff --git a/utils/static_utoa.h b/utils/static_utoa.h index bb3d821f..bb3d821f 100755..100644 --- a/utils/static_utoa.h +++ b/utils/static_utoa.h diff --git a/utils/string_to.h b/utils/string_to.h deleted file mode 100755 index c78a5394..00000000 --- a/utils/string_to.h +++ /dev/null @@ -1,314 +0,0 @@ -#ifndef STRING_TO_H -#define STRING_TO_H - -/* - may not be any faster than boost::lexical_cast in later incarnations (see http://accu.org/index.php/journals/1375) - but is slightly simpler. no wide char or locale. - - X string_to<X>(string); - string to_string(X); - X& string_into(string,X &); // note: returns the same ref you passed in, for convenience of use - - default implementation via stringstreams (quite slow, I'm sure) - - fast implementation for string, int<->string, unsigned<->string, float<->string, double<->string - -*/ - -#ifndef USE_FTOA -#define USE_FTOA 1 -#endif -#ifndef HAVE_STRTOUL -# define HAVE_STRTOUL 1 -#endif - -#include <string> -#include <sstream> -#include <stdexcept> -#include <cstdlib> - -#include "have_64_bits.h" -#include "utoa.h" -#if USE_FTOA -# include "ftoa.h" -#endif - -namespace { -// for faster numeric to/from string. TODO: separate into optional header -#include <stdio.h> -#include <ctype.h> -#include <stdlib.h> // access to evil (fast) C isspace etc. 
-#include <limits.h> //strtoul -} - -inline void throw_string_to(std::string const& msg,char const* prefix="string_to: ") { - throw std::runtime_error(prefix+msg); -} - -template <class I,class To> -bool try_stream_into(I & i,To &to,bool complete=true) -{ - i >> to; - if (i.fail()) return false; - if (complete) { - char c; - return !(i >> c); - } - return true; -} - -template <class Str,class To> -bool try_string_into(Str const& str,To &to,bool complete=true) -{ - std::istringstream i(str); - return try_stream_into(i,to,complete); -} - -template <class Str,class Data> inline -Data & string_into(const Str &str,Data &data) -{ - if (!try_string_into(str,data)) - throw std::runtime_error(std::string("Couldn't convert (string_into): ")+str); - return data; -} - - -template <class Data,class Str> inline -Data string_to(const Str &str) -{ - Data ret; - string_into(str,ret); - return ret; -} - -template <class D> inline -std::string to_string(D const &d) -{ - std::ostringstream o; - o << d; - return o.str(); -} - -inline std::string to_string(unsigned x) { - return utos(x); -} - -inline std::string to_string(int x) { - return itos(x); -} - -inline long strtol_complete(char const* s,int base=10) { - char *e; - if (*s) { - long r=strtol(s,&e,base); - char c=*e; - if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos. TODO: loop consuming spaces until end? - return r; - } - throw_string_to(s,"Couldn't convert to integer: "); -} - -// returns -INT_MAX or INT_MAX if number is too large/small -inline int strtoi_complete_bounded(char const* s,int base=10) { - long l=strtol_complete(s,base); - if (l<std::numeric_limits<int>::min()) - return std::numeric_limits<int>::min(); - if (l>std::numeric_limits<int>::max()) - return std::numeric_limits<int>::max(); - return l; -} -#define RANGE_STR(x) #x -#ifdef INT_MIN -# define INTRANGE_STR "[" RANGE_STR(INT_MIN) "," RANGE_STR(INT_MAX) "]" -#else -# define INTRANGE_STR "[-2137483648,2147483647]" -#endif - - // throw if out of int range -inline int strtoi_complete_exact(char const* s,int base=10) { - long l=strtol_complete(s,base); - if (l<std::numeric_limits<int>::min() || l>std::numeric_limits<int>::max()) - throw_string_to(s,"Out of range for int " INTRANGE_STR ": "); - return l; -} - -#if HAVE_LONGER_LONG -inline int& string_into(std::string const& s,int &x) { - x=strtoi_complete_exact(s.c_str()); - return x; -} -inline int& string_into(char const* s,int &x) { - x=strtoi_complete_exact(s); - return x; -} -#endif - -inline long& string_into(std::string const& s,long &x) { - x=strtol_complete(s.c_str()); - return x; -} -inline long& string_into(char const* s,long &x) { - x=strtol_complete(s); - return x; -} - - -//FIXME: preprocessor separation for tokens int<->unsigned int, long<->unsigned long, strtol<->strtoul ? massive code duplication -inline unsigned long strtoul_complete(char const* s,int base=10) { - char *e; - if (*s) { -#if HAVE_STRTOUL - unsigned long r=strtoul(s,&e,base); -#else -// unsigned long r=strtol(s,&e,base); //FIXME: not usually safe - unsigned long r; - sscanf(s,"%ul",&r); -#endif - char c=*e; - if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos. TODO: loop consuming spaces until end? 
- return r; - } - throw_string_to(s,"Couldn't convert to integer: "); -} - -inline unsigned strtou_complete_bounded(char const* s,int base=10) { - unsigned long l=strtoul_complete(s,base); - if (l<std::numeric_limits<unsigned>::min()) - return std::numeric_limits<unsigned>::min(); - if (l>std::numeric_limits<unsigned>::max()) - return std::numeric_limits<unsigned>::max(); - return l; -} - -#ifdef UINT_MIN -# define UINTRANGE_STR "[" RANGE_STR(UINT_MIN) "," RANGE_STR(UINT_MAX) "]" -#else -# define UINTRANGE_STR "[0,4,294,967,295]" -#endif - - // throw if out of int range -inline unsigned strtou_complete_exact(char const* s,int base=10) { - unsigned long l=strtoul_complete(s,base); - if (l<std::numeric_limits<unsigned>::min() || l>std::numeric_limits<unsigned>::max()) - throw_string_to(s,"Out of range for uint " UINTRANGE_STR ": "); - return l; -} - -#if HAVE_LONGER_LONG -inline unsigned& string_into(std::string const& s,unsigned &x) { - x=strtou_complete_exact(s.c_str()); - return x; -} -inline unsigned& string_into(char const* s,unsigned &x) { - x=strtou_complete_exact(s); - return x; -} -#endif - -inline unsigned long& string_into(std::string const& s,unsigned long &x) { - x=strtoul_complete(s.c_str()); - return x; -} -inline unsigned long& string_into(char const* s,unsigned long &x) { - x=strtoul_complete(s); - return x; -} - -//FIXME: end code duplication - - -/* 9 decimal places needed to avoid rounding error in float->string->float. 17 for double->string->double - in terms of usable decimal places, there are 6 for float and 15 for double - */ -inline std::string to_string_roundtrip(float x) { - char buf[17]; - return std::string(buf,buf+sprintf(buf,"%.9g",x)); -} -inline std::string to_string(float x) { -#if USE_FTOA - return ftos(x); -#else - char buf[15]; - return std::string(buf,buf+sprintf(buf,"%.7g",x)); -#endif -} -inline std::string to_string_roundtrip(double x) { - char buf[32]; - return std::string(buf,buf+sprintf(buf,"%.17g",x)); -} -inline std::string to_string(double x) { -#if USE_FTOA - return ftos(x); -#else - char buf[30]; - return std::string(buf,buf+sprintf(buf,"%.15g",x)); -#endif -} - -inline double& string_into(char const* s,double &x) { - x=std::atof(s); - return x; -} -inline float& string_into(char const* s,float &x) { - x=std::atof(s); - return x; -} - -inline double& string_into(std::string const& s,double &x) { - x=std::atof(s.c_str()); - return x; -} -inline float& string_into(std::string const& s,float &x) { - x=std::atof(s.c_str()); - return x; -} - - -template <class Str> -bool try_string_into(Str const& str,Str &to,bool complete=true) -{ - str=to; - return true; -} - -inline std::string const& to_string(std::string const& d) -{ - return d; -} - -template <class Str> -Str const& string_to(Str const &s) -{ - return s; -} - -template <class Str> -Str & string_into(Str const &s,Str &d) -{ - return d=s; -} - -/* - -template <class Str,class Data,class size_type> inline -void substring_into(const Str &str,size_type pos,size_type n,Data &data) -{ -// std::istringstream i(str,pos,n); // doesn't exist! 
- std::istringstream i(str.substr(pos,n)); - if (!(i>>*data)) - throw std::runtime_error("Couldn't convert (string_into): "+str); -} - -template <class Data,class Str,class size_type> inline -Data string_to(const Str &str,size_type pos,size_type n) -{ - Data ret; - substring_into(str,pos,n,ret); - return ret; -} - -*/ - - - -#endif diff --git a/utils/stringlib.h b/utils/stringlib.h index cafbdac3..f457e1e4 100644 --- a/utils/stringlib.h +++ b/utils/stringlib.h @@ -125,6 +125,13 @@ inline std::string LowercaseString(const std::string& in) { return res; } +inline std::string UppercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = toupper(in[i]); + return res; +} + inline int CountSubstrings(const std::string& str, const std::string& sub) { size_t p = 0; int res = 0; diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc index f66cdbeb..f66cdbeb 100755..100644 --- a/utils/stringlib_test.cc +++ b/utils/stringlib_test.cc diff --git a/utils/swap_pod.h b/utils/swap_pod.h index bb9a830d..bb9a830d 100755..100644 --- a/utils/swap_pod.h +++ b/utils/swap_pod.h diff --git a/utils/utoa.h b/utils/utoa.h index 8b54987b..8b54987b 100755..100644 --- a/utils/utoa.h +++ b/utils/utoa.h diff --git a/utils/value_array.h b/utils/value_array.h index 12fc9d87..12fc9d87 100755..100644 --- a/utils/value_array.h +++ b/utils/value_array.h diff --git a/vest/Makefile.am b/vest/Makefile.am deleted file mode 100644 index 05fa5639..00000000 --- a/vest/Makefile.am +++ /dev/null @@ -1,35 +0,0 @@ -bin_PROGRAMS = \ - mr_vest_map \ - mr_vest_reduce \ - mr_vest_generate_mapper_input \ - sentserver \ - sentclient - -if HAVE_GTEST -noinst_PROGRAMS = \ - lo_test -TESTS = lo_test -endif - -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -all-static -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -all-static -pthread - -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -# nbest2hg_SOURCES = nbest2hg.cc -# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz - -mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/cat.pl b/vest/cat.pl deleted file mode 100755 index 2ecba3f9..00000000 --- a/vest/cat.pl +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/perl - -$|=1; -print while(<>); diff --git a/vest/ces.h b/vest/ces.h deleted file mode 100644 index 2f098990..00000000 --- a/vest/ces.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _CES_H_ -#define _CES_H_ - -#include "scorer.h" - -class ViterbiEnvelope; -class Hypergraph; -class 
ErrorSurface; - -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg); - -#endif diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc deleted file mode 100644 index 2867b36b..00000000 --- a/vest/mbr_kbest.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include <iostream> -#include <vector> - -#include <boost/program_options.hpp> - -#include "prob.h" -#include "tdict.h" -#include "scorer.h" -#include "filelib.h" -#include "stringlib.h" - -using namespace std; - -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function") - ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from") - ("output_list,L", "Show reranked list as output") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct LossComparer { - bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const { - return a.second < b.second; - } -}; - -bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) { - static string cache_id; - static pair<vector<WordID>, prob_t> cache_pair; - list->clear(); - string cur_id; - if (cache_pair.first.size() > 0) { - list->push_back(cache_pair); - cur_id = cache_id; - cache_pair.first.clear(); - } - string line; - string tstr; - while(*in) { - getline(*in, line); - if (line.empty()) continue; - size_t p1 = line.find(" ||| "); - if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p2 = line.find(" ||| ", p1 + 4); - if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p3 = line.rfind(" ||| "); - cache_id = line.substr(0, p1); - tstr = line.substr(p1 + 5, p2 - p1 - 5); - double val = strtod(line.substr(p3 + 5).c_str(), NULL); - TD::ConvertSentence(tstr, &cache_pair.first); - cache_pair.second.logeq(val); - if (cur_id.empty()) cur_id = cache_id; - if (cur_id == cache_id) { - list->push_back(cache_pair); - *sent_id = cur_id; - cache_pair.first.clear(); - } else { break; } - } - return !list->empty(); -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as<string>(); - const bool output_list = conf.count("output_list") > 0; - const string file = conf["input"].as<string>(); - const double mbr_scale = conf["scale"].as<double>(); - cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - - ScoreType type = ScoreTypeFromString(metric); - vector<pair<vector<WordID>, prob_t> > list; - ReadFile rf(file); - string sent_id; - while(ReadKBestList(rf.stream(), &sent_id, &list)) { - vector<prob_t> joints(list.size()); - const prob_t max_score = pow(list.front().second, mbr_scale); - prob_t marginal = prob_t::Zero(); - for (int i = 0 ; i < list.size(); ++i) { - const prob_t joint = pow(list[i].second, mbr_scale) / max_score; - joints[i] = joint; - // cerr << "list[" << i << "] joint=" << log(joint) << endl; - marginal += joint; - } - int mbr_idx 
= -1; - vector<double> mbr_scores(output_list ? list.size() : 0); - double mbr_loss = numeric_limits<double>::max(); - for (int i = 0 ; i < list.size(); ++i) { - vector<vector<WordID> > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); - double wl_acc = 0; - for (int j = 0; j < list.size(); ++j) { - if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); - wl_acc += weighted_loss; - if ((!output_list) && wl_acc > mbr_loss) break; - } - } - if (output_list) mbr_scores[i] = wl_acc; - if (wl_acc < mbr_loss) { - mbr_loss = wl_acc; - mbr_idx = i; - } - } - // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; - cerr << "MBR Best idx: " << mbr_idx << endl; - if (output_list) { - for (int i = 0; i < list.size(); ++i) - list[i].second.logeq(mbr_scores[i]); - sort(list.begin(), list.end(), LossComparer()); - for (int i = 0; i < list.size(); ++i) - cout << sent_id << " ||| " - << TD::GetString(list[i].first) << " ||| " - << log(list[i].second) << endl; - } else { - cout << TD::GetString(list[mbr_idx].first) << endl; - } - } - return 0; -} - diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc deleted file mode 100644 index 0c094fd5..00000000 --- a/vest/mr_vest_generate_mapper_input.cc +++ /dev/null @@ -1,320 +0,0 @@ -//TODO: debug segfault when references supplied, null shared_ptr when oracle -#include <iostream> -#include <vector> -#include <sstream> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "sampler.h" -#include "filelib.h" -#include "weights.h" -#include "line_optimizer.h" -#include "hg.h" -#include "hg_io.h" -#include "scorer.h" -#include "oracle_bleu.h" -#include "ff_bleu.h" - -const bool DEBUG_ORACLE=true; - -//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle -//void register_feature_functions(); -//FFRegistry ff_registry; -namespace { -void init_bleumodel() { - ff_registry.clear(); - ff_registry.Register(new FFFactory<BLEUModel>); -} - -struct init_ff { - init_ff() { - init_bleumodel(); - } -}; -//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. -} - -using namespace std; -namespace po = boost::program_options; - -typedef SparseVector<double> Dir; -typedef Dir Point; - -void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) { - // return; //TODO: debug - if (min_dist<=0) return; - double max_s=1.-min_dist; - if (log&&verbose) *log<<"max allowed S="<<max_s<<endl; - unsigned N=dirs.size(); - for (int i=0;i<N;++i) { - for (int j=i+1;j<N;++j) { - double s=dirs[i].tanimoto_coef(dirs[j]); - if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' '; - if (s>max_s) { - if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). 
dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl; - if (avg) { - dirs[i]+=dirs[j]; - dirs[i]/=2.; - if (log) *log<<" averaged="<<dirs[i]; - } - if (log) *log<<endl; - swap(dirs[j],dirs[--N]); - } - } - if (log&&verbose) *log<<endl; - - } - dirs.resize(N); -} - -struct oracle_directions { - MT19937 rng; - OracleBleu oracle; - vector<Dir> directions; - - bool start_random; - bool include_primary; - bool old_to_hope; - bool fear_to_hope; - unsigned n_random; - void AddPrimaryAndRandomDirections() { - LineOptimizer::CreateOptimizationDirections( - fids,n_random,&rng,&directions,include_primary); - } - - void Print() { - for (int i = 0; i < dev_set_size; ++i) - for (int j = 0; j < directions.size(); ++j) { - cout << forest_file(i) <<" " << i<<" "; - print(cout,origin,"=",";"); - cout<<" "; - print(cout,directions[j],"=",";"); - cout<<"\n"; - } - } - - void AddOptions(po::options_description *opts) { - oracle.AddOptions(opts); - opts->add_options() - ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository") - ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") - ("no_old_to_hope","don't emit the usual old -> hope oracle") - ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. 
sentences-seen-so-far BLEU") - ; - } - void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { - po::options_description opts("Configuration options"); - AddOptions(&opts); - opts.add_options()("help,h", "Help"); - - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -s N\n"; - goto bad_cmdline; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; - goto bad_cmdline; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r <DIR>\n"; - goto bad_cmdline; - } - if (n_oracle && oracle.refs.empty()) { - cerr<<"Specify references when using oracle directions\n"; - goto bad_cmdline; - } - if (conf->count("help")) { - cout << dcmdline_options << endl; - exit(0); - } - - return; - bad_cmdline: - cerr << dcmdline_options << endl; - exit(1); - } - - int main(int argc, char *argv[]) { - po::variables_map conf; - InitCommandLine(argc,argv,&conf); - init_bleumodel(); - UseConf(conf); - Run(); - return 0; - } - bool verbose() const { return oracle.verbose; } - void Run() { -// register_feature_functions(); - AddPrimaryAndRandomDirections(); - AddOracleDirections(); - compress_similar(directions,max_similarity,&cerr,true,verbose()); - Print(); - } - - - Point origin; // old weights that gave model 1best. - vector<string> optimize_features; - void UseConf(po::variables_map const& conf) { - oracle.UseConf(conf); - include_primary=!conf.count("no_primary"); - old_to_hope=!conf.count("no_old_to_hope"); - - if (conf.count("optimize_feature") > 0) - optimize_features=conf["optimize_feature"].as<vector<string> >(); - Init(); - } - - string weights_file; - double max_similarity; - unsigned n_oracle, oracle_batch; - string forest_repository; - unsigned dev_set_size; - vector<Oracle> oracles; - vector<int> fids; - string forest_file(unsigned i) const { - ostringstream o; - o << forest_repository << '/' << i << ".json.gz"; - return o.str(); - } - - oracle_directions() { } - - Sentences model_hyps; - - vector<ScoreP> model_scores; - bool have_doc; - void Init() { - have_doc=!decoder_translations_file.empty(); - if (have_doc) { - model_hyps.Load(decoder_translations_file); - if (verbose()) model_hyps.Print(cerr,5); - model_scores.resize(model_hyps.size()); - if (dev_set_size!=model_hyps.size()) { - cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl; - abort(); - } - cerr << "Scoring model translations " << model_hyps << endl; - for (int i=0;i<model_hyps.size();++i) { - //TODO: what is scoreCcand? without clipping? 
do without for consistency w/ oracle - model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]); - assert(model_scores[i]); - if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl; - if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl; - oracle.doc_score->PlusEquals(*model_scores[i]); - if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl; - } - //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating) - } - start_random=false; - cerr << "Forest repo: " << forest_repository << endl; - assert(DirectoryExists(forest_repository)); - vector<string> features; - vector<weight_t> dorigin; - Weights::InitFromFile(weights_file, &dorigin, &features); - if (optimize_features.size()) - features=optimize_features; - Weights::InitSparseVector(dorigin, &origin); - fids.clear(); - AddFeatureIds(features); - oracles.resize(dev_set_size); - } - - void AddFeatureIds(vector<string> const& features) { - int i = fids.size(); - fids.resize(fids.size()+features.size()); - for (; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - } - - - std::string decoder_translations_file; // one per line - //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg - void adjust_doc(unsigned i,double scale=1.) { - oracle.doc_score->PlusEquals(*model_scores[i],scale); - } - - Score &ds() { - return *oracle.doc_score; - } - - Oracle const& ComputeOracle(unsigned i) { - Oracle &o=oracles[i]; - if (o.is_null()) { - if (have_doc) { - if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n"; - adjust_doc(i,-1); - } - ReadFile rf(forest_file(i)); - Hypergraph hg; - { - Timer t("Loading forest from JSON "+forest_file(i)); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - } - if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl; - o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin); - if (verbose()) { - cerr << o; - ScoreP hopesc=oracle.GetScore(o.hope.sentence,i); - oracle.doc_score->PlusEquals(*hopesc,1); - cerr<<"With hope: "<<ds().ScoreDetails()<<endl; - oracle.doc_score->PlusEquals(*hopesc,-1); - cerr<<"Without hope: "<<ds().ScoreDetails()<<endl; - cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl - <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl; - if (have_doc) - cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl; - } - if (have_doc) { - adjust_doc(i,1); - } else - oracle.IncludeLastScore(); - } - return o; - } - - // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed - void AddOracleDirections() { - MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); - unsigned b=0; - for(unsigned i=0;i<n_oracle;++i) { - Dir o2hope; - Dir fear2hope; - for (unsigned j=0;j<oracle_batch;++j,++b) { - Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? 
rsg() : b); - - if (old_to_hope) - o2hope+=o.ModelHopeGradient(); - if (fear_to_hope) - fear2hope+=o.FearHopeGradient(); - } - double N=(double)oracle_batch; - if (old_to_hope) { - o2hope/=N; - directions.push_back(o2hope); - } - if (fear_to_hope) { - fear2hope/=N; - directions.push_back(fear2hope); - } - } - } -}; - -int main(int argc, char** argv) { - oracle_directions od; - return od.main(argc,argv); -} diff --git a/vest/tac.pl b/vest/tac.pl deleted file mode 100755 index 9fb525c1..00000000 --- a/vest/tac.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - chomp; - $|=1; - print (scalar reverse($_)); - print "\n"; -} diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc deleted file mode 100644 index 9fcf75a0..00000000 --- a/vest/viterbi_envelope.cc +++ /dev/null @@ -1,177 +0,0 @@ -#include "viterbi_envelope.h" - -#include <cassert> -#include <limits> - -using namespace std; -using boost::shared_ptr; - -ostream& operator<<(ostream& os, const ViterbiEnvelope& env) { - os << '<'; - const vector<shared_ptr<Segment> >& segs = env.GetSortedSegs(); - for (int i = 0; i < segs.size(); ++i) - os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2; - return os << '>'; -} - -ViterbiEnvelope::ViterbiEnvelope(int i) { - if (i == 0) { - // do nothing - <> - } else if (i == 1) { - segs.push_back(shared_ptr<Segment>(new Segment(0, 0, 0, shared_ptr<Segment>(), shared_ptr<Segment>()))); - assert(this->IsMultiplicativeIdentity()); - } else { - cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n"; - abort(); - } -} - -struct SlopeCompare { - bool operator() (const shared_ptr<Segment>& a, const shared_ptr<Segment>& b) const { - return a->m < b->m; - } -}; - -const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) { - if (!other.is_sorted) other.Sort(); - if (segs.empty()) { - segs = other.segs; - return *this; - } - is_sorted = false; - int j = segs.size(); - segs.resize(segs.size() + other.segs.size()); - for (int i = 0; i < other.segs.size(); ++i) - segs[j++] = other.segs[i]; - assert(j == segs.size()); - return *this; -} - -void ViterbiEnvelope::Sort() const { - sort(segs.begin(), segs.end(), SlopeCompare()); - const int k = segs.size(); - int j = 0; - for (int i = 0; i < k; ++i) { - Segment l = *segs[i]; - l.x = kMinusInfinity; - // cerr << "m=" << l.m << endl; - if (0 < j) { - if (segs[j-1]->m == l.m) { // lines are parallel - if (l.b <= segs[j-1]->b) continue; - --j; - } - while(0 < j) { - l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m); - if (segs[j-1]->x < l.x) break; - --j; - } - if (0 == j) l.x = kMinusInfinity; - } - *segs[j++] = l; - } - segs.resize(j); - is_sorted = true; -} - -const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) { - if (other.IsMultiplicativeIdentity()) { return *this; } - if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } - - if (!is_sorted) Sort(); - if (!other.is_sorted) other.Sort(); - - if (this->IsEdgeEnvelope()) { -// if (other.size() > 1) -// cerr << *this << " (TIMES) " << other << endl; - shared_ptr<Segment> edge_parent = segs[0]; - const double& edge_b = edge_parent->b; - const double& edge_m = edge_parent->m; - segs.clear(); - for (int i = 0; i < other.segs.size(); ++i) { - const Segment& seg = *other.segs[i]; - const double m = seg.m + edge_m; - const double b = seg.b + edge_b; - const double& x = seg.x; // x's don't change with * - 
segs.push_back(shared_ptr<Segment>(new Segment(x, m, b, edge_parent, other.segs[i]))); - assert(segs.back()->p1->edge); - } -// if (other.size() > 1) -// cerr << " = " << *this << endl; - } else { - vector<shared_ptr<Segment> > new_segs; - int this_i = 0; - int other_i = 0; - const int this_size = segs.size(); - const int other_size = other.segs.size(); - double cur_x = kMinusInfinity; // moves from left to right across the - // real numbers, stopping for all inter- - // sections - double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity); - double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity); - while (this_i < this_size && other_i < other_size) { - const Segment& this_seg = *segs[this_i]; - const Segment& other_seg= *other.segs[other_i]; - const double m = this_seg.m + other_seg.m; - const double b = this_seg.b + other_seg.b; - - new_segs.push_back(shared_ptr<Segment>(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i]))); - int comp = 0; - if (this_next_val < other_next_val) comp = -1; else - if (this_next_val > other_next_val) comp = 1; - if (0 == comp) { // the next values are equal, advance both indices - ++this_i; - ++other_i; - cur_x = this_next_val; // could be other_next_val (they're equal!) - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } else { // advance the i with the lower x, update cur_x - if (-1 == comp) { - ++this_i; - cur_x = this_next_val; - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - } else { - ++other_i; - cur_x = other_next_val; - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } - } - } - segs.swap(new_segs); - } - //cerr << "Multiply: result=" << (*this) << endl; - return *this; -} - -// recursively construct translation -void Segment::ConstructTranslation(vector<WordID>* trans) const { - const Segment* cur = this; - vector<vector<WordID> > ant_trans; - while(!cur->edge) { - ant_trans.resize(ant_trans.size() + 1); - cur->p2->ConstructTranslation(&ant_trans.back()); - cur = cur->p1.get(); - } - size_t ant_size = ant_trans.size(); - vector<const vector<WordID>*> pants(ant_size); - assert(ant_size == cur->edge->tail_nodes_.size()); - --ant_size; - for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; - cur->edge->rule_->ESubstitute(pants, trans); -} - -void Segment::CollectEdgesUsed(std::vector<bool>* edges_used) const { - if (edge) { - assert(edge->id_ < edges_used->size()); - (*edges_used)[edge->id_] = true; - } - if (p1) p1->CollectEdgesUsed(edges_used); - if (p2) p2->CollectEdgesUsed(edges_used); -} - -ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const { - const double m = direction.dot(e.feature_values_); - const double b = origin.dot(e.feature_values_); - Segment* seg = new Segment(m, b, e); - return ViterbiEnvelope(1, seg); -} - diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h deleted file mode 100644 index 60ad82d8..00000000 --- a/vest/viterbi_envelope.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef _VITERBI_ENVELOPE_H_ -#define _VITERBI_ENVELOPE_H_ - -#include <vector> -#include <iostream> -#include <boost/shared_ptr.hpp> - -#include "hg.h" -#include "sparse_vector.h" - -static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); -static const double kPlusInfinity = std::numeric_limits<double>::infinity(); - -struct Segment { - Segment() : 
x(), m(), b(), edge() {} - Segment(double _m, double _b) : - x(kMinusInfinity), m(_m), b(_b), edge() {} - Segment(double _x, double _m, double _b, const boost::shared_ptr<Segment>& p1_, const boost::shared_ptr<Segment>& p2_) : - x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} - Segment(double _m, double _b, const Hypergraph::Edge& edge) : - x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} - - double x; // x intersection with previous segment in env, or -inf if none - double m; // this line's slope - double b; // intercept with y-axis - - // we keep a pointer to the "parents" of this segment so we can reconstruct - // the Viterbi translation corresponding to this segment - boost::shared_ptr<Segment> p1; - boost::shared_ptr<Segment> p2; - - // only Segments created from an edge using the ViterbiEnvelopeWeightFunction - // have rules - // TRulePtr rule; - const Hypergraph::Edge* edge; - - // recursively recover the Viterbi translation that will result from setting - // the weights to origin + axis * x, where x is any value from this->x up - // until the next largest x in the containing ViterbiEnvelope - void ConstructTranslation(std::vector<WordID>* trans) const; - void CollectEdgesUsed(std::vector<bool>* edges_used) const; -}; - -// this is the semiring value type, -// it defines constructors for 0, 1, and the operations + and * -struct ViterbiEnvelope { - // create semiring zero - ViterbiEnvelope() : is_sorted(true) {} // zero - // for debugging: - ViterbiEnvelope(const std::vector<boost::shared_ptr<Segment> >& s) : segs(s) { Sort(); } - // create semiring 1 or 0 - explicit ViterbiEnvelope(int i); - ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr<Segment>(seg)) {} - const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other); - const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other); - bool IsMultiplicativeIdentity() const { - return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->edge) && (!segs[0]->p1) && (!segs[0]->p2); } - const std::vector<boost::shared_ptr<Segment> >& GetSortedSegs() const { - if (!is_sorted) Sort(); - return segs; - } - size_t size() const { return segs.size(); } - - private: - bool IsEdgeEnvelope() const { - return segs.size() == 1 && segs[0]->edge; } - void Sort() const; - mutable bool is_sorted; - mutable std::vector<boost::shared_ptr<Segment> > segs; -}; -std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env); - -struct ViterbiEnvelopeWeightFunction { - ViterbiEnvelopeWeightFunction(const SparseVector<double>& ori, - const SparseVector<double>& dir) : origin(ori), direction(dir) {} - ViterbiEnvelope operator()(const Hypergraph::Edge& e) const; - const SparseVector<double> origin; - const SparseVector<double> direction; -}; - -#endif diff --git a/word-aligner/stemmers/ar.pl b/word-aligner/stemmers/ar.pl new file mode 100755 index 00000000..c85e883a --- /dev/null +++ b/word-aligner/stemmers/ar.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while(<STDIN>) { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(.st|.n|Al)/) { $el+=2; } + if ($w =~ /^(y|t|n)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + 
push @out, $tw;
+ }
+ if ($vocab) {
+ die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+ print "$_ @out\n";
+ } else {
+ print "@out\n";
+ }
+}
+
diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl
new file mode 100755
index 00000000..3a4f5a45
--- /dev/null
+++ b/word-aligner/stemmers/ur.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT,":utf8");
+
+my $vocab = undef;
+if (scalar @ARGV > 0) {
+ die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
+ $vocab = 1;
+}
+
+my %dict;
+while(<STDIN>) {
+ chomp;
+ my @words = split /\s+/;
+ my @out = ();
+ for my $w (@words) {
+ my $tw = $dict{$w};
+ if (!defined $tw) {
+ my $el = 4;
+ if ($w =~ /^(al|Al)/) { $el++; }
+ if ($el > length($w)) { $el = length($w); }
+ $tw = substr $w, 0, $el;
+ $dict{$w} = $tw;
+ }
+ push @out, $tw;
+ }
+ if ($vocab) {
+ die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+ print "$_ @out\n";
+ } else {
+ print "@out\n";
+ }
+}
+
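
A note on the predictive distribution implemented by MFCR<Floors>::prob() in utils/mfcr.h above. Writing d for discount_, s for strength_, N for num_customers_, T for num_tables_, c_w and t_w for the customer and table counts of dish w (both zero if w is unseen), and P_0(w) = \sum_f \lambda_f P0_f(w) for the inner product of the per-floor base probabilities with the floor weights, the value returned by the code is

    \[ p(w) \;=\; \frac{c_w - d\,t_w + (d\,T + s)\,P_0(w)}{N + s} \]

i.e. the usual Pitman-Yor predictive probability, with the new-table mass split across floors according to \lambda.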
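
Similarly, for discount d > 0 the seating log-probability computed by log_crp_prob(d, s) above is, as a reading of the code, the Pitman-Yor exchangeable partition probability (plus the optional Beta density on d and Gamma density on s + d):

    \[ \log P \;=\; \log\Gamma(s) - \log\Gamma(s/d) + T\log d + \log\Gamma(s/d + T) - \log\Gamma(s + N) + \sum_{t} \bigl( \log\Gamma(c_t - d) - \log\Gamma(1 - d) \bigr) \]

where c_t is the number of customers at table t, and the \log\Gamma(s) - \log\Gamma(s/d) pair is skipped when s = 0, matching the if (strength) guard in the code.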
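
For readers of utils/mfcr_test.cc above, here is a minimal usage sketch of the same interface; all calls mirror the ones exercised by the test, but the dish id (7) and the base/mixture values are illustrative only:

    #include <iostream>
    #include <vector>

    #include "mfcr.h"
    #include "sampler.h"

    int main() {
      MT19937 rng;
      MFCR<2, int> crp(0.5, 3.0);      // 2 floors, discount = 0.5, strength = 3.0
      std::vector<double> p0s(2);      // P0_f(dish): base probability of the dish on each floor
      std::vector<double> lambdas(2);  // floor mixture weights (should sum to 1)
      p0s[0] = 0.5; p0s[1] = 0.5;
      lambdas[0] = 0.2; lambdas[1] = 0.8;
      for (int i = 0; i < 10; ++i)     // seat ten customers with dish 7
        crp.increment(7, p0s.begin(), lambdas.begin(), &rng);
      std::cout << "customers = " << crp.num_customers()
                << ", tables for dish 7 = " << crp.num_tables(7)
                << ", p(7) = " << crp.prob(7, p0s.begin(), lambdas.begin()) << std::endl;
      crp.decrement(7, &rng);          // remove one customer from a random table
      return 0;
    }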
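
The deleted utils/string_to.h describes its API in its header comment (string_to<X>, to_string, string_into) with a default stringstream implementation. A freestanding sketch of that default path, independent of the removed header (string_to_sketch is a hypothetical name, not the original function):

    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Parse a value of type To from s, requiring that the entire string is
    // consumed; throw on failure (mirrors the "complete" behavior described
    // in the deleted header).
    template <class To>
    To string_to_sketch(const std::string& s) {
      std::istringstream in(s);
      To x;
      char extra;
      if (!(in >> x) || (in >> extra))
        throw std::runtime_error("string_to: couldn't convert: " + s);
      return x;
    }
    // e.g. string_to_sketch<int>("42") returns 42; string_to_sketch<int>("42x") throws.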
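
The deleted vest/mbr_kbest.cc implements minimum Bayes risk selection over a k-best list K: with posterior scaling \alpha (the --scale flag) and BLEU as the gain, it outputs

    \[ \hat e \;=\; \arg\min_{e \in K} \sum_{e' \in K,\; e' \neq e} \bigl(1 - \mathrm{BLEU}(e', e)\bigr) \, \frac{p(e')^{\alpha}}{\sum_{e'' \in K} p(e'')^{\alpha}} \]

where BLEU(e', e) scores e' against e taken as the reference; for TER and AER the loss is flipped (since lower is better there), and --output_list reranks the whole list by this expected loss instead of printing only the minimizer.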
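
Finally, the heart of the deleted vest/viterbi_envelope.cc is ViterbiEnvelope::Sort(), which reduces a set of lines y = m*t + b (one per derivation, t being the step along the search direction) to their upper envelope. A self-contained sketch of the same sweep, under the simplifying assumption that a plain struct replaces the shared_ptr<Segment> machinery:

    #include <algorithm>
    #include <limits>
    #include <vector>

    struct Line { double x, m, b; };  // x: left edge of the interval where this line is on top

    struct BySlope {
      bool operator()(const Line& a, const Line& b) const { return a.m < b.m; }
    };

    // Keep only the lines forming the upper envelope; afterwards segs[i].x is
    // the point where segs[i] overtakes segs[i-1] (-inf for the first segment).
    void UpperEnvelope(std::vector<Line>& segs) {
      const double kMinusInfinity = -std::numeric_limits<double>::infinity();
      std::sort(segs.begin(), segs.end(), BySlope());
      int j = 0;
      for (int i = 0; i < static_cast<int>(segs.size()); ++i) {
        Line l = segs[i];
        l.x = kMinusInfinity;
        if (0 < j) {
          if (segs[j-1].m == l.m) {   // parallel lines: the higher intercept wins
            if (l.b <= segs[j-1].b) continue;
            --j;
          }
          while (0 < j) {             // pop predecessors that fall below l
            l.x = (l.b - segs[j-1].b) / (segs[j-1].m - l.m);
            if (segs[j-1].x < l.x) break;
            --j;
          }
          if (0 == j) l.x = kMinusInfinity;
        }
        segs[j++] = l;
      }
      segs.resize(j);
    }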