From 925087356b853e2099c1b60d8b757d7aa02121a9 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 2 Oct 2012 00:19:43 -0400
Subject: cdec cleanup, remove bayesian stuff, parsing stuff

---
 gi/clda/src/Makefile.am | 6 -
 gi/clda/src/ccrp.h | 291 --
 gi/clda/src/clda.cc | 148 -
 gi/clda/src/crp.h | 50 -
 gi/clda/src/slice_sampler.h | 191 -
 gi/clda/src/timer.h | 20 -
 gi/evaluation/conditional_entropy.py | 61 -
 gi/evaluation/confusion_matrix.py | 123 -
 gi/evaluation/entropy.py | 38 -
 gi/evaluation/extract_ccg_labels.py | 129 -
 gi/evaluation/tree.py | 485 ---
 gi/markov_al/Makefile.am | 6 -
 gi/markov_al/README | 2 -
 gi/markov_al/ml.cc | 470 ---
 gi/morf-segmentation/filter_docs.pl | 24 -
 gi/morf-segmentation/invalid_vocab.patterns | 6 -
 gi/morf-segmentation/linestripper.py | 40 -
 gi/morf-segmentation/morf-pipeline.pl | 486 ---
 gi/morf-segmentation/morfsegment.py | 50 -
 gi/morf-segmentation/morftrain.sh | 110 -
 gi/morf-segmentation/vocabextractor.sh | 40 -
 gi/pf/Makefile.am | 44 -
 gi/pf/README | 2 -
 gi/pf/align-lexonly-pyp.cc | 243 --
 gi/pf/align-tl.cc | 339 --
 gi/pf/backward.cc | 89 -
 gi/pf/backward.h | 33 -
 gi/pf/base_distributions.cc | 241 --
 gi/pf/base_distributions.h | 238 --
 gi/pf/bayes_lattice_score.cc | 309 --
 gi/pf/brat.cc | 543 ---
 gi/pf/cbgi.cc | 330 --
 gi/pf/cfg_wfst_composer.cc | 731 ----
 gi/pf/cfg_wfst_composer.h | 46 -
 gi/pf/conditional_pseg.h | 275 --
 gi/pf/condnaive.cc | 298 --
 gi/pf/corpus.cc | 62 -
 gi/pf/corpus.h | 19 -
 gi/pf/dpnaive.cc | 301 --
 gi/pf/guess-translits.pl | 72 -
 gi/pf/hpyp_tm.cc | 133 -
 gi/pf/hpyp_tm.h | 38 -
 gi/pf/itg.cc | 275 --
 gi/pf/learn_cfg.cc | 428 ---
 gi/pf/make-freq-bins.pl | 26 -
 gi/pf/mh_test.cc | 148 -
 gi/pf/monotonic_pseg.h | 89 -
 gi/pf/ngram_base.cc | 69 -
 gi/pf/ngram_base.h | 25 -
 gi/pf/nuisance_test.cc | 161 -
 gi/pf/os_phrase.h | 15 -
 gi/pf/pf.h | 84 -
 gi/pf/pf_test.cc | 148 -
 gi/pf/pfbrat.cc | 543 ---
 gi/pf/pfdist.cc | 598 ---
 gi/pf/pfdist.new.cc | 620 ---
 gi/pf/pfnaive.cc | 284 --
 gi/pf/poisson_uniform_word_model.h | 50 -
 gi/pf/pyp_lm.cc | 273 --
 gi/pf/pyp_tm.cc | 128 -
 gi/pf/pyp_tm.h | 36 -
 gi/pf/pyp_word_model.h | 61 -
 gi/pf/quasi_model2.h | 177 -
 gi/pf/reachability.cc | 74 -
 gi/pf/reachability.h | 34 -
 gi/pf/tied_resampler.h | 122 -
 gi/pf/tpf.cc | 99 -
 gi/pf/transliterations.cc | 334 --
 gi/pf/transliterations.h | 24 -
 gi/pf/unigrams.cc | 80 -
 gi/pf/unigrams.h | 69 -
 gi/pipeline/OLD.clsp.config | 9 -
 gi/pipeline/OLD.evaluation-pipeline.pl | 277 --
 gi/pipeline/backoff-pipe.pl | 215 --
 gi/pipeline/blacklight.config | 9 -
 gi/pipeline/clsp.config | 10 -
 gi/pipeline/evaluation-pipeline.pl | 364 --
 gi/pipeline/local-gi-pipeline.pl | 465 ---
 gi/pipeline/lticluster.config | 9 -
 gi/pipeline/scripts/filter-by-f.pl | 56 -
 gi/pipeline/scripts/patch-corpus.pl | 65 -
 gi/pipeline/scripts/refilter.pl | 40 -
 gi/pipeline/scripts/rekey.pl | 8 -
 gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 -
 gi/pipeline/scripts/remove-tags-from-corpus.pl | 44 -
 gi/pipeline/scripts/sort-by-key.sh | 5 -
 gi/pipeline/scripts/xfeats.pl | 39 -
 gi/pipeline/valhalla.config | 9 -
 gi/posterior-regularisation/Corpus.java | 167 -
 gi/posterior-regularisation/Lexicon.java | 32 -
 .../PhraseContextModel.java | 466 ---
 gi/posterior-regularisation/README | 3 -
 gi/posterior-regularisation/alphabet.hh | 61 -
 gi/posterior-regularisation/canned.concordance | 4 -
 gi/posterior-regularisation/em.cc | 830 ----
 gi/posterior-regularisation/invert.hh | 45 -
 gi/posterior-regularisation/linesearch.py | 58 -
 gi/posterior-regularisation/log_add.hh | 30 -
 gi/posterior-regularisation/prjava.jar | 1 -
 gi/posterior-regularisation/prjava/Makefile | 8 -
 gi/posterior-regularisation/prjava/build.xml | 38 -
 .../prjava/lib/commons-math-2.1.jar | Bin 832410 -> 0 bytes
 .../prjava/lib/jopt-simple-3.2.jar | Bin 53244 -> 0 bytes
 .../prjava/lib/trove-2.0.2.jar | Bin 737844 -> 0 bytes
 gi/posterior-regularisation/prjava/src/arr/F.java | 99 -
 .../prjava/src/data/Corpus.java | 233 --
 .../prjava/src/hmm/HMM.java | 579 ---
 .../prjava/src/hmm/HMMObjective.java | 351 --
 .../prjava/src/hmm/POS.java | 120 -
 .../prjava/src/io/FileUtil.java | 48 -
 .../prjava/src/io/SerializedObjects.java | 83 -
 .../examples/GeneralizedRosenbrock.java | 110 -
 .../prjava/src/optimization/examples/x2y2.java | 128 -
 .../optimization/examples/x2y2WithConstraints.java | 127 -
 .../AbstractGradientBaseMethod.java | 120 -
 .../gradientBasedMethods/ConjugateGradient.java | 92 -
 .../gradientBasedMethods/DebugHelpers.java | 65 -
 .../gradientBasedMethods/GradientDescent.java | 19 -
 .../optimization/gradientBasedMethods/LBFGS.java | 234 --
 .../gradientBasedMethods/Objective.java | 87 -
 .../gradientBasedMethods/Optimizer.java | 19 -
 .../ProjectedAbstractGradientBaseMethod.java | 11 -
 .../ProjectedGradientDescent.java | 154 -
 .../gradientBasedMethods/ProjectedObjective.java | 29 -
 .../gradientBasedMethods/ProjectedOptimizer.java | 10 -
 .../gradientBasedMethods/stats/OptimizerStats.java | 86 -
 .../stats/ProjectedOptimizerStats.java | 70 -
 .../linesearch/ArmijoLineSearchMinimization.java | 102 -
 ...joLineSearchMinimizationAlongProjectionArc.java | 141 -
 .../DifferentiableLineSearchObjective.java | 185 -
 .../linesearch/GenericPickFirstStep.java | 20 -
 .../linesearch/InterpolationPickFirstStep.java | 25 -
 .../optimization/linesearch/LineSearchMethod.java | 14 -
 .../NonNewtonInterpolationPickFirstStep.java | 33 -
 ...ProjectedDifferentiableLineSearchObjective.java | 137 -
 .../linesearch/WolfRuleLineSearch.java | 300 --
 .../optimization/linesearch/WolfeConditions.java | 45 -
 .../optimization/projections/BoundsProjection.java | 104 -
 .../src/optimization/projections/Projection.java | 72 -
 .../projections/SimplexProjection.java | 127 -
 .../stopCriteria/CompositeStopingCriteria.java | 33 -
 .../optimization/stopCriteria/GradientL2Norm.java | 30 -
 .../stopCriteria/NormalizedGradientL2Norm.java | 48 -
 .../NormalizedProjectedGradientL2Norm.java | 60 -
 .../stopCriteria/NormalizedValueDifference.java | 54 -
 .../stopCriteria/ProjectedGradientL2Norm.java | 51 -
 .../optimization/stopCriteria/StopingCriteria.java | 8 -
 .../optimization/stopCriteria/ValueDifference.java | 41 -
 .../src/optimization/util/Interpolation.java | 37 -
 .../prjava/src/optimization/util/Logger.java | 7 -
 .../prjava/src/optimization/util/MathUtils.java | 339 --
 .../prjava/src/optimization/util/MatrixOutput.java | 28 -
 .../prjava/src/optimization/util/StaticTools.java | 180 -
 .../prjava/src/phrase/Agree.java | 204 -
 .../prjava/src/phrase/Agree2Sides.java | 197 -
 .../prjava/src/phrase/C2F.java | 216 --
 .../prjava/src/phrase/Corpus.java | 288 --
 .../prjava/src/phrase/Lexicon.java | 34 -
 .../prjava/src/phrase/PhraseCluster.java | 540 ---
 .../prjava/src/phrase/PhraseContextObjective.java | 436 ---
 .../prjava/src/phrase/PhraseCorpus.java | 193 -
 .../prjava/src/phrase/PhraseObjective.java | 224 --
 .../prjava/src/phrase/Trainer.java | 257 --
 .../prjava/src/phrase/VB.java | 419 --
 .../prjava/src/test/CorpusTest.java | 60 -
 .../prjava/src/test/HMMModelStats.java | 105 -
 .../prjava/src/test/IntDoublePair.java | 23 -
 .../prjava/src/test/X2y2WithConstraints.java | 131 -
 .../prjava/src/util/Array.java | 41 -
 .../prjava/src/util/ArrayMath.java | 186 -
 .../prjava/src/util/DifferentiableObjective.java | 14 -
 .../prjava/src/util/DigammaFunction.java | 21 -
 .../prjava/src/util/FileSystem.java | 21 -
 .../prjava/src/util/InputOutput.java | 67 -
 .../prjava/src/util/LogSummer.java | 86 -
 .../prjava/src/util/MathUtil.java | 148 -
 .../prjava/src/util/Matrix.java | 16 -
 .../prjava/src/util/MemoryTracker.java | 47 -
 .../prjava/src/util/Pair.java | 31 -
 .../prjava/src/util/Printing.java | 158 -
 .../prjava/src/util/Sorters.java | 39 -
 .../prjava/train-PR-cluster.sh | 4 -
 gi/posterior-regularisation/projected_gradient.cc | 87 -
 gi/posterior-regularisation/simplex_pg.py | 55 -
 gi/posterior-regularisation/split-languages.py | 23 -
 gi/posterior-regularisation/train_pr_agree.py | 400 --
 gi/posterior-regularisation/train_pr_global.py | 296 --
 gi/posterior-regularisation/train_pr_parallel.py | 333 --
 gi/pyp-topics/scripts/contexts2documents.py | 37 -
 gi/pyp-topics/scripts/extract_contexts.py | 144 -
 gi/pyp-topics/scripts/extract_contexts_test.py | 72 -
 gi/pyp-topics/scripts/extract_leaves.py | 49 -
 gi/pyp-topics/scripts/map-documents.py | 20 -
 gi/pyp-topics/scripts/map-terms.py | 20 -
 gi/pyp-topics/scripts/run.sh | 13 -
 gi/pyp-topics/scripts/score-mkcls.py | 61 -
 gi/pyp-topics/scripts/score-topics.py | 64 -
 gi/pyp-topics/scripts/spans2labels.py | 137 -
 gi/pyp-topics/scripts/tokens2classes.py | 27 -
 gi/pyp-topics/scripts/topics.py | 20 -
 gi/pyp-topics/src/Makefile.am | 16 -
 gi/pyp-topics/src/Makefile.mpi | 26 -
 gi/pyp-topics/src/clock_gettime_stub.c | 141 -
 gi/pyp-topics/src/contexts_corpus.cc | 164 -
 gi/pyp-topics/src/contexts_corpus.hh | 90 -
 gi/pyp-topics/src/contexts_lexer.h | 22 -
 gi/pyp-topics/src/contexts_lexer.l | 113 -
 gi/pyp-topics/src/corpus.cc | 104 -
 gi/pyp-topics/src/corpus.hh | 133 -
 gi/pyp-topics/src/gammadist.c | 247 --
 gi/pyp-topics/src/gammadist.h | 72 -
 gi/pyp-topics/src/gzstream.cc | 165 -
 gi/pyp-topics/src/gzstream.hh | 121 -
 gi/pyp-topics/src/log_add.h | 30 -
 gi/pyp-topics/src/macros.Linux | 18 -
 gi/pyp-topics/src/makefile.darwin | 15 -
 gi/pyp-topics/src/makefile.depend | 4042 --------------------
 gi/pyp-topics/src/mpi-corpus.hh | 69 -
 gi/pyp-topics/src/mpi-pyp-topics.cc | 466 ---
 gi/pyp-topics/src/mpi-pyp-topics.hh | 106 -
 gi/pyp-topics/src/mpi-pyp.hh | 447 ---
 gi/pyp-topics/src/mpi-train-contexts.cc | 201 -
 gi/pyp-topics/src/mt19937ar.c | 194 -
 gi/pyp-topics/src/mt19937ar.h | 44 -
 gi/pyp-topics/src/pyp-topics.cc | 499 ---
 gi/pyp-topics/src/pyp-topics.hh | 98 -
 gi/pyp-topics/src/pyp.hh | 566 ---
 gi/pyp-topics/src/slice-sampler.h | 192 -
 gi/pyp-topics/src/timing.h | 37 -
 gi/pyp-topics/src/train-contexts.cc | 174 -
 gi/pyp-topics/src/train.cc | 135 -
 gi/pyp-topics/src/utility.h | 962 -----
 gi/pyp-topics/src/workers.hh | 275 --
 gi/scripts/buck2utf8.pl | 87 -
 234 files changed, 36886 deletions(-)
 delete mode 100644 gi/clda/src/Makefile.am
 delete mode 100644 gi/clda/src/ccrp.h
 delete mode 100644 gi/clda/src/clda.cc
 delete mode 100644 gi/clda/src/crp.h
 delete mode 100644 gi/clda/src/slice_sampler.h
 delete mode 100644 gi/clda/src/timer.h
 delete mode 100644 gi/evaluation/conditional_entropy.py
 delete mode 100644 gi/evaluation/confusion_matrix.py
 delete mode 100644 gi/evaluation/entropy.py
 delete mode 100644 gi/evaluation/extract_ccg_labels.py
 delete mode 100644 gi/evaluation/tree.py
 delete mode 100644 gi/markov_al/Makefile.am
 delete mode 100644 gi/markov_al/README
 delete mode 100644 gi/markov_al/ml.cc
 delete mode 100755 gi/morf-segmentation/filter_docs.pl
 delete mode 100644 gi/morf-segmentation/invalid_vocab.patterns
 delete mode 100755 gi/morf-segmentation/linestripper.py
 delete mode 100755 gi/morf-segmentation/morf-pipeline.pl
 delete mode 100755 gi/morf-segmentation/morfsegment.py
 delete mode 100755 gi/morf-segmentation/morftrain.sh
 delete mode 100755 gi/morf-segmentation/vocabextractor.sh
 delete mode 100644 gi/pf/Makefile.am
 delete mode 100644 gi/pf/README
 delete mode 100644 gi/pf/align-lexonly-pyp.cc
 delete mode 100644 gi/pf/align-tl.cc
 delete mode 100644 gi/pf/backward.cc
 delete mode 100644 gi/pf/backward.h
 delete mode 100644 gi/pf/base_distributions.cc
 delete mode 100644 gi/pf/base_distributions.h
 delete mode 100644 gi/pf/bayes_lattice_score.cc
 delete mode 100644 gi/pf/brat.cc
 delete mode 100644 gi/pf/cbgi.cc
 delete mode 100644 gi/pf/cfg_wfst_composer.cc
 delete mode 100644 gi/pf/cfg_wfst_composer.h
 delete mode 100644 gi/pf/conditional_pseg.h
 delete mode 100644 gi/pf/condnaive.cc
 delete mode 100644 gi/pf/corpus.cc
 delete mode 100644 gi/pf/corpus.h
 delete mode 100644 gi/pf/dpnaive.cc
 delete mode 100755 gi/pf/guess-translits.pl
 delete mode 100644 gi/pf/hpyp_tm.cc
 delete mode 100644 gi/pf/hpyp_tm.h
 delete mode 100644 gi/pf/itg.cc
 delete mode 100644 gi/pf/learn_cfg.cc
 delete mode 100755 gi/pf/make-freq-bins.pl
 delete mode 100644 gi/pf/mh_test.cc
 delete mode 100644 gi/pf/monotonic_pseg.h
 delete mode 100644 gi/pf/ngram_base.cc
 delete mode 100644 gi/pf/ngram_base.h
 delete mode 100644 gi/pf/nuisance_test.cc
 delete mode 100644 gi/pf/os_phrase.h
 delete mode 100644 gi/pf/pf.h
 delete mode 100644 gi/pf/pf_test.cc
 delete mode 100644 gi/pf/pfbrat.cc
 delete mode 100644 gi/pf/pfdist.cc
 delete mode 100644 gi/pf/pfdist.new.cc
 delete mode 100644 gi/pf/pfnaive.cc
 delete mode 100644 gi/pf/poisson_uniform_word_model.h
 delete mode 100644 gi/pf/pyp_lm.cc
 delete mode 100644 gi/pf/pyp_tm.cc
 delete mode 100644 gi/pf/pyp_tm.h
 delete mode 100644 gi/pf/pyp_word_model.h
 delete mode 100644 gi/pf/quasi_model2.h
 delete mode 100644 gi/pf/reachability.cc
 delete mode 100644 gi/pf/reachability.h
 delete mode 100644 gi/pf/tied_resampler.h
 delete mode 100644 gi/pf/tpf.cc
 delete mode 100644 gi/pf/transliterations.cc
 delete mode 100644 gi/pf/transliterations.h
 delete mode 100644 gi/pf/unigrams.cc
 delete mode 100644 gi/pf/unigrams.h
 delete mode 100644 gi/pipeline/OLD.clsp.config
 delete mode 100755 gi/pipeline/OLD.evaluation-pipeline.pl
 delete mode 100644 gi/pipeline/backoff-pipe.pl
 delete mode 100644 gi/pipeline/blacklight.config
 delete mode 100644 gi/pipeline/clsp.config
 delete mode 100755 gi/pipeline/evaluation-pipeline.pl
 delete mode 100755 gi/pipeline/local-gi-pipeline.pl
 delete mode 100644 gi/pipeline/lticluster.config
 delete mode 100755 gi/pipeline/scripts/filter-by-f.pl
 delete mode 100755 gi/pipeline/scripts/patch-corpus.pl
 delete mode 100755 gi/pipeline/scripts/refilter.pl
 delete mode 100755 gi/pipeline/scripts/rekey.pl
 delete mode 100755 gi/pipeline/scripts/remove-tags-from-contexts.pl
 delete mode 100755 gi/pipeline/scripts/remove-tags-from-corpus.pl
 delete mode 100755 gi/pipeline/scripts/sort-by-key.sh
 delete mode 100755 gi/pipeline/scripts/xfeats.pl
 delete mode 100644 gi/pipeline/valhalla.config
 delete mode 100644 gi/posterior-regularisation/Corpus.java
 delete mode 100644 gi/posterior-regularisation/Lexicon.java
 delete mode 100644 gi/posterior-regularisation/PhraseContextModel.java
 delete mode 100644 gi/posterior-regularisation/README
 delete mode 100644 gi/posterior-regularisation/alphabet.hh
 delete mode 100644 gi/posterior-regularisation/canned.concordance
 delete mode 100644 gi/posterior-regularisation/em.cc
 delete mode 100644 gi/posterior-regularisation/invert.hh
 delete mode 100644 gi/posterior-regularisation/linesearch.py
 delete mode 100644 gi/posterior-regularisation/log_add.hh
 delete mode 120000 gi/posterior-regularisation/prjava.jar
 delete mode 100755 gi/posterior-regularisation/prjava/Makefile
 delete mode 100644 gi/posterior-regularisation/prjava/build.xml
 delete mode 100644 gi/posterior-regularisation/prjava/lib/commons-math-2.1.jar
 delete mode 100644 gi/posterior-regularisation/prjava/lib/jopt-simple-3.2.jar
 delete mode 100644 gi/posterior-regularisation/prjava/lib/trove-2.0.2.jar
 delete mode 100644 gi/posterior-regularisation/prjava/src/arr/F.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/data/Corpus.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/hmm/HMM.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/hmm/HMMObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/hmm/POS.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/io/FileUtil.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/io/SerializedObjects.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/examples/GeneralizedRosenbrock.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/examples/x2y2.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/examples/x2y2WithConstraints.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/AbstractGradientBaseMethod.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/ConjugateGradient.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/DebugHelpers.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/GradientDescent.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/LBFGS.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/Objective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/Optimizer.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/ProjectedAbstractGradientBaseMethod.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/ProjectedGradientDescent.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/ProjectedObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/ProjectedOptimizer.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/stats/OptimizerStats.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/gradientBasedMethods/stats/ProjectedOptimizerStats.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/ArmijoLineSearchMinimization.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/ArmijoLineSearchMinimizationAlongProjectionArc.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/DifferentiableLineSearchObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/GenericPickFirstStep.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/InterpolationPickFirstStep.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/LineSearchMethod.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/NonNewtonInterpolationPickFirstStep.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/ProjectedDifferentiableLineSearchObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/WolfRuleLineSearch.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/linesearch/WolfeConditions.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/projections/BoundsProjection.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/projections/Projection.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/projections/SimplexProjection.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/CompositeStopingCriteria.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/GradientL2Norm.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/NormalizedGradientL2Norm.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/NormalizedProjectedGradientL2Norm.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/NormalizedValueDifference.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/ProjectedGradientL2Norm.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/StopingCriteria.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/stopCriteria/ValueDifference.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/util/Interpolation.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/util/Logger.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/util/MathUtils.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/util/MatrixOutput.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/optimization/util/StaticTools.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Agree.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/C2F.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Corpus.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Lexicon.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseContextObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Trainer.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/VB.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/test/CorpusTest.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/test/HMMModelStats.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/test/IntDoublePair.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/test/X2y2WithConstraints.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/Array.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/ArrayMath.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/DifferentiableObjective.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/DigammaFunction.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/FileSystem.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/InputOutput.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/LogSummer.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/MathUtil.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/Matrix.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/MemoryTracker.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/Pair.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/Printing.java
 delete mode 100644 gi/posterior-regularisation/prjava/src/util/Sorters.java
 delete mode 100755 gi/posterior-regularisation/prjava/train-PR-cluster.sh
 delete mode 100644 gi/posterior-regularisation/projected_gradient.cc
 delete mode 100644 gi/posterior-regularisation/simplex_pg.py
 delete mode 100755 gi/posterior-regularisation/split-languages.py
 delete mode 100644 gi/posterior-regularisation/train_pr_agree.py
 delete mode 100644 gi/posterior-regularisation/train_pr_global.py
 delete mode 100644 gi/posterior-regularisation/train_pr_parallel.py
 delete mode 100755 gi/pyp-topics/scripts/contexts2documents.py
 delete mode 100755 gi/pyp-topics/scripts/extract_contexts.py
 delete mode 100755 gi/pyp-topics/scripts/extract_contexts_test.py
 delete mode 100755 gi/pyp-topics/scripts/extract_leaves.py
 delete mode 100755 gi/pyp-topics/scripts/map-documents.py
 delete mode 100755 gi/pyp-topics/scripts/map-terms.py
 delete mode 100644 gi/pyp-topics/scripts/run.sh
 delete mode 100755 gi/pyp-topics/scripts/score-mkcls.py
 delete mode 100755 gi/pyp-topics/scripts/score-topics.py
 delete mode 100755 gi/pyp-topics/scripts/spans2labels.py
 delete mode 100755 gi/pyp-topics/scripts/tokens2classes.py
 delete mode 100755 gi/pyp-topics/scripts/topics.py
 delete mode 100644 gi/pyp-topics/src/Makefile.am
 delete mode 100644 gi/pyp-topics/src/Makefile.mpi
 delete mode 100644 gi/pyp-topics/src/clock_gettime_stub.c
 delete mode 100644 gi/pyp-topics/src/contexts_corpus.cc
 delete mode 100644 gi/pyp-topics/src/contexts_corpus.hh
 delete mode 100644 gi/pyp-topics/src/contexts_lexer.h
 delete mode 100644 gi/pyp-topics/src/contexts_lexer.l
 delete mode 100644 gi/pyp-topics/src/corpus.cc
 delete mode 100644 gi/pyp-topics/src/corpus.hh
 delete mode 100644 gi/pyp-topics/src/gammadist.c
 delete mode 100644 gi/pyp-topics/src/gammadist.h
 delete mode 100644 gi/pyp-topics/src/gzstream.cc
 delete mode 100644 gi/pyp-topics/src/gzstream.hh
 delete mode 100644 gi/pyp-topics/src/log_add.h
 delete mode 100644 gi/pyp-topics/src/macros.Linux
 delete mode 100644 gi/pyp-topics/src/makefile.darwin
 delete mode 100644 gi/pyp-topics/src/makefile.depend
 delete mode 100644 gi/pyp-topics/src/mpi-corpus.hh
 delete mode 100644 gi/pyp-topics/src/mpi-pyp-topics.cc
 delete mode 100644 gi/pyp-topics/src/mpi-pyp-topics.hh
 delete mode 100644 gi/pyp-topics/src/mpi-pyp.hh
 delete mode 100644 gi/pyp-topics/src/mpi-train-contexts.cc
 delete mode 100644 gi/pyp-topics/src/mt19937ar.c
 delete mode 100644 gi/pyp-topics/src/mt19937ar.h
 delete mode 100644 gi/pyp-topics/src/pyp-topics.cc
 delete mode 100644 gi/pyp-topics/src/pyp-topics.hh
 delete mode 100644 gi/pyp-topics/src/pyp.hh
 delete mode 100644 gi/pyp-topics/src/slice-sampler.h
 delete mode 100644 gi/pyp-topics/src/timing.h
 delete mode 100644 gi/pyp-topics/src/train-contexts.cc
 delete mode 100644 gi/pyp-topics/src/train.cc
 delete mode 100644 gi/pyp-topics/src/utility.h
 delete mode 100644 gi/pyp-topics/src/workers.hh
 delete mode 100755 gi/scripts/buck2utf8.pl

(limited to 'gi')

diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am
deleted file mode 100644
index cdca1f97..00000000
--- a/gi/clda/src/Makefile.am
+++ /dev/null
@@ -1,6 +0,0 @@
-bin_PROGRAMS = clda
-
-clda_SOURCES = clda.cc
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS)
-AM_LDFLAGS = $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/clda/src/ccrp.h b/gi/clda/src/ccrp.h
deleted file mode 100644
index a7c2825c..00000000
--- a/gi/clda/src/ccrp.h
+++ /dev/null
@@ -1,291 +0,0 @@
-#ifndef _CCRP_H_
-#define _CCRP_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "sampler.h"
-#include "slice_sampler.h"
-
-// Chinese restaurant process (Pitman-Yor parameters) with table tracking.
-
-template >
-class CCRP {
- public:
-  CCRP(double disc, double conc) :
-    num_tables_(),
-    num_customers_(),
-    discount_(disc),
-    concentration_(conc),
-    discount_prior_alpha_(std::numeric_limits::quiet_NaN()),
-    discount_prior_beta_(std::numeric_limits::quiet_NaN()),
-    concentration_prior_shape_(std::numeric_limits::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {}
-
-  CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) :
-    num_tables_(),
-    num_customers_(),
-    discount_(d),
-    concentration_(c),
-    discount_prior_alpha_(d_alpha),
-    discount_prior_beta_(d_beta),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
-
-  double discount() const { return discount_; }
-  double concentration() const { return concentration_; }
-
-  bool has_discount_prior() const {
-    return !std::isnan(discount_prior_alpha_);
-  }
-
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
-  }
-
-  void clear() {
-    num_tables_ = 0;
-    num_customers_ = 0;
-    dish_locs_.clear();
-  }
-
-  unsigned num_tables(const Dish& dish) const {
-    const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish);
-    if (it == dish_locs_.end()) return 0;
-    return it->second.table_counts_.size();
-  }
-
-  unsigned num_customers() const {
-    return num_customers_;
-  }
-
-  unsigned num_customers(const Dish& dish) const {
-    const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish);
-    if (it == dish_locs_.end()) return 0;
-    return it->total_dish_count_;
-  }
-
-  // returns +1 or 0 indicating whether a new table was opened
-  int increment(const Dish& dish, const double& p0, MT19937* rng) {
-    DishLocations& loc = dish_locs_[dish];
-    bool share_table = false;
-    if (loc.total_dish_count_) {
-      const double p_empty = (concentration_ + num_tables_ * discount_) * p0;
-      const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      share_table = rng->SelectSample(p_empty, p_share);
-    }
-    if (share_table) {
-      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
-      for (typename std::list::iterator ti = loc.table_counts_.begin();
-           ti != loc.table_counts_.end(); ++ti) {
-        r -= (*ti - discount_);
-        if (r <= 0.0) {
-          ++(*ti);
-          break;
-        }
-      }
-      if (r > 0.0) {
-        std::cerr << "Serious error: r=" << r << std::endl;
-
Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - if (loc.total_dish_count_ == 1) { - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - int delta = 0; - // sample customer to remove UNIFORMLY. that is, do NOT use the discount - // here. if you do, it will introduce (unwanted) bias! - double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= *ti; - if (r <= 0.0) { - if ((--(*ti)) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - --num_customers_; - return delta; - } - } - - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { - double lp = 0.0; - if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(*ti - discount) - r; - } - } - } else { - assert(!"not implemented yet"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng) { - assert(has_discount_prior() || has_concentration_prior()); - DiscountResampler dr(*this); - 
ConcentrationResampler cr(*this); - const int niterations = 10; - double gamma_upper = std::numeric_limits::infinity(); - for (int iter = 0; iter < 5; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - gamma_upper, 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), - 1.0, 0.0, niterations, 100*niterations); - } - } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - gamma_upper, 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); - } - }; - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map dish_locs_; - - double discount_; - double concentration_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; - double discount_prior_beta_; - - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; -}; - -template -std::ostream& operator<<(std::ostream& o, const CCRP& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc deleted file mode 100644 index f548997f..00000000 --- a/gi/clda/src/clda.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include -#include -#include - -#include "timer.h" -#include "crp.h" -#include "ccrp.h" -#include "sampler.h" -#include "tdict.h" -const size_t MAX_DOC_LEN_CHARS = 10000000; - -using namespace std; - -void ShowTopWordsForTopic(const map& counts) { - multimap ms; - for (map::const_iterator it = counts.begin(); it != counts.end(); ++it) - ms.insert(make_pair(it->second, it->first)); - int cc = 0; - for (multimap::reverse_iterator it = ms.rbegin(); it != ms.rend(); ++it) { - cerr << it->first << ':' << TD::Convert(it->second) << " "; - ++cc; - if (cc==20) break; - } - cerr << endl; -} - -int main(int argc, char** argv) { - if (argc != 3) { - cerr << "Usage: " << argv[0] << " num-classes num-samples\n"; - return 1; - } - const int num_classes = atoi(argv[1]); - const int num_iterations = atoi(argv[2]); - const 
int burnin_size = num_iterations * 0.9; - if (num_classes < 2) { - cerr << "Must request more than 1 class\n"; - return 1; - } - if (num_iterations < 5) { - cerr << "Must request more than 5 iterations\n"; - return 1; - } - cerr << "CLASSES: " << num_classes << endl; - char* buf = new char[MAX_DOC_LEN_CHARS]; - vector > wji; // w[j][i] - observed word i of doc j - vector > zji; // z[j][i] - topic assignment for word i of doc j - cerr << "READING DOCUMENTS\n"; - while(cin) { - cin.getline(buf, MAX_DOC_LEN_CHARS); - if (buf[0] == 0) continue; - wji.push_back(vector()); - TD::ConvertSentence(buf, &wji.back()); - } - cerr << "READ " << wji.size() << " DOCUMENTS\n"; - MT19937 rng; - cerr << "INITIALIZING RANDOM TOPIC ASSIGNMENTS\n"; - zji.resize(wji.size()); - double disc = 0.1; - double beta = 10.0; - double alpha = 50.0; - const double uniform_topic = 1.0 / num_classes; - const double uniform_word = 1.0 / TD::NumWords(); - vector > dr(zji.size(), CCRP(1,1,1,1,disc, beta)); // dr[i] describes the probability of using a topic in document i - vector > wr(num_classes, CCRP(1,1,1,1,disc, alpha)); // wr[k] describes the probability of generating a word in topic k - for (int j = 0; j < zji.size(); ++j) { - const size_t num_words = wji[j].size(); - vector& zj = zji[j]; - const vector& wj = wji[j]; - zj.resize(num_words); - for (int i = 0; i < num_words; ++i) { - int random_topic = rng.next() * num_classes; - if (random_topic == num_classes) { --random_topic; } - zj[i] = random_topic; - const int word = wj[i]; - dr[j].increment(random_topic, uniform_topic, &rng); - wr[random_topic].increment(word, uniform_word, &rng); - } - } - cerr << "SAMPLING\n"; - vector > t2w(num_classes); - Timer timer; - SampleSet ss; - ss.resize(num_classes); - double total_time = 0; - for (int iter = 0; iter < num_iterations; ++iter) { - cerr << '.'; - if (iter && iter % 10 == 0) { - total_time += timer.Elapsed(); - timer.Reset(); - double llh = 0; -#if 1 - for (int j = 0; j < dr.size(); ++j) - dr[j].resample_hyperparameters(&rng); - for (int j = 0; j < wr.size(); ++j) - wr[j].resample_hyperparameters(&rng); -#endif - - for (int j = 0; j < dr.size(); ++j) - llh += dr[j].log_crp_prob(); - for (int j = 0; j < wr.size(); ++j) - llh += wr[j].log_crp_prob(); - cerr << " [LLH=" << llh << " I=" << iter << "]\n"; - } - for (int j = 0; j < zji.size(); ++j) { - const size_t num_words = wji[j].size(); - vector& zj = zji[j]; - const vector& wj = wji[j]; - for (int i = 0; i < num_words; ++i) { - const int word = wj[i]; - const int cur_topic = zj[i]; - dr[j].decrement(cur_topic, &rng); - wr[cur_topic].decrement(word, &rng); - - for (int k = 0; k < num_classes; ++k) { - ss[k]= dr[j].prob(k, uniform_topic) * wr[k].prob(word, uniform_word); - } - const int new_topic = rng.SelectSample(ss); - dr[j].increment(new_topic, uniform_topic, &rng); - wr[new_topic].increment(word, uniform_word, &rng); - zj[i] = new_topic; - if (iter > burnin_size) { - ++t2w[cur_topic][word]; - } - } - } - } - for (int i = 0; i < num_classes; ++i) { - cerr << "---------------------------------\n"; - cerr << " final PYP(" << wr[i].discount() << "," << wr[i].concentration() << ")\n"; - ShowTopWordsForTopic(t2w[i]); - } - cerr << "-------------\n"; -#if 0 - for (int j = 0; j < zji.size(); ++j) { - const size_t num_words = wji[j].size(); - vector& zj = zji[j]; - const vector& wj = wji[j]; - zj.resize(num_words); - for (int i = 0; i < num_words; ++i) { - cerr << TD::Convert(wji[j][i]) << '(' << zj[i] << ") "; - } - cerr << endl; - } -#endif - return 0; -} - diff --git 
a/gi/clda/src/crp.h b/gi/clda/src/crp.h deleted file mode 100644 index 9d35857e..00000000 --- a/gi/clda/src/crp.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _CRP_H_ -#define _CRP_H_ - -// shamelessly adapted from code by Phil Blunsom and Trevor Cohn - -#include -#include - -#include "prob.h" - -template > -class CRP { - public: - CRP(double alpha) : alpha_(alpha), palpha_(alpha), total_customers_() {} - void increment(const DishType& dish); - void decrement(const DishType& dish); - void erase(const DishType& dish) { - counts_.erase(dish); - } - inline int count(const DishType& dish) const { - const typename MapType::const_iterator i = counts_.find(dish); - if (i == counts_.end()) return 0; else return i->second; - } - inline prob_t prob(const DishType& dish, const prob_t& p0) const { - return (prob_t(count(dish)) + palpha_ * p0) / prob_t(total_customers_ + alpha_); - } - private: - typedef std::tr1::unordered_map MapType; - MapType counts_; - const double alpha_; - const prob_t palpha_; - int total_customers_; -}; - -template -void CRP::increment(const Dish& dish) { - ++counts_[dish]; - ++total_customers_; -} - -template -void CRP::decrement(const Dish& dish) { - typename MapType::iterator i = counts_.find(dish); - assert(i != counts_.end()); - if (--i->second == 0) - counts_.erase(i); - --total_customers_; -} - -#endif diff --git a/gi/clda/src/slice_sampler.h b/gi/clda/src/slice_sampler.h deleted file mode 100644 index aa48a169..00000000 --- a/gi/clda/src/slice_sampler.h +++ /dev/null @@ -1,191 +0,0 @@ -//! slice-sampler.h is an MCMC slice sampler -//! -//! Mark Johnson, 1st August 2008 - -#ifndef SLICE_SAMPLER_H -#define SLICE_SAMPLER_H - -#include -#include -#include -#include -#include - -//! slice_sampler_rfc_type{} returns the value of a user-specified -//! function if the argument is within range, or - infinity otherwise -// -template -struct slice_sampler_rfc_type { - F min_x, max_x; - const Fn& f; - U max_nfeval, nfeval; - slice_sampler_rfc_type(F min_x, F max_x, const Fn& f, U max_nfeval) - : min_x(min_x), max_x(max_x), f(f), max_nfeval(max_nfeval), nfeval(0) { } - - F operator() (F x) { - if (min_x < x && x < max_x) { - assert(++nfeval <= max_nfeval); - F fx = f(x); - assert(std::isfinite(fx)); - return fx; - } - return -std::numeric_limits::infinity(); - } -}; // slice_sampler_rfc_type{} - -//! slice_sampler1d() implements the univariate "range doubling" slice sampler -//! described in Neal (2003) "Slice Sampling", The Annals of Statistics 31(3), 705-767. -// -template -F slice_sampler1d(const LogF& logF0, //!< log of function to sample - F x, //!< starting point - Uniform01& u01, //!< uniform [0,1) random number generator - F min_x = -std::numeric_limits::infinity(), //!< minimum value of support - F max_x = std::numeric_limits::infinity(), //!< maximum value of support - F w = 0.0, //!< guess at initial width - unsigned nsamples=1, //!< number of samples to draw - unsigned max_nfeval=200) //!< max number of function evaluations -{ - typedef unsigned U; - slice_sampler_rfc_type logF(min_x, max_x, logF0, max_nfeval); - - assert(std::isfinite(x)); - - if (w <= 0.0) { // set w to a default width - if (min_x > -std::numeric_limits::infinity() && max_x < std::numeric_limits::infinity()) - w = (max_x - min_x)/4; - else - w = std::max(((x < 0.0) ? -x : x)/4, (F) 0.1); - } - assert(std::isfinite(w)); - - F logFx = logF(x); - for (U sample = 0; sample < nsamples; ++sample) { - F logY = logFx + log(u01()+1e-100); //! 
slice logFx at this value - assert(std::isfinite(logY)); - - F xl = x - w*u01(); //! lower bound on slice interval - F logFxl = logF(xl); - F xr = xl + w; //! upper bound on slice interval - F logFxr = logF(xr); - - while (logY < logFxl || logY < logFxr) // doubling procedure - if (u01() < 0.5) - logFxl = logF(xl -= xr - xl); - else - logFxr = logF(xr += xr - xl); - - F xl1 = xl; - F xr1 = xr; - while (true) { // shrinking procedure - F x1 = xl1 + u01()*(xr1 - xl1); - if (logY < logF(x1)) { - F xl2 = xl; // acceptance procedure - F xr2 = xr; - bool d = false; - while (xr2 - xl2 > 1.1*w) { - F xm = (xl2 + xr2)/2; - if ((x < xm && x1 >= xm) || (x >= xm && x1 < xm)) - d = true; - if (x1 < xm) - xr2 = xm; - else - xl2 = xm; - if (d && logY >= logF(xl2) && logY >= logF(xr2)) - goto unacceptable; - } - x = x1; - goto acceptable; - } - goto acceptable; - unacceptable: - if (x1 < x) // rest of shrinking procedure - xl1 = x1; - else - xr1 = x1; - } - acceptable: - w = (4*w + (xr1 - xl1))/5; // update width estimate - } - return x; -} - -/* -//! slice_sampler1d() implements a 1-d MCMC slice sampler. -//! It should be correct for unimodal distributions, but -//! not for multimodal ones. -// -template -F slice_sampler1d(const LogP& logP, //!< log of distribution to sample - F x, //!< initial sample - Uniform01& u01, //!< uniform random number generator - F min_x = -std::numeric_limits::infinity(), //!< minimum value of support - F max_x = std::numeric_limits::infinity(), //!< maximum value of support - F w = 0.0, //!< guess at initial width - unsigned nsamples=1, //!< number of samples to draw - unsigned max_nfeval=200) //!< max number of function evaluations -{ - typedef unsigned U; - assert(std::isfinite(x)); - if (w <= 0.0) { - if (min_x > -std::numeric_limits::infinity() && max_x < std::numeric_limits::infinity()) - w = (max_x - min_x)/4; - else - w = std::max(((x < 0.0) ? 
-x : x)/4, 0.1); - } - // TRACE4(x, min_x, max_x, w); - F logPx = logP(x); - assert(std::isfinite(logPx)); - U nfeval = 1; - for (U sample = 0; sample < nsamples; ++sample) { - F x0 = x; - F logU = logPx + log(u01()+1e-100); - assert(std::isfinite(logU)); - F r = u01(); - F xl = std::max(min_x, x - r*w); - F xr = std::min(max_x, x + (1-r)*w); - // TRACE3(x, logPx, logU); - while (xl > min_x && logP(xl) > logU) { - xl -= w; - w *= 2; - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << std::endl; - assert(nfeval < max_nfeval); - } - xl = std::max(xl, min_x); - while (xr < max_x && logP(xr) > logU) { - xr += w; - w *= 2; - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xr = " << xr << std::endl; - assert(nfeval < max_nfeval); - } - xr = std::min(xr, max_x); - while (true) { - r = u01(); - x = r*xl + (1-r)*xr; - assert(std::isfinite(x)); - logPx = logP(x); - // TRACE4(logPx, x, xl, xr); - assert(std::isfinite(logPx)); - ++nfeval; - if (nfeval >= max_nfeval) - std::cerr << "## Error: nfeval = " << nfeval << ", max_nfeval = " << max_nfeval << ", sample = " << sample << ", nsamples = " << nsamples << ", r = " << r << ", w = " << w << ", xl = " << xl << ", xr = " << xr << ", x = " << x << std::endl; - assert(nfeval < max_nfeval); - if (logPx > logU) - break; - else if (x > x0) - xr = x; - else - xl = x; - } - // w = (4*w + (xr-xl))/5; // gradually adjust w - } - // TRACE2(logPx, x); - return x; -} // slice_sampler1d() -*/ - -#endif // SLICE_SAMPLER_H diff --git a/gi/clda/src/timer.h b/gi/clda/src/timer.h deleted file mode 100644 index 123d9a94..00000000 --- a/gi/clda/src/timer.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _TIMER_STATS_H_ -#define _TIMER_STATS_H_ - -#include - -struct Timer { - Timer() { Reset(); } - void Reset() { - start_t = clock(); - } - double Elapsed() const { - const clock_t end_t = clock(); - const double elapsed = (end_t - start_t) / 1000000.0; - return elapsed; - } - private: - std::clock_t start_t; -}; - -#endif diff --git a/gi/evaluation/conditional_entropy.py b/gi/evaluation/conditional_entropy.py deleted file mode 100644 index 356d3b1d..00000000 --- a/gi/evaluation/conditional_entropy.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python - -import sys, math, itertools, getopt - -def usage(): - print >>sys.stderr, 'Usage:', sys.argv[0], '[-s slash_threshold] input-1 input-2' - sys.exit(0) - -optlist, args = getopt.getopt(sys.argv[1:], 'hs:') -slash_threshold = None -for opt, arg in optlist: - if opt == '-s': - slash_threshold = int(arg) - else: - usage() -if len(args) != 2: - usage() - -ginfile = open(args[0]) -pinfile = open(args[1]) - -# evaluating: H(G | P) = sum_{g,p} p(g,p) log { p(p) / p(g,p) } -# = sum_{g,p} c(g,p)/N { log c(p) - log N - log c(g,p) + log N } -# = 1/N sum_{g,p} c(g,p) { log c(p) - log c(g,p) } -# where G = gold, P = predicted, N = number of events - -N = 0 -gold_frequencies = {} -predict_frequencies = {} -joint_frequencies = {} - -for gline, pline in itertools.izip(ginfile, pinfile): - gparts = gline.split('||| ')[1].split() - pparts = pline.split('||| ')[1].split() - assert len(gparts) == len(pparts) - - for gpart, ppart in zip(gparts, pparts): - gtag = gpart.split(':',1)[1] - ptag = 
ppart.split(':',1)[1] - - if slash_threshold == None or gtag.count('/') + gtag.count('\\') <= slash_threshold: - joint_frequencies.setdefault((gtag, ptag), 0) - joint_frequencies[gtag,ptag] += 1 - - predict_frequencies.setdefault(ptag, 0) - predict_frequencies[ptag] += 1 - - gold_frequencies.setdefault(gtag, 0) - gold_frequencies[gtag] += 1 - - N += 1 - -hg2p = 0 -hp2g = 0 -for (gtag, ptag), cgp in joint_frequencies.items(): - hp2g += cgp * (math.log(predict_frequencies[ptag], 2) - math.log(cgp, 2)) - hg2p += cgp * (math.log(gold_frequencies[gtag], 2) - math.log(cgp, 2)) -hg2p /= N -hp2g /= N - -print 'H(P|G)', hg2p, 'H(G|P)', hp2g, 'VI', hg2p + hp2g diff --git a/gi/evaluation/confusion_matrix.py b/gi/evaluation/confusion_matrix.py deleted file mode 100644 index 2dd7aa47..00000000 --- a/gi/evaluation/confusion_matrix.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python - -import sys, math, itertools, getopt - -def usage(): - print >>sys.stderr, 'Usage:', sys.argv[0], '[-s slash_threshold] [-p output] [-m] input-1 input-2' - sys.exit(0) - -optlist, args = getopt.getopt(sys.argv[1:], 'hs:mp:') -slash_threshold = None -output_fname = None -show_matrix = False -for opt, arg in optlist: - if opt == '-s': - slash_threshold = int(arg) - elif opt == '-p': - output_fname = arg - elif opt == '-m': - show_matrix = True - else: - usage() -if len(args) != 2 or (not show_matrix and not output_fname): - usage() - -ginfile = open(args[0]) -pinfile = open(args[1]) - -if output_fname: - try: - import Image, ImageDraw - except ImportError: - print >>sys.stderr, "Error: Python Image Library not available. Did you forget to set your PYTHONPATH environment variable?" - sys.exit(1) - -N = 0 -gold_frequencies = {} -predict_frequencies = {} -joint_frequencies = {} - -for gline, pline in itertools.izip(ginfile, pinfile): - gparts = gline.split('||| ')[1].split() - pparts = pline.split('||| ')[1].split() - assert len(gparts) == len(pparts) - - for gpart, ppart in zip(gparts, pparts): - gtag = gpart.split(':',1)[1] - ptag = ppart.split(':',1)[1] - - if slash_threshold == None or gtag.count('/') + gtag.count('\\') <= slash_threshold: - joint_frequencies.setdefault((gtag, ptag), 0) - joint_frequencies[gtag,ptag] += 1 - - predict_frequencies.setdefault(ptag, 0) - predict_frequencies[ptag] += 1 - - gold_frequencies.setdefault(gtag, 0) - gold_frequencies[gtag] += 1 - - N += 1 - -# find top tags -gtags = gold_frequencies.items() -gtags.sort(lambda x,y: x[1]-y[1]) -gtags.reverse() -#gtags = gtags[:50] - -preds = predict_frequencies.items() -preds.sort(lambda x,y: x[1]-y[1]) -preds.reverse() - -if show_matrix: - print '%7s %7s' % ('pred', 'cnt'), - for gtag, gcount in gtags: print '%7s' % gtag, - print - print '=' * 80 - - for ptag, pcount in preds: - print '%7s %7d' % (ptag, pcount), - for gtag, gcount in gtags: - print '%7d' % joint_frequencies.get((gtag, ptag), 0), - print - - print '%7s %7d' % ('total', N), - for gtag, gcount in gtags: print '%7d' % gcount, - print - -if output_fname: - offset=10 - - image = Image.new("RGB", (len(preds), len(gtags)), (255, 255, 255)) - #hsl(hue, saturation%, lightness%) - - # re-sort preds to get a better diagonal - ptags=[] - if True: - ptags = map(lambda (p,c): p, preds) - else: - remaining = set(predict_frequencies.keys()) - for y, (gtag, gcount) in enumerate(gtags): - best = (None, 0) - for ptag in remaining: - #pcount = predict_frequencies[ptag] - p = joint_frequencies.get((gtag, ptag), 0)# / float(pcount) - if p > best[1]: best = (ptag, p) - ptags.append(ptag) - 
remaining.remove(ptag) - if not remaining: break - - print 'Predicted tag ordering:', ' '.join(ptags) - print 'Gold tag ordering:', ' '.join(map(lambda (t,c): t, gtags)) - - draw = ImageDraw.Draw(image) - for x, ptag in enumerate(ptags): - pcount = predict_frequencies[ptag] - minval = math.log(offset) - maxval = math.log(pcount + offset) - for y, (gtag, gcount) in enumerate(gtags): - f = math.log(offset + joint_frequencies.get((gtag, ptag), 0)) - z = int(240. * (maxval - f) / float(maxval - minval)) - #print x, y, z, f, maxval - draw.point([(x,y)], fill='hsl(%d, 100%%, 50%%)' % z) - del draw - image.save(output_fname) diff --git a/gi/evaluation/entropy.py b/gi/evaluation/entropy.py deleted file mode 100644 index ec1ef502..00000000 --- a/gi/evaluation/entropy.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python - -import sys, math, itertools, getopt - -def usage(): - print >>sys.stderr, 'Usage:', sys.argv[0], '[-s slash_threshold] input file' - sys.exit(0) - -optlist, args = getopt.getopt(sys.argv[1:], 'hs:') -slash_threshold = None -for opt, arg in optlist: - if opt == '-s': - slash_threshold = int(arg) - else: - usage() -if len(args) != 1: - usage() - -infile = open(args[0]) -N = 0 -frequencies = {} - -for line in infile: - - for part in line.split('||| ')[1].split(): - tag = part.split(':',1)[1] - - if slash_threshold == None or tag.count('/') + tag.count('\\') <= slash_threshold: - frequencies.setdefault(tag, 0) - frequencies[tag] += 1 - N += 1 - -h = 0 -for tag, c in frequencies.items(): - h -= c * (math.log(c, 2) - math.log(N, 2)) -h /= N - -print 'entropy', h diff --git a/gi/evaluation/extract_ccg_labels.py b/gi/evaluation/extract_ccg_labels.py deleted file mode 100644 index e0034648..00000000 --- a/gi/evaluation/extract_ccg_labels.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python - -# -# Takes spans input along with treebank and spits out CG style categories for each span. 
-# spans = output from CDEC's extools/extractor with --base_phrase_spans option -# treebank = PTB format, one tree per line -# -# Output is in CDEC labelled-span format -# - -import sys, itertools, tree - -tinfile = open(sys.argv[1]) -einfile = open(sys.argv[2]) - -def number_leaves(node, next=0): - left, right = None, None - for child in node.children: - l, r = number_leaves(child, next) - next = max(next, r+1) - if left == None or l < left: - left = l - if right == None or r > right: - right = r - - #print node, left, right, next - if left == None or right == None: - assert not node.children - left = right = next - - node.left = left - node.right = right - - return left, right - -def ancestor(node, indices): - #print node, node.left, node.right, indices - # returns the deepest node covering all the indices - if min(indices) >= node.left and max(indices) <= node.right: - # try the children - for child in node.children: - x = ancestor(child, indices) - if x: return x - return node - else: - return None - -def frontier(node, indices): - #print 'frontier for node', node, 'indices', indices - if node.left > max(indices) or node.right < min(indices): - #print '\toutside' - return [node] - elif node.children: - #print '\tcovering at least part' - ns = [] - for child in node.children: - n = frontier(child, indices) - ns.extend(n) - return ns - else: - return [node] - -def project_heads(node): - #print 'project_heads', node - is_head = node.data.tag.endswith('-HEAD') - if node.children: - found = 0 - for child in node.children: - x = project_heads(child) - if x: - node.data.tag = x - found += 1 - assert found == 1 - elif is_head: - node.data.tag = node.data.tag[:-len('-HEAD')] - - if is_head: - return node.data.tag - else: - return None - -for tline, eline in itertools.izip(tinfile, einfile): - if tline.strip() != '(())': - if tline.startswith('( '): - tline = tline[2:-1].strip() - tr = tree.parse_PST(tline) - if tr != None: - number_leaves(tr) - #project_heads(tr) # assumes Bikel-style head annotation for the input trees - else: - tr = None - - parts = eline.strip().split(" ||| ") - zh, en = parts[:2] - spans = parts[-1] - print '|||', - for span in spans.split(): - sps = span.split(":") - i, j, x, y = map(int, sps[0].split("-")) - - if tr: - a = ancestor(tr, range(x,y)) - try: - fs = frontier(a, range(x,y)) - except: - print >>sys.stderr, "problem with line", tline.strip(), "--", eline.strip() - raise - - #print x, y - #print 'ancestor', a - #print 'frontier', fs - - cat = a.data.tag - for f in fs: - if f.right < x: - cat += '\\' + f.data.tag - else: - break - fs.reverse() - for f in fs: - if f.left >= y: - cat += '/' + f.data.tag - else: - break - else: - cat = 'FAIL' - - print '%d-%d:%s' % (x, y, cat), - print diff --git a/gi/evaluation/tree.py b/gi/evaluation/tree.py deleted file mode 100644 index 702d80b6..00000000 --- a/gi/evaluation/tree.py +++ /dev/null @@ -1,485 +0,0 @@ -import re, sys - -class Symbol: - def __init__(self, nonterm, term=None, var=None): - assert not (term != None and var != None) - self.tag = nonterm - self.token = term - self.variable = var - - def is_variable(self): - return self.variable != None - - def __eq__(self, other): - return self.tag == other.tag and self.token == other.token and self.variable == other.variable - - def __ne__(self, other): - return not (self == other) - - def __hash__(self): - return hash((self.tag, self.token, self.variable)) - - def __repr__(self): - return str(self) - - def __cmp__(self, other): - return cmp((self.tag, self.token, 
self.variable), - (other.tag, other.token, other.variable)) - - def __str__(self): - parts = [] - if False: # DEPENDENCY - if self.token: - parts.append(str(self.token)) - elif self.variable != None: - parts.append('#%d' % self.variable) - if self.tag: - parts.append(str(self.tag)) - return '/'.join(parts) - else: - if self.tag: - parts.append(str(self.tag)) - if self.token: - parts.append(str(self.token)) - elif self.variable != None: - parts.append('#%d' % self.variable) - return ' '.join(parts) - -class TreeNode: - def __init__(self, data, children=None, order=-1): - self.data = data - self.children = [] - self.order = order - self.parent = None - if children: self.children = children - - def insert(self, child): - self.children.append(child) - child.parent = self - - def leaves(self): - ls = [] - for node in self.xtraversal(): - if not node.children: - ls.append(node.data) - return ls - - def leaf_nodes(self): - ls = [] - for node in self.xtraversal(): - if not node.children: - ls.append(node) - return ls - - def max_depth(self): - d = 1 - for child in self.children: - d = max(d, 1 + child.max_depth()) - if not self.children and self.data.token: - d = 2 - return d - - def max_width(self): - w = 0 - for child in self.children: - w += child.max_width() - return max(1, w) - - def num_internal_nodes(self): - if self.children: - n = 1 - for child in self.children: - n += child.num_internal_nodes() - return n - elif self.data.token: - return 1 - else: - return 0 - - def postorder_traversal(self, visit): - """ - Postorder traversal; no guarantee that terminals will be read in the - correct order for dep. trees. - """ - for child in self.children: - child.traversal(visit) - visit(self) - - def traversal(self, visit): - """ - Preorder for phrase structure trees, and inorder for dependency trees. - In both cases the terminals will be read off in the correct order. 
- """ - visited_self = False - if self.order <= 0: - visited_self = True - visit(self) - - for i, child in enumerate(self.children): - child.traversal(visit) - if i + 1 == self.order: - visited_self = True - visit(self) - - assert visited_self - - def xpostorder_traversal(self): - for child in self.children: - for node in child.xpostorder_traversal(): - yield node - yield self - - def xtraversal(self): - visited_self = False - if self.order <= 0: - visited_self = True - yield self - - for i, child in enumerate(self.children): - for d in child.xtraversal(): - yield d - - if i + 1 == self.order: - visited_self = True - yield self - - assert visited_self - - def edges(self): - es = [] - self.traverse_edges(lambda h,c: es.append((h,c))) - return es - - def traverse_edges(self, visit): - for child in self.children: - visit(self.data, child.data) - child.traverse_edges(visit) - - def subtrees(self, include_self=False): - st = [] - if include_self: - stack = [self] - else: - stack = self.children[:] - - while stack: - node = stack.pop() - st.append(node) - stack.extend(node.children) - return st - - def find_parent(self, node): - try: - index = self.children.index(node) - return self, index - except ValueError: - for child in self.children: - if isinstance(child, TreeNode): - r = child.find_parent(node) - if r: return r - return None - - def is_ancestor_of(self, node): - if self == node: - return True - for child in self.children: - if child.is_ancestor_of(node): - return True - return False - - def find(self, node): - if self == node: - return self - for child in self.children: - if isinstance(child, TreeNode): - r = child.find(node) - if r: return r - else: - if child == node: - return child - return None - - def equals_ignorecase(self, other): - if not isinstance(other, TreeNode): - return False - if self.data != other.data: - return False - if len(self.children) != len(other.children): - return False - for mc, oc in zip(self.children, other.children): - if isinstance(mc, TreeNode): - if not mc.equals_ignorecase(oc): - return False - else: - if mc.lower() != oc.lower(): - return False - return True - - def node_number(self, numbering, next=0): - if self.order <= 0: - numbering[id(self)] = next - next += 1 - - for i, child in enumerate(self.children): - next = child.node_number(numbering, next) - if i + 1 == self.order: - numbering[id(self)] = next - next += 1 - - return next - - def display_conll(self, out): - numbering = {} - self.node_number(numbering) - next = 0 - self.children[0].traversal(lambda x: \ - out.write('%d\t%s\t%s\t%s\t%s\t_\t%d\tLAB\n' \ - % (numbering[id(x)], x.data.token, x.data.token, - x.data.tag, x.data.tag, numbering[id(x.parent)]))) - out.write('\n') - - def size(self): - sz = 1 - for child in self.children: - sz += child.size() - return sz - - def __eq__(self, other): - if isinstance(other, TreeNode) and self.data == other.data \ - and self.children == other.children: - return True - return False - - def __cmp__(self, other): - if not isinstance(other, TreeNode): return 1 - n = cmp(self.data, other.data) - if n != 0: return n - n = len(self.children) - len(other.children) - if n != 0: return n - for sc, oc in zip(self.children, other.children): - n = cmp(sc, oc) - if n != 0: return n - return 0 - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return hash((self.data, tuple(self.children))) - -
def __repr__(self): - return str(self) - - def __str__(self): - s = '(' - space = False - if self.order <= 0: - s += str(self.data) - space = True - for i, child in enumerate(self.children): - if space: s += ' ' - s += str(child) - space = True - if i+1 == self.order: - s += ' ' + str(self.data) - return s + ')' - -def read_PSTs(fname): - infile = open(fname) - trees = [] - for line in infile: - trees.append(parse_PST(line.strip())) - infile.close() - return trees - -def parse_PST_multiline(infile, hash_is_var=True): - buf = '' - num_open = 0 - while True: - line = infile.readline() - if not line: - return None - buf += ' ' + line.rstrip() - num_open += line.count('(') - line.count(')') - if num_open == 0: - break - - return parse_PST(buf, hash_is_var) - -def parse_PST(line, hash_is_var=True): - line = line.rstrip() - if not line or line.lower() == 'null': - return None - - # allow either (a/DT) or (DT a) - #parts_re = re.compile(r'(\(*)([^/)]*)(?:/([^)]*))?(\)*)$') - - # only allow (DT a) - parts_re = re.compile(r'(\(*)([^)]*)(\)*)$') - - root = TreeNode(Symbol('TOP')) - stack = [root] - for part in line.rstrip().split(): - m = parts_re.match(part) - #opening, tok_or_tag, tag, closing = m.groups() - opening, tok_or_tag, closing = m.groups() - tag = None - #print 'token', part, 'bits', m.groups() - for i in opening: - node = TreeNode(Symbol(None)) - stack[-1].insert(node) - stack.append(node) - - if tag: - stack[-1].data.tag = tag - if hash_is_var and tok_or_tag.startswith('#'): - stack[-1].data.variable = int(tok_or_tag[1:]) - else: - stack[-1].data.token = tok_or_tag - else: - if stack[-1].data.tag == None: - stack[-1].data.tag = tok_or_tag - else: - if hash_is_var and tok_or_tag.startswith('#'): - try: - stack[-1].data.variable = int(tok_or_tag[1:]) - except ValueError: # it's really a token! 
- #print >>sys.stderr, 'Warning: # used for token:', tok_or_tag - stack[-1].data.token = tok_or_tag - else: - stack[-1].data.token = tok_or_tag - - for i in closing: - stack.pop() - - #assert str(root.children[0]) == line - return root.children[0] - -def read_DTs(fname): - infile = open(fname) - trees = [] - while True: - t = parse_DT(infile) - if t: trees.append(t) - else: break - infile.close() - return trees - -def read_bracketed_DTs(fname): - infile = open(fname) - trees = [] - for line in infile: - trees.append(parse_bracketed_DT(line)) - infile.close() - return trees - -def parse_DT(infile): - tokens = [Symbol('ROOT')] - children = {} - - for line in infile: - parts = line.rstrip().split() - #print parts - if not parts: break - index = len(tokens) - token = parts[1] - tag = parts[3] - parent = int(parts[6]) - if token.startswith('#'): - tokens.append(Symbol(tag, var=int(token[1:]))) - else: - tokens.append(Symbol(tag, token)) - children.setdefault(parent, set()).add(index) - - if len(tokens) == 1: return None - - root = TreeNode(Symbol('ROOT'), [], 0) - schedule = [] - for child in sorted(children[0]): - schedule.append((root, child)) - - while schedule: - parent, index = schedule[0] - del schedule[0] - - node = TreeNode(tokens[index]) - node.order = 0 - parent.insert(node) - - for child in sorted(children.get(index, [])): - schedule.append((node, child)) - if child < index: - node.order += 1 - - return root - -_bracket_split_re = re.compile(r'([(]*)([^)/]*)(?:/([^)]*))?([)]*)') - -def parse_bracketed_DT(line, insert_root=True): - line = line.rstrip() - if not line or line == 'NULL': return None - #print line - - root = TreeNode(Symbol('ROOT')) - stack = [root] - for part in line.rstrip().split(): - m = _bracket_split_re.match(part) - - for c in m.group(1): - node = TreeNode(Symbol(None)) - stack[-1].insert(node) - stack.append(node) - - if m.group(3) != None: - if m.group(2).startswith('#'): - stack[-1].data.variable = int(m.group(2)[1:]) - else: - stack[-1].data.token = m.group(2) - stack[-1].data.tag = m.group(3) - else: - stack[-1].data.tag = m.group(2) - stack[-1].order = len(stack[-1].children) - # FIXME: also check for vars - - for c in m.group(4): - stack.pop() - - assert len(stack) == 1 - if not insert_root or root.children[0].data.tag == 'ROOT': - return root.children[0] - else: - return root - -_bracket_split_notag_re = re.compile(r'([(]*)([^)/]*)([)]*)') - -def parse_bracketed_untagged_DT(line): - line = line.rstrip() - if not line or line == 'NULL': return None - - root = TreeNode(Symbol('TOP')) - stack = [root] - for part in line.rstrip().split(): - m = _bracket_split_notag_re.match(part) - - for c in m.group(1): - node = TreeNode(Symbol(None)) - stack[-1].insert(node) - stack.append(node) - - if stack[-1].data.token == None: - stack[-1].data.token = m.group(2) - stack[-1].order = len(stack[-1].children) - else: - child = TreeNode(Symbol(nonterm=None, term=m.group(2))) - stack[-1].insert(child) - - for c in m.group(3): - stack.pop() - - return root.children[0] diff --git a/gi/markov_al/Makefile.am b/gi/markov_al/Makefile.am deleted file mode 100644 index fe3e3349..00000000 --- a/gi/markov_al/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -bin_PROGRAMS = ml - -ml_SOURCES = ml.cc - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/markov_al/README b/gi/markov_al/README deleted file mode 100644 index 
9c10f7cd..00000000 --- a/gi/markov_al/README +++ /dev/null @@ -1,2 +0,0 @@ -Experimental translation models with Markovian dependencies. - diff --git a/gi/markov_al/ml.cc b/gi/markov_al/ml.cc deleted file mode 100644 index 1e71edd6..00000000 --- a/gi/markov_al/ml.cc +++ /dev/null @@ -1,470 +0,0 @@ -#include -#include - -#include -#include -#include -#include - -#include "tdict.h" -#include "filelib.h" -#include "sampler.h" -#include "ccrp_onetable.h" -#include "array2d.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -void PrintTopCustomers(const CCRP_OneTable& crp) { - for (CCRP_OneTable::const_iterator it = crp.begin(); it != crp.end(); ++it) { - cerr << " " << TD::Convert(it->first) << " = " << it->second << endl; - } -} - -void PrintAlignment(const vector& src, const vector& trg, const vector& a) { - cerr << TD::GetString(src) << endl << TD::GetString(trg) << endl; - Array2D al(src.size(), trg.size()); - for (int i = 0; i < a.size(); ++i) - if (a[i] != 255) al(a[i], i) = true; - cerr << al << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct Unigram; -struct Bigram { - Bigram() : trg(), cond() {} - Bigram(WordID prev, WordID cur, WordID t) : trg(t) { cond.first = prev; cond.second = cur; } - const pair& ConditioningPair() const { - return cond; - } - WordID& prev_src() { return cond.first; } - WordID& cur_src() { return cond.second; } - const WordID& prev_src() const { return cond.first; } - const WordID& cur_src() const { return cond.second; } - WordID trg; - private: - pair cond; -}; - -struct Unigram { - Unigram() : cur_src(), trg() {} - Unigram(WordID s, WordID t) : cur_src(s), trg(t) {} - WordID cur_src; - WordID trg; -}; - -ostream& operator<<(ostream& os, const Bigram& b) { - os << "( " << TD::Convert(b.trg) << " | " << TD::Convert(b.prev_src()) << " , " << TD::Convert(b.cur_src()) << " )"; - return os; -} - -ostream& operator<<(ostream& os, const Unigram& u) { - os << "( " << TD::Convert(u.trg) << " | " << TD::Convert(u.cur_src) << " )"; - return os; -} - -bool operator==(const Bigram& a, const Bigram& b) { - return a.trg == b.trg && a.cur_src() == b.cur_src() && a.prev_src() == b.prev_src(); -} - -bool operator==(const Unigram& a, const Unigram& b) { - return a.trg == b.trg && a.cur_src == b.cur_src; -} - -size_t hash_value(const Bigram& b) { - size_t h = boost::hash_value(b.prev_src()); - boost::hash_combine(h, boost::hash_value(b.cur_src())); - boost::hash_combine(h, boost::hash_value(b.trg)); - return h; -} - -size_t hash_value(const Unigram& u) { - size_t h = boost::hash_value(u.cur_src); - 
boost::hash_combine(h, boost::hash_value(u.trg)); - return h; -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -struct UnigramModel { - UnigramModel(size_t src_voc_size, size_t trg_voc_size) : - unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), - p0(1.0 / trg_voc_size) {} - - void increment(const Bigram& b) { - unigrams[b.cur_src()].increment(b.trg); - } - - void decrement(const Bigram& b) { - unigrams[b.cur_src()].decrement(b.trg); - } - - double prob(const Bigram& b) const { - const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); - return q0; - } - - double LogLikelihood() const { - double llh = 0; - for (unsigned i = 0; i < unigrams.size(); ++i) { - const CCRP_OneTable& crp = unigrams[i]; - if (crp.num_customers() > 0) { - llh += crp.log_crp_prob(); - llh += crp.num_tables() * log(p0); - } - } - return llh; - } - - void ResampleHyperparameters(MT19937* rng) { - for (unsigned i = 0; i < unigrams.size(); ++i) - unigrams[i].resample_hyperparameters(rng); - } - - vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) - - const double p0; -}; - -struct BigramModel { - BigramModel(size_t src_voc_size, size_t trg_voc_size) : - unigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), - p0(1.0 / trg_voc_size) {} - - void increment(const Bigram& b) { - BigramMap::iterator it = bigrams.find(b.ConditioningPair()); - if (it == bigrams.end()) { - it = bigrams.insert(make_pair(b.ConditioningPair(), CCRP_OneTable(1,1,1,1))).first; - } - if (it->second.increment(b.trg)) - unigrams[b.cur_src()].increment(b.trg); - } - - void decrement(const Bigram& b) { - BigramMap::iterator it = bigrams.find(b.ConditioningPair()); - assert(it != bigrams.end()); - if (it->second.decrement(b.trg)) { - unigrams[b.cur_src()].decrement(b.trg); - if (it->second.num_customers() == 0) - bigrams.erase(it); - } - } - - double prob(const Bigram& b) const { - const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); - const BigramMap::const_iterator it = bigrams.find(b.ConditioningPair()); - if (it == bigrams.end()) return q0; - return it->second.prob(b.trg, q0); - } - - double LogLikelihood() const { - double llh = 0; - for (unsigned i = 0; i < unigrams.size(); ++i) { - const CCRP_OneTable& crp = unigrams[i]; - if (crp.num_customers() > 0) { - llh += crp.log_crp_prob(); - llh += crp.num_tables() * log(p0); - } - } - for (BigramMap::const_iterator it = bigrams.begin(); it != bigrams.end(); ++it) { - const CCRP_OneTable& crp = it->second; - const WordID cur_src = it->first.second; - llh += crp.log_crp_prob(); - for (CCRP_OneTable::const_iterator bit = crp.begin(); bit != crp.end(); ++bit) { - llh += log(unigrams[cur_src].prob(bit->second, p0)); - } 
- } - return llh; - } - - void ResampleHyperparameters(MT19937* rng) { - for (unsigned i = 0; i < unigrams.size(); ++i) - unigrams[i].resample_hyperparameters(rng); - for (BigramMap::iterator it = bigrams.begin(); it != bigrams.end(); ++it) - it->second.resample_hyperparameters(rng); - } - - typedef unordered_map, CCRP_OneTable, boost::hash > > BigramMap; - BigramMap bigrams; // bigrams[(src-1,src)].prob(trg, q0) = p(trg|src,src-1) - vector > unigrams; // unigrams[src].prob(trg, p0) = p(trg|src) - - const double p0; -}; - -struct BigramAlignmentModel { - BigramAlignmentModel(size_t src_voc_size, size_t trg_voc_size) : bigrams(TD::NumWords() + 1, CCRP_OneTable(1,1,1,1)), p0(1.0 / src_voc_size) {} - void increment(WordID prev, WordID next) { - bigrams[prev].increment(next); // hierarchy? - } - void decrement(WordID prev, WordID next) { - bigrams[prev].decrement(next); // hierarchy? - } - double prob(WordID prev, WordID next) { - return bigrams[prev].prob(next, p0); - } - double LogLikelihood() const { - double llh = 0; - for (unsigned i = 0; i < bigrams.size(); ++i) { - const CCRP_OneTable& crp = bigrams[i]; - if (crp.num_customers() > 0) { - llh += crp.log_crp_prob(); - llh += crp.num_tables() * log(p0); - } - } - return llh; - } - - vector > bigrams; // bigrams[prev].prob(next, p0) = p(next|prev) - const double p0; -}; - -struct Alignment { - vector a; -}; - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - - boost::shared_ptr prng; - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - assert(corpusf.size() == corpuse.size()); - const size_t corpus_len = corpusf.size(); - const WordID kNULL = TD::Convert(""); - const WordID kBOS = TD::Convert(""); - const WordID kEOS = TD::Convert(""); - Bigram TT(kBOS, TD::Convert("我"), TD::Convert("i")); - Bigram TT2(kBOS, TD::Convert("要"), TD::Convert("i")); - - UnigramModel model(vocabf.size(), vocabe.size()); - vector alignments(corpus_len); - for (unsigned ci = 0; ci < corpus_len; ++ci) { - const vector& src = corpusf[ci]; - const vector& trg = corpuse[ci]; - vector& alg = alignments[ci].a; - alg.resize(trg.size()); - int lenp1 = src.size() + 1; - WordID prev_src = kBOS; - for (int j = 0; j < trg.size(); ++j) { - int samp = lenp1 * rng.next(); - --samp; - if (samp < 0) samp = 255; - alg[j] = samp; - WordID cur_src = (samp == 255 ? kNULL : src[alg[j]]); - Bigram b(prev_src, cur_src, trg[j]); - model.increment(b); - prev_src = cur_src; - } - Bigram b(prev_src, kEOS, kEOS); - model.increment(b); - } - cerr << "Initial LLH: " << model.LogLikelihood() << endl; - - SampleSet ss; - for (unsigned si = 0; si < 50; ++si) { - for (unsigned ci = 0; ci < corpus_len; ++ci) { - const vector& src = corpusf[ci]; - const vector& trg = corpuse[ci]; - vector& alg = alignments[ci].a; - WordID prev_src = kBOS; - for (unsigned j = 0; j < trg.size(); ++j) { - unsigned char& a_j = alg[j]; - WordID cur_e_a_j = (a_j == 255 ? 
kNULL : src[a_j]); - Bigram b(prev_src, cur_e_a_j, trg[j]); - //cerr << "DEC: " << b << "\t" << nextb << endl; - model.decrement(b); - ss.clear(); - for (unsigned i = 0; i <= src.size(); ++i) { - const WordID cur_src = (i ? src[i-1] : kNULL); - b.cur_src() = cur_src; - ss.add(model.prob(b)); - } - int sampled_a_j = rng.SelectSample(ss); - a_j = (sampled_a_j ? sampled_a_j - 1 : 255); - cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); - b.cur_src() = cur_e_a_j; - //cerr << "INC: " << b << "\t" << nextb << endl; - model.increment(b); - prev_src = cur_e_a_j; - } - } - cerr << '.' << flush; - if (si % 10 == 9) { - cerr << "[LLH prev=" << model.LogLikelihood(); - //model.ResampleHyperparameters(&rng); - cerr << " new=" << model.LogLikelihood() << "]\n"; - //pair xx = make_pair(kBOS, TD::Convert("我")); - //PrintTopCustomers(model.bigrams.find(xx)->second); - cerr << "p(" << TT << ") = " << model.prob(TT) << endl; - cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; - PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); - } - } - { - // MODEL 2 - BigramModel model(vocabf.size(), vocabe.size()); - BigramAlignmentModel amodel(vocabf.size(), vocabe.size()); - for (unsigned ci = 0; ci < corpus_len; ++ci) { - const vector& src = corpusf[ci]; - const vector& trg = corpuse[ci]; - vector& alg = alignments[ci].a; - WordID prev_src = kBOS; - for (int j = 0; j < trg.size(); ++j) { - WordID cur_src = (alg[j] == 255 ? kNULL : src[alg[j]]); - Bigram b(prev_src, cur_src, trg[j]); - model.increment(b); - amodel.increment(prev_src, cur_src); - prev_src = cur_src; - } - amodel.increment(prev_src, kEOS); - Bigram b(prev_src, kEOS, kEOS); - model.increment(b); - } - cerr << "Initial LLH: " << model.LogLikelihood() << " " << amodel.LogLikelihood() << endl; - - SampleSet ss; - for (unsigned si = 0; si < samples; ++si) { - for (unsigned ci = 0; ci < corpus_len; ++ci) { - const vector& src = corpusf[ci]; - const vector& trg = corpuse[ci]; - vector& alg = alignments[ci].a; - WordID prev_src = kBOS; - for (unsigned j = 0; j < trg.size(); ++j) { - unsigned char& a_j = alg[j]; - WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); - Bigram b(prev_src, cur_e_a_j, trg[j]); - WordID next_src = kEOS; - WordID next_trg = kEOS; - if (j < (trg.size() - 1)) { - next_src = (alg[j+1] == 255 ? kNULL : src[alg[j + 1]]); - next_trg = trg[j + 1]; - } - Bigram nextb(cur_e_a_j, next_src, next_trg); - //cerr << "DEC: " << b << "\t" << nextb << endl; - model.decrement(b); - model.decrement(nextb); - amodel.decrement(prev_src, cur_e_a_j); - amodel.decrement(cur_e_a_j, next_src); - ss.clear(); - for (unsigned i = 0; i <= src.size(); ++i) { - const WordID cur_src = (i ? src[i-1] : kNULL); - b.cur_src() = cur_src; - ss.add(model.prob(b) * model.prob(nextb) * amodel.prob(prev_src, cur_src) * amodel.prob(cur_src, next_src)); - //cerr << log(ss[ss.size() - 1]) << "\t" << b << endl; - } - int sampled_a_j = rng.SelectSample(ss); - a_j = (sampled_a_j ? sampled_a_j - 1 : 255); - cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); - b.cur_src() = cur_e_a_j; - nextb.prev_src() = cur_e_a_j; - //cerr << "INC: " << b << "\t" << nextb << endl; - //exit(1); - model.increment(b); - model.increment(nextb); - amodel.increment(prev_src, cur_e_a_j); - amodel.increment(cur_e_a_j, next_src); - prev_src = cur_e_a_j; - } - } - cerr << '.' 
<< flush; - if (si % 10 == 9) { - cerr << "[LLH prev=" << (model.LogLikelihood() + amodel.LogLikelihood()); - //model.ResampleHyperparameters(&rng); - cerr << " new=" << model.LogLikelihood() << "]\n"; - pair xx = make_pair(kBOS, TD::Convert("我")); - cerr << "p(" << TT << ") = " << model.prob(TT) << endl; - cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; - pair xx2 = make_pair(kBOS, TD::Convert("要")); - PrintTopCustomers(model.bigrams.find(xx)->second); - //PrintTopCustomers(amodel.bigrams[TD::Convert("")]); - //PrintTopCustomers(model.unigrams[TD::Convert("")]); - PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); - } - } - } - return 0; -} - diff --git a/gi/morf-segmentation/filter_docs.pl b/gi/morf-segmentation/filter_docs.pl deleted file mode 100755 index a78575da..00000000 --- a/gi/morf-segmentation/filter_docs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -#Filters the phrase&cluster document set to retain only documents that correspond to words or morphs, i.e. not crossing word boundaries. - -#Usage: filter_docs.pl [mark] -# STDIN: data in the doc.txt format (i.e. phrase\t blahblah ), most likely from cdec extractor -# STDOUT: the matching subset, same format - -use utf8; -my $letter=qr/\p{L}\p{M}*/; # see http://www.regular-expressions.info/unicode.html - -my $morph=qr/$letter+/; - -my $m = "##"; # marker used to indicate morphemes -if ((scalar @ARGV) >= 1) { - $m = $ARGV[0]; - shift; -} -print STDERR "Using $m to filter for morphemes\n"; - -my $expr = qr/^($morph\Q$m\E)? ?(\Q$m\E$morph\Q$m\E)* ?(\Q$m\E$morph)?\t/; #\Q and \E bounded sections are escaped -while(<>) { - /$expr/ && print; -} diff --git a/gi/morf-segmentation/invalid_vocab.patterns b/gi/morf-segmentation/invalid_vocab.patterns deleted file mode 100644 index 473ce1b1..00000000 --- a/gi/morf-segmentation/invalid_vocab.patterns +++ /dev/null @@ -1,6 +0,0 @@ -[[:digit:]] -[] !"#$%&()*+,./:;<=>?@[\^_`{|}~] -^'$ --$ -^- -^$ diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py deleted file mode 100755 index 04e9044a..00000000 --- a/gi/morf-segmentation/linestripper.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/python - -import sys - -#linestripper file file maxlen [numlines] - -if len(sys.argv) < 3: - print "linestripper file1 file2 maxlen [numlines]" - print " outputs subset of file1 to stdout, ..of file2 to stderr" - sys.exit(1) - - -f1 = open(sys.argv[1],'r') -f2 = open(sys.argv[2],'r') - -maxlen=int(sys.argv[3]) -numlines = 0 - -if len(sys.argv) > 4: - numlines = int(sys.argv[4]) - -count=0 -for line1 in f1: - line2 = f2.readline() - - w1 = len(line1.strip().split()) - w2 = len(line2.strip().split()) - - if w1 <= maxlen and w2 <= maxlen: - count = count + 1 - sys.stdout.write(line1) - sys.stderr.write(line2) - - if numlines > 0 and count >= numlines: - break - -f1.close() -f2.close() - - diff --git a/gi/morf-segmentation/morf-pipeline.pl b/gi/morf-segmentation/morf-pipeline.pl deleted file mode 100755 index 46eb5b46..00000000 --- a/gi/morf-segmentation/morf-pipeline.pl +++ /dev/null @@ -1,486 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use File::Copy; - - -# Preprocessing pipeline to take care of word segmentation -# Learns a segmentation model for each/either side of the parallel corpus using all train/dev/test data -# Applies the segmentation where necessary. -# Learns word alignments on the preprocessed training data. -# Outputs script files used later to score output. 
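-# -# Usage sketch: a non-authoritative reading of the argument handling below -# (GetOptions plus the positional @ARGV checks); the file names are -# hypothetical, shown for illustration only: -# -# morf-pipeline.pl [--output DIR] [--split s|t|st] [--ppl_src N] [--ppl_trg N] \ -# [--sentences N] [--max_words N] [--marker M] \ -# train.src train.trg [dev.src dev.trg [test.src test.trg]] -# -# The dev and test pairs are optional; segmentation is applied to whichever -# side(s) the --split flag names (s = source, t = target, st = both). -#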
- - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -use Getopt::Long "GetOptions"; - -my $GZIP = 'gzip'; -my $ZCAT = 'gunzip -c'; -my $SED = 'sed -e'; - -my $MORF_TRAIN = "$SCRIPT_DIR/morftrain.sh"; -my $MORF_SEGMENT = "$SCRIPT_DIR/morfsegment.py"; - -my $LINESTRIPPER = "$SCRIPT_DIR/linestripper.py"; -my $ALIGNER = "/export/ws10smt/software/berkeleyaligner/berkeleyaligner.jar"; -#java -d64 -Xmx10g -jar $ALIGNER ++word-align.conf >> aligner.log -assert_exec($MORF_TRAIN, $LINESTRIPPER, $MORF_SEGMENT, $ALIGNER); - -my $OUTPUT = './morfwork'; -my $PPL_SRC = 50; -my $PPL_TRG = 50; -my $MARKER = "#"; -my $MAX_WORDS = 40; -my $SENTENCES;# = 100000; -my $SPLIT_TYPE = ""; #possible values: s, t, st, or (empty string) -my $NAME_SHORTCUT; - -usage() unless &GetOptions('max_words=i' => \$MAX_WORDS, - 'output=s' => \$OUTPUT, - 'ppl_src=i' => \$PPL_SRC, - 'ppl_trg=i' => \$PPL_TRG, - 'sentences=i' => \$SENTENCES, - 'marker=s' => \$MARKER, - 'split=s' => \$SPLIT_TYPE, - 'get_name_only' => \$NAME_SHORTCUT, - ); - -usage() unless scalar @ARGV >= 2; - -my %CORPUS; # for (src,trg) it has (orig, name, filtered, final) - -$CORPUS{'src'}{'orig'} = $ARGV[0]; -open F, "<$CORPUS{'src'}{'orig'}" or die "Can't read $CORPUS{'src'}{'orig'}: $!"; close F; -$CORPUS{'src'}{'name'} = get_basename($CORPUS{'src'}{'orig'}); - -$CORPUS{'trg'}{'orig'} = $ARGV[1]; -open F, "<$CORPUS{'trg'}{'orig'}" or die "Can't read $CORPUS{'trg'}{'orig'}: $!"; close F; -$CORPUS{'trg'}{'name'} = get_basename($CORPUS{'trg'}{'orig'}); - -my %DEV; # for (src,trg) has (orig, final.split final.unsplit -if (@ARGV >= 4) { - $DEV{'src'}{'orig'} = $ARGV[2]; - open F, "<$DEV{'src'}{'orig'}" or die "Can't read $DEV{'src'}{'orig'}: $!"; close F; - $DEV{'src'}{'name'} = get_basename($DEV{'src'}{'orig'}); - $DEV{'trg'}{'orig'} = $ARGV[3]; - open F, "<$DEV{'trg'}{'orig'}" or die "Can't read $DEV{'trg'}{'orig'}: $!"; close F; - $DEV{'trg'}{'name'} = get_basename($DEV{'trg'}{'orig'}); -} - -my %TEST; # for (src,trg) has (orig, name) -if (@ARGV >= 6) { - $TEST{'src'}{'orig'} = $ARGV[4]; - open F, "<$TEST{'src'}{'orig'}" or die "Can't read $TEST{'src'}{'orig'}: $!"; close F; - $TEST{'src'}{'name'} = get_basename($TEST{'src'}{'orig'}); - $TEST{'trg'}{'orig'} = $ARGV[5]; - open F, "<$TEST{'trg'}{'orig'}" or die "Can't read $TEST{'trg'}{'orig'}: $!"; close F; - $TEST{'trg'}{'name'} = get_basename($TEST{'trg'}{'orig'}); -} - -my $SPLIT_SRC; #use these to check whether that part is being split -my $SPLIT_TRG; - -#OUTPUT WILL GO IN THESE -my $CORPUS_DIR = $OUTPUT . '/' . corpus_dir(); #subsampled corpus -my $MODEL_SRC_DIR = $OUTPUT . '/' . model_dir("src"); #splitting.. -my $MODEL_TRG_DIR = $OUTPUT . '/' . model_dir("trg"); # .. models -my $PROCESSED_DIR = $OUTPUT . '/' . processed_dir(); #segmented copora+alignments -my $ALIGNMENT_DIR = $PROCESSED_DIR . '/alignments'; - -$CORPUS{'src'}{'filtered'} = $CORPUS_DIR . "/$CORPUS{'src'}{'name'}"; -$CORPUS{'trg'}{'filtered'} = $CORPUS_DIR . 
"/$CORPUS{'trg'}{'name'}"; - -print STDERR "Output: $OUTPUT\n"; -print STDERR "Corpus: $CORPUS_DIR\n"; -print STDERR "Model-src: $MODEL_SRC_DIR\n"; -print STDERR "Model-trg: $MODEL_TRG_DIR\n"; -print STDERR "Finaldir: $PROCESSED_DIR\n"; - -safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!"; -safemkdir($CORPUS_DIR) or die "Couldn't create output directory $CORPUS_DIR: $!"; -filter_corpus(); - -safemkdir($PROCESSED_DIR); -safemkdir($ALIGNMENT_DIR); - -if ($SPLIT_SRC) { - safemkdir($MODEL_SRC_DIR) or die "Couldn't create output directory $MODEL_SRC_DIR: $!"; - learn_segmentation("src"); - apply_segmentation_side("src", $MODEL_SRC_DIR); -} - -#assume that unsplit hypotheses will be scored against an aritificially split target test set; thus obtain a target splitting model -#TODO: add a flag to override this behaviour -safemkdir($MODEL_TRG_DIR) or die "Couldn't create output directory $MODEL_TRG_DIR: $!"; -learn_segmentation("trg"); -$TEST{'trg'}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}"; -copy($TEST{'trg'}{'orig'}, $TEST{'trg'}{'finalunsplit'}) or die "Could not copy unsegmented test set"; - -if ($SPLIT_TRG) { - apply_segmentation_side("trg", $MODEL_TRG_DIR); - } else { - $TEST{'trg'}{'finalsplit'} = "$PROCESSED_DIR/$TEST{'trg'}{'name'}.split"; - apply_segmentation_any($MODEL_TRG_DIR, $TEST{'trg'}{'finalunsplit'}, $TEST{'trg'}{'finalsplit'}); -} - -write_eval_sh("$PROCESSED_DIR/eval-devtest.sh"); - -#copy corpora if they haven't been put in place by splitting operations -place_missing_data_side('src'); -place_missing_data_side('trg'); - -do_align(); - -if ($CORPUS{'src'}{'orig'} && $DEV{'src'}{'orig'} && $TEST{'src'}{'orig'}) { - print STDERR "Putting the config file entry in $PROCESSED_DIR/exp.config\n"; -#format is: - # nlfr100k_unsplit /export/ws10smt/jan/nlfr/morfwork/s100k.w40.sp_0 corpus.nl-fr.al fr-3.lm.gz dev.nl dev.fr test2008.nl eval-devtest.sh - my $line = split_name() . " $PROCESSED_DIR corpus.src-trg.al LMFILE.lm.gz"; - $line = $line . " $DEV{'src'}{'name'} $DEV{'trg'}{'name'}"; - $line = $line . " " . get_basename($TEST{'src'}{$SPLIT_SRC ? "finalsplit" : "finalunsplit"}) . " eval-devtest.sh"; - safesystem("echo '$line' > $PROCESSED_DIR/exp.config"); -} - -system("date"); -print STDERR "All done. You now need to train a language model (if target split), put it in the right dir and update the config file.\n\n"; - -############################## BILINGUAL ################################### - -sub filter_corpus { - print STDERR "\n!!!FILTERING TRAINING COPRUS!!!\n"; - if ( -f $CORPUS{'src'}{'filtered'} && -f $CORPUS{'trg'}{'filtered'}) { - print STDERR "$CORPUS{'src'}{'filtered'} and $CORPUS{'trg'}{'filtered'} exist, reusing...\n"; - return; - } - my $args = "$CORPUS{'src'}{'orig'} $CORPUS{'trg'}{'orig'} $MAX_WORDS"; - if ($SENTENCES) { $args = $args . 
" $SENTENCES"; } - safesystem("$LINESTRIPPER $args 1> $CORPUS{'src'}{'filtered'} 2> $CORPUS{'trg'}{'filtered'}") or die "Failed to filter training corpus for length."; -} - -sub learn_segmentation -{ - my $WHICH = shift; - my $corpus; my $dev; my $test; my $moddir; my $ppl; - - $corpus = $CORPUS{$WHICH}{'filtered'}; - $dev = $DEV{$WHICH}{'orig'}; - $test = $TEST{$WHICH}{'orig'}; - - if ($WHICH eq "src") { - $moddir = $MODEL_SRC_DIR; - $ppl = $PPL_SRC; - } else { - $moddir = $MODEL_TRG_DIR; - $ppl = $PPL_TRG; - } - my $cmd = "cat $corpus"; - if ($dev) { $cmd = "$cmd $dev"; } - if ($test) { $cmd = "$cmd $test"; } - my $tmpfile = "$CORPUS_DIR/all.tmp.gz"; - safesystem("$cmd | $GZIP > $tmpfile") or die "Failed to concatenate data for model learning.."; - assert_marker($tmpfile); - - learn_segmentation_side($tmpfile, $moddir, $ppl, $WHICH); - safesystem("rm $tmpfile"); -} - -sub do_align { - print STDERR "\n!!!WORD ALIGNMENT!!!\n"; - system("date"); - - my $ALIGNMENTS = "$ALIGNMENT_DIR/training.align"; - if ( -f $ALIGNMENTS ) { - print STDERR "$ALIGNMENTS exists, reusing...\n"; - return; - } - my $conf_file = "$ALIGNMENT_DIR/word-align.conf"; - - #decorate training files with identifiers to stop the aligner from training on dev and test when rerun in future. - safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'src'}{'name'} corpus.src") or die "Failed to symlink: $!"; - safesystem("cd $PROCESSED_DIR && ln -s $CORPUS{'trg'}{'name'} corpus.trg") or die "Failed to symlink: $!"; - - write_wconf($conf_file, $PROCESSED_DIR); - system("java -d64 -Xmx24g -jar $ALIGNER ++$conf_file > $ALIGNMENT_DIR/aligner.log"); - - if (! -f $ALIGNMENTS) { die "Failed to run word alignment.";} - - my $cmd = "paste $PROCESSED_DIR/corpus.src $PROCESSED_DIR/corpus.trg $ALIGNMENTS"; - $cmd = $cmd . " | sed 's/\\t/ \|\|\| /g' > $PROCESSED_DIR/corpus.src-trg.al"; - safesystem($cmd) or die "Failed to paste into aligned corpus file."; - -} - -############################# MONOLINGUAL ################################# - -#copy the necessary data files that weren't place by segmentation -sub place_missing_data_side { - my $side = shift; - - ifne_copy($CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}") ; - - if ($DEV{$side}{'orig'} && ! -f "$PROCESSED_DIR/$DEV{$side}{'name'}") { - $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; - copy($DEV{$side}{'orig'}, $DEV{$side}{'final'}) or die "Copy failed: $!"; - } - - if ($TEST{$side}{'orig'} && ! -f "$PROCESSED_DIR/$TEST{$side}{'name'}" && ! $TEST{$side}{'finalunsplit'}) { - $TEST{$side}{'finalunsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}"; - copy($TEST{$side}{'orig'}, $TEST{$side}{'finalunsplit'}) or die "Copy failed: $!"; - } - -} - -sub apply_segmentation_side { - my ($side, $moddir) = @_; - - print STDERR "\n!!!APPLYING SEGMENTATION MODEL ($side)!!!\n"; - apply_segmentation_any($moddir, $CORPUS{$side}{'filtered'}, "$PROCESSED_DIR/$CORPUS{$side}{'name'}"); - if ($DEV{$side}{'orig'}) { - $DEV{$side}{'final'} = "$PROCESSED_DIR/$DEV{$side}{'name'}"; - apply_segmentation_any($moddir, $DEV{$side}{'orig'}, "$DEV{$side}{'final'}"); - } - if ($TEST{$side}{'orig'}) { - $TEST{$side}{'finalsplit'} = "$PROCESSED_DIR/$TEST{$side}{'name'}.split"; - apply_segmentation_any($moddir, $TEST{$side}{'orig'}, $TEST{$side}{'finalsplit'} ); - } - -} - -sub learn_segmentation_side { - my($INPUT_FILE, $SEGOUT_DIR, $PPL, $LANG) = @_; - - print STDERR "\n!!!LEARNING SEGMENTATION MODEL ($LANG)!!!\n"; - system("date"); - my $SEG_FILE = $SEGOUT_DIR . 
"/segmentation.ready"; - if ( -f $SEG_FILE) { - print STDERR "$SEG_FILE exists, reusing...\n"; - return; - } - my $cmd = "$MORF_TRAIN $INPUT_FILE $SEGOUT_DIR $PPL \"$MARKER\""; - safesystem($cmd) or die "Failed to learn segmentation model"; -} - -sub apply_segmentation_any { - my($moddir, $datfile, $outfile) = @_; - if ( -f $outfile) { - print STDERR "$outfile exists, reusing...\n"; - return; - } - - my $args = "$moddir/inputvocab.gz $moddir/segmentation.ready \"$MARKER\""; - safesystem("cat $datfile | $MORF_SEGMENT $args &> $outfile") or die "Could not segment $datfile"; -} - -##################### PATH FUNCTIONS ########################## - -sub beautify_numlines { - return ($SENTENCES ? $SENTENCES : "_all"); -} - -sub corpus_dir { - return "s" . beautify_numlines() . ".w" . $MAX_WORDS; -} - -sub model_dir { - my $lang = shift; - if ($lang eq "src") { - return corpus_dir() . ".PPL" . $PPL_SRC . ".src"; - } elsif ($lang eq "trg") { - return corpus_dir() . ".PPL" . $PPL_TRG . ".trg"; - } else { - return "PPLundef"; - } -} - -sub processed_dir { - return corpus_dir() . "." . split_name(); -} - -########################## HELPER FUNCTIONS ############################ - -sub ifne_copy { - my ($src, $dest) = @_; - if (! -f $dest) { - copy($src, $dest) or die "Copy failed: $!"; - } -} - -sub split_name { - #parses SPLIT_TYPE, which can have the following values - # t|s|ts|st (last 2 are equiv) - # or is undefined when no splitting is done - my $name = ""; - - if ($SPLIT_TYPE) { - $SPLIT_SRC = lc($SPLIT_TYPE) =~ /s/; - $SPLIT_TRG = lc($SPLIT_TYPE) =~ /t/; - $name = $name . ($SPLIT_SRC ? $PPL_SRC : "0"); - $name = $name . "_" . ($SPLIT_TRG ? $PPL_TRG : "0"); - } else { - #no splitting - $name = "0"; - } - - return "sp_" . $name; - -} - -sub usage { - print <> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! 
$exitcode; - } -} - -sub get_basename -{ - my $x = shift; - $x = `basename $x`; - $x =~ s/\n//; - return $x; -} - -sub assert_marker { - my $file = shift; - my $result = `zcat $file| grep '$MARKER' | wc -l` or die "Cannot read $file: $!"; - print $result; - if (scalar($result) != 0) { die "Data contains marker '$MARKER'; use something else.";} -} -########################### Dynamic config files ############################## - -sub write_wconf { - my ($filename, $train_dir) = @_; - open WCONF, ">$filename" or die "Can't write $filename: $!"; - - print WCONF <$filename" or die "Can't write $filename: $!"; - - print EVALFILE < "\$1.recombined" - -\$EVAL_MAIN "\$1.recombined" $TEST{'trg'}{'finalunsplit'} -EOT - - } else { - print EVALFILE < "\$1.split" - -\$EVAL_MAIN "\$1.split" $TEST{'trg'}{'finalsplit'} - -echo "DIRECT EVALUATION" -echo "--------------------------" -\$EVAL_MAIN "\$1" $TEST{'trg'}{'finalunsplit'} - -EOT - - } - close EVALFILE; - -} - - - - diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py deleted file mode 100755 index 85b9d4fb..00000000 --- a/gi/morf-segmentation/morfsegment.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/python - -import sys -import gzip - -#usage: morfsegment.py inputvocab.gz segmentation.ready -# stdin: the data to segment -# stdout: the segmented data - -if len(sys.argv) < 3: - print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]" - print " stdin: the data to segment" - print " stdout: the segmented data" - sys.exit() - -#read index: -split_index={} - -marker="##" - -if len(sys.argv) > 3: - marker=sys.argv[3] - -word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz -seg_vocab=open(sys.argv[2], 'r') #segm.ready.. - -for seg in seg_vocab: - #seg = ver# #wonder\n - #wordline = 1 verwonder\n - word = word_vocab.readline().strip().split(' ') - assert(len(word) == 2) - word = word[1] - seg=seg.strip() - - if seg != word: - split_index[word] = seg - -word_vocab.close() -seg_vocab.close() - -for line in sys.stdin: - words = line.strip().split() - - newsent = [] - for word in words: - splitword = split_index.get(word, word) - newsent.append(splitword) - - print ' '.join(newsent) - diff --git a/gi/morf-segmentation/morftrain.sh b/gi/morf-segmentation/morftrain.sh deleted file mode 100755 index 9004922f..00000000 --- a/gi/morf-segmentation/morftrain.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash - -if [[ $# -lt 3 ]]; then - echo "Trains a morfessor model and places the result in writedir" - echo - echo "Usage: `basename $0` corpus_input_file writedir [PPL] [marker] [lines]" - echo -e "\tcorpus_input_file contains a sentence per line." - exit 1 -fi - -MORFESSOR_DIR="/export/ws10smt/software/morfessor_catmap0.9.2" -SCRIPT_DIR=$(dirname `readlink -f $0`) - -MORFBINDIR="$MORFESSOR_DIR/bin" -MORFMAKEFILE_TRAIN="$MORFESSOR_DIR/train/Makefile" -VOCABEXT="$SCRIPT_DIR/vocabextractor.sh" - -MARKER="#" - -if [[ ! -f $VOCABEXT ]]; then - echo "$VOCABEXT doesn't exist!" - exit 1 -fi -if [[ ! -f $MORFMAKEFILE_TRAIN ]]; then - echo "$MORFMAKEFILE_TRAIN doesn't exist!" - exit 1 -fi - - -CORPUS="$1" -WRITETODIR=$2 - -if [[ ! -f $CORPUS ]]; then - echo "$CORPUS doesn't exist!" - exit 1 -fi - -PPL=10 -LINES=0 -if [[ $# -gt 2 ]]; then - PPL=$3 -fi -if [[ $# -gt 3 ]]; then - MARKER="$4" -fi -if [[ $# -gt 4 ]]; then - LINES=$5 -fi - -mkdir -p $WRITETODIR - -#extract vocabulary to train on -echo "Extracting vocabulary..." -if [[ -f $WRITETODIR/inputvocab.gz ]]; then - echo " ....$WRITETODIR/inputvocab.gz exists, reusing." 
-else - if [[ $LINES -gt 0 ]]; then - $VOCABEXT $CORPUS $LINES | gzip > $WRITETODIR/inputvocab.gz - else - $VOCABEXT $CORPUS | gzip > $WRITETODIR/inputvocab.gz - fi -fi - - -#train it -echo "Training morf model..." -if [[ -f $WRITETODIR/segmentation.final.gz ]]; then - echo " ....$WRITETODIR/segmentation.final.gz exists, reusing.." -else - OLDPWD=`pwd` - cd $WRITETODIR - - #put the training Makefile in place, with appropriate modifications - sed -e "s/^GZIPPEDINPUTDATA = .*$/GZIPPEDINPUTDATA = inputvocab.gz/" \ - -e "s/^PPLTHRESH = .*$/PPLTHRESH = $PPL/" \ - -e "s;^BINDIR = .*$;BINDIR = $MORFBINDIR;" \ - $MORFMAKEFILE_TRAIN > ./Makefile - - date - make > ./trainmorf.log 2>&1 - cd $OLDPWD - - - echo "Post processing..." - #remove comments, counts and morph types - #mark morphs - - if [[ ! -f $WRITETODIR/segmentation.final.gz ]]; then - echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" - exit 1 - fi - - zcat $WRITETODIR/segmentation.final.gz | \ - awk '$1 !~ /^#/ {print}' | \ - cut -d ' ' --complement -f 1 | \ - sed -e "s/\/...//g" -e "s/ + /$MARKER $MARKER/g" \ - > $WRITETODIR/segmentation.ready - - if [[ ! -f $WRITETODIR/segmentation.ready ]]; then - echo "Failed to learn segmentation model: $WRITETODIR/segmentation.final.gz not written" - exit 1 - fi - - - - echo "Done training." - date -fi -echo "Segmentation model is $WRITETODIR/segmentation.ready." - diff --git a/gi/morf-segmentation/vocabextractor.sh b/gi/morf-segmentation/vocabextractor.sh deleted file mode 100755 index 00ae7109..00000000 --- a/gi/morf-segmentation/vocabextractor.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -d=$(dirname `readlink -f $0`) -if [ $# -lt 1 ]; then - echo "Extracts unique words and their frequencies from a subset of a corpus." - echo - echo "Usage: `basename $0` input_file [number_of_lines] > output_file" - echo -e "\tinput_file contains a sentence per line." - echo - echo "Script also removes words from the vocabulary if they contain a digit or a special character. Output is printed to stdout in a format suitable for use with Morfessor." - echo - exit -fi - -srcname=$1 -reallen=0 - -if [[ $# -gt 1 ]]; then - reallen=$2 -fi - -pattern_file=$d/invalid_vocab.patterns - -if [[ ! 
-f $pattern_file ]]; then - echo "Pattern file missing" - exit 1 -fi - -#this awk strips entries from the vocabulary if they contain invalid characters -#invalid characters are digits and punctuation marks, and words beginning or ending with a dash -#uniq -c extracts the unique words and counts the occurrences - -if [[ $reallen -eq 0 ]]; then - #when a zero is passed, use the whole file - zcat -f $srcname | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' - -else - zcat -f $srcname | head -n $reallen | sed 's/ /\n/g' | egrep -v -f $pattern_file | sort | uniq -c | sed 's/^ *//' -fi - diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am deleted file mode 100644 index 86f8e07b..00000000 --- a/gi/pf/Makefile.am +++ /dev/null @@ -1,44 +0,0 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test bayes_lattice_score - -noinst_LIBRARIES = libpf.a - -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc hpyp_tm.cc pyp_tm.cc - -bayes_lattice_score_SOURCES = bayes_lattice_score.cc -bayes_lattice_score_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -pf_test_SOURCES = pf_test.cc -pf_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -nuisance_test_SOURCES = nuisance_test.cc -nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc -align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -align_tl_SOURCES = align-tl.cc -align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -itg_SOURCES = itg.cc - -pyp_lm_SOURCES = pyp_lm.cc - -learn_cfg_SOURCES = learn_cfg.cc - -condnaive_SOURCES = condnaive.cc - -dpnaive_SOURCES = dpnaive.cc - -pfdist_SOURCES = pfdist.cc - -pfnaive_SOURCES = pfnaive.cc - -cbgi_SOURCES = cbgi.cc - -brat_SOURCES = brat.cc - -pfbrat_SOURCES = pfbrat.cc - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm - -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/README b/gi/pf/README deleted file mode 100644 index 62e47541..00000000 --- a/gi/pf/README +++ /dev/null @@ -1,2 +0,0 @@ -Experimental Bayesian alignment tools. Nothing to see here. 
- diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc deleted file mode 100644 index e7509f57..00000000 --- a/gi/pf/align-lexonly-pyp.cc +++ /dev/null @@ -1,243 +0,0 @@ -#include -#include - -#include -#include - -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "array2d.h" -#include "sampler.h" -#include "corpus.h" -#include "pyp_tm.h" -#include "hpyp_tm.h" -#include "quasi_model2.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") - ("p_null,0", po::value()->default_value(0.08), "probability of aligning to null") - ("align_alpha,a", po::value()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -MT19937* prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -template -struct Aligner { - Aligner(const vector >& lets, - int vocab_size, - int num_letters, - const po::variables_map& conf, - vector* c) : - corpus(*c), - paj_model(conf["align_alpha"].as(), conf["p_null"].as()), - infer_paj(conf.count("infer_alignment_hyperparameters") > 0), - model(lets, vocab_size, num_letters), - kNULL(TD::Convert("NULL")) { - assert(lets[kNULL].size() == 0); - } - - vector& corpus; - QuasiModel2 paj_model; - const bool infer_paj; - LexicalTranslationModel model; - const WordID kNULL; - - void ResampleHyperparameters() { - model.ResampleHyperparameters(prng); - if (infer_paj) paj_model.ResampleHyperparameters(prng); - } - - void InitializeRandom() { - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - unsigned char& a_j = asp.a[j].src_index; - a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - model.Increment(f_a_j, asp.trg[j], &*prng); - paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); - } - } - cerr << "Corpus intialized randomly." 
<< endl; - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; - } - - void ResampleCorpus() { - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - unsigned char& a_j = asp.a[j].src_index; - const WordID e_j = asp.trg[j]; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - model.Decrement(f_a_j, e_j, prng); - paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - ss[prop_a_j] = model.Prob(prop_f, e_j); - ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - model.Increment(f_a_j, e_j, prng); - paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); - } - } - } - - prob_t Likelihood() const { - return model.Likelihood() * paj_model.Likelihood(); - } -}; - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - assert(asp.a[j].src_index <= asp.src.size()); - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - } - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng = new MT19937(conf["random_seed"].as()); - else - prng = new MT19937; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - 
vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - //Aligner aligner(letters, vocabe.size(), letset.size(), conf, &corpus); - Aligner aligner(letters, vocabe.size(), letset.size(), conf, &corpus); - aligner.InitializeRandom(); - - const unsigned samples = conf["samples"].as(); - for (int i = 0; i < samples; ++i) { - for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) { - aligner.ResampleHyperparameters(); - cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() - << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; - } - aligner.ResampleCorpus(); - if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); - } - for (unsigned i = 0; i < corpus.size(); ++i) - WriteAlignments(corpus[i]); - aligner.model.Summary(); - - return 0; -} diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc deleted file mode 100644 index f6608f1d..00000000 --- a/gi/pf/align-tl.cc +++ /dev/null @@ -1,339 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "backward.h" -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "mfcr.h" -#include "corpus.h" -#include "ngram_base.h" -#include "transliterations.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("s2t", po::value(), "character level source-to-target prior transliteration probabilities") - ("t2s", po::value(), "character level target-to-source prior transliteration probabilities") - ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") - ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") - ("expected_src_to_trg_ratio", po::value()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -boost::shared_ptr prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -struct HierarchicalWordBase { - explicit 
HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return Md::log_poisson(s.size(), 7.5) + s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - v[0].logeq(logp0(rule.e_)); - return r.prob(rule.e_, v.begin(), l.begin()); - } - - void Increment(const TRule& rule) { - v[0].logeq(logp0(rule.e_)); - if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { - base *= v[0] * l[0]; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_, &*prng).count) { - base /= prob_t(exp(logp0(rule.e_))); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; - } - - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; - } - - prob_t base; - MFCR<1,vector > r; - const double u0; - const vector l; - mutable vector v; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } - - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; - } - - void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; - } - - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - MConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r, &*prng)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; -} - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); -// MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords() + 1); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - // TODO configure this - const int max_src_chunk = conf["max_src_chunk"].as(); - const int max_trg_chunk = conf["max_trg_chunk"].as(); - const double s2t_rat = conf["expected_src_to_trg_ratio"].as(); - const BackwardEstimator be(conf["s2t"].as(), conf["t2s"].as()); - Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); - - cerr << "Initializing transliteration graph structures ...\n"; - for (int i = 0; i < corpus.size(); ++i) { - const vector& src = corpus[i].src; - const vector& trg = corpus[i].trg; - for (int j = 0; j < src.size(); ++j) { - const vector& src_let = letters[src[j]]; - for (int k = 0; k < trg.size(); ++k) { - const vector& trg_let = letters[trg[k]]; - tl.Initialize(src[j], src_let, trg[k], trg_let); - //if (src_let.size() < min_trans_src) - // tl.Forbid(src[j], src_let, trg[k], 
trg_let); - } - } - } - cerr << endl; - tl.GraphSummary(); - - return 0; -} diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc deleted file mode 100644 index b92629fd..00000000 --- a/gi/pf/backward.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "backward.h" - -#include -#include - -#include "array2d.h" -#include "reachability.h" -#include "base_distributions.h" - -using namespace std; - -BackwardEstimator::BackwardEstimator(const string& s2t, - const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} - -BackwardEstimator::~BackwardEstimator() { - delete m1; m1 = NULL; - delete m1inv; m1inv = NULL; -} - -float BackwardEstimator::ComputeBackwardProb(const std::vector& src, - const std::vector& trg, - unsigned src_covered, - unsigned trg_covered, - double s2t_ratio) const { - if (src_covered == src.size() || trg_covered == trg.size()) { - assert(src_covered == src.size()); - assert(trg_covered == trg.size()); - return 0; - } - static const WordID kNULL = TD::Convert(""); - const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); - // TODO factor in expected length ratio - prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_covered; j < trg.size(); ++j) { - prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); - for (unsigned i = src_covered; i < src.size(); ++i) - p += (*m1)(src[i], trg[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; - assert(!"failed"); - } - p *= uniform_alignment; - e *= p; - } - // TODO factor in expected length ratio - const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); - prob_t inv; - inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); - for (unsigned i = src_covered; i < src.size(); ++i) { - prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); - for (unsigned j = trg_covered; j < trg.size(); ++j) - p += (*m1inv)(trg[j], src[i]); - if (p.is_0()) { - cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; - assert(!"failed"); - } - p *= inv_uniform; - inv *= p; - } - return (log(e) + log(inv)) / 2; -} - -void BackwardEstimator::InitializeGrid(const vector& src, - const vector& trg, - const Reachability& r, - double s2t_ratio, - float* grid) const { - queue > q; - q.push(make_pair(0,0)); - Array2D done(src.size()+1, trg.size()+1, false); - //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; - while(!q.empty()) { - const pair n = q.front(); - q.pop(); - if (done(n.first,n.second)) continue; - done(n.first,n.second) = true; - - float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); - if (n.first == 0 && n.second == 0) grid[0] = lp; - //cerr << " " << n.first << "," << n.second << "\t" << lp << endl; - - if (n.first == src.size() || n.second == trg.size()) continue; - const vector >& edges = r.valid_deltas[n.first][n.second]; - for (int i = 0; i < edges.size(); ++i) - q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); - } - //static int cc = 0; ++cc; if (cc == 80) exit(1); -} - diff --git a/gi/pf/backward.h b/gi/pf/backward.h deleted file mode 100644 index e67eff0c..00000000 --- a/gi/pf/backward.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _BACKWARD_H_ -#define _BACKWARD_H_ - -#include -#include -#include "wordid.h" - -struct Reachability; -struct Model1; - -struct BackwardEstimator { - BackwardEstimator(const std::string& s2t, - 
const std::string& t2s); - ~BackwardEstimator(); - - void InitializeGrid(const std::vector& src, - const std::vector& trg, - const Reachability& r, - double src2trg_ratio, - float* grid) const; - - private: - float ComputeBackwardProb(const std::vector& src, - const std::vector& trg, - unsigned src_covered, - unsigned trg_covered, - double src2trg_ratio) const; - - Model1* m1; - Model1* m1inv; -}; - -#endif diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc deleted file mode 100644 index 57e0bbe1..00000000 --- a/gi/pf/base_distributions.cc +++ /dev/null @@ -1,241 +0,0 @@ -#include "base_distributions.h" - -#include - -#include "filelib.h" - -using namespace std; - -TableLookupBase::TableLookupBase(const string& fname) { - cerr << "TableLookupBase reading from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - vector le, lf; - TRule x; - x.lhs_ = -TD::Convert("X"); - bool flag = false; - while(getline(in, line)) { - ++lc; - if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } - else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } - tmp.clear(); - TD::ConvertSentence(line, &tmp); - x.f_.clear(); - x.e_.clear(); - size_t pos = 0; - int cc = 0; - while(pos < tmp.size()) { - const WordID cur = tmp[pos++]; - if (cur == kDIV) { - ++cc; - } else if (cc == 0) { - x.f_.push_back(cur); - } else if (cc == 1) { - x.e_.push_back(cur); - } else if (cc == 2) { - table[x].logeq(atof(TD::Convert(cur).c_str())); - ++cc; - } else { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (cc != 3) { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (flag) cerr << endl; - cerr << " read " << lc << " entries\n"; -} - -prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform - return p; -} - -prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= kUNIFORM_TARGET; // draw e_i ~Uniform - return p; -} - -void Model1::LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; -} - -prob_t PhraseConditionalBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); - p *= ptrglen; - p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); - - prob_t p1; - p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); - p1 *= ptrglen; - p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 
0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p1.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - prob_t p2; - p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1) - // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); - p2 *= psrclen; - p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform - for (int i = 0; i < flen; ++i) { // for each position i in E - const WordID src = vsrc[i + start_src]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < elen; ++j) { - const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; - tp += kM1MIXTURE * invmodel1(trg, src); - tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; - } - tp *= uniform_trg_alignment; // draw a_i ~uniform - p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p2.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - static const prob_t kHALF(0.5); - return (p1 + p2) * kHALF; -} - -JumpBase::JumpBase() : p(200) { - for (unsigned src_len = 1; src_len < 200; ++src_len) { - map& cpd = p[src_len]; - int min_jump = 1 - src_len; - int max_jump = src_len; - prob_t z; - for (int j = min_jump; j <= max_jump; ++j) { - prob_t& cp = cpd[j]; - if (j < 0) - cp.logeq(Md::log_poisson(1.5-j, 1)); - else if (j > 0) - cp.logeq(Md::log_poisson(j, 1)); - cp.poweq(0.2); - z += cp; - } - for (int j = min_jump; j <= max_jump; ++j) { - cpd[j] /= z; - } - } -} - diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h deleted file mode 100644 index 41b513f8..00000000 --- a/gi/pf/base_distributions.h +++ /dev/null @@ -1,238 +0,0 @@ -#ifndef _BASE_MEASURES_H_ -#define _BASE_MEASURES_H_ - -#include -#include -#include -#include -#include -#include - -#include "unigrams.h" -#include "trule.h" -#include "prob.h" -#include "tdict.h" -#include "sampler.h" -#include "m.h" -#include "os_phrase.h" - -struct Model1 { - explicit Model1(const std::string& fname) : - kNULL(TD::Convert("")), - kZERO() { - LoadModel1(fname); - } - - void LoadModel1(const std::string& fname); - - // returns prob 0 if src or trg is not found - const prob_t& operator()(WordID src, WordID trg) const { - if (src == 0) src = kNULL; - if (src < ttable.size()) { - const std::map& cpd = ttable[src]; - const std::map::const_iterator it = cpd.find(trg); - if (it != cpd.end()) - return it->second; - } - return kZERO; - } - - const WordID kNULL; - const prob_t kZERO; - std::vector > ttable; -}; - -struct PoissonUniformUninformativeBase { - explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); - prob_t q = kUNIFORM; q.poweq(r.e_.size()); - p *= q; - return p; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct CompletelyUniformBase { - explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule&) const { - return kUNIFORM; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct 
UnigramWordBase { - explicit UnigramWordBase(const std::string& fname) : un(fname) {} - prob_t operator()(const TRule& r) const { - return un(r.e_); - } - const UnigramWordModel un; -}; - -struct RuleHasher { - size_t operator()(const TRule& r) const { - return hash_value(r); - } -}; - -struct TableLookupBase { - TableLookupBase(const std::string& fname); - - prob_t operator()(const TRule& rule) const { - const std::tr1::unordered_map::const_iterator it = table.find(rule); - if (it == table.end()) { - std::cerr << rule << " not found\n"; - abort(); - } - return it->second; - } - - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - void Summary() const {} - - std::tr1::unordered_map table; -}; - -struct PhraseConditionalUninformativeBase { - explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseConditionalUninformativeUnigramBase { - explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const UnigramModel u; -}; - -struct PhraseConditionalBase { - explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase { - explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase_BiDir { - explicit 
PhraseJointBase_BiDir(const Model1& m1, - const Model1& im1, - const double m1mixture, - const unsigned vocab_e_size, - const unsigned vocab_f_size) : - model1(m1), - invmodel1(im1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const Model1& invmodel1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -// base distribution for jump size multinomials -// basically p(0) = 0 and then, p(1) is max, and then -// you drop as you move to the max jump distance -struct JumpBase { - JumpBase(); - - const prob_t& operator()(int jump, unsigned src_len) const { - assert(jump != 0); - const std::map::const_iterator it = p[src_len].find(jump); - assert(it != p[src_len].end()); - return it->second; - } - std::vector > p; -}; - - -#endif diff --git a/gi/pf/bayes_lattice_score.cc b/gi/pf/bayes_lattice_score.cc deleted file mode 100644 index 70cb8dc2..00000000 --- a/gi/pf/bayes_lattice_score.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "hg_io.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -unsigned ReadCorpus(const string& filename, - vector* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - ReadFile rf(filename); - istream* in = rf.stream(); - assert(*in); - string line; - unsigned toks = 0; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(Lattice()); - Lattice& le = e->back(); - LatticeTools::ConvertTextOrPLF(line, & le); - for (unsigned i = 0; i < le.size(); ++i) - for (unsigned j = 0; j < le[i].size(); ++j) - vocab_e->insert(le[i][j].label); - toks += le.size(); - } - return toks; -} - -struct BaseModel { - explicit BaseModel(unsigned 
tc) : - unif(1.0 / tc), p(prob_t::One()) {} - prob_t prob(const TRule& r) const { - return unif; - } - void increment(const TRule& r, MT19937* rng) { - p *= prob(r); - } - void decrement(const TRule& r, MT19937* rng) { - p /= prob(r); - } - prob_t Likelihood() const { - return p; - } - const prob_t unif; - prob_t p; -}; - -struct UnigramModel { - explicit UnigramModel(unsigned tc) : base(tc), crp(1,1,1,1), glue(1,1,1,1) {} - BaseModel base; - CCRP crp; - CCRP glue; - - prob_t Prob(const TRule& r) const { - if (r.Arity() != 0) { - return glue.prob(r, prob_t(0.5)); - } - return crp.prob(r, base.prob(r)); - } - - int Increment(const TRule& r, MT19937* rng) { - if (r.Arity() != 0) { - glue.increment(r, 0.5, rng); - return 0; - } else { - if (crp.increment(r, base.prob(r), rng)) { - base.increment(r, rng); - return 1; - } - return 0; - } - } - - int Decrement(const TRule& r, MT19937* rng) { - if (r.Arity() != 0) { - glue.decrement(r, rng); - return 0; - } else { - if (crp.decrement(r, rng)) { - base.decrement(r, rng); - return -1; - } - return 0; - } - } - - prob_t Likelihood() const { - prob_t p; - p.logeq(crp.log_crp_prob() + glue.log_crp_prob()); - p *= base.Likelihood(); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - crp.resample_hyperparameters(rng); - glue.resample_hyperparameters(rng); - cerr << " d=" << crp.discount() << ", s=" << crp.strength() << "\t STOP d=" << glue.discount() << ", s=" << glue.strength() << endl; - } -}; - -UnigramModel* plm; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { - vector node_probs; - Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 2); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } -// for (unsigned i = 0; i < sampled_deriv->size(); ++i) { -// cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; -// } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, UnigramModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -prob_t TotalProb(const Hypergraph& hg) { - return Inside(hg); -} - -void IncrementLatticePath(const Hypergraph& hg, const vector& d, Lattice* pl) { - Lattice& lat 
= *pl; - for (int i = 0; i < d.size(); ++i) { - const Hypergraph::Edge& edge = hg.edges_[d[i]]; - if (edge.rule_->Arity() != 0) continue; - WordID sym = edge.rule_->e_[0]; - vector& las = lat[edge.i_]; - int dist = edge.j_ - edge.i_; - assert(dist > 0); - for (int j = 0; j < las.size(); ++j) { - if (las[j].dist2next == dist && - las[j].label == sym) { - las[j].cost += 1; - } - } - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - - InitCommandLine(argc, argv, &conf); - vector grammars(2); - grammars[0].reset(new GlueGrammar("S","X")); - const unsigned samples = conf["samples"].as(); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - vector corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " lattices\t (" << vocabe.size() << " word types)\n"; - UnigramModel lm(vocabe.size()); - vector hgs(corpuse.size()); - vector > derivs(corpuse.size()); - for (int i = 0; i < corpuse.size(); ++i) { - grammars[1].reset(new PassThroughGrammar(corpuse[i], "X")); - ExhaustiveBottomUpParser parser("S", grammars); - bool res = parser.Parse(corpuse[i], &hgs[i]); // exhaustive parse - assert(res); - } - - double csamples = 0; - for (int SS=0; SS < samples; ++SS) { - const bool is_last = ((samples - 1) == SS); - prob_t dlh = prob_t::One(); - bool record_sample = (SS > (samples * 1 / 3) && (SS % 5 == 3)); - if (record_sample) csamples++; - for (int ci = 0; ci < corpuse.size(); ++ci) { - Lattice& lat = corpuse[ci]; - Hypergraph& hg = hgs[ci]; - vector& d = derivs[ci]; - if (!is_last) DecrementDerivation(hg, d, &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.Arity() != 0) - hg.edges_[i].edge_prob_ = prob_t::One(); - else - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - if (!is_last) { - d.clear(); - SampleDerivation(hg, &rng, &d); - IncrementDerivation(hg, derivs[ci], &lm, &rng); - } else { - prob_t p = TotalProb(hg); - dlh *= p; - cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; - } - if (record_sample) IncrementLatticePath(hg, derivs[ci], &lat); - } - double llh = log(lm.Likelihood()); - cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; - if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); - if (is_last) { - double z = log(dlh); - cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; - } - } - cerr << lm.crp << endl; - cerr << lm.glue << endl; - for (int i = 0; i < corpuse.size(); ++i) { - for (int j = 0; j < corpuse[i].size(); ++j) - for (int k = 0; k < corpuse[i][j].size(); ++k) { - corpuse[i][j][k].cost /= csamples; - corpuse[i][j][k].cost += 1e-3; - corpuse[i][j][k].cost = log(corpuse[i][j][k].cost); - } - cout << HypergraphIO::AsPLF(corpuse[i]) << endl; - } - return 0; -} - diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc deleted file mode 100644 index 832f22cf..00000000 --- a/gi/pf/brat.cc +++ /dev/null @@ -1,543 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "cfg_wfst_composer.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - 
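// A minimal sketch (not from the original sources) of the weighted draw that
// SampleSet + MT19937::SelectSample perform throughout these samplers: pick
// index i with probability w[i] / sum_j w[j]. Standard library only; the
// function name is illustrative.
#include <cstddef>
#include <random>
#include <vector>

std::size_t SelectProportional(const std::vector<double>& w, std::mt19937& rng) {
  // discrete_distribution normalizes the (unnormalized) weights internally.
  std::discrete_distribution<std::size_t> d(w.begin(), w.end());
  return d(rng);
}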
-static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -struct ConditionalBase { - explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size), - kNULL(TD::Convert("")) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - LoadModel1(model1fname); - } - - void LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; - } - - // return logp0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - const int flen = rule.f_.size(); - const int elen = rule.e_.size(); - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = rule.e_[i]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 
kNULL : rule.f_[j]; - const map::const_iterator it = ttable[src].find(trg); - if (it != ttable[src].end()) { - tp += kM1MIXTURE * it->second; - } - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - return p; - } - - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; - const WordID kNULL; - vector > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(3),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(3),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -struct UniphraseLM { - UniphraseLM(const vector >& corpus, - const set& vocab, - const po::variables_map& conf) : - phrases_(1,1), - gen_(1,1), - corpus_(corpus), - uniform_word_(1.0 / vocab.size()), - gen_p0_(0.5), - p_end_(0.5), - use_poisson_(conf.count("poisson_length") > 0) {} - - void ResampleHyperparameters(MT19937* rng) { - phrases_.resample_hyperparameters(rng); - gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.alpha(); - } - - CCRP_NoTable > phrases_; - CCRP_NoTable gen_; - vector > z_; // z_[i] is there a phrase boundary after the ith word - const vector >& corpus_; - const double uniform_word_; - const double gen_p0_; - const double p_end_; // in base length distribution, p of the end of 
a phrase - const bool use_poisson_; -}; - -struct Reachability { - boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? - boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid - - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : - edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); - } - - private: - struct SState { - SState() : prev_src_covered(), prev_trg_covered() {} - SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} - int prev_src_covered; - int prev_trg_covered; - }; - - struct NState { - NState() : next_src_covered(), next_trg_covered() {} - NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} - int next_src_covered; - int next_trg_covered; - }; - - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { - typedef boost::multi_array, 2> array_type; - array_type a(boost::extents[srclen + 1][trglen + 1]); - a[0][0].push_back(SState()); - for (int i = 0; i < srclen; ++i) { - for (int j = 0; j < trglen; ++j) { - if (a[i][j].size() == 0) continue; - const SState prev(i,j); - for (int k = 1; k <= src_max_phrase_len; ++k) { - if ((i + k) > srclen) continue; - for (int l = 1; l <= trg_max_phrase_len; ++l) { - if ((j + l) > trglen) continue; - a[i + k][j + l].push_back(prev); - } - } - } - } - a[0][0].clear(); - cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - assert(a[srclen][trglen].size() > 0); - - typedef boost::multi_array rarray_type; - rarray_type r(boost::extents[srclen + 1][trglen + 1]); -// typedef boost::multi_array, 2> narray_type; -// narray_type b(boost::extents[srclen + 1][trglen + 1]); - r[srclen][trglen] = true; - for (int i = srclen; i >= 0; --i) { - for (int j = trglen; j >= 0; --j) { - vector& prevs = a[i][j]; - if (!r[i][j]) { prevs.clear(); } -// const NState nstate(i,j); - for (int k = 0; k < prevs.size(); ++k) { - r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; - int src_delta = i - prevs[k].prev_src_covered; - edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; - short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; - if (src_delta > msd) msd = src_delta; -// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); - } - } - } - assert(!edges[0][0][1][0]); - assert(!edges[0][0][0][1]); - assert(!edges[0][0][0][0]); - cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; - assert(max_src_delta[0][0] > 0); - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} - } -}; - -ostream& operator<<(ostream& os, const FSTState& q); -struct FSTState { - explicit FSTState(int src_size) : - trg_covered_(), - src_covered_(), - src_coverage_(src_size) {} - - FSTState(short trg_covered, short src_covered, const vector& src_coverage, const vector& src_prefix) : - trg_covered_(trg_covered), - src_covered_(src_covered), - src_coverage_(src_coverage), - src_prefix_(src_prefix) { - if (src_coverage_.size() == src_covered) { - assert(src_prefix.size() == 0); - } - } - - // if we 
extend by the word at src_position, what are - // the next states that are reachable and lie on a valid - // path to the final state? - vector Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { - assert(src_position < src_coverage_.size()); - if (src_coverage_[src_position]) { - cerr << "Trying to extend " << *this << " with position " << src_position << endl; - abort(); - } - vector ncvg = src_coverage_; - ncvg[src_position] = true; - - vector res; - const int trg_remaining = trg_len - trg_covered_; - if (trg_remaining <= 0) { - cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; - abort(); - } - const int src_remaining = src_len - src_covered_; - if (src_remaining <= 0) { - cerr << "Source appears to have been covered: " << *this << endl; - abort(); - } - - for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { - if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { - int nc = src_prefix_.size() + 1 + src_covered_; - res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector())); - } - } - - if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { - vector nsp = src_prefix_; - nsp.push_back(src_position); - res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); - } - - if (res.size() == 0) { - cerr << *this << " can't be extended!\n"; - abort(); - } - return res; - } - - short trg_covered_, src_covered_; - vector src_coverage_; - vector src_prefix_; -}; -bool operator<(const FSTState& q, const FSTState& r) { - if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; - if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; - if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; - return q.src_prefix_ < r.src_prefix_; -} - -ostream& operator<<(ostream& os, const FSTState& q) { - os << "[" << q.trg_covered_ << " : "; - for (int i = 0; i < q.src_coverage_.size(); ++i) - os << q.src_coverage_[i]; - os << " : <"; - for (int i = 0; i < q.src_prefix_.size(); ++i) { - if (i != 0) os << ' '; - os << q.src_prefix_[i]; - } - return os << ">]"; -} - -struct MyModel { - MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} - typedef unordered_map, CCRP_NoTable, boost::hash > > SrcToRuleCRPMap; - - void DecrementRule(const TRule& rule) { - SrcToRuleCRPMap::iterator it = rules.find(rule.f_); - assert(it != rules.end()); - it->second.decrement(rule); - if (it->second.num_customers() == 0) rules.erase(it); - } - - void IncrementRule(const TRule& rule) { - SrcToRuleCRPMap::iterator it = rules.find(rule.f_); - if (it == rules.end()) { - CCRP_NoTable crp(1,1); - it = rules.insert(make_pair(rule.f_, crp)).first; - } - it->second.increment(rule); - } - - // conditioned on rule.f_ - prob_t RuleConditionalProbability(const TRule& rule) const { - const prob_t base = rp0(rule); - SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); - if (it == rules.end()) { - return base; - } else { - const double lp = it->second.logprob(rule, log(base)); - prob_t q; q.logeq(lp); - return q; - } - } - - const ConditionalBase& rp0; - SrcToRuleCRPMap rules; -}; - -struct MyFST : public WFST { - MyFST(const vector& ssrc, const vector& strg, MyModel* m) : - src(ssrc), trg(strg), - r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), - model(m) { - FSTState in(src.size()); - cerr << " INIT: " << in << endl; - init = GetNode(in); - for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; 
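- // Construct the unique FINAL state: every source position is marked covered,
- // and the covered counts are set below to the full source and target lengths
- // before the state is interned via GetNode().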
- in.src_covered_ = src.size(); - in.trg_covered_ = trg.size(); - cerr << "FINAL: " << in << endl; - final = GetNode(in); - } - virtual const WFSTNode* Final() const; - virtual const WFSTNode* Initial() const; - - const WFSTNode* GetNode(const FSTState& q); - map > m; - const vector& src; - const vector& trg; - Reachability r; - const WFSTNode* init; - const WFSTNode* final; - MyModel* model; -}; - -struct MyNode : public WFSTNode { - MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} - virtual vector > ExtendInput(unsigned srcindex) const; - const FSTState state; - mutable MyFST* container; -}; - -vector > MyNode::ExtendInput(unsigned srcindex) const { - cerr << "EXTEND " << state << " with " << srcindex << endl; - vector ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); - vector > res(ext.size()); - for (unsigned i = 0; i < ext.size(); ++i) { - res[i].first = container->GetNode(ext[i]); - if (ext[i].src_prefix_.size() == 0) { - const unsigned trg_from = state.trg_covered_; - const unsigned trg_to = ext[i].trg_covered_; - const unsigned prev_prfx_size = state.src_prefix_.size(); - res[i].second.reset(new TRule); - res[i].second->lhs_ = -TD::Convert("X"); - vector& src = res[i].second->f_; - vector& trg = res[i].second->e_; - src.resize(prev_prfx_size + 1); - for (unsigned j = 0; j < prev_prfx_size; ++j) - src[j] = container->src[state.src_prefix_[j]]; - src[prev_prfx_size] = container->src[srcindex]; - for (unsigned j = trg_from; j < trg_to; ++j) - trg.push_back(container->trg[j]); - res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); - } - } - return res; -} - -const WFSTNode* MyFST::GetNode(const FSTState& q) { - boost::shared_ptr& res = m[q]; - if (!res) { - res.reset(new MyNode(q, this)); - } - return &*res; -} - -const WFSTNode* MyFST::Final() const { - return final; -} - -const WFSTNode* MyFST::Initial() const { - return init; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); - - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - boost::shared_ptr prng; - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - ConditionalBase lp0(conf["model1_interpolation_weight"].as(), - vocabe.size(), - conf["model1"].as()); - MyModel m(lp0); - - TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); - m.IncrementRule(x); - TRule y("[X] ||| nY dyN ||| gave ||| 0"); - m.IncrementRule(y); - - - MyFST fst(corpusf[0], corpuse[0], &m); - ifstream in("./kimura.g"); - assert(in); - CFG_WFSTComposer comp(fst); - Hypergraph hg; - bool succeed = comp.Compose(&in, &hg); - hg.PrintGraphviz(); - if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } - -#if 0 - ifstream in2("./amnabooks.g"); - assert(in2); - MyFST 
fst2(corpusf[1], corpuse[1], &m); - CFG_WFSTComposer comp2(fst2); - Hypergraph hg2; - bool succeed2 = comp2.Compose(&in2, &hg2); - if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } -#endif - - SparseVector w; w.set_value(FD::Convert("Proposal"), 1.0); - hg.Reweight(w); - cerr << ViterbiFTree(hg) << endl; - return 0; -} - diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc deleted file mode 100644 index 97f1ba34..00000000 --- a/gi/pf/cbgi.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "sampler.h" -#include "filelib.h" -#include "hg_io.h" -#include "hg.h" -#include "ccrp_nt.h" -#include "trule.h" -#include "inside_outside.h" - -using namespace std; -using namespace std::tr1; - -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -double log_decay(unsigned x, const double& b) { - assert(b > 1.0); - assert(x > 0); - return log(b - 1) - x * log(b); -} - -struct SimpleBase { - SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : - uniform_e(-log(esize)), - uniform_f(-log(fsize)), - uniform_nt(-log(ntsize)) { - } - - // binomial coefficient - static double choose(unsigned n, unsigned k) { - return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)); - } - - // count the number of patterns of terminals and NTs in the rule, given elen and flen - static double log_number_of_patterns(const unsigned flen, const unsigned elen) { - static vector > counts; - if (elen >= counts.size()) counts.resize(elen + 1); - if (flen >= counts[elen].size()) counts[elen].resize(flen + 1); - double& count = counts[elen][flen]; - if (count) return log(count); - const unsigned max_arity = min(elen, flen); - for (unsigned a = 0; a <= max_arity; ++a) - count += choose(elen, a) * choose(flen, a); - return log(count); - } - - // return logp0 of rule | LHS - double operator()(const TRule& rule) const { - const unsigned flen = rule.f_.size(); - const unsigned elen = rule.e_.size(); -#if 0 - double p = 0; - p += log_poisson(flen, 0.5); // flen ~Pois(0.5) - p += log_poisson(elen, flen); // elen | flen ~Pois(flen) - p -= log_number_of_patterns(flen, elen); // pattern | flen,elen ~Uniform - for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS - if (rule.f_[i] <= 0) // according to pattern - p += uniform_nt; // draw NT ~Uniform - else - p += uniform_f; // draw f terminal ~Uniform - } - p -= lgamma(rule.Arity() + 1); // draw permutation ~Uniform - for (unsigned i = 0; i < elen; ++i) { // for each position in e-RHS - if (rule.e_[i] > 0) // according to pattern - p += uniform_e; // draw e|f term ~Uniform - // TODO this should prob be model 1 - } -#else - double p = 0; - bool is_abstract = rule.f_[0] <= 0; - p += log(0.5); - if (is_abstract) { - if (flen == 2) p += log(0.99); else p += log(0.01); - } else { - p += log_decay(flen, 3); - } - - for (unsigned i = 0; i < flen; ++i) { // for each position in f-RHS - if (rule.f_[i] <= 0) // according to pattern - p += uniform_nt; // draw NT ~Uniform - else - p += uniform_f; // draw f terminal ~Uniform - } -#endif - return p; - } - const double uniform_e; - const double uniform_f; - const double uniform_nt; - vector arities; -}; - -MT19937* rng = NULL; - -template -struct MHSamplerEdgeProb { - MHSamplerEdgeProb(const Hypergraph& hg, - const map >& rdp, - const Base& logp0, - const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) { - for (int i = 0; i < edge_probs.size(); ++i) { - const TRule& 
rule = *hg.edges_[i].rule_; - const map >::const_iterator it = rdp.find(rule.lhs_); - assert(it != rdp.end()); - const CCRP_NoTable& crp = it->second; - edge_probs[i].logeq(crp.logprob(rule, logp0(rule))); - if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1) - edge_probs[i] = prob_t::Zero(); - } - } - inline prob_t operator()(const Hypergraph::Edge& e) const { - return edge_probs[e.id_]; - } - prob_t DerivationProb(const vector& d) const { - prob_t p = prob_t::One(); - for (unsigned i = 0; i < d.size(); ++i) - p *= edge_probs[d[i]]; - return p; - } - vector edge_probs; -}; - -template -struct ModelAndData { - ModelAndData() : - base_lh(prob_t::One()), - logp0(10000, 10000), - mh_samples(), - mh_rejects() {} - - void SampleCorpus(const string& hgpath, int i); - void ResampleHyperparameters() { - for (map >::iterator it = rules.begin(); it != rules.end(); ++it) - it->second.resample_hyperparameters(rng); - } - - CCRP_NoTable& RuleCRP(int lhs) { - map >::iterator it = rules.find(lhs); - if (it == rules.end()) { - rules.insert(make_pair(lhs, CCRP_NoTable(1,1))); - it = rules.find(lhs); - } - return it->second; - } - - void IncrementRule(const TRule& rule) { - CCRP_NoTable& crp = RuleCRP(rule.lhs_); - if (crp.increment(rule)) { - prob_t p; p.logeq(logp0(rule)); - base_lh *= p; - } - } - - void DecrementRule(const TRule& rule) { - CCRP_NoTable& crp = RuleCRP(rule.lhs_); - if (crp.decrement(rule)) { - prob_t p; p.logeq(logp0(rule)); - base_lh /= p; - } - } - - void DecrementDerivation(const Hypergraph& hg, const vector& d) { - for (unsigned i = 0; i < d.size(); ++i) { - const TRule& rule = *hg.edges_[d[i]].rule_; - DecrementRule(rule); - } - } - - void IncrementDerivation(const Hypergraph& hg, const vector& d) { - for (unsigned i = 0; i < d.size(); ++i) { - const TRule& rule = *hg.edges_[d[i]].rule_; - IncrementRule(rule); - } - } - - prob_t Likelihood() const { - prob_t p = prob_t::One(); - for (map >::const_iterator it = rules.begin(); it != rules.end(); ++it) { - prob_t q; q.logeq(it->second.log_crp_prob()); - p *= q; - } - p *= base_lh; - return p; - } - - void ResampleDerivation(const Hypergraph& hg, vector* sampled_derivation); - - map > rules; // [lhs] -> distribution over RHSs - prob_t base_lh; - SimpleBase logp0; - vector > samples; // sampled derivations - unsigned int mh_samples; - unsigned int mh_rejects; -}; - -template -void ModelAndData::SampleCorpus(const string& hgpath, int n) { - vector hgs(n); hgs.clear(); - boost::unordered_map acc; - map tot; - for (int i = 0; i < n; ++i) { - ostringstream os; - os << hgpath << '/' << i << ".json.gz"; - if (!FileExists(os.str())) continue; - hgs.push_back(Hypergraph()); - ReadFile rf(os.str()); - HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back()); - } - cerr << "Read " << hgs.size() << " alignment hypergraphs.\n"; - samples.resize(hgs.size()); - const unsigned SAMPLES = 2000; - const unsigned burnin = 3 * SAMPLES / 4; - const unsigned every = 20; - for (unsigned s = 0; s < SAMPLES; ++s) { - if (s % 10 == 0) { - if (s > 0) { cerr << endl; ResampleHyperparameters(); } - cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] "; - } - cerr << '.'; - for (unsigned i = 0; i < hgs.size(); ++i) { - ResampleDerivation(hgs[i], &samples[i]); - if (s > burnin && s % every == 0) { - for (unsigned j = 0; j < samples[i].size(); ++j) { - const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_; - ++acc[rule]; - ++tot[rule.lhs_]; 
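- // After burn-in, every 20th sweep contributes to the posterior estimate:
- // acc counts how often each rule appears in a sampled derivation, tot counts
- // draws per LHS, and the report below prints log(acc) - log(tot[lhs]) per rule.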
- } - } - } - } - cerr << endl; - for (boost::unordered_map::iterator it = acc.begin(); it != acc.end(); ++it) { - cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl; - } -} - -template -void ModelAndData::ResampleDerivation(const Hypergraph& hg, vector* sampled_deriv) { - vector cur; - cur.swap(*sampled_deriv); - - const prob_t p_cur = Likelihood(); - DecrementDerivation(hg, cur); - if (cur.empty()) { - // first iteration, create restaurants - for (int i = 0; i < hg.edges_.size(); ++i) - RuleCRP(hg.edges_[i].rule_->lhs_); - } - MHSamplerEdgeProb wf(hg, rules, logp0, cur.empty()); -// MHSamplerEdgeProb wf(hg, rules, logp0, false); - const prob_t q_cur = wf.DerivationProb(cur); - vector node_probs; - Inside >(hg, &node_probs, wf); - queue q; - q.push(hg.nodes_.size() - 3); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = wf.edge_probs[edge.id_]; // edge proposal prob - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - IncrementDerivation(hg, *sampled_deriv); - -// cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n"; -// cerr << "DERIV:\n"; -// for (int i = 0; i < sampled_deriv->size(); ++i) { -// cerr << " " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl; -// } - - if (cur.empty()) return; // accept first sample - - ++mh_samples; - // only need to do MH if proposal is different to current state - if (cur != *sampled_deriv) { - const prob_t q_prop = wf.DerivationProb(*sampled_deriv); - const prob_t p_prop = Likelihood(); - if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) { - ++mh_rejects; - DecrementDerivation(hg, *sampled_deriv); - IncrementDerivation(hg, cur); - swap(cur, *sampled_deriv); - } - } -} - -int main(int argc, char** argv) { - rng = new MT19937; - ModelAndData m; - m.SampleCorpus("./hgs", 50); - // m.SampleCorpus("./btec/hgs", 5000); - return 0; -} - diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc deleted file mode 100644 index 21d5ec5b..00000000 --- a/gi/pf/cfg_wfst_composer.cc +++ /dev/null @@ -1,731 +0,0 @@ -#include "cfg_wfst_composer.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include "fast_lexical_cast.hpp" - -#include "phrasetable_fst.h" -#include "sparse_vector.h" -#include "tdict.h" -#include "hg.h" -#include "hg_remove_eps.h" - -namespace po = boost::program_options; -using namespace std; -using namespace std::tr1; - -WFSTNode::~WFSTNode() {} 
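-// Defining the virtual destructors out of line gives the abstract WFST and
-// WFSTNode interfaces a key function in this translation unit; the composer
-// itself only uses them through Initial(), Final(), and ExtendInput().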
diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc
deleted file mode 100644
index 21d5ec5b..00000000
--- a/gi/pf/cfg_wfst_composer.cc
+++ /dev/null
@@ -1,731 +0,0 @@
-#include "cfg_wfst_composer.h"
-
-#include <iostream>
-#include <fstream>
-#include <map>
-#include <queue>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include "fast_lexical_cast.hpp"
-
-#include "phrasetable_fst.h"
-#include "sparse_vector.h"
-#include "tdict.h"
-#include "hg.h"
-#include "hg_remove_eps.h"
-
-namespace po = boost::program_options;
-using namespace std;
-using namespace std::tr1;
-
-WFSTNode::~WFSTNode() {}
-WFST::~WFST() {}
-
-// Define the following macro if you want to see lots of debugging output
-// when you run the chart parser
-#undef DEBUG_CHART_PARSER
-
-// A few constants used by the chart parser ///////////////
-static const int kMAX_NODES = 2000000;
-static const string kPHRASE_STRING = "X";
-static bool constants_need_init = true;
-static WordID kUNIQUE_START;
-static WordID kPHRASE;
-static TRulePtr kX1X2;
-static TRulePtr kX1;
-static WordID kEPS;
-static TRulePtr kEPSRule;
-
-static void InitializeConstants() {
-  if (constants_need_init) {
-    kPHRASE = TD::Convert(kPHRASE_STRING) * -1;
-    kUNIQUE_START = TD::Convert("S") * -1;
-    kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]"));
-    kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]"));
-    kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>"));
-    kEPS = TD::Convert("<eps>");
-    constants_need_init = false;
-  }
-}
-////////////////////////////////////////////////////////////
-
-class EGrammarNode {
-  friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest);
-  friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g);
- public:
-#ifdef DEBUG_CHART_PARSER
-  string hint;
-#endif
-  EGrammarNode() : is_some_rule_complete(false), is_root(false) {}
-  const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; }
-  const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; }
-  bool HasNonTerminals() const { return (!ntptr.empty()); }
-  bool HasTerminals() const { return (!tptr.empty()); }
-  bool RuleCompletes() const {
-    return (is_some_rule_complete || (ntptr.empty() && tptr.empty()));
-  }
-  bool GrammarContinues() const {
-    return !(ntptr.empty() && tptr.empty());
-  }
-  bool IsRoot() const {
-    return is_root;
-  }
-  // these are the features associated with the rule from the start
-  // node up to this point.  If you use these features, you must
-  // not Extend() this rule.
-  const SparseVector<double>& GetCFGProductionFeatures() const {
-    return input_features;
-  }
-
-  const EGrammarNode* Extend(const WordID& t) const {
-    if (t < 0) {
-      map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t);
-      if (it == ntptr.end()) return NULL;
-      return &it->second;
-    } else {
-      map<WordID, EGrammarNode>::const_iterator it = tptr.find(t);
-      if (it == tptr.end()) return NULL;
-      return &it->second;
-    }
-  }
-
- private:
-  map<WordID, EGrammarNode> tptr;
-  map<WordID, EGrammarNode> ntptr;
-  SparseVector<double> input_features;
-  bool is_some_rule_complete;
-  bool is_root;
-};
-typedef map<WordID, EGrammarNode> EGrammar;  // indexed by the rule LHS
-
-// edges are immutable once created
-struct Edge {
-#ifdef DEBUG_CHART_PARSER
-  static int id_count;
-  const int id;
-#endif
-  const WordID cat;                  // lhs side of rule proved/being proved
-  const EGrammarNode* const dot;     // dot position
-  const WFSTNode* const q;           // start of span
-  const WFSTNode* const r;           // end of span
-  const Edge* const active_parent;   // back pointer, NULL for PREDICT items
-  const Edge* const passive_parent;  // back pointer, NULL for SCAN and PREDICT items
-  TRulePtr tps;                      // translations
-  boost::shared_ptr<SparseVector<double> > features;  // features from CFG rule
-
-  bool IsPassive() const {
-    // when a rule is completed, this value will be set
-    return static_cast<bool>(features);
-  }
-  bool IsActive() const { return !IsPassive(); }
-  bool IsInitial() const {
-    return !(active_parent || passive_parent);
-  }
-  bool IsCreatedByScan() const {
-    return active_parent && !passive_parent && !dot->IsRoot();
-  }
-  bool IsCreatedByPredict() const {
-    return dot->IsRoot();
-  }
-  bool IsCreatedByComplete() const {
-    return active_parent && passive_parent;
-  }
-
-  // constructor for PREDICT
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {}
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {}
-
-  // constructors for SCAN
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
-       const Edge* act_par, const TRulePtr& translations) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {}
-
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
-       const Edge* act_par, const TRulePtr& translations,
-       const SparseVector<double>& feats) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations),
-    features(new SparseVector<double>(feats)) {}
-
-  // constructors for COMPLETE
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
-       const Edge* act_par, const Edge* pas_par) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() {
-      assert(pas_par->IsPassive());
-      assert(act_par->IsActive());
-    }
-
-  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
-       const Edge* act_par, const Edge* pas_par, const SparseVector<double>& feats) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(),
-    features(new SparseVector<double>(feats)) {
-      assert(pas_par->IsPassive());
-      assert(act_par->IsActive());
-    }
-
-  // constructor for COMPLETE
query - Edge(const WFSTNode* _r) : -#ifdef DEBUG_CHART_PARSER - id(0), -#endif - cat(0), dot(NULL), q(NULL), - r(_r), active_parent(NULL), passive_parent(NULL), tps() {} - // constructor for MERGE quere - Edge(const WFSTNode* _q, int) : -#ifdef DEBUG_CHART_PARSER - id(0), -#endif - cat(0), dot(NULL), q(_q), - r(NULL), active_parent(NULL), passive_parent(NULL), tps() {} -}; -#ifdef DEBUG_CHART_PARSER -int Edge::id_count = 0; -#endif - -ostream& operator<<(ostream& os, const Edge& e) { - string type = "PREDICT"; - if (e.IsCreatedByScan()) - type = "SCAN"; - else if (e.IsCreatedByComplete()) - type = "COMPLETE"; - os << "[" -#ifdef DEBUG_CHART_PARSER - << '(' << e.id << ") " -#else - << '(' << &e << ") " -#endif - << "q=" << e.q << ", r=" << e.r - << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" - << e.dot -#ifdef DEBUG_CHART_PARSER - << e.dot->hint -#endif - << (e.IsActive() ? ", Active" : ", Passive") - << ", " << type; -#ifdef DEBUG_CHART_PARSER - if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } - if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } -#endif - if (e.tps) { os << ", tps=" << e.tps->AsString(); } - return os << ']'; -} - -struct Traversal { - const Edge* const edge; // result from the active / passive combination - const Edge* const active; - const Edge* const passive; - Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} -}; - -struct UniqueTraversalHash { - size_t operator()(const Traversal* t) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(t->active); - x = ((x << 5) + x) ^ reinterpret_cast(t->passive); - x = ((x << 5) + x) ^ t->edge->IsActive(); - return x; - } -}; - -struct UniqueTraversalEquals { - size_t operator()(const Traversal* a, const Traversal* b) const { - return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); - } -}; - -struct UniqueEdgeHash { - size_t operator()(const Edge* e) const { - size_t x = 5381; - if (e->IsActive()) { - x = ((x << 5) + x) ^ reinterpret_cast(e->dot); - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - x = ((x << 5) + x) ^ static_cast(e->cat); - x += 13; - } else { // with passive edges, we don't care about the dot - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - x = ((x << 5) + x) ^ static_cast(e->cat); - } - return x; - } -}; - -struct UniqueEdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - if (a->IsActive() != b->IsActive()) return false; - if (a->IsActive()) { - return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); - } else { - return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); - } - } -}; - -struct REdgeHash { - size_t operator()(const Edge* e) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(e->r); - return x; - } -}; - -struct REdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - return (a->r == b->r); - } -}; - -struct QEdgeHash { - size_t operator()(const Edge* e) const { - size_t x = 5381; - x = ((x << 5) + x) ^ reinterpret_cast(e->q); - return x; - } -}; - -struct QEdgeEquals { - bool operator()(const Edge* a, const Edge* b) const { - return (a->q == b->q); - } -}; - -struct EdgeQueue { - queue q; - EdgeQueue() {} - void clear() { while(!q.empty()) q.pop(); } - bool HasWork() const { return !q.empty(); } - const Edge* Next() { const Edge* res = q.front(); q.pop(); return 
res; } - void AddEdge(const Edge* s) { q.push(s); } -}; - -class CFG_WFSTComposerImpl { - public: - CFG_WFSTComposerImpl(WordID start_cat, - const WFSTNode* q_0, - const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {} - - // returns false if the intersection is empty - bool Compose(const EGrammar& g, Hypergraph* forest) { - goal_node = NULL; - EGrammar::const_iterator sit = g.find(start_cat_); - forest->ReserveNodes(kMAX_NODES); - assert(sit != g.end()); - Edge* init = new Edge(start_cat_, &sit->second, q_0_); - assert(IncorporateNewEdge(init)); - while (exp_agenda.HasWork() || agenda.HasWork()) { - while(exp_agenda.HasWork()) { - const Edge* edge = exp_agenda.Next(); - FinishEdge(edge, forest); - } - if (agenda.HasWork()) { - const Edge* edge = agenda.Next(); -#ifdef DEBUG_CHART_PARSER - cerr << "processing (" << edge->id << ')' << endl; -#endif - if (edge->IsActive()) { - if (edge->dot->HasTerminals()) - DoScan(edge); - if (edge->dot->HasNonTerminals()) { - DoMergeWithPassives(edge); - DoPredict(edge, g); - } - } else { - DoComplete(edge); - } - } - } - if (goal_node) { - forest->PruneUnreachable(goal_node->id_); - RemoveEpsilons(forest, kEPS); - } - FreeAll(); - return goal_node; - } - - void FreeAll() { - for (int i = 0; i < free_list_.size(); ++i) - delete free_list_[i]; - free_list_.clear(); - for (int i = 0; i < traversal_free_list_.size(); ++i) - delete traversal_free_list_[i]; - traversal_free_list_.clear(); - all_traversals.clear(); - exp_agenda.clear(); - agenda.clear(); - tps2node.clear(); - edge2node.clear(); - all_edges.clear(); - passive_edges.clear(); - active_edges.clear(); - } - - ~CFG_WFSTComposerImpl() { - FreeAll(); - } - - // returns the total number of edges created during composition - int EdgesCreated() const { - return free_list_.size(); - } - - private: - void DoScan(const Edge* edge) { - // here, we assume that the FST will potentially have many more outgoing - // edges than the grammar, which will be just a couple. If you want to - // efficiently handle the case where both are relatively large, this code - // will need to change how the intersection is done. The best general - // solution would probably be the Baeza-Yates double binary search. - - const EGrammarNode* dot = edge->dot; - const WFSTNode* r = edge->r; - const map& terms = dot->GetTerminals(); - for (map::const_iterator git = terms.begin(); - git != terms.end(); ++git) { - - if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) { - std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl; - abort(); - } - std::vector > extensions = r->ExtendInput(atoi(TD::Convert(git->first).c_str())); - for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) { - const WFSTNode* next_r = extensions[nsi].first; - const EGrammarNode* next_dot = &git->second; - const bool grammar_continues = next_dot->GrammarContinues(); - const bool rule_completes = next_dot->RuleCompletes(); - if (extensions[nsi].second) - cerr << "!!! 
" << extensions[nsi].second->AsString() << endl; - // cerr << " rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl; - assert(grammar_continues || rule_completes); - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - if (rule_completes) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features)); - if (grammar_continues) - IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second)); - } - } - } - - void DoPredict(const Edge* edge, const EGrammar& g) { - const EGrammarNode* dot = edge->dot; - const map& non_terms = dot->GetNonTerminals(); - for (map::const_iterator git = non_terms.begin(); - git != non_terms.end(); ++git) { - const WordID nt_to_predict = git->first; - //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; - EGrammar::const_iterator egi = g.find(nt_to_predict); - if (egi == g.end()) { - cerr << "[ERROR] Can't find any grammar rules with a LHS of type " - << TD::Convert(-1*nt_to_predict) << '!' << endl; - continue; - } - assert(edge->IsActive()); - const EGrammarNode* new_dot = &egi->second; - Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); - IncorporateNewEdge(new_edge); - } - } - - void DoComplete(const Edge* passive) { -#ifdef DEBUG_CHART_PARSER - cerr << " complete: " << *passive << endl; -#endif - const WordID completed_nt = passive->cat; - const WFSTNode* q = passive->q; - const WFSTNode* next_r = passive->r; - const Edge query(q); - const pair::iterator, - unordered_multiset::iterator > p = - active_edges.equal_range(&query); - for (unordered_multiset::iterator it = p.first; - it != p.second; ++it) { - const Edge* active = *it; -#ifdef DEBUG_CHART_PARSER - cerr << " pos: " << *active << endl; -#endif - const EGrammarNode* next_dot = active->dot->Extend(completed_nt); - if (!next_dot) continue; - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - // add up to 2 rules - if (next_dot->RuleCompletes()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); - if (next_dot->GrammarContinues()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); - } - } - - void DoMergeWithPassives(const Edge* active) { - // edge is active, has non-terminals, we need to find the passives that can extend it - assert(active->IsActive()); - assert(active->dot->HasNonTerminals()); -#ifdef DEBUG_CHART_PARSER - cerr << " merge active with passives: ACT=" << *active << endl; -#endif - const Edge query(active->r, 1); - const pair::iterator, - unordered_multiset::iterator > p = - passive_edges.equal_range(&query); - for (unordered_multiset::iterator it = p.first; - it != p.second; ++it) { - const Edge* passive = *it; - const EGrammarNode* next_dot = active->dot->Extend(passive->cat); - if (!next_dot) continue; - const WFSTNode* next_r = passive->r; - const SparseVector& input_features = next_dot->GetCFGProductionFeatures(); - if (next_dot->RuleCompletes()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); - if (next_dot->GrammarContinues()) - IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); - } - } - - // take ownership of edge memory, add to various indexes, etc - // returns true if this edge is new - bool IncorporateNewEdge(Edge* edge) { - free_list_.push_back(edge); - if (edge->passive_parent && 
edge->active_parent) { - Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); - traversal_free_list_.push_back(t); - if (all_traversals.find(t) != all_traversals.end()) { - return false; - } else { - all_traversals.insert(t); - } - } - exp_agenda.AddEdge(edge); - return true; - } - - bool FinishEdge(const Edge* edge, Hypergraph* hg) { - bool is_new = false; - if (all_edges.find(edge) == all_edges.end()) { -#ifdef DEBUG_CHART_PARSER - cerr << *edge << " is NEW\n"; -#endif - all_edges.insert(edge); - is_new = true; - if (edge->IsPassive()) passive_edges.insert(edge); - if (edge->IsActive()) active_edges.insert(edge); - agenda.AddEdge(edge); - } else { -#ifdef DEBUG_CHART_PARSER - cerr << *edge << " is NOT NEW.\n"; -#endif - } - AddEdgeToTranslationForest(edge, hg); - return is_new; - } - - // build the translation forest - void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { - assert(hg->nodes_.size() < kMAX_NODES); - Hypergraph::Node* tps = NULL; - // first add any target language rules - if (edge->tps) { - Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()]; - if (!node) { - // cerr << "Creating phrases for " << edge->tps << endl; - const TRulePtr& rule = edge->tps; - node = hg->AddNode(kPHRASE); - Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector()); - hg_edge->feature_values_ += rule->GetFeatureValues(); - hg->ConnectEdgeToHeadNode(hg_edge, node); - } - tps = node; - } - Hypergraph::Node*& head_node = edge2node[edge]; - if (!head_node) - head_node = hg->AddNode(kPHRASE); - if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) { - assert(goal_node == NULL || goal_node == head_node); - goal_node = head_node; - } - Hypergraph::TailNodeVector tail; - SparseVector extra; - if (edge->IsCreatedByPredict()) { - // extra.set_value(FD::Convert("predict"), 1); - } else if (edge->IsCreatedByScan()) { - tail.push_back(edge2node[edge->active_parent]->id_); - if (tps) { - tail.push_back(tps->id_); - } - //extra.set_value(FD::Convert("scan"), 1); - } else if (edge->IsCreatedByComplete()) { - tail.push_back(edge2node[edge->active_parent]->id_); - tail.push_back(edge2node[edge->passive_parent]->id_); - //extra.set_value(FD::Convert("complete"), 1); - } else { - assert(!"unexpected edge type!"); - } - //cerr << head_node->id_ << "<--" << *edge << endl; - -#ifdef DEBUG_CHART_PARSER - for (int i = 0; i < tail.size(); ++i) - if (tail[i] == head_node->id_) { - cerr << "ERROR: " << *edge << "\n i=" << i << endl; - if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } - if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } - assert(!"self-loop found!"); - } -#endif - Hypergraph::Edge* hg_edge = NULL; - if (tail.size() == 0) { - hg_edge = hg->AddEdge(kEPSRule, tail); - } else if (tail.size() == 1) { - hg_edge = hg->AddEdge(kX1, tail); - } else if (tail.size() == 2) { - hg_edge = hg->AddEdge(kX1X2, tail); - } - if (edge->features) - hg_edge->feature_values_ += *edge->features; - hg_edge->feature_values_ += extra; - hg->ConnectEdgeToHeadNode(hg_edge, head_node); - } - - Hypergraph::Node* goal_node; - EdgeQueue exp_agenda; - EdgeQueue agenda; - unordered_map tps2node; - unordered_map edge2node; - unordered_set all_traversals; - unordered_set all_edges; - unordered_multiset passive_edges; - unordered_multiset active_edges; - vector free_list_; - vector traversal_free_list_; - const WordID start_cat_; - const WFSTNode* const q_0_; - const WFSTNode* const q_final_; -}; - -#ifdef 
DEBUG_CHART_PARSER -static string TrimRule(const string& r) { - size_t start = r.find(" |||") + 5; - size_t end = r.rfind(" |||"); - return r.substr(start, end - start); -} -#endif - -void AddGrammarRule(const string& r, EGrammar* g) { - const size_t pos = r.find(" ||| "); - if (pos == string::npos || r[0] != '[') { - cerr << "Bad rule: " << r << endl; - return; - } - const size_t rpos = r.rfind(" ||| "); - string feats; - string rs = r; - if (rpos != pos) { - feats = r.substr(rpos + 5); - rs = r.substr(0, rpos); - } - string rhs = rs.substr(pos + 5); - string trule = rs + " ||| " + rhs + " ||| " + feats; - TRule tr(trule); - cerr << "X: " << tr.e_[0] << endl; -#ifdef DEBUG_CHART_PARSER - string hint_last_rule; -#endif - EGrammarNode* cur = &(*g)[tr.GetLHS()]; - cur->is_root = true; - for (int i = 0; i < tr.FLength(); ++i) { - WordID sym = tr.f()[i]; -#ifdef DEBUG_CHART_PARSER - hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); - cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); -#endif - if (sym < 0) - cur = &cur->ntptr[sym]; - else - cur = &cur->tptr[sym]; - } -#ifdef DEBUG_CHART_PARSER - cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); -#endif - cur->is_some_rule_complete = true; - cur->input_features = tr.GetFeatureValues(); -} - -CFG_WFSTComposer::~CFG_WFSTComposer() { - delete pimpl_; -} - -CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) { - InitializeConstants(); - pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final()); -} - -bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { - // first, convert the src forest into an EGrammar - EGrammar g; - const int nedges = src_forest.edges_.size(); - const int nnodes = src_forest.nodes_.size(); - vector cats(nnodes); - bool assign_cats = false; - for (int i = 0; i < nnodes; ++i) - if (assign_cats) { - cats[i] = TD::Convert("CAT_" + boost::lexical_cast(i)) * -1; - } else { - cats[i] = src_forest.nodes_[i].cat_; - } - // construct the grammar - for (int i = 0; i < nedges; ++i) { - const Hypergraph::Edge& edge = src_forest.edges_[i]; - const vector& src = edge.rule_->f(); - EGrammarNode* cur = &g[cats[edge.head_node_]]; - cur->is_root = true; - int ntc = 0; - for (int j = 0; j < src.size(); ++j) { - WordID sym = src[j]; - if (sym <= 0) { - sym = cats[edge.tail_nodes_[ntc]]; - ++ntc; - cur = &cur->ntptr[sym]; - } else { - cur = &cur->tptr[sym]; - } - } - cur->is_some_rule_complete = true; - cur->input_features = edge.feature_values_; - } - EGrammarNode& goal_rule = g[kUNIQUE_START]; - assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || - (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); - - return pimpl_->Compose(g, trg_forest); -} - -bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) { - EGrammar g; - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - AddGrammarRule(line, &g); - } - - return pimpl_->Compose(g, trg_forest); -} diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h deleted file mode 100644 index cf47f459..00000000 --- a/gi/pf/cfg_wfst_composer.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _CFG_WFST_COMPOSER_H_ -#define _CFG_WFST_COMPOSER_H_ - -#include -#include -#include - -#include "trule.h" -#include "wordid.h" - -class CFG_WFSTComposerImpl; -class Hypergraph; - -struct WFSTNode { - virtual ~WFSTNode(); - // returns the next states reachable by consuming srcindex (which identifies a word) - // paired with the output string 
generated by taking that transition.
-  virtual std::vector<std::pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const = 0;
-};
-
-struct WFST {
-  virtual ~WFST();
-  virtual const WFSTNode* Final() const = 0;
-  virtual const WFSTNode* Initial() const = 0;
-};
-
-class CFG_WFSTComposer {
- public:
-  ~CFG_WFSTComposer();
-  explicit CFG_WFSTComposer(const WFST& wfst);
-  bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest);
-
-  // reads the grammar from a file. There must be a single top-level
-  // S -> X rule.  Anything else is possible. Format is:
-  // [S] ||| [SS,1]
-  // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3
-  // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8
-  // [NP] ||| [DET,1] [N,2] ||| Feature3=2
-  // ...
-  bool Compose(std::istream* grammar_file, Hypergraph* trg_forest);
-
- private:
-  CFG_WFSTComposerImpl* pimpl_;
-};
-
-#endif
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
deleted file mode 100644
index 81ddb206..00000000
--- a/gi/pf/conditional_pseg.h
+++ /dev/null
@@ -1,275 +0,0 @@
-#ifndef _CONDITIONAL_PSEG_H_
-#define _CONDITIONAL_PSEG_H_
-
-#include <iostream>
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-
-#include "m.h"
-#include "prob.h"
-#include "ccrp_nt.h"
-#include "mfcr.h"
-#include "trule.h"
-#include "base_distributions.h"
-#include "tdict.h"
-
-template <typename ConditionalBaseMeasure>
-struct MConditionalTranslationModel {
-  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
-    rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {}
-
-  void Summary() const {
-    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl;
-      for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        std::cerr << "   " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl;
-    }
-  }
-
-  double log_likelihood(const double& dd, const double& aa) const {
-    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
-    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
-    double llh = Md::log_beta_density(dd, 1, 1) +
-                 Md::log_gamma_density(dd + aa, 1, 1);
-    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
-    for (it = r.begin(); it != r.end(); ++it)
-      llh += it->second.log_crp_prob(dd, aa);
-    return llh;
-  }
-
-  struct DiscountResampler {
-    DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {}
-    const MConditionalTranslationModel& m_;
-    double operator()(const double& proposed_discount) const {
-      return m_.log_likelihood(proposed_discount, m_.strength);
-    }
-  };
-
-  struct AlphaResampler {
-    AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {}
-    const MConditionalTranslationModel& m_;
-    double operator()(const double& proposed_strength) const {
-      return m_.log_likelihood(m_.d, proposed_strength);
-    }
-  };
-
-  void ResampleHyperparameters(MT19937* rng) {
-    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it;
-#if 1
-    for (it = r.begin(); it != r.end(); ++it) {
-      it->second.resample_hyperparameters(rng);
-    }
-#else
-    const unsigned nloop = 5;
-    const unsigned niterations = 10;
-    DiscountResampler dr(*this);
-    AlphaResampler ar(*this);
-    for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      double min_discount = std::numeric_limits<double>::min();
-      if (strength < 0.0) min_discount -= strength;
-      d = slice_sampler1d(dr, d, *rng, min_discount,
-                          1.0, 0.0, niterations, 100*niterations);
-    }
-    strength = slice_sampler1d(ar, strength, *rng, -d,
-                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-    std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl;
-    for (it = r.begin(); it != r.end(); ++it) {
-      it->second.set_discount(d);
-      it->second.set_strength(strength);
-    }
-#endif
-  }
-
-  int DecrementRule(const TRule& rule, MT19937* rng) {
-    RuleModelHash::iterator it = r.find(rule.f_);
-    assert(it != r.end());
-    const TableCount delta = it->second.decrement(rule, rng);
-    if (delta.count) {
-      if (it->second.num_customers() == 0) r.erase(it);
-    }
-    return delta.count;
-  }
-
-  int IncrementRule(const TRule& rule, MT19937* rng) {
-    RuleModelHash::iterator it = r.find(rule.f_);
-    if (it == r.end()) {
-      //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first;
-      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first;
-    }
-    p0s[0] = rp0(rule);
-    TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng);
-    return delta.count;
-  }
-
-  prob_t RuleProbability(const TRule& rule) const {
-    prob_t p;
-    RuleModelHash::const_iterator it = r.find(rule.f_);
-    if (it == r.end()) {
-      p = rp0(rule);
-    } else {
-      p0s[0] = rp0(rule);
-      p = it->second.prob(rule, p0s.begin(), lambdas.begin());
-    }
-    return p;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(log_likelihood(d, strength));
-    return p;
-  }
-
-  const ConditionalBaseMeasure& rp0;
-  typedef std::tr1::unordered_map<std::vector<WordID>,
-                                  MFCR<1, TRule>,
-                                  boost::hash<std::vector<WordID> > > RuleModelHash;
-  RuleModelHash r;
-  double d, strength;
-  std::vector<prob_t> lambdas;
-  mutable std::vector<prob_t> p0s;
-};
-
-template <typename ConditionalBaseMeasure>
-struct ConditionalTranslationModel {
-  explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
-    rp0(rcp0) {}
-
-  void Summary() const {
-    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
-      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        std::cerr << "    " << i2->second << '\t' << i2->first << std::endl;
-    }
-  }
-
-  void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
-  }
-
-  int DecrementRule(const TRule& rule) {
-    RuleModelHash::iterator it = r.find(rule.f_);
-    assert(it != r.end());
-    int count = it->second.decrement(rule);
-    if (count) {
-      if (it->second.num_customers() == 0) r.erase(it);
-    }
-    return count;
-  }
-
-  int IncrementRule(const TRule& rule) {
-    RuleModelHash::iterator it = r.find(rule.f_);
-    if (it == r.end()) {
-      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first;
-    }
-    int count = it->second.increment(rule);
-    return count;
-  }
-
-  void IncrementRules(const std::vector<TRulePtr>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      IncrementRule(*rules[i]);
-  }
-
-  void DecrementRules(const std::vector<TRulePtr>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      DecrementRule(*rules[i]);
-  }
-
-  prob_t RuleProbability(const TRule& rule) const {
-    prob_t p;
-    RuleModelHash::const_iterator it = r.find(rule.f_);
-    if (it == r.end()) {
-      p.logeq(log(rp0(rule)));
-    } else {
-      p.logeq(it->second.logprob(rule, log(rp0(rule))));
-    }
-    return p;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = prob_t::One();
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      prob_t q; q.logeq(it->second.log_crp_prob());
-      p *= q;
-      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        p *= rp0(i2->first);
-    }
-    return p;
-  }
-
-  const ConditionalBaseMeasure& rp0;
-  typedef std::tr1::unordered_map<std::vector<WordID>,
-                                  CCRP_NoTable<TRule>,
-                                  boost::hash<std::vector<WordID> > > RuleModelHash;
-  RuleModelHash r;
-};
-
-template <typename ConditionalBaseMeasure>
-struct ConditionalParallelSegementationModel {
-  explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) :
-    tmodel(rcp0), base(prob_t::One()), aligns(1,1) {}
-
-  ConditionalTranslationModel<ConditionalBaseMeasure> tmodel;
-
-  void DecrementRule(const TRule& rule) {
-    tmodel.DecrementRule(rule);
-  }
-
-  void IncrementRule(const TRule& rule) {
-    tmodel.IncrementRule(rule);
-  }
-
-  void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
-    tmodel.IncrementRules(rules);
-    for (int i = 0; i < rules.size(); ++i) {
-      IncrementAlign(rules[i]->f_.size());
-    }
-  }
-
-  void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) {
-    tmodel.DecrementRules(rules);
-    for (int i = 0; i < rules.size(); ++i) {
-      DecrementAlign(rules[i]->f_.size());
-    }
-  }
-
-  prob_t RuleProbability(const TRule& rule) const {
-    return tmodel.RuleProbability(rule);
-  }
-
-  void IncrementAlign(unsigned span) {
-    if (aligns.increment(span)) {
-      // TODO
-    }
-  }
-
-  void DecrementAlign(unsigned span) {
-    if (aligns.decrement(span)) {
-      // TODO
-    }
-  }
-
-  prob_t AlignProbability(unsigned span) const {
-    prob_t p;
-    p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0)));
-    return p;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(aligns.log_crp_prob());
-    p *= base;
-    p *= tmodel.Likelihood();
-    return p;
-  }
-
-  prob_t base;
-  CCRP_NoTable<unsigned> aligns;
-};
-
-#endif
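[Editor's note: the CCRP_NoTable restaurants used throughout the header above expose logprob(dish, log p0); the predictive rule they implement is the collapsed Chinese Restaurant Process. A minimal sketch over integer-keyed events, assuming a fixed concentration alpha — cdec's template version additionally supports hyperparameter resampling:]

    #include <map>

    // p(x | seating) = (count(x) + alpha * p0(x)) / (n + alpha)
    struct TinyCRP {
      explicit TinyCRP(double alpha) : alpha_(alpha), n_(0) {}
      double prob(int x, double p0) const {
        std::map<int, int>::const_iterator it = counts_.find(x);
        const int c = (it == counts_.end()) ? 0 : it->second;
        return (c + alpha_ * p0) / (n_ + alpha_);
      }
      void increment(int x) { ++counts_[x]; ++n_; }  // seat a customer
      void decrement(int x) { --counts_[x]; --n_; }  // remove a customer
      double alpha_;             // concentration parameter
      int n_;                    // total customers
      std::map<int, int> counts_;
    };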
diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc
deleted file mode 100644
index 419731ac..00000000
--- a/gi/pf/condnaive.cc
+++ /dev/null
@@ -1,298 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static unsigned kMAX_SRC_PHRASE;
-static unsigned kMAX_TRG_PHRASE;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
-        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
-        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
-        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-boost::shared_ptr<MT19937> prng;
-
-struct ModelAndData {
-  explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<WordID> >& ce, const vector<vector<WordID> >& cf, const set<WordID>& ve, const set<WordID>& vf) :
-     model(m),
-     rng(&*prng),
-     corpuse(ce),
-     corpusf(cf),
-     vocabe(ve),
-     vocabf(vf),
-     mh_samples(),
-     mh_rejects(),
-     kX(-TD::Convert("X")),
-     derivations(corpuse.size()) {}
-
-  void ResampleHyperparameters() {
-  }
-
-  void InstantiateRule(const pair<short,short>& from,
-                       const pair<short,short>& to,
-                       const vector<WordID>& sentf,
-                       const vector<WordID>& sente,
-                       TRule* rule) const {
-    rule->f_.clear();
-    rule->e_.clear();
-    rule->lhs_ = kX;
-    for (short i = from.first; i < to.first; ++i)
-      rule->f_.push_back(sentf[i]);
-    for (short i = from.second; i < to.second; ++i)
-      rule->e_.push_back(sente[i]);
-  }
-
-  void DecrementDerivation(const vector<pair<short,short> >& d, const vector<WordID>& sentf, const vector<WordID>& sente) {
-    if (d.size() < 2) return;
-    TRule x;
-    for (int i = 1; i < d.size(); ++i) {
-      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      model.DecrementRule(x);
-      model.DecrementAlign(x.f_.size());
-    }
-  }
-
-  void PrintDerivation(const vector<pair<short,short> >& d, const vector<WordID>& sentf, const vector<WordID>& sente) {
-    if (d.size() < 2) return;
-    TRule x;
-    for (int i = 1; i < d.size(); ++i) {
-      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      cerr << i << '/' << (d.size() - 1) << ": " << x << endl;
-    }
-  }
-
-  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<WordID>& sentf, const vector<WordID>& sente) {
-    if (d.size() < 2) return;
-    TRule x;
-    for (int i = 1; i < d.size(); ++i) {
-      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      model.IncrementRule(x);
-      model.IncrementAlign(x.f_.size());
-    }
-  }
-
-  prob_t Likelihood() const {
-    return model.Likelihood();
-  }
-
-  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<WordID>& sentf, const vector<WordID>& sente) const {
-    prob_t p = prob_t::One();
-    TRule x;
-    for (int i = 1; i < d.size(); ++i) {
-      InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      p *= model.RuleProbability(x);
-      p *= model.AlignProbability(x.f_.size());
-    }
-    return p;
-  }
-
-  void Sample();
-
-  ConditionalParallelSegementationModel<PhraseConditionalBase>& model;
-  MT19937* rng;
-  const vector<vector<WordID> >& corpuse, corpusf;
-  const set<WordID>& vocabe, vocabf;
-  unsigned mh_samples, mh_rejects;
-  const int kX;
-  vector<vector<pair<short, short> > > derivations;
-};
-
-void ModelAndData::Sample() {
-  unsigned MAXK = kMAX_SRC_PHRASE;
-  unsigned MAXL = kMAX_TRG_PHRASE;
-  TRule x;
-  x.lhs_ = -TD::Convert("X");
-
-  for (int samples = 0; samples < 1000; ++samples) {
-    if (samples % 1 == 0 && samples > 0) {
-      //ResampleHyperparameters();
-      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n";
-      for (int i = 0; i < 10; ++i) {
-        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl;
-        PrintDerivation(derivations[i], corpusf[i], corpuse[i]);
-      }
-      static TRule xx("[X] ||| w n ||| s h ||| X=0");
-      const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second;
-      for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) {
-        cerr << "\t" << it->second << "\t" << it->first << endl;
-      }
-    }
-    cerr << '.' << flush;
-    for (int s = 0; s < corpuse.size(); ++s) {
-      const vector<WordID>& sentf = corpusf[s];
-      const vector<WordID>& sente = corpuse[s];
-//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl;
-//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl;
-
-      vector<pair<short, short> >& deriv = derivations[s];
-      const prob_t p_cur = Likelihood();
-      DecrementDerivation(deriv, sentf, sente);
-
-      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
-      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
-      a[0][0] = prob_t::One();
-      for (int i = 0; i < sentf.size(); ++i) {
-        for (int j = 0; j < sente.size(); ++j) {
-          const prob_t src_a = a[i][j];
-          x.f_.clear();
-          for (int k = 1; k <= MAXK; ++k) {
-            if (i + k > sentf.size()) break;
-            x.f_.push_back(sentf[i + k - 1]);
-            x.e_.clear();
-            const prob_t p_span = model.AlignProbability(k);  // prob of consuming this much source
-            for (int l = 1; l <= MAXL; ++l) {
-              if (j + l > sente.size()) break;
-              x.e_.push_back(sente[j + l - 1]);
-              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span;
-              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
-            }
-          }
-        }
-      }
-//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
-      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
-
-      vector<pair<short,short> > newderiv;
-      int cur_i = sentf.size();
-      int cur_j = sente.size();
-      while(cur_i > 0 && cur_j > 0) {
-        newderiv.push_back(pair<short,short>(cur_i, cur_j));
-//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
-        SampleSet<prob_t> ss;
-        vector<pair<short,short> > nexts;
-        for (int k = 1; k <= MAXK; ++k) {
-          const int hyp_i = cur_i - k;
-          if (hyp_i < 0) break;
-          for (int l = 1; l <= MAXL; ++l) {
-            const int hyp_j = cur_j - l;
-            if (hyp_j < 0) break;
-            const prob_t& inside = a[hyp_i][hyp_j];
-            if (inside == prob_t::Zero()) continue;
-            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
-            if (transp == prob_t::Zero()) continue;
-            const prob_t p = inside * transp;
-            ss.add(p);
-            nexts.push_back(pair<short,short>(hyp_i, hyp_j));
-//            cerr << "    (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
-          }
-        }
-//        cerr << "  sample set has " << nexts.size() << " elements.\n";
-        const int selected = rng->SelectSample(ss);
-        cur_i = nexts[selected].first;
-        cur_j = nexts[selected].second;
-      }
-      newderiv.push_back(pair<short,short>(0,0));
-      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
-      IncrementDerivation(newderiv, sentf, sente);
-//      cerr << "SANITY: " << q_new << endl;
-      if (deriv.empty()) { deriv = newderiv; continue; }
-      ++mh_samples;
-
-      if (deriv != newderiv) {
-        const prob_t p_new = Likelihood();
-        if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
-          ++mh_rejects;
-          DecrementDerivation(newderiv, sentf, sente);
-          IncrementDerivation(deriv, sentf, sente);
-        } else {
-          deriv = newderiv;
-        }
-      }
-    }
-  }
-}
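// [Editor's sketch, not part of the original file: the backward pass above
// draws each split point in proportion to inside[hyp_i][hyp_j] * trans[...].
// Stripped of prob_t and SampleSet, the categorical draw it performs is just
// the following, with rand() standing in for MT19937::SelectSample.]
#include <cstdlib>
#include <vector>
int select_sample(const std::vector<double>& w) {  // w: unnormalized weights
  double z = 0;
  for (size_t i = 0; i < w.size(); ++i) z += w[i];
  double u = z * (rand() / (RAND_MAX + 1.0));
  for (size_t i = 0; i < w.size(); ++i)
    if ((u -= w[i]) <= 0.0) return static_cast<int>(i);
  return static_cast<int>(w.size()) - 1;  // guard against rounding error
}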
r1("[X] ||| x ||| l e ||| X=0"); - TRule r2("[X] ||| A ||| a d ||| X=0"); - TRule r3("[X] ||| n ||| e r ||| X=0"); - TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); - - PhraseConditionalUninformativeBase u0(vocabe.size()); - - cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; - cerr << (u0(r4)) << endl; - - return 0; -} - diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc deleted file mode 100644 index cb6e4ed7..00000000 --- a/gi/pf/corpus.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "corpus.h" - -#include -#include -#include - -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -namespace corpus { - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - ReadFile rf(filename); - istream* in = rf.stream(); - assert(*in); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(getline(*in, line)) { - ++lc; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { - isf = false; - } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - if (cur == kDIV) { - cerr << "ERROR in " << lc << ": " << line << endl << endl; - abort(); - } - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } -} - -} - diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h deleted file mode 100644 index e7febdb7..00000000 --- a/gi/pf/corpus.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _CORPUS_H_ -#define _CORPUS_H_ - -#include -#include -#include -#include "wordid.h" - -namespace corpus { - -void ReadParallelCorpus(const std::string& filename, - std::vector >* f, - std::vector >* e, - std::set* vocab_f, - std::set* vocab_e); - -} - -#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc deleted file mode 100644 index 75ccad72..00000000 --- a/gi/pf/dpnaive.cc +++ /dev/null @@ -1,301 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(4),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(4),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in base distribution)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - 
po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -boost::shared_ptr prng; - -template -struct ModelAndData { - explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : - model(m), - rng(&*prng), - p0(b), - baseprob(prob_t::One()), - corpuse(ce), - corpusf(cf), - vocabe(ve), - vocabf(vf), - mh_samples(), - mh_rejects(), - kX(-TD::Convert("X")), - derivations(corpuse.size()) {} - - void ResampleHyperparameters() { - } - - void InstantiateRule(const pair& from, - const pair& to, - const vector& sentf, - const vector& sente, - TRule* rule) const { - rule->f_.clear(); - rule->e_.clear(); - rule->lhs_ = kX; - for (short i = from.first; i < to.first; ++i) - rule->f_.push_back(sentf[i]); - for (short i = from.second; i < to.second; ++i) - rule->e_.push_back(sente[i]); - } - - void DecrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { - if (d.size() < 2) return; - TRule x; - for (int i = 1; i < d.size(); ++i) { - InstantiateRule(d[i], d[i-1], sentf, sente, &x); - model.DecrementRule(x); - model.DecrementContinue(); - } - model.DecrementStop(); - } - - void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { - if (d.size() < 2) return; - TRule x; - for (int i = 1; i < d.size(); ++i) { - InstantiateRule(d[i], d[i-1], sentf, sente, &x); - cerr << i << '/' << (d.size() - 1) << ": " << x << endl; - } - } - - void IncrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { - if (d.size() < 2) return; - TRule x; - for (int i = 1; i < d.size(); ++i) { - InstantiateRule(d[i], d[i-1], sentf, sente, &x); - model.IncrementRule(x); - model.IncrementContinue(); - } - model.IncrementStop(); - } - - prob_t Likelihood() const { - return model.Likelihood(); - } - - prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { - prob_t p = model.StopProbability(); - if (d.size() < 2) return p; - TRule x; - const prob_t p_cont = model.ContinueProbability(); - for (int i = 1; i < d.size(); ++i) { - InstantiateRule(d[i], d[i-1], sentf, sente, &x); - p *= p_cont; - p *= model.RuleProbability(x); - } - return p; - } - - void Sample(); - - MonotonicParallelSegementationModel& model; - MT19937* rng; - const Base& p0; - prob_t baseprob; // cached value of generating the table table labels from p0 - // this can't be used if we go to a hierarchical prior! 
- const vector >& corpuse, corpusf; - const set& vocabe, vocabf; - unsigned mh_samples, mh_rejects; - const int kX; - vector > > derivations; -}; - -template -void ModelAndData::Sample() { - unsigned MAXK = kMAX_SRC_PHRASE; - unsigned MAXL = kMAX_TRG_PHRASE; - TRule x; - x.lhs_ = -TD::Convert("X"); - for (int samples = 0; samples < 1000; ++samples) { - if (samples % 1 == 0 && samples > 0) { - //ResampleHyperparameters(); - cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; - for (int i = 0; i < 10; ++i) { - cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; - PrintDerivation(derivations[i], corpusf[i], corpuse[i]); - } - } - cerr << '.' << flush; - for (int s = 0; s < corpuse.size(); ++s) { - const vector& sentf = corpusf[s]; - const vector& sente = corpuse[s]; -// cerr << " CUSTOMERS: " << rules.num_customers() << endl; -// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; - - vector >& deriv = derivations[s]; - const prob_t p_cur = Likelihood(); - DecrementDerivation(deriv, sentf, sente); - - boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); - boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); - a[0][0] = prob_t::One(); - const prob_t q_stop = model.StopProbability(); - const prob_t q_cont = model.ContinueProbability(); - for (int i = 0; i < sentf.size(); ++i) { - for (int j = 0; j < sente.size(); ++j) { - const prob_t src_a = a[i][j]; - x.f_.clear(); - for (int k = 1; k <= MAXK; ++k) { - if (i + k > sentf.size()) break; - x.f_.push_back(sentf[i + k - 1]); - x.e_.clear(); - for (int l = 1; l <= MAXL; ++l) { - if (j + l > sente.size()) break; - x.e_.push_back(sente[j + l - 1]); - const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); - const prob_t& cp = stop_now ? 
q_stop : q_cont;
-              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp;
-              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
-            }
-          }
-        }
-      }
-//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl;
-      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente);
-
-      vector<pair<short,short> > newderiv;
-      int cur_i = sentf.size();
-      int cur_j = sente.size();
-      while(cur_i > 0 && cur_j > 0) {
-        newderiv.push_back(pair<short,short>(cur_i, cur_j));
-//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n";
-        SampleSet<prob_t> ss;
-        vector<pair<short,short> > nexts;
-        for (int k = 1; k <= MAXK; ++k) {
-          const int hyp_i = cur_i - k;
-          if (hyp_i < 0) break;
-          for (int l = 1; l <= MAXL; ++l) {
-            const int hyp_j = cur_j - l;
-            if (hyp_j < 0) break;
-            const prob_t& inside = a[hyp_i][hyp_j];
-            if (inside == prob_t::Zero()) continue;
-            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1];
-            if (transp == prob_t::Zero()) continue;
-            const prob_t p = inside * transp;
-            ss.add(p);
-            nexts.push_back(pair<short,short>(hyp_i, hyp_j));
-//            cerr << "    (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl;
-          }
-        }
-//        cerr << "  sample set has " << nexts.size() << " elements.\n";
-        const int selected = rng->SelectSample(ss);
-        cur_i = nexts[selected].first;
-        cur_j = nexts[selected].second;
-      }
-      newderiv.push_back(pair<short,short>(0,0));
-      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente);
-      IncrementDerivation(newderiv, sentf, sente);
-//      cerr << "SANITY: " << q_new << endl;
-      if (deriv.empty()) { deriv = newderiv; continue; }
-      ++mh_samples;
-
-      if (deriv != newderiv) {
-        const prob_t p_new = Likelihood();
-        if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) {
-          ++mh_rejects;
-          DecrementDerivation(newderiv, sentf, sente);
-          IncrementDerivation(deriv, sentf, sente);
-        } else {
-          deriv = newderiv;
-        }
-      }
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
-  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
-  if (!conf.count("model1")) {
-    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
-    return 1;
-  }
-  if (!conf.count("inverse_model1")) {
-    cerr << argv[0] << "Please use --inverse_model1 to specify inverse model 1 parameters\n";
-    return 1;
-  }
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
-
-  vector<vector<WordID> > corpuse, corpusf;
-  set<WordID> vocabe, vocabf;
-  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
-  assert(corpusf.size() == corpuse.size());
-
-  Model1 m1(conf["model1"].as<string>());
-  Model1 invm1(conf["inverse_model1"].as<string>());
-//  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
-
-  ModelAndData<PhraseJointBase_BiDir> posterior(m, alp0, corpuse, corpusf, vocabe, vocabf);
-  posterior.Sample();
-
-  return 0;
-}
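[Editor's note: the Perl filter that follows keeps 1-to-1 aligned word pairs whose pointwise mutual information clears MIN_PMI. For reference, the full statistic is sketched below in C++; the script drops the constant log-total term ($ltot is computed but never used), so its scores are uniformly shifted by -log N relative to textbook PMI:]

    #include <cmath>

    // PMI(e,f) = log( p(e,f) / (p(e) * p(f)) )
    //          = log c_ef + log N - log c_e - log c_f
    double pmi(double c_ef, double c_e, double c_f, double n) {
      return std::log(c_ef) + std::log(n) - std::log(c_e) - std::log(c_f);
    }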
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
deleted file mode 100755
index d00c2168..00000000
--- a/gi/pf/guess-translits.pl
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-use utf8;
-
-my $MIN_PMI = -3;
-
-my %fs;
-my %es;
-my %ef;
-
-die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;
-
-binmode(STDIN,":utf8");
-binmode(STDOUT,":utf8");
-binmode(STDERR,":utf8");
-
-my $tot = 0;
-print STDERR "Reading alignments from STDIN ...\n";
-while(<STDIN>) {
-  chomp;
-  my ($fsent, $esent, $alsent) = split / \|\|\| /;
-  die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;
-
-  my @fws = split /\s+/, $fsent;
-  my @ews = split /\s+/, $esent;
-  my @as = split /\s+/, $alsent;
-  my %a2b;
-  my %b2a;
-  for my $ap (@as) {
-    my ($a,$b) = split /-/, $ap;
-    die "BAD INPUT: $_\n" unless defined $a && defined $b;
-    $a2b{$a}->{$b} = 1;
-    $b2a{$b}->{$a} = 1;
-  }
-  for my $a (keys %a2b) {
-    my $bref = $a2b{$a};
-    next unless scalar keys %$bref < 2;
-    my $b = (keys %$bref)[0];
-    next unless scalar keys %{$b2a{$b}} < 2;
-    my $f = $fws[$a];
-    next unless defined $f;
-    next unless length($f) > 3;
-    my $e = $ews[$b];
-    next unless defined $e;
-    next unless length($e) > 3;
-
-    $ef{$f}->{$e}++;
-    $es{$e}++;
-    $fs{$f}++;
-    $tot++;
-  }
-}
-my $ltot = log($tot);
-my $num = 0;
-print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
-for my $f (keys %fs) {
-  my $logf = log($fs{$f});
-  my $esref = $ef{$f};
-  for my $e (keys %$esref) {
-    my $loge = log($es{$e});
-    my $ef = $esref->{$e};
-    my $logef = log($ef);
-    my $pmi = $logef - ($loge + $logf);
-    next if $pmi < $MIN_PMI;
-    my @flets = split //, $f;
-    my @elets = split //, $e;
-    print "@flets ||| @elets\n";
-    $num++;
-  }
-}
-print STDERR "Extracted $num pairs.\n";
-print STDERR "Recommend running:\n  ../../training/model1 -v -d -t -99999 output.txt\n";
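[Editor's note: the restaurants created in the next file are Pitman-Yor processes (CCRP with discount d and strength s). Their predictive probability generalizes the plain CRP sketched earlier; this is the standard two-parameter form, with the table counts supplied by the caller — a sketch, not cdec's actual API:]

    // p(x) = (c(x) - d*t(x) + (s + d*T) * p0(x)) / (n + s)
    // c(x): customers eating dish x; t(x): tables serving x;
    // n: total customers; T: total tables.
    double pyp_prob(int cx, int tx, int n, int T,
                    double d, double s, double p0) {
      return (cx - d * tx + (s + d * T) * p0) / (n + s);
    }

At d = 0 this reduces to the CRP rule with concentration s.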
diff --git a/gi/pf/hpyp_tm.cc b/gi/pf/hpyp_tm.cc
deleted file mode 100644
index f362d3f8..00000000
--- a/gi/pf/hpyp_tm.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-#include "hpyp_tm.h"
-
-#include <iostream>
-#include <vector>
-#include <tr1/unordered_map>
-
-#include "tdict.h"
-#include "ccrp.h"
-#include "pyp_word_model.h"
-#include "tied_resampler.h"
-
-using namespace std;
-using namespace std::tr1;
-
-struct FreqBinner {
-  FreqBinner(const std::string& fname) { fd_.Load(fname); }
-  unsigned NumberOfBins() const { return fd_.Max() + 1; }
-  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
-  FreqDict fd_;
-};
-
-template <typename Base, class Binner = FreqBinner>
-struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
-      base(*b),
-      binner(bnr),
-      btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
-
-  void Summary() const {
-    cerr << "Number of conditioning contexts: " << r.size() << endl;
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
-      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
-        cerr << "   " << i2->second << endl;
-    }
-  }
-
-  void ResampleHyperparameters(MT19937* rng) {
-    btr.ResampleHyperparameters(rng);
-  }
-
-  prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
-    RuleModelHash::const_iterator it = r.find(src);
-    if (it == r.end()) {
-      return base(trglets);
-    } else {
-      return it->second.prob(trglets, base(trglets));
-    }
-  }
-
-  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
-    RuleModelHash::iterator it = r.find(src);
-    if (it == r.end()) {
-      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
-      static const WordID kNULL = TD::Convert("NULL");
-      unsigned bin = (src == kNULL ? 0 : 1);
-      if (binner && bin) { bin = binner->Bin(src) + 1; }
-      btr.Add(bin, &it->second);
-    }
-    if (it->second.increment(trglets, base(trglets), rng))
-      base.Increment(trglets, rng);
-  }
-
-  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
-    RuleModelHash::iterator it = r.find(src);
-    assert(it != r.end());
-    if (it->second.decrement(trglets, rng)) {
-      base.Decrement(trglets, rng);
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = prob_t::One();
-    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
-      prob_t q; q.logeq(it->second.log_crp_prob());
-      p *= q;
-    }
-    return p;
-  }
-
-  unsigned UniqueConditioningContexts() const {
-    return r.size();
-  }
-
-  // TODO tie PYP hyperparameters based on source word frequency bins
-  Base& base;
-  const Binner* binner;
-  BinTiedResampler<CCRP<vector<WordID> > > btr;
-  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
-  RuleModelHash r;
-};
-
-HPYPLexicalTranslation::HPYPLexicalTranslation(const vector<vector<WordID> >& lets,
-                                               const unsigned vocab_size,
-                                               const unsigned num_letters) :
-    letters(lets),
-    base(vocab_size, num_letters, 5),
-    up0(new PYPWordModel<PoissonUniformWordModel>(&base)),
-    tmodel(new ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel> >(up0, new FreqBinner("10k.freq"))),
-    kX(-TD::Convert("X")) {}
-
-void HPYPLexicalTranslation::Summary() const {
-  tmodel->Summary();
-  up0->Summary();
-}
-
-prob_t HPYPLexicalTranslation::Likelihood() const {
-  prob_t p = up0->Likelihood();
-  p *= tmodel->Likelihood();
-  return p;
-}
-
-void HPYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
-  tmodel->ResampleHyperparameters(rng);
-  up0->ResampleHyperparameters(rng);
-}
-
-unsigned HPYPLexicalTranslation::UniqueConditioningContexts() const {
-  return tmodel->UniqueConditioningContexts();
-}
-
-prob_t HPYPLexicalTranslation::Prob(WordID src, WordID trg) const {
-  return tmodel->Prob(src, letters[trg]);
-}
-
-void HPYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
-  tmodel->Increment(src, letters[trg], rng);
-}
-
-void HPYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
-  tmodel->Decrement(src, letters[trg], rng);
-}
-
diff --git a/gi/pf/hpyp_tm.h b/gi/pf/hpyp_tm.h
deleted file mode 100644
index af3215ba..00000000
--- a/gi/pf/hpyp_tm.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef HPYP_LEX_TRANS
-#define HPYP_LEX_TRANS
-
-#include <vector>
-#include "wordid.h"
-#include "prob.h"
-#include "sampler.h"
-#include "freqdict.h"
-#include "poisson_uniform_word_model.h"
-
-struct FreqBinner;
-template <typename T> struct PYPWordModel;
-template <typename T, class B> struct ConditionalPYPWordModel;
-
-struct HPYPLexicalTranslation {
-  explicit HPYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
-                                  const unsigned vocab_size,
-                                  const unsigned num_letters);
-
-  prob_t Likelihood() const;
-
-  void ResampleHyperparameters(MT19937* rng);
-  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src)
-  void Summary() const;
-  void Increment(WordID src, WordID trg, MT19937* rng);
-  void Decrement(WordID src, WordID trg, MT19937* rng);
-  unsigned UniqueConditioningContexts() const;
-
- private:
-  const std::vector<std::vector<WordID> >& letters;  // spelling dictionary
-  PoissonUniformWordModel base;  // "generator" of English types
-  PYPWordModel<PoissonUniformWordModel>* up0;  // model English lexicon
-  ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel>, FreqBinner>* tmodel;  // translation distributions
-                                               // (model English word | French word)
-  const WordID kX;
-};
-
-#endif
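[Editor's note: a hedged sketch of driving the translation model declared above; the vocabulary sizes and the letter dictionary are placeholders, but the calls match the header's signatures:]

    #include <vector>
    #include "hpyp_tm.h"
    #include "sampler.h"

    void demo(const std::vector<std::vector<WordID> >& letters,
              MT19937* rng, WordID src, WordID trg) {
      HPYPLexicalTranslation tm(letters, /*vocab_size=*/50000, /*num_letters=*/128);
      tm.Increment(src, trg, rng);         // seat (src -> spelling of trg)
      const prob_t p = tm.Prob(src, trg);  // p(trg | src) under the current seating
      tm.Decrement(src, trg, rng);         // remove the customer again
    }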
-#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -ostream& operator<<(ostream& os, const vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? "" : " ") << TD::Convert(p[i]); - return os << ']'; -} - -struct UnigramModel { - explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : - use_uniform_(fname.size() == 0), - p0null_(p0null), - uniform_((1.0 - p0null) / vocab_size), - probs_(TD::NumWords() + 1) { - if (fname.size() > 0) LoadUnigrams(fname); - probs_[0] = p0null_; - } - -// -// \data\ -// ngram 1=9295 -// -// \1-grams: -// -3.191193 " - - void LoadUnigrams(const string& fname) { - cerr << "Loading unigram probabilities from " << fname << " ..." << endl; - ReadFile rf(fname); - string line; - istream& in = *rf.stream(); - assert(in); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\data\\"); - getline(in, line); - size_t pos = line.find("ngram 1="); - assert(pos == 0); - assert(line.size() > 8); - const size_t num_unigrams = atoi(&line[8]); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\1-grams:"); - for (size_t i = 0; i < num_unigrams; ++i) { - getline(in, line); - assert(line.size() > 0); - pos = line.find('\t'); - assert(pos > 0); - assert(pos + 1 < line.size()); - const WordID w = TD::Convert(line.substr(pos + 1)); - line[pos] = 0; - float p = atof(&line[0]); - const prob_t pnon_null(1.0 - p0null_.as_float()); - if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); - } - } - - const prob_t& operator()(const WordID& w) const { - if (!w) return p0null_; - if (use_uniform_) return uniform_; - return probs_[w]; - } - - const bool use_uniform_; - const prob_t p0null_; - const prob_t uniform_; - vector probs_; -}; - -struct Model1 { - explicit Model1(const string& fname) : - kNULL(TD::Convert("")), - kZERO() { - LoadModel1(fname); - } - - void LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; - } - - // returns prob 0 if src or trg is not found! 
- const prob_t& operator()(WordID src, WordID trg) const { - if (src == 0) src = kNULL; - if (src < ttable.size()) { - const map& cpd = ttable[src]; - const map::const_iterator it = cpd.find(trg); - if (it != cpd.end()) - return it->second; - } - return kZERO; - } - - const WordID kNULL; - const prob_t kZERO; - vector > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("particles,p",po::value()->default_value(25),"Number of particles") - ("input,i",po::value(),"Read parallel data from") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("src_unigram,u",po::value()->default_value(""),"Source unigram distribution; empty for uniform") - ("trg_unigram,U",po::value()->default_value(""),"Target unigram distribution; empty for uniform") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const unsigned particles = conf["particles"].as(); - const unsigned samples = conf["samples"].as(); - TD::Convert(""); - TD::Convert(""); - TD::Convert(""); - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - boost::shared_ptr prng; - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << 
"F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - assert(corpusf.size() == corpuse.size()); - UnigramModel src_unigram(conf["src_unigram"].as(), vocabf.size()); - UnigramModel trg_unigram(conf["trg_unigram"].as(), vocabe.size()); - const prob_t kHALF(0.5); - - const string kEMPTY = "NULL"; - const int kLHS = -TD::Convert("X"); - Model1 m1(conf["model1"].as()); - Model1 invm1(conf["inverse_model1"].as()); - for (int si = 0; si < conf["samples"].as(); ++si) { - cerr << '.' << flush; - for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector& trg = corpuse[ci]; - const vector& src = corpusf[ci]; - for (int i = 0; i <= trg.size(); ++i) { - const WordID e_i = i > 0 ? trg[i-1] : 0; - for (int j = 0; j <= src.size(); ++j) { - const WordID f_j = j > 0 ? src[j-1] : 0; - if (e_i == 0 && f_j == 0) continue; - prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); - cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; - if (e_i && f_j) - cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; - } - } - } - } -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc deleted file mode 100644 index 1d5126e4..00000000 --- a/gi/pf/learn_cfg.cc +++ /dev/null @@ -1,428 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr prng; -vector nt_vocab; -vector nt_id_to_index; -static unsigned kMAX_RULE_SIZE = 0; -static unsigned kMAX_ARITY = 0; -static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs -static bool kHIERARCHICAL_PRIOR = false; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("max_rule_size,m", po::value()->default_value(0), "Maximum rule size (0 for unlimited)") - ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") - ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") - ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") - ("hierarchical_prior,h", "Use hierarchical prior") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << 
dcmdline_options << endl; - exit(1); - } -} - -unsigned ReadCorpus(const string& filename, - vector >* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - unsigned toks = 0; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - vector& le = e->back(); - TD::ConvertSentence(line, &le); - for (unsigned i = 0; i < le.size(); ++i) - vocab_e->insert(le[i]); - toks += le.size(); - } - if (in != &cin) delete in; - return toks; -} - -struct Grid { - // a b c d e - // 0 - 0 - - - vector grid; -}; - -struct BaseRuleModel { - explicit BaseRuleModel(unsigned term_size, - unsigned nonterm_size = 1) : - unif_term(1.0 / term_size), - unif_nonterm(1.0 / nonterm_size) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); - const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); - const prob_t nonterm_prob(1.0 - term_prob.as_float()); - for (unsigned i = 0; i < r.f_.size(); ++i) { - if (r.f_[i] <= 0) { // nonterminal - if (kALLOW_MIXED) p *= nonterm_prob; - p *= unif_nonterm; - } else { // terminal - if (kALLOW_MIXED) p *= term_prob; - p *= unif_term; - } - } - return p; - } - const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : - base(vocab_size, num_nts), - q0(1,1,1,1), - nts(num_nts, CCRP(1,1,1,1)) {} - - prob_t Prob(const TRule& r) const { - return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); - } - - inline prob_t p0(const TRule& r) const { - if (kHIERARCHICAL_PRIOR) - return q0.prob(r, base(r)); - else - return base(r); - } - - int Increment(const TRule& r, MT19937* rng) { - const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); - if (kHIERARCHICAL_PRIOR && delta) - q0.increment(r, base(r), rng); - return delta; - // return x.increment(r); - } - - int Decrement(const TRule& r, MT19937* rng) { - const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); - if (kHIERARCHICAL_PRIOR && delta) - q0.decrement(r, rng); - return delta; - //return x.decrement(r); - } - - prob_t Likelihood() const { - prob_t p = prob_t::One(); - for (unsigned i = 0; i < nts.size(); ++i) { - prob_t q; q.logeq(nts[i].log_crp_prob()); - p *= q; - for (CCRP::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { - prob_t tp = p0(it->first); - tp.poweq(it->second.num_tables()); - p *= tp; - } - } - if (kHIERARCHICAL_PRIOR) { - prob_t q; q.logeq(q0.log_crp_prob()); - p *= q; - for (CCRP::const_iterator it = q0.begin(); it != q0.end(); ++it) { - prob_t tp = base(it->first); - tp.poweq(it->second.num_tables()); - p *= tp; - } - } - //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= base(it->first); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - for (unsigned i = 0; i < nts.size(); ++i) - nts[i].resample_hyperparameters(rng); - if (kHIERARCHICAL_PRIOR) { - q0.resample_hyperparameters(rng); - cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; - } - cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; - } - - const BaseRuleModel base; - CCRP q0; - vector > nts; - //CCRP_OneTable x; -}; - -vector tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { - NPGrammarIter() : arity() { tofreelist.push_back(this); } - NPGrammarIter(const TRulePtr& inr, const int 
a, int symbol) : arity(a) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - r.reset(new TRule); - } - TRule& rr = *r; - rr.lhs_ = nt_vocab[0]; - rr.f_.push_back(symbol); - rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); - tofreelist.push_back(this); - } - inline static unsigned NextArity(int cur_a, int symbol) { - return cur_a + (symbol <= 0 ? 1 : 0); - } - virtual int GetNumRules() const { - if (r) return nt_vocab.size(); else return 0; - } - virtual TRulePtr GetIthRule(int i) const { - if (i == 0) return r; - TRulePtr nr(new TRule(*r)); - nr->lhs_ = nt_vocab[i]; - return nr; - } - virtual int Arity() const { - return arity; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - const int next_arity = NextArity(arity, symbol); - if (kMAX_ARITY && next_arity > kMAX_ARITY) - return NULL; - if (!kALLOW_MIXED && r) { - bool t1 = r->f_.front() <= 0; - bool t2 = symbol <= 0; - if (t1 != t2) return NULL; - } - if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) - return new NPGrammarIter(r, next_arity, symbol); - else - return NULL; - } - const unsigned char arity; - TRulePtr r; -}; - -struct NPGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new NPGrammarIter; - } -}; - -prob_t TotalProb(const Hypergraph& hg) { - return Inside(hg); -} - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { - vector node_probs; - Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 2); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - for (unsigned i = 0; i < sampled_deriv->size(); ++i) { - cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; - } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { - po::variables_map conf; - - InitCommandLine(argc, argv, &conf); - nt_vocab.resize(conf["nonterminals"].as()); - assert(nt_vocab.size() > 0); - assert(nt_vocab.size() < 26); - { - string nt = "X"; - for (unsigned i = 0; i < 
nt_vocab.size(); ++i) { - if (nt_vocab.size() > 1) nt[0] = ('A' + i); - int pid = TD::Convert(nt); - nt_vocab[i] = -pid; - if (pid >= nt_id_to_index.size()) { - nt_id_to_index.resize(pid + 1, -1); - } - nt_id_to_index[pid] = i; - } - } - vector grammars; - grammars.push_back(GrammarPtr(new NPGrammar)); - - const unsigned samples = conf["samples"].as(); - kMAX_RULE_SIZE = conf["max_rule_size"].as(); - if (kMAX_RULE_SIZE == 1) { - cerr << "Invalid maximum rule size: must be 0 or >1\n"; - return 1; - } - kMAX_ARITY = conf["max_arity"].as(); - if (kMAX_ARITY == 1) { - cerr << "Invalid maximum arity: must be 0 or >1\n"; - return 1; - } - kALLOW_MIXED = !conf.count("no_mixed_rules"); - - kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - vector > corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - HieroLMModel lm(vocabe.size(), nt_vocab.size()); - - plm = &lm; - ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); - - Hypergraph hg; - const int kGoal = -TD::Convert("Goal"); - const int kLP = FD::Convert("LogProb"); - SparseVector v; v.set_value(kLP, 1.0); - vector > derivs(corpuse.size()); - vector cl(corpuse.size()); - for (int ci = 0; ci < corpuse.size(); ++ci) { - vector& src = corpuse[ci]; - Lattice& lat = cl[ci]; - lat.resize(src.size()); - for (unsigned i = 0; i < src.size(); ++i) - lat[i].push_back(LatticeArc(src[i], 0.0, 1)); - } - for (int SS=0; SS < samples; ++SS) { - const bool is_last = ((samples - 1) == SS); - prob_t dlh = prob_t::One(); - for (int ci = 0; ci < corpuse.size(); ++ci) { - const vector& src = corpuse[ci]; - const Lattice& lat = cl[ci]; - cerr << TD::GetString(src) << endl; - hg.clear(); - parser.Parse(lat, &hg); // exhaustive parse - vector& d = derivs[ci]; - if (!is_last) DecrementDerivation(hg, d, &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.lhs_ == kGoal) - hg.edges_[i].edge_prob_ = prob_t::One(); - else - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - if (!is_last) { - d.clear(); - SampleDerivation(hg, &rng, &d); - IncrementDerivation(hg, derivs[ci], &lm, &rng); - } else { - prob_t p = TotalProb(hg); - dlh *= p; - cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; - } - if (tofreelist.size() > 200000) { - cerr << "Freeing ... 
"; - for (unsigned i = 0; i < tofreelist.size(); ++i) - delete tofreelist[i]; - tofreelist.clear(); - cerr << "Freed.\n"; - } - } - double llh = log(lm.Likelihood()); - cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; - if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); - if (is_last) { - double z = log(dlh); - cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; - } - } - for (unsigned i = 0; i < nt_vocab.size(); ++i) - cerr << lm.nts[i] << endl; - return 0; -} - diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl deleted file mode 100755 index fdcd3555..00000000 --- a/gi/pf/make-freq-bins.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $BASE = 6; -my $CUTOFF = 3; - -my %d; -my $num = 0; -while(<>){ - chomp; - my @words = split /\s+/; - for my $w (@words) {$d{$w}++; $num++;} -} - -my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; - -for (my $i=0; $i -#include - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -static bool verbose = false; - -struct Model { - - Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP(0.8, 0.5)) {} - - double p0(int x) const { - assert(x > 0); - assert(x < 5); - return 1.0/4.0; - } - - double llh() const { - double lh = bp + base.log_crp_prob(); - for (int ctx = 1; ctx < 5; ++ctx) - lh += ccrps[ctx].log_crp_prob(); - return lh; - } - - double prob(int ctx, int x) const { - assert(ctx > 0 && ctx < 5); - return ccrps[ctx].prob(x, base.prob(x, p0(x))); - } - - void increment(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) { - if (base.increment(x, p0(x), &rng)) { - bp += log(1.0 / 4.0); - } - } - } - - // this is just a biased estimate - double est_base_prob(int x) { - return (x + 1) * x / 40.0; - } - - void increment_is(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - SampleSet ss; - const int PARTICLES = 25; - vector > s1s(PARTICLES, CCRP(0.5,0.5)); - vector > sbs(PARTICLES, CCRP(0.5,0.5)); - vector sp0s(PARTICLES); - - CCRP s1 = ccrps[ctx]; - CCRP sb = base; - double sp0 = bp; - for (int pp = 0; pp < PARTICLES; ++pp) { - if (pp > 0) { - ccrps[ctx] = s1; - base = sb; - bp = sp0; - } - - double q = 1; - double gamma = 1; - double est_p = est_base_prob(x); - //base.prob(x, p0(x)) + rng.next() * 0.1; - if (ccrps[ctx].increment(x, est_p, &rng, &q)) { - gamma = q * base.prob(x, p0(x)); - q *= est_p; - if (verbose) cerr << "(DP-base draw) "; - double qq = -1; - if (base.increment(x, p0(x), &rng, &qq)) { - if (verbose) cerr << "(G0 draw) "; - bp += log(p0(x)); - qq *= p0(x); - } - } else { gamma = q; } - double w = gamma / q; - if (verbose) - cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl; - ss.add(w); - s1s[pp] = ccrps[ctx]; - sbs[pp] = base; - sp0s[pp] = bp; - } - int ps = rng.SelectSample(ss); - ccrps[ctx] = s1s[ps]; - base = sbs[ps]; - bp = sp0s[ps]; - if (verbose) { - cerr << "SELECTED: " << ps << endl; - static int cc = 0; cc++; if (cc ==10) exit(1); - } - } - - void decrement(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - if (ccrps[ctx].decrement(x, &rng)) { - if (base.decrement(x, &rng)) { - bp -= log(p0(x)); - } - } - } - - double bp; - CCRP base; - vector > ccrps; - -}; - -int main(int argc, char** argv) { - if (argc > 1) { verbose = true; } - vector counts(15, 0); - vector tcounts(15, 0); - int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0}; - 
double tlh = 0; - double tt = 0; - for (int n = 0; n < 1000; ++n) { - if (n % 10 == 0) cerr << '.'; - if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n"; - Model m; - for (int *x = points; *x; x += 2) - m.increment(x[0], x[1]); - - for (int j = 0; j < 24; ++j) { - for (int *x = points; *x; x += 2) { - if (rng.next() < 0.8) { - m.decrement(x[0], x[1]); - m.increment_is(x[0], x[1]); - } - } - } - counts[m.base.num_customers()]++; - tcounts[m.base.num_tables()]++; - tlh += m.llh(); - tt += 1.0; - } - cerr << "mean LLH = " << (tlh / tt) << endl; - for (int i = 0; i < 15; ++i) - cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl; -} - diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h deleted file mode 100644 index 10d171fe..00000000 --- a/gi/pf/monotonic_pseg.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _MONOTONIC_PSEG_H_ -#define _MONOTONIC_PSEG_H_ - -#include - -#include "prob.h" -#include "ccrp_nt.h" -#include "trule.h" -#include "base_distributions.h" - -template -struct MonotonicParallelSegementationModel { - explicit MonotonicParallelSegementationModel(BaseMeasure& rcp0) : - rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {} - - void DecrementRule(const TRule& rule) { - if (rules.decrement(rule)) - base /= rp0(rule); - } - - void IncrementRule(const TRule& rule) { - if (rules.increment(rule)) - base *= rp0(rule); - } - - void IncrementRulesAndStops(const std::vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - if (rules.size()) IncrementContinue(rules.size() - 1); - IncrementStop(); - } - - void DecrementRulesAndStops(const std::vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - if (rules.size()) { - DecrementContinue(rules.size() - 1); - DecrementStop(); - } - } - - prob_t RuleProbability(const TRule& rule) const { - prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); - return p; - } - - prob_t Likelihood() const { - prob_t p = base; - prob_t q; q.logeq(rules.log_crp_prob()); - p *= q; - q.logeq(stop.log_crp_prob()); - p *= q; - return p; - } - - void IncrementStop() { - stop.increment(true); - } - - void IncrementContinue(int n = 1) { - for (int i = 0; i < n; ++i) - stop.increment(false); - } - - void DecrementStop() { - stop.decrement(true); - } - - void DecrementContinue(int n = 1) { - for (int i = 0; i < n; ++i) - stop.decrement(false); - } - - prob_t StopProbability() const { - return prob_t(stop.prob(true, 0.5)); - } - - prob_t ContinueProbability() const { - return prob_t(stop.prob(false, 0.5)); - } - - const BaseMeasure& rp0; - prob_t base; - CCRP_NoTable rules; - CCRP_NoTable stop; -}; - -#endif - diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc deleted file mode 100644 index 1299f06f..00000000 --- a/gi/pf/ngram_base.cc +++ /dev/null @@ -1,69 +0,0 @@ -#include "ngram_base.h" - -#include "lm/model.hh" -#include "tdict.h" - -using namespace std; - -namespace { -struct GICSVMapper : public lm::EnumerateVocab { - GICSVMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } - void Add(lm::WordIndex index, const StringPiece &str) { - const WordID cdec_id = TD::Convert(str.as_string()); - if (cdec_id >= out_->size()) - out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); - (*out_)[cdec_id] = index; - } - vector* out_; - const lm::WordIndex kLM_UNKNOWN_TOKEN; -}; -} - -struct FixedNgramBaseImpl { - FixedNgramBaseImpl(const string& param) { - GICSVMapper vm(&cdec2klm_map_); - lm::ngram::Config conf; - conf.enumerate_vocab = &vm; - cerr << "Reading character 
LM from " << param << endl; - model = new lm::ngram::ProbingModel(param.c_str(), conf); - order = model->Order(); - kEOS = MapWord(TD::Convert("")); - assert(kEOS > 0); - } - - lm::WordIndex MapWord(const WordID w) const { - if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; - return 0; - } - - ~FixedNgramBaseImpl() { delete model; } - - prob_t StringProbability(const vector& s) const { - lm::ngram::State state = model->BeginSentenceState(); - double prob = 0; - for (unsigned i = 0; i < s.size(); ++i) { - const lm::ngram::State scopy(state); - prob += model->Score(scopy, MapWord(s[i]), state); - } - const lm::ngram::State scopy(state); - prob += model->Score(scopy, kEOS, state); - prob_t p; p.logeq(prob * log(10)); - return p; - } - - lm::ngram::ProbingModel* model; - unsigned order; - vector cdec2klm_map_; - lm::WordIndex kEOS; -}; - -FixedNgramBase::~FixedNgramBase() { delete impl; } - -FixedNgramBase::FixedNgramBase(const string& lmfname) { - impl = new FixedNgramBaseImpl(lmfname); -} - -prob_t FixedNgramBase::StringProbability(const vector& s) const { - return impl->StringProbability(s); -} - diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h deleted file mode 100644 index 4ea999f3..00000000 --- a/gi/pf/ngram_base.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _NGRAM_BASE_H_ -#define _NGRAM_BASE_H_ - -#include -#include -#include "trule.h" -#include "wordid.h" -#include "prob.h" - -struct FixedNgramBaseImpl; -struct FixedNgramBase { - FixedNgramBase(const std::string& lmfname); - ~FixedNgramBase(); - prob_t StringProbability(const std::vector& s) const; - - prob_t operator()(const TRule& rule) const { - return StringProbability(rule.e_); - } - - private: - FixedNgramBaseImpl* impl; - -}; - -#endif diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc deleted file mode 100644 index fc0af9cb..00000000 --- a/gi/pf/nuisance_test.cc +++ /dev/null @@ -1,161 +0,0 @@ -#include "ccrp.h" - -#include -#include - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -ostream& operator<<(ostream&os, const vector& v) { - os << '[' << v[0]; - if (v.size() == 2) os << ' ' << v[1]; - return os << ']'; -} - -struct Base { - Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} - inline double p0(const vector& x) const { - double p = 0.75; - if (x.size() == 2) p = 0.25; - p *= 1.0 / 3.0; - if (x.size() == 2) p *= 1.0 / 3.0; - return p; - } - double est_deriv_prob(int a, int b, int seg) const { - assert(a > 0 && a < 4); // a \in {1,2,3} - assert(b > 0 && b < 4); // b \in {1,2,3} - assert(seg == 0 || seg == 1); // seg \in {0,1} - if (seg == 0) { - v[0] = a; - v[1] = b; - return crp.prob(v, p0(v)); - } else { - v1[0] = a; - v2[0] = b; - return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); - } - } - double est_marginal_prob(int a, int b) const { - return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); - } - int increment(int a, int b, double* pw = NULL) { - double p1 = est_deriv_prob(a, b, 0); - double p2 = est_deriv_prob(a, b, 1); - //p1 = 0.5; p2 = 0.5; - int seg = rng.SelectSample(p1,p2); - double tmp = 0; - if (!pw) pw = &tmp; - double& w = *pw; - if (seg == 0) { - v[0] = a; - v[1] = b; - w = crp.prob(v, p0(v)) / p1; - if (crp.increment(v, p0(v), &rng)) { - llh += log(p0(v)); - } - } else { - v1[0] = a; - w = crp.prob(v1, p0(v1)) / p2; - if (crp.increment(v1, p0(v1), &rng)) { - llh += log(p0(v1)); - } - v2[0] = b; - w *= crp.prob(v2, p0(v2)); - if (crp.increment(v2, p0(v2), &rng)) { - llh += log(p0(v2)); - } - } - return seg; - } - void increment(int a, int b, 
int seg) { - if (seg == 0) { - v[0] = a; - v[1] = b; - if (crp.increment(v, p0(v), &rng)) { - llh += log(p0(v)); - } - } else { - v1[0] = a; - if (crp.increment(v1, p0(v1), &rng)) { - llh += log(p0(v1)); - } - v2[0] = b; - if (crp.increment(v2, p0(v2), &rng)) { - llh += log(p0(v2)); - } - } - } - void decrement(int a, int b, int seg) { - if (seg == 0) { - v[0] = a; - v[1] = b; - if (crp.decrement(v, &rng)) { - llh -= log(p0(v)); - } - } else { - v1[0] = a; - if (crp.decrement(v1, &rng)) { - llh -= log(p0(v1)); - } - v2[0] = b; - if (crp.decrement(v2, &rng)) { - llh -= log(p0(v2)); - } - } - } - double log_likelihood() const { - return llh + crp.log_crp_prob(); - } - double llh; - mutable vector v, v1, v2; - CCRP > crp; -}; - -int main(int argc, char** argv) { - double tl = 0; - const int ITERS = 1000; - const int PARTICLES = 20; - const int DATAPOINTS = 50; - WordID x = TD::Convert("souvenons"); - WordID y = TD::Convert("remember"); - vector src; TD::ConvertSentence("s o u v e n o n s", &src); - vector trg; TD::ConvertSentence("r e m e m b e r", &trg); -// Transliterations xx; -// xx.Initialize(x, src, y, trg); -// return 1; - - for (int j = 0; j < ITERS; ++j) { - Base b; - vector segs(DATAPOINTS); - SampleSet ss; - vector sss; - for (int i = 0; i < DATAPOINTS; i++) { - ss.clear(); - sss.clear(); - int x = ((i / 10) % 3) + 1; - int y = (i % 3) + 1; - //double ep = b.est_marginal_prob(x,y); - //cerr << "est p(" << x << "," << y << ") = " << ep << endl; - for (int n = 0; n < PARTICLES; ++n) { - double w; - int seg = b.increment(x,y,&w); - //cerr << seg << " w=" << w << endl; - ss.add(w); - sss.push_back(seg); - b.decrement(x,y,seg); - } - int seg = sss[rng.SelectSample(ss)]; - b.increment(x, y, seg); - //cerr << "Selected: " << seg << endl; - //return 1; - segs[i] = seg; - } - tl += b.log_likelihood(); - } - cerr << "LLH=" << tl / ITERS << endl; -} - diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h deleted file mode 100644 index dfe40cb1..00000000 --- a/gi/pf/os_phrase.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _OS_PHRASE_H_ -#define _OS_PHRASE_H_ - -#include -#include -#include "tdict.h" - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); - return os << ']'; -} - -#endif diff --git a/gi/pf/pf.h b/gi/pf/pf.h deleted file mode 100644 index ede7cda8..00000000 --- a/gi/pf/pf.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef _PF_H_ -#define _PF_H_ - -#include -#include -#include "sampler.h" -#include "prob.h" - -template -struct ParticleRenormalizer { - void operator()(std::vector* pv) const { - if (pv->empty()) return; - prob_t z = prob_t::Zero(); - for (unsigned i = 0; i < pv->size(); ++i) - z += (*pv)[i].weight; - assert(z > prob_t::Zero()); - for (unsigned i = 0; i < pv->size(); ++i) - (*pv)[i].weight /= z; - } -}; - -template -struct MultinomialResampleFilter { - explicit MultinomialResampleFilter(MT19937* rng) : rng_(rng) {} - - void operator()(std::vector* pv) { - if (pv->empty()) return; - std::vector& ps = *pv; - SampleSet ss; - for (int i = 0; i < ps.size(); ++i) - ss.add(ps[i].weight); - std::vector nps; nps.reserve(ps.size()); - const prob_t uniform_weight(1.0 / ps.size()); - for (int i = 0; i < ps.size(); ++i) { - nps.push_back(ps[rng_->SelectSample(ss)]); - nps[i].weight = uniform_weight; - } - nps.swap(ps); - } - - private: - MT19937* rng_; -}; - -template -struct SystematicResampleFilter { - explicit SystematicResampleFilter(MT19937* rng) : rng_(rng), renorm_() {} - - void operator()(std::vector* pv) { - if (pv->empty()) return; - renorm_(pv); - std::vector& ps = *pv; - std::vector nps; nps.reserve(ps.size()); - double lower = 0, upper = 0; - const double skip = 1.0 / ps.size(); - double u_j = rng_->next() * skip; - //std::cerr << "u_0: " << u_j << std::endl; - int j = 0; - for (unsigned i = 0; i < ps.size(); ++i) { - upper += ps[i].weight.as_float(); - //std::cerr << "lower: " << lower << " upper: " << upper << std::endl; - // how many children does ps[i] have? 
- while (u_j < lower) { u_j += skip; ++j; } - while (u_j >= lower && u_j <= upper) { - assert(j < ps.size()); - nps.push_back(ps[i]); - u_j += skip; - //std::cerr << " add u_j=" << u_j << std::endl; - ++j; - } - lower = upper; - } - //std::cerr << ps.size() << " " << nps.size() << "\n"; - assert(ps.size() == nps.size()); - //exit(1); - ps.swap(nps); - } - - private: - MT19937* rng_; - ParticleRenormalizer renorm_; -}; - -#endif diff --git a/gi/pf/pf_test.cc b/gi/pf/pf_test.cc deleted file mode 100644 index 296e7285..00000000 --- a/gi/pf/pf_test.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "ccrp.h" - -#include -#include - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -static bool verbose = false; - -struct Model { - - Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP(0.8, 0.5)) {} - - double p0(int x) const { - assert(x > 0); - assert(x < 5); - return 1.0/4.0; - } - - double llh() const { - double lh = bp + base.log_crp_prob(); - for (int ctx = 1; ctx < 5; ++ctx) - lh += ccrps[ctx].log_crp_prob(); - return lh; - } - - double prob(int ctx, int x) const { - assert(ctx > 0 && ctx < 5); - return ccrps[ctx].prob(x, base.prob(x, p0(x))); - } - - void increment(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) { - if (base.increment(x, p0(x), &rng)) { - bp += log(1.0 / 4.0); - } - } - } - - // this is just a biased estimate - double est_base_prob(int x) { - return (x + 1) * x / 40.0; - } - - void increment_is(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - SampleSet ss; - const int PARTICLES = 25; - vector > s1s(PARTICLES, CCRP(0.5,0.5)); - vector > sbs(PARTICLES, CCRP(0.5,0.5)); - vector sp0s(PARTICLES); - - CCRP s1 = ccrps[ctx]; - CCRP sb = base; - double sp0 = bp; - for (int pp = 0; pp < PARTICLES; ++pp) { - if (pp > 0) { - ccrps[ctx] = s1; - base = sb; - bp = sp0; - } - - double q = 1; - double gamma = 1; - double est_p = est_base_prob(x); - //base.prob(x, p0(x)) + rng.next() * 0.1; - if (ccrps[ctx].increment(x, est_p, &rng, &q)) { - gamma = q * base.prob(x, p0(x)); - q *= est_p; - if (verbose) cerr << "(DP-base draw) "; - double qq = -1; - if (base.increment(x, p0(x), &rng, &qq)) { - if (verbose) cerr << "(G0 draw) "; - bp += log(p0(x)); - qq *= p0(x); - } - } else { gamma = q; } - double w = gamma / q; - if (verbose) - cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl; - ss.add(w); - s1s[pp] = ccrps[ctx]; - sbs[pp] = base; - sp0s[pp] = bp; - } - int ps = rng.SelectSample(ss); - ccrps[ctx] = s1s[ps]; - base = sbs[ps]; - bp = sp0s[ps]; - if (verbose) { - cerr << "SELECTED: " << ps << endl; - static int cc = 0; cc++; if (cc ==10) exit(1); - } - } - - void decrement(int ctx, int x) { - assert(ctx > 0 && ctx < 5); - if (ccrps[ctx].decrement(x, &rng)) { - if (base.decrement(x, &rng)) { - bp -= log(p0(x)); - } - } - } - - double bp; - CCRP base; - vector > ccrps; - -}; - -int main(int argc, char** argv) { - if (argc > 1) { verbose = true; } - vector counts(15, 0); - vector tcounts(15, 0); - int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0}; - double tlh = 0; - double tt = 0; - for (int n = 0; n < 1000; ++n) { - if (n % 10 == 0) cerr << '.'; - if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n"; - Model m; - for (int *x = points; *x; x += 2) - m.increment(x[0], x[1]); - - for (int j = 0; j < 24; ++j) { - for (int *x = points; *x; x += 2) { - if (rng.next() < 0.8) { - m.decrement(x[0], x[1]); - m.increment_is(x[0], x[1]); - } - } - } - 
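- // Resample-move sweep: each observation is (with probability 0.8)
- // removed and re-added via the 25-particle importance sampler in
- // increment_is(); the seating statistics of the base restaurant are
- // tallied below once the sweeps finish.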
counts[m.base.num_customers()]++; - tcounts[m.base.num_tables()]++; - tlh += m.llh(); - tt += 1.0; - } - cerr << "mean LLH = " << (tlh / tt) << endl; - for (int i = 0; i < 15; ++i) - cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl; -} - diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc deleted file mode 100644 index 832f22cf..00000000 --- a/gi/pf/pfbrat.cc +++ /dev/null @@ -1,543 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "cfg_wfst_composer.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -struct ConditionalBase { - explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size), - kNULL(TD::Convert("")) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - LoadModel1(model1fname); - } - - void LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; - } - - // return logp0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - const int flen = rule.f_.size(); - const int elen = rule.e_.size(); - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = rule.e_[i]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 
kNULL : rule.f_[j]; - const map::const_iterator it = ttable[src].find(trg); - if (it != ttable[src].end()) { - tp += kM1MIXTURE * it->second; - } - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - return p; - } - - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; - const WordID kNULL; - vector > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(3),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(3),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -struct UniphraseLM { - UniphraseLM(const vector >& corpus, - const set& vocab, - const po::variables_map& conf) : - phrases_(1,1), - gen_(1,1), - corpus_(corpus), - uniform_word_(1.0 / vocab.size()), - gen_p0_(0.5), - p_end_(0.5), - use_poisson_(conf.count("poisson_length") > 0) {} - - void ResampleHyperparameters(MT19937* rng) { - phrases_.resample_hyperparameters(rng); - gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.alpha(); - } - - CCRP_NoTable > phrases_; - CCRP_NoTable gen_; - vector > z_; // z_[i] is there a phrase boundary after the ith word - const vector >& corpus_; - const double uniform_word_; - const double gen_p0_; - const double p_end_; // in base length distribution, p of the end of 
a phrase - const bool use_poisson_; -}; - -struct Reachability { - boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? - boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid - - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : - edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); - } - - private: - struct SState { - SState() : prev_src_covered(), prev_trg_covered() {} - SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} - int prev_src_covered; - int prev_trg_covered; - }; - - struct NState { - NState() : next_src_covered(), next_trg_covered() {} - NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} - int next_src_covered; - int next_trg_covered; - }; - - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { - typedef boost::multi_array, 2> array_type; - array_type a(boost::extents[srclen + 1][trglen + 1]); - a[0][0].push_back(SState()); - for (int i = 0; i < srclen; ++i) { - for (int j = 0; j < trglen; ++j) { - if (a[i][j].size() == 0) continue; - const SState prev(i,j); - for (int k = 1; k <= src_max_phrase_len; ++k) { - if ((i + k) > srclen) continue; - for (int l = 1; l <= trg_max_phrase_len; ++l) { - if ((j + l) > trglen) continue; - a[i + k][j + l].push_back(prev); - } - } - } - } - a[0][0].clear(); - cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - assert(a[srclen][trglen].size() > 0); - - typedef boost::multi_array rarray_type; - rarray_type r(boost::extents[srclen + 1][trglen + 1]); -// typedef boost::multi_array, 2> narray_type; -// narray_type b(boost::extents[srclen + 1][trglen + 1]); - r[srclen][trglen] = true; - for (int i = srclen; i >= 0; --i) { - for (int j = trglen; j >= 0; --j) { - vector& prevs = a[i][j]; - if (!r[i][j]) { prevs.clear(); } -// const NState nstate(i,j); - for (int k = 0; k < prevs.size(); ++k) { - r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; - int src_delta = i - prevs[k].prev_src_covered; - edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; - short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; - if (src_delta > msd) msd = src_delta; -// b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); - } - } - } - assert(!edges[0][0][1][0]); - assert(!edges[0][0][0][1]); - assert(!edges[0][0][0][0]); - cerr << " MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; - assert(max_src_delta[0][0] > 0); - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} - } -}; - -ostream& operator<<(ostream& os, const FSTState& q); -struct FSTState { - explicit FSTState(int src_size) : - trg_covered_(), - src_covered_(), - src_coverage_(src_size) {} - - FSTState(short trg_covered, short src_covered, const vector& src_coverage, const vector& src_prefix) : - trg_covered_(trg_covered), - src_covered_(src_covered), - src_coverage_(src_coverage), - src_prefix_(src_prefix) { - if (src_coverage_.size() == src_covered) { - assert(src_prefix.size() == 0); - } - } - - // if we 
extend by the word at src_position, what are - // the next states that are reachable and lie on a valid - // path to the final state? - vector Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { - assert(src_position < src_coverage_.size()); - if (src_coverage_[src_position]) { - cerr << "Trying to extend " << *this << " with position " << src_position << endl; - abort(); - } - vector ncvg = src_coverage_; - ncvg[src_position] = true; - - vector res; - const int trg_remaining = trg_len - trg_covered_; - if (trg_remaining <= 0) { - cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; - abort(); - } - const int src_remaining = src_len - src_covered_; - if (src_remaining <= 0) { - cerr << "Source appears to have been covered: " << *this << endl; - abort(); - } - - for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { - if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { - int nc = src_prefix_.size() + 1 + src_covered_; - res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector())); - } - } - - if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { - vector nsp = src_prefix_; - nsp.push_back(src_position); - res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); - } - - if (res.size() == 0) { - cerr << *this << " can't be extended!\n"; - abort(); - } - return res; - } - - short trg_covered_, src_covered_; - vector src_coverage_; - vector src_prefix_; -}; -bool operator<(const FSTState& q, const FSTState& r) { - if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; - if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; - if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; - return q.src_prefix_ < r.src_prefix_; -} - -ostream& operator<<(ostream& os, const FSTState& q) { - os << "[" << q.trg_covered_ << " : "; - for (int i = 0; i < q.src_coverage_.size(); ++i) - os << q.src_coverage_[i]; - os << " : <"; - for (int i = 0; i < q.src_prefix_.size(); ++i) { - if (i != 0) os << ' '; - os << q.src_prefix_[i]; - } - return os << ">]"; -} - -struct MyModel { - MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} - typedef unordered_map, CCRP_NoTable, boost::hash > > SrcToRuleCRPMap; - - void DecrementRule(const TRule& rule) { - SrcToRuleCRPMap::iterator it = rules.find(rule.f_); - assert(it != rules.end()); - it->second.decrement(rule); - if (it->second.num_customers() == 0) rules.erase(it); - } - - void IncrementRule(const TRule& rule) { - SrcToRuleCRPMap::iterator it = rules.find(rule.f_); - if (it == rules.end()) { - CCRP_NoTable crp(1,1); - it = rules.insert(make_pair(rule.f_, crp)).first; - } - it->second.increment(rule); - } - - // conditioned on rule.f_ - prob_t RuleConditionalProbability(const TRule& rule) const { - const prob_t base = rp0(rule); - SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); - if (it == rules.end()) { - return base; - } else { - const double lp = it->second.logprob(rule, log(base)); - prob_t q; q.logeq(lp); - return q; - } - } - - const ConditionalBase& rp0; - SrcToRuleCRPMap rules; -}; - -struct MyFST : public WFST { - MyFST(const vector& ssrc, const vector& strg, MyModel* m) : - src(ssrc), trg(strg), - r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), - model(m) { - FSTState in(src.size()); - cerr << " INIT: " << in << endl; - init = GetNode(in); - for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; 
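- // Reuse 'in' to build the final state: all source positions are marked
- // covered here, and the covered counters are set to the full sentence
- // lengths below, so Final() is the state with both sides exhausted.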
- in.src_covered_ = src.size(); - in.trg_covered_ = trg.size(); - cerr << "FINAL: " << in << endl; - final = GetNode(in); - } - virtual const WFSTNode* Final() const; - virtual const WFSTNode* Initial() const; - - const WFSTNode* GetNode(const FSTState& q); - map > m; - const vector& src; - const vector& trg; - Reachability r; - const WFSTNode* init; - const WFSTNode* final; - MyModel* model; -}; - -struct MyNode : public WFSTNode { - MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} - virtual vector > ExtendInput(unsigned srcindex) const; - const FSTState state; - mutable MyFST* container; -}; - -vector > MyNode::ExtendInput(unsigned srcindex) const { - cerr << "EXTEND " << state << " with " << srcindex << endl; - vector ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); - vector > res(ext.size()); - for (unsigned i = 0; i < ext.size(); ++i) { - res[i].first = container->GetNode(ext[i]); - if (ext[i].src_prefix_.size() == 0) { - const unsigned trg_from = state.trg_covered_; - const unsigned trg_to = ext[i].trg_covered_; - const unsigned prev_prfx_size = state.src_prefix_.size(); - res[i].second.reset(new TRule); - res[i].second->lhs_ = -TD::Convert("X"); - vector& src = res[i].second->f_; - vector& trg = res[i].second->e_; - src.resize(prev_prfx_size + 1); - for (unsigned j = 0; j < prev_prfx_size; ++j) - src[j] = container->src[state.src_prefix_[j]]; - src[prev_prfx_size] = container->src[srcindex]; - for (unsigned j = trg_from; j < trg_to; ++j) - trg.push_back(container->trg[j]); - res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); - } - } - return res; -} - -const WFSTNode* MyFST::GetNode(const FSTState& q) { - boost::shared_ptr& res = m[q]; - if (!res) { - res.reset(new MyNode(q, this)); - } - return &*res; -} - -const WFSTNode* MyFST::Final() const { - return final; -} - -const WFSTNode* MyFST::Initial() const { - return init; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); - - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - boost::shared_ptr prng; - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - ConditionalBase lp0(conf["model1_interpolation_weight"].as(), - vocabe.size(), - conf["model1"].as()); - MyModel m(lp0); - - TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); - m.IncrementRule(x); - TRule y("[X] ||| nY dyN ||| gave ||| 0"); - m.IncrementRule(y); - - - MyFST fst(corpusf[0], corpuse[0], &m); - ifstream in("./kimura.g"); - assert(in); - CFG_WFSTComposer comp(fst); - Hypergraph hg; - bool succeed = comp.Compose(&in, &hg); - hg.PrintGraphviz(); - if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } - -#if 0 - ifstream in2("./amnabooks.g"); - assert(in2); - MyFST 
fst2(corpusf[1], corpuse[1], &m); - CFG_WFSTComposer comp2(fst2); - Hypergraph hg2; - bool succeed2 = comp2.Compose(&in2, &hg2); - if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } -#endif - - SparseVector w; w.set_value(FD::Convert("Proposal"), 1.0); - hg.Reweight(w); - cerr << ViterbiFTree(hg) << endl; - return 0; -} - diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc deleted file mode 100644 index a3e46064..00000000 --- a/gi/pf/pfdist.cc +++ /dev/null @@ -1,598 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "pf.h" -#include "base_distributions.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("particles,p",po::value()->default_value(30),"Number of particles") - ("filter_frequency,f",po::value()->default_value(5),"Number of time steps between filterings") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -#if 0 -struct 
MyConditionalModel { - MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} - - prob_t srcp0(const vector& src) const { - prob_t p(1.0 / 3000.0); - p.poweq(src.size()); - prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); - p *= lenp; - return p; - } - - void DecrementRule(const TRule& rule) { - const RuleCRPMap::iterator it = rules.find(rule.f_); - assert(it != rules.end()); - if (it->second.decrement(rule)) { - base /= (*rp0)(rule); - if (it->second.num_customers() == 0) - rules.erase(it); - } - if (src_phrases.decrement(rule.f_)) - base /= srcp0(rule.f_); - } - - void IncrementRule(const TRule& rule) { - RuleCRPMap::iterator it = rules.find(rule.f_); - if (it == rules.end()) - it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; - if (it->second.increment(rule)) { - base *= (*rp0)(rule); - } - if (src_phrases.increment(rule.f_)) - base *= srcp0(rule.f_); - } - - void IncrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - void IncrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].increment(dist)) - base *= jp0(dist, src_len); - } - - void DecrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].decrement(dist)) - base /= jp0(dist, src_len); - } - - void IncrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - IncrementJump(js[i], src_len); - } - - void DecrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - DecrementJump(js[i], src_len); - } - - // p(jump = dist | src_len , z) - prob_t JumpProbability(int dist, unsigned src_len) { - const prob_t p0 = jp0(dist, src_len); - const double lp = src_jumps[src_len].logprob(dist, log(p0)); - prob_t q; q.logeq(lp); - return q; - } - - // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) - prob_t RuleProbability(const TRule& rule) const { - const prob_t p0 = (*rp0)(rule); - prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); - const RuleCRPMap::const_iterator it = rules.find(rule.f_); - if (it == rules.end()) return srcp * p0; - const double lp = it->second.logprob(rule, log(p0)); - prob_t q; q.logeq(lp); - return q * srcp; - } - - prob_t Likelihood() const { - prob_t p = base; - for (RuleCRPMap::const_iterator it = rules.begin(); - it != rules.end(); ++it) { - prob_t cl; cl.logeq(it->second.log_crp_prob()); - p *= cl; - } - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p *= q; - } - } - return p; - } - - JumpBase jp0; - const PhraseConditionalBase* rp0; - prob_t base; - typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; - RuleCRPMap rules; - CCRP_NoTable > src_phrases; - vector > src_jumps; -}; - -#endif - -struct MyJointModel { - MyJointModel(PhraseJointBase& rcp0) : - rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} - - void DecrementRule(const TRule& rule) { - if (rules.decrement(rule)) - base /= rp0(rule); - } - - void IncrementRule(const TRule& rule) { - if (rules.increment(rule)) - base *= rp0(rule); - } - - void IncrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const 
vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - void IncrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].increment(dist)) - base *= jp0(dist, src_len); - } - - void DecrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].decrement(dist)) - base /= jp0(dist, src_len); - } - - void IncrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - IncrementJump(js[i], src_len); - } - - void DecrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - DecrementJump(js[i], src_len); - } - - // p(jump = dist | src_len , z) - prob_t JumpProbability(int dist, unsigned src_len) { - const prob_t p0 = jp0(dist, src_len); - const double lp = src_jumps[src_len].logprob(dist, log(p0)); - prob_t q; q.logeq(lp); - return q; - } - - // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) - prob_t RuleProbability(const TRule& rule) const { - prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); - return p; - } - - prob_t Likelihood() const { - prob_t p = base; - prob_t q; q.logeq(rules.log_crp_prob()); - p *= q; - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p *= q; - } - } - return p; - } - - JumpBase jp0; - const PhraseJointBase& rp0; - prob_t base; - CCRP_NoTable rules; - vector > src_jumps; -}; - -struct BackwardEstimate { - BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : - model1_(m1), src_(src), trg_(trg) { - } - const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { - assert(src_.size() == src_cov.size()); - assert(trg_cov <= trg_.size()); - prob_t& e = cache_[src_cov][trg_cov]; - if (e.is_0()) { - if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } - vector r(src_.size() + 1); r.clear(); - r.push_back(0); // NULL word - for (int i = 0; i < src_cov.size(); ++i) - if (!src_cov[i]) r.push_back(src_[i]); - const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_cov; j < trg_.size(); ++j) { - prob_t p; - for (unsigned i = 0; i < r.size(); ++i) - p += model1_(r[i], trg_[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; - abort(); - } - p *= uniform_alignment; - e *= p; - } - } - return e; - } - const Model1& model1_; - const vector& src_; - const vector& trg_; - mutable unordered_map, map, boost::hash > > cache_; -}; - -struct BackwardEstimateSym { - BackwardEstimateSym(const Model1& m1, - const Model1& invm1, const vector& src, const vector& trg) : - model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { - } - const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { - assert(src_.size() == src_cov.size()); - assert(trg_cov <= trg_.size()); - prob_t& e = cache_[src_cov][trg_cov]; - if (e.is_0()) { - if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } - vector r(src_.size() + 1); r.clear(); - for (int i = 0; i < src_cov.size(); ++i) - if (!src_cov[i]) r.push_back(src_[i]); - r.push_back(0); // NULL word - const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_cov; j < trg_.size(); ++j) { - prob_t p; - for (unsigned i = 0; i < r.size(); ++i) 
- p += model1_(r[i], trg_[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; - abort(); - } - p *= uniform_alignment; - e *= p; - } - r.pop_back(); - const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); - prob_t inv; - inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); - for (unsigned i = 0; i < r.size(); ++i) { - prob_t p; - for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) - p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); - if (p.is_0()) { - cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; - abort(); - } - p *= inv_uniform; - inv *= p; - } - prob_t x = pow(e * inv, 0.5); - e = x; - //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; - } - return e; - } - const Model1& model1_; - const Model1& invmodel1_; - const vector& src_; - const vector& trg_; - mutable unordered_map, map, boost::hash > > cache_; -}; - -struct Particle { - Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} - prob_t weight; - prob_t gamma_last; - vector src_jumps; - vector rules; - vector src_cv; - int src_cov; - int trg_cov; - int prev_pos; -}; - -ostream& operator<<(ostream& o, const vector& v) { - for (int i = 0; i < v.size(); ++i) - o << (v[i] ? '1' : '0'); - return o; -} -ostream& operator<<(ostream& o, const Particle& p) { - o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; - return o; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); - const unsigned particles = conf["particles"].as(); - const unsigned samples = conf["samples"].as(); - const unsigned rejuv_freq = conf["filter_frequency"].as(); - - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - assert(corpusf.size() == corpuse.size()); - - const int kLHS = -TD::Convert("X"); - Model1 m1(conf["model1"].as()); - Model1 invm1(conf["inverse_model1"].as()); - -#if 0 - PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); - MyConditionalModel m(lp0); -#else - PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); - MyJointModel m(lp0); -#endif - - MultinomialResampleFilter filter(&rng); - cerr << "Initializing reachability limits...\n"; - vector ps(corpusf.size()); - vector reaches; reaches.reserve(corpusf.size()); - for (int ci = 0; ci < corpusf.size(); ++ci) - reaches.push_back(Reachability(corpusf[ci].size(), - corpuse[ci].size(), - kMAX_SRC_PHRASE, - kMAX_TRG_PHRASE)); - cerr << "Sampling...\n"; - vector tmp_p(10000); // work space - SampleSet pfss; - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci 
< corpusf.size(); ++ci) { - vector& src = corpusf[ci]; - vector& trg = corpuse[ci]; - m.DecrementRules(ps[ci].rules); - m.DecrementJumps(ps[ci].src_jumps, src.size()); - - //BackwardEstimate be(m1, src, trg); - BackwardEstimateSym be(m1, invm1, src, trg); - const Reachability& r = reaches[ci]; - vector lps(particles); - - for (int pi = 0; pi < particles; ++pi) { - Particle& p = lps[pi]; - p.src_cv.resize(src.size(), false); - } - - bool all_complete = false; - while(!all_complete) { - SampleSet ss; - - // all particles have now been extended a bit, we will reweight them now - if (lps[0].trg_cov > 0) - filter(&lps); - - // loop over all particles and extend them - bool done_nothing = true; - for (int pi = 0; pi < particles; ++pi) { - Particle& p = lps[pi]; - int tic = 0; - while(p.trg_cov < trg.size() && tic < rejuv_freq) { - ++tic; - done_nothing = false; - ss.clear(); - TRule x; x.lhs_ = kLHS; - prob_t z; - int first_uncovered = src.size(); - int last_uncovered = -1; - for (int i = 0; i < src.size(); ++i) { - const bool is_uncovered = !p.src_cv[i]; - if (i < first_uncovered && is_uncovered) first_uncovered = i; - if (is_uncovered && i > last_uncovered) last_uncovered = i; - } - assert(last_uncovered > -1); - assert(first_uncovered < src.size()); - - for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { - x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); - for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { - if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - - const int last_possible_start = last_uncovered - src_len + 1; - assert(last_possible_start >= 0); - //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; - //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; - for (int i = first_uncovered; i <= last_possible_start; ++i) { - if (p.src_cv[i]) continue; - assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size - Particle& np = tmp_p[ss.size()]; - np = p; - x.f_.clear(); - int gap_add = 0; - bool bad = false; - prob_t jp = prob_t::One(); - int prev_pos = p.prev_pos; - for (int j = 0; j < src_len; ++j) { - if ((j + i + gap_add) == src.size()) { bad = true; break; } - while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } - if ((j + i + gap_add) == src.size()) { bad = true; break; } - np.src_cv[i + j + gap_add] = true; - x.f_.push_back(src[i + j + gap_add]); - jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); - int jump = i + j + gap_add - prev_pos; - assert(jump != 0); - np.src_jumps.push_back(jump); - prev_pos = i + j + gap_add; - } - if (bad) continue; - np.prev_pos = prev_pos; - np.src_cov += x.f_.size(); - np.trg_cov += x.e_.size(); - if (x.f_.size() != src_len) continue; - prob_t rp = m.RuleProbability(x); - np.gamma_last = rp * jp; - const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); - //cerr << "**rule=" << x << endl; - //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; - ss.add(u); - np.rules.push_back(TRulePtr(new TRule(x))); - z += u; - - const bool completed = (p.trg_cov == trg.size()); - if (completed) { - int last_jump = src.size() - p.prev_pos; - assert(last_jump > 0); - p.src_jumps.push_back(last_jump); - p.weight *= m.JumpProbability(last_jump, src.size()); - } - } - } - } - cerr << "number of edges to consider: " << ss.size() << endl; - const int sampled = rng.SelectSample(ss); - prob_t q_n = ss[sampled] / z; - p = tmp_p[sampled]; - 
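// The weight update just below is the standard sequential importance sampling
// correction. The chosen extension was drawn from the proposal with probability
// q_n = u / z, where u anneals gamma_last (the model's probability increment)
// against the backward estimate be(...); multiplying the particle weight by
// gamma_last / q_n keeps each particle weighted by target probability over
// proposal probability. The filter(&lps) call above resamples the particle set
// every rejuv_freq extensions so that these weights do not degenerate.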
//m.IncrementRule(*p.rules.back()); - p.weight *= p.gamma_last / q_n; - cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; - cerr << p << endl; - } - } // loop over particles (pi = 0 .. particles) - if (done_nothing) all_complete = true; - } - pfss.clear(); - for (int i = 0; i < lps.size(); ++i) - pfss.add(lps[i].weight); - const int sampled = rng.SelectSample(pfss); - ps[ci] = lps[sampled]; - m.IncrementRules(lps[sampled].rules); - m.IncrementJumps(lps[sampled].src_jumps, src.size()); - for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } - cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; - } - cerr << "LLH: " << log(m.Likelihood()) << endl; - for (int sni = 0; sni < 5; ++sni) { - for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } - } - } - return 0; -} - diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc deleted file mode 100644 index 3169eb75..00000000 --- a/gi/pf/pfdist.new.cc +++ /dev/null @@ -1,620 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "base_measures.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr prng; - -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("particles,p",po::value()->default_value(25),"Number of particles") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_f, - set* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if 
(filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -#if 0 -struct MyConditionalModel { - MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} - - prob_t srcp0(const vector& src) const { - prob_t p(1.0 / 3000.0); - p.poweq(src.size()); - prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); - p *= lenp; - return p; - } - - void DecrementRule(const TRule& rule) { - const RuleCRPMap::iterator it = rules.find(rule.f_); - assert(it != rules.end()); - if (it->second.decrement(rule)) { - base /= (*rp0)(rule); - if (it->second.num_customers() == 0) - rules.erase(it); - } - if (src_phrases.decrement(rule.f_)) - base /= srcp0(rule.f_); - } - - void IncrementRule(const TRule& rule) { - RuleCRPMap::iterator it = rules.find(rule.f_); - if (it == rules.end()) - it = rules.insert(make_pair(rule.f_, CCRP_NoTable(1,1))).first; - if (it->second.increment(rule)) { - base *= (*rp0)(rule); - } - if (src_phrases.increment(rule.f_)) - base *= srcp0(rule.f_); - } - - void IncrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - void IncrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].increment(dist)) - base *= jp0(dist, src_len); - } - - void DecrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].decrement(dist)) - base /= jp0(dist, src_len); - } - - void IncrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - IncrementJump(js[i], src_len); - } - - void DecrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - DecrementJump(js[i], src_len); - } - - // p(jump = dist | src_len , z) - prob_t JumpProbability(int dist, unsigned src_len) { - const prob_t p0 = jp0(dist, src_len); - const double lp = src_jumps[src_len].logprob(dist, log(p0)); - prob_t q; q.logeq(lp); - return q; - } - - // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) - prob_t RuleProbability(const TRule& rule) const { - const prob_t p0 = (*rp0)(rule); - prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); - const RuleCRPMap::const_iterator it = rules.find(rule.f_); - if (it == rules.end()) return srcp * p0; - const double lp = it->second.logprob(rule, log(p0)); - prob_t q; q.logeq(lp); - return q * srcp; - } - - prob_t Likelihood() const { - prob_t p = base; - for (RuleCRPMap::const_iterator it = rules.begin(); - it != rules.end(); ++it) { - prob_t cl; cl.logeq(it->second.log_crp_prob()); - p *= cl; - } - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p 
*= q; - } - } - return p; - } - - JumpBase jp0; - const PhraseConditionalBase* rp0; - prob_t base; - typedef unordered_map, CCRP_NoTable, boost::hash > > RuleCRPMap; - RuleCRPMap rules; - CCRP_NoTable > src_phrases; - vector > src_jumps; -}; - -#endif - -struct MyJointModel { - MyJointModel(PhraseJointBase& rcp0) : - rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable(1,1)) {} - - void DecrementRule(const TRule& rule) { - if (rules.decrement(rule)) - base /= rp0(rule); - } - - void IncrementRule(const TRule& rule) { - if (rules.increment(rule)) - base *= rp0(rule); - } - - void IncrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - void IncrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].increment(dist)) - base *= jp0(dist, src_len); - } - - void DecrementJump(int dist, unsigned src_len) { - assert(src_len > 0); - if (src_jumps[src_len].decrement(dist)) - base /= jp0(dist, src_len); - } - - void IncrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - IncrementJump(js[i], src_len); - } - - void DecrementJumps(const vector& js, unsigned src_len) { - for (unsigned i = 0; i < js.size(); ++i) - DecrementJump(js[i], src_len); - } - - // p(jump = dist | src_len , z) - prob_t JumpProbability(int dist, unsigned src_len) { - const prob_t p0 = jp0(dist, src_len); - const double lp = src_jumps[src_len].logprob(dist, log(p0)); - prob_t q; q.logeq(lp); - return q; - } - - // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) - prob_t RuleProbability(const TRule& rule) const { - prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); - return p; - } - - prob_t Likelihood() const { - prob_t p = base; - prob_t q; q.logeq(rules.log_crp_prob()); - p *= q; - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p *= q; - } - } - return p; - } - - JumpBase jp0; - const PhraseJointBase& rp0; - prob_t base; - CCRP_NoTable rules; - vector > src_jumps; -}; - -struct BackwardEstimate { - BackwardEstimate(const Model1& m1, const vector& src, const vector& trg) : - model1_(m1), src_(src), trg_(trg) { - } - const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { - assert(src_.size() == src_cov.size()); - assert(trg_cov <= trg_.size()); - prob_t& e = cache_[src_cov][trg_cov]; - if (e.is_0()) { - if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } - vector r(src_.size() + 1); r.clear(); - r.push_back(0); // NULL word - for (int i = 0; i < src_cov.size(); ++i) - if (!src_cov[i]) r.push_back(src_[i]); - const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_cov; j < trg_.size(); ++j) { - prob_t p; - for (unsigned i = 0; i < r.size(); ++i) - p += model1_(r[i], trg_[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; - abort(); - } - p *= uniform_alignment; - e *= p; - } - } - return e; - } - const Model1& model1_; - const vector& src_; - const vector& trg_; - mutable unordered_map, map, boost::hash > > cache_; -}; - -struct BackwardEstimateSym { - BackwardEstimateSym(const Model1& m1, - const Model1& invm1, const vector& src, const vector& trg) : - 
model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { - } - const prob_t& operator()(const vector& src_cov, unsigned trg_cov) const { - assert(src_.size() == src_cov.size()); - assert(trg_cov <= trg_.size()); - prob_t& e = cache_[src_cov][trg_cov]; - if (e.is_0()) { - if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } - vector r(src_.size() + 1); r.clear(); - for (int i = 0; i < src_cov.size(); ++i) - if (!src_cov[i]) r.push_back(src_[i]); - r.push_back(0); // NULL word - const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_cov; j < trg_.size(); ++j) { - prob_t p; - for (unsigned i = 0; i < r.size(); ++i) - p += model1_(r[i], trg_[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; - abort(); - } - p *= uniform_alignment; - e *= p; - } - r.pop_back(); - const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); - prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); - for (unsigned i = 0; i < r.size(); ++i) { - prob_t p; - for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) - p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); - if (p.is_0()) { - cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; - abort(); - } - p *= inv_uniform; - inv *= p; - } - prob_t x = pow(e * inv, 0.5); - e = x; - //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; - } - return e; - } - const Model1& model1_; - const Model1& invmodel1_; - const vector& src_; - const vector& trg_; - mutable unordered_map, map, boost::hash > > cache_; -}; - -struct Particle { - Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} - prob_t weight; - prob_t gamma_last; - vector src_jumps; - vector rules; - vector src_cv; - int src_cov; - int trg_cov; - int prev_pos; -}; - -ostream& operator<<(ostream& o, const vector& v) { - for (int i = 0; i < v.size(); ++i) - o << (v[i] ? 
'1' : '0'); - return o; -} -ostream& operator<<(ostream& o, const Particle& p) { - o << "[cv=" << p.src_cv << " src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; - return o; -} - -void FilterCrapParticlesAndReweight(vector* pps) { - vector& ps = *pps; - SampleSet ss; - for (int i = 0; i < ps.size(); ++i) - ss.add(ps[i].weight); - vector nps; nps.reserve(ps.size()); - const prob_t uniform_weight(1.0 / ps.size()); - for (int i = 0; i < ps.size(); ++i) { - nps.push_back(ps[prng->SelectSample(ss)]); - nps[i].weight = uniform_weight; - } - nps.swap(ps); -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); - const unsigned particles = conf["particles"].as(); - const unsigned samples = conf["samples"].as(); - - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - assert(corpusf.size() == corpuse.size()); - - const int kLHS = -TD::Convert("X"); - Model1 m1(conf["model1"].as()); - Model1 invm1(conf["inverse_model1"].as()); - -#if 0 - PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); - MyConditionalModel m(lp0); -#else - PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); - MyJointModel m(lp0); -#endif - - cerr << "Initializing reachability limits...\n"; - vector ps(corpusf.size()); - vector reaches; reaches.reserve(corpusf.size()); - for (int ci = 0; ci < corpusf.size(); ++ci) - reaches.push_back(Reachability(corpusf[ci].size(), - corpuse[ci].size(), - kMAX_SRC_PHRASE, - kMAX_TRG_PHRASE)); - cerr << "Sampling...\n"; - vector tmp_p(10000); // work space - SampleSet pfss; - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpusf.size(); ++ci) { - vector& src = corpusf[ci]; - vector& trg = corpuse[ci]; - m.DecrementRules(ps[ci].rules); - m.DecrementJumps(ps[ci].src_jumps, src.size()); - - //BackwardEstimate be(m1, src, trg); - BackwardEstimateSym be(m1, invm1, src, trg); - const Reachability& r = reaches[ci]; - vector lps(particles); - - for (int pi = 0; pi < particles; ++pi) { - Particle& p = lps[pi]; - p.src_cv.resize(src.size(), false); - } - - bool all_complete = false; - while(!all_complete) { - SampleSet ss; - - // all particles have now been extended a bit, we will reweight them now - if (lps[0].trg_cov > 0) - FilterCrapParticlesAndReweight(&lps); - - // loop over all particles and extend them - bool done_nothing = true; - for (int pi = 0; pi < particles; ++pi) { - Particle& p = lps[pi]; - int tic = 0; - const int rejuv_freq = 1; - while(p.trg_cov < trg.size() && tic < rejuv_freq) { - ++tic; - done_nothing = false; - ss.clear(); - TRule x; x.lhs_ = kLHS; - prob_t z; - int first_uncovered = src.size(); - int last_uncovered = -1; - for (int i = 0; i < 
src.size(); ++i) { - const bool is_uncovered = !p.src_cv[i]; - if (i < first_uncovered && is_uncovered) first_uncovered = i; - if (is_uncovered && i > last_uncovered) last_uncovered = i; - } - assert(last_uncovered > -1); - assert(first_uncovered < src.size()); - - for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { - x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); - for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { - if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - - const int last_possible_start = last_uncovered - src_len + 1; - assert(last_possible_start >= 0); - //cerr << src_len << "," << trg_len << " is allowed. E=" << TD::GetString(x.e_) << endl; - //cerr << " first_uncovered=" << first_uncovered << " last_possible_start=" << last_possible_start << endl; - for (int i = first_uncovered; i <= last_possible_start; ++i) { - if (p.src_cv[i]) continue; - assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size - Particle& np = tmp_p[ss.size()]; - np = p; - x.f_.clear(); - int gap_add = 0; - bool bad = false; - prob_t jp = prob_t::One(); - int prev_pos = p.prev_pos; - for (int j = 0; j < src_len; ++j) { - if ((j + i + gap_add) == src.size()) { bad = true; break; } - while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } - if ((j + i + gap_add) == src.size()) { bad = true; break; } - np.src_cv[i + j + gap_add] = true; - x.f_.push_back(src[i + j + gap_add]); - jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); - int jump = i + j + gap_add - prev_pos; - assert(jump != 0); - np.src_jumps.push_back(jump); - prev_pos = i + j + gap_add; - } - if (bad) continue; - np.prev_pos = prev_pos; - np.src_cov += x.f_.size(); - np.trg_cov += x.e_.size(); - if (x.f_.size() != src_len) continue; - prob_t rp = m.RuleProbability(x); - np.gamma_last = rp * jp; - const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); - //cerr << "**rule=" << x << endl; - //cerr << " u=" << log(u) << " rule=" << rp << " jump=" << jp << endl; - ss.add(u); - np.rules.push_back(TRulePtr(new TRule(x))); - z += u; - - const bool completed = (p.trg_cov == trg.size()); - if (completed) { - int last_jump = src.size() - p.prev_pos; - assert(last_jump > 0); - p.src_jumps.push_back(last_jump); - p.weight *= m.JumpProbability(last_jump, src.size()); - } - } - } - } - cerr << "number of edges to consider: " << ss.size() << endl; - const int sampled = rng.SelectSample(ss); - prob_t q_n = ss[sampled] / z; - p = tmp_p[sampled]; - //m.IncrementRule(*p.rules.back()); - p.weight *= p.gamma_last / q_n; - cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; - cerr << p << endl; - } - } // loop over particles (pi = 0 .. 
particles) - if (done_nothing) all_complete = true; - } - pfss.clear(); - for (int i = 0; i < lps.size(); ++i) - pfss.add(lps[i].weight); - const int sampled = rng.SelectSample(pfss); - ps[ci] = lps[sampled]; - m.IncrementRules(lps[sampled].rules); - m.IncrementJumps(lps[sampled].src_jumps, src.size()); - for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } - cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; - } - cerr << "LLH: " << log(m.Likelihood()) << endl; - for (int sni = 0; sni < 5; ++sni) { - for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } - } - } - return 0; -} - diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc deleted file mode 100644 index 958ec4e2..00000000 --- a/gi/pf/pfnaive.cc +++ /dev/null @@ -1,284 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "pf.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" -#include "corpus.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("particles,p",po::value()->default_value(30),"Number of particles") - ("filter_frequency,f",po::value()->default_value(5),"Number of time steps between filterings") - ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(5),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(5),"Maximum length of target language phrases") - ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") - ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") - ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct BackwardEstimateSym { - BackwardEstimateSym(const Model1& m1, - const Model1& invm1, const vector& src, const vector& trg) : - model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { - } - const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const { - assert(src_cov <= src_.size()); - assert(trg_cov <= trg_.size()); - prob_t& e = cache_[src_cov][trg_cov]; - if (e.is_0()) { - if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } - vector r(src_.size() + 1); r.clear(); - for (int i = src_cov; i < src_.size(); ++i) - 
r.push_back(src_[i]); - r.push_back(0); // NULL word - const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) - for (unsigned j = trg_cov; j < trg_.size(); ++j) { - prob_t p; - for (unsigned i = 0; i < r.size(); ++i) - p += model1_(r[i], trg_[j]); - if (p.is_0()) { - cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; - abort(); - } - p *= uniform_alignment; - e *= p; - } - r.pop_back(); - const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); - prob_t inv; - inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); - for (unsigned i = 0; i < r.size(); ++i) { - prob_t p; - for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) - p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); - if (p.is_0()) { - cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; - abort(); - } - p *= inv_uniform; - inv *= p; - } - prob_t x = pow(e * inv, 0.5); - e = x; - //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; - } - return e; - } - const Model1& model1_; - const Model1& invmodel1_; - const vector& src_; - const vector& trg_; - mutable unordered_map > cache_; -}; - -struct Particle { - Particle() : weight(prob_t::One()), src_cov(), trg_cov() {} - prob_t weight; - prob_t gamma_last; - vector rules; - int src_cov; - int trg_cov; -}; - -ostream& operator<<(ostream& o, const vector& v) { - for (int i = 0; i < v.size(); ++i) - o << (v[i] ? '1' : '0'); - return o; -} -ostream& operator<<(ostream& o, const Particle& p) { - o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << " w=" << log(p.weight) << ']'; - return o; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); - const unsigned particles = conf["particles"].as(); - const unsigned samples = conf["samples"].as(); - const unsigned rejuv_freq = conf["filter_frequency"].as(); - - if (!conf.count("model1")) { - cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; - return 1; - } - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - cerr << "Reading corpus...\n"; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - assert(corpusf.size() == corpuse.size()); - - const int kLHS = -TD::Convert("X"); - Model1 m1(conf["model1"].as()); - Model1 invm1(conf["inverse_model1"].as()); - - PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); - PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); - MonotonicParallelSegementationModel m(alp0); - TRule xx("[X] ||| ms. kimura ||| MS. KIMURA ||| X=0"); - cerr << xx << endl << lp0(xx) << " " << alp0(xx) << endl; - TRule xx12("[X] ||| . ||| PHARMACY . ||| X=0"); - TRule xx21("[X] ||| pharmacy . ||| . ||| X=0"); -// TRule xx22("[X] ||| . ||| . ||| X=0"); - TRule xx22("[X] ||| . ||| THE . 
||| X=0"); - cerr << xx12 << "\t" << lp0(xx12) << " " << alp0(xx12) << endl; - cerr << xx21 << "\t" << lp0(xx21) << " " << alp0(xx21) << endl; - cerr << xx22 << "\t" << lp0(xx22) << " " << alp0(xx22) << endl; - - cerr << "Initializing reachability limits...\n"; - vector ps(corpusf.size()); - vector reaches; reaches.reserve(corpusf.size()); - for (int ci = 0; ci < corpusf.size(); ++ci) - reaches.push_back(Reachability(corpusf[ci].size(), - corpuse[ci].size(), - kMAX_SRC_PHRASE, - kMAX_TRG_PHRASE)); - cerr << "Sampling...\n"; - vector tmp_p(10000); // work space - SampleSet pfss; - SystematicResampleFilter filter(&rng); - // MultinomialResampleFilter filter(&rng); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpusf.size(); ++ci) { - vector& src = corpusf[ci]; - vector& trg = corpuse[ci]; - m.DecrementRulesAndStops(ps[ci].rules); - const prob_t q_stop = m.StopProbability(); - const prob_t q_cont = m.ContinueProbability(); - cerr << "P(stop)=" << q_stop << "\tP(continue)=" < lps(particles); - - bool all_complete = false; - while(!all_complete) { - SampleSet ss; - - // all particles have now been extended a bit, we will reweight them now - if (lps[0].trg_cov > 0) - filter(&lps); - - // loop over all particles and extend them - bool done_nothing = true; - for (int pi = 0; pi < particles; ++pi) { - Particle& p = lps[pi]; - int tic = 0; - while(p.trg_cov < trg.size() && tic < rejuv_freq) { - ++tic; - done_nothing = false; - ss.clear(); - TRule x; x.lhs_ = kLHS; - prob_t z; - - for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { - x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); - for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { - if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - - int i = p.src_cov; - assert(ss.size() < tmp_p.size()); // if fails increase tmp_p size - Particle& np = tmp_p[ss.size()]; - np = p; - x.f_.clear(); - for (int j = 0; j < src_len; ++j) - x.f_.push_back(src[i + j]); - np.src_cov += x.f_.size(); - np.trg_cov += x.e_.size(); - const bool stop_now = (np.src_cov == src_len && np.trg_cov == trg_len); - prob_t rp = m.RuleProbability(x) * (stop_now ? q_stop : q_cont); - np.gamma_last = rp; - const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); - //cerr << "**rule=" << x << endl; - //cerr << " u=" << log(u) << " rule=" << rp << endl; - ss.add(u); - np.rules.push_back(TRulePtr(new TRule(x))); - z += u; - } - } - //cerr << "number of edges to consider: " << ss.size() << endl; - const int sampled = rng.SelectSample(ss); - prob_t q_n = ss[sampled] / z; - p = tmp_p[sampled]; - //m.IncrementRule(*p.rules.back()); - p.weight *= p.gamma_last / q_n; - //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; - //cerr << p << endl; - } - } // loop over particles (pi = 0 .. 
particles) - if (done_nothing) all_complete = true; - prob_t wv = prob_t::Zero(); - for (int pp = 0; pp < lps.size(); ++pp) - wv += lps[pp].weight; - for (int pp = 0; pp < lps.size(); ++pp) - lps[pp].weight /= wv; - } - pfss.clear(); - for (int i = 0; i < lps.size(); ++i) - pfss.add(lps[i].weight); - const int sampled = rng.SelectSample(pfss); - ps[ci] = lps[sampled]; - m.IncrementRulesAndStops(lps[sampled].rules); - for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } - cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; - } - cerr << "LLH: " << log(m.Likelihood()) << endl; - } - return 0; -} - diff --git a/gi/pf/poisson_uniform_word_model.h b/gi/pf/poisson_uniform_word_model.h deleted file mode 100644 index 76204a0e..00000000 --- a/gi/pf/poisson_uniform_word_model.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _POISSON_UNIFORM_WORD_MODEL_H_ -#define _POISSON_UNIFORM_WORD_MODEL_H_ - -#include -#include -#include "prob.h" -#include "m.h" - -// len ~ Poisson(lambda) -// for (1..len) -// e_i ~ Uniform({Vocabulary}) -struct PoissonUniformWordModel { - explicit PoissonUniformWordModel(const unsigned vocab_size, - const unsigned alphabet_size, - const double mean_len = 5) : - lh(prob_t::One()), - v0(-std::log(vocab_size)), - u0(-std::log(alphabet_size)), - mean_length(mean_len) {} - - void ResampleHyperparameters(MT19937*) {} - - inline prob_t operator()(const std::vector& s) const { - prob_t p; - p.logeq(Md::log_poisson(s.size(), mean_length) + s.size() * u0); - //p.logeq(v0); - return p; - } - - inline void Increment(const std::vector& w, MT19937*) { - lh *= (*this)(w); - } - - inline void Decrement(const std::vector& w, MT19937 *) { - lh /= (*this)(w); - } - - inline prob_t Likelihood() const { return lh; } - - void Summary() const {} - - private: - - prob_t lh; // keeps track of the draws from the base distribution - const double v0; // uniform log prob of generating a word - const double u0; // uniform log prob of generating a letter - const double mean_length; // mean length of a word in the base distribution -}; - -#endif diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc deleted file mode 100644 index 605d8206..00000000 --- a/gi/pf/pyp_lm.cc +++ /dev/null @@ -1,273 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "gamma_poisson.h" -#include "corpus_tools.h" -#include "m.h" -#include "tdict.h" -#include "sampler.h" -#include "ccrp.h" -#include "tied_resampler.h" - -// A not very memory-efficient implementation of an N-gram LM based on PYPs -// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model -// based on Pitman-Yor Processes. In Proc. ACL. 
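// A sketch of the predictive rule the CCRP objects below implement (the
// standard Pitman-Yor CRP predictive distribution from the Teh 2006 paper
// cited above): for a context u with customer count c(u,w) and table count
// t(u,w) for word w, totals c(u) and t(u), discount d and strength s,
//
//   p(w | u) = (c(u,w) - d * t(u,w)) / (s + c(u))
//            + (s + d * t(u)) / (s + c(u)) * p(w | backoff(u))
//
// where backoff(u) drops the most distant context word; the recursion
// bottoms out in the PYPLM<0> base case defined below.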
- -// I use templates to handle the recursive formulation of the prior, so - the order of the model has to be specified here, at compile time: -#define kORDER 3 - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples") - ("train,i",po::value<string>(),"Training data file") - ("test,T",po::value<string>(),"Test data file") - ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") - ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") - ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") - ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") - ("random_seed,S",po::value<uint32_t>(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("train") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -// uniform distribution over a fixed vocabulary -struct UniformVocabulary { - UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} - void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } - void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } - double prob(WordID, const vector<WordID>&) const { return p0; } - void resample_hyperparameters(MT19937*) {} - double log_likelihood() const { return draws * log(p0); } - const double p0; - int draws; -}; - -// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS.
-// Journal of Statistical Planning and Inference 14 (1986) 311-322 -struct PoissonLengthUniformCharWordModel { - explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {} - void increment(WordID w, const vector& v, MT19937*) { - llh += log(prob(w, v)); // this isn't quite right - plen.increment(TD::Convert(w).size() - 1); - } - void decrement(WordID w, const vector& v, MT19937*) { - plen.decrement(TD::Convert(w).size() - 1); - llh -= log(prob(w, v)); // this isn't quite right - } - double prob(WordID w, const vector&) const { - const unsigned len = TD::Convert(w).size(); - return plen.prob(len - 1) * exp(uc * len); - } - double log_likelihood() const { return llh; } - void resample_hyperparameters(MT19937*) {} - GammaPoisson plen; - const double uc; - double llh; -}; - -struct PYPAdaptedPoissonLengthUniformCharWordModel { - explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : - base(vocab_size,1,1,1,1), - crp(1,1,1,1) {} - void increment(WordID w, const vector& v, MT19937* rng) { - double p0 = base.prob(w, v); - if (crp.increment(w, p0, rng)) - base.increment(w, v, rng); - } - void decrement(WordID w, const vector& v, MT19937* rng) { - if (crp.decrement(w, rng)) - base.decrement(w, v, rng); - } - double prob(WordID w, const vector& v) const { - double p0 = base.prob(w, v); - return crp.prob(w, p0); - } - double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); } - void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); } - PoissonLengthUniformCharWordModel base; - CCRP crp; -}; - -template struct PYPLM; - -#if 1 -template<> struct PYPLM<0> : public UniformVocabulary { - PYPLM(unsigned vs, double a, double b, double c, double d) : - UniformVocabulary(vs, a, b, c, d) {} -}; -#else -#if 0 -template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel { - PYPLM(unsigned vs, double a, double b, double c, double d) : - PoissonLengthUniformCharWordModel(vs, a, b, c, d) {} -}; -#else -template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel { - PYPLM(unsigned vs, double a, double b, double c, double d) : - PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {} -}; -#endif -#endif - -// represents an N-gram LM -template struct PYPLM { - PYPLM(unsigned vs, double da, double db, double ss, double sr) : - backoff(vs, da, db, ss, sr), - tr(da, db, ss, sr, 0.8, 1.0), - lookup(N-1) {} - void increment(WordID w, const vector& context, MT19937* rng) { - const double bo = backoff.prob(w, context); - for (unsigned i = 0; i < N-1; ++i) - lookup[i] = context[context.size() - 1 - i]; - typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); - if (it == p.end()) { - it = p.insert(make_pair(lookup, CCRP(0.5,1))).first; - tr.Add(&it->second); // add to resampler - } - if (it->second.increment(w, bo, rng)) - backoff.increment(w, context, rng); - } - void decrement(WordID w, const vector& context, MT19937* rng) { - for (unsigned i = 0; i < N-1; ++i) - lookup[i] = context[context.size() - 1 - i]; - typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); - assert(it != p.end()); - if (it->second.decrement(w, rng)) - backoff.decrement(w, context, rng); - } - double prob(WordID w, const vector& context) const { - const double bo = backoff.prob(w, context); - for (unsigned i = 0; i < N-1; ++i) - lookup[i] = context[context.size() - 1 - i]; - typename 
unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); - if (it == p.end()) return bo; - return it->second.prob(w, bo); - } - - double log_likelihood() const { - double llh = backoff.log_likelihood(); - typename unordered_map, CCRP, boost::hash > >::const_iterator it; - for (it = p.begin(); it != p.end(); ++it) - llh += it->second.log_crp_prob(); - llh += tr.LogLikelihood(); - return llh; - } - - void resample_hyperparameters(MT19937* rng) { - tr.ResampleHyperparameters(rng); - backoff.resample_hyperparameters(rng); - } - - PYPLM backoff; - TiedResampler > tr; - double discount_a, discount_b, strength_s, strength_r; - double d, strength; - mutable vector lookup; // thread-local - unordered_map, CCRP, boost::hash > > p; -}; - -int main(int argc, char** argv) { - po::variables_map conf; - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - vector > corpuse; - set vocabe; - const WordID kEOS = TD::Convert(""); - cerr << "Reading corpus...\n"; - CorpusTools::ReadFromFile(conf["train"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - vector > test; - if (conf.count("test")) - CorpusTools::ReadFromFile(conf["test"].as(), &test); - else - test = corpuse; - PYPLM lm(vocabe.size(), - conf["discount_prior_a"].as(), - conf["discount_prior_b"].as(), - conf["strength_prior_s"].as(), - conf["strength_prior_r"].as()); - vector ctx(kORDER - 1, TD::Convert("")); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - ctx.resize(kORDER - 1); - const vector& s = corpuse[ci]; - for (int i = 0; i <= s.size(); ++i) { - WordID w = (i < s.size() ? s[i] : kEOS); - if (SS > 0) lm.decrement(w, ctx, &rng); - lm.increment(w, ctx, &rng); - ctx.push_back(w); - } - } - if (SS % 10 == 9) { - cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; - if (SS % 30 == 29) lm.resample_hyperparameters(&rng); - } else { cerr << '.' << flush; } - } - double llh = 0; - unsigned cnt = 0; - unsigned oovs = 0; - for (int ci = 0; ci < test.size(); ++ci) { - ctx.resize(kORDER - 1); - const vector& s = test[ci]; - for (int i = 0; i <= s.size(); ++i) { - WordID w = (i < s.size() ? 
s[i] : kEOS); - double lp = log(lm.prob(w, ctx)) / log(2); - if (i < s.size() && vocabe.count(w) == 0) { - cerr << "**OOV "; - ++oovs; - lp = 0; - } - cerr << "p(" << TD::Convert(w) << " |"; - for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) - cerr << ' ' << TD::Convert(ctx[j]); - cerr << ") = " << lp << endl; - ctx.push_back(w); - llh -= lp; - cnt++; - } - } - cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; - cerr << " Count: " << cnt << endl; - cerr << " OOVs: " << oovs << endl; - cerr << "Cross-entropy: " << (llh / cnt) << endl; - cerr << " Perplexity: " << pow(2, llh / cnt) << endl; - return 0; -} - - diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc deleted file mode 100644 index 37b9a604..00000000 --- a/gi/pf/pyp_tm.cc +++ /dev/null @@ -1,128 +0,0 @@ -#include "pyp_tm.h" - -#include -#include -#include - -#include "tdict.h" -#include "ccrp.h" -#include "pyp_word_model.h" -#include "tied_resampler.h" - -using namespace std; -using namespace std::tr1; - -struct FreqBinner { - FreqBinner(const std::string& fname) { fd_.Load(fname); } - unsigned NumberOfBins() const { return fd_.Max() + 1; } - unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } - FreqDict fd_; -}; - -template -struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : - base(*b), - binner(bnr), - btr(binner ? binner->NumberOfBins() + 1u : 2u) {} - - void Summary() const { - cerr << "Number of conditioning contexts: " << r.size() << endl; - for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; - for (CCRP >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - cerr << " " << i2->second << '\t' << TD::GetString(i2->first) << endl; - } - } - - void ResampleHyperparameters(MT19937* rng) { - btr.ResampleHyperparameters(rng); - } - - prob_t Prob(const WordID src, const vector& trglets) const { - RuleModelHash::const_iterator it = r.find(src); - if (it == r.end()) { - return base(trglets); - } else { - return it->second.prob(trglets, base(trglets)); - } - } - - void Increment(const WordID src, const vector& trglets, MT19937* rng) { - RuleModelHash::iterator it = r.find(src); - if (it == r.end()) { - it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; - static const WordID kNULL = TD::Convert("NULL"); - unsigned bin = (src == kNULL ? 
0 : 1); - if (binner && bin) { bin = binner->Bin(src) + 1; } - btr.Add(bin, &it->second); - } - if (it->second.increment(trglets, base(trglets), rng)) - base.Increment(trglets, rng); - } - - void Decrement(const WordID src, const vector& trglets, MT19937* rng) { - RuleModelHash::iterator it = r.find(src); - assert(it != r.end()); - if (it->second.decrement(trglets, rng)) { - base.Decrement(trglets, rng); - } - } - - prob_t Likelihood() const { - prob_t p = prob_t::One(); - for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - prob_t q; q.logeq(it->second.log_crp_prob()); - p *= q; - } - return p; - } - - unsigned UniqueConditioningContexts() const { - return r.size(); - } - - // TODO tie PYP hyperparameters based on source word frequency bins - Base& base; - const Binner* binner; - BinTiedResampler > > btr; - typedef unordered_map > > RuleModelHash; - RuleModelHash r; -}; - -PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets, - const unsigned vocab_size, - const unsigned num_letters) : - letters(lets), - base(vocab_size, num_letters, 5), - tmodel(new ConditionalPYPWordModel(&base, new FreqBinner("10k.freq"))), - kX(-TD::Convert("X")) {} - -void PYPLexicalTranslation::Summary() const { - tmodel->Summary(); -} - -prob_t PYPLexicalTranslation::Likelihood() const { - return tmodel->Likelihood() * base.Likelihood(); -} - -void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { - tmodel->ResampleHyperparameters(rng); -} - -unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { - return tmodel->UniqueConditioningContexts(); -} - -prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { - return tmodel->Prob(src, letters[trg]); -} - -void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { - tmodel->Increment(src, letters[trg], rng); -} - -void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { - tmodel->Decrement(src, letters[trg], rng); -} - diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h deleted file mode 100644 index 2b076a25..00000000 --- a/gi/pf/pyp_tm.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef PYP_LEX_TRANS -#define PYP_LEX_TRANS - -#include -#include "wordid.h" -#include "prob.h" -#include "sampler.h" -#include "freqdict.h" -#include "poisson_uniform_word_model.h" - -struct FreqBinner; -template struct ConditionalPYPWordModel; - -struct PYPLexicalTranslation { - explicit PYPLexicalTranslation(const std::vector >& lets, - const unsigned vocab_size, - const unsigned num_letters); - - prob_t Likelihood() const; - - void ResampleHyperparameters(MT19937* rng); - prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) - void Summary() const; - void Increment(WordID src, WordID trg, MT19937* rng); - void Decrement(WordID src, WordID trg, MT19937* rng); - unsigned UniqueConditioningContexts() const; - - private: - const std::vector >& letters; // spelling dictionary - PoissonUniformWordModel base; // "generator" of English types - ConditionalPYPWordModel* tmodel; // translation distributions - // (model English word | French word) - const WordID kX; -}; - -#endif diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h deleted file mode 100644 index 0bebb751..00000000 --- a/gi/pf/pyp_word_model.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _PYP_WORD_MODEL_H_ -#define _PYP_WORD_MODEL_H_ - -#include -#include -#include -#include "prob.h" -#include "ccrp.h" -#include "m.h" -#include "tdict.h" -#include "os_phrase.h" - -// PYP(d,s,poisson-uniform) represented as a CRP -template 
-struct PYPWordModel { - explicit PYPWordModel(Base* b) : - base(*b), - r(1,1,1,1,0.66,50.0) - {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - std::cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; - } - - inline prob_t operator()(const std::vector& s) const { - return r.prob(s, base(s)); - } - - inline void Increment(const std::vector& s, MT19937* rng) { - if (r.increment(s, base(s), rng)) - base.Increment(s, rng); - } - - inline void Decrement(const std::vector& s, MT19937 *rng) { - if (r.decrement(s, rng)) - base.Decrement(s, rng); - } - - inline prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base.Likelihood(); - return p; - } - - void Summary() const { - std::cerr << "PYPWordModel: generations=" << r.num_customers() - << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << std::endl; - for (typename CCRP >::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << " " << it->second - << TD::GetString(it->first) << std::endl; - } - } - - private: - - Base& base; // keeps track of the draws from the base distribution - CCRP > r; -}; - -#endif diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h deleted file mode 100644 index 4075affe..00000000 --- a/gi/pf/quasi_model2.h +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef _QUASI_MODEL2_H_ -#define _QUASI_MODEL2_H_ - -#include -#include -#include -#include "boost/functional.hpp" -#include "prob.h" -#include "array2d.h" -#include "slice_sampler.h" -#include "m.h" -#include "have_64_bits.h" - -struct AlignmentObservation { - AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} - AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : - src_len(sl), trg_len(tl), j(tw), a_j(sw) {} - unsigned short src_len; - unsigned short trg_len; - unsigned short j; - unsigned short a_j; -}; - -#ifdef HAVE_64_BITS -inline size_t hash_value(const AlignmentObservation& o) { - return reinterpret_cast(o); -} -inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { - return hash_value(a) == hash_value(b); -} -#else -inline size_t hash_value(const AlignmentObservation& o) { - size_t h = 1; - boost::hash_combine(h, o.src_len); - boost::hash_combine(h, o.trg_len); - boost::hash_combine(h, o.j); - boost::hash_combine(h, o.a_j); - return h; -} -#endif - -struct QuasiModel2 { - explicit QuasiModel2(double alpha, double pnull = 0.1) : - alpha_(alpha), - pnull_(pnull), - pnotnull_(1 - pnull) {} - - // a_j = 0 => NULL; src_len does *not* include null - prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { - if (!a_j) return pnull_; - return pnotnull_ * - prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); - } - - void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { - assert(a_j <= src_len); - assert(j < trg_len); - ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; - } - - void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { - const AlignmentObservation ao(src_len, trg_len, j, a_j); - int &cc = obs_[ao]; - assert(cc > 0); - --cc; - if (!cc) obs_.erase(ao); - } - - struct PNullResampler { - PNullResampler(const QuasiModel2& m) : m_(m) {} - const QuasiModel2& m_; - double operator()(const double& proposed_pnull) const { - return log(m_.Likelihood(m_.alpha_, proposed_pnull)); - } - }; - - struct AlphaResampler { - AlphaResampler(const QuasiModel2& m) : m_(m) {} - const QuasiModel2& 
m_; - double operator()(const double& proposed_alpha) const { - return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); - } - }; - - void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - const PNullResampler dr(*this); - const AlphaResampler ar(*this); - for (unsigned i = 0; i < nloop; ++i) { - double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, - 1.0, 0.0, niterations, 100*niterations); - pnull_ = prob_t(pnull); - alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" - << pnull_.as_float() << ") = " << Likelihood() << std::endl; - zcache_.clear(); - } - - prob_t Likelihood() const { - return Likelihood(alpha_, pnull_.as_float()); - } - - prob_t Likelihood(double alpha, double ppnull) const { - const prob_t pnull(ppnull); - const prob_t pnotnull(1 - ppnull); - - prob_t p; - p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure - assert(!p.is_0()); - prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); - assert(!prob_of_ppnull.is_0()); - p *= prob_of_ppnull; - for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { - const AlignmentObservation& ao = it->first; - if (ao.a_j) { - prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); - prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); - prob_t pa(u / z); - pa *= pnotnull; - pa.poweq(it->second); - p *= pa; - } else { - p *= pnull.pow(it->second); - } - } - return p; - } - - private: - static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { - prob_t p; - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); - return p; - } - - static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { - prob_t z = prob_t::Zero(); - for (int a_j = 1; a_j <= src_len; ++a_j) - z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); - return z; - } - - static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { - return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); - } - - static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { - double z = 0; - for (int a_j = 1; a_j <= src_len; ++a_j) - z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); - return z; - } - - const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { - if (src_len >= zcache_.size()) - zcache_.resize(src_len + 1); - if (trg_len >= zcache_[src_len].size()) - zcache_[src_len].resize(trg_len + 1); - std::vector& zv = zcache_[src_len][trg_len]; - if (zv.size() == 0) - zv.resize(trg_len); - double& z = zv[j]; - if (!z) - z = ComputeZ(j, src_len, trg_len, alpha_); - return z; - } - - double alpha_; - prob_t pnull_; - prob_t pnotnull_; - mutable std::vector > > zcache_; - typedef std::tr1::unordered_map > ObsCount; - ObsCount obs_; -}; - -#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc deleted file mode 100644 index 7d0d04ac..00000000 --- a/gi/pf/reachability.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "reachability.h" - -#include -#include - -using namespace std; - -struct SState { - SState() : prev_src_covered(), prev_trg_covered() {} - SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} - int prev_src_covered; - int prev_trg_covered; -}; - 
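The quasi-model-2 distortion defined above gives each non-NULL alignment link a probability that decays exponentially with the distance between the relative positions of the linked words, renormalized over all source positions. A self-contained sketch of the same computation, in plain double arithmetic rather than prob_t:

#include <cmath>

// p(a_j | j, src_len, trg_len) for a non-NULL link a_j in 1..src_len:
// proportional to exp(-alpha * |(a_j - 1)/src_len - j/trg_len|).
// NULL links (a_j = 0) receive the separate p_null mass, as in Prob() above.
double qm2_prob(unsigned a_j, unsigned j, unsigned src_len,
                unsigned trg_len, double alpha) {
  double z = 0;
  for (unsigned k = 1; k <= src_len; ++k)
    z += std::exp(-std::fabs(double(k - 1) / src_len -
                             double(j) / trg_len) * alpha);
  double u = std::exp(-std::fabs(double(a_j - 1) / src_len -
                                 double(j) / trg_len) * alpha);
  return u / z;
}

Larger alpha concentrates mass near the diagonal, while alpha near 0 approaches a uniform distribution over source positions.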
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { - typedef boost::multi_array, 2> array_type; - array_type a(boost::extents[srclen + 1][trglen + 1]); - a[0][0].push_back(SState()); - for (int i = 0; i < srclen; ++i) { - for (int j = 0; j < trglen; ++j) { - if (a[i][j].size() == 0) continue; - const SState prev(i,j); - for (int k = 1; k <= src_max_phrase_len; ++k) { - if ((i + k) > srclen) continue; - for (int l = 1; l <= trg_max_phrase_len; ++l) { - if ((j + l) > trglen) continue; - a[i + k][j + l].push_back(prev); - } - } - } - } - a[0][0].clear(); - //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].empty()) { - cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; - nodes = 0; - return; - } - - typedef boost::multi_array rarray_type; - rarray_type r(boost::extents[srclen + 1][trglen + 1]); - r[srclen][trglen] = true; - nodes = 0; - for (int i = srclen; i >= 0; --i) { - for (int j = trglen; j >= 0; --j) { - vector& prevs = a[i][j]; - if (!r[i][j]) { prevs.clear(); } - for (int k = 0; k < prevs.size(); ++k) { - r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; - int src_delta = i - prevs[k].prev_src_covered; - edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; - valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair(src_delta,j - prevs[k].prev_trg_covered)); - short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; - if (src_delta > msd) msd = src_delta; - } - } - } - assert(!edges[0][0][1][0]); - assert(!edges[0][0][0][1]); - assert(!edges[0][0][0][0]); - assert(max_src_delta[0][0] > 0); - nodes = 0; - for (int i = 0; i < srclen; ++i) { - for (int j = 0; j < trglen; ++j) { - if (valid_deltas[i][j].size() > 0) { - node_addresses[i][j] = nodes++; - } else { - node_addresses[i][j] = -1; - } - } - } - cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; - } - diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h deleted file mode 100644 index 1e22c76a..00000000 --- a/gi/pf/reachability.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _REACHABILITY_H_ -#define _REACHABILITY_H_ - -#include "boost/multi_array.hpp" - -// determines minimum and maximum lengths of outgoing edges from all -// coverage positions such that the alignment path respects src and -// trg maximum phrase sizes -// -// runs in O(n^2 * src_max * trg_max) time but should be relatively fast -// -// currently forbids 0 -> n and n -> 0 alignments - -struct Reachability { - unsigned nodes; - boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? 
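Together with the members declared just below, this table is what the samplers in this directory consult when proposing phrase segmentations. A hypothetical usage sketch, assuming the stripped template arguments are bool for edges and std::pair<short,short> for the entries of valid_deltas:

#include <iostream>
#include <utility>
#include <vector>
#include "reachability.h"

// Hypothetical driver: enumerate the legal (src_delta, trg_delta) jumps
// out of the state where 3 source and 2 target words are covered, for a
// 10x8 sentence pair with phrase sizes capped at 4 on both sides.
int main() {
  Reachability r(10, 8, 4, 4);
  if (r.nodes == 0) { std::cout << "pair is unreachable\n"; return 1; }
  const std::vector<std::pair<short, short> >& ds = r.valid_deltas[3][2];
  for (size_t i = 0; i < ds.size(); ++i)
    std::cout << "src += " << ds[i].first
              << ", trg += " << ds[i].second << '\n';
  return 0;
}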
- boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid - boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") - boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : - nodes(), - edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]), - node_addresses(boost::extents[srclen][trglen]), - valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); - } - - private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); -}; - -#endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h deleted file mode 100644 index a4f4af36..00000000 --- a/gi/pf/tied_resampler.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _TIED_RESAMPLER_H_ -#define _TIED_RESAMPLER_H_ - -#include -#include -#include "sampler.h" -#include "slice_sampler.h" -#include "m.h" - -template -struct TiedResampler { - explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : - d_alpha(da), - d_beta(db), - s_shape(ss), - s_rate(sr), - discount(d), - strength(s) {} - - void Add(CRP* crp) { - crps.insert(crp); - crp->set_discount(discount); - crp->set_strength(strength); - assert(!crp->has_discount_prior()); - assert(!crp->has_strength_prior()); - } - - void Remove(CRP* crp) { - crps.erase(crp); - } - - size_t size() const { - return crps.size(); - } - - double LogLikelihood(double d, double s) const { - if (s <= -d) return -std::numeric_limits::infinity(); - double llh = Md::log_beta_density(d, d_alpha, d_beta) + - Md::log_gamma_density(d + s, s_shape, s_rate); - for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) - llh += (*it)->log_crp_prob(d, s); - return llh; - } - - double LogLikelihood() const { - return LogLikelihood(discount, strength); - } - - struct DiscountResampler { - DiscountResampler(const TiedResampler& m) : m_(m) {} - const TiedResampler& m_; - double operator()(const double& proposed_discount) const { - return m_.LogLikelihood(proposed_discount, m_.strength); - } - }; - - struct AlphaResampler { - AlphaResampler(const TiedResampler& m) : m_(m) {} - const TiedResampler& m_; - double operator()(const double& proposed_strength) const { - return m_.LogLikelihood(m_.discount, proposed_strength); - } - }; - - void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } - const DiscountResampler dr(*this); - const AlphaResampler ar(*this); - for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - double min_discount = std::numeric_limits::min(); - if (strength < 0.0) min_discount -= strength; - discount = slice_sampler1d(dr, discount, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } - strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - std::cerr << "TiedCRPs(d=" << discount << ",s=" - << 
strength << ") = " << LogLikelihood(discount, strength) << std::endl; - for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) - (*it)->set_hyperparameters(discount, strength); - } - private: - std::set crps; - const double d_alpha, d_beta, s_shape, s_rate; - double discount, strength; -}; - -// split according to some criterion -template -struct BinTiedResampler { - explicit BinTiedResampler(unsigned nbins) : - resamplers(nbins, TiedResampler(1,1,1,1)) {} - - void Add(unsigned bin, CRP* crp) { - resamplers[bin].Add(crp); - } - - void Remove(unsigned bin, CRP* crp) { - resamplers[bin].Remove(crp); - } - - void ResampleHyperparameters(MT19937* rng) { - for (unsigned i = 0; i < resamplers.size(); ++i) { - std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; - resamplers[i].ResampleHyperparameters(rng); - } - } - - double LogLikelihood() const { - double llh = 0; - for (unsigned i = 0; i < resamplers.size(); ++i) - llh += resamplers[i].LogLikelihood(); - return llh; - } - - private: - std::vector > resamplers; -}; - -#endif diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc deleted file mode 100644 index 7348d21c..00000000 --- a/gi/pf/tpf.cc +++ /dev/null @@ -1,99 +0,0 @@ -#include -#include -#include - -#include "sampler.h" - -using namespace std; -using namespace tr1; - -shared_ptr prng; - -struct Particle { - Particle() : weight(prob_t::One()) {} - vector states; - prob_t weight; - prob_t gamma_last; -}; - -ostream& operator<<(ostream& os, const Particle& p) { - os << "["; - for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' '; - os << "| w=" << log(p.weight) << ']'; - return os; -} - -void Rejuvenate(vector& pps) { - SampleSet ss; - vector nps(pps.size()); - for (int i = 0; i < pps.size(); ++i) { -// cerr << pps[i] << endl; - ss.add(pps[i].weight); - } -// cerr << "REJUVINATING...\n"; - for (int i = 0; i < pps.size(); ++i) { - nps[i] = pps[prng->SelectSample(ss)]; - nps[i].weight = prob_t(1.0 / pps.size()); -// cerr << nps[i] << endl; - } - nps.swap(pps); -// exit(1); -} - -int main(int argc, char** argv) { - const unsigned particles = 100; - prng.reset(new MT19937); - MT19937& rng = *prng; - - // q(a) = 0.8 - // q(b) = 0.8 - // q(c) = 0.4 - SampleSet ssq; - ssq.add(0.4); - ssq.add(0.6); - ssq.add(0); - double qz = 1; - - // p(a) = 0.2 - // p(b) = 0.8 - vector p(3); - p[0] = 0.2; - p[1] = 0.8; - p[2] = 0; - - vector counts(3); - int tot = 0; - - vector pps(particles); - SampleSet ppss; - int LEN = 12; - int PP = 1; - while (pps[0].states.size() < LEN) { - for (int pi = 0; pi < particles; ++pi) { - Particle& prt = pps[pi]; - - bool redo = true; - const Particle savedp = prt; - while (redo) { - redo = false; - for (int i = 0; i < PP; ++i) { - int s = rng.SelectSample(ssq); - double gamma_last = p[s]; - if (!gamma_last) { redo = true; break; } - double q = ssq[s] / qz; - prt.states.push_back(s); - prt.weight *= prob_t(gamma_last / q); - } - if (redo) { prt = savedp; continue; } - } - } - Rejuvenate(pps); - } - ppss.clear(); - for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); } - int sp = rng.SelectSample(ppss); - cerr << pps[sp] << endl; - - return 0; -} - diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc deleted file mode 100644 index b2996f65..00000000 --- a/gi/pf/transliterations.cc +++ /dev/null @@ -1,334 +0,0 @@ -#include "transliterations.h" - -#include -#include - -#include "boost/shared_ptr.hpp" - -#include "backward.h" -#include "filelib.h" -#include "tdict.h" -#include "trule.h" -#include 
"filelib.h" -#include "ccrp_nt.h" -#include "m.h" -#include "reachability.h" - -using namespace std; -using namespace std::tr1; - -struct TruncatedConditionalLengthModel { - TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : - plens(max_src_size+1, vector(max_trg_size+1, 0.0)) { - for (unsigned i = 1; i <= max_src_size; ++i) { - prob_t z = prob_t::Zero(); - for (unsigned j = 1; j <= max_trg_size; ++j) - z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); - for (unsigned j = 1; j <= max_trg_size; ++j) - plens[i][j] /= z; - //for (unsigned j = 1; j <= max_trg_size; ++j) - // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; - } - } - - // return p(tlen | slen) for *chunks* not full words - inline const prob_t& operator()(int slen, int tlen) const { - return plens[slen][tlen]; - } - - vector > plens; -}; - -struct CondBaseDist { - CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : - tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} - - prob_t operator()(const vector& src, unsigned sf, unsigned st, - const vector& trg, unsigned tf, unsigned tt) const { - prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) - assert(!"not impl"); - return p; - } - inline prob_t operator()(const vector& src, const vector& trg) const { - return (*this)(src, 0, src.size(), trg, 0, trg.size()); - } - TruncatedConditionalLengthModel tclm; -}; - -// represents transliteration phrase probabilities, e.g. -// p( a l - | A l ) , p( o | A w ) , ... -struct TransliterationChunkConditionalModel { - explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : - d(0.0), - strength(1.0), - rp0(pp0) { - } - - void Summary() const { - std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; - for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; - for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - std::cerr << " " << i2->second << '\t' << i2->first << std::endl; - } - } - - int DecrementRule(const TRule& rule) { - RuleModelHash::iterator it = r.find(rule.f_); - assert(it != r.end()); - int count = it->second.decrement(rule); - if (count) { - if (it->second.num_customers() == 0) r.erase(it); - } - return count; - } - - int IncrementRule(const TRule& rule) { - RuleModelHash::iterator it = r.find(rule.f_); - if (it == r.end()) { - it = r.insert(make_pair(rule.f_, CCRP_NoTable(strength))).first; - } - int count = it->second.increment(rule); - return count; - } - - void IncrementRules(const std::vector& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const std::vector& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - prob_t RuleProbability(const TRule& rule) const { - prob_t p; - RuleModelHash::const_iterator it = r.find(rule.f_); - if (it == r.end()) { - p = rp0(rule.f_, rule.e_); - } else { - p = it->second.prob(rule, rp0(rule.f_, rule.e_)); - } - return p; - } - - double LogLikelihood(const double& dd, const double& aa) const { - if (aa <= -dd) return -std::numeric_limits::infinity(); - //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); - double llh = 
//Md::log_beta_density(dd, 1, 1) + - Md::log_gamma_density(dd + aa, 1, 1); - std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::const_iterator it; - for (it = r.begin(); it != r.end(); ++it) - llh += it->second.log_crp_prob(aa); - return llh; - } - - struct AlphaResampler { - AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} - const TransliterationChunkConditionalModel& m_; - double operator()(const double& proposed_strength) const { - return m_.LogLikelihood(m_.d, proposed_strength); - } - }; - - void ResampleHyperparameters(MT19937* rng) { - std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::iterator it; - //const unsigned nloop = 5; - const unsigned niterations = 10; - //DiscountResampler dr(*this); - AlphaResampler ar(*this); -#if 0 - for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - double min_discount = std::numeric_limits::min(); - if (strength < 0.0) min_discount -= strength; - d = slice_sampler1d(dr, d, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } -#endif - strength = slice_sampler1d(ar, strength, *rng, -d, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; - for (it = r.begin(); it != r.end(); ++it) { -#if 0 - it->second.set_discount(d); -#endif - it->second.set_alpha(strength); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(LogLikelihood(d, strength)); - return p; - } - - const CondBaseDist& rp0; - typedef std::tr1::unordered_map, - CCRP_NoTable, - boost::hash > > RuleModelHash; - RuleModelHash r; - double d, strength; -}; - -struct GraphStructure { - GraphStructure() : r() {} - // leak memory - these are basically static - const Reachability* r; - bool IsReachable() const { return r->nodes > 0; } -}; - -struct ProbabilityEstimates { - ProbabilityEstimates() : gs(), backward() {} - explicit ProbabilityEstimates(const GraphStructure& g) : - gs(&g), backward() { - if (g.r->nodes > 0) - backward = new float[g.r->nodes]; - } - // leak memory, these are static - - // returns an estimate of the marginal probability - double MarginalEstimate() const { - if (!backward) return 0; - return backward[0]; - } - - // returns an backward estimate - double Backward(int src_covered, int trg_covered) const { - if (!backward) return 0; - int ind = gs->r->node_addresses[src_covered][trg_covered]; - if (ind < 0) return 0; - return backward[ind]; - } - - prob_t estp; - float* backward; - private: - const GraphStructure* gs; -}; - -struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : - cp0(max_src, max_trg, sr), - tccm(cp0), - be(b), - kMAX_SRC_CHUNK(max_src), - kMAX_TRG_CHUNK(max_trg), - kS2T_RATIO(sr), - tot_pairs(), tot_mem() { - } - const CondBaseDist cp0; - TransliterationChunkConditionalModel tccm; - const BackwardEstimator& be; - - void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - const size_t src_len = src_lets.size(); - const size_t trg_len = trg_lets.size(); - - // init graph structure - if (src_len >= graphs.size()) graphs.resize(src_len + 1); - if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - GraphStructure& gs = graphs[src_len][trg_len]; - if (!gs.r) { - double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); - if 
(rat > 1.5 || (rat > 2.4 && src_len < 6)) { - cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; - gs.r = new Reachability(src_len, trg_len, 0, 0); - } else { - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); - } - } - - const Reachability& r = *gs.r; - - // init backward estimates - if (src >= ests.size()) ests.resize(src + 1); - unordered_map::iterator it = ests[src].find(trg); - if (it != ests[src].end()) return; // already initialized - - it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; - ProbabilityEstimates& est = it->second; - if (!gs.r->nodes) return; // not derivable subject to length constraints - - be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); - cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; - tot_pairs++; - tot_mem += sizeof(float) * gs.r->nodes; - } - - void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - const size_t src_len = src_lets.size(); - const size_t trg_len = trg_lets.size(); - // TODO - } - - prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { - assert(src.size() < graphs.size()); - const vector& tv = graphs[src.size()]; - assert(trg.size() < tv.size()); - const GraphStructure& gs = tv[trg.size()]; - if (gs.r->nodes == 0) - return prob_t::Zero(); - const unordered_map::const_iterator it = ests[s].find(t); - assert(it != ests[s].end()); - return it->second.estp; - } - - void GraphSummary() const { - double to = 0; - double tn = 0; - double tt = 0; - for (int i = 0; i < graphs.size(); ++i) { - const vector& vt = graphs[i]; - for (int j = 0; j < vt.size(); ++j) { - const GraphStructure& gs = vt[j]; - if (!gs.r) continue; - tt++; - for (int k = 0; k < i; ++k) { - for (int l = 0; l < j; ++l) { - size_t c = gs.r->valid_deltas[k][l].size(); - if (c) { - tn += 1; - to += c; - } - } - } - } - } - cerr << " Average nodes = " << (tn / tt) << endl; - cerr << "Average out-degree = " << (to / tn) << endl; - cerr << " Unique structures = " << tt << endl; - cerr << " Unique pairs = " << tot_pairs << endl; - cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; - } - - const int kMAX_SRC_CHUNK; - const int kMAX_TRG_CHUNK; - const double kS2T_RATIO; - unsigned tot_pairs; - size_t tot_mem; - vector > graphs; // graphs[src_len][trg_len] - vector > ests; // ests[src][trg] -}; - -Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : - pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} -Transliterations::~Transliterations() { delete pimpl_; } - -void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - pimpl_->Initialize(src, src_lets, trg, trg_lets); -} - -prob_t Transliterations::EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { - return pimpl_->EstimateProbability(s, src,t, trg); -} - -void Transliterations::Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - pimpl_->Forbid(src, src_lets, trg, trg_lets); -} - -void Transliterations::GraphSummary() const { - pimpl_->GraphSummary(); -} - diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h deleted file mode 100644 index 49d14684..00000000 --- a/gi/pf/transliterations.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _TRANSLITERATIONS_H_ -#define 
_TRANSLITERATIONS_H_ - -#include -#include "wordid.h" -#include "prob.h" - -struct BackwardEstimator; -struct TransliterationsImpl; -struct Transliterations { - // max_src and max_trg indicate how big the transliteration phrases can be - // see reachability.h for information about filter_ratio - explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); - ~Transliterations(); - void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); - void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); - void GraphSummary() const; - prob_t EstimateProbability(WordID s, const std::vector& src, WordID t, const std::vector& trg) const; - private: - TransliterationsImpl* pimpl_; -}; - -#endif - diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc deleted file mode 100644 index 40829775..00000000 --- a/gi/pf/unigrams.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "unigrams.h" - -#include -#include - -#include "stringlib.h" -#include "filelib.h" - -using namespace std; - -void UnigramModel::LoadUnigrams(const string& fname) { - cerr << "Loading unigram probabilities from " << fname << " ..." << endl; - ReadFile rf(fname); - string line; - istream& in = *rf.stream(); - assert(in); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\data\\"); - getline(in, line); - size_t pos = line.find("ngram 1="); - assert(pos == 0); - assert(line.size() > 8); - const size_t num_unigrams = atoi(&line[8]); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\1-grams:"); - for (size_t i = 0; i < num_unigrams; ++i) { - getline(in, line); - assert(line.size() > 0); - pos = line.find('\t'); - assert(pos > 0); - assert(pos + 1 < line.size()); - const WordID w = TD::Convert(line.substr(pos + 1)); - line[pos] = 0; - float p = atof(&line[0]); - if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; - } -} - -void UnigramWordModel::LoadUnigrams(const string& fname) { - cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; - ReadFile rf(fname); - string line; - istream& in = *rf.stream(); - assert(in); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\data\\"); - getline(in, line); - size_t pos = line.find("ngram 1="); - assert(pos == 0); - assert(line.size() > 8); - const size_t num_unigrams = atoi(&line[8]); - getline(in, line); - assert(line.empty()); - getline(in, line); - assert(line == "\\1-grams:"); - for (size_t i = 0; i < num_unigrams; ++i) { - getline(in, line); - assert(line.size() > 0); - pos = line.find('\t'); - assert(pos > 0); - assert(pos + 1 < line.size()); - size_t cur = pos + 1; - vector w; - while (cur < line.size()) { - const size_t len = UTF8Len(line[cur]); - w.push_back(TD::Convert(line.substr(cur, len))); - cur += len; - } - line[pos] = 0; - float p = atof(&line[0]); - probs_[w].logeq(p * log(10.0)); - } -} - diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h deleted file mode 100644 index 1660d1ed..00000000 --- a/gi/pf/unigrams.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef _UNIGRAMS_H_ -#define _UNIGRAMS_H_ - -#include -#include -#include -#include - -#include "wordid.h" -#include "prob.h" -#include "tdict.h" - -struct UnigramModel { - explicit UnigramModel(const std::string& fname, unsigned vocab_size) : - use_uniform_(fname.size() == 0), - uniform_(1.0 / vocab_size), - probs_() { - if (fname.size() > 0) { - probs_.resize(TD::NumWords() + 1); - LoadUnigrams(fname); - } - } - - const prob_t& operator()(const WordID& w) const { - assert(w); - if (use_uniform_) return uniform_; - return probs_[w]; - } - - private: - void LoadUnigrams(const std::string& fname); - - const bool use_uniform_; - const prob_t uniform_; - std::vector probs_; -}; - - -// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' -struct UnigramWordModel { - explicit UnigramWordModel(const std::string& fname) : - use_uniform_(false), - uniform_(1.0), - probs_() { - LoadUnigrams(fname); - } - - explicit UnigramWordModel(const unsigned vocab_size) : - use_uniform_(true), - uniform_(1.0 / vocab_size), - probs_() {} - - const prob_t& operator()(const std::vector& s) const { - if (use_uniform_) return uniform_; - const VectorProbHash::const_iterator it = probs_.find(s); - assert(it != probs_.end()); - return it->second; - } - - private: - void LoadUnigrams(const std::string& fname); - - const bool use_uniform_; - const prob_t uniform_; - typedef std::tr1::unordered_map, prob_t, boost::hash > > VectorProbHash; - VectorProbHash probs_; -}; - -#endif diff --git a/gi/pipeline/OLD.clsp.config b/gi/pipeline/OLD.clsp.config deleted file mode 100644 index cd0f9d65..00000000 --- a/gi/pipeline/OLD.clsp.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... 
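The unigram readers above (unigrams.cc) parse the \1-grams: section of an ARPA-format language model, where each entry carries a log10 probability; the probs_[w].logeq(p * log(10)) calls rescale those values into the natural-log space that prob_t works in. A minimal sketch of just that conversion, in plain doubles:

#include <cmath>
#include <cstdio>

int main() {
  // An ARPA \1-grams: line carries log10 p(w), e.g. "-2.5\tcat".
  float log10_p = -2.5f;
  // ln p = log10(p) * ln(10); this is the value handed to logeq() above.
  double ln_p = log10_p * std::log(10.0);
  std::printf("p(cat) = %g\n", std::exp(ln_p));  // ~0.00316
  return 0;
}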
-btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al -aren /export/ws10smt/data/arabic-english corpus.ar-en.al -uren /export/ws10smt/data/urdu-english corpus.ur-en.al -nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/OLD.evaluation-pipeline.pl b/gi/pipeline/OLD.evaluation-pipeline.pl deleted file mode 100755 index 49c303eb..00000000 --- a/gi/pipeline/OLD.evaluation-pipeline.pl +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Getopt::Long; -use Cwd; -my $CWD = getcwd; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -my @DEFAULT_FEATS = qw( - LogRuleCount SingletonRule LexE2F LexF2E WordPenalty - LogFCount LanguageModel Glue GlueTop PassThrough SingletonF -); - -my %init_weights = qw( - LogRuleCount 0.2 - LexE2F -0.3 - LexF2E -0.3 - LogFCount 0.1 - WordPenalty -1.5 - LanguageModel 1.2 - Glue -1.0 - GlueTop 0.00001 - PassThrough -10.0 - SingletonRule -0.1 - X_EGivenF -0.3 - X_FGivenE -0.3 - X_LogECount -1 - X_LogFCount -0.1 - X_LogRuleCount 0.3 - X_SingletonE -0.1 - X_SingletonF -0.1 - X_SingletonRule -0.5 -); - -my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; -my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; -my $EXTOOLS = "$SCRIPT_DIR/../../extools"; -die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; -my $VEST = "$SCRIPT_DIR/../../vest"; -die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; -my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; -my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; -assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); - -my $config = "$SCRIPT_DIR/OLD.clsp.config"; -print STDERR "CORPORA CONFIGURATION: $config\n"; -open CONF, "<$config" or die "Can't read $config: $!"; -my %paths; -my %corpora; -my %lms; -my %devs; -my %devrefs; -my %tests; -my %testevals; -my %xgrammars; -print STDERR " LANGUAGE PAIRS:"; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/; - $paths{$name} = $path; - $corpora{$name} = $corpus; - $lms{$name} = $lm; - $xgrammars{$name} = $xgrammar; - $devs{$name} = $dev; - $devrefs{$name} = $devref; - $tests{$name} = $xtests[0]; - $testevals{$name} = $xtests[1]; - print STDERR " $name"; -} -print STDERR "\n"; - -my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); - -my $outdir = "$CWD/exp"; -my $help; -my $XFEATS; -my $EXTRA_FILTER = ''; -my $dataDir = '/export/ws10smt/data'; -if (GetOptions( - "data=s" => \$dataDir, - "xfeats" => \$XFEATS, -) == 0 || @ARGV!=2 || $help) { - print_help(); - exit; -} -my $lp = $ARGV[0]; -my $grammar = $ARGV[1]; -print STDERR " CORPUS REPO: $dataDir\n"; -print STDERR " LANGUAGE PAIR: $lp\n"; -die "I don't know about that language pair\n" unless $paths{$lp}; -my $corpdir = "$dataDir"; -if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . 
$paths{$lp}; } -die "I can't find the corpora directory: $corpdir" unless -d $corpdir; -print STDERR " GRAMMAR: $grammar\n"; -my $LANG_MODEL = mydircat($corpdir, $lms{$lp}); -print STDERR " LM: $LANG_MODEL\n"; -my $CORPUS = mydircat($corpdir, $corpora{$lp}); -die "Can't find corpus: $CORPUS" unless -f $CORPUS; - -my $dev = mydircat($corpdir, $devs{$lp}); -my $drefs = $devrefs{$lp}; -die "Can't find dev: $dev\n" unless -f $dev; -die "Dev refs not set" unless $drefs; -$drefs = mydircat($corpdir, $drefs); - -my $test = mydircat($corpdir, $tests{$lp}); -my $teval = mydircat($corpdir, $testevals{$lp}); -die "Can't find test: $test\n" unless -f $test; -assert_exec($teval); - -if ($XFEATS) { - my $xgram = mydircat($corpdir, $xgrammars{$lp}); - die "Can't find x-grammar: $xgram" unless -f $xgram; - $EXTRA_FILTER = "$ADDXFEATS $xgram |"; - print STDERR "ADDING X-FEATS FROM $xgram\n"; -} - -# MAKE DEV -print STDERR "\nFILTERING FOR dev...\n"; -print STDERR "DEV: $dev (REFS=$drefs)\n"; -`mkdir -p $outdir`; -my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -my $devini = mydircat($outdir, "cdec-dev.ini"); -write_cdec_ini($devini, $devgrammar); - - -# MAKE TEST -print STDERR "\nFILTERING FOR test...\n"; -print STDERR "TEST: $test (EVAL=$teval)\n"; -`mkdir -p $outdir`; -my $testgrammar = filter($grammar, $test, 'test', $outdir); -my $testini = mydircat($outdir, "cdec-test.ini"); -write_cdec_ini($testini, $testgrammar); - - -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights); - - -# VEST -print STDERR "\nMINIMUM ERROR TRAINING\n"; -my $tuned_weights = mydircat($outdir, 'weights.tuned'); -if (-f $tuned_weights) { - print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; -} else { - my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; - print STDERR "MERT COMMAND: $cmd\n"; - `rm -rf $outdir/vest 2> /dev/null`; - chdir $outdir or die "Can't chdir to $outdir: $!"; - $weights = `$cmd`; - die "MERT reported non-zero exit code" unless $? == 0; - chomp $weights; - safesystem($tuned_weights, "cp $weights $tuned_weights"); - print STDERR "TUNED WEIGHTS: $tuned_weights\n"; - die "$tuned_weights is missing!" 
unless -f $tuned_weights; -} - -# DECODE -print STDERR "\nDECODE TEST SET\n"; -my $decolog = mydircat($outdir, "test-decode.log"); -my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; -safesystem($testtrans, $cmd) or die "Failed to decode test set!"; - - -# EVALUATE -print STDERR "\nEVALUATE TEST SET\n"; -print STDERR "TEST: $testtrans\n"; -$cmd = "$teval $testtrans"; -safesystem(undef, $cmd) or die "Failed to evaluate!"; -exit 0; - - -sub write_random_weights_file { - my ($file, @extras) = @_; - open F, ">$file" or die "Can't write $file: $!"; - my @feats = (@DEFAULT_FEATS, @extras); - if ($XFEATS) { - my @xfeats = qw( - X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF - ); - @feats = (@feats, @xfeats); - } - for my $feat (@feats) { - my $r = rand(1.6); - my $w = $init_weights{$feat} * $r; - if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } - print F "$feat $w\n"; - } - close F; -} - -sub filter { - my ($grammar, $set, $name, $outdir) = @_; - my $outgrammar = mydircat($outdir, "$name.scfg.gz"); - if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { - my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar"; - safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!"; - } - return $outgrammar; -} - -sub mydircat { - my ($base, $suffix) = @_; - if ($suffix =~ /^\//) { return $suffix; } - my $res = $base . '/' . $suffix; - $res =~ s/\/\//\//g; - return $res; -} - -sub write_cdec_ini { - my ($filename, $grammar_path) = (@_); - open CDECINI, ">$filename" or die "Can't write $filename: $!"; - print CDECINI <> 8; - if ($exitcode) { - print STDERR "Exit code: $exitcode\n"; - if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } - } - return ! $exitcode; - } -} - -sub assert_exec { - my @files = @_; - for my $file (@files) { - die "Can't find $file - did you run make?\n" unless -e $file; - die "Can't execute $file" unless -e $file; - } -}; - diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl deleted file mode 100644 index ac103c8b..00000000 --- a/gi/pipeline/backoff-pipe.pl +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my @grammars; -my $OUTPUTPREFIX = './giwork/bo.hier.grammar'; -safemkdir($OUTPUTPREFIX); -my $backoff_levels = 1; -my $glue_levels = 1; - -usage() unless &GetOptions('grmr=s@' => \ @grammars, - 'outprefix=s' => \ $OUTPUTPREFIX, - 'bo-lvls=i' => \ $backoff_levels, - 'glue-lvls=i' => \ $glue_levels, -); - -my $OUTDIR = $OUTPUTPREFIX . '/hier'; -print STDERR "@grammars\n"; - - -my %grmr = (); -foreach my $grammar (@grammars) { - $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/; - $grmr{$1} = $grammar; -} - -my @index = sort keys %grmr; -$OUTDIR = $OUTDIR . join('-',@index); -safemkdir($OUTDIR); -my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz'; -safesystem("echo \"\" | gzip > $BACKOFF_GRMR"); -my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz'; -safesystem("echo \"\" | gzip > $GLUE_GRMR"); -my $joinedgrammars = $OUTDIR . 
'/grammar.hier.gz'; - -join_grammars(); - -for my $i (0..(scalar @index)-2) { - my $freqs = extract_freqs($index[$i], $index[$i+1]); - if ($i < $backoff_levels) { - create_backoff_rules($index[$i],$index[$i+1],$freqs); - } - if ($i < $glue_levels) { - add_glue_rules($index[$i]); - } -} - -output_grammar_info(); - - -sub usage { - print <> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } -} - - -sub join_grammars { - print STDERR "\n!!! JOINING GRAMMARS\n"; - if(-e $joinedgrammars) { - print STDERR "$joinedgrammars exists, reusing...\n"; - return; - } - safesystem("echo \"\" | gzip > $joinedgrammars"); - foreach my $i (@index) { - my $g = $grmr{$i}; - safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz"); - safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz"); - safesystem("mv $joinedgrammars.2.gz $joinedgrammars"); - } -} - - -sub extract_freqs { - my($grmr1,$grmr2) = @_; - print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n"; - my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt"; - my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt"; - my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt"; - my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt"; - if(-e $OUT_SPANS && -e $FREQS) { - print STDERR "$OUT_SPANS exists, reusing...\n"; - print STDERR "$FREQS exists, reusing...\n"; - return $FREQS; - } - - safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS"); - - my %FREQ_HIER = (); - my %finehier = (); - - open SPANS, $OUT_SPANS or die $!; - while () { - my ($tmp, $coarse, $fine) = split /\|\|\|/; - my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g; - my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g; - - foreach my $i (0..(scalar @coarse_spans)-1) { - my $coarse_cat = $coarse_spans[$i]; - my $fine_cat = $fine_spans[$i]; - - $FREQ_HIER{$coarse_cat}{$fine_cat}++; - } - } - close SPANS; - foreach (values %FREQ_HIER) { - my $coarse_freq = $_; - my $total = 0; - $total+=$_ for (values %{ $coarse_freq }); - $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq }); - } - open FREQS, ">", $FREQS or die $!; - foreach my $coarse_cat (keys %FREQ_HIER) { - print FREQS "$coarse_cat |||"; - foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) { - my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat}; - print FREQS " $fine_cat:$freq"; - if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $freq) { - $finehier{$fine_cat} = $coarse_cat; - } - } - print FREQS "\n"; - } -# foreach my $fine_cat (keys %finehier) { -# print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; -# } - close FREQS; - return $FREQS; -} - - -sub create_backoff_rules { - print STDERR "\n!!! 
CREATING BACKOFF RULES\n"; - my ($grmr1, $grmr2, $freq) = @_; - my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt"; - if(-e $OUTFILE) { - print STDERR "$OUTFILE exists, reusing...\n"; - return; - } - open FREQS, $freq or die $!; - open TMP, ">", $OUTFILE or die $!; - while () { - my $line = $_; - $line = m/^(\d+) \|\|\| (.+)$/; - my $coarse = $1; - $line = $2; - my @finefreq = $line =~ m/(\d+):(\S+)/g; - for(my $i = 0; $i < scalar @finefreq; $i+=2) { - my $finecat = $finefreq[$i]; - my $finefreq = $finefreq[$i+1]; - print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq A=0-0\n"; - } - } - close TMP; - close FREQS; - safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz"); - safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR"); -} - -sub add_glue_rules { - print STDERR "\n!!! CREATING GLUE RULES\n"; - my ($grmr) = @_; - my $OUTFILE = "$OUTDIR/glue.$grmr.gz"; - if (-e $OUTFILE) { - print STDERR "$OUTFILE exists, reusing...\n"; - return; - } - open TMP, ">", $OUTFILE or die $!; - for my $i (0..($grmr-1)) { - print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n"; - print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n"; - } - close TMP; - safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz"); - safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR"); -} - -sub output_grammar_info { - print STDERR "\n!!! GRAMMAR INFORMATION\n"; - print STDOUT "GRAMMAR: \t$joinedgrammars\n"; - print STDOUT "GLUE: \t$GLUE_GRMR\n"; - print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n"; -} diff --git a/gi/pipeline/blacklight.config b/gi/pipeline/blacklight.config deleted file mode 100644 index fc59a604..00000000 --- a/gi/pipeline/blacklight.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -/usr/users/0/cdyer/ws10smt/data -btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh -aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh -uren /usr/users/0/cdyer/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config deleted file mode 100644 index c23d409f..00000000 --- a/gi/pipeline/clsp.config +++ /dev/null @@ -1,10 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... 
-/export/ws10smt/data -btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh -aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh -uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl deleted file mode 100755 index 4b4529d9..00000000 --- a/gi/pipeline/evaluation-pipeline.pl +++ /dev/null @@ -1,364 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Getopt::Long; -use Cwd; -my $CWD = getcwd; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } -use LocalConfig; - -my $JOBS = 15; -my $PMEM = "9G"; -my $NUM_TRANSLATIONS = 50; -my $GOAL = "S"; - -# featurize_grammar may add multiple features from a single feature extractor -# the key in this map is the extractor name, the value is a list of the extracted features -my $feat_map = { - "LogRuleCount" => [ "LogRuleCount", "SingletonRule" ] , -# "XFeatures" => [ "XFE","XEF" ] , - "XFeatures" => [ "XFE","XEF","LabelledEF","LabelledFE"], # ,"XE_Singleton","XF_Singleton"] , - "LabelledRuleConditionals" => [ "LabelledFE","LabelledEF" ] , - "LexProb" => [ "LexE2F", "LexF2E" ] , - "BackoffRule" => [ "BackoffRule" ] , - "RulePenalty" => [ "RulePenalty" ] , - "LHSProb" => [ "LHSProb" ] , - "LabellingShape" => [ "LabellingShape" ] , - "GenerativeProb" => [ "GenerativeProb" ] , -}; - -my %init_weights = qw( - EGivenF -0.735245 - FGivenE -0.219391 - Glue -0.306709 - GlueTop 0.0473331 - LanguageModel 2.40403 - LexE2F -0.266989 - LexF2E -0.550373 - LogECount -0.129853 - LogFCount -0.194037 - LogRuleCount 0.256706 - BackoffRule 0.5 - XFE -0.256706 - XEF -0.256706 - XF_Singleton -0.05 - XE_Singleton -0.8 - LabelledFE -0.256706 - LabelledEF -0.256706 - PassThrough -0.9304905 - SingletonE -3.04161 - SingletonF 0.0714027 - SingletonRule -0.889377 - WordPenalty -1.99495 - RulePenalty -0.1 - LabellingShape -0.1 - LHSProb -0.1 - GenerativeProb -0.1 -); - - -# these features are included by default -my @DEFAULT_FEATS = qw( PassThrough Glue GlueTop LanguageModel WordPenalty ); - - - -my $FILTERBYF = "$SCRIPT_DIR/scripts/filter-by-f.pl"; -my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; -my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl"; -my $EXTOOLS = "$SCRIPT_DIR/../../extools"; -die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; -my $VEST = "$SCRIPT_DIR/../../vest"; -die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; -my $DISTVEST = "$VEST/dist-vest.pl"; -my $FILTER = "$EXTOOLS/filter_grammar"; -my $FEATURIZE = "$EXTOOLS/featurize_grammar"; -assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF); - -my $numtopics = 25; - -my $config = "$SCRIPT_DIR/" . (lc environment_name()) . 
'.config'; -print STDERR "CORPORA CONFIGURATION: $config\n"; -open CONF, "<$config" or die "Can't read $config: $!"; -my %paths; -my %corpora; -my %lms; -my %devs; -my %devrefs; -my %tests; -my %testevals; -my $datadir; -print STDERR " LANGUAGE PAIRS:"; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - if (! defined $datadir) { $datadir = $_; next; } - my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; - $paths{$name} = $path; - $corpora{$name} = $corpus; - $lms{$name} = $lm; - $devs{$name} = $dev; - $devrefs{$name} = $devref; - $tests{$name} = $xtests[0]; - $testevals{$name} = $xtests[1]; - print STDERR " $name"; -} -print STDERR "\n"; - -my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); - -my $outdir = "$CWD/exp"; -my $help; -my $FEATURIZER_OPTS = ''; -my $dataDir = '/export/ws10smt/data'; -my @features; -my $bkoffgram; -my $gluegram; -my $oovgram; -my $usefork; -my $lmorder = 3; -my $density; -if (GetOptions( - "backoff-grammar=s" => \$bkoffgram, - "density-prune=f" => \$density, - "glue-grammar=s" => \$gluegram, - "oov-grammar=s" => \$oovgram, - "data=s" => \$dataDir, - "pmem=s" => \$PMEM, - "n=i" => \$NUM_TRANSLATIONS, - "features=s@" => \@features, - "use-fork" => \$usefork, - "jobs=i" => \$JOBS, - "out-dir=s" => \$outdir, - "lmorder=i" => \$lmorder, - "goal=s" => \$GOAL, -) == 0 || @ARGV!=2 || $help) { - print_help(); - exit; -} -my $DENSITY_PRUNE = ''; -if ($density) { - $DENSITY_PRUNE = "--density-prune $density"; -} -if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } -my @fkeys = keys %$feat_map; -die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; -my @xfeats; -for my $feat (@features) { - my $rs = $feat_map->{$feat}; - if (!defined $rs) { die "DON'T KNOW ABOUT FEATURE $feat\n"; } - my @xfs = @$rs; - @xfeats = (@xfeats, @xfs); - $FEATURIZER_OPTS .= " -f $feat" unless $feat eq "BackoffRule"; -} -print STDERR "X-FEATS: @xfeats\n"; - -my $lp = $ARGV[0]; -my $grammar = $ARGV[1]; -print STDERR " CORPUS REPO: $dataDir\n"; -print STDERR " LANGUAGE PAIR: $lp\n"; -die "I don't know about that language pair\n" unless $paths{$lp}; -my $corpdir = "$dataDir"; -if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . 
$paths{$lp}; } -die "I can't find the corpora directory: $corpdir" unless -d $corpdir; -print STDERR " GRAMMAR: $grammar\n"; -my $LANG_MODEL = mydircat($corpdir, $lms{$lp}); -print STDERR " LM: $LANG_MODEL\n"; -my $CORPUS = mydircat($corpdir, $corpora{$lp}); -die "Can't find corpus: $CORPUS" unless -f $CORPUS; - -my $dev = mydircat($corpdir, $devs{$lp}); -my $drefs = $devrefs{$lp}; -die "Can't find dev: $dev\n" unless -f $dev; -die "Dev refs not set" unless $drefs; -$drefs = mydircat($corpdir, $drefs); - -my $test = mydircat($corpdir, $tests{$lp}); -my $teval = mydircat($corpdir, $testevals{$lp}); -#die "Can't find test: $test\n" unless -f $test; -#assert_exec($teval); - -`mkdir -p $outdir`; - -# CREATE INIT WEIGHTS -print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; -my $weights = mydircat($outdir, "weights.init"); -write_random_weights_file($weights, @xfeats); - -my $bkoff_grmr; -my $glue_grmr; -if($bkoffgram) { - print STDERR "Placing backoff grammar…\n"; - $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz"); - print STDERR "cp $bkoffgram $bkoff_grmr\n"; - safesystem(undef,"cp $bkoffgram $bkoff_grmr"); -} -if($gluegram) { - print STDERR "Placing glue grammar…\n"; - $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz"); - print STDERR "cp $gluegram $glue_grmr\n"; - safesystem(undef,"cp $gluegram $glue_grmr"); -} - -# MAKE DEV -print STDERR "\nFILTERING FOR dev...\n"; -print STDERR "DEV: $dev (REFS=$drefs)\n"; -my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -my $devini = mydircat($outdir, "cdec-dev.ini"); -write_cdec_ini($devini, $devgrammar); - - -# MAKE TEST -print STDERR "\nFILTERING FOR test...\n"; -print STDERR "TEST: $test (EVAL=$teval)\n"; -`mkdir -p $outdir`; -my $testgrammar = filter($grammar, $test, 'test', $outdir); -my $testini = mydircat($outdir, "cdec-test.ini"); -write_cdec_ini($testini, $testgrammar); - - -# VEST -print STDERR "\nMINIMUM ERROR TRAINING\n"; -my $tuned_weights = mydircat($outdir, 'weights.tuned'); -if (-f $tuned_weights) { - print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; -} else { - my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; - print STDERR "MERT COMMAND: $cmd\n"; - `rm -rf $outdir/vest 2> /dev/null`; - chdir $outdir or die "Can't chdir to $outdir: $!"; - $weights = `$cmd`; - die "MERT reported non-zero exit code" unless $? == 0; - chomp $weights; - safesystem($tuned_weights, "cp $weights $tuned_weights"); - print STDERR "TUNED WEIGHTS: $tuned_weights\n"; - die "$tuned_weights is missing!" 
unless -f $tuned_weights; -} - -# DECODE -print STDERR "\nDECODE TEST SET\n"; -my $decolog = mydircat($outdir, "test-decode.log"); -my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE $usefork -j $JOBS -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; -safesystem($testtrans, $cmd) or die "Failed to decode test set!"; - - -# EVALUATE -print STDERR "\nEVALUATE TEST SET\n"; -print STDERR "TEST: $testtrans\n"; -$cmd = "$teval $testtrans"; -safesystem(undef, $cmd) or die "Failed to evaluate!"; -exit 0; - - -sub write_random_weights_file { - my ($file, @extras) = @_; - if (-f $file) { - print STDERR "$file exists - REUSING!\n"; - return; - } - open F, ">$file" or die "Can't write $file: $!"; - my @feats = (@DEFAULT_FEATS, @extras); - for my $feat (@feats) { - my $r = rand(0.4) + 0.8; - my $w = $init_weights{$feat} * $r; - if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } - print F "$feat $w\n"; - } - close F; -} - -sub filter { - my ($grammar, $set, $name, $outdir) = @_; - my $out1 = mydircat($outdir, "$name.filt.gz"); - my $out2 = mydircat($outdir, "$name.f_feat.gz"); - my $outgrammar = mydircat($outdir, "$name.scfg.gz"); - if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { - my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1"; - safesystem($out1, $cmd) or die "Filtering failed."; - $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2"; - safesystem($out2, $cmd) or die "Featurizing failed"; - $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar"; - safesystem($outgrammar, $cmd) or die "Secondary filtering failed"; - } - return $outgrammar; -} - -sub mydircat { - my ($base, $suffix) = @_; - if ($suffix =~ /^\//) { return $suffix; } - my $res = $base . '/' . $suffix; - $res =~ s/\/\//\//g; - return $res; -} - -sub write_cdec_ini { - my ($filename, $grammar_path) = (@_); - open CDECINI, ">$filename" or die "Can't write $filename: $!"; - my $glue = ($gluegram ? "$glue_grmr" : "$datadir/glue/glue.scfg.gz"); - my $oov = ($oovgram ? "$oovgram" : "$datadir/oov.scfg.gz"); - print CDECINI <> 8; - if ($exitcode) { - print STDERR "Exit code: $exitcode\n"; - if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } - } - return ! 
$exitcode;
- }
-}
-
-sub assert_exec {
- my @files = @_;
- for my $file (@files) {
- die "Can't find $file - did you run make?\n" unless -e $file;
- die "Can't execute $file" unless -x $file;
- }
-};
-
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
deleted file mode 100755
index e31167a2..00000000
--- a/gi/pipeline/local-gi-pipeline.pl
+++ /dev/null
@@ -1,465 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-use File::Copy;
-
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-
-use Getopt::Long "GetOptions";
-
-my $GZIP = 'gzip';
-my $ZCAT = 'gunzip -c';
-my $SED = 'sed -e';
-my $BASE_PHRASE_MAX_SIZE = 10;
-my $COMPLETE_CACHE = 1;
-my $ITEMS_IN_MEMORY = 10000000; # cache size in extractors
-my $NUM_TOPICS = 50;
-my $NUM_TOPICS_COARSE;
-my $NUM_TOPICS_FINE = $NUM_TOPICS;
-my $NUM_SAMPLES = 1000;
-my $CONTEXT_SIZE = 1;
-my $BIDIR = 0;
-my $TOPICS_CONFIG = "pyp-topics.conf";
-my $LANGUAGE = "target";
-my $LABEL_THRESHOLD = "0";
-my $PRESERVE_PHRASES;
-
-my $MODEL = "pyp";
-my $NUM_ITERS = 100;
-my $PR_SCALE_P = 0;
-my $PR_SCALE_C = 0;
-my $PR_FLAGS = "";
-my $MORFMARK = "";
-
-my $EXTOOLS = "$SCRIPT_DIR/../../extools";
-die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
-my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
-die "Can't find pyp-topics: $PYPTOOLS" unless -e $PYPTOOLS && -d $PYPTOOLS;
-my $PYPSCRIPTS = "$SCRIPT_DIR/../pyp-topics/scripts";
-die "Can't find pyp-topics: $PYPSCRIPTS" unless -e $PYPSCRIPTS && -d $PYPSCRIPTS;
-my $PRTOOLS = "$SCRIPT_DIR/../posterior-regularisation";
-die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PRTOOLS;
-my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";
-my $C2D = "$PYPSCRIPTS/contexts2documents.py";
-my $S2L = "$PYPSCRIPTS/spans2labels.py";
-my $SPLIT = "$SCRIPT_DIR/../posterior-regularisation/split-languages.py";
-
-my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";
-
-my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
-my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";
-my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl";
-my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl";
-my $EXTRACTOR = "$EXTOOLS/extractor";
-my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
-my $MORF_DOC_FILTER = "$SCRIPT_DIR/../morf-segmentation/filter_docs.pl";
-
-assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR,
- $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS, $MORF_DOC_FILTER);
-
-my $BACKOFF_GRAMMAR;
-my $DEFAULT_CAT;
-my $HIER_CAT;
-my %FREQ_HIER = ();
-my $TAGGED_CORPUS;
-
-my $NAME_SHORTCUT;
-
-my $OUTPUT = './giwork';
-usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
- 'backoff_grammar' => \$BACKOFF_GRAMMAR,
- 'output=s' => \$OUTPUT,
- 'model=s' => \$MODEL,
- 'topics=i' => \$NUM_TOPICS_FINE,
- 'coarse_topics=i' => \$NUM_TOPICS_COARSE,
- 'trg_context=i' => \$CONTEXT_SIZE,
- 'samples=i' => \$NUM_SAMPLES,
- 'label_threshold=f' => \$LABEL_THRESHOLD,
- 'use_default_cat' => \$DEFAULT_CAT,
- 'topics-config=s' => \$TOPICS_CONFIG,
- 'iterations=i' => \$NUM_ITERS,
- 'pr-scale-phrase=f' => \$PR_SCALE_P,
- 'pr-scale-context=f' => \$PR_SCALE_C,
- 'pr-flags=s' => \$PR_FLAGS,
- 'tagged_corpus=s' => \$TAGGED_CORPUS,
- 'language=s' => \$LANGUAGE,
- 'get_name_only' => \$NAME_SHORTCUT,
- 'preserve_phrases' => \$PRESERVE_PHRASES,
- 'morf=s' => \$MORFMARK,
- );
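For orientation, a minimal illustrative invocation (the corpus file name corpus.zh-en.al and the option values here are hypothetical, not taken from the original script):

  ./local-gi-pipeline.pl --model pyp --topics 25 corpus.zh-en.al

With the remaining defaults above (context size 1, base phrase size 10, target-side labels, 1000 samples), the naming subs further below place the clustering output under ./giwork/ct1s0.L10.ltarget.PYP.t25.s1000 and the extracted grammar at ./giwork/ct1s0.L10.ltarget.PYP.t25.s1000.grammar/grammar.gz.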
-if ($NAME_SHORTCUT) {
- $NUM_TOPICS = $NUM_TOPICS_FINE;
- print STDERR labeled_dir();
- exit 0;
-}
-usage() unless scalar @ARGV == 1;
-my $CORPUS = $ARGV[0];
-open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F;
-
-$NUM_TOPICS = $NUM_TOPICS_FINE;
-
-$HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 );
-
-print STDERR " Output: $OUTPUT\n";
-my $DATA_DIR = $OUTPUT . '/corpora';
-my $LEX_NAME = "corpus.f_e_a.$LANGUAGE.lex";
-my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME;  # corpus used to extract rules
-my $CORPUS_CLUSTER = $DATA_DIR . "/corpus.f_e_a.$LANGUAGE.cluster"; # corpus used for clustering (often identical)
-
-my $CONTEXT_DIR = $OUTPUT . '/' . context_dir();
-my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir();
-my $LABELED_DIR = $OUTPUT . '/' . labeled_dir();
-my $CLUSTER_DIR_C;
-my $CLUSTER_DIR_F;
-my $LABELED_DIR_C;
-my $LABELED_DIR_F;
-if($HIER_CAT) {
- $CLUSTER_DIR_F = $CLUSTER_DIR;
- $LABELED_DIR_F = $LABELED_DIR;
- $NUM_TOPICS = $NUM_TOPICS_COARSE;
- $CLUSTER_DIR_C = $OUTPUT . '/' . cluster_dir();
- $LABELED_DIR_C = $OUTPUT . '/' . labeled_dir();
- $NUM_TOPICS = $NUM_TOPICS_FINE;
-}
-my $GRAMMAR_DIR = $OUTPUT . '/' . grammar_dir();
-print STDERR " Context: $CONTEXT_DIR\n Cluster: $CLUSTER_DIR\n Labeled: $LABELED_DIR\n Grammar: $GRAMMAR_DIR\n";
-safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!";
-safemkdir($DATA_DIR) or die "Couldn't create output directory $DATA_DIR: $!";
-safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!";
-safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!";
-if($HIER_CAT) {
- safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR_C: $!";
- safemkdir($LABELED_DIR_C) or die "Couldn't create output directory $LABELED_DIR_C: $!";
-}
-safemkdir($LABELED_DIR) or die "Couldn't create output directory $LABELED_DIR: $!";
-safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!";
-if(-e $TOPICS_CONFIG) {
- copy($TOPICS_CONFIG, $CLUSTER_DIR) or die "Copy failed: $!";
-}
-
-setup_data();
-
-if (lc($MODEL) eq "blagree") {
- extract_bilingual_context();
-} else {
- extract_context();
-}
-
-if (lc($MODEL) eq "pyp") {
- if($HIER_CAT) {
- $NUM_TOPICS = $NUM_TOPICS_COARSE;
- $CLUSTER_DIR = $CLUSTER_DIR_C;
- topic_train();
- $NUM_TOPICS = $NUM_TOPICS_FINE;
- $CLUSTER_DIR = $CLUSTER_DIR_F;
- topic_train();
- } else {
- topic_train();
- }
-} elsif (lc($MODEL) =~ /pr|em|agree/) {
- prem_train();
-} else { die "Unsupported model type: $MODEL. Must be one of PYP, PR, EM, AGREE or BLAGREE.\n"; }
-if($HIER_CAT) {
- $NUM_TOPICS = $NUM_TOPICS_COARSE;
- $CLUSTER_DIR = $CLUSTER_DIR_C;
- $LABELED_DIR = $LABELED_DIR_C;
- label_spans_with_topics();
- $NUM_TOPICS = $NUM_TOPICS_FINE;
- $CLUSTER_DIR = $CLUSTER_DIR_F;
- $LABELED_DIR = $LABELED_DIR_F;
- label_spans_with_topics();
- extract_freqs();
-} else {
- label_spans_with_topics();
-}
-my $res;
-if ($BIDIR) {
- $res = grammar_extract_bidir();
-} else {
- $res = grammar_extract();
-}
-print STDERR "\n!!!COMPLETE!!!\n";
-print STDERR "GRAMMAR: $res\nYou should probably run: $SCRIPT_DIR/evaluation-pipeline.pl LANGPAIR giwork/ct1s0.L10.PYP.t4.s20.grammar/grammar.gz -f FEAT1 -f FEAT2\n\n";
-exit 0;
-
-sub setup_data {
- print STDERR "\n!!!PREPARE CORPORA!!!\n";
- if (-f $CORPUS_LEX && -f $CORPUS_CLUSTER) {
- print STDERR "$CORPUS_LEX and $CORPUS_CLUSTER exist, reusing...\n";
- return;
- }
- copy($CORPUS, $CORPUS_LEX);
- if ($TAGGED_CORPUS) {
- die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS;
- my $opt="";
- $opt = "-s" if ($LANGUAGE eq "source");
- $opt = $opt . " -a" if ($PRESERVE_PHRASES);
- my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER";
- safesystem($cmd) or die "Failed to extract contexts.";
- } else {
- symlink($LEX_NAME, $CORPUS_CLUSTER);
- }
-}
-
-sub context_dir {
- return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE";
-}
-
-sub cluster_dir {
- if (lc($MODEL) eq "pyp") {
- return context_dir() . ".PYP.t$NUM_TOPICS.s$NUM_SAMPLES";
- } elsif (lc($MODEL) eq "em") {
- return context_dir() . ".EM.t$NUM_TOPICS.i$NUM_ITERS";
- } elsif (lc($MODEL) eq "pr") {
- return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_ITERS.sp$PR_SCALE_P.sc$PR_SCALE_C";
- } elsif (lc($MODEL) eq "agree") {
- return context_dir() . ".AGREE.t$NUM_TOPICS.i$NUM_ITERS";
- } elsif (lc($MODEL) eq "blagree") {
- return context_dir() . ".BLAGREE.t$NUM_TOPICS.i$NUM_ITERS";
- }
-}
-
-sub labeled_dir {
- if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD ne "0") {
- return cluster_dir() . "_lt$LABEL_THRESHOLD";
- } else {
- return cluster_dir();
- }
-}
-
-sub grammar_dir {
- # TODO add grammar config options -- adjacent NTs, etc
- if($HIER_CAT) {
- return cluster_dir() . ".hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.grammar";
- } else {
- return labeled_dir() . ".grammar";
- }
-}
-
-
-
-sub safemkdir {
- my $dir = shift;
- if (-d $dir) { return 1; }
- return mkdir($dir);
-}
-
-sub usage {
- print < $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
- safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans";
- unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
- safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS | sed 's/ *||| *\$//' > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
- }
-}
-
-sub extract_freqs {
- print STDERR "\n!!!EXTRACTING FREQUENCIES\n";
- my $IN_COARSE = "$LABELED_DIR_C/labeled_spans.txt";
- my $IN_FINE = "$LABELED_DIR_F/labeled_spans.txt";
- my $OUT_SPANS = "$LABELED_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
- my $FREQS = "$LABELED_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
- my $COARSE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1c/g\'"; #'
- my $FINE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1f/g\'"; #'
- my %finehier = ();
- if (-e $OUT_SPANS) {
- print STDERR "$OUT_SPANS exists, reusing...\n";
- } else {
- safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS") or die "Couldn't paste";
- }
- open SPANS, $OUT_SPANS or die $!;
- while (<SPANS>) {
- my ($tmp, $coarse, $fine) = split /\|\|\|/;
- my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g;
- my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g;
-
- foreach my $i (0..(scalar @coarse_spans)-1) {
- my $coarse_cat = $coarse_spans[$i];
- my $fine_cat = $fine_spans[$i];
-
- $FREQ_HIER{$coarse_cat}{$fine_cat}++;
- }
- }
- close SPANS;
- # turn each coarse row of counts into log relative frequencies: log( c(coarse,fine) / sum_f c(coarse,f) )
- foreach (values %FREQ_HIER) {
- my $coarse_freq = $_;
- my $total = 0;
- $total+=$_ for (values %{ $coarse_freq });
- $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq });
- }
- open FREQS, ">", $FREQS or die $!;
- foreach my $coarse_cat (keys %FREQ_HIER) {
- print FREQS "$coarse_cat |||";
- foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) {
- my $res = $FREQ_HIER{$coarse_cat}{$fine_cat};
- print FREQS " $fine_cat:$res";
- if(!
exists $finehier{$fine_cat} || $finehier{$fine_cat} < $res) { - $finehier{$fine_cat} = $coarse_cat; - } - } - print FREQS "\n"; - } -# foreach my $fine_cat (keys %finehier) { -# print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; -# } - close FREQS; - $CLUSTER_DIR = $CLUSTER_DIR_F; -} - -sub grammar_extract { - my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; - print STDERR "\n!!!EXTRACTING GRAMMAR\n"; - my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.gz"; - if (-e $OUTGRAMMAR) { - print STDERR "$OUTGRAMMAR exists, reusing...\n"; - } else { - my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); - my $DEFAULT_CAT_ARG = ($DEFAULT_CAT ? "-d X" : ""); - safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -t $NUM_TOPICS $BACKOFF_ARG $DEFAULT_CAT_ARG | $SORT_KEYS | $REDUCER -p | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; - } - return $OUTGRAMMAR; -} - -sub grammar_extract_bidir { -#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz - my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; - print STDERR "\n!!!EXTRACTING GRAMMAR\n"; - my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz"; - if (-e $OUTGRAMMAR) { - print STDERR "$OUTGRAMMAR exists, reusing...\n"; - } else { - my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); - safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; - } - return $OUTGRAMMAR; -} - -sub safesystem { - print STDERR "Executing: @_\n"; - system(@_); - if ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - } - elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - } - else { - my $exitcode = $? >> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } -} - diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config deleted file mode 100644 index 3e23c8cb..00000000 --- a/gi/pipeline/lticluster.config +++ /dev/null @@ -1,9 +0,0 @@ -# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... 
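Each row below binds a language-pair name to the columns named in the header; these are the per-pair values that evaluation-pipeline.pl (above) looks up in its %paths, %corpora, %lms, %devs, %devrefs, %tests and %testevals tables, and the leading bare path appears to give the shared data root. Trailing columns may be omitted, as in the nlfr row, which configures only a working path and an aligned corpus.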
-/home/cdyer/ws10smt-data
-btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
-zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
-aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
-uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
-nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al
-
diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl
deleted file mode 100755
index 0cef0606..00000000
--- a/gi/pipeline/scripts/filter-by-f.pl
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-
-my $REKEY="$SCRIPT_DIR/rekey.pl";
-my $REFILTER="$SCRIPT_DIR/refilter.pl";
-my $SORT="$SCRIPT_DIR/sort-by-key.sh";
-assert_exec($REKEY, $REFILTER, $SORT);
-
-
-die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3;
-my $translations = shift @ARGV;
-die "Need number: $translations" unless $translations > 0;
-die unless $ARGV[0] =~ /\.gz$/;
-die unless $ARGV[1] =~ /\.gz$/;
-die if $ARGV[0] eq $ARGV[1];
-die "Can't find $ARGV[0]" unless -f $ARGV[0];
-
-my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]";
-safesystem($ARGV[1], $cmd) or die "Filtering failed";
-exit 0;
-
-sub assert_exec {
- my @files = @_;
- for my $file (@files) {
- die "Can't find $file - did you run make?\n" unless -e $file;
- die "Can't execute $file" unless -x $file;
- }
-};
-
-sub safesystem {
- my $output = shift @_;
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "ERROR: Failed to execute: @_\n $!\n";
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- if ($exitcode) {
- print STDERR "Exit code: $exitcode\n";
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- }
- return ! $exitcode;
- }
-}
-
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
deleted file mode 100755
index c0eec43e..00000000
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $PATCH = shift @ARGV;
-my $TGT = 1;
-my $APPEND;
-while ($PATCH eq "-s" || $PATCH eq "-a") {
- if ($PATCH eq "-s") {
- undef $TGT;
- } else {
- $APPEND = 1;
- }
- $PATCH = shift @ARGV;
-}
-
-die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
-
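A worked example of the patching logic that follows (hypothetical data): given the tagged-corpus line `DT NN` and the lexical stdin line `le chat ||| the cat ||| 0-0 1-1`, the default target mode replaces the target field, printing `le chat ||| DT NN ||| 0-0 1-1`; with -a the tags are appended instead, giving `le chat ||| the_DT cat_NN ||| 0-0 1-1`, and -s applies the same treatment to the source field.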
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-my $first = <P>
; close P;
-my @fields = split / \|\|\| /, $first;
-die "Bad format!" if (scalar @fields > 2);
-
-if (scalar @fields != 1) {
- # TODO support this
- die "Patching source and target not supported yet!";
-}
-
-my $lineno = 0;
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-while(my $pline =
<P>
) {
- chomp $pline;
- $lineno++;
- my $line = <>;
- die "Too few lines in lexical corpus!" unless $line;
- chomp $line;
- @fields = split / \|\|\| /, $line;
- my @pwords = split /\s+/, $pline;
- if ($TGT) {
- my @lwords = split /\s+/, $fields[1];
- die "Length mismatch in line $lineno!\n" unless (scalar @pwords == scalar @lwords);
- if ($APPEND) {
- foreach my $i (0..(scalar @pwords-1)) {
- $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
- }
- $fields[1] = join ' ', @lwords;
- } else {
- $fields[1] = $pline;
- }
- } else { # source side
- my @lwords = split /\s+/, $fields[0];
- die "Length mismatch in line $lineno!\n" unless (scalar @pwords == scalar @lwords);
- if ($APPEND) {
- foreach my $i (0..(scalar @pwords-1)) {
- $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
- }
- $fields[0] = join ' ', @lwords;
- } else {
- $fields[0] = $pline;
- }
- }
- print join ' ||| ', @fields;
- print "\n";
-}
-
-
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
deleted file mode 100755
index a783eb4e..00000000
--- a/gi/pipeline/scripts/refilter.pl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $NUM_TRANSLATIONS = shift @ARGV;
-unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; }
-print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n";
-
-my $pk = '';
-my %dict;
-while(<>) {
- s/^(.+)\t//;
- my $key = $1;
- if ($key ne $pk) {
- if ($pk) {
- emit_dict();
- }
- %dict = ();
- $pk = $key;
- }
- my ($lhs, $f, $e, $s) = split / \|\|\| /;
- my $score = 0;
- if ($s =~ /XEF=([^ ]+)/) {
- $score += $1;
- } else { die; }
- if ($s =~ /GenerativeProb=([^ ]+)/) {
- $score += ($1 / 10);
- } else { die; }
- $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
-}
-emit_dict();
-
-sub emit_dict {
- my $cc = 0;
- for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) {
- print "$k";
- $cc++;
- if ($cc >= $NUM_TRANSLATIONS) { last; }
- }
-}
-
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
deleted file mode 100755
index 31eb86b8..00000000
--- a/gi/pipeline/scripts/rekey.pl
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/perl
-
-while(<>) {
- my ($lhs, $f, $e, $s) = split / \|\|\| /;
- $f =~ s/\[X[0-9]+\]/\[X\]/g;
- print "$f\t$_";
-}
-
diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl
deleted file mode 100755
index 20698816..00000000
--- a/gi/pipeline/scripts/remove-tags-from-contexts.pl
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
-  unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
-
-my $lno = 0;
-while(my $line = <>) {
- $lno++;
- chomp $line;
- my @top = split /\t/, $line;
- die unless (scalar @top == 2);
-
- my @pwords = split /\s+/, $top[0];
- foreach my $token (@pwords) {
- #print $token . "\n";
- # negative lookahead: split at the last underscore only (token_tag)
- my @parts = split /_(?!.*_)/, $token;
- die unless (scalar @parts == 2);
- if ($PHRASE eq "tok") {
- $token = $parts[0]
- } elsif ($PHRASE eq "tag") {
- $token = $parts[1]
- }
- }
-
- my @fields = split / \|\|\| /, $top[1];
- # contexts alternate with their counts, so visit every second field
- foreach my $i (0..((scalar @fields) / 2 - 1)) {
- #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
- my @cwords = split /\s+/, $fields[2*$i];
- foreach my $token (@cwords) {
- #print $i . ": " . $token . "\n";
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- if ($CONTEXT eq "tok") {
- $token = $parts[0]
- } elsif ($CONTEXT eq "tag") {
- $token = $parts[1]
- }
- }
- }
- $fields[2*$i] = join ' ', @cwords;
- }
-
- print join ' ', @pwords;
- print "\t";
- print join ' ||| ', @fields;
- print "\n";
-}
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
deleted file mode 100755
index be3e97c0..00000000
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $LANGUAGE = shift @ARGV;
-$LANGUAGE = 'target' unless ($LANGUAGE);
-
-my $lno = 0;
-while(my $line = <>) {
- $lno++;
- chomp $line;
-
- my @fields = split / \|\|\| /, $line;
-
- if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
- my @cwords = split /\s+/, $fields[0];
- foreach my $token (@cwords) {
- # split at the last underscore only
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- $token = $parts[0]
- } else {
- print STDERR "WARNING: invalid tagged token $token\n";
- }
- }
- $fields[0] = join ' ', @cwords;
- }
-
- if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
- my @cwords = split /\s+/, $fields[1];
- foreach my $token (@cwords) {
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- $token = $parts[1]
- } else {
- print STDERR "WARNING: invalid tagged token $token\n";
- }
- }
- $fields[1] = join ' ', @cwords;
- }
-
- print join ' ||| ', @fields;
- print "\n";
-}
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
deleted file mode 100755
index 7ae33e03..00000000
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-export LANG=C
-sort -t $'\t' -k 1 -T /tmp -S 6000000000
-
diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl
deleted file mode 100755
index dc578513..00000000
--- a/gi/pipeline/scripts/xfeats.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0;
-
-my $xgrammar = shift @ARGV;
-die "Can't find $xgrammar" unless -f $xgrammar;
-my $fh;
-if ($xgrammar =~ /\.gz$/) {
- open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!";
-} else {
- open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!";
-}
-print STDERR "Reading X-feats from $xgrammar...\n";
-my %dict;
-while(<$fh>) {
- chomp;
- my ($lhs, $f, $e, $feats) = split / \|\|\| /;
- my @xfeats = ();
- while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) {
- push @xfeats, "X_$1=$2";
- }
- #print "$lhs ||| $f ||| $e ||| @xfeats\n";
- $dict{"$lhs ||| $f ||| $e"} = "@xfeats";
-}
-close $fh;
-
-print STDERR "Add features...\n";
-while(<>) {
- chomp;
- my ($lhs, $f, $e) = split / \|\|\| /;
- $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g;
- my $xfeats = $dict{"[X] ||| $f ||| $e"};
- die "Can't find x features for: $_\n" unless $xfeats;
- print "$_ $xfeats\n";
-}
-
diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config
deleted file mode 100644
index e00a8485..00000000
--- a/gi/pipeline/valhalla.config
+++ /dev/null
@@ -1,9 +0,0 @@
-# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
-# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ...
-/home/chris/ws10smt/data -btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al -aren /home/chris/ws10smt/data/arabic-english corpus.ar-en.al -uren /home/chris/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh -nlfr /home/chris/ws10smt/data/dutch-french corpus.nl-fr.al diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java deleted file mode 100644 index 07b27387..00000000 --- a/gi/posterior-regularisation/Corpus.java +++ /dev/null @@ -1,167 +0,0 @@ -import gnu.trove.TIntArrayList; - -import java.io.*; -import java.util.*; -import java.util.regex.Pattern; - -public class Corpus -{ - private Lexicon tokenLexicon = new Lexicon(); - private Lexicon phraseLexicon = new Lexicon(); - private Lexicon contextLexicon = new Lexicon(); - private List edges = new ArrayList(); - private List> phraseToContext = new ArrayList>(); - private List> contextToPhrase = new ArrayList>(); - - public class Edge - { - Edge(int phraseId, int contextId, int count) - { - this.phraseId = phraseId; - this.contextId = contextId; - this.count = count; - } - public int getPhraseId() - { - return phraseId; - } - public TIntArrayList getPhrase() - { - return phraseLexicon.lookup(phraseId); - } - public String getPhraseString() - { - StringBuffer b = new StringBuffer(); - for (int tid: getPhrase().toNativeArray()) - { - if (b.length() > 0) - b.append(" "); - b.append(tokenLexicon.lookup(tid)); - } - return b.toString(); - } - public int getContextId() - { - return contextId; - } - public TIntArrayList getContext() - { - return contextLexicon.lookup(contextId); - } - public String getContextString() - { - StringBuffer b = new StringBuffer(); - for (int tid: getContext().toNativeArray()) - { - if (b.length() > 0) - b.append(" "); - b.append(tokenLexicon.lookup(tid)); - } - return b.toString(); - } - public int getCount() - { - return count; - } - private int phraseId; - private int contextId; - private int count; - } - - List getEdges() - { - return edges; - } - - int getNumEdges() - { - return edges.size(); - } - - int getNumPhrases() - { - return phraseLexicon.size(); - } - - List getEdgesForPhrase(int phraseId) - { - return phraseToContext.get(phraseId); - } - - int getNumContexts() - { - return contextLexicon.size(); - } - - List getEdgesForContext(int contextId) - { - return contextToPhrase.get(contextId); - } - - int getNumTokens() - { - return tokenLexicon.size(); - } - - static Corpus readFromFile(Reader in) throws IOException - { - Corpus c = new Corpus(); - - // read in line-by-line - BufferedReader bin = new BufferedReader(in); - String line; - Pattern separator = Pattern.compile(" \\|\\|\\| "); - - while ((line = bin.readLine()) != null) - { - // split into phrase and contexts - StringTokenizer st = new StringTokenizer(line, "\t"); - assert (st.hasMoreTokens()); - String phraseToks = st.nextToken(); - assert (st.hasMoreTokens()); - String rest = st.nextToken(); - assert (!st.hasMoreTokens()); - - // process phrase - st = new StringTokenizer(phraseToks, " "); - TIntArrayList ptoks = new TIntArrayList(); - while (st.hasMoreTokens()) - ptoks.add(c.tokenLexicon.insert(st.nextToken())); - int phraseId = c.phraseLexicon.insert(ptoks); - if (phraseId == c.phraseToContext.size()) - 
c.phraseToContext.add(new ArrayList()); - - // process contexts - String[] parts = separator.split(rest); - assert (parts.length % 2 == 0); - for (int i = 0; i < parts.length; i += 2) - { - // process pairs of strings - context and count - TIntArrayList ctx = new TIntArrayList(); - String ctxString = parts[i]; - String countString = parts[i + 1]; - StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); - while (ctxStrtok.hasMoreTokens()) - { - String token = ctxStrtok.nextToken(); - if (!token.equals("")) - ctx.add(c.tokenLexicon.insert(token)); - } - int contextId = c.contextLexicon.insert(ctx); - if (contextId == c.contextToPhrase.size()) - c.contextToPhrase.add(new ArrayList()); - - assert (countString.startsWith("C=")); - Edge e = c.new Edge(phraseId, contextId, - Integer.parseInt(countString.substring(2).trim())); - c.edges.add(e); - - // index the edge for fast phrase, context lookup - c.phraseToContext.get(phraseId).add(e); - c.contextToPhrase.get(contextId).add(e); - } - } - - return c; - } -} diff --git a/gi/posterior-regularisation/Lexicon.java b/gi/posterior-regularisation/Lexicon.java deleted file mode 100644 index 9f0245ee..00000000 --- a/gi/posterior-regularisation/Lexicon.java +++ /dev/null @@ -1,32 +0,0 @@ -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class Lexicon -{ - public int insert(T word) - { - Integer i = wordToIndex.get(word); - if (i == null) - { - i = indexToWord.size(); - wordToIndex.put(word, i); - indexToWord.add(word); - } - return i; - } - - public T lookup(int index) - { - return indexToWord.get(index); - } - - public int size() - { - return indexToWord.size(); - } - - private Map wordToIndex = new HashMap(); - private List indexToWord = new ArrayList(); -} \ No newline at end of file diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java deleted file mode 100644 index 85bcfb89..00000000 --- a/gi/posterior-regularisation/PhraseContextModel.java +++ /dev/null @@ -1,466 +0,0 @@ -// Input of the form: -// " the phantom of the opera " tickets for tonight ? ||| C=1 ||| seats for ? ||| C=1 ||| i see ? ||| C=1 -// phrase TAB [context]+ -// where context = phrase ||| C=... which are separated by ||| - -// Model parameterised as follows: -// - each phrase, p, is allocated a latent state, t -// - this is used to generate the contexts, c -// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR - -// Training with EM: -// - e-step is estimating q(t) = P(t|p,c) for all x,c -// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t) -// - PR uses alternate e-step, which first optimizes lambda -// min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc] -// where -// q(t|p,c) propto p(t,c|p) exp( -phi_ptc ) -// Then q is used to obtain expectations for vanilla M-step. 
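Restated in display form (an editorial restatement matching the EStepDualObjective implementation further below; note that the exponent in the comment above should read the Lagrange multiplier lambda_ptc, which is what the code exponentiates):

\[
\min_{q}\;\mathrm{KL}(q\,\|\,p_\theta)\;+\;\delta\sum_{p,t}\max_{c}\mathbb{E}_q[\phi_{ptc}],
\qquad
q(t \mid p,c)\;\propto\;p_\theta(t,c \mid p)\,e^{-\lambda_{ptc}}.
\]

The corresponding dual, which the code minimises by projected gradient descent, is

\[
\log Z(\lambda)\;=\;\sum_{p,c} n_{pc}\,\log\sum_{t} p_\theta(t,c \mid p)\,e^{-\lambda_{ptc}},
\qquad
\nabla_\lambda \log Z\;=\;-\,\mathbb{E}_q[\phi],
\]

where n_pc is the observed count of the (phrase, context) pair; each per-(phrase, tag) block of multipliers is projected back onto the simplex scaled by the constraint strength (projectPoint below), and the primal is recovered as llh + KL(q||p) + scale * sum_pt max_c E_q[phi] (see primal()).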
- -// Sexing it up: -// - learn p-specific conditionals P(t|p) -// - or generate phrase internals, e.g., generate edge words from -// different distribution to central words -// - agreement between phrase->context model and context->phrase model - -import java.io.*; -import optimization.gradientBasedMethods.*; -import optimization.gradientBasedMethods.stats.OptimizerStats; -import optimization.gradientBasedMethods.stats.ProjectedOptimizerStats; -import optimization.linesearch.ArmijoLineSearchMinimizationAlongProjectionArc; -import optimization.linesearch.GenericPickFirstStep; -import optimization.linesearch.InterpolationPickFirstStep; -import optimization.linesearch.LineSearchMethod; -import optimization.linesearch.WolfRuleLineSearch; -import optimization.projections.SimplexProjection; -import optimization.stopCriteria.CompositeStopingCriteria; -import optimization.stopCriteria.NormalizedProjectedGradientL2Norm; -import optimization.stopCriteria.NormalizedValueDifference; -import optimization.stopCriteria.ProjectedGradientL2Norm; -import optimization.stopCriteria.StopingCriteria; -import optimization.stopCriteria.ValueDifference; -import optimization.util.MathUtils; -import java.util.*; -import java.util.regex.*; -import gnu.trove.TDoubleArrayList; -import gnu.trove.TIntArrayList; -import static java.lang.Math.*; - -class PhraseContextModel -{ - // model/optimisation configuration parameters - int numTags; - boolean posteriorRegularisation = true; - double constraintScale = 3; // FIXME: make configurable - - // copied from L1LMax in depparsing code - final double c1= 0.0001, c2=0.9, stoppingPrecision = 1e-5, maxStep = 10; - final int maxZoomEvals = 10, maxExtrapolationIters = 200; - int maxProjectionIterations = 200; - int minOccurrencesForProjection = 0; - - // book keeping - int numPositions; - Random rng = new Random(); - - // training set - Corpus training; - - // model parameters (learnt) - double emissions[][][]; // position in 0 .. 
3 x tag x word Pr(word | tag, position) - double prior[][]; // phrase x tag Pr(tag | phrase) - double lambda[]; // edge = (phrase, context) x tag flattened lagrange multipliers - - PhraseContextModel(Corpus training, int tags) - { - this.training = training; - this.numTags = tags; - assert (!training.getEdges().isEmpty()); - assert (numTags > 1); - - // now initialise emissions - numPositions = training.getEdges().get(0).getContext().size(); - assert (numPositions > 0); - - emissions = new double[numPositions][numTags][training.getNumTokens()]; - prior = new double[training.getNumEdges()][numTags]; - if (posteriorRegularisation) - lambda = new double[training.getNumEdges() * numTags]; - - for (double[][] emissionTW : emissions) - { - for (double[] emissionW : emissionTW) - { - randomise(emissionW); -// for (int i = 0; i < emissionW.length; ++i) -// emissionW[i] = i+1; -// normalise(emissionW); - } - } - - for (double[] priorTag : prior) - { - randomise(priorTag); -// for (int i = 0; i < priorTag.length; ++i) -// priorTag[i] = i+1; -// normalise(priorTag); - } - } - - void expectationMaximisation(int numIterations) - { - double lastLlh = Double.NEGATIVE_INFINITY; - - for (int iteration = 0; iteration < numIterations; ++iteration) - { - double emissionsCounts[][][] = new double[numPositions][numTags][training.getNumTokens()]; - double priorCounts[][] = new double[training.getNumPhrases()][numTags]; - - // E-step - double llh = 0; - if (posteriorRegularisation) - { - EStepDualObjective objective = new EStepDualObjective(); - - // copied from x2y2withconstraints -// LineSearchMethod ls = new ArmijoLineSearchMinimizationAlongProjectionArc(new InterpolationPickFirstStep(1)); -// OptimizerStats stats = new OptimizerStats(); -// ProjectedGradientDescent optimizer = new ProjectedGradientDescent(ls); -// CompositeStopingCriteria compositeStop = new CompositeStopingCriteria(); -// compositeStop.add(new ProjectedGradientL2Norm(0.001)); -// compositeStop.add(new ValueDifference(0.001)); -// optimizer.setMaxIterations(50); -// boolean succeed = optimizer.optimize(objective,stats,compositeStop); - - // copied from depparser l1lmaxobjective - ProjectedOptimizerStats stats = new ProjectedOptimizerStats(); - GenericPickFirstStep pickFirstStep = new GenericPickFirstStep(1); - LineSearchMethod linesearch = new WolfRuleLineSearch(pickFirstStep, c1, c2); - ProjectedGradientDescent optimizer = new ProjectedGradientDescent(linesearch); - optimizer.setMaxIterations(maxProjectionIterations); - CompositeStopingCriteria stop = new CompositeStopingCriteria(); - stop.add(new NormalizedProjectedGradientL2Norm(stoppingPrecision)); - stop.add(new NormalizedValueDifference(stoppingPrecision)); - boolean succeed = optimizer.optimize(objective, stats, stop); - - System.out.println("Ended optimzation Projected Gradient Descent\n" + stats.prettyPrint(1)); - //System.out.println("Solution: " + objective.parameters); - if (!succeed) - System.out.println("Failed to optimize"); - //System.out.println("Ended optimization in " + optimizer.getCurrentIteration()); - - //lambda = objective.getParameters(); - llh = objective.primal(); - - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - for (int j = 0; j < edges.size(); ++j) - { - Corpus.Edge e = edges.get(j); - for (int t = 0; t < numTags; t++) - { - double p = objective.q.get(i).get(j).get(t); - priorCounts[i][t] += e.getCount() * p; - TIntArrayList tokens = e.getContext(); - for (int k = 0; k < tokens.size(); ++k) - 
emissionsCounts[k][t][tokens.get(k)] += e.getCount() * p; - } - } - } - } - else - { - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - for (int j = 0; j < edges.size(); ++j) - { - Corpus.Edge e = edges.get(j); - double probs[] = posterior(i, e); - double z = normalise(probs); - llh += log(z) * e.getCount(); - - TIntArrayList tokens = e.getContext(); - for (int t = 0; t < numTags; ++t) - { - priorCounts[i][t] += e.getCount() * probs[t]; - for (int k = 0; k < tokens.size(); ++k) - emissionsCounts[j][t][tokens.get(k)] += e.getCount() * probs[t]; - } - } - } - } - - // M-step: normalise - for (double[][] emissionTW : emissionsCounts) - for (double[] emissionW : emissionTW) - normalise(emissionW); - - for (double[] priorTag : priorCounts) - normalise(priorTag); - - emissions = emissionsCounts; - prior = priorCounts; - - System.out.println("Iteration " + iteration + " llh " + llh); - -// if (llh - lastLlh < 1e-4) -// break; -// else -// lastLlh = llh; - } - } - - static double normalise(double probs[]) - { - double z = 0; - for (double p : probs) - z += p; - for (int i = 0; i < probs.length; ++i) - probs[i] /= z; - return z; - } - - void randomise(double probs[]) - { - double z = 0; - for (int i = 0; i < probs.length; ++i) - { - probs[i] = 10 + rng.nextDouble(); - z += probs[i]; - } - - for (int i = 0; i < probs.length; ++i) - probs[i] /= z; - } - - static int argmax(double probs[]) - { - double m = Double.NEGATIVE_INFINITY; - int mi = -1; - for (int i = 0; i < probs.length; ++i) - { - if (probs[i] > m) - { - m = probs[i]; - mi = i; - } - } - return mi; - } - - double[] posterior(int phraseId, Corpus.Edge e) // unnormalised - { - double probs[] = new double[numTags]; - TIntArrayList tokens = e.getContext(); - for (int t = 0; t < numTags; ++t) - { - probs[t] = prior[phraseId][t]; - for (int k = 0; k < tokens.size(); ++k) - probs[t] *= emissions[k][t][tokens.get(k)]; - } - return probs; - } - - void displayPosterior() - { - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - for (Corpus.Edge e: edges) - { - double probs[] = posterior(i, e); - normalise(probs); - - // emit phrase - System.out.print(e.getPhraseString()); - System.out.print("\t"); - System.out.print(e.getContextString()); - System.out.print("||| C=" + e.getCount() + " |||"); - - int t = argmax(probs); - System.out.print(" " + t + " ||| " + probs[t]); - // for (int t = 0; t < numTags; ++t) - // System.out.print(" " + probs[t]); - System.out.println(); - } - } - } - - public static void main(String[] args) - { - assert (args.length >= 2); - try - { - Corpus corpus = Corpus.readFromFile(new FileReader(new File(args[0]))); - PhraseContextModel model = new PhraseContextModel(corpus, Integer.parseInt(args[1])); - model.expectationMaximisation(Integer.parseInt(args[2])); - model.displayPosterior(); - } - catch (IOException e) - { - System.out.println("Failed to read input file: " + args[0]); - e.printStackTrace(); - } - } - - class EStepDualObjective extends ProjectedObjective - { - List> conditionals; // phrase id x context # x tag - precomputed - List> q; // ditto, but including exp(-lambda) terms - double objective = 0; // log(z) - // Objective.gradient = d log(z) / d lambda = E_q[phi] - double llh = 0; - - public EStepDualObjective() - { - super(); - // compute conditionals p(context, tag | phrase) for all training instances - conditionals = new ArrayList>(training.getNumPhrases()); - q = new ArrayList>(training.getNumPhrases()); - 
for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - - conditionals.add(new ArrayList(edges.size())); - q.add(new ArrayList(edges.size())); - - for (int j = 0; j < edges.size(); ++j) - { - Corpus.Edge e = edges.get(j); - double probs[] = posterior(i, e); - double z = normalise(probs); - llh += log(z) * e.getCount(); - conditionals.get(i).add(new TDoubleArrayList(probs)); - q.get(i).add(new TDoubleArrayList(probs)); - } - } - - gradient = new double[training.getNumEdges()*numTags]; - setInitialParameters(lambda); - computeObjectiveAndGradient(); - } - - @Override - public double[] projectPoint(double[] point) - { - SimplexProjection p = new SimplexProjection(constraintScale); - - double[] newPoint = point.clone(); - int edgeIndex = 0; - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - - for (int t = 0; t < numTags; t++) - { - double[] subPoint = new double[edges.size()]; - for (int j = 0; j < edges.size(); ++j) - subPoint[j] = point[edgeIndex+j*numTags+t]; - - p.project(subPoint); - for (int j = 0; j < edges.size(); ++j) - newPoint[edgeIndex+j*numTags+t] = subPoint[j]; - } - - edgeIndex += edges.size() * numTags; - } -// System.out.println("Proj from: " + Arrays.toString(point)); -// System.out.println("Proj to: " + Arrays.toString(newPoint)); - return newPoint; - } - - @Override - public void setParameters(double[] params) - { - super.setParameters(params); - computeObjectiveAndGradient(); - } - - @Override - public double[] getGradient() - { - gradientCalls += 1; - return gradient; - } - - @Override - public double getValue() - { - functionCalls += 1; - return objective; - } - - public void computeObjectiveAndGradient() - { - int edgeIndex = 0; - objective = 0; - Arrays.fill(gradient, 0); - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - - for (int j = 0; j < edges.size(); ++j) - { - Corpus.Edge e = edges.get(j); - - double z = 0; - for (int t = 0; t < numTags; t++) - { - double v = conditionals.get(i).get(j).get(t) * exp(-parameters[edgeIndex+t]); - q.get(i).get(j).set(t, v); - z += v; - } - objective += log(z) * e.getCount(); - - for (int t = 0; t < numTags; t++) - { - double v = q.get(i).get(j).get(t) / z; - q.get(i).get(j).set(t, v); - gradient[edgeIndex+t] -= e.getCount() * v; - } - - edgeIndex += numTags; - } - } -// System.out.println("computeObjectiveAndGradient logz=" + objective); -// System.out.println("lambda= " + Arrays.toString(parameters)); -// System.out.println("gradient=" + Arrays.toString(gradient)); - } - - public String toString() - { - StringBuilder sb = new StringBuilder(); - sb.append(getClass().getCanonicalName()).append(" with "); - sb.append(parameters.length).append(" parameters and "); - sb.append(training.getNumPhrases() * numTags).append(" constraints"); - return sb.toString(); - } - - double primal() - { - // primal = llh + KL(q||p) + scale * sum_pt max_c E_q[phi_pct] - // kl = sum_Y q(Y) log q(Y) / p(Y|X) - // = sum_Y q(Y) { -lambda . phi(Y) - log Z } - // = -log Z - lambda . E_q[phi] - // = -objective + lambda . 
gradient - - double kl = -objective + MathUtils.dotProduct(parameters, gradient); - double l1lmax = 0; - for (int i = 0; i < training.getNumPhrases(); ++i) - { - List edges = training.getEdgesForPhrase(i); - for (int t = 0; t < numTags; t++) - { - double lmax = Double.NEGATIVE_INFINITY; - for (int j = 0; j < edges.size(); ++j) - lmax = max(lmax, q.get(i).get(j).get(t)); - l1lmax += lmax; - } - } - - return llh + kl + constraintScale * l1lmax; - } - } -} diff --git a/gi/posterior-regularisation/README b/gi/posterior-regularisation/README deleted file mode 100644 index a3d54ffc..00000000 --- a/gi/posterior-regularisation/README +++ /dev/null @@ -1,3 +0,0 @@ - 557 ./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C | sort -t $'\t' -k 1 | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance - 559 wc -l btec.concordance - 588 cat btec.concordance | sed 's/.* //' | awk '{ for (i=1; i < NF; i++) { x=substr($i, 1, 2); if (x == "C=") printf "\n"; else if (x != "||") printf "%s ", $i; }; printf "\n"; }' | sort | uniq | wc -l diff --git a/gi/posterior-regularisation/alphabet.hh b/gi/posterior-regularisation/alphabet.hh deleted file mode 100644 index 1db928da..00000000 --- a/gi/posterior-regularisation/alphabet.hh +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _alphabet_hh -#define _alphabet_hh - -#include -#include -#include -#include -#include - -// Alphabet: indexes a set of types -template -class Alphabet: protected std::map -{ -public: - Alphabet() {}; - - bool empty() const { return std::map::empty(); } - int size() const { return std::map::size(); } - - int operator[](const T &k) const - { - typename std::map::const_iterator cit = find(k); - if (cit != std::map::end()) - return cit->second; - else - return -1; - } - - int lookup(const T &k) const { return (*this)[k]; } - - int insert(const T &k) - { - int sz = size(); - assert((unsigned) sz == _items.size()); - - std::pair::iterator, bool> - ins = std::map::insert(make_pair(k, sz)); - - if (ins.second) - _items.push_back(k); - - return ins.first->second; - } - - const T &type(int i) const - { - assert(i >= 0); - assert(i < size()); - return _items[i]; - } - - std::ostream &display(std::ostream &out, int i) const - { - return out << type(i); - } - -private: - std::vector _items; -}; - -#endif diff --git a/gi/posterior-regularisation/canned.concordance b/gi/posterior-regularisation/canned.concordance deleted file mode 100644 index 710973ff..00000000 --- a/gi/posterior-regularisation/canned.concordance +++ /dev/null @@ -1,4 +0,0 @@ -a 0 0 0 0 ||| C=1 ||| 1 1 1 1 ||| C=1 ||| 2 2 2 2 ||| C=1 -b 0 0 0 0 ||| C=1 ||| 1 1 1 1 ||| C=1 -c 2 2 2 2 ||| C=1 ||| 4 4 4 4 ||| C=1 ||| 5 5 5 5 ||| C=1 -d 4 4 4 4 ||| C=1 ||| 5 5 5 5 ||| C=1 diff --git a/gi/posterior-regularisation/em.cc b/gi/posterior-regularisation/em.cc deleted file mode 100644 index f6c9fd68..00000000 --- a/gi/posterior-regularisation/em.cc +++ /dev/null @@ -1,830 +0,0 @@ -// Input of the form: -// " the phantom of the opera " tickets for tonight ? ||| C=1 ||| seats for ? ||| C=1 ||| i see ? ||| C=1 -// phrase TAB [context]+ -// where context = phrase ||| C=... 
which are separated by ||| - -// Model parameterised as follows: -// - each phrase, p, is allocated a latent state, t -// - this is used to generate the contexts, c -// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR - -// Training with EM: -// - e-step is estimating P(t|p,c) for all x,c -// - m-step is estimating model parameters P(p,c,t) = P(t) P(p|t) P(c|t) - -// Sexing it up: -// - constrain the posteriors P(t|c) and P(t|p) to have few high-magnitude entries -// - improve the generation of phrase internals, e.g., generate edge words from -// different distribution to central words - -#include "alphabet.hh" -#include "log_add.hh" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; -using namespace std::tr1; - -const int numTags = 5; -const int numIterations = 100; -const bool posterior_regularisation = true; -const double PHRASE_VIOLATION_WEIGHT = 10; -const double CONTEXT_VIOLATION_WEIGHT = 0; -const bool includePhraseProb = false; - -// Data structures: -Alphabet lexicon; -typedef vector Phrase; -typedef tuple Context; -Alphabet phrases; -Alphabet contexts; - -typedef map ContextCounts; -typedef map PhraseCounts; -typedef map PhraseToContextCounts; -typedef map ContextToPhraseCounts; - -PhraseToContextCounts concordancePhraseToContexts; -ContextToPhraseCounts concordanceContextToPhrases; - -typedef vector Dist; -typedef vector ConditionalDist; -Dist prior; // class -> P(class) -vector probCtx; // word -> class -> P(word | class), for each position of context word -ConditionalDist probPhrase; // class -> P(word | class) -Dist probPhraseLength; // class -> P(length | class) expressed as geometric distribution parameter - -mt19937 randomGenerator((size_t) time(NULL)); -uniform_real uniDist(0.0, 1e-1); -variate_generator< mt19937, uniform_real > rng(randomGenerator, uniDist); - -void addRandomNoise(Dist &d); -void normalise(Dist &d); -void addTo(Dist &d, const Dist &e); -int argmax(const Dist &d); - -map > lambda_indices; - -Dist conditional_probs(const Phrase &phrase, const Context &context, double *normalisation = 0); -template -Dist -penalised_conditionals(const Phrase &phrase, const Context &context, - const T &lambda, double *normalisation); -//Dist penalised_conditionals(const Phrase &phrase, const Context &context, const double *lambda, double *normalisation = 0); -double penalised_log_likelihood(int n, const double *lambda, double *gradient, void *data); -void optimise_lambda(double delta, double gamma, vector &lambda); -double expected_violation_phrases(const double *lambda); -double expected_violation_contexts(const double *lambda); -double primal_kl_divergence(const double *lambda); -double dual(const double *lambda); -void print_primal_dual(const double *lambda, double delta, double gamma); - -ostream &operator<<(ostream &, const Phrase &); -ostream &operator<<(ostream &, const Context &); -ostream &operator<<(ostream &, const Dist &); -ostream &operator<<(ostream &, const ConditionalDist &); - -int -main(int argc, char *argv[]) -{ - randomGenerator.seed(time(NULL)); - - int edges = 0; - istream &input = cin; - while (input.good()) - { - // read the phrase - string phraseString; - Phrase phrase; - getline(input, phraseString, '\t'); - istringstream pinput(phraseString); - string token; - while (pinput >> token) - phrase.push_back(lexicon.insert(token)); - int phraseId = phrases.insert(phrase); - - // read the rest, storing each context - string 
remainder; - getline(input, remainder, '\n'); - istringstream rinput(remainder); - Context context(-1, -1, -1, -1); - int index = 0; - while (rinput >> token) - { - if (token != "|||" && token != "") - { - if (index < 4) - { - // eugh! damn templates - switch (index) - { - case 0: get<0>(context) = lexicon.insert(token); break; - case 1: get<1>(context) = lexicon.insert(token); break; - case 2: get<2>(context) = lexicon.insert(token); break; - case 3: get<3>(context) = lexicon.insert(token); break; - default: assert(false); - } - index += 1; - } - else if (token.find("C=") == 0) - { - int contextId = contexts.insert(context); - int count = atoi(token.substr(strlen("C=")).c_str()); - concordancePhraseToContexts[phraseId][contextId] += count; - concordanceContextToPhrases[contextId][phraseId] += count; - index = 0; - context = Context(-1, -1, -1, -1); - edges += 1; - } - } - } - - // trigger EOF - input >> ws; - } - - cout << "Read in " << phrases.size() << " phrases" - << " and " << contexts.size() << " contexts" - << " and " << edges << " edges" - << " and " << lexicon.size() << " word types\n"; - - // FIXME: filter out low count phrases and low count contexts (based on individual words?) - // now populate model parameters with uniform + random noise - prior.resize(numTags, 1.0); - addRandomNoise(prior); - normalise(prior); - - probCtx.resize(4, ConditionalDist(numTags, Dist(lexicon.size(), 1.0))); - if (includePhraseProb) - probPhrase.resize(numTags, Dist(lexicon.size(), 1.0)); - for (int t = 0; t < numTags; ++t) - { - for (int j = 0; j < 4; ++j) - { - addRandomNoise(probCtx[j][t]); - normalise(probCtx[j][t]); - } - if (includePhraseProb) - { - addRandomNoise(probPhrase[t]); - normalise(probPhrase[t]); - } - } - if (includePhraseProb) - { - probPhraseLength.resize(numTags, 0.5); // geometric distribution p=0.5 - addRandomNoise(probPhraseLength); - } - - cout << "\tprior: " << prior << "\n"; - //cout << "\tcontext: " << probCtx << "\n"; - //cout << "\tphrase: " << probPhrase << "\n"; - //cout << "\tphraseLen: " << probPhraseLength << endl; - - vector lambda; - - // now do EM training - for (int iteration = 0; iteration < numIterations; ++iteration) - { - cout << "EM iteration " << iteration << endl; - - if (posterior_regularisation) - optimise_lambda(PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT, lambda); - //cout << "\tlambda " << lambda << endl; - - Dist countsPrior(numTags, 0.0); - vector countsCtx(4, ConditionalDist(numTags, Dist(lexicon.size(), 1e-10))); - ConditionalDist countsPhrase(numTags, Dist(lexicon.size(), 1e-10)); - Dist countsPhraseLength(numTags, 0.0); - Dist nPhrases(numTags, 0.0); - - double llh = 0; - for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); - pcit != concordancePhraseToContexts.end(); ++pcit) - { - const Phrase &phrase = phrases.type(pcit->first); - - // e-step: estimate latent class probs; compile (class,word) stats for m-step - for (ContextCounts::iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - - double z = 0; - Dist tagCounts; - if (!posterior_regularisation) - tagCounts = conditional_probs(phrase, context, &z); - else - tagCounts = penalised_conditionals(phrase, context, lambda, &z); - - llh += log(z) * ccit->second; - addTo(countsPrior, tagCounts); // FIXME: times ccit->secon - - for (int t = 0; t < numTags; ++t) - { - for (int j = 0; j < 4; ++j) - countsCtx[j][t][get<0>(context)] += tagCounts[t] * ccit->second; - - if 
(includePhraseProb) - { - for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) - countsPhrase[t][*pit] += tagCounts[t] * ccit->second; - countsPhraseLength[t] += phrase.size() * tagCounts[t] * ccit->second; - nPhrases[t] += tagCounts[t] * ccit->second; - } - } - } - } - - cout << "M-step\n"; - - // m-step: normalise prior and (class,word) stats and assign to model parameters - normalise(countsPrior); - prior = countsPrior; - for (int t = 0; t < numTags; ++t) - { - //cout << "\t\tt " << t << " prior " << countsPrior[t] << "\n"; - for (int j = 0; j < 4; ++j) - normalise(countsCtx[j][t]); - if (includePhraseProb) - { - normalise(countsPhrase[t]); - countsPhraseLength[t] = nPhrases[t] / countsPhraseLength[t]; - } - } - probCtx = countsCtx; - if (includePhraseProb) - { - probPhrase = countsPhrase; - probPhraseLength = countsPhraseLength; - } - - double *larray = new double[lambda.size()]; - copy(lambda.begin(), lambda.end(), larray); - print_primal_dual(larray, PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT); - delete [] larray; - - //cout << "\tllh " << llh << endl; - //cout << "\tprior: " << prior << "\n"; - //cout << "\tcontext: " << probCtx << "\n"; - //cout << "\tphrase: " << probPhrase << "\n"; - //cout << "\tphraseLen: " << probPhraseLength << "\n"; - } - - // output class membership - for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); - pcit != concordancePhraseToContexts.end(); ++pcit) - { - const Phrase &phrase = phrases.type(pcit->first); - for (ContextCounts::iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - Dist tagCounts = conditional_probs(phrase, context, 0); - cout << phrase << " ||| " << context << " ||| " << argmax(tagCounts) << "\n"; - } - } - - return 0; -} - -void addRandomNoise(Dist &d) -{ - for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) - *dit += rng(); -} - -void normalise(Dist &d) -{ - double z = 0; - for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) - z += *dit; - for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) - *dit /= z; -} - -void addTo(Dist &d, const Dist &e) -{ - assert(d.size() == e.size()); - for (int i = 0; i < (int) d.size(); ++i) - d[i] += e[i]; -} - -int argmax(const Dist &d) -{ - double best = d[0]; - int index = 0; - for (int i = 1; i < (int) d.size(); ++i) - { - if (d[i] > best) - { - best = d[i]; - index = i; - } - } - return index; -} - -ostream &operator<<(ostream &out, const Phrase &phrase) -{ - for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) - lexicon.display(((pit == phrase.begin()) ? out : out << " "), *pit); - return out; -} - -ostream &operator<<(ostream &out, const Context &context) -{ - lexicon.display(out, get<0>(context)); - lexicon.display(out << " ", get<1>(context)); - lexicon.display(out << " ", get<2>(context)); - lexicon.display(out << " ", get<3>(context)); - return out; -} - -ostream &operator<<(ostream &out, const Dist &dist) -{ - for (Dist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) - out << ((dit == dist.begin()) ? "" : " ") << *dit; - return out; -} - -ostream &operator<<(ostream &out, const ConditionalDist &dist) -{ - for (ConditionalDist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) - out << ((dit == dist.begin()) ? "" : "; ") << *dit; - return out; -} - -// FIXME: slow - just use the phrase index, context index to do the mapping -// (n.b. 
it's a sparse setup, not just equal to 3d array index) -int -lambda_index(const Phrase &phrase, const Context &context, int tag) -{ - return lambda_indices[phrase][context] + tag; -} - -template -Dist -penalised_conditionals(const Phrase &phrase, const Context &context, - const T &lambda, double *normalisation) -{ - Dist d = conditional_probs(phrase, context, 0); - - double z = 0; - for (int t = 0; t < numTags; ++t) - { - d[t] *= exp(-lambda[lambda_index(phrase, context, t)]); - z += d[t]; - } - - if (normalisation) - *normalisation = z; - - for (int t = 0; t < numTags; ++t) - d[t] /= z; - - return d; -} - -Dist -conditional_probs(const Phrase &phrase, const Context &context, double *normalisation) -{ - Dist tagCounts(numTags, 0.0); - double z = 0; - for (int t = 0; t < numTags; ++t) - { - double prob = prior[t]; - prob *= (probCtx[0][t][get<0>(context)] * probCtx[1][t][get<1>(context)] * - probCtx[2][t][get<2>(context)] * probCtx[3][t][get<3>(context)]); - - if (includePhraseProb) - { - prob *= pow(1 - probPhraseLength[t], phrase.size() - 1) * probPhraseLength[t]; - for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) - prob *= probPhrase[t][*pit]; - } - - tagCounts[t] = prob; - z += prob; - } - if (normalisation) - *normalisation = z; - - for (int t = 0; t < numTags; ++t) - tagCounts[t] /= z; - - return tagCounts; -} - -double -penalised_log_likelihood(int n, const double *lambda, double *grad, void *) -{ - // return log Z(lambda, theta) over the corpus - // where theta are the global parameters (prior, probCtx*, probPhrase*) - // and lambda are lagrange multipliers for the posterior sparsity constraints - // - // this is formulated as: - // f = log Z(lambda) = sum_i log ( sum_i p_theta(t_i|p_i,c_i) exp [-lambda_{t_i,p_i,c_i}] ) - // where i indexes the training examples - specifying the (p, c) pair (which may occur with count > 1) - // - // with derivative: - // f'_{tpc} = frac { - count(t,p,c) p_theta(t|p,c) exp (-lambda_{t,p,c}) } - // { sum_t' p_theta(t'|p,c) exp (-lambda_{t',p,c}) } - - //cout << "penalised_log_likelihood with lambda "; - //copy(lambda, lambda+n, ostream_iterator(cout, " ")); - //cout << "\n"; - - double f = 0; - if (grad) - { - for (int i = 0; i < n; ++i) - grad[i] = 0.0; - } - - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - double z = 0; - Dist scores = penalised_conditionals(phrase, context, lambda, &z); - - f += ccit->second * log(z); - //cout << "\tphrase: " << phrase << " context: " << context << " count: " << ccit->second << " z " << z << endl; - //cout << "\t\tscores: " << scores << "\n"; - - if (grad) - { - for (int t = 0; t < numTags; ++t) - { - int i = lambda_index(phrase, context, t); // FIXME: redundant lookups - assert(grad[i] == 0.0); - grad[i] = - ccit->second * scores[t]; - } - } - } - } - - //cout << "penalised_log_likelihood returning " << f; - //if (grad) - //{ - //cout << "\ngradient: "; - //copy(grad, grad+n, ostream_iterator(cout, " ")); - //} - //cout << "\n"; - - return f; -} - -typedef struct -{ - // one of p or c should be set to -1, in which case it will be marginalised out - // i.e. 
-typedef struct -{ - // one of p or c should be set to -1, in which case it will be marginalised out - // i.e. sum_p' lambda_{p'ct} <= threshold - // or sum_c' lambda_{pc't} <= threshold - int p, c, t; double threshold; -} constraint_data; - -double -constraint_and_gradient(int n, const double *lambda, double *grad, void *data) -{ - constraint_data *d = (constraint_data *) data; - assert(d->t >= 0); - assert(d->threshold >= 0); - - //cout << "constraint_and_gradient: t " << d->t << " p " << d->p << " c " << d->c << " tau " << d->threshold << endl; - //cout << "\tlambda "; - //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); - //cout << "\n"; - - // FIXME: it's crazy to use a dense gradient here => will only have a handful of non-zero entries - if (grad) - { - for (int i = 0; i < n; ++i) - grad[i] = 0.0; - } - - //cout << "constraint_and_gradient: " << d->p << "; " << d->c << "; " << d->t << "; " << d->threshold << endl; - - if (d->p >= 0) - { - assert(d->c < 0); - // sum_c lambda_pct <= delta [a.k.a. threshold] - // => sum_c lambda_pct - delta <= 0 - // derivative_pct = { 1, if p and t match; 0, otherwise } - - double val = -d->threshold; - - const Phrase &phrase = phrases.type(d->p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(d->p); - assert(pcit != concordancePhraseToContexts.end()); - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - int i = lambda_index(phrase, context, d->t); - val += lambda[i]; - if (grad) grad[i] = 1; - } - //cout << "\treturning " << val << endl; - - return val; - } - else - { - assert(d->c >= 0); - assert(d->p < 0); - // sum_p lambda_pct <= gamma [a.k.a. threshold] - // => sum_p lambda_pct - gamma <= 0 - // derivative_pct = { 1, if c and t match; 0, otherwise } - - double val = -d->threshold; - - const Context &context = contexts.type(d->c); - ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(d->c); - assert(cpit != concordanceContextToPhrases.end()); - for (PhraseCounts::iterator pcit = cpit->second.begin(); - pcit != cpit->second.end(); ++pcit) - { - const Phrase &phrase = phrases.type(pcit->first); - int i = lambda_index(phrase, context, d->t); - val += lambda[i]; - if (grad) grad[i] = 1; - } - //cout << "\treturning " << val << endl; - - return val; - } -}
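Each constraint_data record above encodes one inequality of the sparsity-constrained problem; in symbols, restating the two branch comments:

\[ \sum_{c} \lambda_{pct} \le \delta \quad \text{(one constraint per phrase-tag pair)}, \qquad \sum_{p} \lambda_{pct} \le \gamma \quad \text{(one per context-tag pair)}, \]

each evaluated in the g(lambda) <= 0 form that NLopt's constrained minimiser expects, with gradient 1 on exactly the multipliers appearing in the sum and 0 elsewhere.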
-void -optimise_lambda(double delta, double gamma, vector<double> &lambdav) -{ - int num_lambdas = lambdav.size(); - if (lambda_indices.empty() || lambdav.empty()) - { - lambda_indices.clear(); - lambdav.clear(); - - int i = 0; - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.find(p); - for (ContextCounts::iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - lambda_indices[phrase][context] = i; - i += numTags; - } - } - num_lambdas = i; - lambdav.resize(num_lambdas); - } - //cout << "optimise_lambda: #Lagrange multipliers " << num_lambdas << endl; - - // FIXME: better to work with an implicit representation to save memory usage - int num_constraints = (((delta > 0) ? phrases.size() : 0) + ((gamma > 0) ? contexts.size() : 0)) * numTags; - //cout << "optimise_lambda: #constraints " << num_constraints << endl; - constraint_data *data = new constraint_data[num_constraints]; - int i = 0; - if (delta > 0) - { - for (int p = 0; p < phrases.size(); ++p) - { - for (int t = 0; t < numTags; ++t, ++i) - { - constraint_data &d = data[i]; - d.p = p; - d.c = -1; - d.t = t; - d.threshold = delta; - } - } - } - - if (gamma > 0) - { - for (int c = 0; c < contexts.size(); ++c) - { - for (int t = 0; t < numTags; ++t, ++i) - { - constraint_data &d = data[i]; - d.p = -1; - d.c = c; - d.t = t; - d.threshold = gamma; - } - } - } - assert(i == num_constraints); - - double lambda[num_lambdas]; - double lb[num_lambdas], ub[num_lambdas]; - for (i = 0; i < num_lambdas; ++i) - { - lambda[i] = lambdav[i]; // starting value - lb[i] = 0; // lower bound - if (delta <= 0) // upper bound - ub[i] = gamma; - else if (gamma <= 0) - ub[i] = delta; - else - assert(false); - } - - //print_primal_dual(lambda, delta, gamma); - - double minf; - int error_code = nlopt_minimize_constrained(NLOPT_LN_COBYLA, num_lambdas, penalised_log_likelihood, NULL, - num_constraints, constraint_and_gradient, data, sizeof(constraint_data), - lb, ub, lambda, &minf, -HUGE_VAL, 0.0, 0.0, 1e-4, NULL, 0, 0.0); - //cout << "optimise error code " << error_code << endl; - - //print_primal_dual(lambda, delta, gamma); - - delete [] data; - - if (error_code < 0) - cout << "WARNING: optimisation failed with error code: " << error_code << endl; - //else - //{ - //cout << "success; minf " << minf << endl; - //print_primal_dual(lambda, delta, gamma); - //} - - lambdav = vector<double>(&lambda[0], &lambda[0] + num_lambdas); -} - -// FIXME: inefficient - cache the scores -double -expected_violation_phrases(const double *lambda) -{ - // sum_pt max_c E_q[phi_pct] - double violation = 0; - - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); - - for (int t = 0; t < numTags; ++t) - { - double best = 0; - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - Dist scores = penalised_conditionals(phrase, context, lambda, 0); - best = max(best, scores[t]); - } - violation += best; - } - } - - return violation; -} - -// FIXME: inefficient - cache the scores -double -expected_violation_contexts(const double *lambda) -{ - // sum_ct max_p E_q[phi_pct] - double violation = 0; - - for (int c = 0; c < contexts.size(); ++c) - { - const Context &context = contexts.type(c); - ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(c); - - for (int t = 0; t < numTags; ++t) - { - double best = 0; - for (PhraseCounts::iterator pit = cpit->second.begin(); - pit != cpit->second.end(); ++pit) - { - const Phrase &phrase = phrases.type(pit->first); - Dist scores = penalised_conditionals(phrase, context, lambda, 0); - best = max(best, scores[t]); - } - violation += best; - } - } - - return violation; -}
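Writing \( \mathbb{E}_q[\phi_{pct}] = q_\lambda(t \mid p, c) \) for the penalised posterior, the two functions above compute the expected-violation totals

\[ V_{\text{phrase}} = \sum_{p,t} \max_{c}\, \mathbb{E}_q[\phi_{pct}], \qquad V_{\text{context}} = \sum_{c,t} \max_{p}\, \mathbb{E}_q[\phi_{pct}], \]

i.e. the l1/linf quantities that the delta and gamma constraints are bounding.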
-// FIXME: possibly inefficient -double -primal_likelihood() // FIXME: primal evaluation needs to use lambda and calculate l1linf terms -{ - double llh = 0; - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - double z = 0; - Dist scores = conditional_probs(phrase, context, &z); - llh += ccit->second * log(z); - } - } - return llh; -} - -// FIXME: inefficient - cache the scores -double -primal_kl_divergence(const double *lambda) -{ - // return KL(q || p) = sum_y q(y) { log q(y) - log p(y | x) } - // = sum_y q(y) { log p(y | x) - lambda . phi(x, y) - log Z - log p(y | x) } - // = sum_y q(y) { - lambda . phi(x, y) } - log Z - // and q(y) factors with each edge, ditto for Z - - double feature_sum = 0, log_z = 0; - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - - double local_z = 0; - double local_f = 0; - Dist d = conditional_probs(phrase, context, 0); - for (int t = 0; t < numTags; ++t) - { - int i = lambda_index(phrase, context, t); - double s = d[t] * exp(-lambda[i]); - local_f += lambda[i] * s; - local_z += s; - } - - log_z += ccit->second * log(local_z); - feature_sum += ccit->second * (local_f / local_z); - } - } - - return -feature_sum - log_z; -} - -// FIXME: inefficient - cache the scores -double -dual(const double *lambda) -{ - // return log(Z) = log { sum_y p(y | x) exp( - lambda . phi(x, y) ) } - // n.b. have flipped the sign as we're minimising - - double z = 0; - for (int p = 0; p < phrases.size(); ++p) - { - const Phrase &phrase = phrases.type(p); - PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); - for (ContextCounts::const_iterator ccit = pcit->second.begin(); - ccit != pcit->second.end(); ++ccit) - { - const Context &context = contexts.type(ccit->first); - double lz = 0; - Dist scores = penalised_conditionals(phrase, context, lambda, &lz); - z += lz * ccit->second; - } - } - return log(z); -} - -void -print_primal_dual(const double *lambda, double delta, double gamma) -{ - double likelihood = primal_likelihood(); - double kl = primal_kl_divergence(lambda); - double sum_pt = expected_violation_phrases(lambda); - double sum_ct = expected_violation_contexts(lambda); - //double d = dual(lambda); - - cout << "\tllh=" << likelihood - << " kl=" << kl - << " violations phrases=" << sum_pt - << " contexts=" << sum_ct - //<< " primal=" << (kl + delta * sum_pt + gamma * sum_ct) - //<< " dual=" << d - << " objective=" << (likelihood - kl + delta * sum_pt + gamma * sum_ct) - << endl; -}
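In the same notation, primal_kl_divergence implements the simplification derived in its comment, \( \mathrm{KL}(q \,\|\, p_\theta) = -\sum_y q(y)\, \lambda \cdot \phi(x,y) - \log Z(\lambda) \), and the summary line printed by print_primal_dual combines the pieces as

\[ \text{objective} = \mathcal{L}_\theta - \mathrm{KL}(q \,\|\, p_\theta) + \delta\, V_{\text{phrase}} + \gamma\, V_{\text{context}}, \]

matching the expression (likelihood - kl + delta * sum_pt + gamma * sum_ct) in the cout statement.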
diff --git a/gi/posterior-regularisation/invert.hh b/gi/posterior-regularisation/invert.hh deleted file mode 100644 index d06356e9..00000000 --- a/gi/posterior-regularisation/invert.hh +++ /dev/null @@ -1,45 +0,0 @@ -// The following code inverts the matrix input using LU-decomposition with -// backsubstitution of unit vectors. Reference: Numerical Recipes in C, 2nd -// ed., by Press, Teukolsky, Vetterling & Flannery. -// Code written by Fredrik Orderud. -// http://www.crystalclearsoftware.com/cgi-bin/boost_wiki/wiki.pl?LU_Matrix_Inversion - -#ifndef INVERT_MATRIX_HPP -#define INVERT_MATRIX_HPP - -// REMEMBER to update "lu.hpp" header includes from boost-CVS -#include <boost/numeric/ublas/vector.hpp> -#include <boost/numeric/ublas/vector_proxy.hpp> -#include <boost/numeric/ublas/matrix.hpp> -#include <boost/numeric/ublas/triangular.hpp> -#include <boost/numeric/ublas/lu.hpp> -#include <boost/numeric/ublas/io.hpp> - -namespace ublas = boost::numeric::ublas; - -/* Matrix inversion routine. - Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */ -template<class T> -bool invert_matrix(const ublas::matrix<T>& input, ublas::matrix<T>& inverse) -{ - using namespace boost::numeric::ublas; - typedef permutation_matrix<std::size_t> pmatrix; - // create a working copy of the input - matrix<T> A(input); - // create a permutation matrix for the LU-factorization - pmatrix pm(A.size1()); - - // perform LU-factorization - int res = lu_factorize(A,pm); - if( res != 0 ) return false; - - // create identity matrix of "inverse" - inverse.assign(ublas::identity_matrix<T>(A.size1())); - - // backsubstitute to get the inverse - lu_substitute(A, pm, inverse); - - return true; -} - -#endif //INVERT_MATRIX_HPP diff --git a/gi/posterior-regularisation/linesearch.py b/gi/posterior-regularisation/linesearch.py deleted file mode 100644 index 5a3f2e9c..00000000 --- a/gi/posterior-regularisation/linesearch.py +++ /dev/null @@ -1,58 +0,0 @@ -## Automatically adapted for scipy Oct 07, 2005 by convertcode.py - -from scipy.optimize import minpack2 -import numpy - -import __builtin__ -pymin = __builtin__.min - -def line_search(f, myfprime, xk, pk, gfk, old_fval, old_old_fval, - args=(), c1=1e-4, c2=0.9, amax=50): - - fc = 0 - gc = 0 - phi0 = old_fval - derphi0 = numpy.dot(gfk,pk) - alpha1 = pymin(1.0,1.01*2*(phi0-old_old_fval)/derphi0) - # trevor: added this test - alpha1 = pymin(alpha1,amax) - - if isinstance(myfprime,type(())): - eps = myfprime[1] - fprime = myfprime[0] - newargs = (f,eps) + args - gradient = False - else: - fprime = myfprime - newargs = args - gradient = True - - xtol = 1e-14 - amin = 1e-8 - isave = numpy.zeros((2,), numpy.intc) - dsave = numpy.zeros((13,), float) - task = 'START' - fval = old_fval - gval = gfk - - while 1: - stp,fval,derphi,task = minpack2.dcsrch(alpha1, phi0, derphi0, c1, c2, - xtol, task, amin, amax,isave,dsave) - #print 'minpack2.dcsrch', alpha1, phi0, derphi0, c1, c2, xtol, task, amin, amax,isave,dsave - #print 'returns', stp,fval,derphi,task - - if task[:2] == 'FG': - alpha1 = stp - fval = f(xk+stp*pk,*args) - fc += 1 - gval = fprime(xk+stp*pk,*newargs) - if gradient: gc += 1 - else: fc += len(xk) + 1 - phi0 = fval - derphi0 = numpy.dot(gval,pk) - else: - break - - if task[:5] == 'ERROR' or task[:4] == 'WARN': - stp = None # failed - return stp, fc, gc, fval, old_fval, gval diff --git a/gi/posterior-regularisation/log_add.hh b/gi/posterior-regularisation/log_add.hh deleted file mode 100644 index e0620c5a..00000000 --- a/gi/posterior-regularisation/log_add.hh +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef log_add_hh -#define log_add_hh - -#include <cmath> -#include <limits> -#include <cassert> -#include <algorithm> - -template <typename T> -struct Log -{ - static T zero() { return -std::numeric_limits<T>::infinity(); } - - static T add(T l1, T l2) - { - if (l1 == zero()) return l2; - if (l1 > l2) - return l1 + std::log(1 + exp(l2 - l1)); - else - return l2 + std::log(1 + exp(l1 - l2)); - } - - static T subtract(T l1, T l2) - { - //assert(l1 >= l2); - return l1 + log(1 - exp(l2 - l1)); - } -}; - -#endif
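Log<T>::add above is the standard log-space addition (log-sum-exp) identity; factoring out the larger argument keeps the remaining exponent non-positive, so the exp can underflow harmlessly but never overflow:

\[ \log\bigl(e^{l_1} + e^{l_2}\bigr) = \max(l_1, l_2) + \log\bigl(1 + e^{-|l_1 - l_2|}\bigr). \]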
diff --git a/gi/posterior-regularisation/prjava.jar b/gi/posterior-regularisation/prjava.jar deleted file mode 120000 index da8bf761..00000000 --- a/gi/posterior-regularisation/prjava.jar +++ /dev/null @@ -1 +0,0 @@ -prjava/prjava-20100708.jar \ No newline at end of file diff --git a/gi/posterior-regularisation/prjava/Makefile b/gi/posterior-regularisation/prjava/Makefile deleted file mode 100755 index bd3bfca0..00000000 --- a/gi/posterior-regularisation/prjava/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -all: - ant dist - -check: - echo no tests - -clean: - ant clean diff --git a/gi/posterior-regularisation/prjava/build.xml b/gi/posterior-regularisation/prjava/build.xml deleted file mode 100644 index 7222b3c8..00000000 --- a/gi/posterior-regularisation/prjava/build.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/gi/posterior-regularisation/prjava/lib/commons-math-2.1.jar b/gi/posterior-regularisation/prjava/lib/commons-math-2.1.jar deleted file mode 100644 index 43b4b369..00000000 Binary files a/gi/posterior-regularisation/prjava/lib/commons-math-2.1.jar and /dev/null differ diff --git a/gi/posterior-regularisation/prjava/lib/jopt-simple-3.2.jar b/gi/posterior-regularisation/prjava/lib/jopt-simple-3.2.jar deleted file mode 100644 index 56373621..00000000 Binary files a/gi/posterior-regularisation/prjava/lib/jopt-simple-3.2.jar and /dev/null differ diff --git a/gi/posterior-regularisation/prjava/lib/trove-2.0.2.jar b/gi/posterior-regularisation/prjava/lib/trove-2.0.2.jar deleted file mode 100644 index 3e59fbf3..00000000 Binary files a/gi/posterior-regularisation/prjava/lib/trove-2.0.2.jar and /dev/null differ diff --git a/gi/posterior-regularisation/prjava/src/arr/F.java b/gi/posterior-regularisation/prjava/src/arr/F.java deleted file mode 100644 index be0a6ed6..00000000 --- a/gi/posterior-regularisation/prjava/src/arr/F.java +++ /dev/null @@ -1,99 +0,0 @@ -package arr; - -import java.util.Arrays; -import java.util.Random; - -public class F { - public static Random rng = new Random(); - - public static void randomise(double probs[]) - { - randomise(probs, true); - } - - public static void randomise(double probs[], boolean normalise) - { - double z = 0; - for (int i = 0; i < probs.length; ++i) - { - probs[i] = 10 + rng.nextDouble(); - if (normalise) - z += probs[i]; - } - - if (normalise) - for (int i = 0; i < probs.length; ++i) - probs[i] /= z; - } - - public static void uniform(double probs[]) - { - for (int i = 0; i < probs.length; ++i) - probs[i] = 1.0 / probs.length; - } - - public static void l1normalize(double [] a){ - double sum=0; - for(int i=0;i m) - { - m = probs[i]; - mi = i; - } - } - return mi; - } - -}
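The helpers in F above are the utilities used to initialise multinomials before EM. A minimal usage sketch follows (FDemo is a hypothetical class written only for illustration; it also assumes the method whose header was mangled in extraction above is the usual argmax(double[]), whose tail - return mi; - is still visible):

    import java.util.Arrays;

    public class FDemo {
        public static void main(String[] args) {
            double[] probs = new double[4];
            // near-uniform random start: each entry is 10 + U(0,1),
            // then the vector is normalised to sum to one
            arr.F.randomise(probs);
            System.out.println(Arrays.toString(probs));
            // index of the largest entry, assuming argmax(double[]) as above
            System.out.println(arr.F.argmax(probs));
        }
    }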
diff --git a/gi/posterior-regularisation/prjava/src/data/Corpus.java b/gi/posterior-regularisation/prjava/src/data/Corpus.java deleted file mode 100644 index 425ede11..00000000 --- a/gi/posterior-regularisation/prjava/src/data/Corpus.java +++ /dev/null @@ -1,233 +0,0 @@ -package data; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Scanner; - -public class Corpus { - - public static final String alphaFilename="../posdata/corpus.alphabet"; - public static final String tagalphaFilename="../posdata/corpus.tag.alphabet"; - -// public static final String START_SYM=""; - public static final String END_SYM=""; - public static final String NUM_TOK=""; - - public static final String UNK_TOK=""; - - private ArrayList<String[]> sent; - private ArrayList<int[]> data; - - public ArrayList<String[]> tag; - public ArrayList<int[]> tagData; - - public static boolean convertNumTok=true; - - private HashMap<String, Integer> freq; - public HashMap<String, Integer> vocab; - - public HashMap<String, Integer> tagVocab; - private int tagV; - - private int V; - - public static void main(String[] args) { - Corpus c=new Corpus("../posdata/en_test.conll"); - System.out.println( - Arrays.toString(c.get(0)) - ); - System.out.println( - Arrays.toString(c.getInt(0)) - ); - - System.out.println( - Arrays.toString(c.get(1)) - ); - System.out.println( - Arrays.toString(c.getInt(1)) - ); - } - - public Corpus(String filename, HashMap<String, Integer> dict){ - V=0; - tagV=0; - freq=new HashMap<String, Integer>(); - tagVocab=new HashMap<String, Integer>(); - vocab=dict; - - sent=new ArrayList<String[]>(); - tag=new ArrayList<String[]>(); - - Scanner sc=io.FileUtil.openInFile(filename); - ArrayList<String> s=new ArrayList<String>(); - // s.add(START_SYM); - while(sc.hasNextLine()){ - String line=sc.nextLine(); - String toks[]=line.split("\t"); - if(toks.length<2){ - s.add(END_SYM); - sent.add(s.toArray(new String[0])); - s=new ArrayList<String>(); - // s.add(START_SYM); - continue; - } - String tok=toks[1].toLowerCase(); - s.add(tok); - } - sc.close(); - - buildData(); - } - - public Corpus(String filename){ - V=0; - freq=new HashMap<String, Integer>(); - vocab=new HashMap<String, Integer>(); - tagVocab=new HashMap<String, Integer>(); - - sent=new ArrayList<String[]>(); - tag=new ArrayList<String[]>(); - - System.out.println("Reading:"+filename); - - Scanner sc=io.FileUtil.openInFile(filename); - ArrayList<String> s=new ArrayList<String>(); - ArrayList<String> tags=new ArrayList<String>(); - //s.add(START_SYM); - while(sc.hasNextLine()){ - String line=sc.nextLine(); - String toks[]=line.split("\t"); - if(toks.length<2){ - s.add(END_SYM); - tags.add(END_SYM); - if(s.size()>2){ - sent.add(s.toArray(new String[0])); - tag.add(tags.toArray(new String [0])); - } - s=new ArrayList<String>(); - tags=new ArrayList<String>(); - // s.add(START_SYM); - continue; - } - - String tok=toks[1].toLowerCase(); - if(convertNumTok && tok.matches(".*\\d.*")){ - tok=NUM_TOK; - } - s.add(tok); - - if(toks.length>3){ - tok=toks[3].toLowerCase(); - }else{ - tok="_"; - } - tags.add(tok); - - } - sc.close(); - - for(int i=0;i(); - for(int i=0;i(); - for(int i=0;i2){ - vocab.put(key, V); - V++; - } - } - io.SerializedObjects.writeSerializedObject(vocab, alphaFilename); - io.SerializedObjects.writeSerializedObject(tagVocab,tagalphaFilename); - } - - private void addTag(String tag){ - Integer i=tagVocab.get(tag); - if(i==null){ - tagVocab.put(tag, tagV); - tagV++; - } - } - -}
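Parts of Corpus above were destroyed in extraction (several loops collapsed to fragments such as for(int i=0;i2){), but the surviving threshold test freq.get(key)>2 shows the intent of the vocabulary-building step: count token frequencies, then assign integer ids only to types seen more than twice. A self-contained sketch of that standard pattern, under the assumption that the lost loops did plain frequency counting (VocabSketch is hypothetical, not the original code):

    import java.util.HashMap;

    public class VocabSketch {
        public static HashMap<String, Integer> buildVocab(String[][] sents) {
            HashMap<String, Integer> freq = new HashMap<String, Integer>();
            for (String[] sent : sents)            // count every token occurrence
                for (String tok : sent) {
                    Integer c = freq.get(tok);
                    freq.put(tok, c == null ? 1 : c + 1);
                }
            HashMap<String, Integer> vocab = new HashMap<String, Integer>();
            int V = 0;
            for (String key : freq.keySet())       // id only the frequent types,
                if (freq.get(key) > 2)             // mirroring freq.get(key)>2 above
                    vocab.put(key, V++);
            return vocab;
        }
    }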
diff --git a/gi/posterior-regularisation/prjava/src/hmm/HMM.java b/gi/posterior-regularisation/prjava/src/hmm/HMM.java deleted file mode 100644 index 17a4679f..00000000 --- a/gi/posterior-regularisation/prjava/src/hmm/HMM.java +++ /dev/null @@ -1,579 +0,0 @@ -package hmm; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Scanner; - -public class HMM { - - - //trans[i][j]=prob of going FROM i to j - double [][]trans; - double [][]emit; - double []pi; - int [][]data; - int [][]tagdata; - - double logtrans[][]; - - public HMMObjective o; - - public static void main(String[] args) { - - } - - public HMM(int n_state,int n_emit,int [][]data){ - trans=new double [n_state][n_state]; - emit=new double[n_state][n_emit]; - pi=new double [n_state]; - System.out.println(" random initial parameters"); - fillRand(trans); - fillRand(emit); - fillRand(pi); - - this.data=data; - - } - - private void fillRand(double [][] a){ - for(int i=0;i=0;n--){ - for(int i=0;imaxprob){ - maxprob=p[seq.length-1][i]; - maxIdx=i; - } - } - int ans[]=new int [seq.length]; - ans[seq.length-1]=maxIdx; - for(int i=seq.length-2;i>=0;i--){ - ans[i]=backp[i+1][ans[i+1]]; - } - return ans; - } - - public double l1norm(double a[]){ - double norm=0; - for(int i=0;i s=new ArrayList(); - int state=sample(pi); - int sym=sample(emit[state]); - while(sym!=terminalSym){ - s.add(sym); - state=sample(trans[state]); - sym=sample(emit[state]); - } - - int ans[]=new int [s.size()]; - for(int i=0;i=r){ - return i; - } - } - return p.length-1; - } - - public void train(int tagdata[][]){ - double trans_exp_cnt[][]=new double [trans.length][trans.length]; - double emit_exp_cnt[][]=new double[trans.length][emit[0].length]; - double start_exp_cnt[]=new double[trans.length]; - - for(int i=0;imaxwt[i][d[sentNum][n]]){ - maxwt[i][d[sentNum][n]]=py; - } - - } - } - - //the last state - int len=post.length; - for(int i=0;imaxwt[i][d[sentNum][len-1]]){ - maxwt[i][d[sentNum][len-1]]=py; - } - - } - - } - - } - -}//end of class diff --git a/gi/posterior-regularisation/prjava/src/hmm/HMMObjective.java b/gi/posterior-regularisation/prjava/src/hmm/HMMObjective.java deleted file mode 100644 index 70b6c966..00000000 --- a/gi/posterior-regularisation/prjava/src/hmm/HMMObjective.java +++ /dev/null @@ -1,351 +0,0 @@ -package hmm; - -import gnu.trove.TIntArrayList; -import optimization.gradientBasedMethods.ProjectedGradientDescent; -import optimization.gradientBasedMethods.ProjectedObjective; -import optimization.gradientBasedMethods.stats.OptimizerStats; -import optimization.linesearch.ArmijoLineSearchMinimizationAlongProjectionArc; -import optimization.linesearch.InterpolationPickFirstStep; -import optimization.linesearch.LineSearchMethod; -import optimization.projections.SimplexProjection; -import optimization.stopCriteria.CompositeStopingCriteria; -import optimization.stopCriteria.ProjectedGradientL2Norm; -import optimization.stopCriteria.StopingCriteria; -import optimization.stopCriteria.ValueDifference; - -public class HMMObjective extends ProjectedObjective{ - - - private static final double GRAD_DIFF = 3; - public static double INIT_STEP_SIZE=10; - public static double VAL_DIFF=1000; - - private HMM hmm; - double[] newPoint; - - //posterior[sent num][tok num][tag]=index into lambda - private int posteriorMap[][][]; - //projection[word][tag].get(occurrence)=index into lambda - private TIntArrayList projectionMap[][]; - - //Size of the simplex - public double scale=10; - private SimplexProjection projection; - - private int wordFreq[]; - private static int MIN_FREQ=10; - private int numWordsToProject=0; - - private int n_param; - - public double loglikelihood; - - public HMMObjective(HMM h){ - hmm=h; - - countWords(); - buildMap(); - - gradient=new double [n_param]; - projection = new SimplexProjection(scale); - newPoint = new double[n_param]; - setInitialParameters(new double[n_param]); - - } - - /**@brief counts word frequency in the corpus - * - */ - private void countWords(){ - wordFreq=new int [hmm.emit[0].length]; - for(int i=0;i