From 8a71c56ee9a59f30fa2a7765905e2b8e9a023025 Mon Sep 17 00:00:00 2001 From: redpony Date: Fri, 25 Jun 2010 01:45:03 +0000 Subject: optionally use unlimited cache git-svn-id: https://ws10smt.googlecode.com/svn/trunk@27 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/extractor.cc | 13 +++++++------ gi/clda/src/clda.cc | 2 +- gi/clda/src/crp.h | 1 - gi/pipeline/local-gi-pipeline.pl | 10 ++++++++-- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/extools/extractor.cc b/extools/extractor.cc index a3791d2a..4f9b4dc6 100644 --- a/extools/extractor.cc +++ b/extools/extractor.cc @@ -34,7 +34,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("base_phrase,B", "Write base phrases") ("base_phrase_spans", "Write base sentences and phrase spans") ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))") - ("combiner_size,c", po::value()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 0 to disable cache.") + ("combiner_size,c", po::value()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.") ("silent", "Write nothing to stderr except errors") ("phrase_context,C", "Write base phrase contexts") ("phrase_context_size,S", po::value()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") @@ -86,7 +86,9 @@ void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence, } struct CountCombiner { - CountCombiner(size_t csize) : combiner_size(csize) {} + CountCombiner(const size_t& csize) : combiner_size(csize) { + if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } + } ~CountCombiner() { if (!cache.empty()) WriteAndClearCache(); } @@ -95,13 +97,14 @@ struct CountCombiner { const vector& val, const int count_type, const vector >& aligns) { - if (combiner_size > 0) { + if (combiner_size != 1) { RuleStatistics& v = cache[key][val]; float newcount = v.counts.add_value(count_type, 1.0f); // hack for adding alignments if (newcount < 7.0f && aligns.size() > v.aligns.size()) v.aligns = aligns; - if (cache.size() > combiner_size) WriteAndClearCache(); + if (combiner_size > 1 && cache.size() > combiner_size) + WriteAndClearCache(); } else { cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| "; cout << RuleStatistics(count_type, 1.0f, aligns) << endl; @@ -300,8 +303,6 @@ int main(int argc, char** argv) { string sdefault_cat = conf["default_category"].as(); default_cat = -TD::Convert(sdefault_cat); cerr << "Default category: " << sdefault_cat << endl; - } else { - cerr << "No default category (use --default_category if you want to set one)\n"; } ReadFile rf(conf["input"].as()); istream& in = *rf.stream(); diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc index 05cbb441..efe666e2 100644 --- a/gi/clda/src/clda.cc +++ b/gi/clda/src/clda.cc @@ -61,13 +61,13 @@ int main(int argc, char** argv) { double alpha = 50.0 / num_classes; vector > dr(zji.size(), CRP(beta)); // dr[i] describes the probability of using a topic in document i vector > wr(num_classes, CRP(alpha)); // wr[k] describes the probability of generating a word in topic k - int random_topic = rng.next() * num_classes; for (int j = 0; j < zji.size(); ++j) { const size_t num_words = wji[j].size(); vector& zj = zji[j]; const vector& wj = wji[j]; zj.resize(num_words); for (int i = 0; i < num_words; ++i) { + int random_topic = rng.next() * num_classes; if (random_topic == num_classes) { --random_topic; } zj[i] = random_topic; const int word = wj[i]; diff --git a/gi/clda/src/crp.h b/gi/clda/src/crp.h index b01a7f47..9d35857e 100644 --- a/gi/clda/src/crp.h +++ b/gi/clda/src/crp.h @@ -3,7 +3,6 @@ // shamelessly adapted from code by Phil Blunsom and Trevor Cohn -#include #include #include diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index a0f8c271..6199e4c9 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -7,6 +7,7 @@ use Getopt::Long "GetOptions"; my $GZIP = 'gzip'; my $ZCAT = 'gunzip -c'; my $BASE_PHRASE_MAX_SIZE = 10; +my $COMPLETE_CACHE = 1; my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors my $NUM_TOPICS = 50; my $NUM_SAMPLES = 100; @@ -95,8 +96,13 @@ sub extract_context { if (-e $OUT_CONTEXTS) { print STDERR "$OUT_CONTEXTS exists, reusing...\n"; } else { - safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts."; - } + my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; + if ($COMPLETE_CACHE) { + print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; + $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; + } + safesystem($cmd) or die "Failed to extract contexts."; + } } sub contexts_to_documents { -- cgit v1.2.3