Optionally use an unlimited combiner cache in the extractor (combiner_size 0 = no limit, 1 = cache disabled)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@27 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-25 01:45:03 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-25 01:45:03 +0000
commit: 8a71c56ee9a59f30fa2a7765905e2b8e9a023025 (patch)
tree: 1b04e462eb9aa7a79f641fbf6a3f1083d19b4b53
parent: e45f99a03c6d8e0cf8453aa9419920d0faf741ca (diff)
4 files changed, 16 insertions, 10 deletions
diff --git a/extools/extractor.cc b/extools/extractor.cc
index a3791d2a..4f9b4dc6 100644
--- a/extools/extractor.cc
+++ b/extools/extractor.cc
@@ -34,7 +34,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("base_phrase,B", "Write base phrases")
         ("base_phrase_spans", "Write base sentences and phrase spans")
         ("bidir,b", "Extract bidirectional rules (for computing p(f|e) in addition to p(e|f))")
-        ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 0 to disable cache.")
+        ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.")
         ("silent", "Write nothing to stderr except errors")
         ("phrase_context,C", "Write base phrase contexts")
         ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts")
@@ -86,7 +86,9 @@ void WriteBasePhraseSpans(const AnnotatedParallelSentence& sentence,
 }
 
 struct CountCombiner {
-  CountCombiner(size_t csize) : combiner_size(csize) {}
+  CountCombiner(const size_t& csize) : combiner_size(csize) {
+    if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; }
+  }
   ~CountCombiner() {
     if (!cache.empty()) WriteAndClearCache();
   }
@@ -95,13 +97,14 @@ struct CountCombiner {
              const vector<WordID>& val,
              const int count_type,
              const vector<pair<short,short> >& aligns) {
-    if (combiner_size > 0) {
+    if (combiner_size != 1) {
       RuleStatistics& v = cache[key][val];
       float newcount = v.counts.add_value(count_type, 1.0f);
       // hack for adding alignments
       if (newcount < 7.0f && aligns.size() > v.aligns.size())
         v.aligns = aligns;
-      if (cache.size() > combiner_size) WriteAndClearCache();
+      if (combiner_size > 1 && cache.size() > combiner_size)
+        WriteAndClearCache();
     } else {
       cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| ";
       cout << RuleStatistics(count_type, 1.0f, aligns) << endl;
@@ -300,8 +303,6 @@ int main(int argc, char** argv) {
     string sdefault_cat = conf["default_category"].as<string>();
     default_cat = -TD::Convert(sdefault_cat);
     cerr << "Default category: " << sdefault_cat << endl;
-  } else {
-    cerr << "No default category (use --default_category if you want to set one)\n";
   }
   ReadFile rf(conf["input"].as<string>());
   istream& in = *rf.stream();
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc
index 05cbb441..efe666e2 100644
--- a/gi/clda/src/clda.cc
+++ b/gi/clda/src/clda.cc
@@ -61,13 +61,13 @@ int main(int argc, char** argv) {
   double alpha = 50.0 / num_classes;
   vector<CRP<int> > dr(zji.size(), CRP<int>(beta)); // dr[i] describes the probability of using a topic in document i
   vector<CRP<int> > wr(num_classes, CRP<int>(alpha)); // wr[k] describes the probability of generating a word in topic k
-      int random_topic = rng.next() * num_classes;
   for (int j = 0; j < zji.size(); ++j) {
     const size_t num_words = wji[j].size();
     vector<int>& zj = zji[j];
     const vector<int>& wj = wji[j];
     zj.resize(num_words);
     for (int i = 0; i < num_words; ++i) {
+      int random_topic = rng.next() * num_classes;
       if (random_topic == num_classes) { --random_topic; }
       zj[i] = random_topic;
       const int word = wj[i];
diff --git a/gi/clda/src/crp.h b/gi/clda/src/crp.h
index b01a7f47..9d35857e 100644
--- a/gi/clda/src/crp.h
+++ b/gi/clda/src/crp.h
@@ -3,7 +3,6 @@
 
 // shamelessly adapted from code by Phil Blunsom and Trevor Cohn
 
-#include <map>
 #include <boost/functional/hash.hpp>
 #include <tr1/unordered_map>
 
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index a0f8c271..6199e4c9 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -7,6 +7,7 @@ use Getopt::Long "GetOptions";
 my $GZIP = 'gzip';
 my $ZCAT = 'gunzip -c';
 my $BASE_PHRASE_MAX_SIZE = 10;
+my $COMPLETE_CACHE = 1;
 my $ITEMS_IN_MEMORY = 3000000;  # cache size in extractors
 my $NUM_TOPICS = 50;
 my $NUM_SAMPLES = 100;
@@ -95,8 +96,13 @@ sub extract_context {
  if (-e $OUT_CONTEXTS) {
    print STDERR "$OUT_CONTEXTS exists, reusing...\n";
  } else {
-   safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
- }
+   my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+   if ($COMPLETE_CACHE) {
+     print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
+     $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+   }
+   safesystem($cmd) or die "Failed to extract contexts.";
+  }
 }
 
 sub contexts_to_documents {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-25 01:45:03 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-25 01:45:03 +0000
commit	8a71c56ee9a59f30fa2a7765905e2b8e9a023025 (patch)
tree	1b04e462eb9aa7a79f641fbf6a3f1083d19b4b53
parent	e45f99a03c6d8e0cf8453aa9419920d0faf741ca (diff)