Diffstat (limited to 'gi')
 gi/clda/src/clda.cc              |  2 +-
 gi/clda/src/crp.h                |  1 -
 gi/pipeline/local-gi-pipeline.pl | 10 ++++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc
index 05cbb441..efe666e2 100644
--- a/gi/clda/src/clda.cc
+++ b/gi/clda/src/clda.cc
@@ -61,13 +61,13 @@ int main(int argc, char** argv) {
   double alpha = 50.0 / num_classes;
   vector<CRP<int> > dr(zji.size(), CRP<int>(beta)); // dr[i] describes the probability of using a topic in document i
   vector<CRP<int> > wr(num_classes, CRP<int>(alpha)); // wr[k] describes the probability of generating a word in topic k
-  int random_topic = rng.next() * num_classes;
   for (int j = 0; j < zji.size(); ++j) {
     const size_t num_words = wji[j].size();
     vector<int>& zj = zji[j];
     const vector<int>& wj = wji[j];
     zj.resize(num_words);
     for (int i = 0; i < num_words; ++i) {
+      int random_topic = rng.next() * num_classes;
       if (random_topic == num_classes) { --random_topic; }
       zj[i] = random_topic;
       const int word = wj[i];
diff --git a/gi/clda/src/crp.h b/gi/clda/src/crp.h
index b01a7f47..9d35857e 100644
--- a/gi/clda/src/crp.h
+++ b/gi/clda/src/crp.h
@@ -3,7 +3,6 @@
 
 // shamelessly adapted from code by Phil Blunsom and Trevor Cohn
 
-#include <map>
 #include <boost/functional/hash.hpp>
 #include <tr1/unordered_map>
 
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index a0f8c271..6199e4c9 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -7,6 +7,7 @@ use Getopt::Long "GetOptions";
 my $GZIP = 'gzip';
 my $ZCAT = 'gunzip -c';
 my $BASE_PHRASE_MAX_SIZE = 10;
+my $COMPLETE_CACHE = 1;
 my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
 my $NUM_TOPICS = 50;
 my $NUM_SAMPLES = 100;
@@ -95,8 +96,13 @@ sub extract_context {
   if (-e $OUT_CONTEXTS) {
     print STDERR "$OUT_CONTEXTS exists, reusing...\n";
   } else {
-    safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
-  }
+    my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+    if ($COMPLETE_CACHE) {
+      print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
+      $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+    }
+    safesystem($cmd) or die "Failed to extract contexts.";
+  }
 }
 
 sub contexts_to_documents {
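
Context on the clda.cc hunk: drawing random_topic once before the loops gave every token in the corpus the same initial topic, so the Gibbs sampler started from a degenerate state; moving the draw inside the per-token loop gives each token an independent uniform initial topic. Below is a minimal standalone sketch of that corrected initialization pattern, assuming std::mt19937 in place of the project's rng and toy corpus sizes; it is not the project's code.

// Minimal sketch (assumed names, not the project's code): each token draws
// its own initial topic uniformly from [0, num_classes).
#include <random>
#include <vector>

int main() {
  const int num_classes = 50;                           // number of topics (assumed value)
  std::mt19937 rng(42);                                 // stand-in for the project's rng
  std::uniform_real_distribution<double> u01(0.0, 1.0);

  // zji[j][i] = initial topic of token i in document j (toy sizes)
  std::vector<std::vector<int> > zji(3, std::vector<int>(4));
  for (size_t j = 0; j < zji.size(); ++j) {
    for (size_t i = 0; i < zji[j].size(); ++i) {
      // fresh draw per token, unlike the single pre-loop draw the patch removes
      int random_topic = static_cast<int>(u01(rng) * num_classes);
      if (random_topic == num_classes) --random_topic;  // defensive clamp, mirroring the patch
      zji[j][i] = random_topic;
    }
  }
  return 0;
}

The local-gi-pipeline.pl hunk is independent: when $COMPLETE_CACHE is set, the extractor runs with an unlimited cache (-c 0) and the $REDUCER stage is dropped from the pipe, presumably because the unbounded in-memory cache already combines duplicate context counts.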