summary | refs | log | tree | commit | diff
path: root/gi/pipeline
diff options
context:
space:
mode:
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-25 01:45:03 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-25 01:45:03 +0000
commit: 595ab284eef74e540f1e4aef7458a0cf6790482d (patch)
tree: 865c1d5de91e363b50d77d59c4bfd0bf15d312ac /gi/pipeline
parent: 60a7aa278e9d24e0927fabf12581e8b36b774a2c (diff)
optionally use unlimited cache
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@27 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 10
1 files changed, 8 insertions, 2 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index a0f8c271..6199e4c9 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -7,6 +7,7 @@ use Getopt::Long "GetOptions";
my $GZIP = 'gzip';
my $ZCAT = 'gunzip -c';
my $BASE_PHRASE_MAX_SIZE = 10;
+my $COMPLETE_CACHE = 1;
my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
my $NUM_TOPICS = 50;
my $NUM_SAMPLES = 100;
@@ -95,8 +96,13 @@ sub extract_context {
if (-e $OUT_CONTEXTS) {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
- safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
- }
+ my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+ if ($COMPLETE_CACHE) {
+ print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
+ $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+ }
+ safesystem($cmd) or die "Failed to extract contexts.";
+ }
}
sub contexts_to_documents {