diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-25 01:45:03 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-25 01:45:03 +0000 |
commit | 8a71c56ee9a59f30fa2a7765905e2b8e9a023025 (patch) | |
tree | 1b04e462eb9aa7a79f641fbf6a3f1083d19b4b53 /gi/pipeline | |
parent | e45f99a03c6d8e0cf8453aa9419920d0faf741ca (diff) |
optionally use unlimited cache
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@27 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 10 |
1 files changed, 8 insertions, 2 deletions
```diff
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index a0f8c271..6199e4c9 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -7,6 +7,7 @@ use Getopt::Long "GetOptions";
 my $GZIP = 'gzip';
 my $ZCAT = 'gunzip -c';
 my $BASE_PHRASE_MAX_SIZE = 10;
+my $COMPLETE_CACHE = 1;
 my $ITEMS_IN_MEMORY = 3000000; # cache size in extractors
 my $NUM_TOPICS = 50;
 my $NUM_SAMPLES = 100;
@@ -95,8 +96,13 @@ sub extract_context {
   if (-e $OUT_CONTEXTS) {
     print STDERR "$OUT_CONTEXTS exists, reusing...\n";
   } else {
-    safesystem("$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS") or die "Failed to extract contexts.";
-  }
+    my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+    if ($COMPLETE_CACHE) {
+      print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
+      $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+    }
+    safesystem($cmd) or die "Failed to extract contexts.";
+  }
 }
 
 sub contexts_to_documents {
```