summary | refs | log | tree | commit | diff
path: root/gi
diff options
context:
space:
mode:
author trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:59:07 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:59:07 +0000
commit 7776119e54c477a27fb0617d8bf8b483ac78898e (patch)
tree 590676de6f338e3404d7d334b454db31067f75ba /gi
parent cdc426ec830a2bb33141c2b968482f5da54af482 (diff)
Cleaned up extractor
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@379 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 11
1 file changed, 4 insertions, 7 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 37051721..13a2b421 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -266,18 +266,15 @@ sub extract_context {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
my $ccopt = "-c $ITEMS_IN_MEMORY";
- my $pipe = "| $REDUCER ";
+ my $postsort = "| $REDUCER ";
if ($COMPLETE_CACHE) {
print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
$ccopt = "-c 0";
- $pipe = "";
+ $postsort = "" unless ($PRESERVE_PHRASES);
}
+ my $presort = ($PRESERVE_PHRASES ? "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " : "");
- if ($PRESERVE_PHRASES) {
- $pipe = "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " . $pipe;
- }
-
- my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS $pipe | $GZIP > $OUT_CONTEXTS";
+ my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE $presort | $SORT_KEYS $postsort | $GZIP > $OUT_CONTEXTS";
safesystem($cmd) or die "Failed to extract contexts.";
}
}