summaryrefslogtreecommitdiff
path: root/gi/pipeline/local-gi-pipeline.pl
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pipeline/local-gi-pipeline.pl')
-rwxr-xr-xgi/pipeline/local-gi-pipeline.pl30
1 files changed, 22 insertions, 8 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index db2969c7..e832e556 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -21,6 +21,7 @@ my $BIDIR = 0;
my $TOPICS_CONFIG = "pyp-topics.conf";
my $LANGUAGE = "target";
my $LABEL_THRESHOLD = 0;
+my $PRESERVE_PHRASES;
my $MODEL = "pyp";
my $NUM_ITERS = 100;
@@ -45,11 +46,13 @@ my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";
my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";
+my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl";
+my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR,
- $S2L, $C2D, $TOPIC_TRAIN, $SPLIT);
+ $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS);
my $BACKOFF_GRAMMAR;
my $DEFAULT_CAT;
@@ -77,7 +80,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'pr-flags=s' => \$PR_FLAGS,
'tagged_corpus=s' => \$TAGGED_CORPUS,
'language=s' => \$LANGUAGE,
- 'get_name_only' => \$NAME_SHORTCUT
+ 'get_name_only' => \$NAME_SHORTCUT,
+ 'preserve_phrases' => \$PRESERVE_PHRASES,
);
if ($NAME_SHORTCUT) {
$NUM_TOPICS = $NUM_TOPICS_FINE;
@@ -185,6 +189,7 @@ sub setup_data {
die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS;
my $opt="";
$opt = "-s" if ($LANGUAGE eq "source");
+ $opt = "-a" if ($PRESERVE_PHRASES);
my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER";
safesystem($cmd) or die "Failed to extract contexts.";
} else {
@@ -260,11 +265,19 @@ sub extract_context {
if (-e $OUT_CONTEXTS) {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
- my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+ my $ccopt = "-c $ITEMS_IN_MEMORY";
+ my $pipe = "| $REDUCER ";
if ($COMPLETE_CACHE) {
print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
- $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+ $ccopt = "-c 0";
+ $pipe = "";
}
+
+ if ($PRESERVE_PHRASES) {
+ $pipe = "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " . $pipe;
+ }
+
+ my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS $pipe | $GZIP > $OUT_CONTEXTS";
safesystem($cmd) or die "Failed to extract contexts.";
}
}
@@ -331,14 +344,15 @@ sub label_spans_with_topics {
if (-e $OUT_SPANS) {
print STDERR "$OUT_SPANS exists, reusing...\n";
} else {
- my $l = "tt";
+ my $extra = "tt";
if ($LANGUAGE eq "source") {
- $l = "ss";
+ $extra = "ss";
} elsif ($LANGUAGE eq "both") {
- $l = "bb";
+ $extra = "bb";
} else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" };
+ $extra = $extra . " tok,tag" if ($PRESERVE_PHRASES);
safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
- safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans";
+ safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans";
unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
}