summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x  gi/pipeline/local-gi-pipeline.pl  12
-rwxr-xr-x  gi/pyp-topics/scripts/spans2labels.py  2
2 files changed, 11 insertions, 3 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index acd6b94c..17f39a62 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -14,6 +14,9 @@ my $NUM_SAMPLES = 100;
my $CONTEXT_SIZE = 1;
my $BIDIR = 1;
+my $HIERARCHICAL_TOPICS = 0;
+my $FILTER_SINGLETONS = 0;
+
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src";
@@ -39,6 +42,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'topics=i' => \$NUM_TOPICS,
'trg_context=i' => \$CONTEXT_SIZE,
'samples=i' => \$NUM_SAMPLES,
+ 'hierarchical-topics' => \$HIERARCHICAL_TOPICS,
+ 'filter-singletons' => \$FILTER_SINGLETONS,
);
usage() unless scalar @ARGV == 1;
@@ -130,8 +135,11 @@ sub topic_train {
if (-e $OUT_CLUSTERS) {
print STDERR "$OUT_CLUSTERS exists, reusing...\n";
} else {
- safesystem("$TOPIC_TRAIN --data $IN_CONTEXTS --backoff-type simple -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n";
-# safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n";
+ my $FILTER_SINGLETONS_ARG = "";
+ $FILTER_SINGLETONS_ARG = "--filter-singleton-contexts" if $FILTER_SINGLETONS;
+ my $HIERARCHICAL_TOPICS_ARG = "";
+ $HIERARCHICAL_TOPICS_ARG = "--hierarchical-topics" if $HIERARCHICAL_TOPICS;
+ safesystem("$TOPIC_TRAIN --data $IN_CONTEXTS --backoff-type simple -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS $HIERARCHICAL_TOPICS_ARG $FILTER_SINGLETONS_ARG -w /dev/null") or die "Topic training failed.\n";
}
}
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 09f879d0..25c57778 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -14,7 +14,7 @@ if len(sys.argv) > 2:
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
phrase,tail= line.split('\t')
- contexts = tail.split(" ||| ")
+ contexts = tail.split(" ||| ")[1:]
assert len(contexts) % 2 == 0
for i in range(0, len(contexts), 2):
category = contexts[i+1].split("=")[1].strip()