From 755b9e189ed9d07f42816937466ec89e5b977c6e Mon Sep 17 00:00:00 2001 From: philblunsom Date: Fri, 2 Jul 2010 20:07:37 +0000 Subject: git-svn-id: https://ws10smt.googlecode.com/svn/trunk@119 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 12 ++++++++++-- gi/pyp-topics/scripts/spans2labels.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'gi') diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index acd6b94c..17f39a62 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -14,6 +14,9 @@ my $NUM_SAMPLES = 100; my $CONTEXT_SIZE = 1; my $BIDIR = 1; +my $HIERARCHICAL_TOPICS = 0; +my $FILTER_SINGLETONS = 0; + my $EXTOOLS = "$SCRIPT_DIR/../../extools"; die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; my $PYPTOOLS = "$SCRIPT_DIR/../pyp-topics/src"; @@ -39,6 +42,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'topics=i' => \$NUM_TOPICS, 'trg_context=i' => \$CONTEXT_SIZE, 'samples=i' => \$NUM_SAMPLES, + 'hierarchical-topics' => \$HIERARCHICAL_TOPICS, + 'filter-singletons' => \$FILTER_SINGLETONS, ); usage() unless scalar @ARGV == 1; @@ -130,8 +135,11 @@ sub topic_train { if (-e $OUT_CLUSTERS) { print STDERR "$OUT_CLUSTERS exists, reusing...\n"; } else { - safesystem("$TOPIC_TRAIN --data $IN_CONTEXTS --backoff-type simple -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n"; -# safesystem("$TOPIC_TRAIN -d $IN_DOCS -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS -w /dev/null") or die "Topic training failed.\n"; + my $FILTER_SINGLETONS_ARG = ""; + $FILTER_SINGLETONS_ARG = "--filter-singleton-contexts" if $FILTER_SINGLETONS; + my $HIERARCHICAL_TOPICS_ARG = ""; + $HIERARCHICAL_TOPICS_ARG = "--hierarchical-topics" if $HIERARCHICAL_TOPICS; + safesystem("$TOPIC_TRAIN --data $IN_CONTEXTS --backoff-type simple -t $NUM_TOPICS -s $NUM_SAMPLES -o $OUT_CLUSTERS $HIERARCHICAL_TOPICS_ARG $FILTER_SINGLETONS_ARG -w /dev/null") or die "Topic training failed.\n"; } } diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 09f879d0..25c57778 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -14,7 +14,7 @@ if len(sys.argv) > 2: phrase_context_index = {} for line in file(sys.argv[1], 'r'): phrase,tail= line.split('\t') - contexts = tail.split(" ||| ") + contexts = tail.split(" ||| ")[1:] assert len(contexts) % 2 == 0 for i in range(0, len(contexts), 2): category = contexts[i+1].split("=")[1].strip() -- cgit v1.2.3