From ce071775da01607459dab7469b61197c7d8d0b43 Mon Sep 17 00:00:00 2001
From: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Tue, 13 Jul 2010 23:37:29 +0000
Subject: added thresholding for span labelling

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@247 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pyp-topics/scripts/spans2labels.py | 18 ++++++++++++++----
 gi/pyp-topics/src/pyp-topics.cc       | 10 ++++++----
 gi/pyp-topics/src/pyp-topics.hh       |  4 ++--
 gi/pyp-topics/src/train-contexts.cc   |  7 ++++---
 gi/pyp-topics/src/train.cc            |  4 ++--
 5 files changed, 28 insertions(+), 15 deletions(-)

(limited to 'gi/pyp-topics')
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 0560af39..f990582e 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,12 +4,16 @@ import sys
 from operator import itemgetter
 
 if len(sys.argv) <= 2:
-  print "Usage: spans2labels.py phrase_context_index [order]"
+  print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
   exit(1)
 
 order=1
+threshold = 0
+cutoff_cat = "<UNK>"
 if len(sys.argv) > 2:
   order = int(sys.argv[2])
+if len(sys.argv) > 3:
+  threshold = float(sys.argv[3])
 
 phrase_context_index = {}
 for line in file(sys.argv[1], 'r'):
@@ -24,9 +28,15 @@ for line in file(sys.argv[1], 'r'):
   if len(contexts) == 1: continue
   assert len(contexts) % 2 == 0
   for i in range(0, len(contexts), 2):
-    category = contexts[i+1].split("=")[1].strip()
-    phrase_context_index[(phrase,contexts[i])] = category
-    #print (phrase,contexts[i]), category
+    # Parse contexts[i+1], which has the form " C=<category> P=<prob> ", e.g. " C=1 P=0.8 ".
+    features = contexts[i+1].split()
+    category = features[0].split("=")[1].strip()
+    prob = float(features[1].split("=")[1].strip())
+    if prob >= threshold:
+      phrase_context_index[(phrase,contexts[i])] = category
+    else:
+      phrase_context_index[(phrase,contexts[i])] = cutoff_cat
+#      print (phrase,contexts[i]), category, prob
 
 for line in sys.stdin:
   line_segments = line.split('|||')
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index e528a923..3614fb22 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -344,7 +344,7 @@ int PYPTopics::max_topic() const {
   return current_topic;
 }
 
-int PYPTopics::max(const DocumentId& doc) const {
+std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc) const {
   //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
   F current_max=0.0;
@@ -366,10 +366,11 @@ int PYPTopics::max(const DocumentId& doc) const {
     }
   }
   assert(current_topic >= 0);
-  return current_topic;
+  assert(current_max >= 0);
+  return std::make_pair(current_topic, current_max);
 }
 
-int PYPTopics::max(const DocumentId& doc, const Term& term) const {
+std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc, const Term& term) const {
   //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
   F current_max=0.0;
@@ -392,7 +393,8 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const {
     }
   }
   assert(current_topic >= 0);
-  return current_topic;
+  assert(current_max >= 0);
+  return std::make_pair(current_topic,current_max);
 }
 
 std::ostream& PYPTopics::print_document_topics(std::ostream& out) const {
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 5e1fc6d6..32d2d939 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -33,8 +33,8 @@ public:
                      int freq_cutoff_interval=0);
 
   int sample(const DocumentId& doc, const Term& term);
-  int max(const DocumentId& doc, const Term& term) const;
-  int max(const DocumentId& doc) const;
+  std::pair<int,F> max(const DocumentId& doc, const Term& term) const;
+  std::pair<int,F> max(const DocumentId& doc) const;
   int max_topic() const;
 
   void set_backoff(const std::string& filename) {
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index a673bf4e..0a48d3d9 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -131,14 +131,15 @@ int main(int argc, char **argv)
           //insert_result.first++;
       }
       documents_out << contexts_corpus.key(document_id) << '\t';
-      documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| ";
+      documents_out << model.max(document_id).first << " " << corpusIt->size() << " ||| ";
       for (std::vector<int>::const_iterator termIt=unique_terms.begin();
            termIt != unique_terms.end(); ++termIt) {
         if (termIt != unique_terms.begin())
           documents_out << " ||| ";
        vector<std::string> strings = contexts_corpus.context2string(*termIt);
        copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
-        documents_out << "||| C=" << model.max(document_id, *termIt);
+        std::pair<int,PYPTopics::F> maxinfo = model.max(document_id, *termIt);
+        documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;
 
       }
       documents_out <<endl;
@@ -150,7 +151,7 @@ int main(int argc, char **argv)
       default_topics << model.max_topic() <<endl;
       for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
        vector<std::string> strings = contexts_corpus.context2string(termIt->first);
-        default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
+        default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";
        copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
         default_topics <<endl;
       }
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
index 3462f26c..db7ca46e 100644
--- a/gi/pyp-topics/src/train.cc
+++ b/gi/pyp-topics/src/train.cc
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
       documents_out << unique_terms.size();
       for (std::vector<int>::const_iterator termIt=unique_terms.begin();
            termIt != unique_terms.end(); ++termIt)
-        documents_out << " " << *termIt << ":" << model.max(document_id, *termIt);
+        documents_out << " " << *termIt << ":" << model.max(document_id, *termIt).first;
       documents_out << std::endl;
     }
     documents_out.close();
@@ -121,7 +121,7 @@ int main(int argc, char **argv)
       int index=0;
       for (DocumentTerms::const_iterator instanceIt=corpusIt->begin();
            instanceIt != corpusIt->end(); ++instanceIt, ++index) {
-        int topic = model.max(instanceIt->doc, instanceIt->term);
+        int topic = model.max(instanceIt->doc, instanceIt->term).first;
         if (index != 0) topics_out << " ";
         topics_out << topic;
       }
-- 
cgit v1.2.3