summary refs log tree commit diff
diff options
context:
space:
mode:
author bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 23:37:29 +0000
committer bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 23:37:29 +0000
commit ce071775da01607459dab7469b61197c7d8d0b43 (patch)
tree 818ab286a5649c58125d034c9d849656a8654572
parent 1dae2292288f648578ac66f72688ba82483ba911 (diff)
added thresholding for span labelling
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@247 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 18
-rw-r--r-- gi/pyp-topics/src/pyp-topics.cc 10
-rw-r--r-- gi/pyp-topics/src/pyp-topics.hh 4
-rw-r--r-- gi/pyp-topics/src/train-contexts.cc 7
-rw-r--r-- gi/pyp-topics/src/train.cc 4
5 files changed, 28 insertions, 15 deletions
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 0560af39..f990582e 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,12 +4,16 @@ import sys
from operator import itemgetter
if len(sys.argv) <= 2:
- print "Usage: spans2labels.py phrase_context_index [order]"
+ print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
exit(1)
order=1
+threshold = 0
+cutoff_cat = "<UNK>"
if len(sys.argv) > 2:
order = int(sys.argv[2])
+if len(sys.argv) > 3:
+ threshold = float(sys.argv[3])
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
@@ -24,9 +28,15 @@ for line in file(sys.argv[1], 'r'):
if len(contexts) == 1: continue
assert len(contexts) % 2 == 0
for i in range(0, len(contexts), 2):
- category = contexts[i+1].split("=")[1].strip()
- phrase_context_index[(phrase,contexts[i])] = category
- #print (phrase,contexts[i]), category
+ #parse contexts[i+1] = " C=1 P=0.8 "
+ features = contexts[i+1].split()
+ category = features[0].split("=")[1].strip()
+ prob = float(features[1].split("=")[1].strip())
+ if prob >= threshold:
+ phrase_context_index[(phrase,contexts[i])] = category
+ else:
+ phrase_context_index[(phrase,contexts[i])] = cutoff_cat
+# print (phrase,contexts[i]), category, prob
for line in sys.stdin:
line_segments = line.split('|||')
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index e528a923..3614fb22 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -344,7 +344,7 @@ int PYPTopics::max_topic() const {
return current_topic;
}
-int PYPTopics::max(const DocumentId& doc) const {
+std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc) const {
//std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
// collect probs
F current_max=0.0;
@@ -366,10 +366,11 @@ int PYPTopics::max(const DocumentId& doc) const {
}
}
assert(current_topic >= 0);
- return current_topic;
+ assert(current_max >= 0);
+ return std::make_pair(current_topic, current_max);
}
-int PYPTopics::max(const DocumentId& doc, const Term& term) const {
+std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc, const Term& term) const {
//std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;
// collect probs
F current_max=0.0;
@@ -392,7 +393,8 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const {
}
}
assert(current_topic >= 0);
- return current_topic;
+ assert(current_max >= 0);
+ return std::make_pair(current_topic,current_max);
}
std::ostream& PYPTopics::print_document_topics(std::ostream& out) const {
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 5e1fc6d6..32d2d939 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -33,8 +33,8 @@ public:
int freq_cutoff_interval=0);
int sample(const DocumentId& doc, const Term& term);
- int max(const DocumentId& doc, const Term& term) const;
- int max(const DocumentId& doc) const;
+ std::pair<int,F> max(const DocumentId& doc, const Term& term) const;
+ std::pair<int,F> max(const DocumentId& doc) const;
int max_topic() const;
void set_backoff(const std::string& filename) {
diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc
index a673bf4e..0a48d3d9 100644
--- a/gi/pyp-topics/src/train-contexts.cc
+++ b/gi/pyp-topics/src/train-contexts.cc
@@ -131,14 +131,15 @@ int main(int argc, char **argv)
//insert_result.first++;
}
documents_out << contexts_corpus.key(document_id) << '\t';
- documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| ";
+ documents_out << model.max(document_id).first << " " << corpusIt->size() << " ||| ";
for (std::vector<int>::const_iterator termIt=unique_terms.begin();
termIt != unique_terms.end(); ++termIt) {
if (termIt != unique_terms.begin())
documents_out << " ||| ";
vector<std::string> strings = contexts_corpus.context2string(*termIt);
copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
- documents_out << "||| C=" << model.max(document_id, *termIt);
+ std::pair<int,PYPTopics::F> maxinfo = model.max(document_id, *termIt);
+ documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;
}
documents_out <<endl;
@@ -150,7 +151,7 @@ int main(int argc, char **argv)
default_topics << model.max_topic() <<endl;
for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
vector<std::string> strings = contexts_corpus.context2string(termIt->first);
- default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
+ default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";
copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
default_topics <<endl;
}
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
index 3462f26c..db7ca46e 100644
--- a/gi/pyp-topics/src/train.cc
+++ b/gi/pyp-topics/src/train.cc
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
documents_out << unique_terms.size();
for (std::vector<int>::const_iterator termIt=unique_terms.begin();
termIt != unique_terms.end(); ++termIt)
- documents_out << " " << *termIt << ":" << model.max(document_id, *termIt);
+ documents_out << " " << *termIt << ":" << model.max(document_id, *termIt).first;
documents_out << std::endl;
}
documents_out.close();
@@ -121,7 +121,7 @@ int main(int argc, char **argv)
int index=0;
for (DocumentTerms::const_iterator instanceIt=corpusIt->begin();
instanceIt != corpusIt->end(); ++instanceIt, ++index) {
- int topic = model.max(instanceIt->doc, instanceIt->term);
+ int topic = model.max(instanceIt->doc, instanceIt->term).first;
if (index != 0) topics_out << " ";
topics_out << topic;
}