From ce071775da01607459dab7469b61197c7d8d0b43 Mon Sep 17 00:00:00 2001 From: bothameister Date: Tue, 13 Jul 2010 23:37:29 +0000 Subject: added thresholding for span labelling git-svn-id: https://ws10smt.googlecode.com/svn/trunk@247 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pyp-topics/src/pyp-topics.cc | 10 ++++++---- gi/pyp-topics/src/pyp-topics.hh | 4 ++-- gi/pyp-topics/src/train-contexts.cc | 7 ++++--- gi/pyp-topics/src/train.cc | 4 ++-- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'gi/pyp-topics/src') diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index e528a923..3614fb22 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -344,7 +344,7 @@ int PYPTopics::max_topic() const { return current_topic; } -int PYPTopics::max(const DocumentId& doc) const { +std::pair PYPTopics::max(const DocumentId& doc) const { //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -366,10 +366,11 @@ int PYPTopics::max(const DocumentId& doc) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic, current_max); } -int PYPTopics::max(const DocumentId& doc, const Term& term) const { +std::pair PYPTopics::max(const DocumentId& doc, const Term& term) const { //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; @@ -392,7 +393,8 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const { } } assert(current_topic >= 0); - return current_topic; + assert(current_max >= 0); + return std::make_pair(current_topic,current_max); } std::ostream& PYPTopics::print_document_topics(std::ostream& out) const { diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh index 5e1fc6d6..32d2d939 100644 --- a/gi/pyp-topics/src/pyp-topics.hh +++ b/gi/pyp-topics/src/pyp-topics.hh @@ -33,8 +33,8 @@ public: int freq_cutoff_interval=0); int sample(const DocumentId& doc, const Term& term); - int max(const DocumentId& doc, const Term& term) const; - int max(const DocumentId& doc) const; + std::pair max(const DocumentId& doc, const Term& term) const; + std::pair max(const DocumentId& doc) const; int max_topic() const; void set_backoff(const std::string& filename) { diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc index a673bf4e..0a48d3d9 100644 --- a/gi/pyp-topics/src/train-contexts.cc +++ b/gi/pyp-topics/src/train-contexts.cc @@ -131,14 +131,15 @@ int main(int argc, char **argv) //insert_result.first++; } documents_out << contexts_corpus.key(document_id) << '\t'; - documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| "; + documents_out << model.max(document_id).first << " " << corpusIt->size() << " ||| "; for (std::vector::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) { if (termIt != unique_terms.begin()) documents_out << " ||| "; vector strings = contexts_corpus.context2string(*termIt); copy(strings.begin(), strings.end(),ostream_iterator(documents_out, " ")); - documents_out << "||| C=" << model.max(document_id, *termIt); + std::pair maxinfo = model.max(document_id, *termIt); + documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second; } documents_out <::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) { vector strings = contexts_corpus.context2string(termIt->first); - default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; + default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| "; copy(strings.begin(), strings.end(),ostream_iterator(default_topics, " ")); default_topics <::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) - documents_out << " " << *termIt << ":" << model.max(document_id, *termIt); + documents_out << " " << *termIt << ":" << model.max(document_id, *termIt).first; documents_out << std::endl; } documents_out.close(); @@ -121,7 +121,7 @@ int main(int argc, char **argv) int index=0; for (DocumentTerms::const_iterator instanceIt=corpusIt->begin(); instanceIt != corpusIt->end(); ++instanceIt, ++index) { - int topic = model.max(instanceIt->doc, instanceIt->term); + int topic = model.max(instanceIt->doc, instanceIt->term).first; if (index != 0) topics_out << " "; topics_out << topic; } -- cgit v1.2.3