diff options
Diffstat (limited to 'gi')
| -rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 18 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 10 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.hh | 4 | ||||
| -rw-r--r-- | gi/pyp-topics/src/train-contexts.cc | 7 | ||||
| -rw-r--r-- | gi/pyp-topics/src/train.cc | 4 | 
5 files changed, 28 insertions, 15 deletions
| diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 0560af39..f990582e 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -4,12 +4,16 @@ import sys  from operator import itemgetter  if len(sys.argv) <= 2: -  print "Usage: spans2labels.py phrase_context_index [order]" +  print "Usage: spans2labels.py phrase_context_index [order] [threshold]"    exit(1)  order=1 +threshold = 0 +cutoff_cat = "<UNK>"  if len(sys.argv) > 2:    order = int(sys.argv[2]) +if len(sys.argv) > 3: +  threshold = float(sys.argv[3])  phrase_context_index = {}  for line in file(sys.argv[1], 'r'): @@ -24,9 +28,15 @@ for line in file(sys.argv[1], 'r'):    if len(contexts) == 1: continue    assert len(contexts) % 2 == 0    for i in range(0, len(contexts), 2): -    category = contexts[i+1].split("=")[1].strip() -    phrase_context_index[(phrase,contexts[i])] = category -    #print (phrase,contexts[i]), category +    #parse contexts[i+1] = " C=1 P=0.8 " +    features = contexts[i+1].split() +    category = features[0].split("=")[1].strip() +    prob = float(features[1].split("=")[1].strip()) +    if prob >= threshold: +      phrase_context_index[(phrase,contexts[i])] = category +    else: +      phrase_context_index[(phrase,contexts[i])] = cutoff_cat +#      print (phrase,contexts[i]), category, prob  for line in sys.stdin:    line_segments = line.split('|||') diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index e528a923..3614fb22 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -344,7 +344,7 @@ int PYPTopics::max_topic() const {    return current_topic;  } -int PYPTopics::max(const DocumentId& doc) const { +std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc) const {    //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0; @@ -366,10 +366,11 @@ int PYPTopics::max(const DocumentId& doc) const {      }    }    assert(current_topic >= 0); -  return current_topic; +  assert(current_max >= 0); +  return std::make_pair(current_topic, current_max);  } -int PYPTopics::max(const DocumentId& doc, const Term& term) const { +std::pair<int,PYPTopics::F> PYPTopics::max(const DocumentId& doc, const Term& term) const {    //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0; @@ -392,7 +393,8 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const {      }    }    assert(current_topic >= 0); -  return current_topic; +  assert(current_max >= 0); +  return std::make_pair(current_topic,current_max);  }  std::ostream& PYPTopics::print_document_topics(std::ostream& out) const { diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh index 5e1fc6d6..32d2d939 100644 --- a/gi/pyp-topics/src/pyp-topics.hh +++ b/gi/pyp-topics/src/pyp-topics.hh @@ -33,8 +33,8 @@ public:                       int freq_cutoff_interval=0);    int sample(const DocumentId& doc, const Term& term); -  int max(const DocumentId& doc, const Term& term) const; -  int max(const DocumentId& doc) const; +  std::pair<int,F> max(const DocumentId& doc, const Term& term) const; +  std::pair<int,F> max(const DocumentId& doc) const;    int max_topic() const;    void set_backoff(const std::string& filename) { diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc index a673bf4e..0a48d3d9 100644 --- a/gi/pyp-topics/src/train-contexts.cc +++ b/gi/pyp-topics/src/train-contexts.cc @@ -131,14 +131,15 @@ int main(int argc, char **argv)            //insert_result.first++;        }        documents_out << contexts_corpus.key(document_id) << '\t'; -      documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| "; +      documents_out << model.max(document_id).first << " " << corpusIt->size() << " ||| ";        for (std::vector<int>::const_iterator termIt=unique_terms.begin();             termIt != unique_terms.end(); ++termIt) {          if (termIt != unique_terms.begin())            documents_out << " ||| ";         vector<std::string> strings = contexts_corpus.context2string(*termIt);         copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " ")); -        documents_out << "||| C=" << model.max(document_id, *termIt); +        std::pair<int,PYPTopics::F> maxinfo = model.max(document_id, *termIt); +        documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;        }        documents_out <<endl; @@ -150,7 +151,7 @@ int main(int argc, char **argv)        default_topics << model.max_topic() <<endl;        for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {         vector<std::string> strings = contexts_corpus.context2string(termIt->first); -        default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; +        default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";         copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));          default_topics <<endl;        } diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc index 3462f26c..db7ca46e 100644 --- a/gi/pyp-topics/src/train.cc +++ b/gi/pyp-topics/src/train.cc @@ -99,7 +99,7 @@ int main(int argc, char **argv)        documents_out << unique_terms.size();        for (std::vector<int>::const_iterator termIt=unique_terms.begin();             termIt != unique_terms.end(); ++termIt) -        documents_out << " " << *termIt << ":" << model.max(document_id, *termIt); +        documents_out << " " << *termIt << ":" << model.max(document_id, *termIt).first;        documents_out << std::endl;      }      documents_out.close(); @@ -121,7 +121,7 @@ int main(int argc, char **argv)        int index=0;        for (DocumentTerms::const_iterator instanceIt=corpusIt->begin();             instanceIt != corpusIt->end(); ++instanceIt, ++index) { -        int topic = model.max(instanceIt->doc, instanceIt->term); +        int topic = model.max(instanceIt->doc, instanceIt->term).first;          if (index != 0) topics_out << " ";          topics_out << topic;        } | 
