# Reconstructed from commit f343459d6198352964dbb6779f15c352fe2d5794
# ("init", Jacob, Sun, 28 Jul 2013 09:54:54 +0100), which created
# nlp_tools/dict_utils.py and nlp_tools/vocabulary.py and also committed the
# compiled dict_utils.pyc, feature.pyc and vocabulary.pyc binaries.

# ---------------------------------------------------------------------------
# nlp_tools/dict_utils.py
# ---------------------------------------------------------------------------
"""
Utilities for doing math on sparse vectors indexed by arbitrary objects.
(These will usually be feature vectors.)

A sparse vector is a plain dict mapping a key to its (non-zero) value.
"""

import math
import operator
from functools import reduce

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3: cPickle was folded into pickle


def d_elt_op_keep(op, zero, args):
    """
    Applies op to arguments elementwise, keeping entries that don't occur in
    every argument (i.e. behaves like a sum).

    op is called with a two-element list [accumulated, new] and must return
    the combined value.  Entries whose final value equals zero are removed so
    the result stays sparse.  args is an iterable of dicts; none of them is
    modified.  Returns a new dict.
    """
    ret = {}
    for d in args:
        for key in d:
            if key in ret:
                ret[key] = op([ret[key], d[key]])
            else:
                ret[key] = d[key]
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for key in list(ret):
        if ret[key] == zero:
            del ret[key]
    return ret


def d_elt_op_drop(op, args):
    """
    Applies op to arguments elementwise, discarding entries that don't occur in
    every argument (i.e. behaves like a product).

    op is called with a two-element list [accumulated, new].  args is an
    iterable of dicts; none of them is modified.  Returns a new dict.

    Raises ValueError if args is empty.
    """
    vectors = list(args)
    if not vectors:
        raise ValueError("d_elt_op_drop() requires at least one vector")
    # avoid querying lots of nonexistent keys: start from the smallest vector,
    # located by position so no dict-to-dict equality comparisons are needed
    sindex = min(range(len(vectors)), key=lambda i: len(vectors[i]))
    ret = dict(vectors[sindex])
    for i, d in enumerate(vectors):
        if i == sindex:
            continue
        # snapshot the keys because entries are deleted inside the loop
        for key in list(ret):
            if key in d:
                ret[key] = op([ret[key], d[key]])
            else:
                del ret[key]
    return ret


def d_sum(args):
    """
    Computes a sum of vectors.
    """
    return d_elt_op_keep(sum, 0, args)


def d_logspace_sum(args):
    """
    Computes a sum of vectors whose elements are represented in logspace.
    """
    # Imported here rather than at module level so the helpers that do not
    # need math_utils remain usable when it is not importable.
    import math_utils as mu
    return d_elt_op_keep(mu.logspace_sum, -float('inf'), args)


def d_elt_prod(args):
    """
    Computes an elementwise product of vectors.
    """
    return d_elt_op_drop(lambda values: reduce(operator.mul, values), args)


def d_dot_prod(d1, d2):
    """
    Takes the dot product of the two arguments.
    """
    # avoid querying lots of nonexistent keys
    if len(d2) < len(d1):
        d1, d2 = d2, d1
    dot_prod = 0
    for key in d1:
        if key in d2:
            dot_prod += d1[key] * d2[key]
    return dot_prod


def d_op(op, d):
    """
    Applies op to every element of the dictionary.

    Returns a new dict with the same keys; d is not modified.
    """
    return dict((key, op(d[key])) for key in d)


def d_logspace_scalar_prod(c, d):
    """
    Multiplies every element of d by c, where c and d are both represented in
    logspace.
    """
    # multiplication in logspace is addition of the logs
    return d_op(lambda value: c + value, d)


# convenience methods
def d_log(d):
    """
    Applies math.log to every element of d.
    """
    return d_op(math.log, d)


def d_exp(d):
    """
    Applies math.exp to every element of d.
    """
    return d_op(math.exp, d)


# ---------------------------------------------------------------------------
# nlp_tools/vocabulary.py
# ---------------------------------------------------------------------------
class Vocabulary:
    """
    Two-way mapping between strings and integer tokens.

    Tokens are handed out in insertion order, starting at 0.
    """

    # token returned by gett() for strings that are not in the vocabulary
    OOV_VAL = -1

    def __init__(self):
        self.str_to_tok = {}
        self.tok_to_str = {}

    def put(self, string):
        """
        Adds a new string and returns its token.

        Raises ValueError if the string is already present.
        """
        if string in self.str_to_tok:
            raise ValueError("%s is already in this vocabulary (token %d)" %
                             (string, self.str_to_tok[string]))
        return self.ensure(string)

    def ensure(self, string):
        """
        Returns the token for string, adding the string first if needed.
        """
        if string in self.str_to_tok:
            # return the existing token rather than None, so callers get a
            # token whether or not the string was already known
            return self.str_to_tok[string]
        tok = len(self)
        self.str_to_tok[string] = tok
        self.tok_to_str[tok] = string
        return tok

    def gett(self, string):
        """
        Returns the token for string, or OOV_VAL if it is unknown.
        """
        return self.str_to_tok.get(string, self.OOV_VAL)

    def gets(self, tok):
        """
        Returns the string for tok.  Raises KeyError for an unknown token.
        """
        return self.tok_to_str[tok]

    def strs(self):
        """
        Returns a list of all strings in the vocabulary.
        """
        return list(self.str_to_tok)

    def toks(self):
        """
        Returns a list of all tokens in the vocabulary.
        """
        return list(self.tok_to_str)

    def __len__(self):
        return len(self.str_to_tok)

    def save(self, path):
        """
        Pickles this vocabulary to the file at path.
        """
        # pickle data is bytes: binary mode is required on Python 3 and
        # avoids newline translation on Windows
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path):
        """
        Unpickles and returns the object stored at path.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        with open(path, 'rb') as f:
            return pickle.load(f)