summaryrefslogtreecommitdiff
path: root/nlp_tools
diff options
context:
space:
mode:
authorJacob <andqso@gmail.com>2013-07-28 09:54:54 +0100
committerJacob <andqso@gmail.com>2013-07-28 09:54:54 +0100
commitf343459d6198352964dbb6779f15c352fe2d5794 (patch)
tree07d50c9a8269e3892ccea8f5680b3e2bac984fce /nlp_tools
init
Diffstat (limited to 'nlp_tools')
-rw-r--r--nlp_tools/dict_utils.py101
-rw-r--r--nlp_tools/dict_utils.pycbin0 -> 3803 bytes
-rw-r--r--nlp_tools/feature.pycbin0 -> 427 bytes
-rw-r--r--nlp_tools/vocabulary.py49
-rw-r--r--nlp_tools/vocabulary.pycbin0 -> 2461 bytes
5 files changed, 150 insertions, 0 deletions
diff --git a/nlp_tools/dict_utils.py b/nlp_tools/dict_utils.py
new file mode 100644
index 0000000..8b9b94b
--- /dev/null
+++ b/nlp_tools/dict_utils.py
@@ -0,0 +1,101 @@
+"""
+Utilities for doing math on sparse vectors indexed by arbitrary objects.
+(These will usually be feature vectors.)
+"""
+
+import math_utils as mu
+import math
+
+def d_elt_op_keep(op, zero, args):
+    """
+    Applies op to arguments elementwise, keeping entries that don't occur in
+    every argument (i.e. behaves like a sum).
+
+    args is an iterable of dicts. The first value seen for a key is stored
+    as-is; each later value is folded in by calling op with the two-element
+    list [accumulated, new]. Entries whose final value equals zero are left
+    out of the returned dict. The input dicts are only read.
+    """
+    ret = {}
+    for d in args:
+        for key in d:
+            if key in ret:
+                ret[key] = op([ret[key], d[key]])
+            else:
+                ret[key] = d[key]
+    # Build a filtered copy instead of deleting while iterating: on Python 3
+    # keys() is a live view, and removing entries during iteration raises
+    # RuntimeError.
+    return dict((key, value) for key, value in ret.items()
+                if not value == zero)
+
+def d_elt_op_drop(op, args):
+    """
+    Applies op to arguments elementwise, discarding entries that don't occur in
+    every argument (i.e. behaves like a product).
+
+    args is a non-empty sequence of dicts; an empty sequence raises
+    ValueError. For each key that survives, op is called with the
+    two-element list [accumulated, new] once per remaining dict, in the
+    order the dicts appear in args. The input dicts are only read.
+    """
+    # avoid querying lots of nonexistent keys: start from the smallest dict
+    # (the first one, if several share the minimum size)
+    sindex = min(range(len(args)), key=lambda i: len(args[i]))
+    ret = dict(args[sindex])
+    for i, d in enumerate(args):
+        if i == sindex:
+            continue
+        # Iterate over a snapshot of the keys: entries are deleted inside
+        # the loop, which raises RuntimeError on Python 3 if the live
+        # keys() view is being iterated.
+        for key in list(ret):
+            if key in d:
+                ret[key] = op([ret[key], d[key]])
+            else:
+                del ret[key]
+    return ret
+
+def d_sum(args):
+    """
+    Adds the given vectors elementwise and returns the result as a new
+    vector. Entries whose total equals 0 are omitted from the result.
+    """
+    additive_zero = 0
+    return d_elt_op_keep(sum, additive_zero, args)
+
+def d_logspace_sum(args):
+    """
+    Adds vectors whose elements are stored in logspace, combining values with
+    mu.logspace_sum. Entries equal to -inf (the log of zero) are omitted from
+    the result.
+    """
+    log_zero = -float('inf')
+    return d_elt_op_keep(mu.logspace_sum, log_zero, args)
+
+def d_elt_prod(args):
+    """
+    Computes an elementwise product of vectors.
+
+    Only keys that occur in every vector of args appear in the result.
+    """
+    # reduce is not a builtin on Python 3; functools has provided it since
+    # Python 2.6, so importing it here works on both.
+    from functools import reduce
+    import operator
+    return d_elt_op_drop(lambda vals: reduce(operator.mul, vals), args)
+
+def d_dot_prod(d1, d2):
+    """
+    Returns the dot product of two sparse vectors: the sum, over the keys
+    they share, of the products of the corresponding values.
+    """
+    # walk the shorter vector so fewer missing keys are probed in the other
+    if len(d1) <= len(d2):
+        small, large = d1, d2
+    else:
+        small, large = d2, d1
+    return sum(small[key] * large[key] for key in small if key in large)
+
+def d_logspace_scalar_prod(c, d):
+    """
+    Scales the vector d by the scalar c, where both are represented in
+    logspace, so the multiplication is carried out by adding c to every
+    element. Returns a new dictionary; d is left unchanged.
+    """
+    return dict((key, c + val) for key, val in d.items())
+
+def d_op(op, d):
+    """
+    Returns a new dictionary with the same keys as d, in which each value is
+    the result of calling op on the corresponding value of d.
+    """
+    return dict((key, op(val)) for key, val in d.items())
+
+# convenience methods
+def d_log(d):
+    """
+    Takes the natural log (math.log) of every element of the dictionary.
+    """
+    natural_log = math.log
+    return d_op(natural_log, d)
+
+def d_exp(d):
+    """
+    Exponentiates (math.exp) every element of the dictionary.
+    """
+    exponential = math.exp
+    return d_op(exponential, d)
diff --git a/nlp_tools/dict_utils.pyc b/nlp_tools/dict_utils.pyc
new file mode 100644
index 0000000..ada4c58
--- /dev/null
+++ b/nlp_tools/dict_utils.pyc
Binary files differ
diff --git a/nlp_tools/feature.pyc b/nlp_tools/feature.pyc
new file mode 100644
index 0000000..9c96271
--- /dev/null
+++ b/nlp_tools/feature.pyc
Binary files differ
diff --git a/nlp_tools/vocabulary.py b/nlp_tools/vocabulary.py
new file mode 100644
index 0000000..ed200f5
--- /dev/null
+++ b/nlp_tools/vocabulary.py
@@ -0,0 +1,49 @@
+import cPickle
+
+class Vocabulary:
+    """
+    Two-way mapping between strings and integer tokens.
+
+    Each new string receives the current vocabulary size as its token, so
+    tokens are consecutive integers starting at 0 in insertion order.
+    """
+
+    # value returned by gett() for strings that are not in the vocabulary
+    OOV_VAL = -1
+
+    def __init__(self):
+        self.str_to_tok = {}
+        self.tok_to_str = {}
+
+    def put(self, string):
+        """
+        Adds a string that must not be present yet and returns its new token.
+        Raises ValueError if the string is already in the vocabulary.
+        """
+        if string in self.str_to_tok:
+            raise ValueError("%s is already in this vocabulary (token %d)" %
+                    (string, self.str_to_tok[string]))
+        return self.ensure(string)
+
+    def ensure(self, string):
+        """
+        Makes sure the string is in the vocabulary and returns its token,
+        whether it was already present or has just been added.
+        """
+        if string in self.str_to_tok:
+            # return the existing token rather than None, so callers get a
+            # usable token on every call
+            return self.str_to_tok[string]
+        tok = len(self)
+        self.str_to_tok[string] = tok
+        self.tok_to_str[tok] = string
+        return tok
+
+    def gett(self, string):
+        """
+        Returns the token for the string, or OOV_VAL if it is unknown.
+        """
+        return self.str_to_tok.get(string, self.OOV_VAL)
+
+    def gets(self, tok):
+        """
+        Returns the string for the token. Raises KeyError if it is unknown.
+        """
+        return self.tok_to_str[tok]
+
+    def strs(self):
+        """
+        Returns the strings in the vocabulary.
+        """
+        return self.str_to_tok.keys()
+
+    def toks(self):
+        """
+        Returns the tokens in the vocabulary.
+        """
+        return self.tok_to_str.keys()
+
+    def __len__(self):
+        return len(self.str_to_tok)
+
+    def save(self, path):
+        """
+        Pickles this vocabulary to the file at path.
+        """
+        # pickle data is a byte stream, so the file is opened in binary mode
+        with open(path, 'wb') as f:
+            cPickle.dump(self, f)
+
+    @classmethod
+    def load(cls, path):
+        """
+        Unpickles and returns the object stored in the file at path.
+
+        NOTE: unpickling can execute arbitrary code, so only load files that
+        come from a trusted source.
+        """
+        with open(path, 'rb') as f:
+            return cPickle.load(f)
diff --git a/nlp_tools/vocabulary.pyc b/nlp_tools/vocabulary.pyc
new file mode 100644
index 0000000..952b7fd
--- /dev/null
+++ b/nlp_tools/vocabulary.pyc
Binary files differ