summaryrefslogtreecommitdiff
path: root/extractor/vocabulary.h
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
commit6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/vocabulary.h
parentd11b76def6899790161c47a73018146311356d8b (diff)
parent5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)
merge paul's extractor code
Diffstat (limited to 'extractor/vocabulary.h')
-rw-r--r--extractor/vocabulary.h48
1 files changed, 48 insertions, 0 deletions
diff --git a/extractor/vocabulary.h b/extractor/vocabulary.h
new file mode 100644
index 00000000..c8fd9411
--- /dev/null
+++ b/extractor/vocabulary.h
@@ -0,0 +1,48 @@
+#ifndef _VOCABULARY_H_
+#define _VOCABULARY_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using namespace std;
+
+namespace extractor {
+
+/**
+ * Data structure for mapping words to word ids.
+ *
+ * This structure contains words located in the frequent collocations and words
+ * encountered during the grammar extraction time. This dictionary is
+ * considerably smaller than the dictionaries in the data arrays (and so is the
+ * query time). Note that this is the single data structure that changes state
+ * and needs to have thread safe read/write operations.
+ *
+ * Note: For an experiment using different vocabulary instances for each thread,
+ * the running time did not improve implying that the critical regions do not
+ * cause bottlenecks.
+ *
+ * The destructor and the two terminal lookups (GetTerminalIndex and
+ * GetTerminalValue) are declared virtual, so a subclass can override them;
+ * GetNonterminalIndex and IsTerminal are not virtual. None of the methods is
+ * declared const. The method bodies are not part of this header, so the
+ * locking scheme and the exact encoding of terminal versus nonterminal ids
+ * have to be checked in the implementation file.
+ */
+class Vocabulary {
+ public:
+ virtual ~Vocabulary();
+
+ // Returns the word id for the given word.
+ virtual int GetTerminalIndex(const string& word);
+
+ // Returns the id for a nonterminal located at the given position in a phrase.
+ int GetNonterminalIndex(int position);
+
+ // Checks if a symbol is a terminal (a word) rather than a nonterminal.
+ bool IsTerminal(int symbol);
+
+ // Returns the word corresponding to the given word id.
+ virtual string GetTerminalValue(int symbol);
+
+ private:
+ unordered_map<string, int> dictionary; // presumably word -> word id; TODO confirm
+ vector<string> words; // presumably word id -> word; TODO confirm
+};
+
+} // namespace extractor
+
+#endif