diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
commit | 6d347f1ce078dede3da0e1498f75e357351c6543 (patch) | |
tree | 8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/vocabulary.h | |
parent | d11b76def6899790161c47a73018146311356d8b (diff) | |
parent | 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff) |
merge paul's extractor code
Diffstat (limited to 'extractor/vocabulary.h')
-rw-r--r-- | extractor/vocabulary.h | 48 |
1 files changed, 48 insertions, 0 deletions
// extractor/vocabulary.h
// (Introduced as a new file, mode 100644, blob c8fd9411.)
#ifndef _VOCABULARY_H_
#define _VOCABULARY_H_

#include <string>
#include <unordered_map>
#include <vector>

// NOTE(review): a file-scope using-directive in a header leaks into every
// file that includes it. It is kept because includers may depend on it;
// the declarations below are spelled with explicit std:: qualification.
using namespace std;

namespace extractor {

/**
 * Data structure for mapping words to word ids.
 *
 * This structure contains words located in the frequent collocations and words
 * encountered during the grammar extraction time. This dictionary is
 * considerably smaller than the dictionaries in the data arrays (and so is the
 * query time). Note that this is the single data structure that changes state
 * and needs to have thread safe read/write operations.
 *
 * Note: For an experiment using different vocabulary instances for each thread,
 * the running time did not improve, implying that the critical regions do not
 * cause bottlenecks.
 */
class Vocabulary {
 public:
  // Virtual so that subclasses are destroyed correctly through a
  // Vocabulary pointer; the definition lives outside this header.
  virtual ~Vocabulary();

  // Returns the word id for the given word.
  virtual int GetTerminalIndex(const std::string& word);

  // Returns the id for a nonterminal located at the given position in a phrase.
  int GetNonterminalIndex(int position);

  // Checks if a symbol is a terminal.
  // NOTE(review): this comment previously read "nonterminal", which
  // contradicts the method name; confirm against the definition.
  bool IsTerminal(int symbol);

  // Returns the word corresponding to the given word id.
  virtual std::string GetTerminalValue(int symbol);

 private:
  // Lookup table from a word to its word id.
  std::unordered_map<std::string, int> dictionary;
  // Stored words; presumably indexed by word id (reverse of `dictionary`) --
  // confirm against the definitions.
  std::vector<std::string> words;
};

} // namespace extractor

#endif