summaryrefslogtreecommitdiff
path: root/extractor/vocabulary.h
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/vocabulary.h')
-rw-r--r--extractor/vocabulary.h17
1 files changed, 17 insertions, 0 deletions
diff --git a/extractor/vocabulary.h b/extractor/vocabulary.h
index 03c7dc66..c8fd9411 100644
--- a/extractor/vocabulary.h
+++ b/extractor/vocabulary.h
@@ -9,16 +9,33 @@ using namespace std;
namespace extractor {
+/**
+ * Data structure for mapping words to word ids.
+ *
+ * This structure contains words located in the frequent collocations and words
+ * encountered during grammar extraction. This dictionary is
+ * considerably smaller than the dictionaries in the data arrays (and so is the
+ * query time). Note that this is the single data structure that changes state
+ * and needs to have thread safe read/write operations.
+ *
+ * Note: In an experiment using a separate vocabulary instance for each thread,
+ * the running time did not improve, implying that the critical regions do not
+ * cause bottlenecks.
+ */
class Vocabulary {
public:
virtual ~Vocabulary();
+ // Returns the word id for the given word.
virtual int GetTerminalIndex(const string& word);
+ // Returns the id for a nonterminal located at the given position in a phrase.
int GetNonterminalIndex(int position);
+ // Checks if a symbol is a terminal.
bool IsTerminal(int symbol);
+ // Returns the word corresponding to the given word id.
virtual string GetTerminalValue(int symbol);
private: