diff options
Diffstat (limited to 'gi/posterior-regularisation/prjava/src/phrase/Corpus.java')
-rw-r--r-- | gi/posterior-regularisation/prjava/src/phrase/Corpus.java | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
index d57f3c04..2de2797b 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
@@ -15,6 +15,14 @@ public class Corpus
 	private List<Edge> edges = new ArrayList<Edge>();
 	private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
 	private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
+	public int splitSentinel;
+	public int phraseSentinel;
+
+	public Corpus()
+	{
+		splitSentinel = wordLexicon.insert("<SPLIT>");
+		phraseSentinel = wordLexicon.insert("<PHRASE>");
+	}
 
 	public class Edge
 	{
@@ -157,6 +165,11 @@ public class Corpus
 		return b.toString();
 	}
 
+	public boolean isSentinel(int wordId)
+	{
+		return wordId == splitSentinel || wordId == phraseSentinel;
+	}
+
 	static Corpus readFromFile(Reader in) throws IOException
 	{
 		Corpus c = new Corpus();
@@ -218,6 +231,19 @@ public class Corpus
 		return c;
 	}
 
+
+	TIntArrayList phraseEdges(TIntArrayList phrase)
+	{
+		TIntArrayList r = new TIntArrayList(4);
+		for (int p = 0; p < phrase.size(); ++p)
+		{
+			if (p == 0 || phrase.get(p-1) == splitSentinel)
+				r.add(p);
+			if (p == phrase.size() - 1 || phrase.get(p+1) == splitSentinel)
+				r.add(p);
+		}
+		return r;
+	}
 
 	public void printStats(PrintStream out)
 	{