summary refs log tree commit diff
path: root/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
diff options
context:
space:
mode:
Diffstat (limited to 'gi/posterior-regularisation/prjava/src/phrase/Corpus.java')
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/Corpus.java 26
1 files changed, 26 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
index d57f3c04..2de2797b 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
@@ -15,6 +15,14 @@ public class Corpus
private List<Edge> edges = new ArrayList<Edge>();
private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
+ public int splitSentinel; // value returned by wordLexicon.insert("<SPLIT>"), assigned in the constructor
+ public int phraseSentinel; // value returned by wordLexicon.insert("<PHRASE>"), assigned in the constructor
+
+ public Corpus() // registers both sentinel tokens in wordLexicon (declared outside this hunk) and stores the returned values
+ {
+ splitSentinel = wordLexicon.insert("<SPLIT>");
+ phraseSentinel = wordLexicon.insert("<PHRASE>");
+ }
public class Edge
{
@@ -157,6 +165,11 @@ public class Corpus
return b.toString();
}
+ public boolean isSentinel(int wordId) // true iff wordId equals splitSentinel or phraseSentinel
+ {
+ return wordId == splitSentinel || wordId == phraseSentinel;
+ }
+
static Corpus readFromFile(Reader in) throws IOException
{
Corpus c = new Corpus();
@@ -218,6 +231,19 @@ public class Corpus
return c;
}
+
+ TIntArrayList phraseEdges(TIntArrayList phrase) // returns the boundary positions of the splitSentinel-delimited runs in phrase, in ascending order
+ {
+ TIntArrayList r = new TIntArrayList(4); // 4 is presumably an initial-capacity hint (Trove) -- TODO confirm
+ for (int p = 0; p < phrase.size(); ++p)
+ {
+ if (p == 0 || phrase.get(p-1) == splitSentinel) // p opens a run: first position, or right after a split marker (p == 0 short-circuits before get(p-1))
+ r.add(p);
+ if (p == phrase.size() - 1 || phrase.get(p+1) == splitSentinel) // p closes a run (inclusive index): last position, or right before a split marker
+ r.add(p);
+ }
+ return r; // a one-element run contributes its index twice; NOTE(review): phrase.get(p) itself is never compared with splitSentinel -- confirm callers never pass leading, trailing or adjacent split markers
+ }
public void printStats(PrintStream out)
{