summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/Corpus.java
diff options
context:
space:
mode:
Diffstat (limited to 'gi/posterior-regularisation/Corpus.java')
-rw-r--r--gi/posterior-regularisation/Corpus.java120
1 files changed, 52 insertions, 68 deletions
diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java
index 047e6ee8..07b27387 100644
--- a/gi/posterior-regularisation/Corpus.java
+++ b/gi/posterior-regularisation/Corpus.java
@@ -7,69 +7,64 @@ import java.util.regex.Pattern;
public class Corpus
{
private Lexicon<String> tokenLexicon = new Lexicon<String>();
- private Lexicon<TIntArrayList> ngramLexicon = new Lexicon<TIntArrayList>();
+ private Lexicon<TIntArrayList> phraseLexicon = new Lexicon<TIntArrayList>();
+ private Lexicon<TIntArrayList> contextLexicon = new Lexicon<TIntArrayList>();
private List<Edge> edges = new ArrayList<Edge>();
- private Map<Ngram,List<Edge>> phraseToContext = new HashMap<Ngram,List<Edge>>();
- private Map<Ngram,List<Edge>> contextToPhrase = new HashMap<Ngram,List<Edge>>();
+ private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
+ private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
- public class Ngram
+ public class Edge
{
- private Ngram(int id)
+ Edge(int phraseId, int contextId, int count)
{
- ngramId = id;
+ this.phraseId = phraseId;
+ this.contextId = contextId;
+ this.count = count;
}
- public int getId()
+ public int getPhraseId()
{
- return ngramId;
+ return phraseId;
}
- public TIntArrayList getTokenIds()
+ public TIntArrayList getPhrase()
{
- return ngramLexicon.lookup(ngramId);
+ return phraseLexicon.lookup(phraseId);
}
- public String toString()
+ public String getPhraseString()
{
StringBuffer b = new StringBuffer();
- for (int tid: getTokenIds().toNativeArray())
+ for (int tid: getPhrase().toNativeArray())
{
if (b.length() > 0)
b.append(" ");
b.append(tokenLexicon.lookup(tid));
}
return b.toString();
- }
- public int hashCode()
- {
- return ngramId;
- }
- public boolean equals(Object other)
- {
- return other instanceof Ngram && ngramId == ((Ngram) other).ngramId;
- }
- private int ngramId;
- }
-
- public class Edge
- {
- Edge(Ngram phrase, Ngram context, int count)
+ }
+ public int getContextId()
{
- this.phrase = phrase;
- this.context = context;
- this.count = count;
+ return contextId;
}
- public Ngram getPhrase()
+ public TIntArrayList getContext()
{
- return phrase;
+ return contextLexicon.lookup(contextId);
}
- public Ngram getContext()
+ public String getContextString()
{
- return context;
+ StringBuffer b = new StringBuffer();
+ for (int tid: getContext().toNativeArray())
+ {
+ if (b.length() > 0)
+ b.append(" ");
+ b.append(tokenLexicon.lookup(tid));
+ }
+ return b.toString();
}
public int getCount()
{
return count;
}
- private Ngram phrase;
- private Ngram context;
+ private int phraseId;
+ private int contextId;
private int count;
}
@@ -78,32 +73,32 @@ public class Corpus
return edges;
}
- int numEdges()
+ int getNumEdges()
{
return edges.size();
}
- Set<Ngram> getPhrases()
+ int getNumPhrases()
{
- return phraseToContext.keySet();
+ return phraseLexicon.size();
}
- List<Edge> getEdgesForPhrase(Ngram phrase)
+ List<Edge> getEdgesForPhrase(int phraseId)
{
- return phraseToContext.get(phrase);
+ return phraseToContext.get(phraseId);
}
- Set<Ngram> getContexts()
+ int getNumContexts()
{
- return contextToPhrase.keySet();
+ return contextLexicon.size();
}
- List<Edge> getEdgesForContext(Ngram context)
+ List<Edge> getEdgesForContext(int contextId)
{
- return contextToPhrase.get(context);
+ return contextToPhrase.get(contextId);
}
- int numTokens()
+ int getNumTokens()
{
return tokenLexicon.size();
}
@@ -132,8 +127,9 @@ public class Corpus
TIntArrayList ptoks = new TIntArrayList();
while (st.hasMoreTokens())
ptoks.add(c.tokenLexicon.insert(st.nextToken()));
- int phraseId = c.ngramLexicon.insert(ptoks);
- Ngram phrase = c.new Ngram(phraseId);
+ int phraseId = c.phraseLexicon.insert(ptoks);
+ if (phraseId == c.phraseToContext.size())
+ c.phraseToContext.add(new ArrayList<Edge>());
// process contexts
String[] parts = separator.split(rest);
@@ -151,30 +147,18 @@ public class Corpus
if (!token.equals("<PHRASE>"))
ctx.add(c.tokenLexicon.insert(token));
}
- int contextId = c.ngramLexicon.insert(ctx);
- Ngram context = c.new Ngram(contextId);
+ int contextId = c.contextLexicon.insert(ctx);
+ if (contextId == c.contextToPhrase.size())
+ c.contextToPhrase.add(new ArrayList<Edge>());
assert (countString.startsWith("C="));
- Edge e = c.new Edge(phrase, context, Integer.parseInt(countString.substring(2).trim()));
+ Edge e = c.new Edge(phraseId, contextId,
+ Integer.parseInt(countString.substring(2).trim()));
c.edges.add(e);
- // index the edge for fast phrase lookup
- List<Edge> edges = c.phraseToContext.get(phrase);
- if (edges == null)
- {
- edges = new ArrayList<Edge>();
- c.phraseToContext.put(phrase, edges);
- }
- edges.add(e);
-
- // index the edge for fast context lookup
- edges = c.contextToPhrase.get(context);
- if (edges == null)
- {
- edges = new ArrayList<Edge>();
- c.contextToPhrase.put(context, edges);
- }
- edges.add(e);
+ // index the edge for fast phrase, context lookup
+ c.phraseToContext.get(phraseId).add(e);
+ c.contextToPhrase.get(contextId).add(e);
}
}