diff options
Diffstat (limited to 'gi/posterior-regularisation/Corpus.java')
| -rw-r--r-- | gi/posterior-regularisation/Corpus.java | 120 | 
1 files changed, 52 insertions, 68 deletions
| diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java index 047e6ee8..07b27387 100644 --- a/gi/posterior-regularisation/Corpus.java +++ b/gi/posterior-regularisation/Corpus.java @@ -7,69 +7,64 @@ import java.util.regex.Pattern;  public class Corpus  {  	private Lexicon<String> tokenLexicon = new Lexicon<String>(); -	private Lexicon<TIntArrayList> ngramLexicon = new Lexicon<TIntArrayList>(); +	private Lexicon<TIntArrayList> phraseLexicon = new Lexicon<TIntArrayList>(); +	private Lexicon<TIntArrayList> contextLexicon = new Lexicon<TIntArrayList>();  	private List<Edge> edges = new ArrayList<Edge>(); -	private Map<Ngram,List<Edge>> phraseToContext = new HashMap<Ngram,List<Edge>>(); -	private Map<Ngram,List<Edge>> contextToPhrase = new HashMap<Ngram,List<Edge>>(); +	private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>(); +	private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>(); -	public class Ngram +	public class Edge  	{ -		private Ngram(int id) +		Edge(int phraseId, int contextId, int count)  		{ -			ngramId = id; +			this.phraseId = phraseId; +			this.contextId = contextId; +			this.count = count;  		} -		public int getId() +		public int getPhraseId()  		{ -			return ngramId; +			return phraseId;  		} -		public TIntArrayList getTokenIds() +		public TIntArrayList getPhrase()  		{ -			return ngramLexicon.lookup(ngramId); +			return phraseLexicon.lookup(phraseId);  		} -		public String toString() +		public String getPhraseString()  		{  			StringBuffer b = new StringBuffer(); -			for (int tid: getTokenIds().toNativeArray()) +			for (int tid: getPhrase().toNativeArray())  			{  				if (b.length() > 0)  					b.append(" ");  				b.append(tokenLexicon.lookup(tid));  			}  			return b.toString(); -		} -		public int hashCode() -		{ -			return ngramId; -		} -		public boolean equals(Object other) -		{ -			return other instanceof Ngram && ngramId == ((Ngram) other).ngramId; -		} -		private int ngramId; -	} -	 -	public class Edge -	{ -		Edge(Ngram phrase, Ngram context, int count) +		}		 +		public int getContextId()  		{ -			this.phrase = phrase; -			this.context = context; -			this.count = count; +			return contextId;  		} -		public Ngram getPhrase() +		public TIntArrayList getContext()  		{ -			return phrase; +			return contextLexicon.lookup(contextId);  		} -		public Ngram getContext() +		public String getContextString()  		{ -			return context; +			StringBuffer b = new StringBuffer(); +			for (int tid: getContext().toNativeArray()) +			{ +				if (b.length() > 0) +					b.append(" "); +				b.append(tokenLexicon.lookup(tid)); +			} +			return b.toString();  		}  		public int getCount()  		{  			return count;  		} -		private Ngram phrase; -		private Ngram context; +		private int phraseId; +		private int contextId;  		private int count;  	} @@ -78,32 +73,32 @@ public class Corpus  		return edges;  	} -	int numEdges() +	int getNumEdges()  	{  		return edges.size();  	} -	Set<Ngram> getPhrases() +	int getNumPhrases()  	{ -		return phraseToContext.keySet(); +		return phraseLexicon.size();  	} -	List<Edge> getEdgesForPhrase(Ngram phrase) +	List<Edge> getEdgesForPhrase(int phraseId)  	{ -		return phraseToContext.get(phrase); +		return phraseToContext.get(phraseId);  	} -	Set<Ngram> getContexts() +	int getNumContexts()  	{ -		return contextToPhrase.keySet(); +		return contextLexicon.size();  	} -	List<Edge> getEdgesForContext(Ngram context) +	List<Edge> getEdgesForContext(int contextId)  	{ -		return contextToPhrase.get(context); +		return contextToPhrase.get(contextId);  	} -	int numTokens() +	int getNumTokens()  	{  		return tokenLexicon.size();  	} @@ -132,8 +127,9 @@ public class Corpus  			TIntArrayList ptoks = new TIntArrayList();  			while (st.hasMoreTokens())  				ptoks.add(c.tokenLexicon.insert(st.nextToken())); -			int phraseId = c.ngramLexicon.insert(ptoks); -			Ngram phrase = c.new Ngram(phraseId); +			int phraseId = c.phraseLexicon.insert(ptoks); +			if (phraseId == c.phraseToContext.size()) +				c.phraseToContext.add(new ArrayList<Edge>());  			// process contexts  			String[] parts = separator.split(rest); @@ -151,30 +147,18 @@ public class Corpus  					if (!token.equals("<PHRASE>"))  						ctx.add(c.tokenLexicon.insert(token));  				} -				int contextId = c.ngramLexicon.insert(ctx); -				Ngram context = c.new Ngram(contextId); +				int contextId = c.contextLexicon.insert(ctx); +				if (contextId == c.contextToPhrase.size()) +					c.contextToPhrase.add(new ArrayList<Edge>());  				assert (countString.startsWith("C=")); -				Edge e = c.new Edge(phrase, context, Integer.parseInt(countString.substring(2).trim())); +				Edge e = c.new Edge(phraseId, contextId,  +						Integer.parseInt(countString.substring(2).trim()));  				c.edges.add(e); -				// index the edge for fast phrase lookup -				List<Edge> edges = c.phraseToContext.get(phrase); -				if (edges == null) -				{ -					edges = new ArrayList<Edge>(); -					c.phraseToContext.put(phrase, edges); -				} -				edges.add(e); -				 -				// index the edge for fast context lookup -				edges = c.contextToPhrase.get(context); -				if (edges == null) -				{ -					edges = new ArrayList<Edge>(); -					c.contextToPhrase.put(context, edges); -				} -				edges.add(e); +				// index the edge for fast phrase, context lookup +				c.phraseToContext.get(phraseId).add(e); +				c.contextToPhrase.get(contextId).add(e);  			}  		} | 
