diff options
Diffstat (limited to 'gi/posterior-regularisation')
4 files changed, 85 insertions, 11 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java index f2c6b132..2afc18dc 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java @@ -18,7 +18,9 @@ public class Corpus public int splitSentinel; public int phraseSentinel; public int rareSentinel; - private boolean[] wordIsRare; + private boolean[] rareWords; + private boolean[] rarePhrases; + private boolean[] rareContexts; public Corpus() { @@ -62,7 +64,8 @@ public class Corpus public TIntArrayList getRawContext() { return Corpus.this.getRawContext(contextId); - } public String getContextString(boolean insertPhraseSentinel) + } + public String getContextString(boolean insertPhraseSentinel) { return Corpus.this.getContextString(contextId, insertPhraseSentinel); } @@ -143,12 +146,12 @@ public class Corpus public TIntArrayList getPhrase(int phraseId) { TIntArrayList phrase = phraseLexicon.lookup(phraseId); - if (wordIsRare != null) + if (rareWords != null) { boolean first = true; for (int i = 0; i < phrase.size(); ++i) { - if (wordIsRare[phrase.get(i)]) + if (rareWords[phrase.get(i)]) { if (first) { @@ -182,12 +185,12 @@ public class Corpus public TIntArrayList getContext(int contextId) { TIntArrayList context = contextLexicon.lookup(contextId); - if (wordIsRare != null) + if (rareWords != null) { boolean first = true; for (int i = 0; i < context.size(); ++i) { - if (wordIsRare[context.get(i)]) + if (rareWords[context.get(i)]) { if (first) { @@ -319,8 +322,71 @@ public class Corpus counts[context.get(i)] += e.getCount(); } - wordIsRare = new boolean[wordLexicon.size()]; + int count = 0; + rareWords = new boolean[wordLexicon.size()]; for (int i = 0; i < wordLexicon.size(); ++i) - wordIsRare[i] = counts[i] < wordThreshold; + { + rareWords[i] = counts[i] < wordThreshold; + if (rareWords[i]) + count++; + } + System.err.println("There are " + count + " rare words"); + } + + public void applyPhraseThreshold(int threshold) + { + rarePhrases = new boolean[phraseLexicon.size()]; + + int n = 0; + for (int i = 0; i < phraseLexicon.size(); ++i) + { + List<Edge> contexts = phraseToContext.get(i); + int count = 0; + for (Edge edge: contexts) + { + count += edge.getCount(); + if (count >= threshold) + break; + } + + if (count < threshold) + { + rarePhrases[i] = true; + n++; + } + } + System.err.println("There are " + n + " rare phrases"); + } + + public void applyContextThreshold(int threshold) + { + rareContexts = new boolean[contextLexicon.size()]; + + int n = 0; + for (int i = 0; i < contextLexicon.size(); ++i) + { + List<Edge> phrases = contextToPhrase.get(i); + int count = 0; + for (Edge edge: phrases) + { + count += edge.getCount(); + if (count >= threshold) + break; + } + + if (count < threshold) + { + rareContexts[i] = true; + n++; + } + } + System.err.println("There are " + n + " rare contexts"); + } + + boolean isRare(Edge edge) + { + if (rarePhrases != null && rarePhrases[edge.getPhraseId()] == true) return true; + if (rareContexts != null && rareContexts[edge.getContextId()] == true) return true; + return false; } }
\ No newline at end of file diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java index feab5eda..9ee766d4 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java +++ b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java @@ -103,7 +103,7 @@ public class PhraseCluster { for (int ctx=0; ctx<contexts.size(); ctx++)
{
Edge edge = contexts.get(ctx);
- if (edge.getCount() < edge_threshold)
+ if (edge.getCount() < edge_threshold || c.isRare(edge))
continue;
double p[]=posterior(edge);
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java index 903e47c8..0cf31c1c 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java +++ b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java @@ -74,7 +74,7 @@ public class PhraseCorpus }
try{
- r.close();
+ r.close();
}catch(IOException ioe){
ioe.printStackTrace();
}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java index ed7a6bbe..d1322c26 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java @@ -36,6 +36,8 @@ public class Trainer parser.accepts("skip-large-phrases").withRequiredArg().ofType(Integer.class).defaultsTo(5); parser.accepts("rare-word").withRequiredArg().ofType(Integer.class).defaultsTo(0); parser.accepts("rare-edge").withRequiredArg().ofType(Integer.class).defaultsTo(0); + parser.accepts("rare-phrase").withRequiredArg().ofType(Integer.class).defaultsTo(0); + parser.accepts("rare-context").withRequiredArg().ofType(Integer.class).defaultsTo(0); OptionSet options = parser.parse(args); if (options.has("help") || !options.has("in")) @@ -61,6 +63,8 @@ public class Trainer int skip = (Integer) options.valueOf("skip-large-phrases"); int wordThreshold = (Integer) options.valueOf("rare-word"); int edgeThreshold = (Integer) options.valueOf("rare-edge"); + int phraseThreshold = (Integer) options.valueOf("rare-phrase"); + int contextThreshold = (Integer) options.valueOf("rare-context"); if (options.has("seed")) F.rng = new Random((Long) options.valueOf("seed")); @@ -83,8 +87,12 @@ public class Trainer System.exit(1); } - if (wordThreshold > 0) + if (wordThreshold > 1) corpus.applyWordThreshold(wordThreshold); + if (phraseThreshold > 1) + corpus.applyPhraseThreshold(phraseThreshold); + if (contextThreshold > 1) + corpus.applyContextThreshold(contextThreshold); if (!options.has("agree")) System.out.println("Running with " + tags + " tags " + |