From f21864661fbd245a464bfc9ab9e452c92e5d6ed1 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Sat, 17 Jul 2010 03:13:09 +0000 Subject: Extra options git-svn-id: https://ws10smt.googlecode.com/svn/trunk@305 ec762483-ff6d-05da-a07a-a48fb63a330f --- .../prjava/src/phrase/Corpus.java | 82 +++++++++++++++++++--- .../prjava/src/phrase/PhraseCluster.java | 2 +- .../prjava/src/phrase/PhraseCorpus.java | 2 +- .../prjava/src/phrase/Trainer.java | 10 ++- 4 files changed, 85 insertions(+), 11 deletions(-) (limited to 'gi/posterior-regularisation/prjava/src/phrase') diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java index f2c6b132..2afc18dc 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java @@ -18,7 +18,9 @@ public class Corpus public int splitSentinel; public int phraseSentinel; public int rareSentinel; - private boolean[] wordIsRare; + private boolean[] rareWords; + private boolean[] rarePhrases; + private boolean[] rareContexts; public Corpus() { @@ -62,7 +64,8 @@ public class Corpus public TIntArrayList getRawContext() { return Corpus.this.getRawContext(contextId); - } public String getContextString(boolean insertPhraseSentinel) + } + public String getContextString(boolean insertPhraseSentinel) { return Corpus.this.getContextString(contextId, insertPhraseSentinel); } @@ -143,12 +146,12 @@ public class Corpus public TIntArrayList getPhrase(int phraseId) { TIntArrayList phrase = phraseLexicon.lookup(phraseId); - if (wordIsRare != null) + if (rareWords != null) { boolean first = true; for (int i = 0; i < phrase.size(); ++i) { - if (wordIsRare[phrase.get(i)]) + if (rareWords[phrase.get(i)]) { if (first) { @@ -182,12 +185,12 @@ public class Corpus public TIntArrayList getContext(int contextId) { TIntArrayList context = contextLexicon.lookup(contextId); - if (wordIsRare != null) + if (rareWords != null) { boolean first = true; for (int i = 0; i < context.size(); ++i) { - if (wordIsRare[context.get(i)]) + if (rareWords[context.get(i)]) { if (first) { @@ -319,8 +322,71 @@ public class Corpus counts[context.get(i)] += e.getCount(); } - wordIsRare = new boolean[wordLexicon.size()]; + int count = 0; + rareWords = new boolean[wordLexicon.size()]; for (int i = 0; i < wordLexicon.size(); ++i) - wordIsRare[i] = counts[i] < wordThreshold; + { + rareWords[i] = counts[i] < wordThreshold; + if (rareWords[i]) + count++; + } + System.err.println("There are " + count + " rare words"); + } + + public void applyPhraseThreshold(int threshold) + { + rarePhrases = new boolean[phraseLexicon.size()]; + + int n = 0; + for (int i = 0; i < phraseLexicon.size(); ++i) + { + List contexts = phraseToContext.get(i); + int count = 0; + for (Edge edge: contexts) + { + count += edge.getCount(); + if (count >= threshold) + break; + } + + if (count < threshold) + { + rarePhrases[i] = true; + n++; + } + } + System.err.println("There are " + n + " rare phrases"); + } + + public void applyContextThreshold(int threshold) + { + rareContexts = new boolean[contextLexicon.size()]; + + int n = 0; + for (int i = 0; i < contextLexicon.size(); ++i) + { + List phrases = contextToPhrase.get(i); + int count = 0; + for (Edge edge: phrases) + { + count += edge.getCount(); + if (count >= threshold) + break; + } + + if (count < threshold) + { + rareContexts[i] = true; + n++; + } + } + System.err.println("There are " + n + " rare contexts"); + } + + boolean isRare(Edge edge) + { + if (rarePhrases != null && rarePhrases[edge.getPhraseId()] == true) return true; + if (rareContexts != null && rareContexts[edge.getContextId()] == true) return true; + return false; } } \ No newline at end of file diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java index feab5eda..9ee766d4 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java +++ b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java @@ -103,7 +103,7 @@ public class PhraseCluster { for (int ctx=0; ctx 0) + if (wordThreshold > 1) corpus.applyWordThreshold(wordThreshold); + if (phraseThreshold > 1) + corpus.applyPhraseThreshold(phraseThreshold); + if (contextThreshold > 1) + corpus.applyContextThreshold(contextThreshold); if (!options.has("agree")) System.out.println("Running with " + tags + " tags " + -- cgit v1.2.3