summary | refs | log | tree | commit | diff
path: root/gi/posterior-regularisation/prjava/src/phrase
diff options
context:
space:
mode:
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-17 03:13:09 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-17 03:13:09 +0000
commit: 879536d8e7bec518af716d4ee425fcd0eda36937 (patch)
tree: c63814f3bb6826099018885ed808b93b5aa900f1 /gi/posterior-regularisation/prjava/src/phrase
parent: ceeac641b346ed462b802e2fee9091a6c0eb0dbb (diff)
Extra options
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@305 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation/prjava/src/phrase')
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/Corpus.java | 82
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java | 2
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java | 2
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/Trainer.java | 10
4 files changed, 85 insertions, 11 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
index f2c6b132..2afc18dc 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
@@ -18,7 +18,9 @@ public class Corpus
public int splitSentinel;
public int phraseSentinel;
public int rareSentinel;
- private boolean[] wordIsRare;
+ private boolean[] rareWords;
+ private boolean[] rarePhrases;
+ private boolean[] rareContexts;
public Corpus()
{
@@ -62,7 +64,8 @@ public class Corpus
public TIntArrayList getRawContext()
{
return Corpus.this.getRawContext(contextId);
- } public String getContextString(boolean insertPhraseSentinel)
+ }
+ public String getContextString(boolean insertPhraseSentinel)
{
return Corpus.this.getContextString(contextId, insertPhraseSentinel);
}
@@ -143,12 +146,12 @@ public class Corpus
public TIntArrayList getPhrase(int phraseId)
{
TIntArrayList phrase = phraseLexicon.lookup(phraseId);
- if (wordIsRare != null)
+ if (rareWords != null)
{
boolean first = true;
for (int i = 0; i < phrase.size(); ++i)
{
- if (wordIsRare[phrase.get(i)])
+ if (rareWords[phrase.get(i)])
{
if (first)
{
@@ -182,12 +185,12 @@ public class Corpus
public TIntArrayList getContext(int contextId)
{
TIntArrayList context = contextLexicon.lookup(contextId);
- if (wordIsRare != null)
+ if (rareWords != null)
{
boolean first = true;
for (int i = 0; i < context.size(); ++i)
{
- if (wordIsRare[context.get(i)])
+ if (rareWords[context.get(i)])
{
if (first)
{
@@ -319,8 +322,71 @@ public class Corpus
counts[context.get(i)] += e.getCount();
}
- wordIsRare = new boolean[wordLexicon.size()];
+ int count = 0;
+ rareWords = new boolean[wordLexicon.size()];
for (int i = 0; i < wordLexicon.size(); ++i)
- wordIsRare[i] = counts[i] < wordThreshold;
+ {
+ rareWords[i] = counts[i] < wordThreshold;
+ if (rareWords[i])
+ count++;
+ }
+ System.err.println("There are " + count + " rare words");
+ }
+
+ public void applyPhraseThreshold(int threshold)
+ {
+ rarePhrases = new boolean[phraseLexicon.size()];
+
+ int n = 0;
+ for (int i = 0; i < phraseLexicon.size(); ++i)
+ {
+ List<Edge> contexts = phraseToContext.get(i);
+ int count = 0;
+ for (Edge edge: contexts)
+ {
+ count += edge.getCount();
+ if (count >= threshold)
+ break;
+ }
+
+ if (count < threshold)
+ {
+ rarePhrases[i] = true;
+ n++;
+ }
+ }
+ System.err.println("There are " + n + " rare phrases");
+ }
+
+ public void applyContextThreshold(int threshold)
+ {
+ rareContexts = new boolean[contextLexicon.size()];
+
+ int n = 0;
+ for (int i = 0; i < contextLexicon.size(); ++i)
+ {
+ List<Edge> phrases = contextToPhrase.get(i);
+ int count = 0;
+ for (Edge edge: phrases)
+ {
+ count += edge.getCount();
+ if (count >= threshold)
+ break;
+ }
+
+ if (count < threshold)
+ {
+ rareContexts[i] = true;
+ n++;
+ }
+ }
+ System.err.println("There are " + n + " rare contexts");
+ }
+
+ boolean isRare(Edge edge)
+ {
+ if (rarePhrases != null && rarePhrases[edge.getPhraseId()] == true) return true;
+ if (rareContexts != null && rareContexts[edge.getContextId()] == true) return true;
+ return false;
}
}
\ No newline at end of file
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
index feab5eda..9ee766d4 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
@@ -103,7 +103,7 @@ public class PhraseCluster {
for (int ctx=0; ctx<contexts.size(); ctx++)
{
Edge edge = contexts.get(ctx);
- if (edge.getCount() < edge_threshold)
+ if (edge.getCount() < edge_threshold || c.isRare(edge))
continue;
double p[]=posterior(edge);
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
index 903e47c8..0cf31c1c 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
@@ -74,7 +74,7 @@ public class PhraseCorpus
}
try{
- r.close();
+ r.close();
}catch(IOException ioe){
ioe.printStackTrace();
}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
index ed7a6bbe..d1322c26 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
@@ -36,6 +36,8 @@ public class Trainer
parser.accepts("skip-large-phrases").withRequiredArg().ofType(Integer.class).defaultsTo(5);
parser.accepts("rare-word").withRequiredArg().ofType(Integer.class).defaultsTo(0);
parser.accepts("rare-edge").withRequiredArg().ofType(Integer.class).defaultsTo(0);
+ parser.accepts("rare-phrase").withRequiredArg().ofType(Integer.class).defaultsTo(0);
+ parser.accepts("rare-context").withRequiredArg().ofType(Integer.class).defaultsTo(0);
OptionSet options = parser.parse(args);
if (options.has("help") || !options.has("in"))
@@ -61,6 +63,8 @@ public class Trainer
int skip = (Integer) options.valueOf("skip-large-phrases");
int wordThreshold = (Integer) options.valueOf("rare-word");
int edgeThreshold = (Integer) options.valueOf("rare-edge");
+ int phraseThreshold = (Integer) options.valueOf("rare-phrase");
+ int contextThreshold = (Integer) options.valueOf("rare-context");
if (options.has("seed"))
F.rng = new Random((Long) options.valueOf("seed"));
@@ -83,8 +87,12 @@ public class Trainer
System.exit(1);
}
- if (wordThreshold > 0)
+ if (wordThreshold > 1)
corpus.applyWordThreshold(wordThreshold);
+ if (phraseThreshold > 1)
+ corpus.applyPhraseThreshold(phraseThreshold);
+ if (contextThreshold > 1)
+ corpus.applyContextThreshold(contextThreshold);
if (!options.has("agree"))
System.out.println("Running with " + tags + " tags " +