summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
commit6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
parentb510da2e562c695c90d565eb295c749569c59be8 (diff)
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
merge upstream/master
Diffstat (limited to 'gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java')
-rw-r--r--gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java193
1 files changed, 0 insertions, 193 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
deleted file mode 100644
index 0cf31c1c..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
+++ /dev/null
@@ -1,193 +0,0 @@
-package phrase;
-
-import io.FileUtil;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Scanner;
-
-public class PhraseCorpus
-{
- public HashMap<String,Integer>wordLex;
- public HashMap<String,Integer>phraseLex;
-
- public String wordList[];
- public String phraseList[];
-
- //data[phrase][num context][position]
- public int data[][][];
- public int numContexts;
-
- public PhraseCorpus(String filename) throws FileNotFoundException, IOException
- {
- BufferedReader r = FileUtil.reader(new File(filename));
-
- phraseLex=new HashMap<String,Integer>();
- wordLex=new HashMap<String,Integer>();
-
- ArrayList<int[][]>dataList=new ArrayList<int[][]>();
- String line=null;
- numContexts = 0;
-
- while((line=readLine(r))!=null){
-
- String toks[]=line.split("\t");
- String phrase=toks[0];
- addLex(phrase,phraseLex);
-
- toks=toks[1].split(" \\|\\|\\| ");
-
- ArrayList <int[]>ctxList=new ArrayList<int[]>();
-
- for(int i=0;i<toks.length;i+=2){
- String ctx=toks[i];
- String words[]=ctx.split(" ");
- if (numContexts == 0)
- numContexts = words.length - 1;
- else
- assert numContexts == words.length - 1;
-
- int []context=new int [numContexts+1];
- int idx=0;
- for(String word:words){
- if(word.equals("<PHRASE>")){
- continue;
- }
- addLex(word,wordLex);
- context[idx]=wordLex.get(word);
- idx++;
- }
-
- String count=toks[i+1];
- context[idx]=Integer.parseInt(count.trim().substring(2));
-
- ctxList.add(context);
- }
-
- dataList.add(ctxList.toArray(new int [0][]));
-
- }
- try{
- r.close();
- }catch(IOException ioe){
- ioe.printStackTrace();
- }
- data=dataList.toArray(new int[0][][]);
- }
-
- private void addLex(String key, HashMap<String,Integer>lex){
- Integer i=lex.get(key);
- if(i==null){
- lex.put(key, lex.size());
- }
- }
-
- //for debugging
- public void saveLex(String lexFilename) throws FileNotFoundException, IOException
- {
- PrintStream ps = FileUtil.printstream(new File(lexFilename));
- ps.println("Phrase Lexicon");
- ps.println(phraseLex.size());
- printDict(phraseLex,ps);
-
- ps.println("Word Lexicon");
- ps.println(wordLex.size());
- printDict(wordLex,ps);
- ps.close();
- }
-
- private static void printDict(HashMap<String,Integer>lex,PrintStream ps){
- String []dict=buildList(lex);
- for(int i=0;i<dict.length;i++){
- ps.println(dict[i]);
- }
- }
-
- public void loadLex(String lexFilename){
- Scanner sc=io.FileUtil.openInFile(lexFilename);
-
- sc.nextLine();
- int size=sc.nextInt();
- sc.nextLine();
- String[]dict=new String[size];
- for(int i=0;i<size;i++){
- dict[i]=sc.nextLine();
- }
- phraseLex=buildMap(dict);
-
- sc.nextLine();
- size=sc.nextInt();
- sc.nextLine();
- dict=new String[size];
- for(int i=0;i<size;i++){
- dict[i]=sc.nextLine();
- }
- wordLex=buildMap(dict);
- sc.close();
- }
-
- private HashMap<String, Integer> buildMap(String[]dict){
- HashMap<String,Integer> map=new HashMap<String,Integer>();
- for(int i=0;i<dict.length;i++){
- map.put(dict[i], i);
- }
- return map;
- }
-
- public void buildList(){
- if(wordList==null){
- wordList=buildList(wordLex);
- phraseList=buildList(phraseLex);
- }
- }
-
- private static String[]buildList(HashMap<String,Integer>lex){
- String dict[]=new String [lex.size()];
- for(String key:lex.keySet()){
- dict[lex.get(key)]=key;
- }
- return dict;
- }
-
- public String getContextString(int context[], boolean addPhraseMarker)
- {
- StringBuffer b = new StringBuffer();
- for (int i=0;i<context.length-1;i++)
- {
- if (b.length() > 0)
- b.append(" ");
-
- if (i == context.length/2)
- b.append("<PHRASE> ");
-
- b.append(wordList[context[i]]);
- }
- return b.toString();
- }
-
- public static String readLine(BufferedReader r){
- try{
- return r.readLine();
- }
- catch(IOException ioe){
- ioe.printStackTrace();
- }
- return null;
- }
-
- public static void main(String[] args) throws Exception
- {
- String LEX_FILENAME="../pdata/lex.out";
- String DATA_FILENAME="../pdata/btec.con";
- PhraseCorpus c=new PhraseCorpus(DATA_FILENAME);
- c.saveLex(LEX_FILENAME);
- c.loadLex(LEX_FILENAME);
- c.saveLex(LEX_FILENAME);
- }
-}