package phrase; import io.FileUtil; import joptsimple.OptionParser; import joptsimple.OptionSet; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.Random; import arr.F; public class Trainer { public static void main(String[] args) { OptionParser parser = new OptionParser(); parser.accepts("help"); parser.accepts("in").withRequiredArg().ofType(File.class); parser.accepts("out").withRequiredArg().ofType(File.class); parser.accepts("parameters").withRequiredArg().ofType(File.class); parser.accepts("topics").withRequiredArg().ofType(Integer.class).defaultsTo(5); parser.accepts("em-iterations").withRequiredArg().ofType(Integer.class).defaultsTo(5); parser.accepts("pr-iterations").withRequiredArg().ofType(Integer.class).defaultsTo(0); parser.accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(0); parser.accepts("scale-phrase").withRequiredArg().ofType(Double.class).defaultsTo(5.0); parser.accepts("scale-context").withRequiredArg().ofType(Double.class).defaultsTo(0.0); parser.accepts("seed").withRequiredArg().ofType(Long.class).defaultsTo(0l); parser.accepts("convergence-threshold").withRequiredArg().ofType(Double.class).defaultsTo(1e-6); parser.accepts("variational-bayes"); parser.accepts("alpha-emit").withRequiredArg().ofType(Double.class).defaultsTo(0.1); parser.accepts("alpha-pi").withRequiredArg().ofType(Double.class).defaultsTo(0.01); parser.accepts("agree"); parser.accepts("no-parameter-cache"); parser.accepts("skip-large-phrases").withRequiredArg().ofType(Integer.class).defaultsTo(5); OptionSet options = parser.parse(args); if (options.has("help") || !options.has("in")) { try { parser.printHelpOn(System.err); } catch (IOException e) { System.err.println("This should never happen."); e.printStackTrace(); } System.exit(1); } int tags = (Integer) options.valueOf("topics"); int em_iterations = (Integer) options.valueOf("em-iterations"); int pr_iterations = (Integer) options.valueOf("pr-iterations"); double scale_phrase = (Double) options.valueOf("scale-phrase"); double scale_context = (Double) options.valueOf("scale-context"); int threads = (Integer) options.valueOf("threads"); double threshold = (Double) options.valueOf("convergence-threshold"); boolean vb = options.has("variational-bayes"); double alphaEmit = (vb) ? (Double) options.valueOf("alpha-emit") : 0; double alphaPi = (vb) ? (Double) options.valueOf("alpha-pi") : 0; int skip = (Integer) options.valueOf("skip-large-phrases"); if (options.has("seed")) F.rng = new Random((Long) options.valueOf("seed")); if (tags <= 1 || scale_phrase < 0 || scale_context < 0 || threshold < 0) { System.err.println("Invalid arguments. Try again!"); System.exit(1); } Corpus corpus = null; File infile = (File) options.valueOf("in"); try { System.out.println("Reading concordance from " + infile); corpus = Corpus.readFromFile(FileUtil.reader(infile)); corpus.printStats(System.out); } catch (IOException e) { System.err.println("Failed to open input file: " + infile); e.printStackTrace(); System.exit(1); } if (!options.has("agree")) System.out.println("Running with " + tags + " tags " + "for " + em_iterations + " EM and " + pr_iterations + " PR iterations " + "skipping large phrases for first " + skip + " iterations " + "with scale " + scale_phrase + " phrase and " + scale_context + " context " + "and " + threads + " threads"); else System.out.println("Running agreement model with " + tags + " tags " + "for " + em_iterations); System.out.println(); PhraseCluster cluster = null; Agree agree = null; if (options.has("agree")) agree = new Agree(tags, corpus); else { cluster = new PhraseCluster(tags, corpus); if (threads > 0) cluster.useThreadPool(threads); if (vb) cluster.initialiseVB(alphaEmit, alphaPi); if (options.has("no-parameter-cache")) cluster.cacheLambda = false; } double last = 0; for (int i=0; i