summaryrefslogtreecommitdiff
path: root/report/pyp_clustering/acl09-short/code/pygibbs3.c
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-27 16:13:19 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-27 16:13:19 +0000
commitfd519b0e45c857b266814994ba8c1421f508e522 (patch)
tree6d50c9b954e3c13e9df627c1ecc25c53544a5f58 /report/pyp_clustering/acl09-short/code/pygibbs3.c
parent4c5df460c9da5c935438850ef7993463a9113286 (diff)
preso
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@435 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'report/pyp_clustering/acl09-short/code/pygibbs3.c')
-rw-r--r--report/pyp_clustering/acl09-short/code/pygibbs3.c198
1 files changed, 198 insertions, 0 deletions
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs3.c b/report/pyp_clustering/acl09-short/code/pygibbs3.c
new file mode 100644
index 00000000..3c2240a1
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/pygibbs3.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include <math.h>
+
+#define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
+
+#define W 30114
+#define N 831190
+#define KWMAX 1000
+
+#define NLOOPS 1000
+#define BURNIN 0
+#define SAMPLEFREQ 1
+
+#define ALPHA 0.0 // PYB a
+//#define GAMMA 1000000000.0
+#define GAMMA .01 // Dirichlet over multinomial P0
+
+double BETA; // CRP alpha (PYB b)
+int w[N], z[N]; // words, table assignments
+int typecount[W], typetot; //# of tables of each type, total # tables
+int usedcount[W];
+double ztot[W][KWMAX];
+double k; // total # tables
+int nactive;
+
+void initialise(void);
+void anderson(void);
+void fileread(void);
+
+void initialise(void)
+{
+ int i,j;
+
+ for (i = 1; i < W; i++) {
+ typecount[i] = 0;
+ usedcount[i] = 0;
+ for (j = 0; j < KWMAX; j++) {
+ ztot[i][j] = 0;
+ }
+ }
+
+}
+
+void anderson(void) //stochastic Anderson-style initialisation
+{
+ int i,j, tag;
+ double max, totprob, r, runtot;
+ double probs[KWMAX];
+ int ind, temp;
+
+ ztot[w[0]][0] = 1;
+ z[0] = 0;
+ typecount[w[0]] = 1;
+ usedcount[w[0]] = 1;
+ k = 1;
+ typetot = 1;
+
+ for (i = 1; i < N; i++) {
+ // printf("%5d\n", w[i]);
+ max = 0; tag = 0; totprob = 0;
+ for (j = 0; j < usedcount[w[i]]; j++) {
+ probs[j] = ztot[w[i]][j] - ALPHA;
+ totprob += probs[j];
+ }
+ probs[usedcount[w[i]]] = (ALPHA*k+BETA)*((double) typecount[w[i]]+GAMMA)/((double) typetot+W*GAMMA);
+ totprob += probs[usedcount[w[i]]];
+ // printf("%10.6lf\n",totprob);
+ r = myrand()*totprob;
+ max = probs[0];
+ j = 0;
+ while (r>max) {
+ j++;
+ max += probs[j];
+ }
+ // printf("%5d\n",j);
+ z[i] = j;
+ ztot[w[i]][j]++;
+ if (ztot[w[i]][j]==1) {
+ typecount[w[i]]++;
+ usedcount[w[i]]++;
+ if (usedcount[w[i]]==KWMAX) {
+ printf("Maximum number of tables exceeded!!!\n");
+ }
+ typetot++;
+ k++;
+ }
+ }
+}
+
+void fileread(void)
+{
+ int i,j, wt;
+ FILE *fileptr;
+
+ fileptr = fopen("wsj.dat", "r");
+
+ for (i = 1; i < N; i++) {
+ fscanf(fileptr, "%d", &wt);
+ w[i] = wt-1;
+ z[i] = 0;
+ }
+ printf("Total cases: %10d\n", N);
+ fclose(fileptr);
+}
+
+main(int argc, char* argv[])
+{
+ int i,j,loop,run;
+ int temp,ind, tag;
+ double newprob, WBETA;
+ double probs[KWMAX];
+ double max, totprob, r;
+ int sampcount;
+ FILE *fileptr;
+ char filename[30];
+ double score;
+
+ if (argc < 2) {
+ printf("Please provide a value of b\n");
+ exit(0);
+ }
+ BETA = strtol(argv[1]);
+ printf("Basic initialising...\n");
+
+ // you can seed with any uint32, but the best are odds in 0..(2^32 - 1)
+ seedMT(4157U);
+
+ sprintf(filename,"typecountrecordwsjpeak%0.1f.%0.1f.dat",ALPHA,BETA);
+ fileptr = fopen(filename, "w");
+
+ printf("Reading from file...\n");
+ fileread();
+
+ printf("Initialising...\n");
+ initialise();
+ printf("k = %1.0f, typetot = %d\n",k,typetot);
+
+ printf("Finding start state...\n");
+ anderson();
+ printf("Beginning burnin...\n");
+ for (loop = 0; loop < NLOOPS; loop++) {
+ for (i = 0; i < N; i++) {
+ j = z[i];
+ ztot[w[i]][j]--;
+ if (ztot[w[i]][j] == 0) {
+ if (j==usedcount[w[i]]) {
+ usedcount[w[i]]--;
+ }
+ typecount[w[i]]--;
+ typetot--;
+ k--;
+ }
+ max = 0; tag = 0; totprob = 0;
+ for (j = 0; j <= usedcount[w[i]]; j++) {
+ if (ztot[w[i]][j] > 0) {
+ probs[j] = ztot[w[i]][j] - ALPHA;
+ } else {
+ probs[j] = 0;
+ if (tag == 0) {
+ probs[j] = (ALPHA*k+BETA)*(((double) typecount[w[i]])+GAMMA)/(((double) typetot)+((double) W)*GAMMA);
+ tag = 1;
+ }
+ }
+ totprob += probs[j];
+ }
+ r = myrand()*totprob;
+ max = probs[0];
+ j = 0;
+ while (r>max) {
+ j++;
+ max += probs[j];
+ }
+ z[i] = j;
+ ztot[w[i]][j]++;
+ if (ztot[w[i]][j]==1) {
+ if (j == usedcount[w[i]]) {
+ usedcount[w[i]]++;
+ if (usedcount[w[i]]==KWMAX) {
+ printf("Maximum number of tables exceeded!!!\n");
+ }
+ }
+ typecount[w[i]]++;
+ typetot++;
+ k++;
+ }
+ }
+ printf("Completed sample # %5d\n", loop);
+ if (k != typetot) printf("k = %1.0f, typetot = %d\n",k,typetot);
+ if (loop >= BURNIN && loop % SAMPLEFREQ == 0) {
+ for (i = 0; i < W; i++) {
+ fprintf(fileptr," %d", typecount[i]); //print (table?) count for each word type
+ }
+ fprintf(fileptr,"\n");
+ }
+ }
+ fclose(fileptr);
+}
+