From 2cdc7dd166a91e2ca1fa8aeb0a0836eb2c80cf5c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 8 Jan 2014 18:33:06 +0100 Subject: version 0.2 --- COPYING.LESSER | 165 ++++ README | 97 +++ example/bin/1PUT | 0 example/bin/EXTRACTOR | 0 example/bin/MERT | 0 example/bin/MOSES | 0 example/bin/ZIN_HERE | 0 example/data/epmini-dev-A.de | 2 + example/data/epmini-dev-A.en | 2 + example/data/epmini-dev-B.de | 2 + example/data/epmini-dev-B.en | 2 + example/data/epmini-dev-C.de | 2 + example/data/epmini-dev-C.en | 2 + example/data/epmini-dev-D.de | 2 + example/data/epmini-dev-D.en | 2 + example/ini/moses.ini | 66 ++ example/mmert/inbetwmert.sh | 1 + example/mmert/mert-moses.pl | 1 + example/mmert/regmtl.py | 1 + example/models/epmini.mmap | Bin 0 -> 20326 bytes example/models/phrase-table | 387 +++++++++ example/models/reordering-table | 387 +++++++++ example/scripts/PUT_MOSES_SCRIPTS_IN_HERE | 0 inbetwmert.sh | 159 ++++ mert-moses.pl | 1346 +++++++++++++++++++++++++++++ regmtl.py | 203 +++++ 26 files changed, 2829 insertions(+) create mode 100644 COPYING.LESSER create mode 100644 README create mode 100644 example/bin/1PUT create mode 100644 example/bin/EXTRACTOR create mode 100644 example/bin/MERT create mode 100644 example/bin/MOSES create mode 100644 example/bin/ZIN_HERE create mode 100644 example/data/epmini-dev-A.de create mode 100644 example/data/epmini-dev-A.en create mode 100644 example/data/epmini-dev-B.de create mode 100644 example/data/epmini-dev-B.en create mode 100644 example/data/epmini-dev-C.de create mode 100644 example/data/epmini-dev-C.en create mode 100644 example/data/epmini-dev-D.de create mode 100644 example/data/epmini-dev-D.en create mode 100644 example/ini/moses.ini create mode 120000 example/mmert/inbetwmert.sh create mode 120000 example/mmert/mert-moses.pl create mode 120000 example/mmert/regmtl.py create mode 100644 example/models/epmini.mmap create mode 100644 example/models/phrase-table create mode 100644 example/models/reordering-table 
create mode 100644 example/scripts/PUT_MOSES_SCRIPTS_IN_HERE create mode 100755 inbetwmert.sh create mode 100755 mert-moses.pl create mode 100755 regmtl.py diff --git a/COPYING.LESSER b/COPYING.LESSER new file mode 100644 index 0000000..65c5ca8 --- /dev/null +++ b/COPYING.LESSER @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/README b/README new file mode 100644 index 0000000..ce08232 --- /dev/null +++ b/README @@ -0,0 +1,97 @@ +Multi-Task Minimum Error Rate Training +====================================== + +This set of scripts is essentially a wrapper around the mert-moses.pl script which +is distributed with moses (http://statmt.org/moses). In its current implementation +this is used for running several MERT runs in parallel and altering resulting weights +after each run. It also manages convergence. + +With MMERT we transfer the basic idea of regularized multi-task learning to MERT. That is, +in each iteration we run a separate instance of MERT for each task, and then regularize +the returned weight vectors towards the average weight vector of the previous iteration +by adding or subtracting the regularization parameter lambda. As simply adding or subtracting +lambda is only an approximation of regularization, we clip the new value at the average, +if the average would be crossed otherwise. + +See the "Multi-Task Minimum Error Rate Training for SMT" (Simianer et al. 2011) paper for further details. + +This was tested with r4106 of moses trunk. +NOTE: v0.1 worked only up to r4065, because the mert-moses.pl script produced incompatible *.init.opt files. + +Usage +===== +An exemplary experiment installation can be found in the example/ directory (4 models, +built from the first 8 sentences of europarl v6 de-en, http://statmt.org/europarl/): + +example/ + bin/ + 3 binaries from a moses build: extractor, mert and moses + data/ + Put target/source language data (development sets) in here. + Use a common file naming scheme (this has to be defined in inbetwmert.sh). 
+ ini/ + One or several configuration files (ini) for use with moses, these can be defined per 'task' or 'one-fits-all' + This is useful if you want to start MERT with different initial weights for each run + or different model files. + mmert/ + 3 scripts: inbetwmert.sh, regmtl.py and mert-moses.pl + In the default configuration, work directories are created here. + models/ + Language model, phrase table and reordering model files. Path has to be defined + in the ini(s) in ini/. + scripts/ + The scripts distributed with moses (needed by mert, http://www.statmt.org/moses/?n=Moses.SupportTools). + +Note: The phrase and reordering table(s) should be filtered against +each dev set to fit into memory (see http://www.statmt.org/moses/?n=Moses.SupportTools#ntoc3). +The language model can be read from disk using kenlm models (the one distributed +in this tarball is kenlm v4). + +After putting moses binaries into bin/ and moses scripts in scripts/ the experiment can be run with + $ cd ~/mmert/mmert/;./inbetwmert.sh SUFFIX +Assuming you extracted mmert in your home directory and adjusted the username in +the paths in the ini/moses.ini file(!). +SUFFIX is used for creating working directories. + +After convergence (max change in the average vector is lower than the MIN_CHANGE parameter), +you can find the final weight vectors in the following files: + * Average: mmert_SUFFIX/runX.avector.txt (X is the last iteration) + * Individual: mert_SUFFIX_TASK/runX.mert.log in the line 'Best point:' +The weights in the example are ordered as follows: + d d d d d d d lm w tm tm tm tm tm +This can be different in your installation, if you use more or fewer models. +The order can be found in the stdout output of MERT (e.g. mmert_SUFFIX/mert.TASK.out). 
+ +inbetwmert.sh +------------- +$FIRST_AVG: 'first average' to clip against + 0: 0-vector + 1: average of start weights (put the vector in a file named run0.avector.txt into mmert_SUFFIX) +$INI or $INIS + Define to use one or several moses configuration files. + This can be used for using several model files, using different initial weights etc. +$LAMBDA + Regularization parameter, useful values: 0.1 .. 0.05 .. 0.0000001 +$MIN_CHANGE + Stopping criterion: minimum change in average vector. Useful values: 0.01 .. 0.00001. + Everything above ~0.2 leads to convergence after 1 iteration because + MERT normalizes the weight vector. +More parameters are documented in inbetwmert.sh itself. + +regmtl.py +--------- +Does actual regularization/clipping. Reads and writes current mert.log (the mert binary reads/writes +weights in there). + +mert-moses.pl +------------- +Changes/hacks: + * the outer 'while 1' loop (line 613 to 810) was commented out + * normalization was fixed (division by 0 if we start with 0 weights in a moses.ini) + + +Version History +=============== +0.1 initial release +0.2 made code more readable, updated mert-moses.pl script to current trunk r4106 + diff --git a/example/bin/1PUT b/example/bin/1PUT new file mode 100644 index 0000000..e69de29 diff --git a/example/bin/EXTRACTOR b/example/bin/EXTRACTOR new file mode 100644 index 0000000..e69de29 diff --git a/example/bin/MERT b/example/bin/MERT new file mode 100644 index 0000000..e69de29 diff --git a/example/bin/MOSES b/example/bin/MOSES new file mode 100644 index 0000000..e69de29 diff --git a/example/bin/ZIN_HERE b/example/bin/ZIN_HERE new file mode 100644 index 0000000..e69de29 diff --git a/example/data/epmini-dev-A.de b/example/data/epmini-dev-A.de new file mode 100644 index 0000000..c1b8a30 --- /dev/null +++ b/example/data/epmini-dev-A.de @@ -0,0 +1,2 @@ +wiederaufnahme der sitzungsperiode +ich erkläre die am freitag , dem 17. 
dezember unterbrochene sitzungsperiode des europäischen parlaments für wiederaufgenommen , wünsche ihnen nochmals alles gute zum jahreswechsel und hoffe , daß sie schöne ferien hatten . diff --git a/example/data/epmini-dev-A.en b/example/data/epmini-dev-A.en new file mode 100644 index 0000000..df4c88f --- /dev/null +++ b/example/data/epmini-dev-A.en @@ -0,0 +1,2 @@ +resumption of the session +i declare resumed the session of the european parliament adjourned on friday 17 december 1999 , and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period . diff --git a/example/data/epmini-dev-B.de b/example/data/epmini-dev-B.de new file mode 100644 index 0000000..4e84e2a --- /dev/null +++ b/example/data/epmini-dev-B.de @@ -0,0 +1,2 @@ +wie sie feststellen konnten , ist der gefürchtete " millenium-bug " nicht eingetreten . doch sind bürger einiger unserer mitgliedstaaten opfer von schrecklichen naturkatastrophen geworden . +im parlament besteht der wunsch nach einer aussprache im verlauf dieser sitzungsperiode in den nächsten tagen . diff --git a/example/data/epmini-dev-B.en b/example/data/epmini-dev-B.en new file mode 100644 index 0000000..1980371 --- /dev/null +++ b/example/data/epmini-dev-B.en @@ -0,0 +1,2 @@ +although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful . +you have requested a debate on this subject in the course of the next few days , during this part-session . diff --git a/example/data/epmini-dev-C.de b/example/data/epmini-dev-C.de new file mode 100644 index 0000000..c32923f --- /dev/null +++ b/example/data/epmini-dev-C.de @@ -0,0 +1,2 @@ +heute möchte ich sie bitten - das ist auch der wunsch einiger kolleginnen und kollegen - , allen opfern der stürme , insbesondere in den verschiedenen ländern der europäischen union , in einer schweigeminute zu gedenken . 
+ich bitte sie , sich zu einer schweigeminute zu erheben . diff --git a/example/data/epmini-dev-C.en b/example/data/epmini-dev-C.en new file mode 100644 index 0000000..02874bf --- /dev/null +++ b/example/data/epmini-dev-C.en @@ -0,0 +1,2 @@ +in the meantime , i should like to observe a minute ' s silence , as a number of members have requested , on behalf of all the victims concerned , particularly those of the terrible storms , in the various countries of the european union . +please rise , then , for this minute ' s silence . diff --git a/example/data/epmini-dev-D.de b/example/data/epmini-dev-D.de new file mode 100644 index 0000000..e867aa2 --- /dev/null +++ b/example/data/epmini-dev-D.de @@ -0,0 +1,2 @@ +( das parlament erhebt sich zu einer schweigeminute . ) +frau präsidentin , zur geschäftsordnung . diff --git a/example/data/epmini-dev-D.en b/example/data/epmini-dev-D.en new file mode 100644 index 0000000..0f8016b --- /dev/null +++ b/example/data/epmini-dev-D.en @@ -0,0 +1,2 @@ +( the house rose and observed a minute ' s silence ) +madam president , on a point of order . 
diff --git a/example/ini/moses.ini b/example/ini/moses.ini new file mode 100644 index 0000000..102179a --- /dev/null +++ b/example/ini/moses.ini @@ -0,0 +1,66 @@ +######################### +### MOSES CONFIG FILE ### +######################### + +# input factors +[input-factors] +0 + +# mapping steps +[mapping] +0 T 0 + +# translation tables: table type (hierarchical(0), textual (0), binary (1)), source-factors, target-factors, number of scores, file +# OLD FORMAT is still handled for back-compatibility +# OLD FORMAT translation tables: source-factors, target-factors, number of scores, file +# OLD FORMAT a binary table type (1) is assumed +[ttable-file] +0 0 0 5 /home/mitarb/simianer/mmert/example/models/phrase-table + +# no generation models, no generation-file section + +# language models: type(srilm/irstlm), factors, order, file +[lmodel-file] +8 0 5 /home/mitarb/simianer/mmert/example/models/epmini.mmap + +# limit on how many phrase translations e for each phrase f are loaded +# 0 = all elements loaded +[ttable-limit] +20 + +# distortion (reordering) files +[distortion-file] +0-0 wbe-msd-bidirectional-fe-allff 6 /home/mitarb/simianer/mmert/example/models/reordering-table + +# distortion (reordering) weight +[weight-d] +0.3 +0.3 +0.3 +0.3 +0.3 +0.3 +0.3 + +# language model weights +[weight-l] +0.5000 + + +# translation model weights +[weight-t] +0.2 +0.2 +0.2 +0.2 +0.2 + +# no generation models, no weight-generation section + +# word penalty +[weight-w] +-1 + +[distortion-limit] +6 + diff --git a/example/mmert/inbetwmert.sh b/example/mmert/inbetwmert.sh new file mode 120000 index 0000000..7060cd8 --- /dev/null +++ b/example/mmert/inbetwmert.sh @@ -0,0 +1 @@ +../../inbetwmert.sh \ No newline at end of file diff --git a/example/mmert/mert-moses.pl b/example/mmert/mert-moses.pl new file mode 120000 index 0000000..efd4a3f --- /dev/null +++ b/example/mmert/mert-moses.pl @@ -0,0 +1 @@ +../../mert-moses.pl \ No newline at end of file diff --git a/example/mmert/regmtl.py 
b/example/mmert/regmtl.py new file mode 120000 index 0000000..cf6ef6c --- /dev/null +++ b/example/mmert/regmtl.py @@ -0,0 +1 @@ +../../regmtl.py \ No newline at end of file diff --git a/example/models/epmini.mmap b/example/models/epmini.mmap new file mode 100644 index 0000000..49a021d Binary files /dev/null and b/example/models/epmini.mmap differ diff --git a/example/models/phrase-table b/example/models/phrase-table new file mode 100644 index 0000000..c5909e5 --- /dev/null +++ b/example/models/phrase-table @@ -0,0 +1,387 @@ +" ||| ' failed to ||| 1 0.2 0.125 0.000236294 2.718 ||| ||| 1 8 +" ||| ' failed ||| 1 0.2 0.125 0.0108696 2.718 ||| ||| 1 8 +" ||| ' ||| 1 0.2 0.125 0.5 2.718 ||| ||| 1 8 +" ||| bug ' failed to ||| 1 0.2 0.125 5.13682e-06 2.718 ||| ||| 1 8 +" ||| bug ' failed ||| 1 0.2 0.125 0.000236294 2.718 ||| ||| 1 8 +" ||| bug ' ||| 1 0.2 0.125 0.0108696 2.718 ||| ||| 1 8 +" ||| were dreadful ||| 1 1 0.125 0.0108696 2.718 ||| ||| 1 8 +" ||| were ||| 1 1 0.125 0.5 2.718 ||| ||| 1 8 +( das parlament erhebt sich ||| ( the house rose and observed ||| 1 0.142857 1 0.0277778 2.718 ||| ||| 1 1 +( das parlament ||| ( the house rose ||| 1 0.571429 1 0.111111 2.718 ||| ||| 1 1 +( ||| ( ||| 1 1 1 1 2.718 ||| ||| 1 1 +) ||| ) ||| 1 1 1 1 2.718 ||| ||| 1 1 +, daß ||| pleasant festive period ||| 0.5 0.15 1 0.037037 2.718 ||| ||| 2 1 +, insbesondere ||| behalf of ||| 0.5 0.15 0.25 0.0434783 2.718 ||| ||| 2 4 +, insbesondere ||| behalf ||| 0.5 0.15 0.25 1 2.718 ||| ||| 2 4 +, insbesondere ||| on behalf of ||| 0.5 0.15 0.25 0.00189036 2.718 ||| ||| 2 4 +, insbesondere ||| on behalf ||| 0.5 0.15 0.25 0.0434783 2.718 ||| ||| 2 4 +, ist der ||| a number of countries ||| 0.5 0.0675 0.333333 0.0193237 2.718 ||| ||| 2 3 +, ist der ||| in a number of countries ||| 0.5 0.0675 0.333333 0.000420079 2.718 ||| ||| 2 3 +, ist der ||| number of countries ||| 0.5 0.0675 0.333333 0.222222 2.718 ||| ||| 2 3 +, sich zu einer schweigeminute zu ||| , for this minute ' s silence ||| 1 
0.000649038 1 3.17028e-06 2.718 ||| ||| 1 1 +, sich zu ||| , for this minute ||| 1 0.00923077 1 0.000126024 2.718 ||| ||| 1 1 +, sich ||| , for this ||| 0.5 0.0153846 0.333333 0.000210039 2.718 ||| ||| 2 3 +, sich ||| , for ||| 0.5 0.0153846 0.333333 0.00966182 2.718 ||| ||| 2 3 +, sich ||| , ||| 0.0769231 0.0153846 0.333333 0.444444 2.718 ||| ||| 13 3 +, zur geschäftsordnung . ||| madam president , on a point ||| 1 0.17094 1 0.00301933 2.718 ||| ||| 1 1 +, ||| , as a number ||| 1 0.307692 0.0666667 1.82643e-05 2.718 ||| ||| 1 15 +, ||| , as a ||| 1 0.307692 0.0666667 0.000840158 2.718 ||| ||| 1 15 +, ||| , as ||| 1 0.307692 0.0666667 0.00966182 2.718 ||| ||| 1 15 +, ||| , for this ||| 0.5 0.307692 0.0666667 0.000210039 2.718 ||| ||| 2 15 +, ||| , for ||| 0.5 0.307692 0.0666667 0.00966182 2.718 ||| ||| 2 15 +, ||| , on ||| 0.5 0.307692 0.0666667 0.0193237 2.718 ||| ||| 2 15 +, ||| , ||| 0.307692 0.307692 0.266667 0.444444 2.718 ||| ||| 13 15 +, ||| . ||| 0.166667 0.166667 0.0666667 0.111111 2.718 ||| ||| 6 15 +, ||| again to wish ||| 1 0.333333 0.0666667 5.25098e-05 2.718 ||| ||| 1 15 +, ||| again to ||| 1 0.333333 0.0666667 0.00241546 2.718 ||| ||| 1 15 +, ||| to wish ||| 1 0.333333 0.0666667 0.00241546 2.718 ||| ||| 1 15 +, ||| to ||| 0.5 0.333333 0.0666667 0.111111 2.718 ||| ||| 2 15 +- ||| in the various ||| 1 0.2 0.25 0.00118147 2.718 ||| ||| 1 4 +- ||| in the ||| 0.25 0.2 0.25 0.0543479 2.718 ||| ||| 4 4 +- ||| in ||| 0.125 0.2 0.25 0.5 2.718 ||| ||| 8 4 +- ||| particularly ||| 0.5 1 0.25 0.5 2.718 ||| ||| 2 4 +. ||| . ||| 0.5 0.5 0.3 0.375 2.718 ||| ||| 6 10 +. ||| a ||| 1 0.555556 0.5 0.625 2.718 ||| ||| 5 10 +. ||| dreadful . ||| 0.5 0.5 0.1 0.00815216 2.718 ||| ||| 2 10 +. ||| on a ||| 1 0.555556 0.1 0.0271739 2.718 ||| ||| 1 10 +17. 
||| adjourned ||| 1 1 1 1 2.718 ||| ||| 1 1 +allen ||| the european ||| 1 0.0714286 0.5 0.0217391 2.718 ||| ||| 1 2 +allen ||| the ||| 0.0909091 0.0714286 0.5 1 2.718 ||| ||| 11 2 +alles gute zum ||| in the ||| 0.25 0.0005 0.5 0.108696 2.718 ||| ||| 4 2 +alles gute zum ||| in ||| 0.125 0.0005 0.5 1 2.718 ||| ||| 8 2 +alles gute ||| in the ||| 0.25 0.01 0.5 0.108696 2.718 ||| ||| 4 2 +alles gute ||| in ||| 0.125 0.01 0.5 1 2.718 ||| ||| 8 2 +alles ||| in the ||| 0.25 0.2 0.5 0.108696 2.718 ||| ||| 4 2 +alles ||| in ||| 0.125 0.2 0.5 1 2.718 ||| ||| 8 2 +am freitag ||| would ||| 0.333333 0.05 1 1 2.718 ||| ||| 3 1 +auch ||| terrible storms ||| 1 1 0.25 0.0217391 2.718 ||| ||| 1 4 +auch ||| terrible ||| 1 1 0.25 1 2.718 ||| ||| 1 4 +auch ||| the terrible storms ||| 1 1 0.25 0.00236295 2.718 ||| ||| 1 4 +auch ||| the terrible ||| 1 1 0.25 0.108696 2.718 ||| ||| 1 4 +besteht ||| next few days , during ||| 1 1 0.2 6.70021e-07 2.718 ||| ||| 1 5 +besteht ||| next few days , ||| 1 1 0.2 3.0821e-05 2.718 ||| ||| 1 5 +besteht ||| next few days ||| 1 1 0.2 0.000472588 2.718 ||| ||| 1 5 +besteht ||| next few ||| 1 1 0.2 0.0217391 2.718 ||| ||| 1 5 +besteht ||| next ||| 1 1 0.2 1 2.718 ||| ||| 1 5 +bitten - ||| particularly ||| 0.5 0.05 1 0.5 2.718 ||| ||| 2 1 +bürger einiger unserer ||| the ||| 0.0909091 0.000178572 1 0.5 2.718 ||| ||| 11 1 +bürger einiger ||| the ||| 0.0909091 0.00357143 1 0.5 2.718 ||| ||| 11 1 +das parlament erhebt sich ||| the house rose and observed ||| 1 0.142857 1 0.0277778 2.718 ||| ||| 1 1 +das parlament ||| the house rose ||| 1 0.571429 1 0.111111 2.718 ||| ||| 1 1 +das ||| house ||| 1 1 0.5 0.5 2.718 ||| ||| 1 2 +das ||| the ||| 0.0909091 0.0714286 0.5 0.5 2.718 ||| ||| 11 2 +daß ||| pleasant festive period ||| 0.5 1 1 0.037037 2.718 ||| ||| 2 1 +dem 17. 
||| adjourned on ||| 1 0.25 1 1 2.718 ||| ||| 1 1 +dem ||| on ||| 1 0.25 1 1 2.718 ||| ||| 1 1 +der sitzungsperiode ||| of the session ||| 1 0.385714 1 0.163265 2.718 ||| ||| 1 1 +der ||| a number of ||| 1 0.6 0.1 0.00189036 2.718 ||| ||| 1 10 +der ||| as a number of ||| 1 0.6 0.1 4.10946e-05 2.718 ||| ||| 1 10 +der ||| number of ||| 1 0.6 0.1 0.0217391 2.718 ||| ||| 1 10 +der ||| of the ||| 1 0.6 0.1 0.108696 2.718 ||| ||| 1 10 +der ||| of ||| 0.857143 0.6 0.6 1 2.718 ||| ||| 7 10 +des ||| 17 ||| 1 1 1 1 2.718 ||| ||| 1 1 +dezember unterbrochene sitzungsperiode ||| resumed the session of the european ||| 1 0.208929 1 0.0133278 2.718 ||| ||| 1 1 +dezember ||| european ||| 1 0.5 1 1 2.718 ||| ||| 1 1 +die am freitag ||| would ||| 0.333333 0.0025 1 1 2.718 ||| ||| 3 1 +dieser sitzungsperiode ||| the ||| 0.0909091 0.0142857 1 0.571429 2.718 ||| ||| 11 1 +doch sind bürger ||| will ||| 0.333333 0.0025 1 1 2.718 ||| ||| 3 1 +doch sind ||| will ||| 0.333333 0.05 1 1 2.718 ||| ||| 3 1 +doch ||| will ||| 0.333333 1 1 1 2.718 ||| ||| 3 1 +einer aussprache ||| on this subject ||| 0.5 0.125 1 0.0277778 2.718 ||| ||| 2 1 +einer schweigeminute zu gedenken . ||| observe a minute ' s silence ||| 1 0.0825 1 0.0119629 2.718 ||| ||| 1 1 +einer schweigeminute zu ||| ' s silence ||| 1 0.0703125 0.5 0.0251563 2.718 ||| ||| 1 2 +einer schweigeminute zu ||| minute ' s silence ||| 0.25 0.1485 0.5 0.0191406 2.718 ||| ||| 4 2 +eingetreten . ||| . ||| 0.166667 0.025 0.5 0.375 2.718 ||| ||| 6 2 +eingetreten . ||| dreadful . ||| 0.5 0.025 0.5 0.00815216 2.718 ||| ||| 2 2 +einiger kolleginnen ||| those ||| 0.5 0.05 1 1 2.718 ||| ||| 2 1 +einiger unserer ||| the ||| 0.0909091 0.00357143 1 0.5 2.718 ||| ||| 11 1 +einiger ||| the ||| 0.0909091 0.0714286 1 0.5 2.718 ||| ||| 11 1 +erheben ||| then ||| 1 1 1 1 2.718 ||| ||| 1 1 +erhebt sich zu einer schweigeminute . 
||| and observed a minute ' s silence ||| 1 0.020625 1 0.00299072 2.718 ||| ||| 1 1 +erhebt sich ||| and observed ||| 1 0.25 1 0.25 2.718 ||| ||| 1 1 +erkläre die am ||| declare ||| 0.333333 0.0025 1 1 2.718 ||| ||| 3 1 +erkläre die ||| declare ||| 0.333333 0.05 1 1 2.718 ||| ||| 3 1 +erkläre ||| declare ||| 0.333333 1 1 1 2.718 ||| ||| 3 1 +europäischen ||| i ||| 0.333333 0.333333 0.25 0.5 2.718 ||| ||| 3 4 +europäischen ||| like to ||| 1 0.333333 0.25 0.0108696 2.718 ||| ||| 1 4 +europäischen ||| should like to ||| 1 0.333333 0.25 0.000236294 2.718 ||| ||| 1 4 +europäischen ||| to ||| 0.5 0.333333 0.25 0.5 2.718 ||| ||| 2 4 +ferien ||| a happy new year ||| 0.5 1 0.25 4.10946e-05 2.718 ||| ||| 2 4 +ferien ||| happy new year ||| 0.5 1 0.25 0.000472588 2.718 ||| ||| 2 4 +ferien ||| new year ||| 0.5 1 0.25 0.0217391 2.718 ||| ||| 2 4 +ferien ||| year ||| 0.5 1 0.25 1 2.718 ||| ||| 2 4 +feststellen konnten , ||| failed to materialise , still ||| 0.5 0.15 0.333333 3.0821e-05 2.718 ||| ||| 2 3 +feststellen konnten , ||| materialise , still ||| 0.5 0.15 0.333333 0.0652174 2.718 ||| ||| 2 3 +feststellen konnten , ||| to materialise , still ||| 0.5 0.15 0.333333 0.00141777 2.718 ||| ||| 2 3 +feststellen konnten ||| failed to materialise , still ||| 0.5 1 0.333333 3.0821e-05 2.718 ||| ||| 2 3 +feststellen konnten ||| materialise , still ||| 0.5 1 0.333333 0.0652174 2.718 ||| ||| 2 3 +feststellen konnten ||| to materialise , still ||| 0.5 1 0.333333 0.00141777 2.718 ||| ||| 2 3 +feststellen ||| failed to materialise , ||| 1 1 0.166667 3.0821e-05 2.718 ||| ||| 1 6 +feststellen ||| failed to materialise ||| 1 1 0.166667 0.000472588 2.718 ||| ||| 1 6 +feststellen ||| materialise , ||| 1 1 0.166667 0.0652174 2.718 ||| ||| 1 6 +feststellen ||| materialise ||| 1 1 0.166667 1 2.718 ||| ||| 1 6 +feststellen ||| to materialise , ||| 1 1 0.166667 0.00141777 2.718 ||| ||| 1 6 +feststellen ||| to materialise ||| 1 1 0.166667 0.0217391 2.718 ||| ||| 1 6 +frau präsidentin ||| of order . 
||| 1 0.1 0.5 0.0217391 2.718 ||| ||| 1 2 +frau präsidentin ||| of order ||| 1 0.1 0.5 1 2.718 ||| ||| 1 2 +frau ||| order . ||| 1 1 0.5 0.0217391 2.718 ||| ||| 1 2 +frau ||| order ||| 1 1 0.5 1 2.718 ||| ||| 1 2 +freitag ||| would ||| 0.333333 1 1 1 2.718 ||| ||| 3 1 +für ||| 1999 ||| 1 1 1 1 2.718 ||| ||| 1 1 +gedenken . ||| observe a ||| 1 0.555556 1 0.625 2.718 ||| ||| 1 1 +gedenken ||| observe ||| 1 1 1 1 2.718 ||| ||| 1 1 +gefürchtete ||| have seen , the dreaded ' ||| 1 0.2 0.166667 7.28284e-08 2.718 ||| ||| 1 6 +gefürchtete ||| have seen , the dreaded ||| 1 0.2 0.166667 3.35011e-06 2.718 ||| ||| 1 6 +gefürchtete ||| have seen , the ||| 1 0.2 0.166667 0.000154105 2.718 ||| ||| 1 6 +gefürchtete ||| have seen , ||| 1 0.2 0.166667 0.00141777 2.718 ||| ||| 1 6 +gefürchtete ||| have seen ||| 1 0.2 0.166667 0.0217391 2.718 ||| ||| 1 6 +gefürchtete ||| have ||| 1 0.2 0.166667 1 2.718 ||| ||| 1 6 +geworden . ||| suffered a ||| 1 0.555556 1 0.625 2.718 ||| ||| 1 1 +geworden ||| suffered ||| 1 1 1 1 2.718 ||| ||| 1 1 +gute zum jahreswechsel ||| once again ||| 0.333333 0.0025 0.5 0.0217391 2.718 ||| ||| 3 2 +gute zum jahreswechsel ||| once ||| 0.333333 0.0025 0.5 1 2.718 ||| ||| 3 2 +hatten . ||| enjoyed a ||| 1 0.555556 0.333333 0.625 2.718 ||| ||| 1 3 +hatten . ||| that you enjoyed a ||| 1 0.555556 0.333333 0.000590737 2.718 ||| ||| 1 3 +hatten . 
||| you enjoyed a ||| 1 0.555556 0.333333 0.0135869 2.718 ||| ||| 1 3 +hatten ||| enjoyed ||| 1 1 0.333333 1 2.718 ||| ||| 1 3 +hatten ||| that you enjoyed ||| 1 1 0.333333 0.000945179 2.718 ||| ||| 1 3 +hatten ||| you enjoyed ||| 1 1 0.333333 0.0217391 2.718 ||| ||| 1 3 +heute möchte ||| the victims concerned ||| 1 1 0.5 0.108696 2.718 ||| ||| 1 2 +heute möchte ||| victims concerned ||| 1 1 0.5 1 2.718 ||| ||| 1 2 +heute ||| the victims ||| 1 1 0.5 0.108696 2.718 ||| ||| 1 2 +heute ||| victims ||| 1 1 0.5 1 2.718 ||| ||| 1 2 +hoffe , ||| hope that you ||| 0.5 0.15 0.166667 0.000945179 2.718 ||| ||| 2 6 +hoffe , ||| hope that ||| 0.5 0.15 0.166667 0.0434783 2.718 ||| ||| 2 6 +hoffe , ||| hope ||| 0.5 0.15 0.166667 1 2.718 ||| ||| 2 6 +hoffe , ||| the hope that you ||| 0.5 0.15 0.166667 0.000102737 2.718 ||| ||| 2 6 +hoffe , ||| the hope that ||| 0.5 0.15 0.166667 0.0047259 2.718 ||| ||| 2 6 +hoffe , ||| the hope ||| 0.5 0.15 0.166667 0.108696 2.718 ||| ||| 2 6 +hoffe ||| hope that you ||| 0.5 1 0.166667 0.000945179 2.718 ||| ||| 2 6 +hoffe ||| hope that ||| 0.5 1 0.166667 0.0434783 2.718 ||| ||| 2 6 +hoffe ||| hope ||| 0.5 1 0.166667 1 2.718 ||| ||| 2 6 +hoffe ||| the hope that you ||| 0.5 1 0.166667 0.000102737 2.718 ||| ||| 2 6 +hoffe ||| the hope that ||| 0.5 1 0.166667 0.0047259 2.718 ||| ||| 2 6 +hoffe ||| the hope ||| 0.5 1 0.166667 0.108696 2.718 ||| ||| 2 6 +ich bitte sie ||| please rise , ||| 1 0.115385 1 0.208333 2.718 ||| ||| 1 1 +ich bitte ||| please rise ||| 1 0.375 1 0.208333 2.718 ||| ||| 1 1 +ich ||| . 
||| 0.166667 0.166667 0.5 0.333333 2.718 ||| ||| 6 2 +ich ||| like ||| 1 0.5 0.5 0.333333 2.718 ||| ||| 1 2 +ihnen ||| wish you a happy new ||| 1 0.25 0.125 8.9336e-07 2.718 ||| ||| 1 8 +ihnen ||| wish you a happy ||| 1 0.25 0.125 4.10946e-05 2.718 ||| ||| 1 8 +ihnen ||| wish you a ||| 1 0.25 0.125 0.00189036 2.718 ||| ||| 1 8 +ihnen ||| wish you ||| 1 0.25 0.125 0.0217391 2.718 ||| ||| 1 8 +ihnen ||| you a happy new ||| 1 0.25 0.125 4.10946e-05 2.718 ||| ||| 1 8 +ihnen ||| you a happy ||| 1 0.25 0.125 0.00189036 2.718 ||| ||| 1 8 +ihnen ||| you a ||| 1 0.25 0.125 0.0869565 2.718 ||| ||| 1 8 +ihnen ||| you ||| 0.333333 0.25 0.125 1 2.718 ||| ||| 3 8 +im parlament besteht ||| the next few days , during this ||| 1 0.0357143 1 2.2334e-07 2.718 ||| ||| 1 1 +im ||| , during this ||| 1 0.25 0.166667 0.000708884 2.718 ||| ||| 1 6 +im ||| days , during this ||| 1 0.25 0.166667 1.54105e-05 2.718 ||| ||| 1 6 +im ||| during this ||| 1 0.25 0.166667 0.0108696 2.718 ||| ||| 1 6 +im ||| few days , during this ||| 1 0.25 0.166667 3.3501e-07 2.718 ||| ||| 1 6 +im ||| part-session ||| 1 1 0.166667 0.5 2.718 ||| ||| 1 6 +im ||| this ||| 1 0.25 0.166667 0.5 2.718 ||| ||| 1 6 +in den nächsten tagen ||| you have requested a debate ||| 1 0.07 1 0.0253623 2.718 ||| ||| 1 1 +in den nächsten ||| you have requested a ||| 1 0.07 0.5 0.0253623 2.718 ||| ||| 1 2 +in den nächsten ||| you have requested ||| 1 0.07 0.5 0.291667 2.718 ||| ||| 1 2 +in den verschiedenen ländern der ||| a number of members have requested ||| 1 0.0084 0.25 0.000551354 2.718 ||| ||| 1 4 +in den verschiedenen ländern der ||| as a number of members have requested ||| 1 0.0084 0.25 1.19859e-05 2.718 ||| ||| 1 4 +in den verschiedenen ländern der ||| number of members have requested ||| 1 0.0084 0.25 0.00634057 2.718 ||| ||| 1 4 +in den verschiedenen ländern der ||| of members have requested ||| 1 0.0084 0.25 0.291667 2.718 ||| ||| 1 4 +in den verschiedenen ländern ||| members have requested ||| 1 0.014 1 0.291667 2.718 ||| 
||| 1 1 +in den verschiedenen ||| have requested ||| 0.333333 0.014 1 0.291667 2.718 ||| ||| 3 1 +in den ||| have requested a ||| 1 0.28 0.333333 0.0253623 2.718 ||| ||| 1 3 +in den ||| have requested ||| 0.666667 0.28 0.666667 0.291667 2.718 ||| ||| 3 3 +in ||| , ||| 0.0769231 0.0769231 1 0.333333 2.718 ||| ||| 13 1 +insbesondere ||| behalf of ||| 0.5 1 0.25 0.0434783 2.718 ||| ||| 2 4 +insbesondere ||| behalf ||| 0.5 1 0.25 1 2.718 ||| ||| 2 4 +insbesondere ||| on behalf of ||| 0.5 1 0.25 0.00189036 2.718 ||| ||| 2 4 +insbesondere ||| on behalf ||| 0.5 1 0.25 0.0434783 2.718 ||| ||| 2 4 +ist der ||| a number of countries ||| 0.5 0.45 0.333333 0.0193237 2.718 ||| ||| 2 3 +ist der ||| in a number of countries ||| 0.5 0.45 0.333333 0.000420079 2.718 ||| ||| 2 3 +ist der ||| number of countries ||| 0.5 0.45 0.333333 0.222222 2.718 ||| ||| 2 3 +ist ||| countries ||| 1 1 0.333333 0.666667 2.718 ||| ||| 1 3 +ist ||| the various countries ||| 1 1 0.333333 0.0015753 2.718 ||| ||| 1 3 +ist ||| various countries ||| 1 1 0.333333 0.0144927 2.718 ||| ||| 1 3 +jahreswechsel ||| once again ||| 0.333333 1 0.5 0.0217391 2.718 ||| ||| 3 2 +jahreswechsel ||| once ||| 0.333333 1 0.5 1 2.718 ||| ||| 3 2 +kollegen ||| european union ||| 1 1 0.5 0.0217391 2.718 ||| ||| 1 2 +kollegen ||| union ||| 1 1 0.5 1 2.718 ||| ||| 1 2 +kolleginnen ||| those ||| 0.5 1 1 1 2.718 ||| ||| 2 1 +konnten , ||| , still ||| 0.5 0.15 0.5 0.0652174 2.718 ||| ||| 2 2 +konnten , ||| still ||| 0.5 0.15 0.5 1 2.718 ||| ||| 2 2 +konnten ||| , still ||| 0.5 1 0.5 0.0652174 2.718 ||| ||| 2 2 +konnten ||| still ||| 0.5 1 0.5 1 2.718 ||| ||| 2 2 +ländern der ||| a number of members ||| 0.5 0.6 0.25 0.00189036 2.718 ||| ||| 2 4 +ländern der ||| as a number of members ||| 0.5 0.6 0.25 4.10946e-05 2.718 ||| ||| 2 4 +ländern der ||| number of members ||| 0.5 0.6 0.25 0.0217391 2.718 ||| ||| 2 4 +ländern der ||| of members ||| 0.5 0.6 0.25 1 2.718 ||| ||| 2 4 +ländern ||| members ||| 0.5 1 1 1 2.718 ||| ||| 2 1 
+millenium-bug ||| you ||| 0.333333 0.25 1 1 2.718 ||| ||| 3 1 +mitgliedstaaten opfer ||| disasters that truly ||| 0.25 0.05 0.2 0.000945179 2.718 ||| ||| 4 5 +mitgliedstaaten opfer ||| natural disasters that truly ||| 0.25 0.05 0.2 2.05473e-05 2.718 ||| ||| 4 5 +mitgliedstaaten opfer ||| of natural disasters that truly ||| 0.25 0.05 0.2 8.93364e-07 2.718 ||| ||| 4 5 +mitgliedstaaten opfer ||| that truly ||| 0.25 0.05 0.2 0.0434783 2.718 ||| ||| 4 5 +mitgliedstaaten opfer ||| truly ||| 0.25 0.05 0.2 1 2.718 ||| ||| 4 5 +mitgliedstaaten ||| disasters that truly ||| 0.25 1 0.2 0.000945179 2.718 ||| ||| 4 5 +mitgliedstaaten ||| natural disasters that truly ||| 0.25 1 0.2 2.05473e-05 2.718 ||| ||| 4 5 +mitgliedstaaten ||| of natural disasters that truly ||| 0.25 1 0.2 8.93364e-07 2.718 ||| ||| 4 5 +mitgliedstaaten ||| that truly ||| 0.25 1 0.2 0.0434783 2.718 ||| ||| 4 5 +mitgliedstaaten ||| truly ||| 0.25 1 0.2 1 2.718 ||| ||| 4 5 +möchte ||| concerned ||| 1 1 1 1 2.718 ||| ||| 1 1 +nach einer aussprache ||| on this subject ||| 0.5 0.00625 1 0.0277778 2.718 ||| ||| 2 1 +naturkatastrophen ||| people in a ||| 0.5 1 0.333333 0.00189036 2.718 ||| ||| 2 3 +naturkatastrophen ||| people in ||| 0.5 1 0.333333 0.0217391 2.718 ||| ||| 2 3 +naturkatastrophen ||| people ||| 0.5 1 0.333333 1 2.718 ||| ||| 2 3 +nicht eingetreten ||| series of natural disasters that ||| 0.5 0.05 0.2 8.93364e-07 2.718 ||| ||| 2 5 +nicht eingetreten ||| series of natural disasters ||| 0.5 0.05 0.2 2.05473e-05 2.718 ||| ||| 2 5 +nicht eingetreten ||| series of natural ||| 0.5 0.05 0.2 0.000945179 2.718 ||| ||| 2 5 +nicht eingetreten ||| series of ||| 0.5 0.05 0.2 0.0434783 2.718 ||| ||| 2 5 +nicht eingetreten ||| series ||| 0.5 0.05 0.2 1 2.718 ||| ||| 2 5 +nicht ||| series of natural disasters that ||| 0.5 1 0.2 8.93364e-07 2.718 ||| ||| 2 5 +nicht ||| series of natural disasters ||| 0.5 1 0.2 2.05473e-05 2.718 ||| ||| 2 5 +nicht ||| series of natural ||| 0.5 1 0.2 0.000945179 2.718 ||| ||| 2 5 +nicht 
||| series of ||| 0.5 1 0.2 0.0434783 2.718 ||| ||| 2 5 +nicht ||| series ||| 0.5 1 0.2 1 2.718 ||| ||| 2 5 +nochmals ||| and ||| 1 0.5 1 1 2.718 ||| ||| 1 1 +nächsten ||| you ||| 0.333333 0.25 1 1 2.718 ||| ||| 3 1 +opfer von schrecklichen ||| ' millennium bug ||| 0.25 0.0025 0.0833333 0.000472588 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| ' millennium ||| 0.25 0.0025 0.0833333 0.0217391 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| , the dreaded ' millennium bug ||| 0.25 0.0025 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| , the dreaded ' millennium ||| 0.25 0.0025 0.0833333 3.35011e-06 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| dreaded ' millennium bug ||| 0.25 0.0025 0.0833333 1.02736e-05 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| dreaded ' millennium ||| 0.25 0.0025 0.0833333 0.000472588 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| millennium bug ||| 0.25 0.0025 0.0833333 0.0217391 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| millennium ||| 0.25 0.0025 0.0833333 1 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| seen , the dreaded ' millennium bug ||| 0.25 0.0025 0.0833333 1.58322e-09 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| seen , the dreaded ' millennium ||| 0.25 0.0025 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| the dreaded ' millennium bug ||| 0.25 0.0025 0.0833333 1.1167e-06 2.718 ||| ||| 4 12 +opfer von schrecklichen ||| the dreaded ' millennium ||| 0.25 0.0025 0.0833333 5.13683e-05 2.718 ||| ||| 4 12 +opfer von ||| ' millennium bug ||| 0.25 0.05 0.0833333 0.000472588 2.718 ||| ||| 4 12 +opfer von ||| ' millennium ||| 0.25 0.05 0.0833333 0.0217391 2.718 ||| ||| 4 12 +opfer von ||| , the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +opfer von ||| , the dreaded ' millennium ||| 0.25 0.05 0.0833333 3.35011e-06 2.718 ||| ||| 4 12 +opfer von ||| dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.02736e-05 2.718 ||| ||| 4 12 +opfer von ||| dreaded 
' millennium ||| 0.25 0.05 0.0833333 0.000472588 2.718 ||| ||| 4 12 +opfer von ||| millennium bug ||| 0.25 0.05 0.0833333 0.0217391 2.718 ||| ||| 4 12 +opfer von ||| millennium ||| 0.25 0.05 0.0833333 1 2.718 ||| ||| 4 12 +opfer von ||| seen , the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.58322e-09 2.718 ||| ||| 4 12 +opfer von ||| seen , the dreaded ' millennium ||| 0.25 0.05 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +opfer von ||| the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.1167e-06 2.718 ||| ||| 4 12 +opfer von ||| the dreaded ' millennium ||| 0.25 0.05 0.0833333 5.13683e-05 2.718 ||| ||| 4 12 +opfern ||| all the ||| 1 1 0.25 0.108696 2.718 ||| ||| 1 4 +opfern ||| all ||| 1 1 0.25 1 2.718 ||| ||| 1 4 +opfern ||| of all the ||| 1 1 0.25 0.0047259 2.718 ||| ||| 1 4 +opfern ||| of all ||| 1 1 0.25 0.0434783 2.718 ||| ||| 1 4 +parlament besteht der ||| of the next few days , during ||| 1 0.0857143 0.2 4.4668e-07 2.718 ||| ||| 1 5 +parlament besteht der ||| of the next few days , ||| 1 0.0857143 0.2 2.05473e-05 2.718 ||| ||| 1 5 +parlament besteht der ||| of the next few days ||| 1 0.0857143 0.2 0.000315059 2.718 ||| ||| 1 5 +parlament besteht der ||| of the next few ||| 1 0.0857143 0.2 0.0144927 2.718 ||| ||| 1 5 +parlament besteht der ||| of the next ||| 1 0.0857143 0.2 0.666667 2.718 ||| ||| 1 5 +parlament besteht ||| the next few days , during ||| 1 0.142857 0.2 4.4668e-07 2.718 ||| ||| 1 5 +parlament besteht ||| the next few days , ||| 1 0.142857 0.2 2.05473e-05 2.718 ||| ||| 1 5 +parlament besteht ||| the next few days ||| 1 0.142857 0.2 0.000315059 2.718 ||| ||| 1 5 +parlament besteht ||| the next few ||| 1 0.142857 0.2 0.0144927 2.718 ||| ||| 1 5 +parlament besteht ||| the next ||| 1 0.142857 0.2 0.666667 2.718 ||| ||| 1 5 +parlament ||| the ||| 0.0909091 0.142857 1 0.666667 2.718 ||| ||| 11 1 +parlaments ||| parliament ||| 1 1 1 1 2.718 ||| ||| 1 1 +präsidentin , zur geschäftsordnung . 
||| madam president , on a point of ||| 1 0.017094 1 0.00301933 2.718 ||| ||| 1 1 +präsidentin ||| of ||| 0.142857 0.1 1 1 2.718 ||| ||| 7 1 +schrecklichen naturkatastrophen ||| people in a ||| 0.5 0.05 0.333333 0.00189036 2.718 ||| ||| 2 3 +schrecklichen naturkatastrophen ||| people in ||| 0.5 0.05 0.333333 0.0217391 2.718 ||| ||| 2 3 +schrecklichen naturkatastrophen ||| people ||| 0.5 0.05 0.333333 1 2.718 ||| ||| 2 3 +schöne ferien ||| a happy new year ||| 0.5 0.05 0.25 4.10946e-05 2.718 ||| ||| 2 4 +schöne ferien ||| happy new year ||| 0.5 0.05 0.25 0.000472588 2.718 ||| ||| 2 4 +schöne ferien ||| new year ||| 0.5 0.05 0.25 0.0217391 2.718 ||| ||| 2 4 +schöne ferien ||| year ||| 0.5 0.05 0.25 1 2.718 ||| ||| 2 4 +sich zu einer schweigeminute zu ||| for this minute ' s silence ||| 0.5 0.00210938 0.333333 7.13313e-06 2.718 ||| ||| 2 3 +sich zu einer schweigeminute zu ||| minute ' s silence ||| 0.25 0.00210938 0.333333 0.0150937 2.718 ||| ||| 4 3 +sich zu einer schweigeminute zu ||| this minute ' s silence ||| 0.5 0.00210938 0.333333 0.000328125 2.718 ||| ||| 2 3 +sich zu ||| for this minute ||| 0.5 0.03 0.333333 0.000283553 2.718 ||| ||| 2 3 +sich zu ||| minute ||| 0.5 0.03 0.333333 0.6 2.718 ||| ||| 2 3 +sich zu ||| this minute ||| 0.5 0.03 0.333333 0.0130435 2.718 ||| ||| 2 3 +sie bitten ||| , ||| 0.0769231 0.0153846 0.5 1 2.718 ||| ||| 13 2 +sie bitten ||| storms , ||| 0.5 0.0153846 0.5 0.0217391 2.718 ||| ||| 2 2 +sie schöne ||| , ||| 0.0769231 0.0153846 1 1 2.718 ||| ||| 13 1 +sie ||| , ||| 0.307692 0.307692 0.8 1 2.718 ||| ||| 13 5 +sie ||| storms , ||| 0.5 0.307692 0.2 0.0217391 2.718 ||| ||| 2 5 +sind bürger einiger unserer ||| the ||| 0.0909091 8.92858e-06 1 0.5 2.718 ||| ||| 11 1 +sind bürger einiger ||| the ||| 0.0909091 0.000178572 1 0.5 2.718 ||| ||| 11 1 +sitzungsperiode ||| the session of the ||| 1 0.417857 0.333333 0.0133278 2.718 ||| ||| 1 3 +sitzungsperiode ||| the session ||| 1 0.642857 0.333333 0.163265 2.718 ||| ||| 1 3 +sitzungsperiode ||| 
the ||| 0.0909091 0.285714 0.333333 0.571429 2.718 ||| ||| 11 3 +stürme , ||| meantime ||| 0.5 0.15 1 1 2.718 ||| ||| 2 1 +stürme ||| meantime ||| 0.5 1 1 1 2.718 ||| ||| 2 1 +tagen ||| a debate ||| 1 1 0.5 0.0869565 2.718 ||| ||| 1 2 +tagen ||| debate ||| 1 1 0.5 1 2.718 ||| ||| 1 2 +und ||| i should like ||| 1 0.666667 0.25 0.000472588 2.718 ||| ||| 1 4 +und ||| i should ||| 1 0.666667 0.25 0.0217391 2.718 ||| ||| 1 4 +und ||| i ||| 0.666667 0.666667 0.5 1 2.718 ||| ||| 3 4 +union ||| , on ||| 0.5 0.0769231 0.5 0.0434783 2.718 ||| ||| 2 2 +union ||| , ||| 0.0769231 0.0769231 0.5 1 2.718 ||| ||| 13 2 +unserer mitgliedstaaten opfer ||| disasters that truly ||| 0.25 0.0025 0.2 0.000945179 2.718 ||| ||| 4 5 +unserer mitgliedstaaten opfer ||| natural disasters that truly ||| 0.25 0.0025 0.2 2.05473e-05 2.718 ||| ||| 4 5 +unserer mitgliedstaaten opfer ||| of natural disasters that truly ||| 0.25 0.0025 0.2 8.93364e-07 2.718 ||| ||| 4 5 +unserer mitgliedstaaten opfer ||| that truly ||| 0.25 0.0025 0.2 0.0434783 2.718 ||| ||| 4 5 +unserer mitgliedstaaten opfer ||| truly ||| 0.25 0.0025 0.2 1 2.718 ||| ||| 4 5 +unserer mitgliedstaaten ||| disasters that truly ||| 0.25 0.05 0.2 0.000945179 2.718 ||| ||| 4 5 +unserer mitgliedstaaten ||| natural disasters that truly ||| 0.25 0.05 0.2 2.05473e-05 2.718 ||| ||| 4 5 +unserer mitgliedstaaten ||| of natural disasters that truly ||| 0.25 0.05 0.2 8.93364e-07 2.718 ||| ||| 4 5 +unserer mitgliedstaaten ||| that truly ||| 0.25 0.05 0.2 0.0434783 2.718 ||| ||| 4 5 +unserer mitgliedstaaten ||| truly ||| 0.25 0.05 0.2 1 2.718 ||| ||| 4 5 +unterbrochene sitzungsperiode ||| resumed the session of the ||| 1 0.417857 1 0.0133278 2.718 ||| ||| 1 1 +unterbrochene ||| resumed ||| 1 1 1 1 2.718 ||| ||| 1 1 +verlauf dieser sitzungsperiode ||| the course ||| 1 0.0142857 1 0.571429 2.718 ||| ||| 1 1 +verlauf dieser ||| course ||| 0.5 0.05 1 1 2.718 ||| ||| 2 1 +verlauf ||| course ||| 0.5 1 1 1 2.718 ||| ||| 2 1 +verschiedenen ländern der ||| a 
number of members ||| 0.5 0.03 0.25 0.00189036 2.718 ||| ||| 2 4 +verschiedenen ländern der ||| as a number of members ||| 0.5 0.03 0.25 4.10946e-05 2.718 ||| ||| 2 4 +verschiedenen ländern der ||| number of members ||| 0.5 0.03 0.25 0.0217391 2.718 ||| ||| 2 4 +verschiedenen ländern der ||| of members ||| 0.5 0.03 0.25 1 2.718 ||| ||| 2 4 +verschiedenen ländern ||| members ||| 0.5 0.05 1 1 2.718 ||| ||| 2 1 +von schrecklichen ||| ' millennium bug ||| 0.25 0.05 0.0833333 0.000472588 2.718 ||| ||| 4 12 +von schrecklichen ||| ' millennium ||| 0.25 0.05 0.0833333 0.0217391 2.718 ||| ||| 4 12 +von schrecklichen ||| , the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +von schrecklichen ||| , the dreaded ' millennium ||| 0.25 0.05 0.0833333 3.35011e-06 2.718 ||| ||| 4 12 +von schrecklichen ||| dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.02736e-05 2.718 ||| ||| 4 12 +von schrecklichen ||| dreaded ' millennium ||| 0.25 0.05 0.0833333 0.000472588 2.718 ||| ||| 4 12 +von schrecklichen ||| millennium bug ||| 0.25 0.05 0.0833333 0.0217391 2.718 ||| ||| 4 12 +von schrecklichen ||| millennium ||| 0.25 0.05 0.0833333 1 2.718 ||| ||| 4 12 +von schrecklichen ||| seen , the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.58322e-09 2.718 ||| ||| 4 12 +von schrecklichen ||| seen , the dreaded ' millennium ||| 0.25 0.05 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +von schrecklichen ||| the dreaded ' millennium bug ||| 0.25 0.05 0.0833333 1.1167e-06 2.718 ||| ||| 4 12 +von schrecklichen ||| the dreaded ' millennium ||| 0.25 0.05 0.0833333 5.13683e-05 2.718 ||| ||| 4 12 +von ||| ' millennium bug ||| 0.25 1 0.0833333 0.000472588 2.718 ||| ||| 4 12 +von ||| ' millennium ||| 0.25 1 0.0833333 0.0217391 2.718 ||| ||| 4 12 +von ||| , the dreaded ' millennium bug ||| 0.25 1 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +von ||| , the dreaded ' millennium ||| 0.25 1 0.0833333 3.35011e-06 2.718 ||| ||| 4 12 +von ||| dreaded ' millennium bug ||| 0.25 1 0.0833333 
1.02736e-05 2.718 ||| ||| 4 12 +von ||| dreaded ' millennium ||| 0.25 1 0.0833333 0.000472588 2.718 ||| ||| 4 12 +von ||| millennium bug ||| 0.25 1 0.0833333 0.0217391 2.718 ||| ||| 4 12 +von ||| millennium ||| 0.25 1 0.0833333 1 2.718 ||| ||| 4 12 +von ||| seen , the dreaded ' millennium bug ||| 0.25 1 0.0833333 1.58322e-09 2.718 ||| ||| 4 12 +von ||| seen , the dreaded ' millennium ||| 0.25 1 0.0833333 7.28284e-08 2.718 ||| ||| 4 12 +von ||| the dreaded ' millennium bug ||| 0.25 1 0.0833333 1.1167e-06 2.718 ||| ||| 4 12 +von ||| the dreaded ' millennium ||| 0.25 1 0.0833333 5.13683e-05 2.718 ||| ||| 4 12 +wie sie ||| although , as ||| 1 0.230769 1 0.25 2.718 ||| ||| 1 1 +wiederaufgenommen ||| friday ||| 1 1 1 1 2.718 ||| ||| 1 1 +wiederaufnahme der sitzungsperiode ||| resumption of the session ||| 1 0.385714 1 0.163265 2.718 ||| ||| 1 1 +wiederaufnahme der ||| resumption of ||| 1 0.6 1 1 2.718 ||| ||| 1 1 +wiederaufnahme ||| resumption ||| 1 1 1 1 2.718 ||| ||| 1 1 +wunsch einiger ||| in ||| 0.125 0.02 1 1 2.718 ||| ||| 8 1 +wunsch nach einer aussprache ||| on this subject in ||| 1 0.0025 1 0.0277778 2.718 ||| ||| 1 1 +wunsch nach ||| in ||| 0.125 0.02 1 1 2.718 ||| ||| 8 1 +wunsch ||| in ||| 0.25 0.4 1 1 2.718 ||| ||| 8 2 +wünsche ||| december ||| 1 1 1 1 2.718 ||| ||| 1 1 +zu einer schweigeminute . ) ||| a minute ' s silence ) ||| 1 0.0825 1 0.0119629 2.718 ||| ||| 1 1 +zu einer schweigeminute . 
||| a minute ' s silence ||| 1 0.0825 1 0.0119629 2.718 ||| ||| 1 1 +zu einer schweigeminute zu ||| for this minute ' s silence ||| 0.5 0.0421875 0.333333 7.13313e-06 2.718 ||| ||| 2 3 +zu einer schweigeminute zu ||| minute ' s silence ||| 0.25 0.0421875 0.333333 0.0150937 2.718 ||| ||| 4 3 +zu einer schweigeminute zu ||| this minute ' s silence ||| 0.5 0.0421875 0.333333 0.000328125 2.718 ||| ||| 2 3 +zu einer schweigeminute ||| minute ' s silence ||| 0.25 0.1485 1 0.0191406 2.718 ||| ||| 4 1 +zu ||| for this minute ||| 0.5 0.6 0.333333 0.000283553 2.718 ||| ||| 2 3 +zu ||| minute ||| 0.5 0.6 0.333333 0.6 2.718 ||| ||| 2 3 +zu ||| this minute ||| 0.5 0.6 0.333333 0.0130435 2.718 ||| ||| 2 3 +zum jahreswechsel ||| once again ||| 0.333333 0.05 0.5 0.0217391 2.718 ||| ||| 3 2 +zum jahreswechsel ||| once ||| 0.333333 0.05 0.5 1 2.718 ||| ||| 3 2 +zur ||| madam ||| 1 1 1 1 2.718 ||| ||| 1 1 diff --git a/example/models/reordering-table b/example/models/reordering-table new file mode 100644 index 0000000..040e2ea --- /dev/null +++ b/example/models/reordering-table @@ -0,0 +1,387 @@ +" ||| ' failed to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| ' failed ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| ' ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| bug ' failed to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| bug ' failed ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| bug ' ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| were dreadful ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +" ||| were ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +( das parlament erhebt sich ||| ( the house rose and observed ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +( das parlament ||| ( the house rose ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +( ||| ( ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +) ||| ) ||| 0.200000 
0.200000 0.600000 0.600000 0.200000 0.200000 +, daß ||| pleasant festive period ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, insbesondere ||| behalf of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, insbesondere ||| behalf ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, insbesondere ||| on behalf of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, insbesondere ||| on behalf ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ist der ||| a number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ist der ||| in a number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ist der ||| number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, sich zu einer schweigeminute zu ||| , for this minute ' s silence ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +, sich zu ||| , for this minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, sich ||| , for this ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +, sich ||| , for ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, sich ||| , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, zur geschäftsordnung . ||| madam president , on a point ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +, ||| , as a number ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , as a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , as ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , for this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , for ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , on ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| , ||| 0.090909 0.090909 0.818182 0.090909 0.090909 0.818182 +, ||| . 
||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| again to wish ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| again to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| to wish ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +, ||| to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +- ||| in the various ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +- ||| in the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +- ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +- ||| particularly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +. ||| . ||| 0.111111 0.111111 0.777778 0.555556 0.111111 0.333333 +. ||| a ||| 0.538462 0.076923 0.384615 0.076923 0.230769 0.692308 +. ||| dreadful . ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +. ||| on a ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +17. ||| adjourned ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +allen ||| the european ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +allen ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles gute zum ||| in the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles gute zum ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles gute ||| in the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles gute ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles ||| in the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +alles ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +am freitag ||| would ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +auch ||| terrible storms ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +auch ||| terrible ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +auch ||| the terrible storms ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +auch ||| the terrible ||| 0.200000 
0.200000 0.600000 0.200000 0.200000 0.600000 +besteht ||| next few days , during ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +besteht ||| next few days , ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +besteht ||| next few days ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +besteht ||| next few ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +besteht ||| next ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +bitten - ||| particularly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +bürger einiger unserer ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +bürger einiger ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +das parlament erhebt sich ||| the house rose and observed ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +das parlament ||| the house rose ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +das ||| house ||| 0.200000 0.600000 0.200000 0.600000 0.200000 0.200000 +das ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +daß ||| pleasant festive period ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +dem 17. 
||| adjourned on ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +dem ||| on ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +der sitzungsperiode ||| of the session ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +der ||| a number of ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +der ||| as a number of ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +der ||| number of ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +der ||| of the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +der ||| of ||| 0.333333 0.066667 0.600000 0.200000 0.333333 0.466667 +des ||| 17 ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +dezember unterbrochene sitzungsperiode ||| resumed the session of the european ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +dezember ||| european ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +die am freitag ||| would ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +dieser sitzungsperiode ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +doch sind bürger ||| will ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +doch sind ||| will ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +doch ||| will ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einer aussprache ||| on this subject ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einer schweigeminute zu gedenken . ||| observe a minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einer schweigeminute zu ||| ' s silence ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +einer schweigeminute zu ||| minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +eingetreten . ||| . ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +eingetreten . ||| dreadful . 
||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einiger kolleginnen ||| those ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einiger unserer ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +einiger ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +erheben ||| then ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +erhebt sich zu einer schweigeminute . ||| and observed a minute ' s silence ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +erhebt sich ||| and observed ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +erkläre die am ||| declare ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +erkläre die ||| declare ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +erkläre ||| declare ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +europäischen ||| i ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +europäischen ||| like to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +europäischen ||| should like to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +europäischen ||| to ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ferien ||| a happy new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ferien ||| happy new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ferien ||| new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ferien ||| year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten , ||| failed to materialise , still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten , ||| materialise , still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten , ||| to materialise , still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten ||| failed to materialise , still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten ||| materialise , still ||| 
0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen konnten ||| to materialise , still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen ||| failed to materialise , ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +feststellen ||| failed to materialise ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen ||| materialise , ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +feststellen ||| materialise ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +feststellen ||| to materialise , ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +feststellen ||| to materialise ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +frau präsidentin ||| of order . ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +frau präsidentin ||| of order ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +frau ||| order . ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +frau ||| order ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +freitag ||| would ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +für ||| 1999 ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gedenken . ||| observe a ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +gedenken ||| observe ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +gefürchtete ||| have seen , the dreaded ' ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gefürchtete ||| have seen , the dreaded ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gefürchtete ||| have seen , the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gefürchtete ||| have seen , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gefürchtete ||| have seen ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gefürchtete ||| have ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +geworden . 
||| suffered a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +geworden ||| suffered ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +gute zum jahreswechsel ||| once again ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +gute zum jahreswechsel ||| once ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hatten . ||| enjoyed a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hatten . ||| that you enjoyed a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hatten . ||| you enjoyed a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hatten ||| enjoyed ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +hatten ||| that you enjoyed ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +hatten ||| you enjoyed ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +heute möchte ||| the victims concerned ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +heute möchte ||| victims concerned ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +heute ||| the victims ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +heute ||| victims ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +hoffe , ||| hope that you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe , ||| hope that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe , ||| hope ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe , ||| the hope that you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe , ||| the hope that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe , ||| the hope ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe ||| hope that you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe ||| hope that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe ||| hope ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe ||| the hope that you ||| 0.200000 0.200000 0.600000 0.200000 
0.200000 0.600000 +hoffe ||| the hope that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +hoffe ||| the hope ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ich bitte sie ||| please rise , ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +ich bitte ||| please rise ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +ich ||| . ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ich ||| like ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| wish you a happy new ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| wish you a happy ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| wish you a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| wish you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| you a happy new ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| you a happy ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| you a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ihnen ||| you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im parlament besteht ||| the next few days , during this ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +im ||| , during this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im ||| days , during this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im ||| during this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im ||| few days , during this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im ||| part-session ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +im ||| this ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den nächsten tagen ||| you have requested a debate ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den nächsten ||| you have requested a ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +in den nächsten ||| you have requested ||| 
0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den verschiedenen ländern der ||| a number of members have requested ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den verschiedenen ländern der ||| as a number of members have requested ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den verschiedenen ländern der ||| number of members have requested ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den verschiedenen ländern der ||| of members have requested ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +in den verschiedenen ländern ||| members have requested ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +in den verschiedenen ||| have requested ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +in den ||| have requested a ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +in den ||| have requested ||| 0.142857 0.428571 0.428571 0.142857 0.142857 0.714286 +in ||| , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +insbesondere ||| behalf of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +insbesondere ||| behalf ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +insbesondere ||| on behalf of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +insbesondere ||| on behalf ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist der ||| a number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist der ||| in a number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist der ||| number of countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist ||| countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist ||| the various countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ist ||| various countries ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +jahreswechsel ||| once again ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 
+jahreswechsel ||| once ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +kollegen ||| european union ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +kollegen ||| union ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +kolleginnen ||| those ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +konnten , ||| , still ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +konnten , ||| still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +konnten ||| , still ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +konnten ||| still ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ländern der ||| a number of members ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ländern der ||| as a number of members ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ländern der ||| number of members ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ländern der ||| of members ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +ländern ||| members ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +millenium-bug ||| you ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten opfer ||| disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten opfer ||| natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten opfer ||| of natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten opfer ||| that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten opfer ||| truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten ||| disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten ||| natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten ||| of natural disasters that truly ||| 0.200000 0.200000 0.600000 
0.200000 0.200000 0.600000 +mitgliedstaaten ||| that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +mitgliedstaaten ||| truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +möchte ||| concerned ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +nach einer aussprache ||| on this subject ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +naturkatastrophen ||| people in a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +naturkatastrophen ||| people in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +naturkatastrophen ||| people ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht eingetreten ||| series of natural disasters that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht eingetreten ||| series of natural disasters ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht eingetreten ||| series of natural ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht eingetreten ||| series of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht eingetreten ||| series ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht ||| series of natural disasters that ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht ||| series of natural disasters ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht ||| series of natural ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht ||| series of ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nicht ||| series ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nochmals ||| and ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +nächsten ||| you ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +opfer von schrecklichen ||| ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| , the dreaded ' 
millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| seen , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| seen , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von schrecklichen ||| the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| seen , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| seen , the dreaded ' millennium ||| 0.200000 0.200000 
0.600000 0.200000 0.200000 0.600000 +opfer von ||| the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfer von ||| the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfern ||| all the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfern ||| all ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfern ||| of all the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +opfern ||| of all ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +parlament besteht der ||| of the next few days , during ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +parlament besteht der ||| of the next few days , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +parlament besteht der ||| of the next few days ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +parlament besteht der ||| of the next few ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +parlament besteht der ||| of the next ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +parlament besteht ||| the next few days , during ||| 0.200000 0.600000 0.200000 0.200000 0.600000 0.200000 +parlament besteht ||| the next few days , ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +parlament besteht ||| the next few days ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +parlament besteht ||| the next few ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +parlament besteht ||| the next ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +parlament ||| the ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +parlaments ||| parliament ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +präsidentin , zur geschäftsordnung . 
||| madam president , on a point of ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +präsidentin ||| of ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +schrecklichen naturkatastrophen ||| people in a ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schrecklichen naturkatastrophen ||| people in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schrecklichen naturkatastrophen ||| people ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schöne ferien ||| a happy new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schöne ferien ||| happy new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schöne ferien ||| new year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +schöne ferien ||| year ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sich zu einer schweigeminute zu ||| for this minute ' s silence ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +sich zu einer schweigeminute zu ||| minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sich zu einer schweigeminute zu ||| this minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sich zu ||| for this minute ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +sich zu ||| minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sich zu ||| this minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sie bitten ||| , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sie bitten ||| storms , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sie schöne ||| , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sie ||| , ||| 0.454545 0.090909 0.454545 0.090909 0.272727 0.636364 +sie ||| storms , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sind bürger einiger unserer ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +sind bürger einiger ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 
0.600000 +sitzungsperiode ||| the session of the ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +sitzungsperiode ||| the session ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +sitzungsperiode ||| the ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +stürme , ||| meantime ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +stürme ||| meantime ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +tagen ||| a debate ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +tagen ||| debate ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +und ||| i should like ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +und ||| i should ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +und ||| i ||| 0.142857 0.142857 0.714286 0.142857 0.142857 0.714286 +union ||| , on ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +union ||| , ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten opfer ||| disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten opfer ||| natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten opfer ||| of natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten opfer ||| that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten opfer ||| truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten ||| disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten ||| natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten ||| of natural disasters that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten ||| that truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unserer mitgliedstaaten ||| 
truly ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +unterbrochene sitzungsperiode ||| resumed the session of the ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +unterbrochene ||| resumed ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 +verlauf dieser sitzungsperiode ||| the course ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +verlauf dieser ||| course ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +verlauf ||| course ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +verschiedenen ländern der ||| a number of members ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +verschiedenen ländern der ||| as a number of members ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +verschiedenen ländern der ||| number of members ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +verschiedenen ländern der ||| of members ||| 0.200000 0.200000 0.600000 0.200000 0.600000 0.200000 +verschiedenen ländern ||| members ||| 0.200000 0.600000 0.200000 0.200000 0.600000 0.200000 +von schrecklichen ||| ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| seen , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von 
schrecklichen ||| seen , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von schrecklichen ||| the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| seen , the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| seen , the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| the dreaded ' millennium bug ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +von ||| the dreaded ' millennium ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +wie sie ||| although , as ||| 0.600000 0.200000 0.200000 0.200000 0.200000 0.600000 +wiederaufgenommen ||| friday ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +wiederaufnahme der sitzungsperiode ||| resumption of the session ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +wiederaufnahme der ||| resumption of ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +wiederaufnahme ||| resumption ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +wunsch einiger ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +wunsch nach einer 
aussprache ||| on this subject in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +wunsch nach ||| in ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +wunsch ||| in ||| 0.142857 0.142857 0.714286 0.142857 0.142857 0.714286 +wünsche ||| december ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu einer schweigeminute . ) ||| a minute ' s silence ) ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +zu einer schweigeminute . ||| a minute ' s silence ||| 0.600000 0.200000 0.200000 0.600000 0.200000 0.200000 +zu einer schweigeminute zu ||| for this minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu einer schweigeminute zu ||| minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu einer schweigeminute zu ||| this minute ' s silence ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu einer schweigeminute ||| minute ' s silence ||| 0.200000 0.600000 0.200000 0.200000 0.200000 0.600000 +zu ||| for this minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu ||| minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zu ||| this minute ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zum jahreswechsel ||| once again ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zum jahreswechsel ||| once ||| 0.200000 0.200000 0.600000 0.200000 0.200000 0.600000 +zur ||| madam ||| 0.200000 0.200000 0.600000 0.600000 0.200000 0.200000 diff --git a/example/scripts/PUT_MOSES_SCRIPTS_IN_HERE b/example/scripts/PUT_MOSES_SCRIPTS_IN_HERE new file mode 100644 index 0000000..e69de29 diff --git a/inbetwmert.sh b/inbetwmert.sh new file mode 100755 index 0000000..94f2b84 --- /dev/null +++ b/inbetwmert.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +# mmert v0.2 +# manipulate mert-moses.pl script +# Copyright 2011 +# Patrick Simianer +# Heidelberg University, ICL +# +# This file is part of MMERT. 
+# +# MMERT is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MMERT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with MMERT. If not, see http://www.gnu.org/licenses/. + + +function usage() +{ + echo "Usage: $0 " + exit +} + +if [ -z "$1" ]; then usage; fi + + +MMERTPKG=~/mmert/example/ # base directory +BIN=$MMERTPKG/bin/ # binaries (moses, mert, extractor) +DECODER=$BIN/moses # decoder +SCRIPTS=$MMERTPKG/scripts/ # moses scripts folder +FOLDER_PREFIX=mert_$1 # directory for logs (created in current directory) +WORKDIR=mmert_$1 # working directory for mmert (created in current directory) +PARALLEL=2 # number of moses/mert processes to run in parallel +DECODER_FLAGS="-th 1" # additional decoder flags, e.g. '-th 8' for 8 threads per moses instance, needs moses with thread support +TASKS=(A B C D) # used to identify tasks +TASKL="A,B,C,D" # same as above as string, task ids separated by ',' +NUMTASKS=${#TASKS[@]} # for parallelization, number of tasks +INI=$MMERTPKG/ini/moses.ini # one moses ini for all tasks (e.g. pooled model and/or same start weights)? or prefix for individual inis +SET=dev # dev or devtest (only used to build filenames)?
+#declare -A INIS +#INIS[${TASKS[0]}]=$INI/$SET/${TASKS[0]}/moses.ini # individual inis (for individual models and/or individual start weights) +#INIS[${TASKS[1]}]=$INI/$SET/${TASKS[1]}/moses.ini +#INIS[${TASKS[2]}]=$INI/$SET/${TASKS[2]}/moses.ini +#INIS[${TASKS[3]}]=$INI/$SET/${TASKS[3]}/moses.ini +#INIS[${TASKS[4]}]=$INI/$SET/${TASKS[4]}/moses.ini +#INIS[${TASKS[5]}]=$INI/$SET/${TASKS[5]}/moses.ini +#INIS[${TASKS[6]}]=$INI/$SET/${TASKS[6]}/moses.ini + +# the next variables enable the script to locate your dev set(s), please set accordingly +# see also run_mert_wrapper() function +FR=de # source language +EN=en # target language +TUNEFILE_PREFIX=epmini-$SET # to build filenames of dev(test) sets + +MAX_ITER=100 # max mert iterations +NBEST=100 # n for nbest lists +INBETW=./regmtl.py # script to run after MERT runs finished +NUM_WEIGHTS=14 # dimension, length of weight vector +MIN_CHANGE=0.01 # minimum change in average vector, stopping criterion +LAMBDA=0.01 # regularization parameter +FIRST_AVG=0 # 0: zero vector, 1: provide run0.avector.txt yourself (in $WORKDIR) + +# parameters +# $1 FR tuning data +# $2 EN tuning data +# $3 /path/to/ini +# $4 task id +# $5 --continue +# +# +function run_mert() +{ + ./mert-moses.pl \ + $1 \ + $2 \ + $DECODER \ + $3 \ + --no-filter-phrase-table \ + --working-dir $FOLDER_PREFIX"_$4" \ + --rootdir $SCRIPTS \ + --decoder-flags "$DECODER_FLAGS" \ + --mertdir $BIN \ + --inputtype=0 \ + --maximum-iterations=9999 \ + --efficient_scorenbest_flag \ + --nocase \ + --nonorm \ + --nbest=$NBEST \ + $5 +} + +function run_mert_wrapper() +{ + T=$1 + echo -e "\n ===> $IT ========>\n\n" | tee -a $WORKDIR/mert.$T.out >> $WORKDIR/mert.$T.err # tee -a: iteration header goes to BOTH logs (two chained '>>' only wrote the last file) + # replace $INI with ${INIS[$T]} to use separate inis + run_mert $MMERTPKG/data/$TUNEFILE_PREFIX-$T.$FR $MMERTPKG/data/$TUNEFILE_PREFIX-$T.$EN $INI $T $CONT >> $WORKDIR/mert.$T.out 2>> $WORKDIR/mert.$T.err +} + +function wait_for() +{ + echo "Waiting for ${#WAITFOR[@]} MERT procs..."
+ for pid in ${WAITFOR[@]}; do + wait $pid; + done +} + + +if [ ! -d "$WORKDIR" ]; then + mkdir $WORKDIR +fi + +IT=0 +while true; do + IT=$(($IT+1)) + if [ $IT -eq 1 ]; then + echo "First iteration" + CONT=""; + else + echo -e "\nContinue with $IT" + CONT="--continue"; + fi + + # first half + WAITFOR=() + for (( i = 1; i <= $PARALLEL; i++ )); do + echo "Start for ${TASKS[$i-1]}" + run_mert_wrapper ${TASKS[$i-1]} & + WAITFOR+=( $! ) + done + wait_for $WAITFOR + + # second half + WAITFOR=() + for (( i = $PARALLEL+1; i <= $NUMTASKS; i++)); do + echo "Start for ${TASKS[$i-1]}" + run_mert_wrapper ${TASKS[$i-1]} & + WAITFOR+=( $! ) + done + wait_for $WAITFOR + + echo "Running $INBETW ..." + $INBETW $FOLDER_PREFIX $WORKDIR $TASKL $IT $NUM_WEIGHTS $MIN_CHANGE $LAMBDA $FIRST_AVG + + if [ -f "$WORKDIR/CONVERGED" ]; then break; fi + if [ $IT -eq $MAX_ITER ]; then + echo "Reached global iteration limit ($MAX_ITER), stopping."; + break; + fi +done +echo 'done' + diff --git a/mert-moses.pl b/mert-moses.pl new file mode 100755 index 0000000..8c384ed --- /dev/null +++ b/mert-moses.pl @@ -0,0 +1,1346 @@ +#!/usr/bin/perl -w + +# $Id$ +# Usage: +# mert-moses.pl input-text references decoder-executable decoder.ini +# For other options see below or run 'mert-moses.pl --help' + +# Notes: +# input-text and references should be raw text files, one sentence per line +# references can be a prefix, in which case the files references0, references1, etc. are used + +# Revision history + +# 5 Aug 2009 Handling with different reference length policies (shortest, average, closest) for BLEU +# and case-sensitive/insensitive evaluation (Nicola Bertoldi) +# 5 Jun 2008 Forked previous version to support new mert implementation.
+# 13 Feb 2007 Better handling of default values for lambda, now works with multiple +# models and lexicalized reordering +# 11 Oct 2006 Handle different input types through parameter --inputtype=[0|1] +# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi) +# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table) +# useful if binary phrase tables are used (Nicola Bertoldi) +# 28 Aug 2006 Use either closest or average or shortest (default) reference +# length as effective reference length +# Use either normalization or not (default) of texts (Nicola Bertoldi) +# 31 Jul 2006 move gzip run*.out to avoid failure with restarts +# adding default paths +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) +# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) +# 27 Jul 2006 adding the safesystem() function to handle process failure +# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) +# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi) +# 18 Jul 2006 adapted for Moses and cleaned up (PK) +# 21 Jan 2005 unified various versions, thorough cleanup (DWC) +# now indexing accumulated n-best list solely by feature vectors +# 14 Dec 2004 reimplemented find_threshold_points in C (NMD) +# 25 Oct 2004 Use either average or shortest (default) reference +# length as effective reference length (DWC) +# 13 Oct 2004 Use alternative decoders (DWC) +# Original version by Philipp Koehn + +use FindBin qw($Bin); +use File::Basename; +my $SCRIPTS_ROOTDIR = $Bin; +$SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); + +# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list +# of [ default value, lower bound, upper bound ]-triples.
In most cases, only one triple is used, +# but the translation model has currently 5 features + +# defaults for initial values and ranges are: + +my $default_triples = { + # these basic models exist even if not specified, they are + # not associated with any model file + "w" => [ [ 0.0, 0.0, 1.0 ] ], # word penalty +}; + +my $additional_triples = { + # if the more lambda parameters for the weights are needed + # (due to additional tables) use the following values for them + "d" => [ [ 1.0, 0.0, 1.0 ] ], # lexicalized reordering model + "lm" => [ [ 1.0, 0.0, 1.0 ] ], # language model + "g" => [ [ 1.0, 0.0, 1.0 ], # generation model + [ 1.0, 0.0, 1.0 ] ], + "tm" => [ [ 0.3, 0.0, 1.0 ], # translation model + [ 0.2, 0.0, 1.0 ], + [ 0.3, 0.0, 1.0 ], + [ 0.2, 0.0, 1.0 ], + [ 0.0, 0.0, 1.0 ] ], # ... last weight is phrase penalty + "lex"=> [ [ 0.1, 0.0, 1.0 ] ], # global lexical model + "I" => [ [ 0.0, 0.0, 1.0 ] ], # input lattice scores +}; + # the following models (given by shortname) use same triplet + # for any number of lambdas, the number of the lambdas is determined + # by the ini file +my $additional_tripes_loop = { map { ($_, 1) } qw/ d I / }; + +# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line) +# uses ABBR names. +my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation lex=weight-lex I=weight-i"; +my %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP; + +# We parse moses.ini to figure out how many weights do we need to optimize. +# For this, we must know the correspondence between options defining files +# for models and options assigning weights to these models. 
+my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d generation-file=g global-lexical-file=lex link-param-count=I"; +my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP; + +# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize +#my $extra_lambdas_for_model = { +# "w" => 1, # word penalty +# "d" => 1, # basic distortion +#}; + +my $minimum_required_change_in_weights = 0.00001; + # stop if no lambda changes more than this + +my $verbose = 0; +my $usage = 0; # request for --help +my $___WORKING_DIR = "mert-work"; +my $___DEV_F = undef; # required, input text to decode +my $___DEV_E = undef; # required, basename of files with references +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file +my $___N_BEST_LIST_SIZE = 100; +my $queue_flags = "-hard"; # extra parameters for parallelizer + # the -l ws0ssmt is relevant only to JHU workshop +my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder +my $___LAMBDA = undef; # string specifying the seed weights and boundaries of all lambdas +my $continue = 0; # should we try to continue from the last saved step?
+my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) +my $___FILTER_PHRASE_TABLE = 1; # filter phrase table +my $___PREDICTABLE_SEEDS = 0; +my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009] +my $___RANDOM_DIRECTIONS = 0; # search in random directions only +my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008] +my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011] + +# Parameter for effective reference length when computing BLEU score +# Default is to use shortest reference +# Use "--shortest" to use shortest reference length +# Use "--average" to use average reference length +# Use "--closest" to use closest reference length +# Only one between --shortest, --average and --closest can be set +# If more than one is chosen, the default (--shortest) is used +my $___SHORTEST = 0; +my $___AVERAGE = 0; +my $___CLOSEST = 0; + +# Use "--nocase" to compute case-insensitive scores +my $___NOCASE = 0; + +# Use "--nonorm" to not normalize the translation before computing scores +my $___NONORM = 0; + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + + +my $mertdir = undef; # path to new mert directory +my $mertargs = undef; # args to pass through to mert +my $filtercmd = undef; # path to filter-model-given-input.pl +my $filterfile = undef; +my $qsubwrapper = undef; +my $moses_parallel_cmd = undef; +my $old_sge = 0; # assume sge<6.0 +my $___CONFIG_BAK = undef; # backup pathname to startup ini file +my $efficient_scorenbest_flag = undef; # set to 1 to activate a time-efficient scoring of nbest lists + # (this method is more memory-intensive) +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my
$prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1) + # -1 means all previous, i.e. from iteration 1 + # 0 means no previous data, i.e. from actual iteration + # 1 means 1 previous data , i.e. from the actual iteration and from the previous one + # and so on +my $starting_weights_from_ini = 1; + +my $maximum_iterations = 25; + +use strict; +use Getopt::Long; +GetOptions( + "working-dir=s" => \$___WORKING_DIR, + "input=s" => \$___DEV_F, + "inputtype=i" => \$___INPUTTYPE, + "refs=s" => \$___DEV_E, + "decoder=s" => \$___DECODER, + "config=s" => \$___CONFIG, + "nbest=i" => \$___N_BEST_LIST_SIZE, + "queue-flags=s" => \$queue_flags, + "jobs=i" => \$___JOBS, + "decoder-flags=s" => \$___DECODER_FLAGS, + "lambdas=s" => \$___LAMBDA, + "continue" => \$continue, + "skip-decoder" => \$skip_decoder, + "shortest" => \$___SHORTEST, + "average" => \$___AVERAGE, + "closest" => \$___CLOSEST, + "nocase" => \$___NOCASE, + "nonorm" => \$___NONORM, + "help" => \$usage, + "verbose" => \$verbose, + "mertdir=s" => \$mertdir, + "mertargs=s" => \$mertargs, + "rootdir=s" => \$SCRIPTS_ROOTDIR, + "filtercmd=s" => \$filtercmd, # allow to override the default location + "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets) + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location + "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location + "old-sge" => \$old_sge, #passed to moses-parallel + "filter-phrase-table!" 
=> \$___FILTER_PHRASE_TABLE, # allow (disallow)filtering of phrase tables + "predictable-seeds" => \$___PREDICTABLE_SEEDS, # allow (disallow) switch on/off reseeding of random restarts + "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points + "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions + "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions + "efficient_scorenbest_flag" => \$efficient_scorenbest_flag, # activate a time-efficient scoring of nbest lists + "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) + "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. all previous) + "maximum-iterations=i" => \$maximum_iterations, + "starting-weights-from-ini!" => \$starting_weights_from_ini, + "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER +) or exit(1); + +# the 4 required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 4) { + # required parameters: input_file references_basename decoder_executable + $___DEV_F = shift; + $___DEV_E = shift; + $___DECODER = shift; + $___CONFIG = shift; +} + +if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) { + print STDERR "usage: mert-moses.pl input-text references decoder-executable decoder.ini +Options: + --working-dir=mert-dir ... where all the files are created + --nbest=100 ... how big nbestlist to generate + --jobs=N ... set this to anything to run moses in parallel + --mosesparallelcmd=STR ... use a different script instead of moses-parallel + --queue-flags=STRING ... anything you with to pass to qsub, eg. + '-l ws06osssmt=true'. 
The default is: '-hard' + To reset the parameters, please use + --queue-flags=' ' + (i.e. a space between the quotes). + --decoder-flags=STRING ... extra parameters for the decoder + --lambdas=STRING ... default values and ranges for lambdas, a + complex string such as + 'd:1,0.5-1.5 lm:1,0.5-1.5 tm:0.3,0.25-0.75;0.2,0.25-0.75;0.2,0.25-0.75;0.3,0.25-0.75;0,-0.5-0.5 w:0,-0.5-0.5' + --allow-unknown-lambda ... keep going even if someone supplies a new + lambda in the lambdas option (such as + 'superbmodel:1,0-1'); optimize it, too + --continue ... continue from the last successful iteration + --skip-decoder ... skip the decoder run for the first time, + assuming that we got interrupted during + optimization + --shortest --average --closest + ... Use shortest/average/closest reference length + as effective reference length (mutually exclusive) + --nocase ... Do not preserve case information; i.e. + case-insensitive evaluation (default is false). + --nonorm ... Do not use text normalization (flag is not active, + i.e. text is NOT normalized) + --filtercmd=STRING ... path to filter-model-given-input.pl + --filterfile=STRING ... path to alternative to input-text for filtering + model. useful for lattice decoding + --rootdir=STRING ... where do helpers reside (if not given explicitly) + --mertdir=STRING ... path to new mert implementation + --mertargs=STRING ... extra args for mert, eg. to specify scorer + --scorenbestcmd=STRING ... path to score-nbest.py + --old-sge ... passed to parallelizers, assume Grid Engine < 6.0 + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) + --no-filter-phrase-table ... disallow filtering of phrase tables + (useful if binary phrase tables are available) + --predictable-seeds ... provide predictable seeds to mert so that random + restarts are the same on every run + --efficient_scorenbest_flag ... 
time-efficient scoring of nbest lists + (this method is more memory-consumptive) + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --prev-aggregate-nbestlist=INT ... number of previous step to consider when + loading data (default = $prev_aggregate_nbl_size) + -1 means all previous, i.e. from iteration 1 + 0 means no previous data, i.e. only the + current iteration + N means this and N previous iterations + + --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations + --starting-weights-from-ini ... use the weights given in moses.ini file as + the starting weights (and also as the fixed + weights if --activate-features is used). + default: yes (used to be 'no') + --random-directions ... search only in random directions + --number-of-random-directions=int ... number of random directions + (also works with regular optimizer, default: 0) +"; + exit 1; +} + + +# Check validity of input parameters and set defaults if needed + +print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n"; + +# path of script for filtering phrase tables and running the decoder +$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd; + +$qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper; + +$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl" + if !defined $moses_parallel_cmd; + + + +if (!defined $mertdir) { + $mertdir = "$SCRIPTS_ROOTDIR/../mert"; + print STDERR "Assuming --mertdir=$mertdir\n"; +} + +my $mert_extract_cmd = "$mertdir/extractor"; +my $mert_mert_cmd = "$mertdir/mert"; + +die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; +die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; + +my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation +if ($___PAIRWISE_RANKED_OPTIMIZER && ! 
-x $pro_optimizer) { + print "did not find $pro_optimizer, installing it in $mertdir\n"; + `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`; + `gunzip $pro_optimizer.gz`; + `chmod +x $pro_optimizer`; + die("ERROR: Installation of megam_i686.opt failed! Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer; +} + +$mertargs = "" if !defined $mertargs; + +my $scconfig = undef; +if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){ + $scconfig=$1; + $scconfig =~ s/\,/ /g; + $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//; +} + +# handling reference lengh strategy +if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){ + die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"; +} + +if ($___SHORTEST){ + $scconfig .= " reflen:shortest"; +}elsif ($___AVERAGE){ + $scconfig .= " reflen:average"; +}elsif ($___CLOSEST){ + $scconfig .= " reflen:closest"; +} + +# handling case-insensitive flag +if ($___NOCASE) { + $scconfig .= " case:false"; +}else{ + $scconfig .= " case:true"; +} +$scconfig =~ s/^\s+//; +$scconfig =~ s/\s+$//; +$scconfig =~ s/\s+/,/g; + +$scconfig = "--scconfig $scconfig" if ($scconfig); + +my $mert_extract_args=$mertargs; +$mert_extract_args .=" $scconfig"; + +my $mert_mert_args=$mertargs; +$mert_mert_args =~ s/\-+(binary|b)\b//; +$mert_mert_args .=" $scconfig"; +if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; } + +my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd); +die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd; +die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd; +die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; +die "Not executable: $___DECODER" if ! -x $___DECODER; + + +my $input_abs = ensure_full_path($___DEV_F); +die "File not found: $___DEV_F (interpreted as $input_abs)." + if ! 
-e $input_abs; +$___DEV_F = $input_abs; + + +# Option to pass to qsubwrapper and moses-parallel +my $pass_old_sge = $old_sge ? "-old-sge" : ""; + +my $decoder_abs = ensure_full_path($___DECODER); +die "File not found: $___DECODER (interpreted as $decoder_abs)." + if ! -x $decoder_abs; +$___DECODER = $decoder_abs; + + +my $ref_abs = ensure_full_path($___DEV_E); +# check if English dev set (reference translations) exist and store a list of all references +my @references; +if (-e $ref_abs) { + push @references, $ref_abs; +} +else { + # if multiple file, get a full list of the files + my $part = 0; + while (-e $ref_abs.$part) { + push @references, $ref_abs.$part; + $part++; + } + die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part; +} + +my $config_abs = ensure_full_path($___CONFIG); +die "File not found: $___CONFIG (interpreted as $config_abs)." + if ! -e $config_abs; +$___CONFIG = $config_abs; + + + +# check validity of moses.ini and collect number of models and lambdas per model +# need to make a copy of $extra_lambdas_for_model, scan_config spoils it +#my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model; +my %used_triples = %{$default_triples}; +my ($models_used) = scan_config($___CONFIG); + +# Parse the lambda config string and convert it to a nice structure in the same format as $used_triples +if (defined $___LAMBDA) { + my %specified_triples; + # interpreting lambdas from command line + foreach (split(/\s+/,$___LAMBDA)) { + my ($name,$values) = split(/:/); + die "Malformed setting: '$_', expected name:values\n" if !defined $name || !defined $values; + foreach my $startminmax (split/;/,$values) { + if ($startminmax =~ /^(-?[\.\d]+),(-?[\.\d]+)-(-?[\.\d]+)$/) { + my $start = $1; + my $min = $2; + my $max = $3; + push @{$specified_triples{$name}}, [$start, $min, $max]; + } + else { + die "Malformed feature range definition: $name => $startminmax\n"; + } + } + } + # sanity checks for specified lambda triples + 
foreach my $name (keys %used_triples) { + die "No lambdas specified for '$name', but ".($#{$used_triples{$name}}+1)." needed.\n" + unless defined($specified_triples{$name}); + die "Number of lambdas specified for '$name' (".($#{$specified_triples{$name}}+1).") does not match number needed (".($#{$used_triples{$name}}+1).")\n" + if (($#{$used_triples{$name}}) != ($#{$specified_triples{$name}})); + } + foreach my $name (keys %specified_triples) { + die "Lambdas specified for '$name' ".(@{$specified_triples{$name}}).", but none needed.\n" + unless defined($used_triples{$name}); + } + %used_triples = %specified_triples; +} + +# moses should use our config +if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / +|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / +|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / +) { + die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; +} + +# as weights are normalized in the next steps (by cmert) +# normalize initial LAMBDAs, too +my $need_to_normalize = 1; + + + +my @order_of_lambdas_from_decoder = (); +# this will store the labels of scores coming out of the decoder (and hence the order of lambdas coming out of mert) +# we will use the array to interpret the lambdas +# the array gets filled with labels only after first nbestlist was generated + + + + +#store current directory and create the working directory (if needed) +my $cwd = `pawd 2>/dev/null`; +if(!$cwd){$cwd = `pwd`;} +chomp($cwd); + +safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR"; + +{ +# open local scope + +#chdir to the working directory +chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR"; + +# fixed 
file names +my $mert_outfile = "mert.out"; +my $mert_logfile = "mert.log"; +my $weights_in_file = "init.opt"; +my $weights_out_file = "weights.txt"; + + +# set start run +my $start_run = 1; +my $bestpoint = undef; +my $devbleu = undef; + +my $prev_feature_file = undef; +my $prev_score_file = undef; +my $prev_init_file = undef; + +if ($continue) { + # getting the last finished step + print STDERR "Trying to continue an interrupted optimization.\n"; + open IN, "finished_step.txt" or die "Failed to find the step number, failed to read finished_step.txt"; + my $step = ; + chomp $step; + close IN; + + print STDERR "Last finished step is $step\n"; + + # getting the first needed step + my $firststep; + if ($prev_aggregate_nbl_size==-1){ + $firststep=1; + } + else{ + $firststep=$step-$prev_aggregate_nbl_size+1; + $firststep=($firststep>0)?$firststep:1; + } + +#checking if all needed data are available + if ($firststep<=$step){ + print STDERR "First previous needed data index is $firststep\n"; + print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n"; + + for (my $prevstep=$firststep; $prevstep<=$step;$prevstep++){ + print STDERR "Checking whether data of step $prevstep are available\n"; + if (! -e "run$prevstep.features.dat"){ + die "Can't start from step $step, because run$prevstep.features.dat was not found!"; + }else{ + if (defined $prev_feature_file){ + $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat"; + } + else{ + $prev_feature_file = "run$prevstep.features.dat"; + } + } + if (! -e "run$prevstep.scores.dat"){ + die "Can't start from step $step, because run$prevstep.scores.dat was not found!"; + }else{ + if (defined $prev_score_file){ + $prev_score_file = "${prev_score_file},run$prevstep.scores.dat"; + } + else{ + $prev_score_file = "run$prevstep.scores.dat"; + } + } + if (! 
-e "run$prevstep.${weights_in_file}"){ + die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!"; + }else{ + if (defined $prev_init_file){ + $prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}"; + } + else{ + $prev_init_file = "run$prevstep.${weights_in_file}"; + } + } + } + if (! -e "run$step.weights.txt"){ + die "Can't start from step $step, because run$step.weights.txt was not found!"; + } + if (! -e "run$step.$mert_logfile"){ + die "Can't start from step $step, because run$step.$mert_logfile was not found!"; + } + if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz"){ + die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"; + } + print STDERR "All needed data are available\n"; + + print STDERR "Loading information from last step ($step)\n"; + open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile"; + while () { + if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { + $bestpoint = $1; + $devbleu = $2; + last; + } + } + close IN; + die "Failed to parse mert.log, missed Best point there." + if !defined $bestpoint || !defined $devbleu; + print "($step) BEST at $step $bestpoint => $devbleu at ".`date`; + + my @newweights = split /\s+/, $bestpoint; + + + print STDERR "Reading last cached lambda values (result from step $step)\n"; + @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |"); + + + # update my cache of lambda values + store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights); + + } + else{ + print STDERR "No pevious data are needed\n"; + } + + $start_run = $step +1; +} + +if ($___FILTER_PHRASE_TABLE){ + # filter the phrase tables wih respect to input, use --decoder-flags + print "filtering the phrase tables... 
".`date`; + my $___FILTER_F = $___DEV_F; + $___FILTER_F = $filterfile if (defined $filterfile); + my $cmd = "$filtercmd ./filtered $___CONFIG $___FILTER_F"; + if (defined $___JOBS && $___JOBS > 0) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=filterphrases.out -stderr=filterphrases.err" ) + or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)"; + } else { + safesystem($cmd) or die "Failed to filter the tables."; + } + + # make a backup copy of startup ini file + $___CONFIG_BAK = $___CONFIG; + # the decoder should now use the filtered model + $___CONFIG = "filtered/moses.ini"; +} +else{ + # do not filter phrase tables (useful if binary phrase tables are available) + # use the original configuration file + $___CONFIG_BAK = $___CONFIG; +} + +my $PARAMETERS; +#$PARAMETERS = $___DECODER_FLAGS . " -config $___CONFIG -inputtype $___INPUTTYPE"; +$PARAMETERS = $___DECODER_FLAGS; + +my $run=$start_run-1; + +my $oldallsorted = undef; +my $allsorted = undef; + +my $cmd; +# features and scores from the last run. 
+my $nbest_file=undef; + +#while(1) { + $run++; + if ($maximum_iterations && $run > $maximum_iterations) { + print "Maximum number of iterations exceeded - stopping\n"; + last; + } + # run beamdecoder with option to output nbestlists + # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists + + print "run $run start at ".`date`; + + # In case something dies later, we might wish to have a copy + create_config($___CONFIG, "./run$run.moses.ini", \%used_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--")); + + + # skip if the user wanted + if (!$skip_decoder) { + print "($run) run decoder to produce n-best lists\n"; + $nbest_file = run_decoder(\%used_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize); + $need_to_normalize = 0; + safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out"; + $nbest_file = $nbest_file.".gz"; + } + else { + $nbest_file="run$run.best$___N_BEST_LIST_SIZE.out.gz"; + print "skipped decoder run $run\n"; + if (0 == scalar @order_of_lambdas_from_decoder) { + @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -dc $nbest_file | head -1 |"); + } + $skip_decoder = 0; + $need_to_normalize = 0; + } + + + + # extract score statistics and features from the nbest lists + print STDERR "Scoring the nbestlist.\n"; + + my $base_feature_file = "features.dat"; + my $base_score_file = "scores.dat"; + my $feature_file = "run$run.${base_feature_file}"; + my $score_file = "run$run.${base_score_file}"; + + $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." 
-n $nbest_file"; + + if (defined $___JOBS && $___JOBS > 0) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=extract.out -stderr=extract.err" ) + or die "Failed to submit extraction to queue (via $qsubwrapper)"; + } else { + safesystem("$cmd > extract.out 2> extract.err") or die "Failed to do extraction of statistics."; + } + + # Create the initial weights file for mert, in init.opt + # mert reads in the file init.opt containing the current + # values of lambda. + + # We need to prepare the files and **the order of the lambdas must + # correspond to the order @order_of_lambdas_from_decoder + + my @MIN = (); # lower bounds + my @MAX = (); # upper bounds + my @CURR = (); # the starting values + my @NAME = (); # to which model does the lambda belong + + my %visited = (); + foreach my $name (@order_of_lambdas_from_decoder) { + if (!defined $visited{$name}) { + $visited{$name} = 0; + } else { + $visited{$name}++; + } + my ($val, $min, $max) = @{$used_triples{$name}->[$visited{$name}]}; + push @CURR, $val; + push @MIN, $min; + push @MAX, $max; + push @NAME, $name; + } + + open(OUT,"> $weights_in_file") or die "Can't write $weights_in_file (WD now $___WORKING_DIR)"; + print OUT join(" ", @CURR)."\n"; + print OUT join(" ", @MIN)."\n"; + print OUT join(" ", @MAX)."\n"; + close(OUT); + print join(" ", @NAME)."\n"; + + # make a backup copy labelled with this run number + safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; + + my $DIM = scalar(@CURR); # number of lambdas + + # run mert + $cmd = "$mert_mert_cmd -d $DIM $mert_mert_args -n 20"; + if ($___PREDICTABLE_SEEDS) { + my $seed = $run * 1000; + $cmd = $cmd." -r $seed"; + } + if ($___RANDOM_DIRECTIONS) { + if ($___NUM_RANDOM_DIRECTIONS == 0) { + $cmd .= " -m 50"; + } + $cmd = $cmd." -t random-direction"; + } + if ($___NUM_RANDOM_DIRECTIONS) { + $cmd .= " -m $___NUM_RANDOM_DIRECTIONS"; + } + + if (defined $prev_feature_file) { + $cmd = $cmd." 
--ffile $prev_feature_file,$feature_file"; + } + else{ + $cmd = $cmd." --ffile $feature_file"; + } + if (defined $prev_score_file) { + $cmd = $cmd." --scfile $prev_score_file,$score_file"; + } + else{ + $cmd = $cmd." --scfile $score_file"; + } + if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) { + $cmd = $cmd." --ifile $prev_init_file,run$run.$weights_in_file"; + } + else{ + $cmd = $cmd." --ifile run$run.$weights_in_file"; + } + + if ($___PAIRWISE_RANKED_OPTIMIZER) { + $cmd .= " --pro pro.data ; echo 'not used' > $weights_out_file; ~/statmt/project/megam/megam_i686.opt -fvals -maxi 30 -nobias binary pro.data"; + } + + if (defined $___JOBS && $___JOBS > 0) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stdout=$mert_outfile -stderr=$mert_logfile -queue-parameter=\"$queue_flags\"") or die "Failed to start mert (via qsubwrapper $qsubwrapper)"; + } + else { + safesystem("$cmd > $mert_outfile 2> $mert_logfile") or die "Failed to run mert"; + } + die "Optimization failed, file $weights_out_file does not exist or is empty" + if ! 
-s $weights_out_file; + + + # backup copies + safesystem ("\\cp -f extract.err run$run.extract.err") or die; + safesystem ("\\cp -f extract.out run$run.extract.out") or die; + if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; } + safesystem ("\\cp -f $mert_outfile run$run.$mert_outfile") or die; + safesystem ("\\cp -f $mert_logfile run$run.$mert_logfile") or die; + safesystem ("touch $mert_logfile run$run.$mert_logfile") or die; + safesystem ("\\cp -f $weights_out_file run$run.$weights_out_file") or die; # this one is needed for restarts, too + + print "run $run end at ".`date`; + + $bestpoint = undef; + $devbleu = undef; + if ($___PAIRWISE_RANKED_OPTIMIZER) { + open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile"; + my (@WEIGHT,$sum); + foreach (@CURR) { push @WEIGHT, 0; } + while() { + if (/^F(\d+) ([\-\.\de]+)/) { + $WEIGHT[$1] = $2; + $sum += abs($2); + } + } + $devbleu = "unknown"; + foreach (@WEIGHT) { $_ /= $sum; } + $bestpoint = join(" ",@WEIGHT); + close IN; + } + else { + open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile"; + while () { + if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { + $bestpoint = $1; + $devbleu = $2; + last; + } + } + close IN; + } + die "Failed to parse mert.log, missed Best point there." + if !defined $bestpoint || !defined $devbleu; + print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`; + + my @newweights = split /\s+/, $bestpoint; + + # update my cache of lambda values + store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights); + + ## additional stopping criterion: weights have not changed + my $shouldstop = 1; + for(my $i=0; $i<@CURR; $i++) { + die "Lost weight! 
mert reported fewer weights (@newweights) than we gave it (@CURR)" + if !defined $newweights[$i]; + if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) { + $shouldstop = 0; + last; + } + } + + open F, "> finished_step.txt" or die "Can't mark finished step"; + print F $run."\n"; + close F; + + if ($shouldstop) { + print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n"; + last; + } + + my $firstrun; + if ($prev_aggregate_nbl_size==-1){ + $firstrun=1; + } + else{ + $firstrun=$run-$prev_aggregate_nbl_size+1; + $firstrun=($firstrun>0)?$firstrun:1; + } + print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n"; + $prev_feature_file = undef; + $prev_score_file = undef; + $prev_init_file = undef; + for (my $i=$firstrun;$i<=$run;$i++){ + if (defined $prev_feature_file){ + $prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}"; + } + else{ + $prev_feature_file = "run${i}.${base_feature_file}"; + } + if (defined $prev_score_file){ + $prev_score_file = "${prev_score_file},run${i}.${base_score_file}"; + } + else{ + $prev_score_file = "run${i}.${base_score_file}"; + } + if (defined $prev_init_file){ + $prev_init_file = "${prev_init_file},run${i}.${weights_in_file}"; + } + else{ + $prev_init_file = "run${i}.${weights_in_file}"; + } + } + print "loading data from $prev_feature_file\n" if defined($prev_feature_file); + print "loading data from $prev_score_file\n" if defined($prev_score_file); + print "loading data from $prev_init_file\n" if defined($prev_init_file); +#} +print "Training finished at ".`date`; + +if (defined $allsorted){ safesystem ("\\rm -f $allsorted") or die; }; + +safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; +safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die; + +create_config($___CONFIG_BAK, "./moses.ini", \%used_triples, $run, $devbleu); + +# just to be sure that we have the really last 
finished step marked +open F, "> finished_step.txt" or die "Can't mark finished step"; +print F $run."\n"; +close F; + + +#chdir back to the original directory # useless, just to remind we were not there +chdir($cwd); + +} # end of local scope +

# Overwrite the current value (element 0) of each lambda triple with a new weight.
#   $triples : hashref, feature name => arrayref of [value, min, max] triples
#   $names   : arrayref of feature names, parallel to $values; a name occurs
#              once for every lambda belonging to that feature
#   $values  : arrayref of the new weights
# Dies if a weight has no name, if a name is not a key of $triples, or if more
# weights arrive for a feature than $triples holds for it.
sub store_new_lambda_values {
  my ($triples, $names, $values) = @_;

  my %idx = ();  # feature name => index of the lambda seen most recently for it
  foreach my $i (0 .. $#{$values}) {
    my $name = $names->[$i];
    die "Missed name for lambda $values->[$i] (in @$values; names: @$names)"
      if !defined $name;
    $idx{$name} = defined $idx{$name} ? $idx{$name} + 1 : 0;
    die "We did not optimize '$name', but moses returned it back to us"
      if !defined $triples->{$name};
    # The count must be parenthesized: '.' and '+' share one precedence level
    # and associate to the left, so an unparenthesized "+1" would be applied
    # to the whole concatenated message.
    die "Moses gave us too many lambdas for '$name', we had ".scalar(@{$triples->{$name}})
      ." but we got at least ".($idx{$name}+1)
      if !defined $triples->{$name}->[$idx{$name}];

    # set the corresponding field in triples
    $triples->{$name}->[$idx{$name}]->[0] = $values->[$i];
  }
}

# Debugging aid: print every [value, min, max] triple of every feature in
# $triples (hashref, feature name => arrayref of triples) to STDERR.
sub dump_triples {
  my $triples = shift;

  foreach my $name (keys %$triples) {
    foreach my $triple (@{$triples->{$name}}) {
      my ($val, $min, $max) = @$triple;
      print STDERR "Triples: $name\t$val\t$min\t$max ($triple)\n";
    }
  }
}


# Run the decoder once and return the name of the n-best list it was told to write.
#   $triples                 : hashref, feature name => arrayref of [value, min, max]
#   $parameters              : extra decoder parameters, passed through verbatim
#   $run                     : iteration number, used in the output file names
#   $output_order_of_lambdas : arrayref; when empty it is filled with the score
#                              labels read from the first line of the n-best list
#   $need_to_normalize       : when true, the weights are divided by the sum of
#                              their absolute values before being passed on
# Uses $moses_parallel_cmd when $___JOBS is positive, otherwise $___DECODER
# directly. Dies if the command fails.
sub run_decoder {
  my ($triples, $parameters, $run, $output_order_of_lambdas, $need_to_normalize) = @_;
  my $filename = sprintf("run%d.best$___N_BEST_LIST_SIZE.out", $run);

  print "params = $parameters\n";
  # prepare the decoder config: a "-name %.6f %.6f ..." template plus the
  # values that will be substituted into it
  my $decoder_config = "";
  my @vals = ();
  foreach my $name (keys %$triples) {
    $decoder_config .= "-$name ";
    foreach my $triple (@{$triples->{$name}}) {
      $decoder_config .= "%.6f ";
      push @vals, $triple->[0];
    }
  }
  if ($need_to_normalize) {
    print STDERR "Normalizing lambdas: @vals\n";
    my $totlambda = 0;
    $totlambda += abs($_) foreach @vals;
    if ($totlambda > 0) {
      $_ /= $totlambda foreach @vals;
    }
  }
  print STDERR "DECODER_CFG = $decoder_config\n";
  print STDERR " values = @vals\n";
  $decoder_config = sprintf($decoder_config, @vals);
  print "decoder_config = $decoder_config\n";

  # run the decoder
  my $decoder_cmd;
  if (defined $___JOBS && $___JOBS > 0) {
    $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$parameters $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
  } else {
    $decoder_cmd = "$___DECODER $parameters -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
  }

  safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";

  if (0 == scalar @$output_order_of_lambdas) {
    # we have to peek at the nbestlist
    @$output_order_of_lambdas = get_order_of_scores_from_nbestlist($filename);
  }
  # we have checked the nbestlist already, we trust the order of output scores does not change
  return $filename;
}

# Read the first line of an n-best list and return the score labels in the
# order the scores appear, one entry per score (a label followed by three
# numbers yields that label three times).
#   $fname_or_source : a file name, or a pipe specification ending in "|"
# The third "|||"-separated column is expected to look like
#   label: num num num label2: num
# Dies if the source cannot be opened, has no first line, has no score column,
# or contains a token that is neither a label nor a number.
sub get_order_of_scores_from_nbestlist {
  my $fname_or_source = shift;
  print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n";
  # Deliberately a two-argument open: some callers pass a command such as
  # "gunzip -c < file |" rather than a plain file name.
  open IN, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source'";
  my $line = <IN>;
  close IN;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;
  my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line;
  # a line without enough ||| separators has no score column at all
  die "No scores in line: $line" if !defined $scores;
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  foreach my $tok (split /\s+/, $scores) {
    if ($tok =~ /^([a-z][0-9a-z]*):/i) {
      $label = $1;
    } elsif ($tok =~ /^-?[-0-9.e]+$/) {
      # a score found, remember it
      die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
        if !defined $label;
      push @order, $label;
    } else {
      die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}

# Write a new decoder ini file by copying an existing one and overriding
# selected sections.
#   $infn          : source config to clone
#   $outfn         : where to save the new config
#   $triples       : hashref, feature name => arrayref of [value, min, max];
#                    the values become the contents of the weight sections
#   $iteration     : written into the header comment only
#   $bleu_achieved : written into the header comment only
# Overrides come from two places: "-name value ..." pairs in the global
# $___DECODER_FLAGS, and the current values in $triples (which replace any
# weights given in the flags). Names are expanded through %ABBR2FULL where an
# entry exists. A "config" override is never written. Overridden sections that
# do not exist in $infn are appended at the end.
# Side effect: $___DECODER_FLAGS is whitespace-normalized in place.
sub create_config {
  my $infn = shift; # source config
  my $outfn = shift; # where to save the config
  my $triples = shift; # the lambdas we should write
  my $iteration = shift; # just for verbosity
  my $bleu_achieved = shift; # just for verbosity

  my %P; # the hash of all parameters we wish to override

  # first convert the command line parameters to the hash
  { # ensure local scope of vars
    my $parameter = undef;
    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
    # /g is required on both substitutions: without it only the leading
    # whitespace is trimmed and only the first run of whitespace is collapsed,
    # which leaves empty tokens in the split below.
    $___DECODER_FLAGS =~ s/^\s*|\s*$//g;
    $___DECODER_FLAGS =~ s/\s+/ /g;
    foreach (split(/ /, $___DECODER_FLAGS)) {
      if (/^\-([^\d].*)$/) {
        $parameter = $1;
        $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
      }
      else {
        die "Found value with no -paramname before it: $_"
          if !defined $parameter;
        push @{$P{$parameter}}, $_;
      }
    }
  }

  # Convert weights to elements in P
  foreach my $abbr (keys %$triples) {
    # First delete all weights params from the input, in short or long-named version
    delete($P{$abbr});
    delete($P{$ABBR2FULL{$abbr}}) if defined $ABBR2FULL{$abbr};
    # Then feed P with the current values, taken from the $triples argument
    my $name = defined $ABBR2FULL{$abbr} ? $ABBR2FULL{$abbr} : $abbr;
    foreach my $feature (@{$triples->{$abbr}}) {
      push @{$P{$name}}, $feature->[0];
    }
  }

  # create new moses.ini decoder config file by cloning and overriding the original one
  open(my $ini_fh, '<', $infn) or die "Can't read $infn";
  delete($P{"config"}); # never output
  print "Saving new config to: $outfn\n";
  open(my $out_fh, '>', $outfn) or die "Can't write $outfn";
  print {$out_fh} "# MERT optimized configuration\n";
  print {$out_fh} "# decoder $___DECODER\n";
  print {$out_fh} "# BLEU $bleu_achieved on dev $___DEV_F\n";
  print {$out_fh} "# We were before running iteration $iteration\n";
  print {$out_fh} "# finished ".`date`;

  my $line = <$ini_fh>;
  while (1) {
    last unless defined $line;

    # skip until hit [parameter]; on the way, copy only comment and blank lines
    my ($parameter) = $line =~ /^\[(.+)\]\s*$/;
    if (!defined $parameter) {
      $line = <$ini_fh>;
      print {$out_fh} $line
        if defined $line && ($line =~ /^\#/ || $line =~ /^\s+$/);
      next;
    }

    # parameter name
    $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
    print {$out_fh} "[$parameter]\n";

    # change parameter, if new values
    if (defined($P{$parameter})) {
      # write new values
      print {$out_fh} "$_\n" foreach @{$P{$parameter}};
      delete($P{$parameter});
      # skip until new parameter, only write comments
      while (defined($line = <$ini_fh>)) {
        print {$out_fh} $line if $line =~ /^\#/ || $line =~ /^\s+$/;
        last if $line =~ /^\[/;
      }
      next;
    }

    # unchanged parameter, write old
    while (defined($line = <$ini_fh>)) {
      last if $line =~ /^\[/;
      print {$out_fh} $line;
    }
  }

  # write all additional parameters
  foreach my $parameter (keys %P) {
    print {$out_fh} "\n[$parameter]\n";
    print {$out_fh} "$_\n" foreach @{$P{$parameter}};
  }

  close($ini_fh);
  # buffered write errors only surface when the handle is closed
  close($out_fh) or die "Can't write $outfn";
  print STDERR "Saved: $outfn\n";
}

# Run a command through system(), logging it to STDERR.
# Returns true when the command exits with status 0 and false when it exits
# with a non-zero status. Terminates the whole script with exit(1) when the
# command could not be started or was killed by a signal.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    # The command text is passed as an argument, not interpolated into the
    # format, so a '%' inside the command cannot be read as a conversion.
    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# Turn a possibly relative path into a cleaned-up absolute one.
# The first "/nfsmnt" component is removed. A path that is then absolute is
# returned as is. Otherwise the current directory (from `pawd`, falling back
# to `pwd`) is prepended, "/./" and repeated slashes are collapsed, "dir/../"
# pairs are resolved (at most 10 passes), and trailing slashes are dropped.
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s/\/nfsmnt//;
  return $PATH if $PATH =~ /^\//;
  my $dir = `pawd 2>/dev/null`;
  if (!$dir) { $dir = `pwd`; }
  chomp($dir);
  $PATH = $dir."/".$PATH;
  $PATH =~ s/[\r\n]//g;
  $PATH =~ s/\/\.\//\//g;
  $PATH =~ s/\/+/\//g;
  my $sanity = 0;  # bounds the number of "/.." resolution passes
  while ($PATH =~ /\/\.\.\// && $sanity++ < 10) {
    $PATH =~ s/\/+/\//g;
    $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
  }
  $PATH =~ s/\/[^\/]+\/\.\.$//;
  $PATH =~ s/\/+$//;
  # the working directory itself may carry the prefix
  $PATH =~ s/\/nfsmnt//;
  return $PATH;
}
+ + + + +sub scan_config { + my $ini = shift; + my $inishortname = $ini; $inishortname =~ s/^.*\///; # for error reporting + # we get a pre-filled counts, because some lambdas are always needed (word penalty, for instance) + # as we walk though the ini file, we record how many extra lambdas do we need + # and finally, we report it + + # in which field (counting from zero) is the filename to check? + my %where_is_filename = ( + "ttable-file" => 4, + "generation-file" => 3, + "lmodel-file" => 3, + "distortion-file" => 3, + "global-lexical-file" => 1, + ); + # by default, each line of each section means one lambda, but some sections + # explicitly state a custom number of lambdas + my %where_is_lambda_count = ( + "ttable-file" => 3, + "generation-file" => 2, + "distortion-file" => 2, + "link-param-count" => 0, + ); + + my %weight_section_short_names = (%FULL2ABBR, + map { ($_, $_) } keys %ABBR2FULL); + # maps both long and short names of weight sections to the short names + + my $config_weights; + # to collect all weight values from moses.ini + # $config_weights->{shortname} is a reference to array of features + + open INI, $ini or die "Can't read $ini"; + my $section = undef; # name of the section we are reading + my $shortname = undef; # the corresponding short name + my $nr = 0; + my $error = 0; + my %defined_files; + my %defined_steps; # check the ini file for compatible mapping steps
and actually defined files + while () { + $nr++; + chomp; + next if /^\s*#/; # skip comments + next if /^\s*$/; # skip blank lines + if (/^\[([^\]]*)\]\s*$/) { + $section = $1; + $shortname = $TABLECONFIG2ABBR{$section}; + next; + } + if (defined $section && defined $weight_section_short_names{$section}) { + # this is a weight, store it + my $weightname = $weight_section_short_names{$section}; + $config_weights->{$weightname} = [] + if ! defined $config_weights->{$weightname}; + push @{$config_weights->{$weightname}}, $_; + } + if (defined $section && $section eq "mapping") { + # keep track of mapping steps used + $defined_steps{$1}++ if /^([TG])/ || /^\d+ ([TG])/; + } + if (defined $section + && (defined $where_is_filename{$section} + || defined $where_is_lambda_count{$section})) { + # this ini section is relevant to lambdas + my @flds = split / +/; + my $filenamefield = $where_is_filename{$section}; + if (defined $filenamefield) { + my $fn = $flds[$filenamefield]; + print STDERR "Checking the filename in $section: $fn\n" + if $verbose; + if (defined $fn && $fn !~ /^\s+$/) { + # this is a filename! check it + if ($fn !~ /^\//) { + $error = 1; + print STDERR "$inishortname:$nr:Filename not absolute: $fn\n"; + } + if (! -s $fn && ! -s "$fn.gz" && ! -s "$fn.binphr.idx" + && ! -s "$fn.binlexr.idx" ) { + $error = 1; + print STDERR "$inishortname:$nr:File does not exist or empty: $fn\n"; + } + # remember the number of files used, to know how many lambdas do we need + die "No short name was defined for section $section!" + if ! defined $shortname; + $defined_files{$shortname}++; + } + } + + my $lambdacountfield = $where_is_lambda_count{$section}; + # how many lambdas does this model need? + # either specified explicitly, or the default, i.e. one + my $needlambdas = defined $lambdacountfield + ? $flds[$lambdacountfield] : 1; + + print STDERR "Config needs $needlambdas lambdas for $section (i.e. 
$shortname)\n" if $verbose; + if (!defined $___LAMBDA # user provides all lambdas on his own + && (!defined $additional_triples->{$shortname} + || scalar(@{$additional_triples->{$shortname}}) < $needlambdas) + && (!defined $additional_tripes_loop->{$shortname}) + ) { + # Add triples with default values + if (!defined $additional_triples->{$shortname}) { + $additional_triples->{$shortname} = (); + } + while (scalar(@{$additional_triples->{$shortname}}) < $needlambdas) { + push @{$additional_triples->{$shortname}}, [1,-1,1]; + } + + } + # note: models may use less parameters than the maximum number + # of triples, but it is actually bad, because then the ranges + # may be meant for another parameter + my @triplets = @{$additional_triples->{$shortname}}; + for(my $lambda=0;$lambda<$needlambdas;$lambda++) { + my $triplet = $lambda; + $triplet %= scalar(@triplets) + if $additional_tripes_loop->{$shortname}; + my ($start, $min, $max) + = @{$triplets[$triplet]}; + push @{$used_triples{$shortname}}, [$start, $min, $max]; + } + } + } + die "$inishortname: File was empty!" if !$nr; + close INI; + for my $pair (qw/T=tm=translation G=g=generation/) { + my ($tg, $shortname, $label) = split /=/, $pair; + $defined_files{$shortname} = 0 if ! defined $defined_files{$shortname}; + $defined_steps{$tg} = 0 if ! defined $defined_steps{$tg}; + + if ($defined_files{$shortname} != $defined_steps{$tg}) { + print STDERR "$inishortname: You defined $defined_files{$shortname} files for $label but use $defined_steps{$tg} in [mapping]!\n"; + $error = 1; + } + } + + # The distance-based reordering model is never mentioned in moses.ini, + # except there is one extra weight-d in the list. So if we spot this + # one extra weight-d, we actually insert the triple for it. + # Hierarchical moses has no distance-based reordering. 
+ push @{$used_triples{"d"}}, [1.0, 0.0, 1.0] + if defined $config_weights->{"d"} + && (!defined $used_triples{"d"} + || scalar @{$config_weights->{"d"}} + == scalar @{$used_triples{"d"}} +1); + + # check the weights provided in the ini file and plug them into the triples + # if --starting-weights-from_ini + foreach my $weightname (keys %used_triples) { + if (!defined $config_weights->{$weightname}) { + print STDERR "$inishortname:Model requires weights '$weightname' but none were found in the ini file.\n"; + $error = 1; + next; + } + my $thesetriplets = $used_triples{$weightname}; + my $theseconfig_weights = $config_weights->{$weightname}; + if (scalar(@$thesetriplets) != scalar(@$theseconfig_weights)) { + print STDERR "$inishortname:Mismatched number of weights for '$weightname'. Expected " + .scalar(@$thesetriplets) .", got ".scalar(@$theseconfig_weights)."\n"; + $error = 1; + next; + } + if ($starting_weights_from_ini) { + # copy weights from moses.ini to the starting value of used_triplets + for (my $i=0; $i < @$theseconfig_weights; $i++) { + $thesetriplets->[$i]->[0] = $theseconfig_weights->[$i]; + } + } + } + + exit(1) if $error; + return (\%defined_files); +} + diff --git a/regmtl.py b/regmtl.py new file mode 100755 index 0000000..8a976f4 --- /dev/null +++ b/regmtl.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python2.6 + +""" +mmert v0.2 +multi-task mert +Copyright 2011 +Patrick Simianer +Heidelberg University, ICL +""" + +# This file is part of MMERT. +# +# MMERT is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# MMERT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with MMERT.  If not, see <http://www.gnu.org/licenses/>.


import os
import shutil
import sys


# Prefix of the line in a mert.log that carries the optimized weights.
BEST_POINT_MARKER = "Best point: "


def read_vec_from_file(fname):
    """Read a weight vector from a file.

    The file looks like: "w1 w2 ... wn" (whitespace separated).
    Returns a list of floats.
    """
    with open(fname, "r") as vecfile:
        return [float(el) for el in vecfile.read().split()]


def vec2str(vector):
    """Render a vector such as [0., 1.1, 2.2] as "0.0 1.1 2.2"."""
    return " ".join([str(el) for el in vector])


def get_biggest_change(vector1, vector2):
    """Return the largest element-wise absolute difference of two vectors.

    Iterates over vector1; vector2 must be at least as long (IndexError
    otherwise).  Returns 0 for an empty vector1.
    """
    biggest = 0
    for idx_, element in enumerate(vector1):
        diff = abs(element - vector2[idx_])
        if diff > biggest:
            biggest = diff
    return biggest


def get_avg_vec(vecs, vec_len):
    """Return the element-wise arithmetic mean of the vectors in 'vecs'.

    Only the first 'vec_len' components of every vector are averaged.
    """
    vecs = list(vecs)  # accept any iterable, e.g. dict.values()
    num_vecs = float(len(vecs))
    return [sum([vector[idx_] for vector in vecs]) / num_vecs
            for idx_ in range(vec_len)]


def get_best_points_line_idx(lines):
    """Return the index of the first "Best point: " line of a mert.log.

    If there is no such line, len(lines) is returned.
    """
    for idx_, line in enumerate(lines):
        if line.startswith(BEST_POINT_MARKER):
            return idx_
    return len(lines)


def get_weights_from_log(fname):
    """Return the weights of the "Best point: w1 ... wn => score" line.

    Raises ValueError if the log contains no such line.
    """
    with open(fname, "r") as logfile:
        lines = logfile.readlines()
    idx_ = get_best_points_line_idx(lines)
    if idx_ >= len(lines):
        raise ValueError("no '" + BEST_POINT_MARKER.strip() +
                         "' line found in " + fname)
    weights = lines[idx_].split(BEST_POINT_MARKER)[1].split("=>")[0]
    return [float(el) for el in weights.split()]


def write_new_log_file(fname, vec_):
    """Rewrite the mert.log 'fname' so that its best point is 'vec_'.

    The "Best point: " line is replaced (the score is set to the dummy
    value 99); all other lines are kept.  If the log has no such line,
    the new line is appended at the end.
    """
    with open(fname, "r") as logfile:
        lines = logfile.readlines()
    idx_ = get_best_points_line_idx(lines)
    log = "".join(lines[0:idx_])
    log += BEST_POINT_MARKER + vec2str(vec_) + " => 99\n"
    log += "".join(lines[idx_+1:])
    with open(fname, "w") as logfile:
        logfile.write(log)


def clip_towards_average(vec, prev_avg, step):
    """Move every weight of 'vec' by at most 'step' towards 'prev_avg'.

    A weight never crosses the corresponding average component, and
    weights that are exactly 0 are left at 0.0.
    """
    nxt = []
    for idx_, weight in enumerate(vec):
        if weight == 0:
            # zero weights stay untouched (prev_avg is not consulted)
            nxt.append(0.0)
        elif weight > prev_avg[idx_]:
            nxt.append(max(prev_avg[idx_], weight - step))
        elif weight < prev_avg[idx_]:
            nxt.append(min(prev_avg[idx_], weight + step))
        else:
            nxt.append(weight)
    return nxt


def main(argv):
    """Run one global regularization step over all tasks.

    argv[1]  DIR_PREFIX     directory prefix (task dirs are PREFIX_task)
    argv[2]  WORKDIR        work directory (holds runN.avector.txt)
    argv[3]  TASKS          task list "a,b,c..."
    argv[4]  CUR_IT         current (global) iteration
    argv[5]  NUM_WEIGHTS    expected length of every weight vector
    argv[6]  MIN_CHANGE     min change in the average vector to continue
    argv[7]  LAMBDA         lambda for regularization (clipping step)
    argv[8]  FIRST_AVERAGE  first average vector: 0 = zero vector,
                            otherwise a self provided run0.avector.txt

    Returns the process exit status (0 on success, 1 on error).
    """
    # NOTE: print() is always called with a single argument, which gives
    # the same output under Python 2.6 (print statement) and Python 3.
    if len(argv) < 9:
        prog = argv[0] if argv else "regmtl.py"
        sys.stderr.write("usage: " + prog + " DIR_PREFIX WORKDIR TASKS" +
                         " CUR_IT NUM_WEIGHTS MIN_CHANGE LAMBDA" +
                         " FIRST_AVERAGE\n")
        return 1

    dir_prefix = argv[1]
    workdir = argv[2]
    tasks = argv[3].split(",")
    cur_it = int(argv[4])
    num_weights = int(argv[5])
    min_change = float(argv[6])
    lambda_ = float(argv[7])
    first_average = bool(int(argv[8]))

    print(" CUR_IT %s" % str(cur_it))
    if cur_it == 1:
        print(" DIR_PREFIX %s" % dir_prefix)
        print(" WORKDIR %s" % workdir)
        print(" TASKS %s" % str(tasks))
        print(" NUM_WEIGHTS %s" % str(num_weights))
        print(" MIN_CHANGE %s" % str(min_change))
        print(" LAMBDA %s" % str(lambda_))
        print(" FIRST_AVERAGE %s" % str(first_average))
        first_avg_file = workdir + "/run0.avector.txt"
        if not first_average:
            print(" first average is 0 vector, creating file..")
            with open(first_avg_file, "w") as avgfile:
                avgfile.write(" ".join(["0.0"] * num_weights))
        else:
            print(" looking for pre-defined first average..")
            if not os.path.exists(first_avg_file):
                sys.stderr.write(
                    argv[0] +
                    " ERROR: You have to provide your own run0.avector.txt" +
                    " containing the first average vector!" +
                    " arg FIRST_AVERAGE=" + str(first_average) + "\n")
                return 1

    iterations = {}  # current iteration(s), per task
    cur_vecs = {}    # current weight vectors (from latest mert iteration)
    for task in tasks:
        print(" reading vector from task %s" % task)
        task_dir = dir_prefix + "_" + task
        with open(task_dir + "/finished_step.txt") as stepfile:
            iterations[task] = int(stepfile.read().strip())
        cur = get_weights_from_log(
            task_dir + "/run" + str(iterations[task]) + ".mert.log")
        print(cur)
        if len(cur) != num_weights:
            # Abort before anything is overwritten: going on would only
            # fail later with an IndexError or average mismatched vectors.
            sys.stderr.write(
                argv[0] +
                " ERROR: vector lengths differ! (task: " + task +
                ", len: " + str(len(cur)) +
                ", expected: " + str(num_weights) + ")\n")
            return 1
        cur_vecs[task] = cur

    # get previous average vector
    prev_avg_vec = read_vec_from_file(
        workdir + "/run" + str(cur_it - 1) + ".avector.txt")
    print(" Previous average vector (clip against):")
    print(prev_avg_vec)

    # new weight vecs
    print(" Calculating new weight vectors (clipping)..")
    next_vecs = {}
    for (task, vec) in cur_vecs.items():
        print(" Task %s, before (current)" % task)
        print(vec)
        next_vecs[task] = clip_towards_average(vec, prev_avg_vec, lambda_)
        print(" Task %s, after (next)" % task)
        print(next_vecs[task])

    # get current average vector
    avg_vec = get_avg_vec(next_vecs.values(), num_weights)
    print(" Current average vector (after clipping):")
    print(avg_vec)

    # convergence?
    print(" Previous average: %s" % (prev_avg_vec,))
    print(" Current average: %s" % (avg_vec,))
    biggest_change = get_biggest_change(prev_avg_vec, avg_vec)
    print(" Biggest change: %s" % (biggest_change,))
    # note: use weights from latest mert.log for eval!
    if biggest_change < min_change:
        print(" Converged!")
        open(workdir + "/CONVERGED", "w").close()

    # overwrite (and backup) mert.log
    for task in tasks:
        task_dir = dir_prefix + "_" + task
        logfilename = task_dir + "/run" + str(iterations[task]) + ".mert.log"
        shutil.copy(logfilename, logfilename + ".bak")
        write_new_log_file(logfilename, next_vecs[task])
        with open(task_dir + "/weightshist", "a") as histfile:
            histfile.write(
                str(iterations[task]) + "\n" +
                "cur " + str(cur_vecs[task]) + "\n" +
                "prev avg " + str(prev_avg_vec) + "\n" +
                "next " + str(next_vecs[task]) + "\n" +
                "cur avg " + str(avg_vec) + "\n---\n")

    # write out current avg vector
    with open(workdir + "/run" + str(cur_it) + ".avector.txt", "w") as avgfile:
        avgfile.write(vec2str(avg_vec))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# (patch trailer carried over from the source dump: "-- cgit v1.2.3")