summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compound-split/cdec-de.ini3
-rw-r--r--compound-split/de/dev.nouns595
-rw-r--r--compound-split/de/weights.noun-only-1best-only20
-rw-r--r--gi/pf/base_measures.cc56
-rw-r--r--gi/pf/base_measures.h31
-rw-r--r--gi/pf/dpnaive.cc17
-rw-r--r--gi/pf/monotonic_pseg.h5
-rw-r--r--gi/pf/pf.h84
-rw-r--r--gi/pf/pfdist.cc18
-rw-r--r--gi/pf/pfnaive.cc36
-rw-r--r--training/lbfgs_test.cc7
-rw-r--r--training/mpi_flex_optimize.cc145
-rw-r--r--training/optimize.cc3
-rwxr-xr-xvest/parallelize.pl24
-rwxr-xr-xword-aligner/aligner.pl8
-rwxr-xr-xword-aligner/ortho-norm/mg.pl13
-rwxr-xr-xword-aligner/ortho-norm/rw.pl13
-rwxr-xr-xword-aligner/stemmers/mg.pl39
-rwxr-xr-xword-aligner/stemmers/rw.pl38
19 files changed, 1049 insertions, 106 deletions
diff --git a/compound-split/cdec-de.ini b/compound-split/cdec-de.ini
index 2bfe63d2..85424e91 100644
--- a/compound-split/cdec-de.ini
+++ b/compound-split/cdec-de.ini
@@ -1,5 +1,6 @@
formalism=csplit
intersection_strategy=full
weights=de/weights.trained
+#weights=de/weights.noun-only-1best-only
feature_function=CSplit_BasicFeatures de/large_dict.de.gz de/badlist.de.gz
-feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.klm
+feature_function=CSplit_ReverseCharLM de/charlm.rev.5gm.de.lm.gz
diff --git a/compound-split/de/dev.nouns b/compound-split/de/dev.nouns
new file mode 100644
index 00000000..04f597f2
--- /dev/null
+++ b/compound-split/de/dev.nouns
@@ -0,0 +1,595 @@
+untersuchungsausschuss ||| # untersuchung ausschuss
+hochzeiten ||| # hochzeiten
+damalige ||| # damalige
+siegfried ||| # siegfried
+jaschinski ||| # jaschinski
+verdächtigenkreis ||| # verdächtige kreis
+staatsanwaltschaft ||| # staat anwaltschaft
+landesbank ||| # land bank
+feldherrn ||| # feld herrn
+handelsblatt ||| # handel blatt
+alexander ||| # alexander
+großen ||| # großen
+wachstumsplänen ||| # wachstum plänen
+konsolidierung ||| # konsolidierung
+branche ||| # branche
+institut ||| # institut
+augenhöhe ||| ((('#',0,1),),(('auge',0,1),('augen',0,1),),(('höhe',0,1),),)
+wiederherstellung ||| # wiederherstellung
+grundstück ||| ((('#',0,1),),(('grund',0,1),('grundstück',0,2),),(('stück',0,1),),)
+ruhestand ||| # ruhestand
+widerstand ||| # widerstand
+versammlungsgesetz ||| ((('#',0,1),),(('versammlung',0,1),('versammlungs',0,1),),(('gesetz',0,1),),)
+verstöße ||| # verstöße
+sachbeschädigung ||| # sach beschädigung
+vereinigungen ||| # vereinigungen
+glashaus ||| # glas haus
+auenlandschaft ||| # auen landschaft
+fundament ||| # fundament
+vierhock ||| # vierhock
+idylle ||| # idylle
+johannes ||| # johannes
+sportjacke ||| # sport jacke
+mittelmeerinseln ||| # mittelmeer inseln
+stiefelförmige ||| # stiefel förmige
+wiederaufnahme ||| # wiederaufnahme
+tonbandaufnahme ||| # tonband aufnahme
+korrekturzucker ||| # korrektur zucker
+meeresspiegel ||| # meer spiegel
+gesamtausgabe ||| # gesamt ausgabe
+losfertigung ||| # los fertigung
+reichstag ||| # reichstag
+monntag ||| # monntag
+mittwoch ||| # mittwoch
+freitag ||| # freitag
+sonntag ||| # sonntag
+nachbarschaftsbeziehungen ||| # nachbarschaft beziehungen
+saftbar ||| # saft bar
+produktionsprozesses ||| # produktion prozesses
+gesamtzusammenhang ||| # gesamt zusammenhang
+volkswirtschaftslehre ||| # volkswirtschaft lehre
+losverfahren ||| # los verfahren
+schifffahrt ||| # schiff fahrt
+dienstag ||| # dienstag
+donnerstag ||| # donnerstag
+samstag ||| # samstag
+sonnabend ||| # sonnabend
+mitglied ||| # mitglied
+abblendlicht ||| # abblend licht
+abbrucharbeiten ||| # abbruch arbeiten
+abergläubischen ||| # abergläubischen
+abfallbewirtschaftungshierarchie ||| # abfall bewirtschaftung hierarchie
+abfallbewirtschaftungsplänen ||| # abfall bewirtschaftung plänen
+abfertigungsgebühren ||| ((('#',0,1),),(('abfertigung',0,1),('abfertigungs',0,1),),(('gebühren',0,1),),)
+beitragsunabhängig ||| ((('#',0,1),),(('beitrag',0,1),('beitrags',0,1),),(('unabhängig',0,1),),)
+einreichungsfrist ||| ((('#',0,1),),(('einreichung',0,1),),(('frist',0,1),),)
+europaabgeordnete ||| # europa abgeordnete
+früherkennungskampagnen ||| ((('#',0,1),),(('früh',0,1),),(('erkennung',0,1),('erkennungs',0,1),),(('kampagnen',0,1),),)
+hilfsagenturen ||| ((('#',0,1),),(('hilf',0,1),('hilfs',0,1),),(('agenturen',0,1),),)
+hilfsinstitutionen ||| ((('#',0,1),),(('hilf',0,1),('hilfs',0,1),),(('institutionen',0,1),),)
+kaisergranat ||| # kaiser granat
+kalifornien ||| # kalifornien
+kampfflugzeug ||| # kampf flugzeug
+luftangriffen ||| # luft angriffen
+luftsicherheit ||| # luft sicherheit
+mithäftlingen ||| # mithäftlingen
+wartungsarbeiten ||| # wartung arbeiten
+wasseraufbereitungsanlagen ||| # wasser aufbereitung anlagen
+wasserdienstleistungen ||| # wasser dienstleistungen
+weihnachtsgeschenk ||| # weihnacht geschenk
+weltarbeitsorganisation ||| # welt arbeit organisation
+zeitdruck ||| # zeit druck
+bundesagentur ||| # bund agentur
+fahrstuhlschächten ||| # fahrstuhl schächten
+elektroindustrie ||| # elektro industrie
+sonderpreis ||| # sonder preis
+modellrechnung ||| # modell rechnung
+treibhauseffekt ||| # treibhaus effekt
+europatournee ||| # europa tournee
+wirtschaftskrise ||| # wirtschaft krise
+mittwochabend ||| # mittwoch abend
+abschiedsgala ||| # abschied gala
+betriebstag ||| # betrieb tag
+flughafen ||| # flughafen
+truthahn ||| # truthahn
+gehirnregion ||| # gehirn region
+feststellung ||| # feststellung
+thanksgiving ||| # thanks giving
+börsenhändler ||| # börse händler
+risikoforschung ||| # risiko forschung
+finanzkrise ||| # finanz krise
+erkenntnis ||| # erkenntnis
+vergangenheit ||| # vergangenheit
+festtagsbratenesser ||| # festtag braten esser
+ungemütlichkeit ||| # ungemütlichkeit
+privatanlegern ||| # privat anlegern
+finanzmakler ||| # finanz makler
+immobilienfonds ||| # immobilien fonds
+wertzuwachs ||| # wert zuwachs
+marmelade ||| # marmelade
+roboterzofe ||| # roboter zofe
+heinrich ||| # heinrich
+zeitdehnung ||| # zeit dehnung
+zeitlosigkeit ||| # zeitlosigkeit
+auseinandersetzung ||| # auseinandersetzung
+immobilienfondskrise ||| # immobilien fond krise
+stackelberg ||| # stackelberg
+bodenseedampfschifffahrtsgesellschaftskapitänsmütze ||| # bodensee dampf schiff fahrt gesellschaft kapitän mütze
+hamburg ||| # hamburg
+justiz ||| # justiz
+bischof ||| # bischof
+richard ||| # richard
+williamson ||| # williamson
+standes ||| # standes
+soutane ||| # soutane
+millionen ||| # millionen
+gaskammern ||| # gas kammern
+holocaustleugner ||| # holocaust leugner
+großansicht ||| # groß ansicht
+gegenfrage ||| # gegen frage
+journalisten ||| # journalisten
+fernsehsender ||| # fernseh sender
+revisionisten ||| # revisionisten
+dreihunderttausend ||| # drei hundert tausend
+konzentrationslagern ||| # konzentration lagern
+schornsteinhöhen ||| # schornstein höhen
+touristen ||| # touristen
+auschwitz ||| # auschwitz
+birkenau ||| # birkenau
+antisemitismus ||| # antisemitismus
+interviewer ||| # interviewer
+bischof ||| # bischof
+williamson ||| # williamson
+wahrheit ||| # wahrheit
+schlecht ||| # schlecht
+interessiert ||| # interessiert
+entrückte ||| # entrückte
+deutscher ||| # deutscher
+blamiert ||| # blamiert
+katholische ||| # katholische
+ausgabe ||| # ausgabe
+skandal ||| # skandal
+interview ||| # interview
+berichtet ||| # berichtet
+bayerischen ||| # bayerischen
+zaitzhofen ||| # zaitzhofen
+fernsehen ||| # fernsehen
+regensburger ||| # regensburger
+ruckdäschel ||| # ruck däschel
+artikel ||| # artikel
+leitete ||| # leitete
+angaben ||| # angaben
+ermittlungsverfahren ||| # ermittlung verfahren
+verdacht ||| # verdacht
+volksverhetzung ||| # volk verhetzung
+rechtsanwalt ||| # recht anwalt
+einlassung ||| # einlassung
+behörde ||| # behörde
+gespräch ||| # gespräch
+veröffentlichung ||| # veröffentlichung
+deutschland ||| # deutschland
+exkommunikation ||| # exkommunikation
+hintergrund ||| # hintergrund
+strafgesetzbuchs ||| # straf gesetz buchs
+leugnung ||| # leugnung
+strafverfolgung ||| # straf verfolgung
+hintertür ||| # hintertür
+entziehen ||| # entziehen
+inzwischen ||| # inzwischen
+einsicht ||| # einsicht
+ermittlungsakten ||| # ermittlung akten
+ruckdäschel ||| # ruckdäschel
+überlegen ||| # überlegen
+schwedischen ||| # schwedischen
+journalisten ||| # journalisten
+hintergründen ||| # hintergründen
+absprachen ||| # absprachen
+vorfeld ||| # vorfeld
+oberstaatsanwalt ||| # ober staat anwalt
+mendelssohn ||| # mendelssohn
+bartholdy ||| # bartholdy
+familie ||| # familie
+philosophen ||| # philosophen
+abraham ||| # abraham
+bankkaufmannslehre ||| # bank kaufmann lehre
+kompagnon ||| # kompagnon
+mutter ||| # mutter
+salomon ||| # salomon
+fabrikantenfamilie ||| # fabrikanten familie
+schwester ||| # schwester
+hensel ||| # hensel
+geschwister ||| # geschwister
+rebecca ||| # rebecca
+mathematiker ||| # mathematiker
+dirichlet ||| # dirichlet
+kinder ||| # kinder
+pfarrer ||| # pfarrer
+gemeinde ||| # gemeinde
+berliner ||| # berliner
+jerusalems ||| # jerusalems
+haustaufe ||| # haus taufe
+gelegenheit ||| # gelegenheit
+taufnamen ||| # tauf namen
+ludwig ||| # ludwig
+familiennamen ||| # familie namen
+christliche ||| # christliche
+vorbesitzers ||| # vorbesitzers
+gartens ||| # gartens
+abraham ||| # abraham
+mendelssohn ||| # mendelssohn
+konvertierten ||| # konvertierten
+christentum ||| # christentum
+kindheit ||| # kindheit
+besetzung ||| # besetzung
+familie ||| # familie
+großmutter ||| # groß mutter
+großvater ||| # groß vater
+musikunterricht ||| # musik unterricht
+schülerin ||| # schülerin
+philipp ||| # philipp
+kirnbergers ||| # kirnbergers
+großtante ||| # groß tante
+tochter ||| # tochter
+finanzmanns ||| # finanz manns
+daniel ||| # daniel
+preußischer ||| # preußischer
+friedrich ||| # friedrich
+wilhelm ||| # wilhelm
+naturalisationspatent ||| # naturalisation patent
+tradition ||| # tradition
+gönnerin ||| # gönnerin
+lehrerin ||| # lehrerin
+aufenthalts ||| # aufenthalts
+rückkehr ||| # rückkehr
+unterricht ||| # unterricht
+komposition ||| # komposition
+ausbildung ||| # ausbildung
+schriftstellers ||| # schriftstellers
+neunjähriger ||| # neun jähriger
+klavierpart ||| # klavier part
+klaviertrio ||| # klavier trio
+altsänger ||| # alt sänger
+akademie ||| # akademie
+kirchenmusik ||| # kirche musik
+schnelligkeit ||| # schnelligkeit
+klaviersonaten ||| # klavier sonaten
+klaviertrio ||| # klavier trio
+orgelstücke ||| # orgel stücke
+streichersinfonien ||| # streicher sinfonien
+motetten ||| # motetten
+singspiele ||| # sing spiele
+soldatenliebschaft ||| # soldaten liebschaft
+singspiels ||| # sing spiels
+komödianten ||| # komödianten
+freundschaft ||| # freundschaft
+barack ||| # barack
+getappt ||| # getappt
+anstehenden ||| # anstehenden
+herkulesaufgaben ||| # herkules aufgaben
+westen ||| # westen
+regierungsgrundsätzen ||| # regierung grundsätzen
+genügen ||| # genügen
+problem ||| # problem
+krankheit ||| # krankheit
+washington ||| # washington
+symptome ||| # symptome
+verquickung ||| # verquickung
+öffentlichen ||| # öffentlichen
+interessen ||| # interessen
+lobbyistenumtriebe ||| # lobbyisten umtriebe
+steuerzahlungsphobie ||| # steuer zahlung phobie
+krankheit ||| # krankheit
+probleme ||| # probleme
+problembewältiger ||| # problem bewältiger
+verzichten ||| # verzichten
+ausnahmen ||| # ausnahmen
+nummer ||| # nummer
+verteidigungsministerium ||| # verteidigung ministerium
+fachmann ||| # fachmann
+rüstungslobbyist ||| # rüstung lobbyist
+geithner ||| # geithner
+finanzminister ||| # finanz minister
+fiskus ||| # fiskus
+daschle ||| # daschle
+obamas ||| # obamas
+wunschkandidat ||| # wunsch kandidat
+gesundheitsministerium ||| # gesundheit ministerium
+begleichung ||| # begleichung
+feststand ||| # fest stand
+regierung ||| # regierung
+proteststurm ||| # protest sturm
+handtuch ||| # hand tuch
+konzentrationslager ||| # konzentration lager
+entwicklungspolitik ||| # entwicklung politik
+zeitungsbericht ||| # zeitung bericht
+managergehälter ||| # manager gehälter
+unternehmen ||| # unternehmen
+staatshilfen ||| # staat hilfen
+präsidenten ||| # präsidenten
+gehaltsobergrenze ||| # gehalt obergrenze
+sonnensystems ||| # sonnen systems
+planet ||| # planet
+exoplanet ||| # exo planet
+durchbruch ||| # durch bruch
+ulrich ||| # ulrich
+schnabel ||| # schnabel
+kulturgeschichte ||| # kultur geschichte
+milchwirtschaft ||| # milch wirtschaft
+menschen ||| # menschen
+geschäftsführung ||| # geschäft führung
+amtsgericht ||| # amts gericht
+göppingen ||| # göppingen
+antrag ||| # antrag
+eröffnung ||| # eröffnung
+insolvenzverfahrens ||| # insolvenz verfahrens
+dieter ||| # dieter
+informationsangebote ||| # information angebote
+vereinfacht ||| # vereinfacht
+vokabelglossar ||| # vokabel glossar
+gewaltige ||| # gewaltige
+haushohe ||| # haushohe
+segler ||| # segler
+oktober ||| # oktober
+gibraltar ||| # gibraltar
+bronzekanonen ||| # bronze kanonen
+tonnen ||| # tonnen
+goldmünzen ||| # gold münzen
+kaufleute ||| # kauf leute
+schätzungen ||| # schätzungen
+vergangenen ||| # vergangenen
+millionen ||| # millionen
+schiffe ||| # schiffe
+meeresgrund ||| # meer grund
+wasserfahrzeuge ||| # wasser fahrzeuge
+kriegsschiffe ||| # krieg schiffe
+passagierdampfer ||| # passagier dampfer
+handelsflotten ||| # handel flotten
+piratenboote ||| # pirate boote
+tanker ||| # tanker
+großbritannien ||| # groß britannien
+ältesten ||| # ältesten
+seefahrernationen ||| # see fahrer nationen
+auffassung ||| # auffassung
+marinehistoriker ||| # marine historiker
+schiffe ||| # schiffe
+ortung ||| # ortung
+victory ||| # victory
+bergungsunternehmen ||| # bergung unternehmen
+meeresfunde ||| # meer funde
+geschichte ||| # geschichte
+entdeckern ||| # entdeckern
+kriegsboot ||| # krieg boot
+jahrhundert ||| # jahrhundert
+tausend ||| # tausend
+schiffen ||| # schiffen
+entdeckung ||| # entdeckung
+tauchjagd ||| # tauch jagd
+überreste ||| # überreste
+epochen ||| # epochen
+länder ||| # länder
+streiten ||| # streiten
+schiffsfriedhöfe ||| # schiff friedhöfe
+tauchjagd ||| # tauch jagd
+schatzjäger ||| # schatz jäger
+moderne ||| # moderne
+technik ||| # technik
+jahrhundertealte ||| # jahrhunderte alte
+wracks ||| # wracks
+geheimnisse ||| # geheimnisse
+besitzrechte ||| # besitz rechte
+wahrscheinlichkeit ||| # wahrscheinlichkeit
+minimaltemperatur ||| # minimal temperatur
+winterhalbjahr ||| # winter halb jahr
+celsius ||| # celsius
+sommerhalbjahr ||| # sommer halb jahr
+maximaltemperatur ||| # maximal temperatur
+kreuzungsversuche ||| # kreuzung versuche
+erbsenpflanzen ||| # erbse pflanzen
+klostergarten ||| # kloster garten
+befruchtungen ||| # befruchtungen
+zehntausende ||| # zehn tausende
+pflanzenhybriden ||| # pflanze hybriden
+fleißaufgabe ||| ((('#',0,1),),(('fleiß',0,1),('fleißaufgabe',0,2),),(('aufgabe',0,1),),)
+hartnäckigkeit ||| # hartnäckigkeit
+naturforschers ||| # natur forschers
+innenminister ||| # innen minister
+falschinformationen ||| # falsch informationen
+analysen ||| # analysen
+hintergründe ||| # hintergründe
+menschen ||| # menschen
+containerschiff ||| # container schiff
+unfall ||| # unfall
+umweltdesaster ||| # umwelt desaster
+australien ||| # australien
+containerschiff ||| # container schiff
+unfall ||| # unfall
+australien ||| # australien
+schweröl ||| # schweröl
+freitag ||| # freitag
+flüssigkeit ||| # flüssigkeit
+strände ||| # strände
+kilometern ||| # kilometern
+kapitän ||| # kapitän
+pacific ||| # pacific
+adventurer ||| # adventurer
+hongkong ||| # hongkong
+verlust ||| # verlust
+tonnen ||| # tonnen
+regierungschef ||| ((('#',0,1),),(('regierung',0,1),('regierungs',0,1),),(('chef',0,1),),)
+queensland ||| # queensland
+samstag ||| # samstag
+kapitän ||| # kapitän
+vortag ||| # vortag
+vertuschung ||| # vertuschung
+vorgeworfen ||| # vorgeworfen
+mittwoch ||| # mittwoch
+containerschiff ||| # container schiff
+naturreservate ||| # natur reservate
+touristenstrände ||| # touristen strände
+ostküste ||| # ost küste
+nördlich ||| # nördlich
+brisbane ||| # brisbane
+bestechungsversuch ||| ((('#',0,1),),(('bestechungs',0,1),('bestechung',0,1),),(('versuch',0,1),),)
+campingkocher ||| # camping kocher
+integration ||| # integration
+funktechnik ||| # funk technik
+mobiltelefone ||| # mobil telefone
+aktenkoffers ||| ((('#',0,1),),(('akten',0,1),('aktenkoffers',0,2),),(('koffers',0,1),),)
+vorstufe ||| # vorstufe
+kohlendioxids ||| ((('#',0,1),),(('kohle',0,1),('kohlen',0,1),),(('dioxids',0,1),),)
+computermodelle ||| # computer modelle
+schützen ||| # schützen
+climategate ||| ((('#',0,1),),(('climate',0,1),('climategate',0,2),),(('gate',0,1),),)
+skandal ||| # skandal
+forschungsinstitut ||| ((('#',0,1),),(('forschung',0,1),('forschungs',0,1),),(('institut',0,1),),)
+climate ||| # climate
+research ||| # research
+folgenschwerste ||| ((('#',0,1),),(('folgenschwerste',0,2),('folgen',0,1),('folge',0,1),),(('schwerste',0,1),),)
+wissenschaftsdebatte ||| ((('#',0,1),),(('wissenschaft',0,1),('wissenschafts',0,1),),(('debatte',0,1),),)
+computerhacker ||| # computer hacker
+instituts ||| # instituts
+forscher ||| # forscher
+prognose ||| # prognose
+erwärmung ||| # erwärmung
+wissenschaftler ||| # wissenschaftler
+datenlieferanten ||| # daten lieferanten
+weltklimarats ||| # welt klima rats
+fraktionen ||| # fraktionen
+debatte ||| # debatte
+schule ||| # schule
+repräsentiert ||| # repräsentiert
+mehrheit ||| # mehrheit
+wissenschaftler ||| # wissenschaftler
+stelle ||| # stelle
+forscher ||| # forscher
+klimamodelle ||| # klimamodelle
+computersimulationen ||| # computer simulationen
+hauptverantwortlichen ||| # haupt verantwortlichen
+erwärmung ||| # erwärmung
+menschen ||| # menschen
+spatenstich ||| # spaten stich
+kirchenneubau ||| # kirche neu bau
+ostdeutschlands ||| # ost deutschlands
+rumänien ||| # rumänien
+berlin ||| # berlin
+regierung ||| # regierung
+ankündigung ||| # ankündigung
+deutschlands ||| # deutschlands
+frankreichs ||| # frankreichs
+rumäniens ||| # rumäniens
+bulgariens ||| # bulgariens
+schengen ||| # schengen
+bukarest ||| # bukarest
+informationen ||| # informationen
+verletzung ||| # verletzung
+vertrags ||| # vertrags
+lissabon ||| # lissabon
+rumänischer ||| # rumänischer
+zollbeamter ||| # zoll beamter
+grenze ||| # grenze
+rumänien ||| # rumänien
+republik ||| # republik
+moldau ||| # moldau
+dezember ||| # dezember
+regierung ||| # regierung
+bukarest ||| # bukarest
+kontrollen ||| # kontrollen
+grenzen ||| # grenzen
+rumänien ||| # rumänien
+bulgarien ||| # bulgarien
+länder ||| # länder
+fortschritte ||| # fortschritte
+korruption ||| # korruption
+kriminalität ||| # kriminalität
+informationen ||| # informationen
+zeitung ||| # zeitung
+vertragsverletzung ||| # vertrag verletzung
+deutschland ||| # deutschland
+frankreich ||| # frankreich
+haltung ||| # haltung
+rumäniens ||| # rumäniens
+außenministerium ||| ((('#',0,1),),(('außen',0,1),),(('ministerium',0,1),),)
+präzedenzfall ||| ((('#',0,1),),(('präzedenzfall',0,2),('präzedenz',0,1),),(('fall',0,1),),)
+staatspräsident ||| ((('#',0,1),),(('staatspräsident',0,2),('staats',0,1),('staat',0,1),),(('präsident',0,1),),)
+georgi ||| # georgi
+parwanow ||| # parwanow
+verständnis ||| # verständnis
+bulgarien ||| # bulgarien
+verstehen ||| # verstehen
+erklärung ||| # erklärung
+verzögerung ||| # verzögerung
+mittwoch ||| # mittwoch
+haltung ||| # haltung
+hintergrund ||| # hintergrund
+streits ||| # streits
+regierung ||| # regierung
+ministerpräsident ||| ((('#',0,1),),(('minister',0,1),),(('präsident',0,1),),)
+grenzkontrollen ||| ((('#',0,1),),(('grenz',0,1),),(('kontrollen',0,1),),)
+weltkrieg ||| ((('#',0,1),),(('welt',0,1),('weltkrieg',0,2),),(('krieg',0,1),),)
+abwehr ||| # abwehr
+admirals ||| # admirals
+canaris ||| # canaris
+sprengsätze ||| # spreng sätze
+apfelsinenkisten ||| # apfelsine kisten
+hafenarbeiter ||| # hafen arbeiter
+schiffe ||| # schiffe
+zeiten ||| # zeiten
+militärdiktatur ||| ((('#',0,1),),(('militär',0,1),),(('diktatur',0,1),),)
+widerstandsgruppe ||| ((('#',0,1),),(('widerstand',0,1),('widerstands',0,1),),(('gruppe',0,1),),)
+pfirsiche ||| # pfirsiche
+aprikosen ||| # aprikosen
+vergiftet ||| # vergiftet
+kuklina ||| # kuklina
+trägerin ||| # trägerin
+alternativen ||| # alternativen
+nobelpreis ||| # nobel preis
+kämpft ||| # kämpft
+rechte ||| # rechte
+soldaten ||| # soldaten
+russlands ||| # russlands
+online ||| # online
+sprach ||| # sprach
+menschenrechte ||| # menschen rechte
+heimat ||| # heimat
+kaufrausch ||| ((('#',0,1),),(('kauf',0,1),),(('rausch',0,1),),)
+kommerzialisierung ||| # kommerzialisierung
+weihnachten ||| # weihnachten
+wikileaks ||| # wikileaks
+verfassungsgericht ||| ((('#',0,1),),(('verfassung',0,1),('verfassungs',0,1),),(('gericht',0,1),),)
+berlusconis ||| # berlusconis
+immunität ||| # immunität
+zinspolitik ||| # zins politik
+inflation ||| # inflation
+eurozone ||| ((('#',0,1),),(('euro',0,1),('eurozone',0,2),),(('zone',0,1),),)
+kontrolle ||| # kontrolle
+futtermittelindustrie ||| # futter mittel industrie
+deutschland ||| # deutschland
+regierung ||| # regierung
+vertrauen ||| # vertrauen
+westerwelle ||| # westerwelle
+abzugsdatum ||| ((('#',0,1),),(('abzugs',0,1),('abzug',0,1),),(('datum',0,1),),)
+ghettoblaster ||| # ghetto blaster
+alltag ||| # alltag
+ikonen ||| # ikonen
+jugendkultur ||| # jugend kultur
+hochrechnungen ||| # hochrechnungen
+mckinsey ||| # mckinsey
+company ||| # company
+kaufkraft ||| # kauf kraft
+aufstrebenden ||| # aufstrebenden
+mittelschicht ||| # mittel schicht
+jahrzehnts ||| # jahrzehnts
+billionen ||| # billionen
+dollar ||| # dollar
+erbrauchsniveau ||| ((('#',0,1),),(('erbrauch',0,1),('erbrauchs',0,1),),(('niveau',0,1),),)
+staaten ||| # staaten
+schwellenländer ||| ((('#',0,1),),(('schwellen',0,1),('schwelle',0,1),),(('länder',0,1),),)
+brasilien ||| # brasilien
+russland ||| # russland
+indien ||| # indien
+frühstück ||| # frühstück
+fortschritt ||| # fortschritt
+frühstückstisch ||| # frühstück tisch
+familie ||| # familie
+jahren ||| # jahren
+tageszeitung ||| # tages zeitung
diff --git a/compound-split/de/weights.noun-only-1best-only b/compound-split/de/weights.noun-only-1best-only
new file mode 100644
index 00000000..d5f09b82
--- /dev/null
+++ b/compound-split/de/weights.noun-only-1best-only
@@ -0,0 +1,20 @@
+# Objective = 138.848 (eval count=264)
+WordCount 11.333477067970925
+LettersSq -0.029215039089113905
+LettersSqrt 2.9632214600626146
+InDict -13.179505941917153
+InDictSubWord 1.3841593332953199
+Short 0.69941278454918432
+Long -0.25571511002085112
+OOV 24.512983009660331
+OOVSubWord -1.5433992715303351
+ShortRange -1.1723773936213888
+HighFreq -3.7705003072208156
+MedFreq -0.051578574665604303
+Freq 0.21221436730925425
+FreqLen1 -1.5685463970117162
+FreqLen2 -0.78986836749506351
+Bad -29.51141124634686
+RevCharLM 0.62803557957086842
+FugS 0.1285546128533254
+FugN 0.47201502660323558
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index f8ddfd32..8adb37d7 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -89,6 +89,62 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,
return p;
}
+// Bidirectional joint base measure for a phrase pair.  Scores the suffixes
+// vsrc[start_src..] and vtrg[start_trg..] under two generative stories and
+// returns their equal-weight average:
+//   forward: flen ~ Pois(1), elen | flen ~ Pois(flen + 0.01), every source
+//            word ~ Uniform, every target word ~ Model 1 / uniform mixture
+//            given a uniformly drawn alignment;
+//   reverse: the same story with source and target exchanged (invmodel1).
+// Alignment index -1 maps to word id 0 (presumably the NULL word -- confirm
+// against Model1).  Logs both phrases and aborts if either story scores zero.
+prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,
+                                 const vector<WordID>& vtrg,
+                                 int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+
+  // ---- forward story: generate F, then E given F ----
+  prob_t align_over_src;  // a_i ~ Uniform over flen + 1 slots
+  align_over_src.logeq(-log(flen + 1));
+  prob_t fwd;
+  fwd.logeq(log_poisson(flen, 1.0));  // flen ~ Pois(1)
+  prob_t fwd_trg_len;                 // elen | flen ~ Pois(flen + 0.01)
+  fwd_trg_len.logeq(log_poisson(elen, flen + 0.01));
+  fwd *= fwd_trg_len;
+  fwd *= kUNIFORM_SOURCE.pow(flen);   // each f in F ~ Uniform
+  for (int e_pos = 0; e_pos < elen; ++e_pos) {  // for each position in E
+    const WordID trg = vtrg[start_trg + e_pos];
+    prob_t word_prob = prob_t::Zero();
+    for (int a = -1; a < flen; ++a) {  // sum over alignments, -1 first
+      WordID src = 0;
+      if (a >= 0) src = vsrc[start_src + a];
+      word_prob += kM1MIXTURE * model1(src, trg);
+      word_prob += kUNIFORM_MIXTURE * kUNIFORM_TARGET;
+    }
+    word_prob *= align_over_src;  // draw a_i ~ uniform
+    fwd *= word_prob;             // draw e_i ~ Model1(f_a_i) / uniform
+  }
+  if (fwd.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+
+  // ---- reverse story: generate E, then F given E ----
+  prob_t align_over_trg;  // a_i ~ Uniform over elen + 1 slots
+  align_over_trg.logeq(-log(elen + 1));
+  prob_t rev;
+  rev.logeq(log_poisson(elen, 1.0));  // elen ~ Pois(1)
+  prob_t rev_src_len;                 // flen | elen ~ Pois(elen + 0.01)
+  rev_src_len.logeq(log_poisson(flen, elen + 0.01));
+  rev *= rev_src_len;
+  rev *= kUNIFORM_TARGET.pow(elen);   // each e in E ~ Uniform
+  for (int f_pos = 0; f_pos < flen; ++f_pos) {  // for each position in F
+    const WordID src = vsrc[start_src + f_pos];
+    prob_t word_prob = prob_t::Zero();
+    for (int a = -1; a < elen; ++a) {  // sum over alignments, -1 first
+      WordID trg = 0;
+      if (a >= 0) trg = vtrg[start_trg + a];
+      word_prob += kM1MIXTURE * invmodel1(trg, src);
+      word_prob += kUNIFORM_MIXTURE * kUNIFORM_SOURCE;
+    }
+    word_prob *= align_over_trg;  // draw a_i ~ uniform
+    rev *= word_prob;             // draw f_i ~ InvModel1(e_a_i) / uniform
+  }
+  if (rev.is_0()) {
+    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl;
+    abort();
+  }
+
+  // equal-weight mixture of the two directions
+  static const prob_t kHALF(0.5);
+  return (fwd + rev) * kHALF;
+}
+
JumpBase::JumpBase() : p(200) {
for (unsigned src_len = 1; src_len < 200; ++src_len) {
map<int, prob_t>& cpd = p[src_len];
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index df17aa62..7ce7e2e6 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -97,6 +97,37 @@ struct PhraseJointBase {
const prob_t kUNIFORM_TARGET;
};
+// Joint base measure over phrase pairs that averages a source->target and a
+// target->source Model 1 story (p0 is defined in base_measures.cc).  Only
+// references to the two Model 1 tables are stored, so both tables must
+// outlive this object.
+struct PhraseJointBase_BiDir {
+  // m1, im1       forward and inverse Model 1 tables
+  // m1mixture     weight in [0,1] of the Model 1 component; the uniform
+  //               component receives 1 - m1mixture
+  // vocab_e_size  target vocabulary size (uniform target word prob = 1/size)
+  // vocab_f_size  source vocabulary size (uniform source word prob = 1/size)
+  explicit PhraseJointBase_BiDir(const Model1& m1,
+                                 const Model1& im1,
+                                 const double m1mixture,
+                                 const unsigned vocab_e_size,
+                                 const unsigned vocab_f_size) :
+      model1(m1),
+      invmodel1(im1),
+      kM1MIXTURE(m1mixture),
+      kUNIFORM_MIXTURE(1.0 - m1mixture),
+      kUNIFORM_SOURCE(1.0 / vocab_f_size),
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(m1mixture >= 0.0 && m1mixture <= 1.0);
+    // both sizes are used as divisors above, so both must be nonzero
+    assert(vocab_e_size > 0);
+    assert(vocab_f_size > 0);
+  }
+
+  // return the joint p0 of the pair (rule.f_, rule.e_), scoring both sides
+  // from their first word
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  // joint p0 of vsrc[start_src..] and vtrg[start_trg..]
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const Model1& model1;           // queried as model1(src, trg)
+  const Model1& invmodel1;        // queried as invmodel1(trg, src)
+  const prob_t kM1MIXTURE;        // Model 1 mixture component
+  const prob_t kUNIFORM_MIXTURE;  // uniform mixture component
+  const prob_t kUNIFORM_SOURCE;   // 1 / vocab_f_size
+  const prob_t kUNIFORM_TARGET;   // 1 / vocab_e_size
+};
+
// base distribution for jump size multinomials
// basically p(0) = 0 and then, p(1) is max, and then
// you drop as you move to the max jump distance
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index c926487b..db1c43c7 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -31,6 +31,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases")
("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases")
("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
+ ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in base distribution)")
("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
@@ -58,7 +59,7 @@ shared_ptr<MT19937> prng;
template <typename Base>
struct ModelAndData {
- explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+ explicit ModelAndData(MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
model(m),
rng(&*prng),
p0(b),
@@ -139,7 +140,7 @@ struct ModelAndData {
void Sample();
- MonotonicParallelSegementationModel& model;
+ MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& model;
MT19937* rng;
const Base& p0;
prob_t baseprob; // cached value of generating the table table labels from p0
@@ -267,6 +268,10 @@ int main(int argc, char** argv) {
cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
return 1;
}
+ if (!conf.count("inverse_model1")) {
+ cerr << argv[0] << "Please use --inverse_model1 to specify inverse model 1 parameters\n";
+ return 1;
+ }
if (conf.count("random_seed"))
prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
else
@@ -283,10 +288,12 @@ int main(int argc, char** argv) {
assert(corpusf.size() == corpuse.size());
Model1 m1(conf["model1"].as<string>());
- PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MonotonicParallelSegementationModel m(lp0);
+ Model1 invm1(conf["inverse_model1"].as<string>());
+// PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
- ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf);
+ ModelAndData<PhraseJointBase_BiDir> posterior(m, alp0, corpuse, corpusf, vocabe, vocabf);
posterior.Sample();
return 0;
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
index 7e6af3fc..301aa6d8 100644
--- a/gi/pf/monotonic_pseg.h
+++ b/gi/pf/monotonic_pseg.h
@@ -8,8 +8,9 @@
#include "trule.h"
#include "base_measures.h"
+template <typename BaseMeasure>
struct MonotonicParallelSegementationModel {
- explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) :
+ explicit MonotonicParallelSegementationModel(BaseMeasure& rcp0) :
rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {}
void DecrementRule(const TRule& rule) {
@@ -78,7 +79,7 @@ struct MonotonicParallelSegementationModel {
return prob_t(stop.prob(false, 0.5));
}
- const PhraseJointBase& rp0;
+ const BaseMeasure& rp0;
prob_t base;
CCRP_NoTable<TRule> rules;
CCRP_NoTable<bool> stop;
diff --git a/gi/pf/pf.h b/gi/pf/pf.h
new file mode 100644
index 00000000..ede7cda8
--- /dev/null
+++ b/gi/pf/pf.h
@@ -0,0 +1,84 @@
+#ifndef _PF_H_
+#define _PF_H_
+
+#include <cassert>
+#include <vector>
+#include "sampler.h"
+#include "prob.h"
+
+// Rescales the weights of a particle set in place so that they sum to one.
+// ParticleType needs a `weight` member that can be added to, and divided by,
+// a prob_t.  An empty set is left untouched; a set whose total weight is not
+// positive trips the assert.
+template <typename ParticleType>
+struct ParticleRenormalizer {
+  void operator()(std::vector<ParticleType>* pv) const {
+    std::vector<ParticleType>& particles = *pv;
+    if (particles.empty()) return;
+    // first pass: normalizing constant
+    prob_t total = prob_t::Zero();
+    for (unsigned k = 0; k < particles.size(); ++k)
+      total += particles[k].weight;
+    assert(total > prob_t::Zero());
+    // second pass: divide it out
+    for (unsigned k = 0; k < particles.size(); ++k)
+      particles[k].weight /= total;
+  }
+};
+
+// Multinomial resampling: replaces the particle set with an equally sized set
+// drawn with replacement in proportion to the particles' weights, and gives
+// every survivor the same weight 1/N.  An empty set is left untouched.
+// The random number generator is borrowed, not owned.
+template <typename ParticleType>
+struct MultinomialResampleFilter {
+  explicit MultinomialResampleFilter(MT19937* rng) : rng_(rng) {}
+
+  void operator()(std::vector<ParticleType>* pv) {
+    if (pv->empty()) return;
+    std::vector<ParticleType>& ps = *pv;
+    // sampling distribution over particle indices, one entry per weight
+    SampleSet<prob_t> ss;
+    for (unsigned i = 0; i < ps.size(); ++i)
+      ss.add(ps[i].weight);
+    std::vector<ParticleType> nps; nps.reserve(ps.size());
+    const prob_t uniform_weight(1.0 / ps.size());
+    for (unsigned i = 0; i < ps.size(); ++i) {
+      nps.push_back(ps[rng_->SelectSample(ss)]);
+      nps[i].weight = uniform_weight;  // survivors are equally weighted
+    }
+    nps.swap(ps);
+  }
+
+ private:
+  MT19937* rng_;  // not owned
+};
+
+// Systematic resampling: renormalizes the weights, then sweeps the points
+// u_0, u_0 + 1/N, u_0 + 2/N, ... across the running sum of the weights and
+// copies each particle once for every point that lands in its interval.
+// An empty set is left untouched.  The random number generator is borrowed.
+//
+// NOTE(review): copies keep the (renormalized) weight of the particle they
+// were drawn from; MultinomialResampleFilter resets survivors to 1/N instead.
+// Confirm which behavior callers expect.
+// NOTE(review): u_0 = rng_->next() / N presumably lies in [0, 1/N) -- confirm
+// the range of MT19937::next().
+template <typename ParticleType>
+struct SystematicResampleFilter {
+  explicit SystematicResampleFilter(MT19937* rng) : rng_(rng), renorm_() {}
+
+  void operator()(std::vector<ParticleType>* pv) {
+    if (pv->empty()) return;
+    renorm_(pv);
+    std::vector<ParticleType>& ps = *pv;
+    std::vector<ParticleType> nps; nps.reserve(ps.size());
+    // [lower, upper] is the slice of the cumulative weight owned by ps[i]
+    double lower = 0, upper = 0;
+    const double skip = 1.0 / ps.size();
+    double u_j = rng_->next() * skip;
+    unsigned j = 0;  // number of sample points consumed so far
+    for (unsigned i = 0; i < ps.size(); ++i) {
+      upper += ps[i].weight.as_float();
+      // how many children does ps[i] have?
+      while (u_j < lower) { u_j += skip; ++j; }
+      while (u_j >= lower && u_j <= upper) {
+        assert(j < ps.size());
+        nps.push_back(ps[i]);
+        u_j += skip;
+        ++j;
+      }
+      lower = upper;
+    }
+    // NOTE(review): this holds only if the running sum reaches the last
+    // sample point; rounding in `upper` could leave the new set one short.
+    assert(ps.size() == nps.size());
+    ps.swap(nps);
+  }
+
+ private:
+  MT19937* rng_;  // not owned
+  ParticleRenormalizer<ParticleType> renorm_;
+};
+
+#endif
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index 81abd61b..aae5f798 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "pf.h"
#include "base_measures.h"
#include "reachability.h"
#include "viterbi.h"
@@ -413,20 +414,6 @@ ostream& operator<<(ostream& o, const Particle& p) {
return o;
}
-void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
- vector<Particle>& ps = *pps;
- SampleSet<prob_t> ss;
- for (int i = 0; i < ps.size(); ++i)
- ss.add(ps[i].weight);
- vector<Particle> nps; nps.reserve(ps.size());
- const prob_t uniform_weight(1.0 / ps.size());
- for (int i = 0; i < ps.size(); ++i) {
- nps.push_back(ps[prng->SelectSample(ss)]);
- nps[i].weight = uniform_weight;
- }
- nps.swap(ps);
-}
-
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
@@ -466,6 +453,7 @@ int main(int argc, char** argv) {
MyJointModel m(lp0);
#endif
+ MultinomialResampleFilter<Particle> filter(&rng);
cerr << "Initializing reachability limits...\n";
vector<Particle> ps(corpusf.size());
vector<Reachability> reaches; reaches.reserve(corpusf.size());
@@ -500,7 +488,7 @@ int main(int argc, char** argv) {
// all particles have now been extended a bit, we will reweight them now
if (lps[0].trg_cov > 0)
- FilterCrapParticlesAndReweight(&lps);
+ filter(&lps);
// loop over all particles and extend them
bool done_nothing = true;
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index 33dc08c3..728ec00d 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "pf.h"
#include "base_measures.h"
#include "monotonic_pseg.h"
#include "reachability.h"
@@ -135,20 +136,6 @@ ostream& operator<<(ostream& o, const Particle& p) {
return o;
}
-void FilterCrapParticlesAndReweight(vector<Particle>* pps) {
- vector<Particle>& ps = *pps;
- SampleSet<prob_t> ss;
- for (int i = 0; i < ps.size(); ++i)
- ss.add(ps[i].weight);
- vector<Particle> nps; nps.reserve(ps.size());
- const prob_t uniform_weight(1.0 / ps.size());
- for (int i = 0; i < ps.size(); ++i) {
- nps.push_back(ps[prng->SelectSample(ss)]);
- nps[i].weight = uniform_weight;
- }
- nps.swap(ps);
-}
-
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
@@ -181,7 +168,17 @@ int main(int argc, char** argv) {
Model1 invm1(conf["inverse_model1"].as<string>());
PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
- MonotonicParallelSegementationModel m(lp0);
+ PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+ MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0);
+ TRule xx("[X] ||| ms. kimura ||| MS. KIMURA ||| X=0");
+ cerr << xx << endl << lp0(xx) << " " << alp0(xx) << endl;
+ TRule xx12("[X] ||| . ||| PHARMACY . ||| X=0");
+ TRule xx21("[X] ||| pharmacy . ||| . ||| X=0");
+// TRule xx22("[X] ||| . ||| . ||| X=0");
+ TRule xx22("[X] ||| . ||| THE . ||| X=0");
+ cerr << xx12 << "\t" << lp0(xx12) << " " << alp0(xx12) << endl;
+ cerr << xx21 << "\t" << lp0(xx21) << " " << alp0(xx21) << endl;
+ cerr << xx22 << "\t" << lp0(xx22) << " " << alp0(xx22) << endl;
cerr << "Initializing reachability limits...\n";
vector<Particle> ps(corpusf.size());
@@ -194,6 +191,8 @@ int main(int argc, char** argv) {
cerr << "Sampling...\n";
vector<Particle> tmp_p(10000); // work space
SampleSet<prob_t> pfss;
+ SystematicResampleFilter<Particle> filter(&rng);
+ // MultinomialResampleFilter<Particle> filter(&rng);
for (int SS=0; SS < samples; ++SS) {
for (int ci = 0; ci < corpusf.size(); ++ci) {
vector<int>& src = corpusf[ci];
@@ -213,7 +212,7 @@ int main(int argc, char** argv) {
// all particles have now been extended a bit, we will reweight them now
if (lps[0].trg_cov > 0)
- FilterCrapParticlesAndReweight(&lps);
+ filter(&lps);
// loop over all particles and extend them
bool done_nothing = true;
@@ -263,6 +262,11 @@ int main(int argc, char** argv) {
}
} // loop over particles (pi = 0 .. particles)
if (done_nothing) all_complete = true;
+ prob_t wv = prob_t::Zero();
+ for (int pp = 0; pp < lps.size(); ++pp)
+ wv += lps[pp].weight;
+ for (int pp = 0; pp < lps.size(); ++pp)
+ lps[pp].weight /= wv;
}
pfss.clear();
for (int i = 0; i < lps.size(); ++i)
diff --git a/training/lbfgs_test.cc b/training/lbfgs_test.cc
index fc21e98d..c94682e9 100644
--- a/training/lbfgs_test.cc
+++ b/training/lbfgs_test.cc
@@ -28,11 +28,14 @@ double TestOptimizer() {
g[2] = 2 * x[2] + 6;
obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
opt.run(x, obj, g);
-
+ if (!opt.requests_f_and_g()) {
+ if (converged(x,g)) break;
+ opt.run(x, obj, g);
+ }
cerr << x[0] << " " << x[1] << " " << x[2] << endl;
cerr << " obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
cerr << opt << endl;
- } while (!converged(x, g));
+ } while (true);
return obj;
}
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
index 87c5f331..00746532 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/mpi_flex_optimize.cc
@@ -39,15 +39,12 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("weights,w",po::value<string>(),"Initial feature weights")
("training_data,d",po::value<string>(),"Training data")
("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(6), "Number of training instances evaluated per processor in each minibatch")
- ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (options: lbfgs, sgd, rprop)")
- ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch (1 = standard SGD)")
+ ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch")
("iterations,I", po::value<unsigned>()->default_value(50), "Number of passes through the training data before termination")
+ ("regularization_strength,C", po::value<double>()->default_value(0.2), "Regularization strength")
+ ("time_series_strength,T", po::value<double>()->default_value(0.0), "Time series regularization strength")
("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
- ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history")
- ("eta_0,e", po::value<double>()->default_value(0.1), "Initial learning rate for SGD")
- ("L1,1","Use L1 regularization")
- ("L2,2","Use L2 regularization")
- ("regularization_strength,C", po::value<double>()->default_value(1.0), "Regularization strength (C)");
+ ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history");
po::options_description clo("Command line options");
clo.add_options()
("config", po::value<string>(), "Configuration file")
@@ -64,7 +61,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::notify(*conf);
if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) {
- cerr << "General-purpose minibatch online optimizer (MPI support "
+ cerr << "LBFGS minibatch online optimizer (MPI support "
#if HAVE_MPI
<< "enabled"
#else
@@ -166,6 +163,38 @@ void AddGrad(const SparseVector<prob_t> x, double s, SparseVector<double>* acc)
acc->add_value(it->first, it->second.as_float() * s);
}
+// Returns the p-norm of v, (sum_i |v_i|^p)^(1/p).
+// The absolute value matters: without it a negative component is subtracted
+// for odd p and turns the sum into NaN for fractional p. For even integer p
+// (e.g. p = 2) the result is the same as before.
+double PNorm(const vector<double>& v, const double p) {
+  double acc = 0;
+  for (size_t i = 0; i < v.size(); ++i)
+    acc += pow(fabs(v[i]), p);
+  return pow(acc, 1.0 / p);
+}
+
+// Debug helper: writes every non-zero entry of v to os as "<name>=<value> ",
+// where <name> is FD::Convert(index). Index 0 is never printed.
+void VV(ostream& os, const vector<double>& v) {
+  int fid = 1;
+  while (fid < v.size()) {
+    const double value = v[fid];
+    if (value != 0.0) {
+      os << FD::Convert(fid) << "=" << value << " ";
+    }
+    ++fid;
+  }
+}
+
+// Adds two quadratic penalties to an objective:
+//   C * sum_i w_i^2                 (L2 term)
+//   T * sum_i (w_i - prev_w_i)^2    (time-series term)
+// The gradient of both terms is accumulated into *g, which must have the
+// same size as weights, and the summed penalty value is returned.
+// prev_weights may be shorter than weights; missing entries count as 0.
+double ApplyRegularizationTerms(const double C,
+                                const double T,
+                                const vector<double>& weights,
+                                const vector<double>& prev_weights,
+                                vector<double>* g) {
+  assert(weights.size() == g->size());
+  vector<double>& grad = *g;
+  const size_t num_prev = prev_weights.size();
+  double penalty = 0;
+  for (size_t k = 0; k != weights.size(); ++k) {
+    const double w = weights[k];
+    const double delta = w - (k < num_prev ? prev_weights[k] : 0.0);
+
+    // L2 term and its derivative
+    penalty += C * w * w;
+    grad[k] += 2 * C * w;
+
+    // time-series term and its derivative
+    penalty += T * delta * delta;
+    grad[k] += 2 * T * delta;
+  }
+  return penalty;
+}
+
int main(int argc, char** argv) {
#ifdef HAVE_MPI
mpi::environment env(argc, argv);
@@ -176,7 +205,7 @@ int main(int argc, char** argv) {
const int size = 1;
const int rank = 0;
#endif
- if (size > 1) SetSilent(true); // turn off verbose decoder output
+ if (size > 0) SetSilent(true); // turn off verbose decoder output
register_feature_functions();
MT19937* rng = NULL;
@@ -186,56 +215,60 @@ int main(int argc, char** argv) {
boost::shared_ptr<BatchOptimizer> o;
const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as<unsigned>();
-
- istringstream ins;
- ReadConfig(conf["cdec_config"].as<string>(), &ins);
- Decoder decoder(&ins);
-
- // load initial weights
- vector<weight_t> init_weights;
- if (conf.count("weights"))
- Weights::InitFromFile(conf["weights"].as<string>(), &init_weights);
+ const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+ const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
+ const double regularization_strength = conf["regularization_strength"].as<double>();
+ const double time_series_strength = conf["time_series_strength"].as<double>();
+ const bool use_time_series_reg = time_series_strength > 0.0;
+ const unsigned max_iteration = conf["iterations"].as<unsigned>();
vector<string> corpus;
vector<int> ids;
ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
assert(corpus.size() > 0);
- const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
if (size_per_proc > corpus.size()) {
- cerr << "Minibatch size must be smaller than corpus size!\n";
+ cerr << "Minibatch size (per processor) must be smaller or equal to the local corpus size!\n";
return 1;
}
- size_t total_corpus_size = 0;
-#ifdef HAVE_MPI
- reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
-#else
- total_corpus_size = corpus.size();
-#endif
+ // initialize decoder (loads hash functions if necessary)
+ istringstream ins;
+ ReadConfig(conf["cdec_config"].as<string>(), &ins);
+ Decoder decoder(&ins);
+
+ // load initial weights
+ vector<weight_t> prev_weights;
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &prev_weights);
if (conf.count("random_seed"))
rng = new MT19937(conf["random_seed"].as<uint32_t>());
else
rng = new MT19937;
- const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
+ size_t total_corpus_size = 0;
+#ifdef HAVE_MPI
+ reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
+#else
+ total_corpus_size = corpus.size();
+#endif
- if (rank == 0) {
+ if (rank == 0)
cerr << "Total corpus size: " << total_corpus_size << endl;
- const unsigned batch_size = size_per_proc * size;
- }
- SparseVector<double> x;
- Weights::InitSparseVector(init_weights, &x);
CopyHGsObserver observer;
int write_weights_every_ith = 100; // TODO configure
int titer = -1;
- vector<weight_t>& lambdas = decoder.CurrentWeightVector();
- lambdas.swap(init_weights);
- init_weights.clear();
+ vector<weight_t>& cur_weights = decoder.CurrentWeightVector();
+ if (use_time_series_reg) {
+ cur_weights = prev_weights;
+ } else {
+ cur_weights.swap(prev_weights);
+ prev_weights.clear();
+ }
int iter = -1;
bool converged = false;
@@ -243,26 +276,20 @@ int main(int argc, char** argv) {
#ifdef HAVE_MPI
mpi::timer timer;
#endif
- x.init_vector(&lambdas);
++iter; ++titer;
-#if 0
if (rank == 0) {
converged = (iter == max_iteration);
- Weights::SanityCheck(lambdas);
- Weights::ShowLargestFeatures(lambdas);
string fname = "weights.cur.gz";
if (iter % write_weights_every_ith == 0) {
- ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
+ ostringstream o; o << "weights.epoch_" << iter << ".gz";
fname = o.str();
}
- if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
+ if (converged) { fname = "weights.final.gz"; }
ostringstream vv;
- vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
+ vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size()));
const string svv = vv.str();
- cerr << svv << endl;
- Weights::WriteToFile(fname, lambdas, true, &svv);
+ Weights::WriteToFile(fname, cur_weights, true, &svv);
}
-#endif
vector<Hypergraph> hgs(size_per_proc);
vector<Hypergraph> gold_hgs(size_per_proc);
@@ -287,8 +314,8 @@ int main(int argc, char** argv) {
Hypergraph& hg_gold = gold_hgs[i];
if (hg.edges_.size() < 2) continue;
- hg.Reweight(lambdas);
- hg_gold.Reweight(lambdas);
+ hg.Reweight(cur_weights);
+ hg_gold.Reweight(cur_weights);
SparseVector<prob_t> model_exp, gold_exp;
const prob_t z = InsideOutside<prob_t,
EdgeProb,
@@ -324,23 +351,37 @@ int main(int argc, char** argv) {
#endif
local_grad.clear();
if (rank == 0) {
- g /= (size_per_proc * size);
+ // g /= (size_per_proc * size);
if (!o)
o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
vector<double> gg(FD::NumFeats());
- if (gg.size() != lambdas.size()) { lambdas.resize(gg.size()); }
+ if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); }
for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it)
if (it->first) { gg[it->first] = it->second; }
- cerr << "OBJ: " << obj << endl;
- o->Optimize(obj, gg, &lambdas);
+ g.clear();
+ double r = ApplyRegularizationTerms(regularization_strength,
+ time_series_strength * (iter == 0 ? 0.0 : 1.0),
+ cur_weights,
+ prev_weights,
+ &gg);
+ obj += r;
+ if (mi == 0 || mi == (minibatch_iterations - 1)) {
+ if (!mi) cerr << iter << ' '; else cerr << ' ';
+ cerr << "OBJ=" << obj << " (REG=" << r << ")" << " |g|=" << PNorm(gg, 2) << " |w|=" << PNorm(cur_weights, 2);
+ if (mi > 0) cerr << endl << flush; else cerr << ' ';
+ } else { cerr << '.' << flush; }
+ // cerr << "w = "; VV(cerr, cur_weights); cerr << endl;
+ // cerr << "g = "; VV(cerr, gg); cerr << endl;
+ o->Optimize(obj, gg, &cur_weights);
}
#ifdef HAVE_MPI
- broadcast(world, x, 0);
+ // broadcast(world, x, 0);
broadcast(world, converged, 0);
world.barrier();
if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
#endif
}
+ prev_weights = cur_weights;
}
return 0;
}
diff --git a/training/optimize.cc b/training/optimize.cc
index f0740d5c..41ac90d8 100644
--- a/training/optimize.cc
+++ b/training/optimize.cc
@@ -96,6 +96,7 @@ void LBFGSOptimizer::OptimizeImpl(const double& obj,
const vector<double>& g,
vector<double>* x) {
opt_.run(&(*x)[0], obj, &g[0]);
- cerr << opt_ << endl;
+ if (!opt_.requests_f_and_g()) opt_.run(&(*x)[0], obj, &g[0]);
+ // cerr << opt_ << endl;
}
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index b4783f91..869f430b 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -240,12 +240,11 @@ my $node_count = 0;
my $script = "";
# fork == one thread runs the sentserver, while the
# other spawns the sentclient commands.
-if (my $pid = fork) {
+my $pid = fork;
+if ($pid == 0) { # child
sleep 8; # give other thread time to start sentserver
- $script =
- qq{wait
-$cdcmd$sentclient $host:$port:$key $cmd
-};
+ $script = "$cdcmd$sentclient $host:$port:$key $cmd";
+
if ($verbose){
print STDERR "Client script:\n====\n";
print STDERR $script;
@@ -270,13 +269,18 @@ $cdcmd$sentclient $host:$port:$key $cmd
}
}
}
- waitpid($pid, 0);
- cleanup();
+ print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n";
+ for my $p (@pids) {
+ waitpid($p, 0);
+ }
} else {
# my $todo = "$sentserver -k $key $multiflag $port ";
my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag ";
if ($verbose){ print STDERR "Running: $todo\n"; }
check_call($todo);
+ print STDERR "Call to $sentserver returned.\n";
+ cleanup();
+ exit(0);
}
sub numof_live_jobs {
@@ -343,16 +347,18 @@ sub launch_job_fork {
push @errors,$errorfile;
push @outs,$outfile;
}
- if (my $pid = fork) {
+ my $pid = fork;
+ if ($pid == 0) {
my ($fh, $scr_name) = get_temp_script();
print $fh $script;
close $fh;
my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
print STDERR "EXEC: $todo\n";
my $out = check_output("$todo");
- print STDERR "RES: $out\n";
unlink $scr_name or warn "Failed to remove $scr_name";
exit 0;
+ } else {
+ push @pids, $pid;
}
}
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 3a385a88..c5078645 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -27,11 +27,13 @@ die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfo
my $f_lang = $1;
my $e_lang = $2;
+print STDERR " Using mkcls in: $mkcls\n\n";
print STDERR "Source language: $f_lang\n";
print STDERR "Target language: $e_lang\n";
-print STDERR " Using mkcls in: $mkcls\n\n";
-die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
-die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
+die "Don't have an stemmer for $f_lang: please create $SCRIPT_DIR/stemmers/$f_lang.pl\n" unless -f "$SCRIPT_DIR/stemmers/$f_lang.pl";
+die "Don't have an stemmer for $e_lang: please create $SCRIPT_DIR/stemmers/$e_lang.pl\n" unless -f "$SCRIPT_DIR/stemmers/$e_lang.pl";
+die "Don't have an orthographic normalizer for $f_lang: please create $SCRIPT_DIR/ortho-norm/$f_lang.pl\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
+die "Don't have an orthographic normalizer for $e_lang: please create $SCRIPT_DIR/ortho-norm/$e_lang.pl\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
my @directions = qw(f-e);
diff --git a/word-aligner/ortho-norm/mg.pl b/word-aligner/ortho-norm/mg.pl
new file mode 100755
index 00000000..4cb0e8e7
--- /dev/null
+++ b/word-aligner/ortho-norm/mg.pl
@@ -0,0 +1,13 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Orthographic normalization filter: copies STDIN to STDOUT line by line
+# after
+#   1. lowercasing the whole line, and
+#   2. dropping an apostrophe that directly follows an ASCII letter a-z and
+#      is itself followed by a space or the end of the line.
+while (defined(my $line = <STDIN>)) {
+  my $normalized = lc $line;
+  $normalized =~ s/([a-z])'( |$)/$1$2/g;
+  print $normalized;
+}
+
diff --git a/word-aligner/ortho-norm/rw.pl b/word-aligner/ortho-norm/rw.pl
new file mode 100755
index 00000000..4cb0e8e7
--- /dev/null
+++ b/word-aligner/ortho-norm/rw.pl
@@ -0,0 +1,13 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Orthographic normalization filter: copies STDIN to STDOUT line by line
+# after
+#   1. lowercasing the whole line, and
+#   2. dropping an apostrophe that directly follows an ASCII letter a-z and
+#      is itself followed by a space or the end of the line.
+while (defined(my $line = <STDIN>)) {
+  my $normalized = lc $line;
+  $normalized =~ s/([a-z])'( |$)/$1$2/g;
+  print $normalized;
+}
+
diff --git a/word-aligner/stemmers/mg.pl b/word-aligner/stemmers/mg.pl
new file mode 100755
index 00000000..2f79a94e
--- /dev/null
+++ b/word-aligner/stemmers/mg.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT,":utf8");
+
+# Prefix-truncation "stemmer": reads whitespace-separated words from STDIN
+# and replaces each word with its first N characters. N starts at 5, grows
+# by one if the word contains ndz/ndr/nts/ntr anywhere, grows by one more if
+# the word starts with mp/mb/nd, and is capped at the word length.
+#
+# Without arguments, each input line is written back with every word
+# replaced by its stem. With --vocab (the only argument accepted), each line
+# must hold exactly one word and is written as "<line> <stem>".
+my $vocab = undef;
+if (scalar @ARGV > 0) {
+  die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
+  $vocab = 1;
+}
+
+my %dict;  # word => stem, so each distinct word is truncated only once
+while(<STDIN>) {
+  chomp;
+  # split ' ' skips leading whitespace; split /\s+/ would return an empty
+  # first field for a line that starts with a blank, which showed up as a
+  # bogus empty token and made --vocab reject a valid one-word line.
+  my @words = split ' ';
+  my @out = ();
+  for my $w (@words) {
+    my $tw = $dict{$w};
+    if (!defined $tw) {
+      my $el = 5;
+      if ($w =~ /(ndz|ndr|nts|ntr)/) { $el++; }
+      if ($w =~ /^(mp|mb|nd)/) { $el++; }
+      if ($el > length($w)) { $el = length($w); }
+      $tw = substr $w, 0, $el;
+      $dict{$w} = $tw;
+    }
+    push @out, $tw;
+  }
+  if ($vocab) {
+    die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+    print "$_ @out\n";
+  } else {
+    print "@out\n";
+  }
+}
+
diff --git a/word-aligner/stemmers/rw.pl b/word-aligner/stemmers/rw.pl
new file mode 100755
index 00000000..6d873b40
--- /dev/null
+++ b/word-aligner/stemmers/rw.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT,":utf8");
+
+# Prefix-truncation "stemmer": reads whitespace-separated words from STDIN
+# and replaces each word with its first N characters. N starts at 5, grows
+# by one if the word contains ny/jy/nk/nt/sh/cy anywhere, and is capped at
+# the word length.
+#
+# Without arguments, each input line is written back with every word
+# replaced by its stem. With --vocab (the only argument accepted), each line
+# must hold exactly one word and is written as "<line> <stem>".
+my $vocab = undef;
+if (scalar @ARGV > 0) {
+  die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
+  $vocab = 1;
+}
+
+my %dict;  # word => stem, so each distinct word is truncated only once
+while(<STDIN>) {
+  chomp;
+  # split ' ' skips leading whitespace; split /\s+/ would return an empty
+  # first field for a line that starts with a blank, which showed up as a
+  # bogus empty token and made --vocab reject a valid one-word line.
+  my @words = split ' ';
+  my @out = ();
+  for my $w (@words) {
+    my $tw = $dict{$w};
+    if (!defined $tw) {
+      my $el = 5;
+      if ($w =~ /(ny|jy|nk|nt|sh|cy)/) { $el++; }
+      if ($el > length($w)) { $el = length($w); }
+      $tw = substr $w, 0, $el;
+      $dict{$w} = $tw;
+    }
+    push @out, $tw;
+  }
+  if ($vocab) {
+    die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+    print "$_ @out\n";
+  } else {
+    print "@out\n";
+  }
+}
+