summaryrefslogtreecommitdiff
path: root/python/pkg/cdec
diff options
context:
space:
mode:
authorVictor Chahuneau <vchahune@cs.cmu.edu>2013-08-26 20:12:32 -0400
committerVictor Chahuneau <vchahune@cs.cmu.edu>2013-08-26 20:12:32 -0400
commitca9b58716214148eeaeaa3076e1a1dc8f8bb5892 (patch)
treebfa2fd84c86e0fdd499110e86fd464b391379df1 /python/pkg/cdec
parent9d5071692ceab8d09c2bfdba24f6b927ec84b7f9 (diff)
Improve the package structure of pycdec
This change should not break anything, but now you can run: python setup.py build_ext --inplace and use the cleaner: PYTHONPATH=/path/to/cdec/python python -m ...
Diffstat (limited to 'python/pkg/cdec')
-rw-r--r--python/pkg/cdec/__init__.py2
-rw-r--r--python/pkg/cdec/configobj.py2468
-rw-r--r--python/pkg/cdec/sa/__init__.py25
-rw-r--r--python/pkg/cdec/sa/compile.py132
-rw-r--r--python/pkg/cdec/sa/extract.py113
-rw-r--r--python/pkg/cdec/sa/extractor.py106
-rw-r--r--python/pkg/cdec/sa/features.py142
-rw-r--r--python/pkg/cdec/score.py1
8 files changed, 0 insertions, 2989 deletions
diff --git a/python/pkg/cdec/__init__.py b/python/pkg/cdec/__init__.py
deleted file mode 100644
index 8e10f340..00000000
--- a/python/pkg/cdec/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from cdec._cdec import Decoder, Lattice, TRule, MRule, NT, NTRef,\
- ParseFailed, InvalidConfig, set_silent
diff --git a/python/pkg/cdec/configobj.py b/python/pkg/cdec/configobj.py
deleted file mode 100644
index c1f6e6df..00000000
--- a/python/pkg/cdec/configobj.py
+++ /dev/null
@@ -1,2468 +0,0 @@
-# configobj.py
-# A config file reader/writer that supports nested sections in config files.
-# Copyright (C) 2005-2010 Michael Foord, Nicola Larosa
-# E-mail: fuzzyman AT voidspace DOT org DOT uk
-# nico AT tekNico DOT net
-
-# ConfigObj 4
-# http://www.voidspace.org.uk/python/configobj.html
-
-# Released subject to the BSD License
-# Please see http://www.voidspace.org.uk/python/license.shtml
-
-# Scripts maintained at http://www.voidspace.org.uk/python/index.shtml
-# For information about bugfixes, updates and support, please join the
-# ConfigObj mailing list:
-# http://lists.sourceforge.net/lists/listinfo/configobj-develop
-# Comments, suggestions and bug reports welcome.
-
-from __future__ import generators
-
-import os
-import re
-import sys
-
-from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF16_BE, BOM_UTF16_LE
-
-
-# imported lazily to avoid startup performance hit if it isn't used
-compiler = None
-
-# A dictionary mapping BOM to
-# the encoding to decode with, and what to set the
-# encoding attribute to.
-BOMS = {
- BOM_UTF8: ('utf_8', None),
- BOM_UTF16_BE: ('utf16_be', 'utf_16'),
- BOM_UTF16_LE: ('utf16_le', 'utf_16'),
- BOM_UTF16: ('utf_16', 'utf_16'),
- }
-# All legal variants of the BOM codecs.
-# TODO: the list of aliases is not meant to be exhaustive, is there a
-# better way ?
-BOM_LIST = {
- 'utf_16': 'utf_16',
- 'u16': 'utf_16',
- 'utf16': 'utf_16',
- 'utf-16': 'utf_16',
- 'utf16_be': 'utf16_be',
- 'utf_16_be': 'utf16_be',
- 'utf-16be': 'utf16_be',
- 'utf16_le': 'utf16_le',
- 'utf_16_le': 'utf16_le',
- 'utf-16le': 'utf16_le',
- 'utf_8': 'utf_8',
- 'u8': 'utf_8',
- 'utf': 'utf_8',
- 'utf8': 'utf_8',
- 'utf-8': 'utf_8',
- }
-
-# Map of encodings to the BOM to write.
-BOM_SET = {
- 'utf_8': BOM_UTF8,
- 'utf_16': BOM_UTF16,
- 'utf16_be': BOM_UTF16_BE,
- 'utf16_le': BOM_UTF16_LE,
- None: BOM_UTF8
- }
-
-
-def match_utf8(encoding):
- return BOM_LIST.get(encoding.lower()) == 'utf_8'
-
-
-# Quote strings used for writing values
-squot = "'%s'"
-dquot = '"%s"'
-noquot = "%s"
-wspace_plus = ' \r\n\v\t\'"'
-tsquot = '"""%s"""'
-tdquot = "'''%s'''"
-
-# Sentinel for use in getattr calls to replace hasattr
-MISSING = object()
-
-__version__ = '4.7.2'
-
-try:
- any
-except NameError:
- def any(iterable):
- for entry in iterable:
- if entry:
- return True
- return False
-
-
-__all__ = (
- '__version__',
- 'DEFAULT_INDENT_TYPE',
- 'DEFAULT_INTERPOLATION',
- 'ConfigObjError',
- 'NestingError',
- 'ParseError',
- 'DuplicateError',
- 'ConfigspecError',
- 'ConfigObj',
- 'SimpleVal',
- 'InterpolationError',
- 'InterpolationLoopError',
- 'MissingInterpolationOption',
- 'RepeatSectionError',
- 'ReloadError',
- 'UnreprError',
- 'UnknownType',
- 'flatten_errors',
- 'get_extra_values'
-)
-
-DEFAULT_INTERPOLATION = 'configparser'
-DEFAULT_INDENT_TYPE = ' '
-MAX_INTERPOL_DEPTH = 10
-
-OPTION_DEFAULTS = {
- 'interpolation': True,
- 'raise_errors': False,
- 'list_values': True,
- 'create_empty': False,
- 'file_error': False,
- 'configspec': None,
- 'stringify': True,
- # option may be set to one of ('', ' ', '\t')
- 'indent_type': None,
- 'encoding': None,
- 'default_encoding': None,
- 'unrepr': False,
- 'write_empty_values': False,
-}
-
-
-
-def getObj(s):
- global compiler
- if compiler is None:
- import compiler
- s = "a=" + s
- p = compiler.parse(s)
- return p.getChildren()[1].getChildren()[0].getChildren()[1]
-
-
-class UnknownType(Exception):
- pass
-
-
-class Builder(object):
-
- def build(self, o):
- m = getattr(self, 'build_' + o.__class__.__name__, None)
- if m is None:
- raise UnknownType(o.__class__.__name__)
- return m(o)
-
- def build_List(self, o):
- return map(self.build, o.getChildren())
-
- def build_Const(self, o):
- return o.value
-
- def build_Dict(self, o):
- d = {}
- i = iter(map(self.build, o.getChildren()))
- for el in i:
- d[el] = i.next()
- return d
-
- def build_Tuple(self, o):
- return tuple(self.build_List(o))
-
- def build_Name(self, o):
- if o.name == 'None':
- return None
- if o.name == 'True':
- return True
- if o.name == 'False':
- return False
-
- # An undefined Name
- raise UnknownType('Undefined Name')
-
- def build_Add(self, o):
- real, imag = map(self.build_Const, o.getChildren())
- try:
- real = float(real)
- except TypeError:
- raise UnknownType('Add')
- if not isinstance(imag, complex) or imag.real != 0.0:
- raise UnknownType('Add')
- return real+imag
-
- def build_Getattr(self, o):
- parent = self.build(o.expr)
- return getattr(parent, o.attrname)
-
- def build_UnarySub(self, o):
- return -self.build_Const(o.getChildren()[0])
-
- def build_UnaryAdd(self, o):
- return self.build_Const(o.getChildren()[0])
-
-
-_builder = Builder()
-
-
-def unrepr(s):
- if not s:
- return s
- return _builder.build(getObj(s))
-
-
-
-class ConfigObjError(SyntaxError):
- """
- This is the base class for all errors that ConfigObj raises.
- It is a subclass of SyntaxError.
- """
- def __init__(self, message='', line_number=None, line=''):
- self.line = line
- self.line_number = line_number
- SyntaxError.__init__(self, message)
-
-
-class NestingError(ConfigObjError):
- """
- This error indicates a level of nesting that doesn't match.
- """
-
-
-class ParseError(ConfigObjError):
- """
- This error indicates that a line is badly written.
- It is neither a valid ``key = value`` line,
- nor a valid section marker line.
- """
-
-
-class ReloadError(IOError):
- """
- A 'reload' operation failed.
- This exception is a subclass of ``IOError``.
- """
- def __init__(self):
- IOError.__init__(self, 'reload failed, filename is not set.')
-
-
-class DuplicateError(ConfigObjError):
- """
- The keyword or section specified already exists.
- """
-
-
-class ConfigspecError(ConfigObjError):
- """
- An error occured whilst parsing a configspec.
- """
-
-
-class InterpolationError(ConfigObjError):
- """Base class for the two interpolation errors."""
-
-
-class InterpolationLoopError(InterpolationError):
- """Maximum interpolation depth exceeded in string interpolation."""
-
- def __init__(self, option):
- InterpolationError.__init__(
- self,
- 'interpolation loop detected in value "%s".' % option)
-
-
-class RepeatSectionError(ConfigObjError):
- """
- This error indicates additional sections in a section with a
- ``__many__`` (repeated) section.
- """
-
-
-class MissingInterpolationOption(InterpolationError):
- """A value specified for interpolation was missing."""
- def __init__(self, option):
- msg = 'missing option "%s" in interpolation.' % option
- InterpolationError.__init__(self, msg)
-
-
-class UnreprError(ConfigObjError):
- """An error parsing in unrepr mode."""
-
-
-
-class InterpolationEngine(object):
- """
- A helper class to help perform string interpolation.
-
- This class is an abstract base class; its descendants perform
- the actual work.
- """
-
- # compiled regexp to use in self.interpolate()
- _KEYCRE = re.compile(r"%\(([^)]*)\)s")
- _cookie = '%'
-
- def __init__(self, section):
- # the Section instance that "owns" this engine
- self.section = section
-
-
- def interpolate(self, key, value):
- # short-cut
- if not self._cookie in value:
- return value
-
- def recursive_interpolate(key, value, section, backtrail):
- """The function that does the actual work.
-
- ``value``: the string we're trying to interpolate.
- ``section``: the section in which that string was found
- ``backtrail``: a dict to keep track of where we've been,
- to detect and prevent infinite recursion loops
-
- This is similar to a depth-first-search algorithm.
- """
- # Have we been here already?
- if (key, section.name) in backtrail:
- # Yes - infinite loop detected
- raise InterpolationLoopError(key)
- # Place a marker on our backtrail so we won't come back here again
- backtrail[(key, section.name)] = 1
-
- # Now start the actual work
- match = self._KEYCRE.search(value)
- while match:
- # The actual parsing of the match is implementation-dependent,
- # so delegate to our helper function
- k, v, s = self._parse_match(match)
- if k is None:
- # That's the signal that no further interpolation is needed
- replacement = v
- else:
- # Further interpolation may be needed to obtain final value
- replacement = recursive_interpolate(k, v, s, backtrail)
- # Replace the matched string with its final value
- start, end = match.span()
- value = ''.join((value[:start], replacement, value[end:]))
- new_search_start = start + len(replacement)
- # Pick up the next interpolation key, if any, for next time
- # through the while loop
- match = self._KEYCRE.search(value, new_search_start)
-
- # Now safe to come back here again; remove marker from backtrail
- del backtrail[(key, section.name)]
-
- return value
-
- # Back in interpolate(), all we have to do is kick off the recursive
- # function with appropriate starting values
- value = recursive_interpolate(key, value, self.section, {})
- return value
-
-
- def _fetch(self, key):
- """Helper function to fetch values from owning section.
-
- Returns a 2-tuple: the value, and the section where it was found.
- """
- # switch off interpolation before we try and fetch anything !
- save_interp = self.section.main.interpolation
- self.section.main.interpolation = False
-
- # Start at section that "owns" this InterpolationEngine
- current_section = self.section
- while True:
- # try the current section first
- val = current_section.get(key)
- if val is not None and not isinstance(val, Section):
- break
- # try "DEFAULT" next
- val = current_section.get('DEFAULT', {}).get(key)
- if val is not None and not isinstance(val, Section):
- break
- # move up to parent and try again
- # top-level's parent is itself
- if current_section.parent is current_section:
- # reached top level, time to give up
- break
- current_section = current_section.parent
-
- # restore interpolation to previous value before returning
- self.section.main.interpolation = save_interp
- if val is None:
- raise MissingInterpolationOption(key)
- return val, current_section
-
-
- def _parse_match(self, match):
- """Implementation-dependent helper function.
-
- Will be passed a match object corresponding to the interpolation
- key we just found (e.g., "%(foo)s" or "$foo"). Should look up that
- key in the appropriate config file section (using the ``_fetch()``
- helper function) and return a 3-tuple: (key, value, section)
-
- ``key`` is the name of the key we're looking for
- ``value`` is the value found for that key
- ``section`` is a reference to the section where it was found
-
- ``key`` and ``section`` should be None if no further
- interpolation should be performed on the resulting value
- (e.g., if we interpolated "$$" and returned "$").
- """
- raise NotImplementedError()
-
-
-
-class ConfigParserInterpolation(InterpolationEngine):
- """Behaves like ConfigParser."""
- _cookie = '%'
- _KEYCRE = re.compile(r"%\(([^)]*)\)s")
-
- def _parse_match(self, match):
- key = match.group(1)
- value, section = self._fetch(key)
- return key, value, section
-
-
-
-class TemplateInterpolation(InterpolationEngine):
- """Behaves like string.Template."""
- _cookie = '$'
- _delimiter = '$'
- _KEYCRE = re.compile(r"""
- \$(?:
- (?P<escaped>\$) | # Two $ signs
- (?P<named>[_a-z][_a-z0-9]*) | # $name format
- {(?P<braced>[^}]*)} # ${name} format
- )
- """, re.IGNORECASE | re.VERBOSE)
-
- def _parse_match(self, match):
- # Valid name (in or out of braces): fetch value from section
- key = match.group('named') or match.group('braced')
- if key is not None:
- value, section = self._fetch(key)
- return key, value, section
- # Escaped delimiter (e.g., $$): return single delimiter
- if match.group('escaped') is not None:
- # Return None for key and section to indicate it's time to stop
- return None, self._delimiter, None
- # Anything else: ignore completely, just return it unchanged
- return None, match.group(), None
-
-
-interpolation_engines = {
- 'configparser': ConfigParserInterpolation,
- 'template': TemplateInterpolation,
-}
-
-
-def __newobj__(cls, *args):
- # Hack for pickle
- return cls.__new__(cls, *args)
-
-class Section(dict):
- """
- A dictionary-like object that represents a section in a config file.
-
- It does string interpolation if the 'interpolation' attribute
- of the 'main' object is set to True.
-
- Interpolation is tried first from this object, then from the 'DEFAULT'
- section of this object, next from the parent and its 'DEFAULT' section,
- and so on until the main object is reached.
-
- A Section will behave like an ordered dictionary - following the
- order of the ``scalars`` and ``sections`` attributes.
- You can use this to change the order of members.
-
- Iteration follows the order: scalars, then sections.
- """
-
-
- def __setstate__(self, state):
- dict.update(self, state[0])
- self.__dict__.update(state[1])
-
- def __reduce__(self):
- state = (dict(self), self.__dict__)
- return (__newobj__, (self.__class__,), state)
-
-
- def __init__(self, parent, depth, main, indict=None, name=None):
- """
- * parent is the section above
- * depth is the depth level of this section
- * main is the main ConfigObj
- * indict is a dictionary to initialise the section with
- """
- if indict is None:
- indict = {}
- dict.__init__(self)
- # used for nesting level *and* interpolation
- self.parent = parent
- # used for the interpolation attribute
- self.main = main
- # level of nesting depth of this Section
- self.depth = depth
- # purely for information
- self.name = name
- #
- self._initialise()
- # we do this explicitly so that __setitem__ is used properly
- # (rather than just passing to ``dict.__init__``)
- for entry, value in indict.iteritems():
- self[entry] = value
-
-
- def _initialise(self):
- # the sequence of scalar values in this Section
- self.scalars = []
- # the sequence of sections in this Section
- self.sections = []
- # for comments :-)
- self.comments = {}
- self.inline_comments = {}
- # the configspec
- self.configspec = None
- # for defaults
- self.defaults = []
- self.default_values = {}
- self.extra_values = []
- self._created = False
-
-
- def _interpolate(self, key, value):
- try:
- # do we already have an interpolation engine?
- engine = self._interpolation_engine
- except AttributeError:
- # not yet: first time running _interpolate(), so pick the engine
- name = self.main.interpolation
- if name == True: # note that "if name:" would be incorrect here
- # backwards-compatibility: interpolation=True means use default
- name = DEFAULT_INTERPOLATION
- name = name.lower() # so that "Template", "template", etc. all work
- class_ = interpolation_engines.get(name, None)
- if class_ is None:
- # invalid value for self.main.interpolation
- self.main.interpolation = False
- return value
- else:
- # save reference to engine so we don't have to do this again
- engine = self._interpolation_engine = class_(self)
- # let the engine do the actual work
- return engine.interpolate(key, value)
-
-
- def __getitem__(self, key):
- """Fetch the item and do string interpolation."""
- val = dict.__getitem__(self, key)
- if self.main.interpolation:
- if isinstance(val, basestring):
- return self._interpolate(key, val)
- if isinstance(val, list):
- def _check(entry):
- if isinstance(entry, basestring):
- return self._interpolate(key, entry)
- return entry
- new = [_check(entry) for entry in val]
- if new != val:
- return new
- return val
-
-
- def __setitem__(self, key, value, unrepr=False):
- """
- Correctly set a value.
-
- Making dictionary values Section instances.
- (We have to special case 'Section' instances - which are also dicts)
-
- Keys must be strings.
- Values need only be strings (or lists of strings) if
- ``main.stringify`` is set.
-
- ``unrepr`` must be set when setting a value to a dictionary, without
- creating a new sub-section.
- """
- if not isinstance(key, basestring):
- raise ValueError('The key "%s" is not a string.' % key)
-
- # add the comment
- if key not in self.comments:
- self.comments[key] = []
- self.inline_comments[key] = ''
- # remove the entry from defaults
- if key in self.defaults:
- self.defaults.remove(key)
- #
- if isinstance(value, Section):
- if key not in self:
- self.sections.append(key)
- dict.__setitem__(self, key, value)
- elif isinstance(value, dict) and not unrepr:
- # First create the new depth level,
- # then create the section
- if key not in self:
- self.sections.append(key)
- new_depth = self.depth + 1
- dict.__setitem__(
- self,
- key,
- Section(
- self,
- new_depth,
- self.main,
- indict=value,
- name=key))
- else:
- if key not in self:
- self.scalars.append(key)
- if not self.main.stringify:
- if isinstance(value, basestring):
- pass
- elif isinstance(value, (list, tuple)):
- for entry in value:
- if not isinstance(entry, basestring):
- raise TypeError('Value is not a string "%s".' % entry)
- else:
- raise TypeError('Value is not a string "%s".' % value)
- dict.__setitem__(self, key, value)
-
-
- def __delitem__(self, key):
- """Remove items from the sequence when deleting."""
- dict. __delitem__(self, key)
- if key in self.scalars:
- self.scalars.remove(key)
- else:
- self.sections.remove(key)
- del self.comments[key]
- del self.inline_comments[key]
-
-
- def get(self, key, default=None):
- """A version of ``get`` that doesn't bypass string interpolation."""
- try:
- return self[key]
- except KeyError:
- return default
-
-
- def update(self, indict):
- """
- A version of update that uses our ``__setitem__``.
- """
- for entry in indict:
- self[entry] = indict[entry]
-
-
- def pop(self, key, default=MISSING):
- """
- 'D.pop(k[,d]) -> v, remove specified key and return the corresponding value.
- If key is not found, d is returned if given, otherwise KeyError is raised'
- """
- try:
- val = self[key]
- except KeyError:
- if default is MISSING:
- raise
- val = default
- else:
- del self[key]
- return val
-
-
- def popitem(self):
- """Pops the first (key,val)"""
- sequence = (self.scalars + self.sections)
- if not sequence:
- raise KeyError(": 'popitem(): dictionary is empty'")
- key = sequence[0]
- val = self[key]
- del self[key]
- return key, val
-
-
- def clear(self):
- """
- A version of clear that also affects scalars/sections
- Also clears comments and configspec.
-
- Leaves other attributes alone :
- depth/main/parent are not affected
- """
- dict.clear(self)
- self.scalars = []
- self.sections = []
- self.comments = {}
- self.inline_comments = {}
- self.configspec = None
- self.defaults = []
- self.extra_values = []
-
-
- def setdefault(self, key, default=None):
- """A version of setdefault that sets sequence if appropriate."""
- try:
- return self[key]
- except KeyError:
- self[key] = default
- return self[key]
-
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return zip((self.scalars + self.sections), self.values())
-
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return (self.scalars + self.sections)
-
-
- def values(self):
- """D.values() -> list of D's values"""
- return [self[key] for key in (self.scalars + self.sections)]
-
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- return iter(self.items())
-
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter((self.scalars + self.sections))
-
- __iter__ = iterkeys
-
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- return iter(self.values())
-
-
- def __repr__(self):
- """x.__repr__() <==> repr(x)"""
- def _getval(key):
- try:
- return self[key]
- except MissingInterpolationOption:
- return dict.__getitem__(self, key)
- return '{%s}' % ', '.join([('%s: %s' % (repr(key), repr(_getval(key))))
- for key in (self.scalars + self.sections)])
-
- __str__ = __repr__
- __str__.__doc__ = "x.__str__() <==> str(x)"
-
-
- # Extra methods - not in a normal dictionary
-
- def dict(self):
- """
- Return a deepcopy of self as a dictionary.
-
- All members that are ``Section`` instances are recursively turned to
- ordinary dictionaries - by calling their ``dict`` method.
-
- >>> n = a.dict()
- >>> n == a
- 1
- >>> n is a
- 0
- """
- newdict = {}
- for entry in self:
- this_entry = self[entry]
- if isinstance(this_entry, Section):
- this_entry = this_entry.dict()
- elif isinstance(this_entry, list):
- # create a copy rather than a reference
- this_entry = list(this_entry)
- elif isinstance(this_entry, tuple):
- # create a copy rather than a reference
- this_entry = tuple(this_entry)
- newdict[entry] = this_entry
- return newdict
-
-
- def merge(self, indict):
- """
- A recursive update - useful for merging config files.
-
- >>> a = '''[section1]
- ... option1 = True
- ... [[subsection]]
- ... more_options = False
- ... # end of file'''.splitlines()
- >>> b = '''# File is user.ini
- ... [section1]
- ... option1 = False
- ... # end of file'''.splitlines()
- >>> c1 = ConfigObj(b)
- >>> c2 = ConfigObj(a)
- >>> c2.merge(c1)
- >>> c2
- ConfigObj({'section1': {'option1': 'False', 'subsection': {'more_options': 'False'}}})
- """
- for key, val in indict.items():
- if (key in self and isinstance(self[key], dict) and
- isinstance(val, dict)):
- self[key].merge(val)
- else:
- self[key] = val
-
-
- def rename(self, oldkey, newkey):
- """
- Change a keyname to another, without changing position in sequence.
-
- Implemented so that transformations can be made on keys,
- as well as on values. (used by encode and decode)
-
- Also renames comments.
- """
- if oldkey in self.scalars:
- the_list = self.scalars
- elif oldkey in self.sections:
- the_list = self.sections
- else:
- raise KeyError('Key "%s" not found.' % oldkey)
- pos = the_list.index(oldkey)
- #
- val = self[oldkey]
- dict.__delitem__(self, oldkey)
- dict.__setitem__(self, newkey, val)
- the_list.remove(oldkey)
- the_list.insert(pos, newkey)
- comm = self.comments[oldkey]
- inline_comment = self.inline_comments[oldkey]
- del self.comments[oldkey]
- del self.inline_comments[oldkey]
- self.comments[newkey] = comm
- self.inline_comments[newkey] = inline_comment
-
-
- def walk(self, function, raise_errors=True,
- call_on_sections=False, **keywargs):
- """
- Walk every member and call a function on the keyword and value.
-
- Return a dictionary of the return values
-
- If the function raises an exception, raise the errror
- unless ``raise_errors=False``, in which case set the return value to
- ``False``.
-
- Any unrecognised keyword arguments you pass to walk, will be pased on
- to the function you pass in.
-
- Note: if ``call_on_sections`` is ``True`` then - on encountering a
- subsection, *first* the function is called for the *whole* subsection,
- and then recurses into it's members. This means your function must be
- able to handle strings, dictionaries and lists. This allows you
- to change the key of subsections as well as for ordinary members. The
- return value when called on the whole subsection has to be discarded.
-
- See the encode and decode methods for examples, including functions.
-
- .. admonition:: caution
-
- You can use ``walk`` to transform the names of members of a section
- but you mustn't add or delete members.
-
- >>> config = '''[XXXXsection]
- ... XXXXkey = XXXXvalue'''.splitlines()
- >>> cfg = ConfigObj(config)
- >>> cfg
- ConfigObj({'XXXXsection': {'XXXXkey': 'XXXXvalue'}})
- >>> def transform(section, key):
- ... val = section[key]
- ... newkey = key.replace('XXXX', 'CLIENT1')
- ... section.rename(key, newkey)
- ... if isinstance(val, (tuple, list, dict)):
- ... pass
- ... else:
- ... val = val.replace('XXXX', 'CLIENT1')
- ... section[newkey] = val
- >>> cfg.walk(transform, call_on_sections=True)
- {'CLIENT1section': {'CLIENT1key': None}}
- >>> cfg
- ConfigObj({'CLIENT1section': {'CLIENT1key': 'CLIENT1value'}})
- """
- out = {}
- # scalars first
- for i in range(len(self.scalars)):
- entry = self.scalars[i]
- try:
- val = function(self, entry, **keywargs)
- # bound again in case name has changed
- entry = self.scalars[i]
- out[entry] = val
- except Exception:
- if raise_errors:
- raise
- else:
- entry = self.scalars[i]
- out[entry] = False
- # then sections
- for i in range(len(self.sections)):
- entry = self.sections[i]
- if call_on_sections:
- try:
- function(self, entry, **keywargs)
- except Exception:
- if raise_errors:
- raise
- else:
- entry = self.sections[i]
- out[entry] = False
- # bound again in case name has changed
- entry = self.sections[i]
- # previous result is discarded
- out[entry] = self[entry].walk(
- function,
- raise_errors=raise_errors,
- call_on_sections=call_on_sections,
- **keywargs)
- return out
-
-
- def as_bool(self, key):
- """
- Accepts a key as input. The corresponding value must be a string or
- the objects (``True`` or 1) or (``False`` or 0). We allow 0 and 1 to
- retain compatibility with Python 2.2.
-
- If the string is one of ``True``, ``On``, ``Yes``, or ``1`` it returns
- ``True``.
-
- If the string is one of ``False``, ``Off``, ``No``, or ``0`` it returns
- ``False``.
-
- ``as_bool`` is not case sensitive.
-
- Any other input will raise a ``ValueError``.
-
- >>> a = ConfigObj()
- >>> a['a'] = 'fish'
- >>> a.as_bool('a')
- Traceback (most recent call last):
- ValueError: Value "fish" is neither True nor False
- >>> a['b'] = 'True'
- >>> a.as_bool('b')
- 1
- >>> a['b'] = 'off'
- >>> a.as_bool('b')
- 0
- """
- val = self[key]
- if val == True:
- return True
- elif val == False:
- return False
- else:
- try:
- if not isinstance(val, basestring):
- # TODO: Why do we raise a KeyError here?
- raise KeyError()
- else:
- return self.main._bools[val.lower()]
- except KeyError:
- raise ValueError('Value "%s" is neither True nor False' % val)
-
-
- def as_int(self, key):
- """
- A convenience method which coerces the specified value to an integer.
-
- If the value is an invalid literal for ``int``, a ``ValueError`` will
- be raised.
-
- >>> a = ConfigObj()
- >>> a['a'] = 'fish'
- >>> a.as_int('a')
- Traceback (most recent call last):
- ValueError: invalid literal for int() with base 10: 'fish'
- >>> a['b'] = '1'
- >>> a.as_int('b')
- 1
- >>> a['b'] = '3.2'
- >>> a.as_int('b')
- Traceback (most recent call last):
- ValueError: invalid literal for int() with base 10: '3.2'
- """
- return int(self[key])
-
-
- def as_float(self, key):
- """
- A convenience method which coerces the specified value to a float.
-
- If the value is an invalid literal for ``float``, a ``ValueError`` will
- be raised.
-
- >>> a = ConfigObj()
- >>> a['a'] = 'fish'
- >>> a.as_float('a')
- Traceback (most recent call last):
- ValueError: invalid literal for float(): fish
- >>> a['b'] = '1'
- >>> a.as_float('b')
- 1.0
- >>> a['b'] = '3.2'
- >>> a.as_float('b')
- 3.2000000000000002
- """
- return float(self[key])
-
-
- def as_list(self, key):
- """
- A convenience method which fetches the specified value, guaranteeing
- that it is a list.
-
- >>> a = ConfigObj()
- >>> a['a'] = 1
- >>> a.as_list('a')
- [1]
- >>> a['a'] = (1,)
- >>> a.as_list('a')
- [1]
- >>> a['a'] = [1]
- >>> a.as_list('a')
- [1]
- """
- result = self[key]
- if isinstance(result, (tuple, list)):
- return list(result)
- return [result]
-
-
- def restore_default(self, key):
- """
- Restore (and return) default value for the specified key.
-
- This method will only work for a ConfigObj that was created
- with a configspec and has been validated.
-
- If there is no default value for this key, ``KeyError`` is raised.
- """
- default = self.default_values[key]
- dict.__setitem__(self, key, default)
- if key not in self.defaults:
- self.defaults.append(key)
- return default
-
-
- def restore_defaults(self):
- """
- Recursively restore default values to all members
- that have them.
-
- This method will only work for a ConfigObj that was created
- with a configspec and has been validated.
-
- It doesn't delete or modify entries without default values.
- """
- for key in self.default_values:
- self.restore_default(key)
-
- for section in self.sections:
- self[section].restore_defaults()
-
-
-class ConfigObj(Section):
- """An object to read, create, and write config files."""
-
- _keyword = re.compile(r'''^ # line start
- (\s*) # indentation
- ( # keyword
- (?:".*?")| # double quotes
- (?:'.*?')| # single quotes
- (?:[^'"=].*?) # no quotes
- )
- \s*=\s* # divider
- (.*) # value (including list values and comments)
- $ # line end
- ''',
- re.VERBOSE)
-
- _sectionmarker = re.compile(r'''^
- (\s*) # 1: indentation
- ((?:\[\s*)+) # 2: section marker open
- ( # 3: section name open
- (?:"\s*\S.*?\s*")| # at least one non-space with double quotes
- (?:'\s*\S.*?\s*')| # at least one non-space with single quotes
- (?:[^'"\s].*?) # at least one non-space unquoted
- ) # section name close
- ((?:\s*\])+) # 4: section marker close
- \s*(\#.*)? # 5: optional comment
- $''',
- re.VERBOSE)
-
- # this regexp pulls list values out as a single string
- # or single values and comments
- # FIXME: this regex adds a '' to the end of comma terminated lists
- # workaround in ``_handle_value``
- _valueexp = re.compile(r'''^
- (?:
- (?:
- (
- (?:
- (?:
- (?:".*?")| # double quotes
- (?:'.*?')| # single quotes
- (?:[^'",\#][^,\#]*?) # unquoted
- )
- \s*,\s* # comma
- )* # match all list items ending in a comma (if any)
- )
- (
- (?:".*?")| # double quotes
- (?:'.*?')| # single quotes
- (?:[^'",\#\s][^,]*?)| # unquoted
- (?:(?<!,)) # Empty value
- )? # last item in a list - or string value
- )|
- (,) # alternatively a single comma - empty list
- )
- \s*(\#.*)? # optional comment
- $''',
- re.VERBOSE)
-
- # use findall to get the members of a list value
- _listvalueexp = re.compile(r'''
- (
- (?:".*?")| # double quotes
- (?:'.*?')| # single quotes
- (?:[^'",\#]?.*?) # unquoted
- )
- \s*,\s* # comma
- ''',
- re.VERBOSE)
-
- # this regexp is used for the value
- # when lists are switched off
- _nolistvalue = re.compile(r'''^
- (
- (?:".*?")| # double quotes
- (?:'.*?')| # single quotes
- (?:[^'"\#].*?)| # unquoted
- (?:) # Empty value
- )
- \s*(\#.*)? # optional comment
- $''',
- re.VERBOSE)
-
- # regexes for finding triple quoted values on one line
- _single_line_single = re.compile(r"^'''(.*?)'''\s*(#.*)?$")
- _single_line_double = re.compile(r'^"""(.*?)"""\s*(#.*)?$')
- _multi_line_single = re.compile(r"^(.*?)'''\s*(#.*)?$")
- _multi_line_double = re.compile(r'^(.*?)"""\s*(#.*)?$')
-
- _triple_quote = {
- "'''": (_single_line_single, _multi_line_single),
- '"""': (_single_line_double, _multi_line_double),
- }
-
- # Used by the ``istrue`` Section method
- _bools = {
- 'yes': True, 'no': False,
- 'on': True, 'off': False,
- '1': True, '0': False,
- 'true': True, 'false': False,
- }
-
-
- def __init__(self, infile=None, options=None, configspec=None, encoding=None,
- interpolation=True, raise_errors=False, list_values=True,
- create_empty=False, file_error=False, stringify=True,
- indent_type=None, default_encoding=None, unrepr=False,
- write_empty_values=False, _inspec=False):
- """
- Parse a config file or create a config file object.
-
- ``ConfigObj(infile=None, configspec=None, encoding=None,
- interpolation=True, raise_errors=False, list_values=True,
- create_empty=False, file_error=False, stringify=True,
- indent_type=None, default_encoding=None, unrepr=False,
- write_empty_values=False, _inspec=False)``
- """
- self._inspec = _inspec
- # init the superclass
- Section.__init__(self, self, 0, self)
-
- infile = infile or []
-
- _options = {'configspec': configspec,
- 'encoding': encoding, 'interpolation': interpolation,
- 'raise_errors': raise_errors, 'list_values': list_values,
- 'create_empty': create_empty, 'file_error': file_error,
- 'stringify': stringify, 'indent_type': indent_type,
- 'default_encoding': default_encoding, 'unrepr': unrepr,
- 'write_empty_values': write_empty_values}
-
- if options is None:
- options = _options
- else:
- import warnings
- warnings.warn('Passing in an options dictionary to ConfigObj() is '
- 'deprecated. Use **options instead.',
- DeprecationWarning, stacklevel=2)
-
- # TODO: check the values too.
- for entry in options:
- if entry not in OPTION_DEFAULTS:
- raise TypeError('Unrecognised option "%s".' % entry)
- for entry, value in OPTION_DEFAULTS.items():
- if entry not in options:
- options[entry] = value
- keyword_value = _options[entry]
- if value != keyword_value:
- options[entry] = keyword_value
-
- # XXXX this ignores an explicit list_values = True in combination
- # with _inspec. The user should *never* do that anyway, but still...
- if _inspec:
- options['list_values'] = False
-
- self._initialise(options)
- configspec = options['configspec']
- self._original_configspec = configspec
- self._load(infile, configspec)
-
-
- def _load(self, infile, configspec):
- if isinstance(infile, basestring):
- self.filename = infile
- if os.path.isfile(infile):
- h = open(infile, 'rb')
- infile = h.read() or []
- h.close()
- elif self.file_error:
- # raise an error if the file doesn't exist
- raise IOError('Config file not found: "%s".' % self.filename)
- else:
- # file doesn't already exist
- if self.create_empty:
- # this is a good test that the filename specified
- # isn't impossible - like on a non-existent device
- h = open(infile, 'w')
- h.write('')
- h.close()
- infile = []
-
- elif isinstance(infile, (list, tuple)):
- infile = list(infile)
-
- elif isinstance(infile, dict):
- # initialise self
- # the Section class handles creating subsections
- if isinstance(infile, ConfigObj):
- # get a copy of our ConfigObj
- def set_section(in_section, this_section):
- for entry in in_section.scalars:
- this_section[entry] = in_section[entry]
- for section in in_section.sections:
- this_section[section] = {}
- set_section(in_section[section], this_section[section])
- set_section(infile, self)
-
- else:
- for entry in infile:
- self[entry] = infile[entry]
- del self._errors
-
- if configspec is not None:
- self._handle_configspec(configspec)
- else:
- self.configspec = None
- return
-
- elif getattr(infile, 'read', MISSING) is not MISSING:
- # This supports file like objects
- infile = infile.read() or []
- # needs splitting into lines - but needs doing *after* decoding
- # in case it's not an 8 bit encoding
- else:
- raise TypeError('infile must be a filename, file like object, or list of lines.')
-
- if infile:
- # don't do it for the empty ConfigObj
- infile = self._handle_bom(infile)
- # infile is now *always* a list
- #
- # Set the newlines attribute (first line ending it finds)
- # and strip trailing '\n' or '\r' from lines
- for line in infile:
- if (not line) or (line[-1] not in ('\r', '\n', '\r\n')):
- continue
- for end in ('\r\n', '\n', '\r'):
- if line.endswith(end):
- self.newlines = end
- break
- break
-
- infile = [line.rstrip('\r\n') for line in infile]
-
- self._parse(infile)
- # if we had any errors, now is the time to raise them
- if self._errors:
- info = "at line %s." % self._errors[0].line_number
- if len(self._errors) > 1:
- msg = "Parsing failed with several errors.\nFirst error %s" % info
- error = ConfigObjError(msg)
- else:
- error = self._errors[0]
- # set the errors attribute; it's a list of tuples:
- # (error_type, message, line_number)
- error.errors = self._errors
- # set the config attribute
- error.config = self
- raise error
- # delete private attributes
- del self._errors
-
- if configspec is None:
- self.configspec = None
- else:
- self._handle_configspec(configspec)
-
-
- def _initialise(self, options=None):
- if options is None:
- options = OPTION_DEFAULTS
-
- # initialise a few variables
- self.filename = None
- self._errors = []
- self.raise_errors = options['raise_errors']
- self.interpolation = options['interpolation']
- self.list_values = options['list_values']
- self.create_empty = options['create_empty']
- self.file_error = options['file_error']
- self.stringify = options['stringify']
- self.indent_type = options['indent_type']
- self.encoding = options['encoding']
- self.default_encoding = options['default_encoding']
- self.BOM = False
- self.newlines = None
- self.write_empty_values = options['write_empty_values']
- self.unrepr = options['unrepr']
-
- self.initial_comment = []
- self.final_comment = []
- self.configspec = None
-
- if self._inspec:
- self.list_values = False
-
- # Clear section attributes as well
- Section._initialise(self)
-
-
- def __repr__(self):
- def _getval(key):
- try:
- return self[key]
- except MissingInterpolationOption:
- return dict.__getitem__(self, key)
- return ('ConfigObj({%s})' %
- ', '.join([('%s: %s' % (repr(key), repr(_getval(key))))
- for key in (self.scalars + self.sections)]))
-
-
- def _handle_bom(self, infile):
- """
- Handle any BOM, and decode if necessary.
-
- If an encoding is specified, that *must* be used - but the BOM should
- still be removed (and the BOM attribute set).
-
- (If the encoding is wrongly specified, then a BOM for an alternative
- encoding won't be discovered or removed.)
-
- If an encoding is not specified, UTF8 or UTF16 BOM will be detected and
- removed. The BOM attribute will be set. UTF16 will be decoded to
- unicode.
-
- NOTE: This method must not be called with an empty ``infile``.
-
- Specifying the *wrong* encoding is likely to cause a
- ``UnicodeDecodeError``.
-
- ``infile`` must always be returned as a list of lines, but may be
- passed in as a single string.
- """
- if ((self.encoding is not None) and
- (self.encoding.lower() not in BOM_LIST)):
- # No need to check for a BOM
- # the encoding specified doesn't have one
- # just decode
- return self._decode(infile, self.encoding)
-
- if isinstance(infile, (list, tuple)):
- line = infile[0]
- else:
- line = infile
- if self.encoding is not None:
- # encoding explicitly supplied
- # And it could have an associated BOM
- # TODO: if encoding is just UTF16 - we ought to check for both
- # TODO: big endian and little endian versions.
- enc = BOM_LIST[self.encoding.lower()]
- if enc == 'utf_16':
- # For UTF16 we try big endian and little endian
- for BOM, (encoding, final_encoding) in BOMS.items():
- if not final_encoding:
- # skip UTF8
- continue
- if infile.startswith(BOM):
- ### BOM discovered
- ##self.BOM = True
- # Don't need to remove BOM
- return self._decode(infile, encoding)
-
- # If we get this far, will *probably* raise a DecodeError
- # As it doesn't appear to start with a BOM
- return self._decode(infile, self.encoding)
-
- # Must be UTF8
- BOM = BOM_SET[enc]
- if not line.startswith(BOM):
- return self._decode(infile, self.encoding)
-
- newline = line[len(BOM):]
-
- # BOM removed
- if isinstance(infile, (list, tuple)):
- infile[0] = newline
- else:
- infile = newline
- self.BOM = True
- return self._decode(infile, self.encoding)
-
- # No encoding specified - so we need to check for UTF8/UTF16
- for BOM, (encoding, final_encoding) in BOMS.items():
- if not line.startswith(BOM):
- continue
- else:
- # BOM discovered
- self.encoding = final_encoding
- if not final_encoding:
- self.BOM = True
- # UTF8
- # remove BOM
- newline = line[len(BOM):]
- if isinstance(infile, (list, tuple)):
- infile[0] = newline
- else:
- infile = newline
- # UTF8 - don't decode
- if isinstance(infile, basestring):
- return infile.splitlines(True)
- else:
- return infile
- # UTF16 - have to decode
- return self._decode(infile, encoding)
-
- # No BOM discovered and no encoding specified, just return
- if isinstance(infile, basestring):
- # infile read from a file will be a single string
- return infile.splitlines(True)
- return infile
-
-
- def _a_to_u(self, aString):
- """Decode ASCII strings to unicode if a self.encoding is specified."""
- if self.encoding:
- return aString.decode('ascii')
- else:
- return aString
-
-
- def _decode(self, infile, encoding):
- """
- Decode infile to unicode. Using the specified encoding.
-
- if is a string, it also needs converting to a list.
- """
- if isinstance(infile, basestring):
- # can't be unicode
- # NOTE: Could raise a ``UnicodeDecodeError``
- return infile.decode(encoding).splitlines(True)
- for i, line in enumerate(infile):
- if not isinstance(line, unicode):
- # NOTE: The isinstance test here handles mixed lists of unicode/string
- # NOTE: But the decode will break on any non-string values
- # NOTE: Or could raise a ``UnicodeDecodeError``
- infile[i] = line.decode(encoding)
- return infile
-
-
- def _decode_element(self, line):
- """Decode element to unicode if necessary."""
- if not self.encoding:
- return line
- if isinstance(line, str) and self.default_encoding:
- return line.decode(self.default_encoding)
- return line
-
-
- def _str(self, value):
- """
- Used by ``stringify`` within validate, to turn non-string values
- into strings.
- """
- if not isinstance(value, basestring):
- return str(value)
- else:
- return value
-
-
- def _parse(self, infile):
- """Actually parse the config file."""
- temp_list_values = self.list_values
- if self.unrepr:
- self.list_values = False
-
- comment_list = []
- done_start = False
- this_section = self
- maxline = len(infile) - 1
- cur_index = -1
- reset_comment = False
-
- while cur_index < maxline:
- if reset_comment:
- comment_list = []
- cur_index += 1
- line = infile[cur_index]
- sline = line.strip()
- # do we have anything on the line ?
- if not sline or sline.startswith('#'):
- reset_comment = False
- comment_list.append(line)
- continue
-
- if not done_start:
- # preserve initial comment
- self.initial_comment = comment_list
- comment_list = []
- done_start = True
-
- reset_comment = True
- # first we check if it's a section marker
- mat = self._sectionmarker.match(line)
- if mat is not None:
- # is a section line
- (indent, sect_open, sect_name, sect_close, comment) = mat.groups()
- if indent and (self.indent_type is None):
- self.indent_type = indent
- cur_depth = sect_open.count('[')
- if cur_depth != sect_close.count(']'):
- self._handle_error("Cannot compute the section depth at line %s.",
- NestingError, infile, cur_index)
- continue
-
- if cur_depth < this_section.depth:
- # the new section is dropping back to a previous level
- try:
- parent = self._match_depth(this_section,
- cur_depth).parent
- except SyntaxError:
- self._handle_error("Cannot compute nesting level at line %s.",
- NestingError, infile, cur_index)
- continue
- elif cur_depth == this_section.depth:
- # the new section is a sibling of the current section
- parent = this_section.parent
- elif cur_depth == this_section.depth + 1:
- # the new section is a child the current section
- parent = this_section
- else:
- self._handle_error("Section too nested at line %s.",
- NestingError, infile, cur_index)
-
- sect_name = self._unquote(sect_name)
- if sect_name in parent:
- self._handle_error('Duplicate section name at line %s.',
- DuplicateError, infile, cur_index)
- continue
-
- # create the new section
- this_section = Section(
- parent,
- cur_depth,
- self,
- name=sect_name)
- parent[sect_name] = this_section
- parent.inline_comments[sect_name] = comment
- parent.comments[sect_name] = comment_list
- continue
- #
- # it's not a section marker,
- # so it should be a valid ``key = value`` line
- mat = self._keyword.match(line)
- if mat is None:
- # it neither matched as a keyword
- # or a section marker
- self._handle_error(
- 'Invalid line at line "%s".',
- ParseError, infile, cur_index)
- else:
- # is a keyword value
- # value will include any inline comment
- (indent, key, value) = mat.groups()
- if indent and (self.indent_type is None):
- self.indent_type = indent
- # check for a multiline value
- if value[:3] in ['"""', "'''"]:
- try:
- value, comment, cur_index = self._multiline(
- value, infile, cur_index, maxline)
- except SyntaxError:
- self._handle_error(
- 'Parse error in value at line %s.',
- ParseError, infile, cur_index)
- continue
- else:
- if self.unrepr:
- comment = ''
- try:
- value = unrepr(value)
- except Exception, e:
- if type(e) == UnknownType:
- msg = 'Unknown name or type in value at line %s.'
- else:
- msg = 'Parse error in value at line %s.'
- self._handle_error(msg, UnreprError, infile,
- cur_index)
- continue
- else:
- if self.unrepr:
- comment = ''
- try:
- value = unrepr(value)
- except Exception, e:
- if isinstance(e, UnknownType):
- msg = 'Unknown name or type in value at line %s.'
- else:
- msg = 'Parse error in value at line %s.'
- self._handle_error(msg, UnreprError, infile,
- cur_index)
- continue
- else:
- # extract comment and lists
- try:
- (value, comment) = self._handle_value(value)
- except SyntaxError:
- self._handle_error(
- 'Parse error in value at line %s.',
- ParseError, infile, cur_index)
- continue
- #
- key = self._unquote(key)
- if key in this_section:
- self._handle_error(
- 'Duplicate keyword name at line %s.',
- DuplicateError, infile, cur_index)
- continue
- # add the key.
- # we set unrepr because if we have got this far we will never
- # be creating a new section
- this_section.__setitem__(key, value, unrepr=True)
- this_section.inline_comments[key] = comment
- this_section.comments[key] = comment_list
- continue
- #
- if self.indent_type is None:
- # no indentation used, set the type accordingly
- self.indent_type = ''
-
- # preserve the final comment
- if not self and not self.initial_comment:
- self.initial_comment = comment_list
- elif not reset_comment:
- self.final_comment = comment_list
- self.list_values = temp_list_values
-
-
- def _match_depth(self, sect, depth):
- """
- Given a section and a depth level, walk back through the sections
- parents to see if the depth level matches a previous section.
-
- Return a reference to the right section,
- or raise a SyntaxError.
- """
- while depth < sect.depth:
- if sect is sect.parent:
- # we've reached the top level already
- raise SyntaxError()
- sect = sect.parent
- if sect.depth == depth:
- return sect
- # shouldn't get here
- raise SyntaxError()
-
-
- def _handle_error(self, text, ErrorClass, infile, cur_index):
- """
- Handle an error according to the error settings.
-
- Either raise the error or store it.
- The error will have occured at ``cur_index``
- """
- line = infile[cur_index]
- cur_index += 1
- message = text % cur_index
- error = ErrorClass(message, cur_index, line)
- if self.raise_errors:
- # raise the error - parsing stops here
- raise error
- # store the error
- # reraise when parsing has finished
- self._errors.append(error)
-
-
- def _unquote(self, value):
- """Return an unquoted version of a value"""
- if not value:
- # should only happen during parsing of lists
- raise SyntaxError
- if (value[0] == value[-1]) and (value[0] in ('"', "'")):
- value = value[1:-1]
- return value
-
-
- def _quote(self, value, multiline=True):
- """
- Return a safely quoted version of a value.
-
- Raise a ConfigObjError if the value cannot be safely quoted.
- If multiline is ``True`` (default) then use triple quotes
- if necessary.
-
- * Don't quote values that don't need it.
- * Recursively quote members of a list and return a comma joined list.
- * Multiline is ``False`` for lists.
- * Obey list syntax for empty and single member lists.
-
- If ``list_values=False`` then the value is only quoted if it contains
- a ``\\n`` (is multiline) or '#'.
-
- If ``write_empty_values`` is set, and the value is an empty string, it
- won't be quoted.
- """
- if multiline and self.write_empty_values and value == '':
- # Only if multiline is set, so that it is used for values not
- # keys, and not values that are part of a list
- return ''
-
- if multiline and isinstance(value, (list, tuple)):
- if not value:
- return ','
- elif len(value) == 1:
- return self._quote(value[0], multiline=False) + ','
- return ', '.join([self._quote(val, multiline=False)
- for val in value])
- if not isinstance(value, basestring):
- if self.stringify:
- value = str(value)
- else:
- raise TypeError('Value "%s" is not a string.' % value)
-
- if not value:
- return '""'
-
- no_lists_no_quotes = not self.list_values and '\n' not in value and '#' not in value
- need_triple = multiline and ((("'" in value) and ('"' in value)) or ('\n' in value ))
- hash_triple_quote = multiline and not need_triple and ("'" in value) and ('"' in value) and ('#' in value)
- check_for_single = (no_lists_no_quotes or not need_triple) and not hash_triple_quote
-
- if check_for_single:
- if not self.list_values:
- # we don't quote if ``list_values=False``
- quot = noquot
- # for normal values either single or double quotes will do
- elif '\n' in value:
- # will only happen if multiline is off - e.g. '\n' in key
- raise ConfigObjError('Value "%s" cannot be safely quoted.' % value)
- elif ((value[0] not in wspace_plus) and
- (value[-1] not in wspace_plus) and
- (',' not in value)):
- quot = noquot
- else:
- quot = self._get_single_quote(value)
- else:
- # if value has '\n' or "'" *and* '"', it will need triple quotes
- quot = self._get_triple_quote(value)
-
- if quot == noquot and '#' in value and self.list_values:
- quot = self._get_single_quote(value)
-
- return quot % value
-
-
- def _get_single_quote(self, value):
- if ("'" in value) and ('"' in value):
- raise ConfigObjError('Value "%s" cannot be safely quoted.' % value)
- elif '"' in value:
- quot = squot
- else:
- quot = dquot
- return quot
-
-
- def _get_triple_quote(self, value):
- if (value.find('"""') != -1) and (value.find("'''") != -1):
- raise ConfigObjError('Value "%s" cannot be safely quoted.' % value)
- if value.find('"""') == -1:
- quot = tdquot
- else:
- quot = tsquot
- return quot
-
-
- def _handle_value(self, value):
- """
- Given a value string, unquote, remove comment,
- handle lists. (including empty and single member lists)
- """
- if self._inspec:
- # Parsing a configspec so don't handle comments
- return (value, '')
- # do we look for lists in values ?
- if not self.list_values:
- mat = self._nolistvalue.match(value)
- if mat is None:
- raise SyntaxError()
- # NOTE: we don't unquote here
- return mat.groups()
- #
- mat = self._valueexp.match(value)
- if mat is None:
- # the value is badly constructed, probably badly quoted,
- # or an invalid list
- raise SyntaxError()
- (list_values, single, empty_list, comment) = mat.groups()
- if (list_values == '') and (single is None):
- # change this if you want to accept empty values
- raise SyntaxError()
- # NOTE: note there is no error handling from here if the regex
- # is wrong: then incorrect values will slip through
- if empty_list is not None:
- # the single comma - meaning an empty list
- return ([], comment)
- if single is not None:
- # handle empty values
- if list_values and not single:
- # FIXME: the '' is a workaround because our regex now matches
- # '' at the end of a list if it has a trailing comma
- single = None
- else:
- single = single or '""'
- single = self._unquote(single)
- if list_values == '':
- # not a list value
- return (single, comment)
- the_list = self._listvalueexp.findall(list_values)
- the_list = [self._unquote(val) for val in the_list]
- if single is not None:
- the_list += [single]
- return (the_list, comment)
-
-
- def _multiline(self, value, infile, cur_index, maxline):
- """Extract the value, where we are in a multiline situation."""
- quot = value[:3]
- newvalue = value[3:]
- single_line = self._triple_quote[quot][0]
- multi_line = self._triple_quote[quot][1]
- mat = single_line.match(value)
- if mat is not None:
- retval = list(mat.groups())
- retval.append(cur_index)
- return retval
- elif newvalue.find(quot) != -1:
- # somehow the triple quote is missing
- raise SyntaxError()
- #
- while cur_index < maxline:
- cur_index += 1
- newvalue += '\n'
- line = infile[cur_index]
- if line.find(quot) == -1:
- newvalue += line
- else:
- # end of multiline, process it
- break
- else:
- # we've got to the end of the config, oops...
- raise SyntaxError()
- mat = multi_line.match(line)
- if mat is None:
- # a badly formed line
- raise SyntaxError()
- (value, comment) = mat.groups()
- return (newvalue + value, comment, cur_index)
-
-
- def _handle_configspec(self, configspec):
- """Parse the configspec."""
- # FIXME: Should we check that the configspec was created with the
- # correct settings ? (i.e. ``list_values=False``)
- if not isinstance(configspec, ConfigObj):
- try:
- configspec = ConfigObj(configspec,
- raise_errors=True,
- file_error=True,
- _inspec=True)
- except ConfigObjError, e:
- # FIXME: Should these errors have a reference
- # to the already parsed ConfigObj ?
- raise ConfigspecError('Parsing configspec failed: %s' % e)
- except IOError, e:
- raise IOError('Reading configspec failed: %s' % e)
-
- self.configspec = configspec
-
-
-
- def _set_configspec(self, section, copy):
- """
- Called by validate. Handles setting the configspec on subsections
- including sections to be validated by __many__
- """
- configspec = section.configspec
- many = configspec.get('__many__')
- if isinstance(many, dict):
- for entry in section.sections:
- if entry not in configspec:
- section[entry].configspec = many
-
- for entry in configspec.sections:
- if entry == '__many__':
- continue
- if entry not in section:
- section[entry] = {}
- section[entry]._created = True
- if copy:
- # copy comments
- section.comments[entry] = configspec.comments.get(entry, [])
- section.inline_comments[entry] = configspec.inline_comments.get(entry, '')
-
- # Could be a scalar when we expect a section
- if isinstance(section[entry], Section):
- section[entry].configspec = configspec[entry]
-
-
- def _write_line(self, indent_string, entry, this_entry, comment):
- """Write an individual line, for the write method"""
- # NOTE: the calls to self._quote here handles non-StringType values.
- if not self.unrepr:
- val = self._decode_element(self._quote(this_entry))
- else:
- val = repr(this_entry)
- return '%s%s%s%s%s' % (indent_string,
- self._decode_element(self._quote(entry, multiline=False)),
- self._a_to_u(' = '),
- val,
- self._decode_element(comment))
-
-
- def _write_marker(self, indent_string, depth, entry, comment):
- """Write a section marker line"""
- return '%s%s%s%s%s' % (indent_string,
- self._a_to_u('[' * depth),
- self._quote(self._decode_element(entry), multiline=False),
- self._a_to_u(']' * depth),
- self._decode_element(comment))
-
-
- def _handle_comment(self, comment):
- """Deal with a comment."""
- if not comment:
- return ''
- start = self.indent_type
- if not comment.startswith('#'):
- start += self._a_to_u(' # ')
- return (start + comment)
-
-
- # Public methods
-
- def write(self, outfile=None, section=None):
- """
- Write the current ConfigObj as a file
-
- tekNico: FIXME: use StringIO instead of real files
-
- >>> filename = a.filename
- >>> a.filename = 'test.ini'
- >>> a.write()
- >>> a.filename = filename
- >>> a == ConfigObj('test.ini', raise_errors=True)
- 1
- >>> import os
- >>> os.remove('test.ini')
- """
- if self.indent_type is None:
- # this can be true if initialised from a dictionary
- self.indent_type = DEFAULT_INDENT_TYPE
-
- out = []
- cs = self._a_to_u('#')
- csp = self._a_to_u('# ')
- if section is None:
- int_val = self.interpolation
- self.interpolation = False
- section = self
- for line in self.initial_comment:
- line = self._decode_element(line)
- stripped_line = line.strip()
- if stripped_line and not stripped_line.startswith(cs):
- line = csp + line
- out.append(line)
-
- indent_string = self.indent_type * section.depth
- for entry in (section.scalars + section.sections):
- if entry in section.defaults:
- # don't write out default values
- continue
- for comment_line in section.comments[entry]:
- comment_line = self._decode_element(comment_line.lstrip())
- if comment_line and not comment_line.startswith(cs):
- comment_line = csp + comment_line
- out.append(indent_string + comment_line)
- this_entry = section[entry]
- comment = self._handle_comment(section.inline_comments[entry])
-
- if isinstance(this_entry, dict):
- # a section
- out.append(self._write_marker(
- indent_string,
- this_entry.depth,
- entry,
- comment))
- out.extend(self.write(section=this_entry))
- else:
- out.append(self._write_line(
- indent_string,
- entry,
- this_entry,
- comment))
-
- if section is self:
- for line in self.final_comment:
- line = self._decode_element(line)
- stripped_line = line.strip()
- if stripped_line and not stripped_line.startswith(cs):
- line = csp + line
- out.append(line)
- self.interpolation = int_val
-
- if section is not self:
- return out
-
- if (self.filename is None) and (outfile is None):
- # output a list of lines
- # might need to encode
- # NOTE: This will *screw* UTF16, each line will start with the BOM
- if self.encoding:
- out = [l.encode(self.encoding) for l in out]
- if (self.BOM and ((self.encoding is None) or
- (BOM_LIST.get(self.encoding.lower()) == 'utf_8'))):
- # Add the UTF8 BOM
- if not out:
- out.append('')
- out[0] = BOM_UTF8 + out[0]
- return out
-
- # Turn the list to a string, joined with correct newlines
- newline = self.newlines or os.linesep
- if (getattr(outfile, 'mode', None) is not None and outfile.mode == 'w'
- and sys.platform == 'win32' and newline == '\r\n'):
- # Windows specific hack to avoid writing '\r\r\n'
- newline = '\n'
- output = self._a_to_u(newline).join(out)
- if self.encoding:
- output = output.encode(self.encoding)
- if self.BOM and ((self.encoding is None) or match_utf8(self.encoding)):
- # Add the UTF8 BOM
- output = BOM_UTF8 + output
-
- if not output.endswith(newline):
- output += newline
- if outfile is not None:
- outfile.write(output)
- else:
- h = open(self.filename, 'wb')
- h.write(output)
- h.close()
-
-
- def validate(self, validator, preserve_errors=False, copy=False,
- section=None):
- """
- Test the ConfigObj against a configspec.
-
- It uses the ``validator`` object from *validate.py*.
-
- To run ``validate`` on the current ConfigObj, call: ::
-
- test = config.validate(validator)
-
- (Normally having previously passed in the configspec when the ConfigObj
- was created - you can dynamically assign a dictionary of checks to the
- ``configspec`` attribute of a section though).
-
- It returns ``True`` if everything passes, or a dictionary of
- pass/fails (True/False). If every member of a subsection passes, it
- will just have the value ``True``. (It also returns ``False`` if all
- members fail).
-
- In addition, it converts the values from strings to their native
- types if their checks pass (and ``stringify`` is set).
-
- If ``preserve_errors`` is ``True`` (``False`` is default) then instead
- of a marking a fail with a ``False``, it will preserve the actual
- exception object. This can contain info about the reason for failure.
- For example the ``VdtValueTooSmallError`` indicates that the value
- supplied was too small. If a value (or section) is missing it will
- still be marked as ``False``.
-
- You must have the validate module to use ``preserve_errors=True``.
-
- You can then use the ``flatten_errors`` function to turn your nested
- results dictionary into a flattened list of failures - useful for
- displaying meaningful error messages.
- """
- if section is None:
- if self.configspec is None:
- raise ValueError('No configspec supplied.')
- if preserve_errors:
- # We do this once to remove a top level dependency on the validate module
- # Which makes importing configobj faster
- from validate import VdtMissingValue
- self._vdtMissingValue = VdtMissingValue
-
- section = self
-
- if copy:
- section.initial_comment = section.configspec.initial_comment
- section.final_comment = section.configspec.final_comment
- section.encoding = section.configspec.encoding
- section.BOM = section.configspec.BOM
- section.newlines = section.configspec.newlines
- section.indent_type = section.configspec.indent_type
-
- #
- # section.default_values.clear() #??
- configspec = section.configspec
- self._set_configspec(section, copy)
-
-
- def validate_entry(entry, spec, val, missing, ret_true, ret_false):
- section.default_values.pop(entry, None)
-
- try:
- section.default_values[entry] = validator.get_default_value(configspec[entry])
- except (KeyError, AttributeError, validator.baseErrorClass):
- # No default, bad default or validator has no 'get_default_value'
- # (e.g. SimpleVal)
- pass
-
- try:
- check = validator.check(spec,
- val,
- missing=missing
- )
- except validator.baseErrorClass, e:
- if not preserve_errors or isinstance(e, self._vdtMissingValue):
- out[entry] = False
- else:
- # preserve the error
- out[entry] = e
- ret_false = False
- ret_true = False
- else:
- ret_false = False
- out[entry] = True
- if self.stringify or missing:
- # if we are doing type conversion
- # or the value is a supplied default
- if not self.stringify:
- if isinstance(check, (list, tuple)):
- # preserve lists
- check = [self._str(item) for item in check]
- elif missing and check is None:
- # convert the None from a default to a ''
- check = ''
- else:
- check = self._str(check)
- if (check != val) or missing:
- section[entry] = check
- if not copy and missing and entry not in section.defaults:
- section.defaults.append(entry)
- return ret_true, ret_false
-
- #
- out = {}
- ret_true = True
- ret_false = True
-
- unvalidated = [k for k in section.scalars if k not in configspec]
- incorrect_sections = [k for k in configspec.sections if k in section.scalars]
- incorrect_scalars = [k for k in configspec.scalars if k in section.sections]
-
- for entry in configspec.scalars:
- if entry in ('__many__', '___many___'):
- # reserved names
- continue
- if (not entry in section.scalars) or (entry in section.defaults):
- # missing entries
- # or entries from defaults
- missing = True
- val = None
- if copy and entry not in section.scalars:
- # copy comments
- section.comments[entry] = (
- configspec.comments.get(entry, []))
- section.inline_comments[entry] = (
- configspec.inline_comments.get(entry, ''))
- #
- else:
- missing = False
- val = section[entry]
-
- ret_true, ret_false = validate_entry(entry, configspec[entry], val,
- missing, ret_true, ret_false)
-
- many = None
- if '__many__' in configspec.scalars:
- many = configspec['__many__']
- elif '___many___' in configspec.scalars:
- many = configspec['___many___']
-
- if many is not None:
- for entry in unvalidated:
- val = section[entry]
- ret_true, ret_false = validate_entry(entry, many, val, False,
- ret_true, ret_false)
- unvalidated = []
-
- for entry in incorrect_scalars:
- ret_true = False
- if not preserve_errors:
- out[entry] = False
- else:
- ret_false = False
- msg = 'Value %r was provided as a section' % entry
- out[entry] = validator.baseErrorClass(msg)
- for entry in incorrect_sections:
- ret_true = False
- if not preserve_errors:
- out[entry] = False
- else:
- ret_false = False
- msg = 'Section %r was provided as a single value' % entry
- out[entry] = validator.baseErrorClass(msg)
-
- # Missing sections will have been created as empty ones when the
- # configspec was read.
- for entry in section.sections:
- # FIXME: this means DEFAULT is not copied in copy mode
- if section is self and entry == 'DEFAULT':
- continue
- if section[entry].configspec is None:
- unvalidated.append(entry)
- continue
- if copy:
- section.comments[entry] = configspec.comments.get(entry, [])
- section.inline_comments[entry] = configspec.inline_comments.get(entry, '')
- check = self.validate(validator, preserve_errors=preserve_errors, copy=copy, section=section[entry])
- out[entry] = check
- if check == False:
- ret_true = False
- elif check == True:
- ret_false = False
- else:
- ret_true = False
-
- section.extra_values = unvalidated
- if preserve_errors and not section._created:
- # If the section wasn't created (i.e. it wasn't missing)
- # then we can't return False, we need to preserve errors
- ret_false = False
- #
- if ret_false and preserve_errors and out:
- # If we are preserving errors, but all
- # the failures are from missing sections / values
- # then we can return False. Otherwise there is a
- # real failure that we need to preserve.
- ret_false = not any(out.values())
- if ret_true:
- return True
- elif ret_false:
- return False
- return out
-
-
def reset(self):
    """Empty this ConfigObj and put it back into its 'freshly created' state."""
    self.clear()
    self._initialise()
    # FIXME: '_initialise' ought to do this itself, but the ConfigObj
    #        constructor (and reload) require an empty dictionary.
    # Dropping the original configspec as well is just to be sure.
    self.configspec = self._original_configspec = None
-
-
def reload(self):
    """
    Re-read this ConfigObj from the file it was loaded from.

    A ``ReloadError`` is raised when the ``filename`` attribute is not a
    string (i.e. the ConfigObj is not backed by a named file).
    """
    source = self.filename
    if not isinstance(source, basestring):
        raise ReloadError()

    # Snapshot every current option except the configspec, which is taken
    # from the originally supplied one rather than the live attribute.
    saved_options = dict(
        (name, getattr(self, name))
        for name in OPTION_DEFAULTS
        if name != 'configspec'
    )
    spec = self._original_configspec
    saved_options['configspec'] = spec

    self.clear()
    self._initialise(saved_options)
    self._load(source, spec)
-
-
-
class SimpleVal(object):
    """
    A minimal validator that only checks for the presence of members.

    Give ``ConfigObj`` a configspec that lists every expected member (the
    values in the configspec are ignored) and pass a ``SimpleVal`` instance
    to ``validate``. The result is ``True`` when nothing is missing, or a
    dictionary of True/False flags meaning present/missing. (A section that
    is missing entirely is reported as ``False``.)
    """

    def __init__(self):
        self.baseErrorClass = ConfigObjError

    def check(self, check, member, missing=False):
        """Dummy check: hand ``member`` back untouched, or raise if it is missing."""
        if not missing:
            return member
        raise self.baseErrorClass()
-
-
def flatten_errors(cfg, res, levels=None, results=None):
    """
    Flatten the nested result of ``ConfigObj.validate`` into a list.

    ``cfg`` is the ConfigObj that was validated and ``res`` is whatever
    ``validate`` returned for it. ``levels`` and ``results`` are used by the
    recursion only; callers should leave them alone.

    The return value is a list with one tuple per failure::

        ([list of sections...], key, result)

    * *list of sections* is the path of section names leading to the key.
    * ``key`` is ``None`` when a whole section is at fault (it was missing,
      or a scalar was supplied where a section was expected or vice-versa).
    * ``result`` is ``False`` for a missing value or section. With
      ``preserve_errors=False`` (the default for ``validate``) it is always
      ``False``; with ``preserve_errors=True`` a value that was present but
      failed its check yields the exception object, which can be used as a
      description of the failure, e.g. *The value "3" is of the wrong type*.
    """
    if levels is None:
        # Outermost call: start with fresh accumulators.
        levels, results = [], []

    if res == True:
        return results

    if res == False or isinstance(res, Exception):
        # The section as a whole failed; there is no individual key to report.
        results.append((list(levels), None, res))
    else:
        for key, val in res.items():
            if val == True:
                continue
            if isinstance(cfg.get(key), dict):
                # Descend into the sub-section; the recursive call pops
                # ``key`` off ``levels`` again before it returns.
                levels.append(key)
                flatten_errors(cfg[key], val, levels, results)
            else:
                results.append((list(levels), key, val))

    # Step back up one level for the caller.
    if levels:
        levels.pop()
    return results
-
-
def get_extra_values(conf, _prepend=()):
    """
    Collect every value and section of a validated ConfigObj that is not
    covered by its configspec.

    The result is a list of ``(section_path, name)`` tuples, one per extra
    value or extra section. ``section_path`` is a tuple of section names:
    ``()`` for the top level, ``('foo',)`` for members of section 'foo',
    ``('foo', 'bar')`` for members of subsection 'bar' of 'foo', and so on.

    NOTE: a ConfigObj that has not been validated yields an empty list.
    """
    found = [(_prepend, name) for name in conf.extra_values]
    for name in conf.sections:
        if name in conf.extra_values:
            # The whole section is already reported as extra; don't descend.
            continue
        found.extend(get_extra_values(conf[name], _prepend + (name,)))
    return found
-
-
-"""*A programming language is a medium of expression.* - Paul Graham"""
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
deleted file mode 100644
index 14ba5ecb..00000000
--- a/python/pkg/cdec/sa/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
- encode_words, decode_words, isvar,\
- SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
- HieroCachingRuleFactory, Sampler, Scorer
-from cdec.sa.extractor import GrammarExtractor
-
# Module-level registries filled in by the decorators below.
_SA_FEATURES = []
_SA_ANNOTATORS = {}
_SA_CONFIGURE = []


def feature(fn):
    """Register ``fn`` as an additional feature function and return it."""
    _SA_FEATURES.append(fn)
    return fn


def annotator(fn):
    """Register ``fn`` as a sentence annotator under its own ``__name__``.

    ``fn`` is returned so that the decorated name stays bound to the
    function (previously it was rebound to ``None``).
    """
    _SA_ANNOTATORS[fn.__name__] = fn
    return fn


def annotate(sentence):
    """Run every registered annotator on ``sentence``.

    Returns a dict mapping each annotator's name to its result.
    """
    # ``items()`` rather than ``iteritems()`` so this works on Python 2 and 3.
    return dict((name, fn(sentence)) for name, fn in _SA_ANNOTATORS.items())


def configure(fn):
    """Register ``fn`` as a configuration hook and return it.

    ``fn`` is returned so that the decorated name stays bound to the
    function (previously it was rebound to ``None``).
    """
    _SA_CONFIGURE.append(fn)
    return fn
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
deleted file mode 100644
index d4cd8387..00000000
--- a/python/pkg/cdec/sa/compile.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import os
-import logging
-import cdec.configobj
-import cdec.sa
-from cdec.sa._sa import monitor_cpu
-import sys
-
-MAX_PHRASE_LENGTH = 4
def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phrases):
    """Build the ``Precomputation`` object for the source suffix array ``f_sa``.

    Phrase statistics are computed from the LCP array up to
    ``MAX_PHRASE_LENGTH`` and handed over sorted in descending order.
    ``tight_phrases`` is accepted (so the full parameter tuple can be
    splatted in) but is not used here.
    """
    phrase_stats = cdec.sa.LCP(f_sa).compute_stats(MAX_PHRASE_LENGTH)
    phrase_stats = sorted(phrase_stats, reverse=True)
    return cdec.sa.Precomputation(from_stats=phrase_stats,
                                  fsarray=f_sa,
                                  precompute_rank=rank1,
                                  precompute_secondary_rank=rank2,
                                  max_length=max_len,
                                  max_nonterminals=max_nt,
                                  train_max_initial_size=max_size,
                                  train_min_gap_size=min_gap)
-
def main():
    """Compile a parallel corpus into the binary files used for extraction.

    Writes the source suffix array, target data array, precomputed frequent
    phrases, alignment and bilexical dictionary into the output directory,
    then writes a configuration file pointing at them.
    """
    preprocess_start_time = monitor_cpu()
    sys.setrecursionlimit(sys.getrecursionlimit() * 100)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('cdec.sa.compile')
    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
    parser.add_argument('--maxnt', '-n', type=int, default=2,
                        help='Maximum number of non-terminal symbols')
    parser.add_argument('--maxlen', '-l', type=int, default=5,
                        help='Maximum number of terminals')
    parser.add_argument('--maxsize', '-s', type=int, default=15,
                        help='Maximum rule span')
    parser.add_argument('--mingap', '-g', type=int, default=1,
                        help='Minimum gap size')
    parser.add_argument('--rank1', '-r1', type=int, default=100,
                        help='Number of pre-computed frequent patterns')
    parser.add_argument('--rank2', '-r2', type=int, default=10,
                        help='Number of pre-computed super-frequent patterns')
    parser.add_argument('--loose', action='store_true',
                        help='Enable loose phrase extraction (default: tight)')
    parser.add_argument('-c', '--config', default='/dev/stdout',
                        help='Output configuration')
    parser.add_argument('-f', '--source',
                        help='Source language corpus')
    parser.add_argument('-e', '--target',
                        help='Target language corpus')
    parser.add_argument('-b', '--bitext',
                        help='Parallel text (source ||| target)')
    parser.add_argument('-a', '--alignment', required=True,
                        help='Bitext word alignment')
    parser.add_argument('-o', '--output', required=True,
                        help='Output path')
    args = parser.parse_args()

    if not ((args.source and args.target) or args.bitext):
        parser.error('a parallel corpus is required\n'
                     '\tuse -f (source) with -e (target) or -b (bitext)')

    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
                   'rank1', 'rank2', 'tight_phrases')
    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
              args.rank1, args.rank2, not args.loose)

    # makedirs (rather than mkdir) so a nested output path can be created.
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
    e_bin = os.path.join(args.output, 'e.bin')
    # Only the first six parameters are part of the file name;
    # tight_phrases is deliberately left out of the format string.
    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
    precomp_bin = os.path.join(args.output, precomp_file)
    a_bin = os.path.join(args.output, 'a.bin')
    lex_bin = os.path.join(args.output, 'lex.bin')

    start_time = monitor_cpu()
    logger.info('Compiling source suffix array')
    if args.bitext:
        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
    else:
        f_sa = cdec.sa.SuffixArray(from_text=args.source)
    f_sa.write_binary(f_sa_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling target data array')
    if args.bitext:
        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
    else:
        e = cdec.sa.DataArray(from_text=args.target)
    e.write_binary(e_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling target data array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Precomputing frequent phrases')
    precompute(f_sa, *params).write_binary(precomp_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling precomputations took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling alignment')
    a = cdec.sa.Alignment(from_text=args.alignment)
    a.write_binary(a_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling alignment took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary')
    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
    lex.write_binary(lex_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time)

    # Write configuration
    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
    config['f_sa_file'] = os.path.abspath(f_sa_bin)
    config['e_file'] = os.path.abspath(e_bin)
    config['a_file'] = os.path.abspath(a_bin)
    config['lex_file'] = os.path.abspath(lex_bin)
    config['precompute_file'] = os.path.abspath(precomp_bin)
    for name, value in zip(param_names, params):
        config[name] = value
    config.write()
    preprocess_stop_time = monitor_cpu()
    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)
-
-if __name__ == '__main__':
- main()
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
deleted file mode 100644
index b6502c52..00000000
--- a/python/pkg/cdec/sa/extract.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-import sys
-import os
-import re
-import gzip
-import argparse
-import logging
-import signal
-import multiprocessing as mp
-import cdec.sa
-from cdec.sa._sa import monitor_cpu
-
-extractor, prefix = None, None
-online, compress = False, False
-
def make_extractor(args):
    """Initialise the per-process extractor state from the parsed arguments.

    Used both directly and as the ``multiprocessing.Pool`` initializer, so
    everything is stored in module-level globals.
    """
    global extractor, prefix, online, compress
    signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
    load_features(args.features)
    # Set the flags from ``args`` *before* building the extractor, so it is
    # created with the requested online mode rather than whatever the global
    # happened to hold when this function was entered.
    prefix = args.grammars
    online = args.online
    compress = args.compress
    extractor = cdec.sa.GrammarExtractor(args.config, online)
-
def load_features(features):
    """Import each feature definition file in ``features`` as a module.

    Every entry is a path to a ``.py`` file. Its directory is appended to
    ``sys.path`` for the duration of the import only, and is removed again
    even if the import raises.
    """
    for featdef in features:
        logging.info('Loading additional feature definitions from %s', featdef)
        prefix = os.path.dirname(featdef)
        # Strip only the trailing extension; a '.py' elsewhere in the file
        # name must not be removed.
        module_name = os.path.splitext(os.path.basename(featdef))[0]
        sys.path.append(prefix)
        try:
            __import__(module_name)
        finally:
            sys.path.remove(prefix)
-
def extract(inp):
    """Extract a grammar for one input line and return its ``<seg>`` markup.

    ``inp`` is an ``(index, line)`` pair. The line may carry extra
    ``|||``-separated fields: in online mode the first three are sentence,
    reference and alignment; otherwise only the first is the sentence. Any
    remaining fields are passed through after the ``</seg>`` tag.

    The grammar is written to ``grammar.<index>`` (gzip-compressed when the
    ``compress`` flag is set) under the ``prefix`` directory.
    """
    i, sentence = inp
    # Drop the line terminator only if there is one; the last line of the
    # input may lack it and must not lose a real character.
    if sentence.endswith('\n'):
        sentence = sentence[:-1]
    fields = re.split(r'\s*\|\|\|\s*', sentence)
    suffix = ''
    # (reference, alignment) when an online training instance is available
    instance = None
    # 3 fields for online mode, 1 for normal
    if online:
        if len(fields) < 3:
            sys.stderr.write('Error: online mode requires references and alignments.'
                             ' Not adding sentence to training data: {}\n'.format(sentence))
            sentence = fields[0]
        else:
            sentence, reference, alignment = fields[0:3]
            instance = (reference, alignment)
            if len(fields) > 3:
                suffix = ' ||| ' + ' ||| '.join(fields[3:])
    else:
        if len(fields) > 1:
            sentence = fields[0]
            suffix = ' ||| ' + ' ||| '.join(fields[1:])

    grammar_file = os.path.join(prefix, 'grammar.'+str(i))
    if compress: grammar_file += '.gz'
    with (gzip.open if compress else open)(grammar_file, 'w') as output:
        for rule in extractor.grammar(sentence):
            output.write(str(rule)+'\n')
    # Add training instance _after_ extracting grammars. Skipped when the
    # line was malformed (reported above) instead of failing on unbound names.
    if instance is not None:
        extractor.add_instance(sentence, instance[0], instance[1])
    grammar_file = os.path.abspath(grammar_file)
    return '<seg grammar="{}" id="{}">{}</seg>{}'.format(grammar_file, i, sentence, suffix)
-
def main():
    """Read sentences from stdin, extract a grammar for each, print ``<seg>`` lines.

    With ``--jobs`` > 1 the work is spread over a ``multiprocessing.Pool``
    whose workers are initialised by ``make_extractor``; otherwise the
    extraction runs in this process.
    """
    global online
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
    parser.add_argument('-c', '--config', required=True,
                        help='extractor configuration')
    parser.add_argument('-g', '--grammars', required=True,
                        help='grammar output path')
    parser.add_argument('-j', '--jobs', type=int, default=1,
                        help='number of parallel extractors')
    parser.add_argument('-s', '--chunksize', type=int, default=10,
                        help='number of sentences / chunk')
    parser.add_argument('-f', '--features', nargs='*', default=[],
                        help='additional feature definitions')
    parser.add_argument('-o', '--online', action='store_true',
                        help='online grammar extraction')
    parser.add_argument('-z', '--compress', action='store_true',
                        help='compress grammars with gzip')
    args = parser.parse_args()

    # makedirs (rather than mkdir) so a nested grammar path can be created.
    if not os.path.exists(args.grammars):
        os.makedirs(args.grammars)
    for featdef in args.features:
        if not featdef.endswith('.py'):
            sys.stderr.write('Error: feature definition file <{}>'
                             ' should be a python module\n'.format(featdef))
            sys.exit(1)

    online = args.online

    start_time = monitor_cpu()
    if args.jobs > 1:
        logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
        pool = mp.Pool(args.jobs, make_extractor, (args,))
        try:
            for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
                print(output)
        except KeyboardInterrupt:
            pool.terminate()
        else:
            # Normal completion: let the workers exit cleanly.
            pool.close()
        pool.join()
    else:
        make_extractor(args)
        # Plain loop instead of map(): on Python 2 map() would read all of
        # stdin and extract every grammar before printing anything.
        for inp in enumerate(sys.stdin):
            print(extract(inp))

    stop_time = monitor_cpu()
    logging.info("Overall extraction step took %f seconds", stop_time - start_time)
-
-if __name__ == '__main__':
- main()
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
deleted file mode 100644
index acc13cbc..00000000
--- a/python/pkg/cdec/sa/extractor.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from itertools import chain
-import os, sys
-import cdec.configobj
-from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
- MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
- IsSupportedOnline
-import cdec.sa
-
-# maximum span of a grammar rule in TEST DATA
-MAX_INITIAL_SIZE = 15
-
class GrammarExtractor:
    """Per-sentence grammar extractor built from a compiled corpus configuration."""

    def __init__(self, config, online=False, features=None):
        """Load the compiled corpus described by ``config`` and set up the rule factory.

        ``config`` is either a mapping of settings or the path of a
        configuration file to read. ``online`` adds the ``IsSupportedOnline``
        feature. ``features`` is accepted but not used here.
        """
        if isinstance(config, basestring):
            if not os.path.exists(config):
                raise IOError('cannot read configuration from {0}'.format(config))
            config = cdec.configobj.ConfigObj(config, unrepr=True)

        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
        self.factory = cdec.sa.HieroCachingRuleFactory(
                # compiled alignment object (REQUIRED)
                alignment,
                # name of generic nonterminal used by Hiero
                category="[X]",
                # maximum number of contiguous chunks of terminal symbols in RHS of a rule
                max_chunks=config['max_nt']+1,
                # maximum span of a grammar rule in TEST DATA
                max_initial_size=MAX_INITIAL_SIZE,
                # maximum number of symbols (both T and NT) allowed in a rule
                max_length=config['max_len'],
                # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
                max_nonterminals=config['max_nt'],
                # maximum number of contiguous chunks of terminal symbols
                # in target-side RHS of a rule.
                max_target_chunks=config['max_nt']+1,
                # maximum number of target side symbols (both T and NT) allowed in a rule.
                max_target_length=MAX_INITIAL_SIZE,
                # minimum span of a nonterminal in the RHS of a rule in TEST DATA
                min_gap_size=1,
                # filename of file containing precomputed collocations
                precompute_file=config['precompute_file'],
                # maximum frequency rank of patterns used to compute triples (< 20)
                precompute_secondary_rank=config['rank2'],
                # maximum frequency rank of patterns used to compute collocations (< 300)
                precompute_rank=config['rank1'],
                # require extracted rules to have at least one aligned word
                require_aligned_terminal=True,
                # require each contiguous chunk of extracted rules
                # to have at least one aligned word
                require_aligned_chunks=False,
                # maximum span of a grammar rule extracted from TRAINING DATA
                train_max_initial_size=config['max_size'],
                # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
                train_min_gap_size=config['min_gap'],
                # False if phrases should be loose (better but slower), True otherwise
                tight_phrases=config.get('tight_phrases', True),
                )

        # lexical weighting tables
        tt = cdec.sa.BiLex(from_binary=config['lex_file'])

        # TODO: clean this up
        # TODO: use @cdec.sa.features decorator for standard features too
        # + add a mask to disable features
        extended_features = [IsSupportedOnline] if online else []
        extended_features.extend(cdec.sa._SA_FEATURES)

        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
            *extended_features)

        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
        edarray = cdec.sa.DataArray(from_binary=config['e_file'])

        # lower=faster, higher=better; improvements level off above 200-300 range,
        # -1 = don't sample, use all data (VERY SLOW!)
        sampler = cdec.sa.Sampler(300, fsarray)

        self.factory.configure(fsarray, edarray, sampler, scorer)
        # Initialize feature definitions with configuration
        for hook in cdec.sa._SA_CONFIGURE:
            hook(config)

    def grammar(self, sentence):
        """Return the factory's output for ``sentence`` wrapped in ``<s>``/``</s>``."""
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf8')
        words = ('<s>',) + tuple(sentence.split()) + ('</s>',)
        meta = cdec.sa.annotate(words)
        cnet = cdec.sa.make_lattice(words)
        return self.factory.input(cnet, meta)

    # Add training instance to data
    def add_instance(self, sentence, reference, alignment):
        """Add one (source, target, alignment) training instance to the factory.

        ``alignment`` is whitespace-separated ``i-j`` index pairs; they are
        passed on as a sorted list of integer tuples.
        """
        f_words = cdec.sa.encode_words(sentence.split())
        e_words = cdec.sa.encode_words(reference.split())
        links = [tuple(int(idx) for idx in link.split('-'))
                 for link in alignment.split()]
        links.sort()
        self.factory.add_instance(f_words, e_words, links)

    # Debugging
    def dump_online_stats(self):
        self.factory.dump_online_stats()

    def dump_online_rules(self):
        self.factory.dump_online_rules()
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
deleted file mode 100644
index c8fc1cca..00000000
--- a/python/pkg/cdec/sa/features.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from __future__ import division
-import math
-
-from cdec.sa import isvar
-
-MAXSCORE = 99
-
def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
    """Return -log10 of the pair count over the source count.

    When ``ctx.online`` is set, its counts are added to both terms.
    """
    online = ctx.online
    if online:
        ratio = (ctx.paircount + online.paircount) / (ctx.fcount + online.fcount)
    else:
        ratio = ctx.paircount/ctx.fcount
    return -math.log10(ratio)
-
def CountEF(ctx): # c(e, f)
    """Return log10 of one plus the pair count (plus the online pair count, if any)."""
    total = 1 + ctx.paircount
    if ctx.online:
        total = total + ctx.online.paircount
    return math.log10(total)
-
def SampleCountF(ctx): # sample c(f)
    """Return log10 of one plus the sampled source count (plus the online one, if any)."""
    total = 1 + ctx.fsample_count
    if ctx.online:
        total = total + ctx.online.fsample_count
    return math.log10(total)
-
def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
    """Return -log10 of pair count over sampled source count.

    Online counts are added to both terms when ``ctx.online`` is set. A
    non-positive ratio yields ``MAXSCORE`` instead of taking the logarithm.
    """
    online = ctx.online
    if online:
        ratio = (ctx.paircount + online.paircount) / (ctx.fsample_count + online.fsample_count)
    else:
        ratio = ctx.paircount/ctx.fsample_count
    if ratio > 0:
        return -math.log10(ratio)
    return MAXSCORE
-
def CoherenceProb(ctx): # c(f) / sample c(f)
    """Return -log10 of the source count over the sampled source count.

    Online counts are added to both terms when ``ctx.online`` is set.
    """
    online = ctx.online
    if online:
        ratio = (ctx.fcount + online.fcount) / (ctx.fsample_count + online.fsample_count)
    else:
        ratio = ctx.fcount/ctx.fsample_count
    return -math.log10(ratio)
-
def MaxLexEgivenF(ttable):
    """Build the MaxLexEgivenF feature function on top of ``ttable``.

    The returned function sums, over every target word, -log10 of the best
    ``ttable.get_score(f, e, 0)`` among the source words plus ``'NULL'``;
    a target word whose best score is not positive contributes ``MAXSCORE``.
    """
    def MaxLexEgivenF(ctx):
        # Work on a copy: appending 'NULL' to ctx.fphrase.words in place
        # would modify the caller's list if that attribute is shared.
        fwords = list(ctx.fphrase.words) + ['NULL']
        # Only the offline table is used for now. The former online
        # averaging branch sat behind a condition that was always true
        # ("not ctx.online or ctx.online"), so it could never run.
        maxOffScore = 0.0
        for e in ctx.ephrase.words:
            maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return maxOffScore
    return MaxLexEgivenF
-
def MaxLexFgivenE(ttable):
    """Build the MaxLexFgivenE feature function on top of ``ttable``.

    The returned function sums, over every source word, -log10 of the best
    ``ttable.get_score(f, e, 1)`` among the target words plus ``'NULL'``;
    a source word whose best score is not positive contributes ``MAXSCORE``.
    """
    def MaxLexFgivenE(ctx):
        # Work on a copy: appending 'NULL' to ctx.ephrase.words in place
        # would modify the caller's list if that attribute is shared.
        ewords = list(ctx.ephrase.words) + ['NULL']
        # Only the offline table is used for now. The former online
        # averaging branch sat behind a condition that was always true
        # ("not ctx.online or ctx.online"), so it could never run.
        maxOffScore = 0.0
        for f in ctx.fphrase.words:
            maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return maxOffScore
    return MaxLexFgivenE
-
def IsSingletonF(ctx):
    """True when the source count (plus the online count, if any) is 1 within 1e-6."""
    total = ctx.fcount
    if ctx.online:
        total = total + ctx.online.fcount
    return math.fabs(total - 1) < 1e-6
-
def IsSingletonFE(ctx):
    """True when the pair count (plus the online pair count, if any) equals 1."""
    total = ctx.paircount
    if ctx.online:
        total = total + ctx.online.paircount
    return (total == 1)
-
def IsNotSingletonF(ctx):
    """True when the source count (plus the online count, if any) exceeds 1."""
    total = ctx.fcount
    if ctx.online:
        total = total + ctx.online.fcount
    return (total > 1)
-
def IsNotSingletonFE(ctx):
    """True when the pair count (plus the online pair count, if any) exceeds 1.

    The combined count is what gets tested; previously it was computed but
    the offline ``ctx.paircount`` alone was compared, ignoring online data.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    return (count > 1)
-
def IsFEGreaterThanZero(ctx):
    """True when the pair count (plus the online pair count, if any) exceeds 0.01.

    The combined count is what gets tested; previously it was computed but
    the offline ``ctx.paircount`` alone was compared, ignoring online data.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    return (count > 0.01)
-
def IsSupportedOnline(ctx): # Occurs in online data?
    """True when online data is present and its pair count exceeds 0.01."""
    if not ctx.online:
        return False
    return (ctx.online.paircount > 0.01)
diff --git a/python/pkg/cdec/score.py b/python/pkg/cdec/score.py
deleted file mode 100644
index 657b4547..00000000
--- a/python/pkg/cdec/score.py
+++ /dev/null
@@ -1 +0,0 @@
-from _cdec import BLEU, TER, CER, SSK, QCRI, Metric, Scorer