From ee5e376e263d9aeabdeee6968b4457f53d3fc772 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Fri, 27 Jul 2012 23:33:45 -0400 Subject: [python] Move python files to avoid pythonpath conflicts --- python/cdec/__init__.py | 1 - python/cdec/configobj.py | 2468 --------------------------------------- python/cdec/sa/__init__.py | 4 - python/cdec/sa/compile.py | 94 -- python/cdec/sa/extract.py | 31 - python/cdec/sa/extractor.py | 78 -- python/cdec/sa/features.py | 57 - python/cdec/score.py | 1 - python/pkg/cdec/__init__.py | 1 + python/pkg/cdec/configobj.py | 2468 +++++++++++++++++++++++++++++++++++++++ python/pkg/cdec/sa/__init__.py | 4 + python/pkg/cdec/sa/compile.py | 94 ++ python/pkg/cdec/sa/extract.py | 31 + python/pkg/cdec/sa/extractor.py | 78 ++ python/pkg/cdec/sa/features.py | 57 + python/pkg/cdec/score.py | 1 + python/setup.py | 3 +- python/src/sa/_sa.c | 39 +- python/src/sa/sym.pxi | 2 +- 19 files changed, 2753 insertions(+), 2759 deletions(-) delete mode 100644 python/cdec/__init__.py delete mode 100644 python/cdec/configobj.py delete mode 100644 python/cdec/sa/__init__.py delete mode 100644 python/cdec/sa/compile.py delete mode 100644 python/cdec/sa/extract.py delete mode 100644 python/cdec/sa/extractor.py delete mode 100644 python/cdec/sa/features.py delete mode 100644 python/cdec/score.py create mode 100644 python/pkg/cdec/__init__.py create mode 100644 python/pkg/cdec/configobj.py create mode 100644 python/pkg/cdec/sa/__init__.py create mode 100644 python/pkg/cdec/sa/compile.py create mode 100644 python/pkg/cdec/sa/extract.py create mode 100644 python/pkg/cdec/sa/extractor.py create mode 100644 python/pkg/cdec/sa/features.py create mode 100644 python/pkg/cdec/score.py (limited to 'python') diff --git a/python/cdec/__init__.py b/python/cdec/__init__.py deleted file mode 100644 index 19058493..00000000 --- a/python/cdec/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from _cdec import Decoder, Lattice, TRule, NT, NTRef, ParseFailed, InvalidConfig diff --git a/python/cdec/configobj.py b/python/cdec/configobj.py deleted file mode 100644 index c1f6e6df..00000000 --- a/python/cdec/configobj.py +++ /dev/null @@ -1,2468 +0,0 @@ -# configobj.py -# A config file reader/writer that supports nested sections in config files. -# Copyright (C) 2005-2010 Michael Foord, Nicola Larosa -# E-mail: fuzzyman AT voidspace DOT org DOT uk -# nico AT tekNico DOT net - -# ConfigObj 4 -# http://www.voidspace.org.uk/python/configobj.html - -# Released subject to the BSD License -# Please see http://www.voidspace.org.uk/python/license.shtml - -# Scripts maintained at http://www.voidspace.org.uk/python/index.shtml -# For information about bugfixes, updates and support, please join the -# ConfigObj mailing list: -# http://lists.sourceforge.net/lists/listinfo/configobj-develop -# Comments, suggestions and bug reports welcome. - -from __future__ import generators - -import os -import re -import sys - -from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF16_BE, BOM_UTF16_LE - - -# imported lazily to avoid startup performance hit if it isn't used -compiler = None - -# A dictionary mapping BOM to -# the encoding to decode with, and what to set the -# encoding attribute to. -BOMS = { - BOM_UTF8: ('utf_8', None), - BOM_UTF16_BE: ('utf16_be', 'utf_16'), - BOM_UTF16_LE: ('utf16_le', 'utf_16'), - BOM_UTF16: ('utf_16', 'utf_16'), - } -# All legal variants of the BOM codecs. -# TODO: the list of aliases is not meant to be exhaustive, is there a -# better way ? -BOM_LIST = { - 'utf_16': 'utf_16', - 'u16': 'utf_16', - 'utf16': 'utf_16', - 'utf-16': 'utf_16', - 'utf16_be': 'utf16_be', - 'utf_16_be': 'utf16_be', - 'utf-16be': 'utf16_be', - 'utf16_le': 'utf16_le', - 'utf_16_le': 'utf16_le', - 'utf-16le': 'utf16_le', - 'utf_8': 'utf_8', - 'u8': 'utf_8', - 'utf': 'utf_8', - 'utf8': 'utf_8', - 'utf-8': 'utf_8', - } - -# Map of encodings to the BOM to write. -BOM_SET = { - 'utf_8': BOM_UTF8, - 'utf_16': BOM_UTF16, - 'utf16_be': BOM_UTF16_BE, - 'utf16_le': BOM_UTF16_LE, - None: BOM_UTF8 - } - - -def match_utf8(encoding): - return BOM_LIST.get(encoding.lower()) == 'utf_8' - - -# Quote strings used for writing values -squot = "'%s'" -dquot = '"%s"' -noquot = "%s" -wspace_plus = ' \r\n\v\t\'"' -tsquot = '"""%s"""' -tdquot = "'''%s'''" - -# Sentinel for use in getattr calls to replace hasattr -MISSING = object() - -__version__ = '4.7.2' - -try: - any -except NameError: - def any(iterable): - for entry in iterable: - if entry: - return True - return False - - -__all__ = ( - '__version__', - 'DEFAULT_INDENT_TYPE', - 'DEFAULT_INTERPOLATION', - 'ConfigObjError', - 'NestingError', - 'ParseError', - 'DuplicateError', - 'ConfigspecError', - 'ConfigObj', - 'SimpleVal', - 'InterpolationError', - 'InterpolationLoopError', - 'MissingInterpolationOption', - 'RepeatSectionError', - 'ReloadError', - 'UnreprError', - 'UnknownType', - 'flatten_errors', - 'get_extra_values' -) - -DEFAULT_INTERPOLATION = 'configparser' -DEFAULT_INDENT_TYPE = ' ' -MAX_INTERPOL_DEPTH = 10 - -OPTION_DEFAULTS = { - 'interpolation': True, - 'raise_errors': False, - 'list_values': True, - 'create_empty': False, - 'file_error': False, - 'configspec': None, - 'stringify': True, - # option may be set to one of ('', ' ', '\t') - 'indent_type': None, - 'encoding': None, - 'default_encoding': None, - 'unrepr': False, - 'write_empty_values': False, -} - - - -def getObj(s): - global compiler - if compiler is None: - import compiler - s = "a=" + s - p = compiler.parse(s) - return p.getChildren()[1].getChildren()[0].getChildren()[1] - - -class UnknownType(Exception): - pass - - -class Builder(object): - - def build(self, o): - m = getattr(self, 'build_' + o.__class__.__name__, None) - if m is None: - raise UnknownType(o.__class__.__name__) - return m(o) - - def build_List(self, o): - return map(self.build, o.getChildren()) - - def build_Const(self, o): - return o.value - - def build_Dict(self, o): - d = {} - i = iter(map(self.build, o.getChildren())) - for el in i: - d[el] = i.next() - return d - - def build_Tuple(self, o): - return tuple(self.build_List(o)) - - def build_Name(self, o): - if o.name == 'None': - return None - if o.name == 'True': - return True - if o.name == 'False': - return False - - # An undefined Name - raise UnknownType('Undefined Name') - - def build_Add(self, o): - real, imag = map(self.build_Const, o.getChildren()) - try: - real = float(real) - except TypeError: - raise UnknownType('Add') - if not isinstance(imag, complex) or imag.real != 0.0: - raise UnknownType('Add') - return real+imag - - def build_Getattr(self, o): - parent = self.build(o.expr) - return getattr(parent, o.attrname) - - def build_UnarySub(self, o): - return -self.build_Const(o.getChildren()[0]) - - def build_UnaryAdd(self, o): - return self.build_Const(o.getChildren()[0]) - - -_builder = Builder() - - -def unrepr(s): - if not s: - return s - return _builder.build(getObj(s)) - - - -class ConfigObjError(SyntaxError): - """ - This is the base class for all errors that ConfigObj raises. - It is a subclass of SyntaxError. - """ - def __init__(self, message='', line_number=None, line=''): - self.line = line - self.line_number = line_number - SyntaxError.__init__(self, message) - - -class NestingError(ConfigObjError): - """ - This error indicates a level of nesting that doesn't match. - """ - - -class ParseError(ConfigObjError): - """ - This error indicates that a line is badly written. - It is neither a valid ``key = value`` line, - nor a valid section marker line. - """ - - -class ReloadError(IOError): - """ - A 'reload' operation failed. - This exception is a subclass of ``IOError``. - """ - def __init__(self): - IOError.__init__(self, 'reload failed, filename is not set.') - - -class DuplicateError(ConfigObjError): - """ - The keyword or section specified already exists. - """ - - -class ConfigspecError(ConfigObjError): - """ - An error occured whilst parsing a configspec. - """ - - -class InterpolationError(ConfigObjError): - """Base class for the two interpolation errors.""" - - -class InterpolationLoopError(InterpolationError): - """Maximum interpolation depth exceeded in string interpolation.""" - - def __init__(self, option): - InterpolationError.__init__( - self, - 'interpolation loop detected in value "%s".' % option) - - -class RepeatSectionError(ConfigObjError): - """ - This error indicates additional sections in a section with a - ``__many__`` (repeated) section. - """ - - -class MissingInterpolationOption(InterpolationError): - """A value specified for interpolation was missing.""" - def __init__(self, option): - msg = 'missing option "%s" in interpolation.' % option - InterpolationError.__init__(self, msg) - - -class UnreprError(ConfigObjError): - """An error parsing in unrepr mode.""" - - - -class InterpolationEngine(object): - """ - A helper class to help perform string interpolation. - - This class is an abstract base class; its descendants perform - the actual work. - """ - - # compiled regexp to use in self.interpolate() - _KEYCRE = re.compile(r"%\(([^)]*)\)s") - _cookie = '%' - - def __init__(self, section): - # the Section instance that "owns" this engine - self.section = section - - - def interpolate(self, key, value): - # short-cut - if not self._cookie in value: - return value - - def recursive_interpolate(key, value, section, backtrail): - """The function that does the actual work. - - ``value``: the string we're trying to interpolate. - ``section``: the section in which that string was found - ``backtrail``: a dict to keep track of where we've been, - to detect and prevent infinite recursion loops - - This is similar to a depth-first-search algorithm. - """ - # Have we been here already? - if (key, section.name) in backtrail: - # Yes - infinite loop detected - raise InterpolationLoopError(key) - # Place a marker on our backtrail so we won't come back here again - backtrail[(key, section.name)] = 1 - - # Now start the actual work - match = self._KEYCRE.search(value) - while match: - # The actual parsing of the match is implementation-dependent, - # so delegate to our helper function - k, v, s = self._parse_match(match) - if k is None: - # That's the signal that no further interpolation is needed - replacement = v - else: - # Further interpolation may be needed to obtain final value - replacement = recursive_interpolate(k, v, s, backtrail) - # Replace the matched string with its final value - start, end = match.span() - value = ''.join((value[:start], replacement, value[end:])) - new_search_start = start + len(replacement) - # Pick up the next interpolation key, if any, for next time - # through the while loop - match = self._KEYCRE.search(value, new_search_start) - - # Now safe to come back here again; remove marker from backtrail - del backtrail[(key, section.name)] - - return value - - # Back in interpolate(), all we have to do is kick off the recursive - # function with appropriate starting values - value = recursive_interpolate(key, value, self.section, {}) - return value - - - def _fetch(self, key): - """Helper function to fetch values from owning section. - - Returns a 2-tuple: the value, and the section where it was found. - """ - # switch off interpolation before we try and fetch anything ! - save_interp = self.section.main.interpolation - self.section.main.interpolation = False - - # Start at section that "owns" this InterpolationEngine - current_section = self.section - while True: - # try the current section first - val = current_section.get(key) - if val is not None and not isinstance(val, Section): - break - # try "DEFAULT" next - val = current_section.get('DEFAULT', {}).get(key) - if val is not None and not isinstance(val, Section): - break - # move up to parent and try again - # top-level's parent is itself - if current_section.parent is current_section: - # reached top level, time to give up - break - current_section = current_section.parent - - # restore interpolation to previous value before returning - self.section.main.interpolation = save_interp - if val is None: - raise MissingInterpolationOption(key) - return val, current_section - - - def _parse_match(self, match): - """Implementation-dependent helper function. - - Will be passed a match object corresponding to the interpolation - key we just found (e.g., "%(foo)s" or "$foo"). Should look up that - key in the appropriate config file section (using the ``_fetch()`` - helper function) and return a 3-tuple: (key, value, section) - - ``key`` is the name of the key we're looking for - ``value`` is the value found for that key - ``section`` is a reference to the section where it was found - - ``key`` and ``section`` should be None if no further - interpolation should be performed on the resulting value - (e.g., if we interpolated "$$" and returned "$"). - """ - raise NotImplementedError() - - - -class ConfigParserInterpolation(InterpolationEngine): - """Behaves like ConfigParser.""" - _cookie = '%' - _KEYCRE = re.compile(r"%\(([^)]*)\)s") - - def _parse_match(self, match): - key = match.group(1) - value, section = self._fetch(key) - return key, value, section - - - -class TemplateInterpolation(InterpolationEngine): - """Behaves like string.Template.""" - _cookie = '$' - _delimiter = '$' - _KEYCRE = re.compile(r""" - \$(?: - (?P\$) | # Two $ signs - (?P[_a-z][_a-z0-9]*) | # $name format - {(?P[^}]*)} # ${name} format - ) - """, re.IGNORECASE | re.VERBOSE) - - def _parse_match(self, match): - # Valid name (in or out of braces): fetch value from section - key = match.group('named') or match.group('braced') - if key is not None: - value, section = self._fetch(key) - return key, value, section - # Escaped delimiter (e.g., $$): return single delimiter - if match.group('escaped') is not None: - # Return None for key and section to indicate it's time to stop - return None, self._delimiter, None - # Anything else: ignore completely, just return it unchanged - return None, match.group(), None - - -interpolation_engines = { - 'configparser': ConfigParserInterpolation, - 'template': TemplateInterpolation, -} - - -def __newobj__(cls, *args): - # Hack for pickle - return cls.__new__(cls, *args) - -class Section(dict): - """ - A dictionary-like object that represents a section in a config file. - - It does string interpolation if the 'interpolation' attribute - of the 'main' object is set to True. - - Interpolation is tried first from this object, then from the 'DEFAULT' - section of this object, next from the parent and its 'DEFAULT' section, - and so on until the main object is reached. - - A Section will behave like an ordered dictionary - following the - order of the ``scalars`` and ``sections`` attributes. - You can use this to change the order of members. - - Iteration follows the order: scalars, then sections. - """ - - - def __setstate__(self, state): - dict.update(self, state[0]) - self.__dict__.update(state[1]) - - def __reduce__(self): - state = (dict(self), self.__dict__) - return (__newobj__, (self.__class__,), state) - - - def __init__(self, parent, depth, main, indict=None, name=None): - """ - * parent is the section above - * depth is the depth level of this section - * main is the main ConfigObj - * indict is a dictionary to initialise the section with - """ - if indict is None: - indict = {} - dict.__init__(self) - # used for nesting level *and* interpolation - self.parent = parent - # used for the interpolation attribute - self.main = main - # level of nesting depth of this Section - self.depth = depth - # purely for information - self.name = name - # - self._initialise() - # we do this explicitly so that __setitem__ is used properly - # (rather than just passing to ``dict.__init__``) - for entry, value in indict.iteritems(): - self[entry] = value - - - def _initialise(self): - # the sequence of scalar values in this Section - self.scalars = [] - # the sequence of sections in this Section - self.sections = [] - # for comments :-) - self.comments = {} - self.inline_comments = {} - # the configspec - self.configspec = None - # for defaults - self.defaults = [] - self.default_values = {} - self.extra_values = [] - self._created = False - - - def _interpolate(self, key, value): - try: - # do we already have an interpolation engine? - engine = self._interpolation_engine - except AttributeError: - # not yet: first time running _interpolate(), so pick the engine - name = self.main.interpolation - if name == True: # note that "if name:" would be incorrect here - # backwards-compatibility: interpolation=True means use default - name = DEFAULT_INTERPOLATION - name = name.lower() # so that "Template", "template", etc. all work - class_ = interpolation_engines.get(name, None) - if class_ is None: - # invalid value for self.main.interpolation - self.main.interpolation = False - return value - else: - # save reference to engine so we don't have to do this again - engine = self._interpolation_engine = class_(self) - # let the engine do the actual work - return engine.interpolate(key, value) - - - def __getitem__(self, key): - """Fetch the item and do string interpolation.""" - val = dict.__getitem__(self, key) - if self.main.interpolation: - if isinstance(val, basestring): - return self._interpolate(key, val) - if isinstance(val, list): - def _check(entry): - if isinstance(entry, basestring): - return self._interpolate(key, entry) - return entry - new = [_check(entry) for entry in val] - if new != val: - return new - return val - - - def __setitem__(self, key, value, unrepr=False): - """ - Correctly set a value. - - Making dictionary values Section instances. - (We have to special case 'Section' instances - which are also dicts) - - Keys must be strings. - Values need only be strings (or lists of strings) if - ``main.stringify`` is set. - - ``unrepr`` must be set when setting a value to a dictionary, without - creating a new sub-section. - """ - if not isinstance(key, basestring): - raise ValueError('The key "%s" is not a string.' % key) - - # add the comment - if key not in self.comments: - self.comments[key] = [] - self.inline_comments[key] = '' - # remove the entry from defaults - if key in self.defaults: - self.defaults.remove(key) - # - if isinstance(value, Section): - if key not in self: - self.sections.append(key) - dict.__setitem__(self, key, value) - elif isinstance(value, dict) and not unrepr: - # First create the new depth level, - # then create the section - if key not in self: - self.sections.append(key) - new_depth = self.depth + 1 - dict.__setitem__( - self, - key, - Section( - self, - new_depth, - self.main, - indict=value, - name=key)) - else: - if key not in self: - self.scalars.append(key) - if not self.main.stringify: - if isinstance(value, basestring): - pass - elif isinstance(value, (list, tuple)): - for entry in value: - if not isinstance(entry, basestring): - raise TypeError('Value is not a string "%s".' % entry) - else: - raise TypeError('Value is not a string "%s".' % value) - dict.__setitem__(self, key, value) - - - def __delitem__(self, key): - """Remove items from the sequence when deleting.""" - dict. __delitem__(self, key) - if key in self.scalars: - self.scalars.remove(key) - else: - self.sections.remove(key) - del self.comments[key] - del self.inline_comments[key] - - - def get(self, key, default=None): - """A version of ``get`` that doesn't bypass string interpolation.""" - try: - return self[key] - except KeyError: - return default - - - def update(self, indict): - """ - A version of update that uses our ``__setitem__``. - """ - for entry in indict: - self[entry] = indict[entry] - - - def pop(self, key, default=MISSING): - """ - 'D.pop(k[,d]) -> v, remove specified key and return the corresponding value. - If key is not found, d is returned if given, otherwise KeyError is raised' - """ - try: - val = self[key] - except KeyError: - if default is MISSING: - raise - val = default - else: - del self[key] - return val - - - def popitem(self): - """Pops the first (key,val)""" - sequence = (self.scalars + self.sections) - if not sequence: - raise KeyError(": 'popitem(): dictionary is empty'") - key = sequence[0] - val = self[key] - del self[key] - return key, val - - - def clear(self): - """ - A version of clear that also affects scalars/sections - Also clears comments and configspec. - - Leaves other attributes alone : - depth/main/parent are not affected - """ - dict.clear(self) - self.scalars = [] - self.sections = [] - self.comments = {} - self.inline_comments = {} - self.configspec = None - self.defaults = [] - self.extra_values = [] - - - def setdefault(self, key, default=None): - """A version of setdefault that sets sequence if appropriate.""" - try: - return self[key] - except KeyError: - self[key] = default - return self[key] - - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return zip((self.scalars + self.sections), self.values()) - - - def keys(self): - """D.keys() -> list of D's keys""" - return (self.scalars + self.sections) - - - def values(self): - """D.values() -> list of D's values""" - return [self[key] for key in (self.scalars + self.sections)] - - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - return iter(self.items()) - - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter((self.scalars + self.sections)) - - __iter__ = iterkeys - - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - return iter(self.values()) - - - def __repr__(self): - """x.__repr__() <==> repr(x)""" - def _getval(key): - try: - return self[key] - except MissingInterpolationOption: - return dict.__getitem__(self, key) - return '{%s}' % ', '.join([('%s: %s' % (repr(key), repr(_getval(key)))) - for key in (self.scalars + self.sections)]) - - __str__ = __repr__ - __str__.__doc__ = "x.__str__() <==> str(x)" - - - # Extra methods - not in a normal dictionary - - def dict(self): - """ - Return a deepcopy of self as a dictionary. - - All members that are ``Section`` instances are recursively turned to - ordinary dictionaries - by calling their ``dict`` method. - - >>> n = a.dict() - >>> n == a - 1 - >>> n is a - 0 - """ - newdict = {} - for entry in self: - this_entry = self[entry] - if isinstance(this_entry, Section): - this_entry = this_entry.dict() - elif isinstance(this_entry, list): - # create a copy rather than a reference - this_entry = list(this_entry) - elif isinstance(this_entry, tuple): - # create a copy rather than a reference - this_entry = tuple(this_entry) - newdict[entry] = this_entry - return newdict - - - def merge(self, indict): - """ - A recursive update - useful for merging config files. - - >>> a = '''[section1] - ... option1 = True - ... [[subsection]] - ... more_options = False - ... # end of file'''.splitlines() - >>> b = '''# File is user.ini - ... [section1] - ... option1 = False - ... # end of file'''.splitlines() - >>> c1 = ConfigObj(b) - >>> c2 = ConfigObj(a) - >>> c2.merge(c1) - >>> c2 - ConfigObj({'section1': {'option1': 'False', 'subsection': {'more_options': 'False'}}}) - """ - for key, val in indict.items(): - if (key in self and isinstance(self[key], dict) and - isinstance(val, dict)): - self[key].merge(val) - else: - self[key] = val - - - def rename(self, oldkey, newkey): - """ - Change a keyname to another, without changing position in sequence. - - Implemented so that transformations can be made on keys, - as well as on values. (used by encode and decode) - - Also renames comments. - """ - if oldkey in self.scalars: - the_list = self.scalars - elif oldkey in self.sections: - the_list = self.sections - else: - raise KeyError('Key "%s" not found.' % oldkey) - pos = the_list.index(oldkey) - # - val = self[oldkey] - dict.__delitem__(self, oldkey) - dict.__setitem__(self, newkey, val) - the_list.remove(oldkey) - the_list.insert(pos, newkey) - comm = self.comments[oldkey] - inline_comment = self.inline_comments[oldkey] - del self.comments[oldkey] - del self.inline_comments[oldkey] - self.comments[newkey] = comm - self.inline_comments[newkey] = inline_comment - - - def walk(self, function, raise_errors=True, - call_on_sections=False, **keywargs): - """ - Walk every member and call a function on the keyword and value. - - Return a dictionary of the return values - - If the function raises an exception, raise the errror - unless ``raise_errors=False``, in which case set the return value to - ``False``. - - Any unrecognised keyword arguments you pass to walk, will be pased on - to the function you pass in. - - Note: if ``call_on_sections`` is ``True`` then - on encountering a - subsection, *first* the function is called for the *whole* subsection, - and then recurses into it's members. This means your function must be - able to handle strings, dictionaries and lists. This allows you - to change the key of subsections as well as for ordinary members. The - return value when called on the whole subsection has to be discarded. - - See the encode and decode methods for examples, including functions. - - .. admonition:: caution - - You can use ``walk`` to transform the names of members of a section - but you mustn't add or delete members. - - >>> config = '''[XXXXsection] - ... XXXXkey = XXXXvalue'''.splitlines() - >>> cfg = ConfigObj(config) - >>> cfg - ConfigObj({'XXXXsection': {'XXXXkey': 'XXXXvalue'}}) - >>> def transform(section, key): - ... val = section[key] - ... newkey = key.replace('XXXX', 'CLIENT1') - ... section.rename(key, newkey) - ... if isinstance(val, (tuple, list, dict)): - ... pass - ... else: - ... val = val.replace('XXXX', 'CLIENT1') - ... section[newkey] = val - >>> cfg.walk(transform, call_on_sections=True) - {'CLIENT1section': {'CLIENT1key': None}} - >>> cfg - ConfigObj({'CLIENT1section': {'CLIENT1key': 'CLIENT1value'}}) - """ - out = {} - # scalars first - for i in range(len(self.scalars)): - entry = self.scalars[i] - try: - val = function(self, entry, **keywargs) - # bound again in case name has changed - entry = self.scalars[i] - out[entry] = val - except Exception: - if raise_errors: - raise - else: - entry = self.scalars[i] - out[entry] = False - # then sections - for i in range(len(self.sections)): - entry = self.sections[i] - if call_on_sections: - try: - function(self, entry, **keywargs) - except Exception: - if raise_errors: - raise - else: - entry = self.sections[i] - out[entry] = False - # bound again in case name has changed - entry = self.sections[i] - # previous result is discarded - out[entry] = self[entry].walk( - function, - raise_errors=raise_errors, - call_on_sections=call_on_sections, - **keywargs) - return out - - - def as_bool(self, key): - """ - Accepts a key as input. The corresponding value must be a string or - the objects (``True`` or 1) or (``False`` or 0). We allow 0 and 1 to - retain compatibility with Python 2.2. - - If the string is one of ``True``, ``On``, ``Yes``, or ``1`` it returns - ``True``. - - If the string is one of ``False``, ``Off``, ``No``, or ``0`` it returns - ``False``. - - ``as_bool`` is not case sensitive. - - Any other input will raise a ``ValueError``. - - >>> a = ConfigObj() - >>> a['a'] = 'fish' - >>> a.as_bool('a') - Traceback (most recent call last): - ValueError: Value "fish" is neither True nor False - >>> a['b'] = 'True' - >>> a.as_bool('b') - 1 - >>> a['b'] = 'off' - >>> a.as_bool('b') - 0 - """ - val = self[key] - if val == True: - return True - elif val == False: - return False - else: - try: - if not isinstance(val, basestring): - # TODO: Why do we raise a KeyError here? - raise KeyError() - else: - return self.main._bools[val.lower()] - except KeyError: - raise ValueError('Value "%s" is neither True nor False' % val) - - - def as_int(self, key): - """ - A convenience method which coerces the specified value to an integer. - - If the value is an invalid literal for ``int``, a ``ValueError`` will - be raised. - - >>> a = ConfigObj() - >>> a['a'] = 'fish' - >>> a.as_int('a') - Traceback (most recent call last): - ValueError: invalid literal for int() with base 10: 'fish' - >>> a['b'] = '1' - >>> a.as_int('b') - 1 - >>> a['b'] = '3.2' - >>> a.as_int('b') - Traceback (most recent call last): - ValueError: invalid literal for int() with base 10: '3.2' - """ - return int(self[key]) - - - def as_float(self, key): - """ - A convenience method which coerces the specified value to a float. - - If the value is an invalid literal for ``float``, a ``ValueError`` will - be raised. - - >>> a = ConfigObj() - >>> a['a'] = 'fish' - >>> a.as_float('a') - Traceback (most recent call last): - ValueError: invalid literal for float(): fish - >>> a['b'] = '1' - >>> a.as_float('b') - 1.0 - >>> a['b'] = '3.2' - >>> a.as_float('b') - 3.2000000000000002 - """ - return float(self[key]) - - - def as_list(self, key): - """ - A convenience method which fetches the specified value, guaranteeing - that it is a list. - - >>> a = ConfigObj() - >>> a['a'] = 1 - >>> a.as_list('a') - [1] - >>> a['a'] = (1,) - >>> a.as_list('a') - [1] - >>> a['a'] = [1] - >>> a.as_list('a') - [1] - """ - result = self[key] - if isinstance(result, (tuple, list)): - return list(result) - return [result] - - - def restore_default(self, key): - """ - Restore (and return) default value for the specified key. - - This method will only work for a ConfigObj that was created - with a configspec and has been validated. - - If there is no default value for this key, ``KeyError`` is raised. - """ - default = self.default_values[key] - dict.__setitem__(self, key, default) - if key not in self.defaults: - self.defaults.append(key) - return default - - - def restore_defaults(self): - """ - Recursively restore default values to all members - that have them. - - This method will only work for a ConfigObj that was created - with a configspec and has been validated. - - It doesn't delete or modify entries without default values. - """ - for key in self.default_values: - self.restore_default(key) - - for section in self.sections: - self[section].restore_defaults() - - -class ConfigObj(Section): - """An object to read, create, and write config files.""" - - _keyword = re.compile(r'''^ # line start - (\s*) # indentation - ( # keyword - (?:".*?")| # double quotes - (?:'.*?')| # single quotes - (?:[^'"=].*?) # no quotes - ) - \s*=\s* # divider - (.*) # value (including list values and comments) - $ # line end - ''', - re.VERBOSE) - - _sectionmarker = re.compile(r'''^ - (\s*) # 1: indentation - ((?:\[\s*)+) # 2: section marker open - ( # 3: section name open - (?:"\s*\S.*?\s*")| # at least one non-space with double quotes - (?:'\s*\S.*?\s*')| # at least one non-space with single quotes - (?:[^'"\s].*?) # at least one non-space unquoted - ) # section name close - ((?:\s*\])+) # 4: section marker close - \s*(\#.*)? # 5: optional comment - $''', - re.VERBOSE) - - # this regexp pulls list values out as a single string - # or single values and comments - # FIXME: this regex adds a '' to the end of comma terminated lists - # workaround in ``_handle_value`` - _valueexp = re.compile(r'''^ - (?: - (?: - ( - (?: - (?: - (?:".*?")| # double quotes - (?:'.*?')| # single quotes - (?:[^'",\#][^,\#]*?) # unquoted - ) - \s*,\s* # comma - )* # match all list items ending in a comma (if any) - ) - ( - (?:".*?")| # double quotes - (?:'.*?')| # single quotes - (?:[^'",\#\s][^,]*?)| # unquoted - (?:(? 1: - msg = "Parsing failed with several errors.\nFirst error %s" % info - error = ConfigObjError(msg) - else: - error = self._errors[0] - # set the errors attribute; it's a list of tuples: - # (error_type, message, line_number) - error.errors = self._errors - # set the config attribute - error.config = self - raise error - # delete private attributes - del self._errors - - if configspec is None: - self.configspec = None - else: - self._handle_configspec(configspec) - - - def _initialise(self, options=None): - if options is None: - options = OPTION_DEFAULTS - - # initialise a few variables - self.filename = None - self._errors = [] - self.raise_errors = options['raise_errors'] - self.interpolation = options['interpolation'] - self.list_values = options['list_values'] - self.create_empty = options['create_empty'] - self.file_error = options['file_error'] - self.stringify = options['stringify'] - self.indent_type = options['indent_type'] - self.encoding = options['encoding'] - self.default_encoding = options['default_encoding'] - self.BOM = False - self.newlines = None - self.write_empty_values = options['write_empty_values'] - self.unrepr = options['unrepr'] - - self.initial_comment = [] - self.final_comment = [] - self.configspec = None - - if self._inspec: - self.list_values = False - - # Clear section attributes as well - Section._initialise(self) - - - def __repr__(self): - def _getval(key): - try: - return self[key] - except MissingInterpolationOption: - return dict.__getitem__(self, key) - return ('ConfigObj({%s})' % - ', '.join([('%s: %s' % (repr(key), repr(_getval(key)))) - for key in (self.scalars + self.sections)])) - - - def _handle_bom(self, infile): - """ - Handle any BOM, and decode if necessary. - - If an encoding is specified, that *must* be used - but the BOM should - still be removed (and the BOM attribute set). - - (If the encoding is wrongly specified, then a BOM for an alternative - encoding won't be discovered or removed.) - - If an encoding is not specified, UTF8 or UTF16 BOM will be detected and - removed. The BOM attribute will be set. UTF16 will be decoded to - unicode. - - NOTE: This method must not be called with an empty ``infile``. - - Specifying the *wrong* encoding is likely to cause a - ``UnicodeDecodeError``. - - ``infile`` must always be returned as a list of lines, but may be - passed in as a single string. - """ - if ((self.encoding is not None) and - (self.encoding.lower() not in BOM_LIST)): - # No need to check for a BOM - # the encoding specified doesn't have one - # just decode - return self._decode(infile, self.encoding) - - if isinstance(infile, (list, tuple)): - line = infile[0] - else: - line = infile - if self.encoding is not None: - # encoding explicitly supplied - # And it could have an associated BOM - # TODO: if encoding is just UTF16 - we ought to check for both - # TODO: big endian and little endian versions. - enc = BOM_LIST[self.encoding.lower()] - if enc == 'utf_16': - # For UTF16 we try big endian and little endian - for BOM, (encoding, final_encoding) in BOMS.items(): - if not final_encoding: - # skip UTF8 - continue - if infile.startswith(BOM): - ### BOM discovered - ##self.BOM = True - # Don't need to remove BOM - return self._decode(infile, encoding) - - # If we get this far, will *probably* raise a DecodeError - # As it doesn't appear to start with a BOM - return self._decode(infile, self.encoding) - - # Must be UTF8 - BOM = BOM_SET[enc] - if not line.startswith(BOM): - return self._decode(infile, self.encoding) - - newline = line[len(BOM):] - - # BOM removed - if isinstance(infile, (list, tuple)): - infile[0] = newline - else: - infile = newline - self.BOM = True - return self._decode(infile, self.encoding) - - # No encoding specified - so we need to check for UTF8/UTF16 - for BOM, (encoding, final_encoding) in BOMS.items(): - if not line.startswith(BOM): - continue - else: - # BOM discovered - self.encoding = final_encoding - if not final_encoding: - self.BOM = True - # UTF8 - # remove BOM - newline = line[len(BOM):] - if isinstance(infile, (list, tuple)): - infile[0] = newline - else: - infile = newline - # UTF8 - don't decode - if isinstance(infile, basestring): - return infile.splitlines(True) - else: - return infile - # UTF16 - have to decode - return self._decode(infile, encoding) - - # No BOM discovered and no encoding specified, just return - if isinstance(infile, basestring): - # infile read from a file will be a single string - return infile.splitlines(True) - return infile - - - def _a_to_u(self, aString): - """Decode ASCII strings to unicode if a self.encoding is specified.""" - if self.encoding: - return aString.decode('ascii') - else: - return aString - - - def _decode(self, infile, encoding): - """ - Decode infile to unicode. Using the specified encoding. - - if is a string, it also needs converting to a list. - """ - if isinstance(infile, basestring): - # can't be unicode - # NOTE: Could raise a ``UnicodeDecodeError`` - return infile.decode(encoding).splitlines(True) - for i, line in enumerate(infile): - if not isinstance(line, unicode): - # NOTE: The isinstance test here handles mixed lists of unicode/string - # NOTE: But the decode will break on any non-string values - # NOTE: Or could raise a ``UnicodeDecodeError`` - infile[i] = line.decode(encoding) - return infile - - - def _decode_element(self, line): - """Decode element to unicode if necessary.""" - if not self.encoding: - return line - if isinstance(line, str) and self.default_encoding: - return line.decode(self.default_encoding) - return line - - - def _str(self, value): - """ - Used by ``stringify`` within validate, to turn non-string values - into strings. - """ - if not isinstance(value, basestring): - return str(value) - else: - return value - - - def _parse(self, infile): - """Actually parse the config file.""" - temp_list_values = self.list_values - if self.unrepr: - self.list_values = False - - comment_list = [] - done_start = False - this_section = self - maxline = len(infile) - 1 - cur_index = -1 - reset_comment = False - - while cur_index < maxline: - if reset_comment: - comment_list = [] - cur_index += 1 - line = infile[cur_index] - sline = line.strip() - # do we have anything on the line ? - if not sline or sline.startswith('#'): - reset_comment = False - comment_list.append(line) - continue - - if not done_start: - # preserve initial comment - self.initial_comment = comment_list - comment_list = [] - done_start = True - - reset_comment = True - # first we check if it's a section marker - mat = self._sectionmarker.match(line) - if mat is not None: - # is a section line - (indent, sect_open, sect_name, sect_close, comment) = mat.groups() - if indent and (self.indent_type is None): - self.indent_type = indent - cur_depth = sect_open.count('[') - if cur_depth != sect_close.count(']'): - self._handle_error("Cannot compute the section depth at line %s.", - NestingError, infile, cur_index) - continue - - if cur_depth < this_section.depth: - # the new section is dropping back to a previous level - try: - parent = self._match_depth(this_section, - cur_depth).parent - except SyntaxError: - self._handle_error("Cannot compute nesting level at line %s.", - NestingError, infile, cur_index) - continue - elif cur_depth == this_section.depth: - # the new section is a sibling of the current section - parent = this_section.parent - elif cur_depth == this_section.depth + 1: - # the new section is a child the current section - parent = this_section - else: - self._handle_error("Section too nested at line %s.", - NestingError, infile, cur_index) - - sect_name = self._unquote(sect_name) - if sect_name in parent: - self._handle_error('Duplicate section name at line %s.', - DuplicateError, infile, cur_index) - continue - - # create the new section - this_section = Section( - parent, - cur_depth, - self, - name=sect_name) - parent[sect_name] = this_section - parent.inline_comments[sect_name] = comment - parent.comments[sect_name] = comment_list - continue - # - # it's not a section marker, - # so it should be a valid ``key = value`` line - mat = self._keyword.match(line) - if mat is None: - # it neither matched as a keyword - # or a section marker - self._handle_error( - 'Invalid line at line "%s".', - ParseError, infile, cur_index) - else: - # is a keyword value - # value will include any inline comment - (indent, key, value) = mat.groups() - if indent and (self.indent_type is None): - self.indent_type = indent - # check for a multiline value - if value[:3] in ['"""', "'''"]: - try: - value, comment, cur_index = self._multiline( - value, infile, cur_index, maxline) - except SyntaxError: - self._handle_error( - 'Parse error in value at line %s.', - ParseError, infile, cur_index) - continue - else: - if self.unrepr: - comment = '' - try: - value = unrepr(value) - except Exception, e: - if type(e) == UnknownType: - msg = 'Unknown name or type in value at line %s.' - else: - msg = 'Parse error in value at line %s.' - self._handle_error(msg, UnreprError, infile, - cur_index) - continue - else: - if self.unrepr: - comment = '' - try: - value = unrepr(value) - except Exception, e: - if isinstance(e, UnknownType): - msg = 'Unknown name or type in value at line %s.' - else: - msg = 'Parse error in value at line %s.' - self._handle_error(msg, UnreprError, infile, - cur_index) - continue - else: - # extract comment and lists - try: - (value, comment) = self._handle_value(value) - except SyntaxError: - self._handle_error( - 'Parse error in value at line %s.', - ParseError, infile, cur_index) - continue - # - key = self._unquote(key) - if key in this_section: - self._handle_error( - 'Duplicate keyword name at line %s.', - DuplicateError, infile, cur_index) - continue - # add the key. - # we set unrepr because if we have got this far we will never - # be creating a new section - this_section.__setitem__(key, value, unrepr=True) - this_section.inline_comments[key] = comment - this_section.comments[key] = comment_list - continue - # - if self.indent_type is None: - # no indentation used, set the type accordingly - self.indent_type = '' - - # preserve the final comment - if not self and not self.initial_comment: - self.initial_comment = comment_list - elif not reset_comment: - self.final_comment = comment_list - self.list_values = temp_list_values - - - def _match_depth(self, sect, depth): - """ - Given a section and a depth level, walk back through the sections - parents to see if the depth level matches a previous section. - - Return a reference to the right section, - or raise a SyntaxError. - """ - while depth < sect.depth: - if sect is sect.parent: - # we've reached the top level already - raise SyntaxError() - sect = sect.parent - if sect.depth == depth: - return sect - # shouldn't get here - raise SyntaxError() - - - def _handle_error(self, text, ErrorClass, infile, cur_index): - """ - Handle an error according to the error settings. - - Either raise the error or store it. - The error will have occured at ``cur_index`` - """ - line = infile[cur_index] - cur_index += 1 - message = text % cur_index - error = ErrorClass(message, cur_index, line) - if self.raise_errors: - # raise the error - parsing stops here - raise error - # store the error - # reraise when parsing has finished - self._errors.append(error) - - - def _unquote(self, value): - """Return an unquoted version of a value""" - if not value: - # should only happen during parsing of lists - raise SyntaxError - if (value[0] == value[-1]) and (value[0] in ('"', "'")): - value = value[1:-1] - return value - - - def _quote(self, value, multiline=True): - """ - Return a safely quoted version of a value. - - Raise a ConfigObjError if the value cannot be safely quoted. - If multiline is ``True`` (default) then use triple quotes - if necessary. - - * Don't quote values that don't need it. - * Recursively quote members of a list and return a comma joined list. - * Multiline is ``False`` for lists. - * Obey list syntax for empty and single member lists. - - If ``list_values=False`` then the value is only quoted if it contains - a ``\\n`` (is multiline) or '#'. - - If ``write_empty_values`` is set, and the value is an empty string, it - won't be quoted. - """ - if multiline and self.write_empty_values and value == '': - # Only if multiline is set, so that it is used for values not - # keys, and not values that are part of a list - return '' - - if multiline and isinstance(value, (list, tuple)): - if not value: - return ',' - elif len(value) == 1: - return self._quote(value[0], multiline=False) + ',' - return ', '.join([self._quote(val, multiline=False) - for val in value]) - if not isinstance(value, basestring): - if self.stringify: - value = str(value) - else: - raise TypeError('Value "%s" is not a string.' % value) - - if not value: - return '""' - - no_lists_no_quotes = not self.list_values and '\n' not in value and '#' not in value - need_triple = multiline and ((("'" in value) and ('"' in value)) or ('\n' in value )) - hash_triple_quote = multiline and not need_triple and ("'" in value) and ('"' in value) and ('#' in value) - check_for_single = (no_lists_no_quotes or not need_triple) and not hash_triple_quote - - if check_for_single: - if not self.list_values: - # we don't quote if ``list_values=False`` - quot = noquot - # for normal values either single or double quotes will do - elif '\n' in value: - # will only happen if multiline is off - e.g. '\n' in key - raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) - elif ((value[0] not in wspace_plus) and - (value[-1] not in wspace_plus) and - (',' not in value)): - quot = noquot - else: - quot = self._get_single_quote(value) - else: - # if value has '\n' or "'" *and* '"', it will need triple quotes - quot = self._get_triple_quote(value) - - if quot == noquot and '#' in value and self.list_values: - quot = self._get_single_quote(value) - - return quot % value - - - def _get_single_quote(self, value): - if ("'" in value) and ('"' in value): - raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) - elif '"' in value: - quot = squot - else: - quot = dquot - return quot - - - def _get_triple_quote(self, value): - if (value.find('"""') != -1) and (value.find("'''") != -1): - raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) - if value.find('"""') == -1: - quot = tdquot - else: - quot = tsquot - return quot - - - def _handle_value(self, value): - """ - Given a value string, unquote, remove comment, - handle lists. (including empty and single member lists) - """ - if self._inspec: - # Parsing a configspec so don't handle comments - return (value, '') - # do we look for lists in values ? - if not self.list_values: - mat = self._nolistvalue.match(value) - if mat is None: - raise SyntaxError() - # NOTE: we don't unquote here - return mat.groups() - # - mat = self._valueexp.match(value) - if mat is None: - # the value is badly constructed, probably badly quoted, - # or an invalid list - raise SyntaxError() - (list_values, single, empty_list, comment) = mat.groups() - if (list_values == '') and (single is None): - # change this if you want to accept empty values - raise SyntaxError() - # NOTE: note there is no error handling from here if the regex - # is wrong: then incorrect values will slip through - if empty_list is not None: - # the single comma - meaning an empty list - return ([], comment) - if single is not None: - # handle empty values - if list_values and not single: - # FIXME: the '' is a workaround because our regex now matches - # '' at the end of a list if it has a trailing comma - single = None - else: - single = single or '""' - single = self._unquote(single) - if list_values == '': - # not a list value - return (single, comment) - the_list = self._listvalueexp.findall(list_values) - the_list = [self._unquote(val) for val in the_list] - if single is not None: - the_list += [single] - return (the_list, comment) - - - def _multiline(self, value, infile, cur_index, maxline): - """Extract the value, where we are in a multiline situation.""" - quot = value[:3] - newvalue = value[3:] - single_line = self._triple_quote[quot][0] - multi_line = self._triple_quote[quot][1] - mat = single_line.match(value) - if mat is not None: - retval = list(mat.groups()) - retval.append(cur_index) - return retval - elif newvalue.find(quot) != -1: - # somehow the triple quote is missing - raise SyntaxError() - # - while cur_index < maxline: - cur_index += 1 - newvalue += '\n' - line = infile[cur_index] - if line.find(quot) == -1: - newvalue += line - else: - # end of multiline, process it - break - else: - # we've got to the end of the config, oops... - raise SyntaxError() - mat = multi_line.match(line) - if mat is None: - # a badly formed line - raise SyntaxError() - (value, comment) = mat.groups() - return (newvalue + value, comment, cur_index) - - - def _handle_configspec(self, configspec): - """Parse the configspec.""" - # FIXME: Should we check that the configspec was created with the - # correct settings ? (i.e. ``list_values=False``) - if not isinstance(configspec, ConfigObj): - try: - configspec = ConfigObj(configspec, - raise_errors=True, - file_error=True, - _inspec=True) - except ConfigObjError, e: - # FIXME: Should these errors have a reference - # to the already parsed ConfigObj ? - raise ConfigspecError('Parsing configspec failed: %s' % e) - except IOError, e: - raise IOError('Reading configspec failed: %s' % e) - - self.configspec = configspec - - - - def _set_configspec(self, section, copy): - """ - Called by validate. Handles setting the configspec on subsections - including sections to be validated by __many__ - """ - configspec = section.configspec - many = configspec.get('__many__') - if isinstance(many, dict): - for entry in section.sections: - if entry not in configspec: - section[entry].configspec = many - - for entry in configspec.sections: - if entry == '__many__': - continue - if entry not in section: - section[entry] = {} - section[entry]._created = True - if copy: - # copy comments - section.comments[entry] = configspec.comments.get(entry, []) - section.inline_comments[entry] = configspec.inline_comments.get(entry, '') - - # Could be a scalar when we expect a section - if isinstance(section[entry], Section): - section[entry].configspec = configspec[entry] - - - def _write_line(self, indent_string, entry, this_entry, comment): - """Write an individual line, for the write method""" - # NOTE: the calls to self._quote here handles non-StringType values. - if not self.unrepr: - val = self._decode_element(self._quote(this_entry)) - else: - val = repr(this_entry) - return '%s%s%s%s%s' % (indent_string, - self._decode_element(self._quote(entry, multiline=False)), - self._a_to_u(' = '), - val, - self._decode_element(comment)) - - - def _write_marker(self, indent_string, depth, entry, comment): - """Write a section marker line""" - return '%s%s%s%s%s' % (indent_string, - self._a_to_u('[' * depth), - self._quote(self._decode_element(entry), multiline=False), - self._a_to_u(']' * depth), - self._decode_element(comment)) - - - def _handle_comment(self, comment): - """Deal with a comment.""" - if not comment: - return '' - start = self.indent_type - if not comment.startswith('#'): - start += self._a_to_u(' # ') - return (start + comment) - - - # Public methods - - def write(self, outfile=None, section=None): - """ - Write the current ConfigObj as a file - - tekNico: FIXME: use StringIO instead of real files - - >>> filename = a.filename - >>> a.filename = 'test.ini' - >>> a.write() - >>> a.filename = filename - >>> a == ConfigObj('test.ini', raise_errors=True) - 1 - >>> import os - >>> os.remove('test.ini') - """ - if self.indent_type is None: - # this can be true if initialised from a dictionary - self.indent_type = DEFAULT_INDENT_TYPE - - out = [] - cs = self._a_to_u('#') - csp = self._a_to_u('# ') - if section is None: - int_val = self.interpolation - self.interpolation = False - section = self - for line in self.initial_comment: - line = self._decode_element(line) - stripped_line = line.strip() - if stripped_line and not stripped_line.startswith(cs): - line = csp + line - out.append(line) - - indent_string = self.indent_type * section.depth - for entry in (section.scalars + section.sections): - if entry in section.defaults: - # don't write out default values - continue - for comment_line in section.comments[entry]: - comment_line = self._decode_element(comment_line.lstrip()) - if comment_line and not comment_line.startswith(cs): - comment_line = csp + comment_line - out.append(indent_string + comment_line) - this_entry = section[entry] - comment = self._handle_comment(section.inline_comments[entry]) - - if isinstance(this_entry, dict): - # a section - out.append(self._write_marker( - indent_string, - this_entry.depth, - entry, - comment)) - out.extend(self.write(section=this_entry)) - else: - out.append(self._write_line( - indent_string, - entry, - this_entry, - comment)) - - if section is self: - for line in self.final_comment: - line = self._decode_element(line) - stripped_line = line.strip() - if stripped_line and not stripped_line.startswith(cs): - line = csp + line - out.append(line) - self.interpolation = int_val - - if section is not self: - return out - - if (self.filename is None) and (outfile is None): - # output a list of lines - # might need to encode - # NOTE: This will *screw* UTF16, each line will start with the BOM - if self.encoding: - out = [l.encode(self.encoding) for l in out] - if (self.BOM and ((self.encoding is None) or - (BOM_LIST.get(self.encoding.lower()) == 'utf_8'))): - # Add the UTF8 BOM - if not out: - out.append('') - out[0] = BOM_UTF8 + out[0] - return out - - # Turn the list to a string, joined with correct newlines - newline = self.newlines or os.linesep - if (getattr(outfile, 'mode', None) is not None and outfile.mode == 'w' - and sys.platform == 'win32' and newline == '\r\n'): - # Windows specific hack to avoid writing '\r\r\n' - newline = '\n' - output = self._a_to_u(newline).join(out) - if self.encoding: - output = output.encode(self.encoding) - if self.BOM and ((self.encoding is None) or match_utf8(self.encoding)): - # Add the UTF8 BOM - output = BOM_UTF8 + output - - if not output.endswith(newline): - output += newline - if outfile is not None: - outfile.write(output) - else: - h = open(self.filename, 'wb') - h.write(output) - h.close() - - - def validate(self, validator, preserve_errors=False, copy=False, - section=None): - """ - Test the ConfigObj against a configspec. - - It uses the ``validator`` object from *validate.py*. - - To run ``validate`` on the current ConfigObj, call: :: - - test = config.validate(validator) - - (Normally having previously passed in the configspec when the ConfigObj - was created - you can dynamically assign a dictionary of checks to the - ``configspec`` attribute of a section though). - - It returns ``True`` if everything passes, or a dictionary of - pass/fails (True/False). If every member of a subsection passes, it - will just have the value ``True``. (It also returns ``False`` if all - members fail). - - In addition, it converts the values from strings to their native - types if their checks pass (and ``stringify`` is set). - - If ``preserve_errors`` is ``True`` (``False`` is default) then instead - of a marking a fail with a ``False``, it will preserve the actual - exception object. This can contain info about the reason for failure. - For example the ``VdtValueTooSmallError`` indicates that the value - supplied was too small. If a value (or section) is missing it will - still be marked as ``False``. - - You must have the validate module to use ``preserve_errors=True``. - - You can then use the ``flatten_errors`` function to turn your nested - results dictionary into a flattened list of failures - useful for - displaying meaningful error messages. - """ - if section is None: - if self.configspec is None: - raise ValueError('No configspec supplied.') - if preserve_errors: - # We do this once to remove a top level dependency on the validate module - # Which makes importing configobj faster - from validate import VdtMissingValue - self._vdtMissingValue = VdtMissingValue - - section = self - - if copy: - section.initial_comment = section.configspec.initial_comment - section.final_comment = section.configspec.final_comment - section.encoding = section.configspec.encoding - section.BOM = section.configspec.BOM - section.newlines = section.configspec.newlines - section.indent_type = section.configspec.indent_type - - # - # section.default_values.clear() #?? - configspec = section.configspec - self._set_configspec(section, copy) - - - def validate_entry(entry, spec, val, missing, ret_true, ret_false): - section.default_values.pop(entry, None) - - try: - section.default_values[entry] = validator.get_default_value(configspec[entry]) - except (KeyError, AttributeError, validator.baseErrorClass): - # No default, bad default or validator has no 'get_default_value' - # (e.g. SimpleVal) - pass - - try: - check = validator.check(spec, - val, - missing=missing - ) - except validator.baseErrorClass, e: - if not preserve_errors or isinstance(e, self._vdtMissingValue): - out[entry] = False - else: - # preserve the error - out[entry] = e - ret_false = False - ret_true = False - else: - ret_false = False - out[entry] = True - if self.stringify or missing: - # if we are doing type conversion - # or the value is a supplied default - if not self.stringify: - if isinstance(check, (list, tuple)): - # preserve lists - check = [self._str(item) for item in check] - elif missing and check is None: - # convert the None from a default to a '' - check = '' - else: - check = self._str(check) - if (check != val) or missing: - section[entry] = check - if not copy and missing and entry not in section.defaults: - section.defaults.append(entry) - return ret_true, ret_false - - # - out = {} - ret_true = True - ret_false = True - - unvalidated = [k for k in section.scalars if k not in configspec] - incorrect_sections = [k for k in configspec.sections if k in section.scalars] - incorrect_scalars = [k for k in configspec.scalars if k in section.sections] - - for entry in configspec.scalars: - if entry in ('__many__', '___many___'): - # reserved names - continue - if (not entry in section.scalars) or (entry in section.defaults): - # missing entries - # or entries from defaults - missing = True - val = None - if copy and entry not in section.scalars: - # copy comments - section.comments[entry] = ( - configspec.comments.get(entry, [])) - section.inline_comments[entry] = ( - configspec.inline_comments.get(entry, '')) - # - else: - missing = False - val = section[entry] - - ret_true, ret_false = validate_entry(entry, configspec[entry], val, - missing, ret_true, ret_false) - - many = None - if '__many__' in configspec.scalars: - many = configspec['__many__'] - elif '___many___' in configspec.scalars: - many = configspec['___many___'] - - if many is not None: - for entry in unvalidated: - val = section[entry] - ret_true, ret_false = validate_entry(entry, many, val, False, - ret_true, ret_false) - unvalidated = [] - - for entry in incorrect_scalars: - ret_true = False - if not preserve_errors: - out[entry] = False - else: - ret_false = False - msg = 'Value %r was provided as a section' % entry - out[entry] = validator.baseErrorClass(msg) - for entry in incorrect_sections: - ret_true = False - if not preserve_errors: - out[entry] = False - else: - ret_false = False - msg = 'Section %r was provided as a single value' % entry - out[entry] = validator.baseErrorClass(msg) - - # Missing sections will have been created as empty ones when the - # configspec was read. - for entry in section.sections: - # FIXME: this means DEFAULT is not copied in copy mode - if section is self and entry == 'DEFAULT': - continue - if section[entry].configspec is None: - unvalidated.append(entry) - continue - if copy: - section.comments[entry] = configspec.comments.get(entry, []) - section.inline_comments[entry] = configspec.inline_comments.get(entry, '') - check = self.validate(validator, preserve_errors=preserve_errors, copy=copy, section=section[entry]) - out[entry] = check - if check == False: - ret_true = False - elif check == True: - ret_false = False - else: - ret_true = False - - section.extra_values = unvalidated - if preserve_errors and not section._created: - # If the section wasn't created (i.e. it wasn't missing) - # then we can't return False, we need to preserve errors - ret_false = False - # - if ret_false and preserve_errors and out: - # If we are preserving errors, but all - # the failures are from missing sections / values - # then we can return False. Otherwise there is a - # real failure that we need to preserve. - ret_false = not any(out.values()) - if ret_true: - return True - elif ret_false: - return False - return out - - - def reset(self): - """Clear ConfigObj instance and restore to 'freshly created' state.""" - self.clear() - self._initialise() - # FIXME: Should be done by '_initialise', but ConfigObj constructor (and reload) - # requires an empty dictionary - self.configspec = None - # Just to be sure ;-) - self._original_configspec = None - - - def reload(self): - """ - Reload a ConfigObj from file. - - This method raises a ``ReloadError`` if the ConfigObj doesn't have - a filename attribute pointing to a file. - """ - if not isinstance(self.filename, basestring): - raise ReloadError() - - filename = self.filename - current_options = {} - for entry in OPTION_DEFAULTS: - if entry == 'configspec': - continue - current_options[entry] = getattr(self, entry) - - configspec = self._original_configspec - current_options['configspec'] = configspec - - self.clear() - self._initialise(current_options) - self._load(filename, configspec) - - - -class SimpleVal(object): - """ - A simple validator. - Can be used to check that all members expected are present. - - To use it, provide a configspec with all your members in (the value given - will be ignored). Pass an instance of ``SimpleVal`` to the ``validate`` - method of your ``ConfigObj``. ``validate`` will return ``True`` if all - members are present, or a dictionary with True/False meaning - present/missing. (Whole missing sections will be replaced with ``False``) - """ - - def __init__(self): - self.baseErrorClass = ConfigObjError - - def check(self, check, member, missing=False): - """A dummy check method, always returns the value unchanged.""" - if missing: - raise self.baseErrorClass() - return member - - -def flatten_errors(cfg, res, levels=None, results=None): - """ - An example function that will turn a nested dictionary of results - (as returned by ``ConfigObj.validate``) into a flat list. - - ``cfg`` is the ConfigObj instance being checked, ``res`` is the results - dictionary returned by ``validate``. - - (This is a recursive function, so you shouldn't use the ``levels`` or - ``results`` arguments - they are used by the function.) - - Returns a list of keys that failed. Each member of the list is a tuple:: - - ([list of sections...], key, result) - - If ``validate`` was called with ``preserve_errors=False`` (the default) - then ``result`` will always be ``False``. - - *list of sections* is a flattened list of sections that the key was found - in. - - If the section was missing (or a section was expected and a scalar provided - - or vice-versa) then key will be ``None``. - - If the value (or section) was missing then ``result`` will be ``False``. - - If ``validate`` was called with ``preserve_errors=True`` and a value - was present, but failed the check, then ``result`` will be the exception - object returned. You can use this as a string that describes the failure. - - For example *The value "3" is of the wrong type*. - """ - if levels is None: - # first time called - levels = [] - results = [] - if res == True: - return results - if res == False or isinstance(res, Exception): - results.append((levels[:], None, res)) - if levels: - levels.pop() - return results - for (key, val) in res.items(): - if val == True: - continue - if isinstance(cfg.get(key), dict): - # Go down one level - levels.append(key) - flatten_errors(cfg[key], val, levels, results) - continue - results.append((levels[:], key, val)) - # - # Go up one level - if levels: - levels.pop() - # - return results - - -def get_extra_values(conf, _prepend=()): - """ - Find all the values and sections not in the configspec from a validated - ConfigObj. - - ``get_extra_values`` returns a list of tuples where each tuple represents - either an extra section, or an extra value. - - The tuples contain two values, a tuple representing the section the value - is in and the name of the extra values. For extra values in the top level - section the first member will be an empty tuple. For values in the 'foo' - section the first member will be ``('foo',)``. For members in the 'bar' - subsection of the 'foo' section the first member will be ``('foo', 'bar')``. - - NOTE: If you call ``get_extra_values`` on a ConfigObj instance that hasn't - been validated it will return an empty list. - """ - out = [] - - out.extend([(_prepend, name) for name in conf.extra_values]) - for name in conf.sections: - if name not in conf.extra_values: - out.extend(get_extra_values(conf[name], _prepend + (name,))) - return out - - -"""*A programming language is a medium of expression.* - Paul Graham""" diff --git a/python/cdec/sa/__init__.py b/python/cdec/sa/__init__.py deleted file mode 100644 index 8645e837..00000000 --- a/python/cdec/sa/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from _sa import sym_fromstring,\ - SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ - HieroCachingRuleFactory, Sampler -from extractor import GrammarExtractor diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py deleted file mode 100644 index 30e605a6..00000000 --- a/python/cdec/sa/compile.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python -import argparse -import os -import logging -import cdec.configobj -import cdec.sa - -MAX_PHRASE_LENGTH = 4 -def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2): - lcp = cdec.sa.LCP(f_sa) - stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True) - precomp = cdec.sa.Precomputation(from_stats=stats, - fsarray=f_sa, - precompute_rank=rank1, - precompute_secondary_rank=rank2, - max_length=max_len, - max_nonterminals=max_nt, - train_max_initial_size=max_size, - train_min_gap_size=min_gap) - return precomp - -def main(): - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger('cdec.sa.compile') - parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.') - parser.add_argument('--maxnt', '-n', type=int, default=2, - help='Maximum number of non-terminal symbols') - parser.add_argument('--maxlen', '-l', type=int, default=5, - help='Maximum number of terminals') - parser.add_argument('--maxsize', '-s', type=int, default=15, - help='Maximum rule span') - parser.add_argument('--mingap', '-g', type=int, default=1, - help='Minimum gap size') - parser.add_argument('--rank1', '-r1', type=int, default=100, - help='Number of pre-computed frequent patterns') - parser.add_argument('--rank2', '-r2', type=int, default=10, - help='Number of pre-computed super-frequent patterns)') - parser.add_argument('-c', '--config', default='/dev/stdout', - help='Output configuration') - parser.add_argument('-o', '--output', required=True, - help='Output path') - parser.add_argument('-f', '--source', required=True, - help='Source language corpus') - parser.add_argument('-e', '--target', required=True, - help='Target language corpus') - parser.add_argument('-a', '--alignment', required=True, - help='Bitext word alignment') - args = parser.parse_args() - - param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2") - params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2) - - if not os.path.exists(args.output): - os.mkdir(args.output) - - f_sa_bin = os.path.join(args.output, 'f.sa.bin') - e_bin = os.path.join(args.output, 'e.bin') - precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params) - precomp_bin = os.path.join(args.output, precomp_file) - a_bin = os.path.join(args.output, 'a.bin') - lex_bin = os.path.join(args.output, 'lex.bin') - - logger.info('Compiling source suffix array') - f_sa = cdec.sa.SuffixArray(from_text=args.source) - f_sa.write_binary(f_sa_bin) - - logger.info('Compiling target data array') - e = cdec.sa.DataArray(from_text=args.target) - e.write_binary(e_bin) - - logger.info('Precomputing frequent phrases') - precompute(f_sa, *params).write_binary(precomp_bin) - - logger.info('Compiling alignment') - a = cdec.sa.Alignment(from_text=args.alignment) - a.write_binary(a_bin) - - logger.info('Compiling bilexical dictionary') - lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa) - lex.write_binary(lex_bin) - - # Write configuration - config = cdec.configobj.ConfigObj(args.config, unrepr=True) - config['f_sa_file'] = f_sa_bin - config['e_file'] = e_bin - config['a_file'] = a_bin - config['lex_file'] = lex_bin - config['precompute_file'] = precomp_bin - for name, value in zip(param_names, params): - config[name] = value - config.write() - -if __name__ == '__main__': - main() diff --git a/python/cdec/sa/extract.py b/python/cdec/sa/extract.py deleted file mode 100644 index 918aa3bb..00000000 --- a/python/cdec/sa/extract.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -import sys -import os -import argparse -import logging -import cdec.sa - -def main(): - logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') - parser.add_argument('-c', '--config', required=True, - help='Extractor configuration') - parser.add_argument('-g', '--grammars', required=True, - help='Grammar output path') - args = parser.parse_args() - - if not os.path.exists(args.grammars): - os.mkdir(args.grammars) - - extractor = cdec.sa.GrammarExtractor(args.config) - for i, sentence in enumerate(sys.stdin): - sentence = sentence[:-1] - grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i)) - with open(grammar_file, 'w') as output: - for rule in extractor.grammar(sentence): - output.write(str(rule)+'\n') - grammar_file = os.path.abspath(grammar_file) - print('{1}'.format(grammar_file, sentence)) - -if __name__ == '__main__': - main() diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py deleted file mode 100644 index bb912e16..00000000 --- a/python/cdec/sa/extractor.py +++ /dev/null @@ -1,78 +0,0 @@ -from itertools import chain -import os -import cdec.configobj -from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ - MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE -import cdec.sa - -# maximum span of a grammar rule in TEST DATA -MAX_INITIAL_SIZE = 15 - -class GrammarExtractor: - def __init__(self, config): - if isinstance(config, str) or isinstance(config, unicode): - if not os.path.exists(config): - raise IOError('cannot read configuration from {0}'.format(config)) - config = cdec.configobj.ConfigObj(config, unrepr=True) - alignment = cdec.sa.Alignment(from_binary=config['a_file']) - self.factory = cdec.sa.HieroCachingRuleFactory( - # compiled alignment object (REQUIRED) - alignment, - # name of generic nonterminal used by Hiero - category="[X]", - # maximum number of contiguous chunks of terminal symbols in RHS of a rule - max_chunks=config['max_nt']+1, - # maximum span of a grammar rule in TEST DATA - max_initial_size=MAX_INITIAL_SIZE, - # maximum number of symbols (both T and NT) allowed in a rule - max_length=config['max_len'], - # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_nonterminals=config['max_nt'], - # maximum number of contiguous chunks of terminal symbols - # in target-side RHS of a rule. - max_target_chunks=config['max_nt']+1, - # maximum number of target side symbols (both T and NT) allowed in a rule. - max_target_length=MAX_INITIAL_SIZE, - # minimum span of a nonterminal in the RHS of a rule in TEST DATA - min_gap_size=1, - # filename of file containing precomputed collocations - precompute_file=config['precompute_file'], - # maximum frequency rank of patterns used to compute triples (< 20) - precompute_secondary_rank=config['rank2'], - # maximum frequency rank of patterns used to compute collocations (< 300) - precompute_rank=config['rank1'], - # require extracted rules to have at least one aligned word - require_aligned_terminal=True, - # require each contiguous chunk of extracted rules - # to have at least one aligned word - require_aligned_chunks=False, - # maximum span of a grammar rule extracted from TRAINING DATA - train_max_initial_size=config['max_size'], - # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - train_min_gap_size=config['min_gap'], - # True if phrases should be tight, False otherwise (better but slower) - tight_phrases=True, - ) - - # lexical weighting tables - tt = cdec.sa.BiLex(from_binary=config['lex_file']) - - self.models = (EgivenFCoherent, SampleCountF, CountEF, - MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) - - fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) - edarray = cdec.sa.DataArray(from_binary=config['e_file']) - - # lower=faster, higher=better; improvements level off above 200-300 range, - # -1 = don't sample, use all data (VERY SLOW!) - sampler = cdec.sa.Sampler(300, fsarray) - - self.factory.configure(fsarray, edarray, sampler) - - def grammar(self, sentence): - if isinstance(sentence, unicode): - sentence = sentence.encode('utf8') - cnet = chain(('',), sentence.split(), ('',)) - cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet) - cnet = tuple(((word, None, 1), ) for word in cnet) - return self.factory.input(cnet, self.models) diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py deleted file mode 100644 index 325b9e13..00000000 --- a/python/cdec/sa/features.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import division -import math - -MAXSCORE = 99 - -def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) - return -math.log10(paircount/fcount) - -def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + paircount) - -def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + fsample_count) - -def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): - prob = paircount/fsample_count - return -math.log10(prob) if prob > 0 else MAXSCORE - -def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): - return -math.log10(fcount/fsample_count) - -def MaxLexEgivenF(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = fphrase.words - fwords.append('NULL') - def score(): - for e in ephrase.words: - maxScore = max(ttable.get_score(f, e, 0) for f in fwords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) - return feature - -def MaxLexFgivenE(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - ewords = ephrase.words - ewords.append('NULL') - def score(): - for f in fphrase.words: - maxScore = max(ttable.get_score(f, e, 1) for e in ewords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) - return feature - -def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount == 1) - -def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount == 1) - -def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount > 1) - -def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 1) - -def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) diff --git a/python/cdec/score.py b/python/cdec/score.py deleted file mode 100644 index 22257774..00000000 --- a/python/cdec/score.py +++ /dev/null @@ -1 +0,0 @@ -from _cdec import BLEU, TER, CER, Metric diff --git a/python/pkg/cdec/__init__.py b/python/pkg/cdec/__init__.py new file mode 100644 index 00000000..503ac787 --- /dev/null +++ b/python/pkg/cdec/__init__.py @@ -0,0 +1 @@ +from cdec._cdec import Decoder, Lattice, TRule, NT, NTRef, ParseFailed, InvalidConfig diff --git a/python/pkg/cdec/configobj.py b/python/pkg/cdec/configobj.py new file mode 100644 index 00000000..c1f6e6df --- /dev/null +++ b/python/pkg/cdec/configobj.py @@ -0,0 +1,2468 @@ +# configobj.py +# A config file reader/writer that supports nested sections in config files. +# Copyright (C) 2005-2010 Michael Foord, Nicola Larosa +# E-mail: fuzzyman AT voidspace DOT org DOT uk +# nico AT tekNico DOT net + +# ConfigObj 4 +# http://www.voidspace.org.uk/python/configobj.html + +# Released subject to the BSD License +# Please see http://www.voidspace.org.uk/python/license.shtml + +# Scripts maintained at http://www.voidspace.org.uk/python/index.shtml +# For information about bugfixes, updates and support, please join the +# ConfigObj mailing list: +# http://lists.sourceforge.net/lists/listinfo/configobj-develop +# Comments, suggestions and bug reports welcome. + +from __future__ import generators + +import os +import re +import sys + +from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF16_BE, BOM_UTF16_LE + + +# imported lazily to avoid startup performance hit if it isn't used +compiler = None + +# A dictionary mapping BOM to +# the encoding to decode with, and what to set the +# encoding attribute to. +BOMS = { + BOM_UTF8: ('utf_8', None), + BOM_UTF16_BE: ('utf16_be', 'utf_16'), + BOM_UTF16_LE: ('utf16_le', 'utf_16'), + BOM_UTF16: ('utf_16', 'utf_16'), + } +# All legal variants of the BOM codecs. +# TODO: the list of aliases is not meant to be exhaustive, is there a +# better way ? +BOM_LIST = { + 'utf_16': 'utf_16', + 'u16': 'utf_16', + 'utf16': 'utf_16', + 'utf-16': 'utf_16', + 'utf16_be': 'utf16_be', + 'utf_16_be': 'utf16_be', + 'utf-16be': 'utf16_be', + 'utf16_le': 'utf16_le', + 'utf_16_le': 'utf16_le', + 'utf-16le': 'utf16_le', + 'utf_8': 'utf_8', + 'u8': 'utf_8', + 'utf': 'utf_8', + 'utf8': 'utf_8', + 'utf-8': 'utf_8', + } + +# Map of encodings to the BOM to write. +BOM_SET = { + 'utf_8': BOM_UTF8, + 'utf_16': BOM_UTF16, + 'utf16_be': BOM_UTF16_BE, + 'utf16_le': BOM_UTF16_LE, + None: BOM_UTF8 + } + + +def match_utf8(encoding): + return BOM_LIST.get(encoding.lower()) == 'utf_8' + + +# Quote strings used for writing values +squot = "'%s'" +dquot = '"%s"' +noquot = "%s" +wspace_plus = ' \r\n\v\t\'"' +tsquot = '"""%s"""' +tdquot = "'''%s'''" + +# Sentinel for use in getattr calls to replace hasattr +MISSING = object() + +__version__ = '4.7.2' + +try: + any +except NameError: + def any(iterable): + for entry in iterable: + if entry: + return True + return False + + +__all__ = ( + '__version__', + 'DEFAULT_INDENT_TYPE', + 'DEFAULT_INTERPOLATION', + 'ConfigObjError', + 'NestingError', + 'ParseError', + 'DuplicateError', + 'ConfigspecError', + 'ConfigObj', + 'SimpleVal', + 'InterpolationError', + 'InterpolationLoopError', + 'MissingInterpolationOption', + 'RepeatSectionError', + 'ReloadError', + 'UnreprError', + 'UnknownType', + 'flatten_errors', + 'get_extra_values' +) + +DEFAULT_INTERPOLATION = 'configparser' +DEFAULT_INDENT_TYPE = ' ' +MAX_INTERPOL_DEPTH = 10 + +OPTION_DEFAULTS = { + 'interpolation': True, + 'raise_errors': False, + 'list_values': True, + 'create_empty': False, + 'file_error': False, + 'configspec': None, + 'stringify': True, + # option may be set to one of ('', ' ', '\t') + 'indent_type': None, + 'encoding': None, + 'default_encoding': None, + 'unrepr': False, + 'write_empty_values': False, +} + + + +def getObj(s): + global compiler + if compiler is None: + import compiler + s = "a=" + s + p = compiler.parse(s) + return p.getChildren()[1].getChildren()[0].getChildren()[1] + + +class UnknownType(Exception): + pass + + +class Builder(object): + + def build(self, o): + m = getattr(self, 'build_' + o.__class__.__name__, None) + if m is None: + raise UnknownType(o.__class__.__name__) + return m(o) + + def build_List(self, o): + return map(self.build, o.getChildren()) + + def build_Const(self, o): + return o.value + + def build_Dict(self, o): + d = {} + i = iter(map(self.build, o.getChildren())) + for el in i: + d[el] = i.next() + return d + + def build_Tuple(self, o): + return tuple(self.build_List(o)) + + def build_Name(self, o): + if o.name == 'None': + return None + if o.name == 'True': + return True + if o.name == 'False': + return False + + # An undefined Name + raise UnknownType('Undefined Name') + + def build_Add(self, o): + real, imag = map(self.build_Const, o.getChildren()) + try: + real = float(real) + except TypeError: + raise UnknownType('Add') + if not isinstance(imag, complex) or imag.real != 0.0: + raise UnknownType('Add') + return real+imag + + def build_Getattr(self, o): + parent = self.build(o.expr) + return getattr(parent, o.attrname) + + def build_UnarySub(self, o): + return -self.build_Const(o.getChildren()[0]) + + def build_UnaryAdd(self, o): + return self.build_Const(o.getChildren()[0]) + + +_builder = Builder() + + +def unrepr(s): + if not s: + return s + return _builder.build(getObj(s)) + + + +class ConfigObjError(SyntaxError): + """ + This is the base class for all errors that ConfigObj raises. + It is a subclass of SyntaxError. + """ + def __init__(self, message='', line_number=None, line=''): + self.line = line + self.line_number = line_number + SyntaxError.__init__(self, message) + + +class NestingError(ConfigObjError): + """ + This error indicates a level of nesting that doesn't match. + """ + + +class ParseError(ConfigObjError): + """ + This error indicates that a line is badly written. + It is neither a valid ``key = value`` line, + nor a valid section marker line. + """ + + +class ReloadError(IOError): + """ + A 'reload' operation failed. + This exception is a subclass of ``IOError``. + """ + def __init__(self): + IOError.__init__(self, 'reload failed, filename is not set.') + + +class DuplicateError(ConfigObjError): + """ + The keyword or section specified already exists. + """ + + +class ConfigspecError(ConfigObjError): + """ + An error occured whilst parsing a configspec. + """ + + +class InterpolationError(ConfigObjError): + """Base class for the two interpolation errors.""" + + +class InterpolationLoopError(InterpolationError): + """Maximum interpolation depth exceeded in string interpolation.""" + + def __init__(self, option): + InterpolationError.__init__( + self, + 'interpolation loop detected in value "%s".' % option) + + +class RepeatSectionError(ConfigObjError): + """ + This error indicates additional sections in a section with a + ``__many__`` (repeated) section. + """ + + +class MissingInterpolationOption(InterpolationError): + """A value specified for interpolation was missing.""" + def __init__(self, option): + msg = 'missing option "%s" in interpolation.' % option + InterpolationError.__init__(self, msg) + + +class UnreprError(ConfigObjError): + """An error parsing in unrepr mode.""" + + + +class InterpolationEngine(object): + """ + A helper class to help perform string interpolation. + + This class is an abstract base class; its descendants perform + the actual work. + """ + + # compiled regexp to use in self.interpolate() + _KEYCRE = re.compile(r"%\(([^)]*)\)s") + _cookie = '%' + + def __init__(self, section): + # the Section instance that "owns" this engine + self.section = section + + + def interpolate(self, key, value): + # short-cut + if not self._cookie in value: + return value + + def recursive_interpolate(key, value, section, backtrail): + """The function that does the actual work. + + ``value``: the string we're trying to interpolate. + ``section``: the section in which that string was found + ``backtrail``: a dict to keep track of where we've been, + to detect and prevent infinite recursion loops + + This is similar to a depth-first-search algorithm. + """ + # Have we been here already? + if (key, section.name) in backtrail: + # Yes - infinite loop detected + raise InterpolationLoopError(key) + # Place a marker on our backtrail so we won't come back here again + backtrail[(key, section.name)] = 1 + + # Now start the actual work + match = self._KEYCRE.search(value) + while match: + # The actual parsing of the match is implementation-dependent, + # so delegate to our helper function + k, v, s = self._parse_match(match) + if k is None: + # That's the signal that no further interpolation is needed + replacement = v + else: + # Further interpolation may be needed to obtain final value + replacement = recursive_interpolate(k, v, s, backtrail) + # Replace the matched string with its final value + start, end = match.span() + value = ''.join((value[:start], replacement, value[end:])) + new_search_start = start + len(replacement) + # Pick up the next interpolation key, if any, for next time + # through the while loop + match = self._KEYCRE.search(value, new_search_start) + + # Now safe to come back here again; remove marker from backtrail + del backtrail[(key, section.name)] + + return value + + # Back in interpolate(), all we have to do is kick off the recursive + # function with appropriate starting values + value = recursive_interpolate(key, value, self.section, {}) + return value + + + def _fetch(self, key): + """Helper function to fetch values from owning section. + + Returns a 2-tuple: the value, and the section where it was found. + """ + # switch off interpolation before we try and fetch anything ! + save_interp = self.section.main.interpolation + self.section.main.interpolation = False + + # Start at section that "owns" this InterpolationEngine + current_section = self.section + while True: + # try the current section first + val = current_section.get(key) + if val is not None and not isinstance(val, Section): + break + # try "DEFAULT" next + val = current_section.get('DEFAULT', {}).get(key) + if val is not None and not isinstance(val, Section): + break + # move up to parent and try again + # top-level's parent is itself + if current_section.parent is current_section: + # reached top level, time to give up + break + current_section = current_section.parent + + # restore interpolation to previous value before returning + self.section.main.interpolation = save_interp + if val is None: + raise MissingInterpolationOption(key) + return val, current_section + + + def _parse_match(self, match): + """Implementation-dependent helper function. + + Will be passed a match object corresponding to the interpolation + key we just found (e.g., "%(foo)s" or "$foo"). Should look up that + key in the appropriate config file section (using the ``_fetch()`` + helper function) and return a 3-tuple: (key, value, section) + + ``key`` is the name of the key we're looking for + ``value`` is the value found for that key + ``section`` is a reference to the section where it was found + + ``key`` and ``section`` should be None if no further + interpolation should be performed on the resulting value + (e.g., if we interpolated "$$" and returned "$"). + """ + raise NotImplementedError() + + + +class ConfigParserInterpolation(InterpolationEngine): + """Behaves like ConfigParser.""" + _cookie = '%' + _KEYCRE = re.compile(r"%\(([^)]*)\)s") + + def _parse_match(self, match): + key = match.group(1) + value, section = self._fetch(key) + return key, value, section + + + +class TemplateInterpolation(InterpolationEngine): + """Behaves like string.Template.""" + _cookie = '$' + _delimiter = '$' + _KEYCRE = re.compile(r""" + \$(?: + (?P\$) | # Two $ signs + (?P[_a-z][_a-z0-9]*) | # $name format + {(?P[^}]*)} # ${name} format + ) + """, re.IGNORECASE | re.VERBOSE) + + def _parse_match(self, match): + # Valid name (in or out of braces): fetch value from section + key = match.group('named') or match.group('braced') + if key is not None: + value, section = self._fetch(key) + return key, value, section + # Escaped delimiter (e.g., $$): return single delimiter + if match.group('escaped') is not None: + # Return None for key and section to indicate it's time to stop + return None, self._delimiter, None + # Anything else: ignore completely, just return it unchanged + return None, match.group(), None + + +interpolation_engines = { + 'configparser': ConfigParserInterpolation, + 'template': TemplateInterpolation, +} + + +def __newobj__(cls, *args): + # Hack for pickle + return cls.__new__(cls, *args) + +class Section(dict): + """ + A dictionary-like object that represents a section in a config file. + + It does string interpolation if the 'interpolation' attribute + of the 'main' object is set to True. + + Interpolation is tried first from this object, then from the 'DEFAULT' + section of this object, next from the parent and its 'DEFAULT' section, + and so on until the main object is reached. + + A Section will behave like an ordered dictionary - following the + order of the ``scalars`` and ``sections`` attributes. + You can use this to change the order of members. + + Iteration follows the order: scalars, then sections. + """ + + + def __setstate__(self, state): + dict.update(self, state[0]) + self.__dict__.update(state[1]) + + def __reduce__(self): + state = (dict(self), self.__dict__) + return (__newobj__, (self.__class__,), state) + + + def __init__(self, parent, depth, main, indict=None, name=None): + """ + * parent is the section above + * depth is the depth level of this section + * main is the main ConfigObj + * indict is a dictionary to initialise the section with + """ + if indict is None: + indict = {} + dict.__init__(self) + # used for nesting level *and* interpolation + self.parent = parent + # used for the interpolation attribute + self.main = main + # level of nesting depth of this Section + self.depth = depth + # purely for information + self.name = name + # + self._initialise() + # we do this explicitly so that __setitem__ is used properly + # (rather than just passing to ``dict.__init__``) + for entry, value in indict.iteritems(): + self[entry] = value + + + def _initialise(self): + # the sequence of scalar values in this Section + self.scalars = [] + # the sequence of sections in this Section + self.sections = [] + # for comments :-) + self.comments = {} + self.inline_comments = {} + # the configspec + self.configspec = None + # for defaults + self.defaults = [] + self.default_values = {} + self.extra_values = [] + self._created = False + + + def _interpolate(self, key, value): + try: + # do we already have an interpolation engine? + engine = self._interpolation_engine + except AttributeError: + # not yet: first time running _interpolate(), so pick the engine + name = self.main.interpolation + if name == True: # note that "if name:" would be incorrect here + # backwards-compatibility: interpolation=True means use default + name = DEFAULT_INTERPOLATION + name = name.lower() # so that "Template", "template", etc. all work + class_ = interpolation_engines.get(name, None) + if class_ is None: + # invalid value for self.main.interpolation + self.main.interpolation = False + return value + else: + # save reference to engine so we don't have to do this again + engine = self._interpolation_engine = class_(self) + # let the engine do the actual work + return engine.interpolate(key, value) + + + def __getitem__(self, key): + """Fetch the item and do string interpolation.""" + val = dict.__getitem__(self, key) + if self.main.interpolation: + if isinstance(val, basestring): + return self._interpolate(key, val) + if isinstance(val, list): + def _check(entry): + if isinstance(entry, basestring): + return self._interpolate(key, entry) + return entry + new = [_check(entry) for entry in val] + if new != val: + return new + return val + + + def __setitem__(self, key, value, unrepr=False): + """ + Correctly set a value. + + Making dictionary values Section instances. + (We have to special case 'Section' instances - which are also dicts) + + Keys must be strings. + Values need only be strings (or lists of strings) if + ``main.stringify`` is set. + + ``unrepr`` must be set when setting a value to a dictionary, without + creating a new sub-section. + """ + if not isinstance(key, basestring): + raise ValueError('The key "%s" is not a string.' % key) + + # add the comment + if key not in self.comments: + self.comments[key] = [] + self.inline_comments[key] = '' + # remove the entry from defaults + if key in self.defaults: + self.defaults.remove(key) + # + if isinstance(value, Section): + if key not in self: + self.sections.append(key) + dict.__setitem__(self, key, value) + elif isinstance(value, dict) and not unrepr: + # First create the new depth level, + # then create the section + if key not in self: + self.sections.append(key) + new_depth = self.depth + 1 + dict.__setitem__( + self, + key, + Section( + self, + new_depth, + self.main, + indict=value, + name=key)) + else: + if key not in self: + self.scalars.append(key) + if not self.main.stringify: + if isinstance(value, basestring): + pass + elif isinstance(value, (list, tuple)): + for entry in value: + if not isinstance(entry, basestring): + raise TypeError('Value is not a string "%s".' % entry) + else: + raise TypeError('Value is not a string "%s".' % value) + dict.__setitem__(self, key, value) + + + def __delitem__(self, key): + """Remove items from the sequence when deleting.""" + dict. __delitem__(self, key) + if key in self.scalars: + self.scalars.remove(key) + else: + self.sections.remove(key) + del self.comments[key] + del self.inline_comments[key] + + + def get(self, key, default=None): + """A version of ``get`` that doesn't bypass string interpolation.""" + try: + return self[key] + except KeyError: + return default + + + def update(self, indict): + """ + A version of update that uses our ``__setitem__``. + """ + for entry in indict: + self[entry] = indict[entry] + + + def pop(self, key, default=MISSING): + """ + 'D.pop(k[,d]) -> v, remove specified key and return the corresponding value. + If key is not found, d is returned if given, otherwise KeyError is raised' + """ + try: + val = self[key] + except KeyError: + if default is MISSING: + raise + val = default + else: + del self[key] + return val + + + def popitem(self): + """Pops the first (key,val)""" + sequence = (self.scalars + self.sections) + if not sequence: + raise KeyError(": 'popitem(): dictionary is empty'") + key = sequence[0] + val = self[key] + del self[key] + return key, val + + + def clear(self): + """ + A version of clear that also affects scalars/sections + Also clears comments and configspec. + + Leaves other attributes alone : + depth/main/parent are not affected + """ + dict.clear(self) + self.scalars = [] + self.sections = [] + self.comments = {} + self.inline_comments = {} + self.configspec = None + self.defaults = [] + self.extra_values = [] + + + def setdefault(self, key, default=None): + """A version of setdefault that sets sequence if appropriate.""" + try: + return self[key] + except KeyError: + self[key] = default + return self[key] + + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return zip((self.scalars + self.sections), self.values()) + + + def keys(self): + """D.keys() -> list of D's keys""" + return (self.scalars + self.sections) + + + def values(self): + """D.values() -> list of D's values""" + return [self[key] for key in (self.scalars + self.sections)] + + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + return iter(self.items()) + + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter((self.scalars + self.sections)) + + __iter__ = iterkeys + + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + return iter(self.values()) + + + def __repr__(self): + """x.__repr__() <==> repr(x)""" + def _getval(key): + try: + return self[key] + except MissingInterpolationOption: + return dict.__getitem__(self, key) + return '{%s}' % ', '.join([('%s: %s' % (repr(key), repr(_getval(key)))) + for key in (self.scalars + self.sections)]) + + __str__ = __repr__ + __str__.__doc__ = "x.__str__() <==> str(x)" + + + # Extra methods - not in a normal dictionary + + def dict(self): + """ + Return a deepcopy of self as a dictionary. + + All members that are ``Section`` instances are recursively turned to + ordinary dictionaries - by calling their ``dict`` method. + + >>> n = a.dict() + >>> n == a + 1 + >>> n is a + 0 + """ + newdict = {} + for entry in self: + this_entry = self[entry] + if isinstance(this_entry, Section): + this_entry = this_entry.dict() + elif isinstance(this_entry, list): + # create a copy rather than a reference + this_entry = list(this_entry) + elif isinstance(this_entry, tuple): + # create a copy rather than a reference + this_entry = tuple(this_entry) + newdict[entry] = this_entry + return newdict + + + def merge(self, indict): + """ + A recursive update - useful for merging config files. + + >>> a = '''[section1] + ... option1 = True + ... [[subsection]] + ... more_options = False + ... # end of file'''.splitlines() + >>> b = '''# File is user.ini + ... [section1] + ... option1 = False + ... # end of file'''.splitlines() + >>> c1 = ConfigObj(b) + >>> c2 = ConfigObj(a) + >>> c2.merge(c1) + >>> c2 + ConfigObj({'section1': {'option1': 'False', 'subsection': {'more_options': 'False'}}}) + """ + for key, val in indict.items(): + if (key in self and isinstance(self[key], dict) and + isinstance(val, dict)): + self[key].merge(val) + else: + self[key] = val + + + def rename(self, oldkey, newkey): + """ + Change a keyname to another, without changing position in sequence. + + Implemented so that transformations can be made on keys, + as well as on values. (used by encode and decode) + + Also renames comments. + """ + if oldkey in self.scalars: + the_list = self.scalars + elif oldkey in self.sections: + the_list = self.sections + else: + raise KeyError('Key "%s" not found.' % oldkey) + pos = the_list.index(oldkey) + # + val = self[oldkey] + dict.__delitem__(self, oldkey) + dict.__setitem__(self, newkey, val) + the_list.remove(oldkey) + the_list.insert(pos, newkey) + comm = self.comments[oldkey] + inline_comment = self.inline_comments[oldkey] + del self.comments[oldkey] + del self.inline_comments[oldkey] + self.comments[newkey] = comm + self.inline_comments[newkey] = inline_comment + + + def walk(self, function, raise_errors=True, + call_on_sections=False, **keywargs): + """ + Walk every member and call a function on the keyword and value. + + Return a dictionary of the return values + + If the function raises an exception, raise the errror + unless ``raise_errors=False``, in which case set the return value to + ``False``. + + Any unrecognised keyword arguments you pass to walk, will be pased on + to the function you pass in. + + Note: if ``call_on_sections`` is ``True`` then - on encountering a + subsection, *first* the function is called for the *whole* subsection, + and then recurses into it's members. This means your function must be + able to handle strings, dictionaries and lists. This allows you + to change the key of subsections as well as for ordinary members. The + return value when called on the whole subsection has to be discarded. + + See the encode and decode methods for examples, including functions. + + .. admonition:: caution + + You can use ``walk`` to transform the names of members of a section + but you mustn't add or delete members. + + >>> config = '''[XXXXsection] + ... XXXXkey = XXXXvalue'''.splitlines() + >>> cfg = ConfigObj(config) + >>> cfg + ConfigObj({'XXXXsection': {'XXXXkey': 'XXXXvalue'}}) + >>> def transform(section, key): + ... val = section[key] + ... newkey = key.replace('XXXX', 'CLIENT1') + ... section.rename(key, newkey) + ... if isinstance(val, (tuple, list, dict)): + ... pass + ... else: + ... val = val.replace('XXXX', 'CLIENT1') + ... section[newkey] = val + >>> cfg.walk(transform, call_on_sections=True) + {'CLIENT1section': {'CLIENT1key': None}} + >>> cfg + ConfigObj({'CLIENT1section': {'CLIENT1key': 'CLIENT1value'}}) + """ + out = {} + # scalars first + for i in range(len(self.scalars)): + entry = self.scalars[i] + try: + val = function(self, entry, **keywargs) + # bound again in case name has changed + entry = self.scalars[i] + out[entry] = val + except Exception: + if raise_errors: + raise + else: + entry = self.scalars[i] + out[entry] = False + # then sections + for i in range(len(self.sections)): + entry = self.sections[i] + if call_on_sections: + try: + function(self, entry, **keywargs) + except Exception: + if raise_errors: + raise + else: + entry = self.sections[i] + out[entry] = False + # bound again in case name has changed + entry = self.sections[i] + # previous result is discarded + out[entry] = self[entry].walk( + function, + raise_errors=raise_errors, + call_on_sections=call_on_sections, + **keywargs) + return out + + + def as_bool(self, key): + """ + Accepts a key as input. The corresponding value must be a string or + the objects (``True`` or 1) or (``False`` or 0). We allow 0 and 1 to + retain compatibility with Python 2.2. + + If the string is one of ``True``, ``On``, ``Yes``, or ``1`` it returns + ``True``. + + If the string is one of ``False``, ``Off``, ``No``, or ``0`` it returns + ``False``. + + ``as_bool`` is not case sensitive. + + Any other input will raise a ``ValueError``. + + >>> a = ConfigObj() + >>> a['a'] = 'fish' + >>> a.as_bool('a') + Traceback (most recent call last): + ValueError: Value "fish" is neither True nor False + >>> a['b'] = 'True' + >>> a.as_bool('b') + 1 + >>> a['b'] = 'off' + >>> a.as_bool('b') + 0 + """ + val = self[key] + if val == True: + return True + elif val == False: + return False + else: + try: + if not isinstance(val, basestring): + # TODO: Why do we raise a KeyError here? + raise KeyError() + else: + return self.main._bools[val.lower()] + except KeyError: + raise ValueError('Value "%s" is neither True nor False' % val) + + + def as_int(self, key): + """ + A convenience method which coerces the specified value to an integer. + + If the value is an invalid literal for ``int``, a ``ValueError`` will + be raised. + + >>> a = ConfigObj() + >>> a['a'] = 'fish' + >>> a.as_int('a') + Traceback (most recent call last): + ValueError: invalid literal for int() with base 10: 'fish' + >>> a['b'] = '1' + >>> a.as_int('b') + 1 + >>> a['b'] = '3.2' + >>> a.as_int('b') + Traceback (most recent call last): + ValueError: invalid literal for int() with base 10: '3.2' + """ + return int(self[key]) + + + def as_float(self, key): + """ + A convenience method which coerces the specified value to a float. + + If the value is an invalid literal for ``float``, a ``ValueError`` will + be raised. + + >>> a = ConfigObj() + >>> a['a'] = 'fish' + >>> a.as_float('a') + Traceback (most recent call last): + ValueError: invalid literal for float(): fish + >>> a['b'] = '1' + >>> a.as_float('b') + 1.0 + >>> a['b'] = '3.2' + >>> a.as_float('b') + 3.2000000000000002 + """ + return float(self[key]) + + + def as_list(self, key): + """ + A convenience method which fetches the specified value, guaranteeing + that it is a list. + + >>> a = ConfigObj() + >>> a['a'] = 1 + >>> a.as_list('a') + [1] + >>> a['a'] = (1,) + >>> a.as_list('a') + [1] + >>> a['a'] = [1] + >>> a.as_list('a') + [1] + """ + result = self[key] + if isinstance(result, (tuple, list)): + return list(result) + return [result] + + + def restore_default(self, key): + """ + Restore (and return) default value for the specified key. + + This method will only work for a ConfigObj that was created + with a configspec and has been validated. + + If there is no default value for this key, ``KeyError`` is raised. + """ + default = self.default_values[key] + dict.__setitem__(self, key, default) + if key not in self.defaults: + self.defaults.append(key) + return default + + + def restore_defaults(self): + """ + Recursively restore default values to all members + that have them. + + This method will only work for a ConfigObj that was created + with a configspec and has been validated. + + It doesn't delete or modify entries without default values. + """ + for key in self.default_values: + self.restore_default(key) + + for section in self.sections: + self[section].restore_defaults() + + +class ConfigObj(Section): + """An object to read, create, and write config files.""" + + _keyword = re.compile(r'''^ # line start + (\s*) # indentation + ( # keyword + (?:".*?")| # double quotes + (?:'.*?')| # single quotes + (?:[^'"=].*?) # no quotes + ) + \s*=\s* # divider + (.*) # value (including list values and comments) + $ # line end + ''', + re.VERBOSE) + + _sectionmarker = re.compile(r'''^ + (\s*) # 1: indentation + ((?:\[\s*)+) # 2: section marker open + ( # 3: section name open + (?:"\s*\S.*?\s*")| # at least one non-space with double quotes + (?:'\s*\S.*?\s*')| # at least one non-space with single quotes + (?:[^'"\s].*?) # at least one non-space unquoted + ) # section name close + ((?:\s*\])+) # 4: section marker close + \s*(\#.*)? # 5: optional comment + $''', + re.VERBOSE) + + # this regexp pulls list values out as a single string + # or single values and comments + # FIXME: this regex adds a '' to the end of comma terminated lists + # workaround in ``_handle_value`` + _valueexp = re.compile(r'''^ + (?: + (?: + ( + (?: + (?: + (?:".*?")| # double quotes + (?:'.*?')| # single quotes + (?:[^'",\#][^,\#]*?) # unquoted + ) + \s*,\s* # comma + )* # match all list items ending in a comma (if any) + ) + ( + (?:".*?")| # double quotes + (?:'.*?')| # single quotes + (?:[^'",\#\s][^,]*?)| # unquoted + (?:(? 1: + msg = "Parsing failed with several errors.\nFirst error %s" % info + error = ConfigObjError(msg) + else: + error = self._errors[0] + # set the errors attribute; it's a list of tuples: + # (error_type, message, line_number) + error.errors = self._errors + # set the config attribute + error.config = self + raise error + # delete private attributes + del self._errors + + if configspec is None: + self.configspec = None + else: + self._handle_configspec(configspec) + + + def _initialise(self, options=None): + if options is None: + options = OPTION_DEFAULTS + + # initialise a few variables + self.filename = None + self._errors = [] + self.raise_errors = options['raise_errors'] + self.interpolation = options['interpolation'] + self.list_values = options['list_values'] + self.create_empty = options['create_empty'] + self.file_error = options['file_error'] + self.stringify = options['stringify'] + self.indent_type = options['indent_type'] + self.encoding = options['encoding'] + self.default_encoding = options['default_encoding'] + self.BOM = False + self.newlines = None + self.write_empty_values = options['write_empty_values'] + self.unrepr = options['unrepr'] + + self.initial_comment = [] + self.final_comment = [] + self.configspec = None + + if self._inspec: + self.list_values = False + + # Clear section attributes as well + Section._initialise(self) + + + def __repr__(self): + def _getval(key): + try: + return self[key] + except MissingInterpolationOption: + return dict.__getitem__(self, key) + return ('ConfigObj({%s})' % + ', '.join([('%s: %s' % (repr(key), repr(_getval(key)))) + for key in (self.scalars + self.sections)])) + + + def _handle_bom(self, infile): + """ + Handle any BOM, and decode if necessary. + + If an encoding is specified, that *must* be used - but the BOM should + still be removed (and the BOM attribute set). + + (If the encoding is wrongly specified, then a BOM for an alternative + encoding won't be discovered or removed.) + + If an encoding is not specified, UTF8 or UTF16 BOM will be detected and + removed. The BOM attribute will be set. UTF16 will be decoded to + unicode. + + NOTE: This method must not be called with an empty ``infile``. + + Specifying the *wrong* encoding is likely to cause a + ``UnicodeDecodeError``. + + ``infile`` must always be returned as a list of lines, but may be + passed in as a single string. + """ + if ((self.encoding is not None) and + (self.encoding.lower() not in BOM_LIST)): + # No need to check for a BOM + # the encoding specified doesn't have one + # just decode + return self._decode(infile, self.encoding) + + if isinstance(infile, (list, tuple)): + line = infile[0] + else: + line = infile + if self.encoding is not None: + # encoding explicitly supplied + # And it could have an associated BOM + # TODO: if encoding is just UTF16 - we ought to check for both + # TODO: big endian and little endian versions. + enc = BOM_LIST[self.encoding.lower()] + if enc == 'utf_16': + # For UTF16 we try big endian and little endian + for BOM, (encoding, final_encoding) in BOMS.items(): + if not final_encoding: + # skip UTF8 + continue + if infile.startswith(BOM): + ### BOM discovered + ##self.BOM = True + # Don't need to remove BOM + return self._decode(infile, encoding) + + # If we get this far, will *probably* raise a DecodeError + # As it doesn't appear to start with a BOM + return self._decode(infile, self.encoding) + + # Must be UTF8 + BOM = BOM_SET[enc] + if not line.startswith(BOM): + return self._decode(infile, self.encoding) + + newline = line[len(BOM):] + + # BOM removed + if isinstance(infile, (list, tuple)): + infile[0] = newline + else: + infile = newline + self.BOM = True + return self._decode(infile, self.encoding) + + # No encoding specified - so we need to check for UTF8/UTF16 + for BOM, (encoding, final_encoding) in BOMS.items(): + if not line.startswith(BOM): + continue + else: + # BOM discovered + self.encoding = final_encoding + if not final_encoding: + self.BOM = True + # UTF8 + # remove BOM + newline = line[len(BOM):] + if isinstance(infile, (list, tuple)): + infile[0] = newline + else: + infile = newline + # UTF8 - don't decode + if isinstance(infile, basestring): + return infile.splitlines(True) + else: + return infile + # UTF16 - have to decode + return self._decode(infile, encoding) + + # No BOM discovered and no encoding specified, just return + if isinstance(infile, basestring): + # infile read from a file will be a single string + return infile.splitlines(True) + return infile + + + def _a_to_u(self, aString): + """Decode ASCII strings to unicode if a self.encoding is specified.""" + if self.encoding: + return aString.decode('ascii') + else: + return aString + + + def _decode(self, infile, encoding): + """ + Decode infile to unicode. Using the specified encoding. + + if is a string, it also needs converting to a list. + """ + if isinstance(infile, basestring): + # can't be unicode + # NOTE: Could raise a ``UnicodeDecodeError`` + return infile.decode(encoding).splitlines(True) + for i, line in enumerate(infile): + if not isinstance(line, unicode): + # NOTE: The isinstance test here handles mixed lists of unicode/string + # NOTE: But the decode will break on any non-string values + # NOTE: Or could raise a ``UnicodeDecodeError`` + infile[i] = line.decode(encoding) + return infile + + + def _decode_element(self, line): + """Decode element to unicode if necessary.""" + if not self.encoding: + return line + if isinstance(line, str) and self.default_encoding: + return line.decode(self.default_encoding) + return line + + + def _str(self, value): + """ + Used by ``stringify`` within validate, to turn non-string values + into strings. + """ + if not isinstance(value, basestring): + return str(value) + else: + return value + + + def _parse(self, infile): + """Actually parse the config file.""" + temp_list_values = self.list_values + if self.unrepr: + self.list_values = False + + comment_list = [] + done_start = False + this_section = self + maxline = len(infile) - 1 + cur_index = -1 + reset_comment = False + + while cur_index < maxline: + if reset_comment: + comment_list = [] + cur_index += 1 + line = infile[cur_index] + sline = line.strip() + # do we have anything on the line ? + if not sline or sline.startswith('#'): + reset_comment = False + comment_list.append(line) + continue + + if not done_start: + # preserve initial comment + self.initial_comment = comment_list + comment_list = [] + done_start = True + + reset_comment = True + # first we check if it's a section marker + mat = self._sectionmarker.match(line) + if mat is not None: + # is a section line + (indent, sect_open, sect_name, sect_close, comment) = mat.groups() + if indent and (self.indent_type is None): + self.indent_type = indent + cur_depth = sect_open.count('[') + if cur_depth != sect_close.count(']'): + self._handle_error("Cannot compute the section depth at line %s.", + NestingError, infile, cur_index) + continue + + if cur_depth < this_section.depth: + # the new section is dropping back to a previous level + try: + parent = self._match_depth(this_section, + cur_depth).parent + except SyntaxError: + self._handle_error("Cannot compute nesting level at line %s.", + NestingError, infile, cur_index) + continue + elif cur_depth == this_section.depth: + # the new section is a sibling of the current section + parent = this_section.parent + elif cur_depth == this_section.depth + 1: + # the new section is a child the current section + parent = this_section + else: + self._handle_error("Section too nested at line %s.", + NestingError, infile, cur_index) + + sect_name = self._unquote(sect_name) + if sect_name in parent: + self._handle_error('Duplicate section name at line %s.', + DuplicateError, infile, cur_index) + continue + + # create the new section + this_section = Section( + parent, + cur_depth, + self, + name=sect_name) + parent[sect_name] = this_section + parent.inline_comments[sect_name] = comment + parent.comments[sect_name] = comment_list + continue + # + # it's not a section marker, + # so it should be a valid ``key = value`` line + mat = self._keyword.match(line) + if mat is None: + # it neither matched as a keyword + # or a section marker + self._handle_error( + 'Invalid line at line "%s".', + ParseError, infile, cur_index) + else: + # is a keyword value + # value will include any inline comment + (indent, key, value) = mat.groups() + if indent and (self.indent_type is None): + self.indent_type = indent + # check for a multiline value + if value[:3] in ['"""', "'''"]: + try: + value, comment, cur_index = self._multiline( + value, infile, cur_index, maxline) + except SyntaxError: + self._handle_error( + 'Parse error in value at line %s.', + ParseError, infile, cur_index) + continue + else: + if self.unrepr: + comment = '' + try: + value = unrepr(value) + except Exception, e: + if type(e) == UnknownType: + msg = 'Unknown name or type in value at line %s.' + else: + msg = 'Parse error in value at line %s.' + self._handle_error(msg, UnreprError, infile, + cur_index) + continue + else: + if self.unrepr: + comment = '' + try: + value = unrepr(value) + except Exception, e: + if isinstance(e, UnknownType): + msg = 'Unknown name or type in value at line %s.' + else: + msg = 'Parse error in value at line %s.' + self._handle_error(msg, UnreprError, infile, + cur_index) + continue + else: + # extract comment and lists + try: + (value, comment) = self._handle_value(value) + except SyntaxError: + self._handle_error( + 'Parse error in value at line %s.', + ParseError, infile, cur_index) + continue + # + key = self._unquote(key) + if key in this_section: + self._handle_error( + 'Duplicate keyword name at line %s.', + DuplicateError, infile, cur_index) + continue + # add the key. + # we set unrepr because if we have got this far we will never + # be creating a new section + this_section.__setitem__(key, value, unrepr=True) + this_section.inline_comments[key] = comment + this_section.comments[key] = comment_list + continue + # + if self.indent_type is None: + # no indentation used, set the type accordingly + self.indent_type = '' + + # preserve the final comment + if not self and not self.initial_comment: + self.initial_comment = comment_list + elif not reset_comment: + self.final_comment = comment_list + self.list_values = temp_list_values + + + def _match_depth(self, sect, depth): + """ + Given a section and a depth level, walk back through the sections + parents to see if the depth level matches a previous section. + + Return a reference to the right section, + or raise a SyntaxError. + """ + while depth < sect.depth: + if sect is sect.parent: + # we've reached the top level already + raise SyntaxError() + sect = sect.parent + if sect.depth == depth: + return sect + # shouldn't get here + raise SyntaxError() + + + def _handle_error(self, text, ErrorClass, infile, cur_index): + """ + Handle an error according to the error settings. + + Either raise the error or store it. + The error will have occured at ``cur_index`` + """ + line = infile[cur_index] + cur_index += 1 + message = text % cur_index + error = ErrorClass(message, cur_index, line) + if self.raise_errors: + # raise the error - parsing stops here + raise error + # store the error + # reraise when parsing has finished + self._errors.append(error) + + + def _unquote(self, value): + """Return an unquoted version of a value""" + if not value: + # should only happen during parsing of lists + raise SyntaxError + if (value[0] == value[-1]) and (value[0] in ('"', "'")): + value = value[1:-1] + return value + + + def _quote(self, value, multiline=True): + """ + Return a safely quoted version of a value. + + Raise a ConfigObjError if the value cannot be safely quoted. + If multiline is ``True`` (default) then use triple quotes + if necessary. + + * Don't quote values that don't need it. + * Recursively quote members of a list and return a comma joined list. + * Multiline is ``False`` for lists. + * Obey list syntax for empty and single member lists. + + If ``list_values=False`` then the value is only quoted if it contains + a ``\\n`` (is multiline) or '#'. + + If ``write_empty_values`` is set, and the value is an empty string, it + won't be quoted. + """ + if multiline and self.write_empty_values and value == '': + # Only if multiline is set, so that it is used for values not + # keys, and not values that are part of a list + return '' + + if multiline and isinstance(value, (list, tuple)): + if not value: + return ',' + elif len(value) == 1: + return self._quote(value[0], multiline=False) + ',' + return ', '.join([self._quote(val, multiline=False) + for val in value]) + if not isinstance(value, basestring): + if self.stringify: + value = str(value) + else: + raise TypeError('Value "%s" is not a string.' % value) + + if not value: + return '""' + + no_lists_no_quotes = not self.list_values and '\n' not in value and '#' not in value + need_triple = multiline and ((("'" in value) and ('"' in value)) or ('\n' in value )) + hash_triple_quote = multiline and not need_triple and ("'" in value) and ('"' in value) and ('#' in value) + check_for_single = (no_lists_no_quotes or not need_triple) and not hash_triple_quote + + if check_for_single: + if not self.list_values: + # we don't quote if ``list_values=False`` + quot = noquot + # for normal values either single or double quotes will do + elif '\n' in value: + # will only happen if multiline is off - e.g. '\n' in key + raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) + elif ((value[0] not in wspace_plus) and + (value[-1] not in wspace_plus) and + (',' not in value)): + quot = noquot + else: + quot = self._get_single_quote(value) + else: + # if value has '\n' or "'" *and* '"', it will need triple quotes + quot = self._get_triple_quote(value) + + if quot == noquot and '#' in value and self.list_values: + quot = self._get_single_quote(value) + + return quot % value + + + def _get_single_quote(self, value): + if ("'" in value) and ('"' in value): + raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) + elif '"' in value: + quot = squot + else: + quot = dquot + return quot + + + def _get_triple_quote(self, value): + if (value.find('"""') != -1) and (value.find("'''") != -1): + raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) + if value.find('"""') == -1: + quot = tdquot + else: + quot = tsquot + return quot + + + def _handle_value(self, value): + """ + Given a value string, unquote, remove comment, + handle lists. (including empty and single member lists) + """ + if self._inspec: + # Parsing a configspec so don't handle comments + return (value, '') + # do we look for lists in values ? + if not self.list_values: + mat = self._nolistvalue.match(value) + if mat is None: + raise SyntaxError() + # NOTE: we don't unquote here + return mat.groups() + # + mat = self._valueexp.match(value) + if mat is None: + # the value is badly constructed, probably badly quoted, + # or an invalid list + raise SyntaxError() + (list_values, single, empty_list, comment) = mat.groups() + if (list_values == '') and (single is None): + # change this if you want to accept empty values + raise SyntaxError() + # NOTE: note there is no error handling from here if the regex + # is wrong: then incorrect values will slip through + if empty_list is not None: + # the single comma - meaning an empty list + return ([], comment) + if single is not None: + # handle empty values + if list_values and not single: + # FIXME: the '' is a workaround because our regex now matches + # '' at the end of a list if it has a trailing comma + single = None + else: + single = single or '""' + single = self._unquote(single) + if list_values == '': + # not a list value + return (single, comment) + the_list = self._listvalueexp.findall(list_values) + the_list = [self._unquote(val) for val in the_list] + if single is not None: + the_list += [single] + return (the_list, comment) + + + def _multiline(self, value, infile, cur_index, maxline): + """Extract the value, where we are in a multiline situation.""" + quot = value[:3] + newvalue = value[3:] + single_line = self._triple_quote[quot][0] + multi_line = self._triple_quote[quot][1] + mat = single_line.match(value) + if mat is not None: + retval = list(mat.groups()) + retval.append(cur_index) + return retval + elif newvalue.find(quot) != -1: + # somehow the triple quote is missing + raise SyntaxError() + # + while cur_index < maxline: + cur_index += 1 + newvalue += '\n' + line = infile[cur_index] + if line.find(quot) == -1: + newvalue += line + else: + # end of multiline, process it + break + else: + # we've got to the end of the config, oops... + raise SyntaxError() + mat = multi_line.match(line) + if mat is None: + # a badly formed line + raise SyntaxError() + (value, comment) = mat.groups() + return (newvalue + value, comment, cur_index) + + + def _handle_configspec(self, configspec): + """Parse the configspec.""" + # FIXME: Should we check that the configspec was created with the + # correct settings ? (i.e. ``list_values=False``) + if not isinstance(configspec, ConfigObj): + try: + configspec = ConfigObj(configspec, + raise_errors=True, + file_error=True, + _inspec=True) + except ConfigObjError, e: + # FIXME: Should these errors have a reference + # to the already parsed ConfigObj ? + raise ConfigspecError('Parsing configspec failed: %s' % e) + except IOError, e: + raise IOError('Reading configspec failed: %s' % e) + + self.configspec = configspec + + + + def _set_configspec(self, section, copy): + """ + Called by validate. Handles setting the configspec on subsections + including sections to be validated by __many__ + """ + configspec = section.configspec + many = configspec.get('__many__') + if isinstance(many, dict): + for entry in section.sections: + if entry not in configspec: + section[entry].configspec = many + + for entry in configspec.sections: + if entry == '__many__': + continue + if entry not in section: + section[entry] = {} + section[entry]._created = True + if copy: + # copy comments + section.comments[entry] = configspec.comments.get(entry, []) + section.inline_comments[entry] = configspec.inline_comments.get(entry, '') + + # Could be a scalar when we expect a section + if isinstance(section[entry], Section): + section[entry].configspec = configspec[entry] + + + def _write_line(self, indent_string, entry, this_entry, comment): + """Write an individual line, for the write method""" + # NOTE: the calls to self._quote here handles non-StringType values. + if not self.unrepr: + val = self._decode_element(self._quote(this_entry)) + else: + val = repr(this_entry) + return '%s%s%s%s%s' % (indent_string, + self._decode_element(self._quote(entry, multiline=False)), + self._a_to_u(' = '), + val, + self._decode_element(comment)) + + + def _write_marker(self, indent_string, depth, entry, comment): + """Write a section marker line""" + return '%s%s%s%s%s' % (indent_string, + self._a_to_u('[' * depth), + self._quote(self._decode_element(entry), multiline=False), + self._a_to_u(']' * depth), + self._decode_element(comment)) + + + def _handle_comment(self, comment): + """Deal with a comment.""" + if not comment: + return '' + start = self.indent_type + if not comment.startswith('#'): + start += self._a_to_u(' # ') + return (start + comment) + + + # Public methods + + def write(self, outfile=None, section=None): + """ + Write the current ConfigObj as a file + + tekNico: FIXME: use StringIO instead of real files + + >>> filename = a.filename + >>> a.filename = 'test.ini' + >>> a.write() + >>> a.filename = filename + >>> a == ConfigObj('test.ini', raise_errors=True) + 1 + >>> import os + >>> os.remove('test.ini') + """ + if self.indent_type is None: + # this can be true if initialised from a dictionary + self.indent_type = DEFAULT_INDENT_TYPE + + out = [] + cs = self._a_to_u('#') + csp = self._a_to_u('# ') + if section is None: + int_val = self.interpolation + self.interpolation = False + section = self + for line in self.initial_comment: + line = self._decode_element(line) + stripped_line = line.strip() + if stripped_line and not stripped_line.startswith(cs): + line = csp + line + out.append(line) + + indent_string = self.indent_type * section.depth + for entry in (section.scalars + section.sections): + if entry in section.defaults: + # don't write out default values + continue + for comment_line in section.comments[entry]: + comment_line = self._decode_element(comment_line.lstrip()) + if comment_line and not comment_line.startswith(cs): + comment_line = csp + comment_line + out.append(indent_string + comment_line) + this_entry = section[entry] + comment = self._handle_comment(section.inline_comments[entry]) + + if isinstance(this_entry, dict): + # a section + out.append(self._write_marker( + indent_string, + this_entry.depth, + entry, + comment)) + out.extend(self.write(section=this_entry)) + else: + out.append(self._write_line( + indent_string, + entry, + this_entry, + comment)) + + if section is self: + for line in self.final_comment: + line = self._decode_element(line) + stripped_line = line.strip() + if stripped_line and not stripped_line.startswith(cs): + line = csp + line + out.append(line) + self.interpolation = int_val + + if section is not self: + return out + + if (self.filename is None) and (outfile is None): + # output a list of lines + # might need to encode + # NOTE: This will *screw* UTF16, each line will start with the BOM + if self.encoding: + out = [l.encode(self.encoding) for l in out] + if (self.BOM and ((self.encoding is None) or + (BOM_LIST.get(self.encoding.lower()) == 'utf_8'))): + # Add the UTF8 BOM + if not out: + out.append('') + out[0] = BOM_UTF8 + out[0] + return out + + # Turn the list to a string, joined with correct newlines + newline = self.newlines or os.linesep + if (getattr(outfile, 'mode', None) is not None and outfile.mode == 'w' + and sys.platform == 'win32' and newline == '\r\n'): + # Windows specific hack to avoid writing '\r\r\n' + newline = '\n' + output = self._a_to_u(newline).join(out) + if self.encoding: + output = output.encode(self.encoding) + if self.BOM and ((self.encoding is None) or match_utf8(self.encoding)): + # Add the UTF8 BOM + output = BOM_UTF8 + output + + if not output.endswith(newline): + output += newline + if outfile is not None: + outfile.write(output) + else: + h = open(self.filename, 'wb') + h.write(output) + h.close() + + + def validate(self, validator, preserve_errors=False, copy=False, + section=None): + """ + Test the ConfigObj against a configspec. + + It uses the ``validator`` object from *validate.py*. + + To run ``validate`` on the current ConfigObj, call: :: + + test = config.validate(validator) + + (Normally having previously passed in the configspec when the ConfigObj + was created - you can dynamically assign a dictionary of checks to the + ``configspec`` attribute of a section though). + + It returns ``True`` if everything passes, or a dictionary of + pass/fails (True/False). If every member of a subsection passes, it + will just have the value ``True``. (It also returns ``False`` if all + members fail). + + In addition, it converts the values from strings to their native + types if their checks pass (and ``stringify`` is set). + + If ``preserve_errors`` is ``True`` (``False`` is default) then instead + of a marking a fail with a ``False``, it will preserve the actual + exception object. This can contain info about the reason for failure. + For example the ``VdtValueTooSmallError`` indicates that the value + supplied was too small. If a value (or section) is missing it will + still be marked as ``False``. + + You must have the validate module to use ``preserve_errors=True``. + + You can then use the ``flatten_errors`` function to turn your nested + results dictionary into a flattened list of failures - useful for + displaying meaningful error messages. + """ + if section is None: + if self.configspec is None: + raise ValueError('No configspec supplied.') + if preserve_errors: + # We do this once to remove a top level dependency on the validate module + # Which makes importing configobj faster + from validate import VdtMissingValue + self._vdtMissingValue = VdtMissingValue + + section = self + + if copy: + section.initial_comment = section.configspec.initial_comment + section.final_comment = section.configspec.final_comment + section.encoding = section.configspec.encoding + section.BOM = section.configspec.BOM + section.newlines = section.configspec.newlines + section.indent_type = section.configspec.indent_type + + # + # section.default_values.clear() #?? + configspec = section.configspec + self._set_configspec(section, copy) + + + def validate_entry(entry, spec, val, missing, ret_true, ret_false): + section.default_values.pop(entry, None) + + try: + section.default_values[entry] = validator.get_default_value(configspec[entry]) + except (KeyError, AttributeError, validator.baseErrorClass): + # No default, bad default or validator has no 'get_default_value' + # (e.g. SimpleVal) + pass + + try: + check = validator.check(spec, + val, + missing=missing + ) + except validator.baseErrorClass, e: + if not preserve_errors or isinstance(e, self._vdtMissingValue): + out[entry] = False + else: + # preserve the error + out[entry] = e + ret_false = False + ret_true = False + else: + ret_false = False + out[entry] = True + if self.stringify or missing: + # if we are doing type conversion + # or the value is a supplied default + if not self.stringify: + if isinstance(check, (list, tuple)): + # preserve lists + check = [self._str(item) for item in check] + elif missing and check is None: + # convert the None from a default to a '' + check = '' + else: + check = self._str(check) + if (check != val) or missing: + section[entry] = check + if not copy and missing and entry not in section.defaults: + section.defaults.append(entry) + return ret_true, ret_false + + # + out = {} + ret_true = True + ret_false = True + + unvalidated = [k for k in section.scalars if k not in configspec] + incorrect_sections = [k for k in configspec.sections if k in section.scalars] + incorrect_scalars = [k for k in configspec.scalars if k in section.sections] + + for entry in configspec.scalars: + if entry in ('__many__', '___many___'): + # reserved names + continue + if (not entry in section.scalars) or (entry in section.defaults): + # missing entries + # or entries from defaults + missing = True + val = None + if copy and entry not in section.scalars: + # copy comments + section.comments[entry] = ( + configspec.comments.get(entry, [])) + section.inline_comments[entry] = ( + configspec.inline_comments.get(entry, '')) + # + else: + missing = False + val = section[entry] + + ret_true, ret_false = validate_entry(entry, configspec[entry], val, + missing, ret_true, ret_false) + + many = None + if '__many__' in configspec.scalars: + many = configspec['__many__'] + elif '___many___' in configspec.scalars: + many = configspec['___many___'] + + if many is not None: + for entry in unvalidated: + val = section[entry] + ret_true, ret_false = validate_entry(entry, many, val, False, + ret_true, ret_false) + unvalidated = [] + + for entry in incorrect_scalars: + ret_true = False + if not preserve_errors: + out[entry] = False + else: + ret_false = False + msg = 'Value %r was provided as a section' % entry + out[entry] = validator.baseErrorClass(msg) + for entry in incorrect_sections: + ret_true = False + if not preserve_errors: + out[entry] = False + else: + ret_false = False + msg = 'Section %r was provided as a single value' % entry + out[entry] = validator.baseErrorClass(msg) + + # Missing sections will have been created as empty ones when the + # configspec was read. + for entry in section.sections: + # FIXME: this means DEFAULT is not copied in copy mode + if section is self and entry == 'DEFAULT': + continue + if section[entry].configspec is None: + unvalidated.append(entry) + continue + if copy: + section.comments[entry] = configspec.comments.get(entry, []) + section.inline_comments[entry] = configspec.inline_comments.get(entry, '') + check = self.validate(validator, preserve_errors=preserve_errors, copy=copy, section=section[entry]) + out[entry] = check + if check == False: + ret_true = False + elif check == True: + ret_false = False + else: + ret_true = False + + section.extra_values = unvalidated + if preserve_errors and not section._created: + # If the section wasn't created (i.e. it wasn't missing) + # then we can't return False, we need to preserve errors + ret_false = False + # + if ret_false and preserve_errors and out: + # If we are preserving errors, but all + # the failures are from missing sections / values + # then we can return False. Otherwise there is a + # real failure that we need to preserve. + ret_false = not any(out.values()) + if ret_true: + return True + elif ret_false: + return False + return out + + + def reset(self): + """Clear ConfigObj instance and restore to 'freshly created' state.""" + self.clear() + self._initialise() + # FIXME: Should be done by '_initialise', but ConfigObj constructor (and reload) + # requires an empty dictionary + self.configspec = None + # Just to be sure ;-) + self._original_configspec = None + + + def reload(self): + """ + Reload a ConfigObj from file. + + This method raises a ``ReloadError`` if the ConfigObj doesn't have + a filename attribute pointing to a file. + """ + if not isinstance(self.filename, basestring): + raise ReloadError() + + filename = self.filename + current_options = {} + for entry in OPTION_DEFAULTS: + if entry == 'configspec': + continue + current_options[entry] = getattr(self, entry) + + configspec = self._original_configspec + current_options['configspec'] = configspec + + self.clear() + self._initialise(current_options) + self._load(filename, configspec) + + + +class SimpleVal(object): + """ + A simple validator. + Can be used to check that all members expected are present. + + To use it, provide a configspec with all your members in (the value given + will be ignored). Pass an instance of ``SimpleVal`` to the ``validate`` + method of your ``ConfigObj``. ``validate`` will return ``True`` if all + members are present, or a dictionary with True/False meaning + present/missing. (Whole missing sections will be replaced with ``False``) + """ + + def __init__(self): + self.baseErrorClass = ConfigObjError + + def check(self, check, member, missing=False): + """A dummy check method, always returns the value unchanged.""" + if missing: + raise self.baseErrorClass() + return member + + +def flatten_errors(cfg, res, levels=None, results=None): + """ + An example function that will turn a nested dictionary of results + (as returned by ``ConfigObj.validate``) into a flat list. + + ``cfg`` is the ConfigObj instance being checked, ``res`` is the results + dictionary returned by ``validate``. + + (This is a recursive function, so you shouldn't use the ``levels`` or + ``results`` arguments - they are used by the function.) + + Returns a list of keys that failed. Each member of the list is a tuple:: + + ([list of sections...], key, result) + + If ``validate`` was called with ``preserve_errors=False`` (the default) + then ``result`` will always be ``False``. + + *list of sections* is a flattened list of sections that the key was found + in. + + If the section was missing (or a section was expected and a scalar provided + - or vice-versa) then key will be ``None``. + + If the value (or section) was missing then ``result`` will be ``False``. + + If ``validate`` was called with ``preserve_errors=True`` and a value + was present, but failed the check, then ``result`` will be the exception + object returned. You can use this as a string that describes the failure. + + For example *The value "3" is of the wrong type*. + """ + if levels is None: + # first time called + levels = [] + results = [] + if res == True: + return results + if res == False or isinstance(res, Exception): + results.append((levels[:], None, res)) + if levels: + levels.pop() + return results + for (key, val) in res.items(): + if val == True: + continue + if isinstance(cfg.get(key), dict): + # Go down one level + levels.append(key) + flatten_errors(cfg[key], val, levels, results) + continue + results.append((levels[:], key, val)) + # + # Go up one level + if levels: + levels.pop() + # + return results + + +def get_extra_values(conf, _prepend=()): + """ + Find all the values and sections not in the configspec from a validated + ConfigObj. + + ``get_extra_values`` returns a list of tuples where each tuple represents + either an extra section, or an extra value. + + The tuples contain two values, a tuple representing the section the value + is in and the name of the extra values. For extra values in the top level + section the first member will be an empty tuple. For values in the 'foo' + section the first member will be ``('foo',)``. For members in the 'bar' + subsection of the 'foo' section the first member will be ``('foo', 'bar')``. + + NOTE: If you call ``get_extra_values`` on a ConfigObj instance that hasn't + been validated it will return an empty list. + """ + out = [] + + out.extend([(_prepend, name) for name in conf.extra_values]) + for name in conf.sections: + if name not in conf.extra_values: + out.extend(get_extra_values(conf[name], _prepend + (name,))) + return out + + +"""*A programming language is a medium of expression.* - Paul Graham""" diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py new file mode 100644 index 00000000..fd4a4148 --- /dev/null +++ b/python/pkg/cdec/sa/__init__.py @@ -0,0 +1,4 @@ +from cdec.sa._sa import sym_fromstring,\ + SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ + HieroCachingRuleFactory, Sampler +from cdec.sa.extractor import GrammarExtractor diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py new file mode 100644 index 00000000..30e605a6 --- /dev/null +++ b/python/pkg/cdec/sa/compile.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +import argparse +import os +import logging +import cdec.configobj +import cdec.sa + +MAX_PHRASE_LENGTH = 4 +def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2): + lcp = cdec.sa.LCP(f_sa) + stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True) + precomp = cdec.sa.Precomputation(from_stats=stats, + fsarray=f_sa, + precompute_rank=rank1, + precompute_secondary_rank=rank2, + max_length=max_len, + max_nonterminals=max_nt, + train_max_initial_size=max_size, + train_min_gap_size=min_gap) + return precomp + +def main(): + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger('cdec.sa.compile') + parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.') + parser.add_argument('--maxnt', '-n', type=int, default=2, + help='Maximum number of non-terminal symbols') + parser.add_argument('--maxlen', '-l', type=int, default=5, + help='Maximum number of terminals') + parser.add_argument('--maxsize', '-s', type=int, default=15, + help='Maximum rule span') + parser.add_argument('--mingap', '-g', type=int, default=1, + help='Minimum gap size') + parser.add_argument('--rank1', '-r1', type=int, default=100, + help='Number of pre-computed frequent patterns') + parser.add_argument('--rank2', '-r2', type=int, default=10, + help='Number of pre-computed super-frequent patterns)') + parser.add_argument('-c', '--config', default='/dev/stdout', + help='Output configuration') + parser.add_argument('-o', '--output', required=True, + help='Output path') + parser.add_argument('-f', '--source', required=True, + help='Source language corpus') + parser.add_argument('-e', '--target', required=True, + help='Target language corpus') + parser.add_argument('-a', '--alignment', required=True, + help='Bitext word alignment') + args = parser.parse_args() + + param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2") + params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2) + + if not os.path.exists(args.output): + os.mkdir(args.output) + + f_sa_bin = os.path.join(args.output, 'f.sa.bin') + e_bin = os.path.join(args.output, 'e.bin') + precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params) + precomp_bin = os.path.join(args.output, precomp_file) + a_bin = os.path.join(args.output, 'a.bin') + lex_bin = os.path.join(args.output, 'lex.bin') + + logger.info('Compiling source suffix array') + f_sa = cdec.sa.SuffixArray(from_text=args.source) + f_sa.write_binary(f_sa_bin) + + logger.info('Compiling target data array') + e = cdec.sa.DataArray(from_text=args.target) + e.write_binary(e_bin) + + logger.info('Precomputing frequent phrases') + precompute(f_sa, *params).write_binary(precomp_bin) + + logger.info('Compiling alignment') + a = cdec.sa.Alignment(from_text=args.alignment) + a.write_binary(a_bin) + + logger.info('Compiling bilexical dictionary') + lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa) + lex.write_binary(lex_bin) + + # Write configuration + config = cdec.configobj.ConfigObj(args.config, unrepr=True) + config['f_sa_file'] = f_sa_bin + config['e_file'] = e_bin + config['a_file'] = a_bin + config['lex_file'] = lex_bin + config['precompute_file'] = precomp_bin + for name, value in zip(param_names, params): + config[name] = value + config.write() + +if __name__ == '__main__': + main() diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py new file mode 100644 index 00000000..918aa3bb --- /dev/null +++ b/python/pkg/cdec/sa/extract.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import sys +import os +import argparse +import logging +import cdec.sa + +def main(): + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') + parser.add_argument('-c', '--config', required=True, + help='Extractor configuration') + parser.add_argument('-g', '--grammars', required=True, + help='Grammar output path') + args = parser.parse_args() + + if not os.path.exists(args.grammars): + os.mkdir(args.grammars) + + extractor = cdec.sa.GrammarExtractor(args.config) + for i, sentence in enumerate(sys.stdin): + sentence = sentence[:-1] + grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i)) + with open(grammar_file, 'w') as output: + for rule in extractor.grammar(sentence): + output.write(str(rule)+'\n') + grammar_file = os.path.abspath(grammar_file) + print('{1}'.format(grammar_file, sentence)) + +if __name__ == '__main__': + main() diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py new file mode 100644 index 00000000..bb912e16 --- /dev/null +++ b/python/pkg/cdec/sa/extractor.py @@ -0,0 +1,78 @@ +from itertools import chain +import os +import cdec.configobj +from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ + MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE +import cdec.sa + +# maximum span of a grammar rule in TEST DATA +MAX_INITIAL_SIZE = 15 + +class GrammarExtractor: + def __init__(self, config): + if isinstance(config, str) or isinstance(config, unicode): + if not os.path.exists(config): + raise IOError('cannot read configuration from {0}'.format(config)) + config = cdec.configobj.ConfigObj(config, unrepr=True) + alignment = cdec.sa.Alignment(from_binary=config['a_file']) + self.factory = cdec.sa.HieroCachingRuleFactory( + # compiled alignment object (REQUIRED) + alignment, + # name of generic nonterminal used by Hiero + category="[X]", + # maximum number of contiguous chunks of terminal symbols in RHS of a rule + max_chunks=config['max_nt']+1, + # maximum span of a grammar rule in TEST DATA + max_initial_size=MAX_INITIAL_SIZE, + # maximum number of symbols (both T and NT) allowed in a rule + max_length=config['max_len'], + # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_nonterminals=config['max_nt'], + # maximum number of contiguous chunks of terminal symbols + # in target-side RHS of a rule. + max_target_chunks=config['max_nt']+1, + # maximum number of target side symbols (both T and NT) allowed in a rule. + max_target_length=MAX_INITIAL_SIZE, + # minimum span of a nonterminal in the RHS of a rule in TEST DATA + min_gap_size=1, + # filename of file containing precomputed collocations + precompute_file=config['precompute_file'], + # maximum frequency rank of patterns used to compute triples (< 20) + precompute_secondary_rank=config['rank2'], + # maximum frequency rank of patterns used to compute collocations (< 300) + precompute_rank=config['rank1'], + # require extracted rules to have at least one aligned word + require_aligned_terminal=True, + # require each contiguous chunk of extracted rules + # to have at least one aligned word + require_aligned_chunks=False, + # maximum span of a grammar rule extracted from TRAINING DATA + train_max_initial_size=config['max_size'], + # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + train_min_gap_size=config['min_gap'], + # True if phrases should be tight, False otherwise (better but slower) + tight_phrases=True, + ) + + # lexical weighting tables + tt = cdec.sa.BiLex(from_binary=config['lex_file']) + + self.models = (EgivenFCoherent, SampleCountF, CountEF, + MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) + + fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) + edarray = cdec.sa.DataArray(from_binary=config['e_file']) + + # lower=faster, higher=better; improvements level off above 200-300 range, + # -1 = don't sample, use all data (VERY SLOW!) + sampler = cdec.sa.Sampler(300, fsarray) + + self.factory.configure(fsarray, edarray, sampler) + + def grammar(self, sentence): + if isinstance(sentence, unicode): + sentence = sentence.encode('utf8') + cnet = chain(('',), sentence.split(), ('',)) + cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet) + cnet = tuple(((word, None, 1), ) for word in cnet) + return self.factory.input(cnet, self.models) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py new file mode 100644 index 00000000..325b9e13 --- /dev/null +++ b/python/pkg/cdec/sa/features.py @@ -0,0 +1,57 @@ +from __future__ import division +import math + +MAXSCORE = 99 + +def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) + return -math.log10(paircount/fcount) + +def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1 + paircount) + +def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1 + fsample_count) + +def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): + prob = paircount/fsample_count + return -math.log10(prob) if prob > 0 else MAXSCORE + +def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): + return -math.log10(fcount/fsample_count) + +def MaxLexEgivenF(ttable): + def feature(fphrase, ephrase, paircount, fcount, fsample_count): + fwords = fphrase.words + fwords.append('NULL') + def score(): + for e in ephrase.words: + maxScore = max(ttable.get_score(f, e, 0) for f in fwords) + yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return sum(score()) + return feature + +def MaxLexFgivenE(ttable): + def feature(fphrase, ephrase, paircount, fcount, fsample_count): + ewords = ephrase.words + ewords.append('NULL') + def score(): + for f in fphrase.words: + maxScore = max(ttable.get_score(f, e, 1) for e in ewords) + yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return sum(score()) + return feature + +def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount == 1) + +def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount == 1) + +def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount > 1) + +def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 1) + +def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 0.01) diff --git a/python/pkg/cdec/score.py b/python/pkg/cdec/score.py new file mode 100644 index 00000000..22257774 --- /dev/null +++ b/python/pkg/cdec/score.py @@ -0,0 +1 @@ +from _cdec import BLEU, TER, CER, Metric diff --git a/python/setup.py b/python/setup.py index 1d1d7e45..7be976e8 100644 --- a/python/setup.py +++ b/python/setup.py @@ -48,5 +48,6 @@ setup( name='cdec', ext_modules=ext_modules, requires=['configobj'], - packages=['cdec', 'cdec.sa'] + packages=['cdec', 'cdec.sa'], + package_dir={'': 'pkg'} ) diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c index 34f170bf..b7f3627a 100644 --- a/python/src/sa/_sa.c +++ b/python/src/sa/_sa.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.17.beta1 on Fri Jul 27 22:15:31 2012 */ +/* Generated by Cython 0.17.beta1 on Fri Jul 27 23:31:04 2012 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -2013,7 +2013,7 @@ static int __pyx_pf_3_sa_8Alphabet___cinit__(struct __pyx_obj_3_sa_Alphabet *__p static void __pyx_pf_3_sa_8Alphabet_2__dealloc__(CYTHON_UNUSED struct __pyx_obj_3_sa_Alphabet *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_3_sa_8Alphabet_9terminals___get__(struct __pyx_obj_3_sa_Alphabet *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_3_sa_8Alphabet_12nonterminals___get__(struct __pyx_obj_3_sa_Alphabet *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_3_sa_2sym_fromstring(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_terminal); /* proto */ +static PyObject *__pyx_pf_3_sa_2sym_fromstring(CYTHON_UNUSED PyObject *__pyx_self, char *__pyx_v_string, int __pyx_v_terminal); /* proto */ static int __pyx_pf_3_sa_6Phrase___cinit__(struct __pyx_obj_3_sa_Phrase *__pyx_v_self, PyObject *__pyx_v_words); /* proto */ static void __pyx_pf_3_sa_6Phrase_2__dealloc__(struct __pyx_obj_3_sa_Phrase *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_3_sa_6Phrase_4__str__(struct __pyx_obj_3_sa_Phrase *__pyx_v_self); /* proto */ @@ -21750,7 +21750,7 @@ static int __pyx_f_3_sa_sym_setindex(int __pyx_v_sym, int __pyx_v_id) { * cdef int sym_setindex(int sym, int id): * return ALPHABET.setindex(sym, id) # <<<<<<<<<<<<<< * - * def sym_fromstring(bytes string, bint terminal): + * def sym_fromstring(char* string, bint terminal): */ __pyx_r = ((struct __pyx_vtabstruct_3_sa_Alphabet *)__pyx_v_3_sa_ALPHABET->__pyx_vtab)->setindex(__pyx_v_3_sa_ALPHABET, __pyx_v_sym, __pyx_v_id); goto __pyx_L0; @@ -21765,7 +21765,7 @@ static int __pyx_f_3_sa_sym_setindex(int __pyx_v_sym, int __pyx_v_id) { static PyObject *__pyx_pw_3_sa_3sym_fromstring(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_3_sa_3sym_fromstring = {__Pyx_NAMESTR("sym_fromstring"), (PyCFunction)__pyx_pw_3_sa_3sym_fromstring, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}; static PyObject *__pyx_pw_3_sa_3sym_fromstring(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - PyObject *__pyx_v_string = 0; + char *__pyx_v_string; int __pyx_v_terminal; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -21802,7 +21802,7 @@ static PyObject *__pyx_pw_3_sa_3sym_fromstring(PyObject *__pyx_self, PyObject *_ values[0] = PyTuple_GET_ITEM(__pyx_args, 0); values[1] = PyTuple_GET_ITEM(__pyx_args, 1); } - __pyx_v_string = ((PyObject*)values[0]); + __pyx_v_string = PyBytes_AsString(values[0]); if (unlikely((!__pyx_v_string) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 104; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_v_terminal = __Pyx_PyObject_IsTrue(values[1]); if (unlikely((__pyx_v_terminal == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 104; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; @@ -21813,12 +21813,7 @@ static PyObject *__pyx_pw_3_sa_3sym_fromstring(PyObject *__pyx_self, PyObject *_ __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyBytes_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 104; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_3_sa_2sym_fromstring(__pyx_self, __pyx_v_string, __pyx_v_terminal); - goto __pyx_L0; - __pyx_L1_error:; - __pyx_r = NULL; - __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } @@ -21826,15 +21821,14 @@ static PyObject *__pyx_pw_3_sa_3sym_fromstring(PyObject *__pyx_self, PyObject *_ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/sym.pxi":104 * return ALPHABET.setindex(sym, id) * - * def sym_fromstring(bytes string, bint terminal): # <<<<<<<<<<<<<< + * def sym_fromstring(char* string, bint terminal): # <<<<<<<<<<<<<< * return ALPHABET.fromstring(string, terminal) */ -static PyObject *__pyx_pf_3_sa_2sym_fromstring(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_terminal) { +static PyObject *__pyx_pf_3_sa_2sym_fromstring(CYTHON_UNUSED PyObject *__pyx_self, char *__pyx_v_string, int __pyx_v_terminal) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - char *__pyx_t_1; - PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_1 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; @@ -21842,21 +21836,20 @@ static PyObject *__pyx_pf_3_sa_2sym_fromstring(CYTHON_UNUSED PyObject *__pyx_sel /* "/Users/vchahun/Sandbox/cdec/python/src/sa/sym.pxi":105 * - * def sym_fromstring(bytes string, bint terminal): + * def sym_fromstring(char* string, bint terminal): * return ALPHABET.fromstring(string, terminal) # <<<<<<<<<<<<<< */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = PyBytes_AsString(((PyObject *)__pyx_v_string)); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_2 = PyInt_FromLong(((struct __pyx_vtabstruct_3_sa_Alphabet *)__pyx_v_3_sa_ALPHABET->__pyx_vtab)->fromstring(__pyx_v_3_sa_ALPHABET, __pyx_t_1, __pyx_v_terminal)); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_r = __pyx_t_2; - __pyx_t_2 = 0; + __pyx_t_1 = PyInt_FromLong(((struct __pyx_vtabstruct_3_sa_Alphabet *)__pyx_v_3_sa_ALPHABET->__pyx_vtab)->fromstring(__pyx_v_3_sa_ALPHABET, __pyx_v_string, __pyx_v_terminal)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; goto __pyx_L0; __pyx_r = Py_None; __Pyx_INCREF(Py_None); goto __pyx_L0; __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_1); __Pyx_AddTraceback("_sa.sym_fromstring", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -60511,7 +60504,7 @@ static int __Pyx_InitCachedConstants(void) { /* "/Users/vchahun/Sandbox/cdec/python/src/sa/sym.pxi":104 * return ALPHABET.setindex(sym, id) * - * def sym_fromstring(bytes string, bint terminal): # <<<<<<<<<<<<<< + * def sym_fromstring(char* string, bint terminal): # <<<<<<<<<<<<<< * return ALPHABET.fromstring(string, terminal) */ __pyx_k_tuple_137 = PyTuple_New(2); if (unlikely(!__pyx_k_tuple_137)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 104; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -60933,7 +60926,7 @@ PyMODINIT_FUNC PyInit__sa(void) /* "/Users/vchahun/Sandbox/cdec/python/src/sa/sym.pxi":104 * return ALPHABET.setindex(sym, id) * - * def sym_fromstring(bytes string, bint terminal): # <<<<<<<<<<<<<< + * def sym_fromstring(char* string, bint terminal): # <<<<<<<<<<<<<< * return ALPHABET.fromstring(string, terminal) */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3_sa_3sym_fromstring, NULL, __pyx_n_s___sa); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 104; __pyx_clineno = __LINE__; goto __pyx_L1_error;} diff --git a/python/src/sa/sym.pxi b/python/src/sa/sym.pxi index 4b41886f..132925f6 100644 --- a/python/src/sa/sym.pxi +++ b/python/src/sa/sym.pxi @@ -101,5 +101,5 @@ cdef int sym_getindex(int sym): cdef int sym_setindex(int sym, int id): return ALPHABET.setindex(sym, id) -def sym_fromstring(bytes string, bint terminal): +def sym_fromstring(char* string, bint terminal): return ALPHABET.fromstring(string, terminal) -- cgit v1.2.3