diff options
| author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-08-01 17:32:37 +0200 | 
|---|---|---|
| committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-08-01 17:32:37 +0200 | 
| commit | 3f8e33cfe481a09c121a410e66a6074b5d05683e (patch) | |
| tree | a41ecaf0bbb69fa91a581623abe89d41219c04f8 /python/pkg/cdec | |
| parent | c139ce495861bb341e1b86a85ad4559f9ad53c14 (diff) | |
| parent | 9fe0219562e5db25171cce8776381600ff9a5649 (diff) | |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'python/pkg/cdec')
| -rw-r--r-- | python/pkg/cdec/__init__.py | 1 | ||||
| -rw-r--r-- | python/pkg/cdec/configobj.py | 2468 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/__init__.py | 4 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/compile.py | 106 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extract.py | 31 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extractor.py | 78 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/features.py | 57 | ||||
| -rw-r--r-- | python/pkg/cdec/score.py | 1 | 
8 files changed, 2746 insertions, 0 deletions
| diff --git a/python/pkg/cdec/__init__.py b/python/pkg/cdec/__init__.py new file mode 100644 index 00000000..531fea49 --- /dev/null +++ b/python/pkg/cdec/__init__.py @@ -0,0 +1 @@ +from cdec._cdec import Decoder, Lattice, TRule, MRule, NT, NTRef, ParseFailed, InvalidConfig diff --git a/python/pkg/cdec/configobj.py b/python/pkg/cdec/configobj.py new file mode 100644 index 00000000..c1f6e6df --- /dev/null +++ b/python/pkg/cdec/configobj.py @@ -0,0 +1,2468 @@ +# configobj.py +# A config file reader/writer that supports nested sections in config files. +# Copyright (C) 2005-2010 Michael Foord, Nicola Larosa +# E-mail: fuzzyman AT voidspace DOT org DOT uk +#         nico AT tekNico DOT net + +# ConfigObj 4 +# http://www.voidspace.org.uk/python/configobj.html + +# Released subject to the BSD License +# Please see http://www.voidspace.org.uk/python/license.shtml + +# Scripts maintained at http://www.voidspace.org.uk/python/index.shtml +# For information about bugfixes, updates and support, please join the +# ConfigObj mailing list: +# http://lists.sourceforge.net/lists/listinfo/configobj-develop +# Comments, suggestions and bug reports welcome. + +from __future__ import generators + +import os +import re +import sys + +from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF16_BE, BOM_UTF16_LE + + +# imported lazily to avoid startup performance hit if it isn't used +compiler = None + +# A dictionary mapping BOM to +# the encoding to decode with, and what to set the +# encoding attribute to. +BOMS = { +    BOM_UTF8: ('utf_8', None), +    BOM_UTF16_BE: ('utf16_be', 'utf_16'), +    BOM_UTF16_LE: ('utf16_le', 'utf_16'), +    BOM_UTF16: ('utf_16', 'utf_16'), +    } +# All legal variants of the BOM codecs. +# TODO: the list of aliases is not meant to be exhaustive, is there a +#   better way ? 
+BOM_LIST = { +    'utf_16': 'utf_16', +    'u16': 'utf_16', +    'utf16': 'utf_16', +    'utf-16': 'utf_16', +    'utf16_be': 'utf16_be', +    'utf_16_be': 'utf16_be', +    'utf-16be': 'utf16_be', +    'utf16_le': 'utf16_le', +    'utf_16_le': 'utf16_le', +    'utf-16le': 'utf16_le', +    'utf_8': 'utf_8', +    'u8': 'utf_8', +    'utf': 'utf_8', +    'utf8': 'utf_8', +    'utf-8': 'utf_8', +    } + +# Map of encodings to the BOM to write. +BOM_SET = { +    'utf_8': BOM_UTF8, +    'utf_16': BOM_UTF16, +    'utf16_be': BOM_UTF16_BE, +    'utf16_le': BOM_UTF16_LE, +    None: BOM_UTF8 +    } + + +def match_utf8(encoding): +    return BOM_LIST.get(encoding.lower()) == 'utf_8' + + +# Quote strings used for writing values +squot = "'%s'" +dquot = '"%s"' +noquot = "%s" +wspace_plus = ' \r\n\v\t\'"' +tsquot = '"""%s"""' +tdquot = "'''%s'''" + +# Sentinel for use in getattr calls to replace hasattr +MISSING = object() + +__version__ = '4.7.2' + +try: +    any +except NameError: +    def any(iterable): +        for entry in iterable: +            if entry: +                return True +        return False + + +__all__ = ( +    '__version__', +    'DEFAULT_INDENT_TYPE', +    'DEFAULT_INTERPOLATION', +    'ConfigObjError', +    'NestingError', +    'ParseError', +    'DuplicateError', +    'ConfigspecError', +    'ConfigObj', +    'SimpleVal', +    'InterpolationError', +    'InterpolationLoopError', +    'MissingInterpolationOption', +    'RepeatSectionError', +    'ReloadError', +    'UnreprError', +    'UnknownType', +    'flatten_errors', +    'get_extra_values' +) + +DEFAULT_INTERPOLATION = 'configparser' +DEFAULT_INDENT_TYPE = '    ' +MAX_INTERPOL_DEPTH = 10 + +OPTION_DEFAULTS = { +    'interpolation': True, +    'raise_errors': False, +    'list_values': True, +    'create_empty': False, +    'file_error': False, +    'configspec': None, +    'stringify': True, +    # option may be set to one of ('', ' ', '\t') +    'indent_type': None, +    'encoding': None, +    
'default_encoding': None, +    'unrepr': False, +    'write_empty_values': False, +} + + + +def getObj(s): +    global compiler +    if compiler is None: +        import compiler +    s = "a=" + s +    p = compiler.parse(s) +    return p.getChildren()[1].getChildren()[0].getChildren()[1] + + +class UnknownType(Exception): +    pass + + +class Builder(object): +     +    def build(self, o): +        m = getattr(self, 'build_' + o.__class__.__name__, None) +        if m is None: +            raise UnknownType(o.__class__.__name__) +        return m(o) +     +    def build_List(self, o): +        return map(self.build, o.getChildren()) +     +    def build_Const(self, o): +        return o.value +     +    def build_Dict(self, o): +        d = {} +        i = iter(map(self.build, o.getChildren())) +        for el in i: +            d[el] = i.next() +        return d +     +    def build_Tuple(self, o): +        return tuple(self.build_List(o)) +     +    def build_Name(self, o): +        if o.name == 'None': +            return None +        if o.name == 'True': +            return True +        if o.name == 'False': +            return False +         +        # An undefined Name +        raise UnknownType('Undefined Name') +     +    def build_Add(self, o): +        real, imag = map(self.build_Const, o.getChildren()) +        try: +            real = float(real) +        except TypeError: +            raise UnknownType('Add') +        if not isinstance(imag, complex) or imag.real != 0.0: +            raise UnknownType('Add') +        return real+imag +     +    def build_Getattr(self, o): +        parent = self.build(o.expr) +        return getattr(parent, o.attrname) +     +    def build_UnarySub(self, o): +        return -self.build_Const(o.getChildren()[0]) +     +    def build_UnaryAdd(self, o): +        return self.build_Const(o.getChildren()[0]) + + +_builder = Builder() + + +def unrepr(s): +    if not s: +        return s +    return _builder.build(getObj(s)) 
+ + + +class ConfigObjError(SyntaxError): +    """ +    This is the base class for all errors that ConfigObj raises. +    It is a subclass of SyntaxError. +    """ +    def __init__(self, message='', line_number=None, line=''): +        self.line = line +        self.line_number = line_number +        SyntaxError.__init__(self, message) + + +class NestingError(ConfigObjError): +    """ +    This error indicates a level of nesting that doesn't match. +    """ + + +class ParseError(ConfigObjError): +    """ +    This error indicates that a line is badly written. +    It is neither a valid ``key = value`` line, +    nor a valid section marker line. +    """ + + +class ReloadError(IOError): +    """ +    A 'reload' operation failed. +    This exception is a subclass of ``IOError``. +    """ +    def __init__(self): +        IOError.__init__(self, 'reload failed, filename is not set.') + + +class DuplicateError(ConfigObjError): +    """ +    The keyword or section specified already exists. +    """ + + +class ConfigspecError(ConfigObjError): +    """ +    An error occured whilst parsing a configspec. +    """ + + +class InterpolationError(ConfigObjError): +    """Base class for the two interpolation errors.""" + + +class InterpolationLoopError(InterpolationError): +    """Maximum interpolation depth exceeded in string interpolation.""" + +    def __init__(self, option): +        InterpolationError.__init__( +            self, +            'interpolation loop detected in value "%s".' % option) + + +class RepeatSectionError(ConfigObjError): +    """ +    This error indicates additional sections in a section with a +    ``__many__`` (repeated) section. +    """ + + +class MissingInterpolationOption(InterpolationError): +    """A value specified for interpolation was missing.""" +    def __init__(self, option): +        msg = 'missing option "%s" in interpolation.' 
% option +        InterpolationError.__init__(self, msg) + + +class UnreprError(ConfigObjError): +    """An error parsing in unrepr mode.""" + + + +class InterpolationEngine(object): +    """ +    A helper class to help perform string interpolation. + +    This class is an abstract base class; its descendants perform +    the actual work. +    """ + +    # compiled regexp to use in self.interpolate() +    _KEYCRE = re.compile(r"%\(([^)]*)\)s") +    _cookie = '%' + +    def __init__(self, section): +        # the Section instance that "owns" this engine +        self.section = section + + +    def interpolate(self, key, value): +        # short-cut +        if not self._cookie in value: +            return value +         +        def recursive_interpolate(key, value, section, backtrail): +            """The function that does the actual work. + +            ``value``: the string we're trying to interpolate. +            ``section``: the section in which that string was found +            ``backtrail``: a dict to keep track of where we've been, +            to detect and prevent infinite recursion loops + +            This is similar to a depth-first-search algorithm. +            """ +            # Have we been here already? 
+            if (key, section.name) in backtrail: +                # Yes - infinite loop detected +                raise InterpolationLoopError(key) +            # Place a marker on our backtrail so we won't come back here again +            backtrail[(key, section.name)] = 1 + +            # Now start the actual work +            match = self._KEYCRE.search(value) +            while match: +                # The actual parsing of the match is implementation-dependent, +                # so delegate to our helper function +                k, v, s = self._parse_match(match) +                if k is None: +                    # That's the signal that no further interpolation is needed +                    replacement = v +                else: +                    # Further interpolation may be needed to obtain final value +                    replacement = recursive_interpolate(k, v, s, backtrail) +                # Replace the matched string with its final value +                start, end = match.span() +                value = ''.join((value[:start], replacement, value[end:])) +                new_search_start = start + len(replacement) +                # Pick up the next interpolation key, if any, for next time +                # through the while loop +                match = self._KEYCRE.search(value, new_search_start) + +            # Now safe to come back here again; remove marker from backtrail +            del backtrail[(key, section.name)] + +            return value + +        # Back in interpolate(), all we have to do is kick off the recursive +        # function with appropriate starting values +        value = recursive_interpolate(key, value, self.section, {}) +        return value + + +    def _fetch(self, key): +        """Helper function to fetch values from owning section. + +        Returns a 2-tuple: the value, and the section where it was found. +        """ +        # switch off interpolation before we try and fetch anything ! 
+        save_interp = self.section.main.interpolation +        self.section.main.interpolation = False + +        # Start at section that "owns" this InterpolationEngine +        current_section = self.section +        while True: +            # try the current section first +            val = current_section.get(key) +            if val is not None and not isinstance(val, Section): +                break +            # try "DEFAULT" next +            val = current_section.get('DEFAULT', {}).get(key) +            if val is not None and not isinstance(val, Section): +                break +            # move up to parent and try again +            # top-level's parent is itself +            if current_section.parent is current_section: +                # reached top level, time to give up +                break +            current_section = current_section.parent + +        # restore interpolation to previous value before returning +        self.section.main.interpolation = save_interp +        if val is None: +            raise MissingInterpolationOption(key) +        return val, current_section + + +    def _parse_match(self, match): +        """Implementation-dependent helper function. + +        Will be passed a match object corresponding to the interpolation +        key we just found (e.g., "%(foo)s" or "$foo"). Should look up that +        key in the appropriate config file section (using the ``_fetch()`` +        helper function) and return a 3-tuple: (key, value, section) + +        ``key`` is the name of the key we're looking for +        ``value`` is the value found for that key +        ``section`` is a reference to the section where it was found + +        ``key`` and ``section`` should be None if no further +        interpolation should be performed on the resulting value +        (e.g., if we interpolated "$$" and returned "$"). 
+        """ +        raise NotImplementedError() +     + + +class ConfigParserInterpolation(InterpolationEngine): +    """Behaves like ConfigParser.""" +    _cookie = '%' +    _KEYCRE = re.compile(r"%\(([^)]*)\)s") + +    def _parse_match(self, match): +        key = match.group(1) +        value, section = self._fetch(key) +        return key, value, section + + + +class TemplateInterpolation(InterpolationEngine): +    """Behaves like string.Template.""" +    _cookie = '$' +    _delimiter = '$' +    _KEYCRE = re.compile(r""" +        \$(?: +          (?P<escaped>\$)              |   # Two $ signs +          (?P<named>[_a-z][_a-z0-9]*)  |   # $name format +          {(?P<braced>[^}]*)}              # ${name} format +        ) +        """, re.IGNORECASE | re.VERBOSE) + +    def _parse_match(self, match): +        # Valid name (in or out of braces): fetch value from section +        key = match.group('named') or match.group('braced') +        if key is not None: +            value, section = self._fetch(key) +            return key, value, section +        # Escaped delimiter (e.g., $$): return single delimiter +        if match.group('escaped') is not None: +            # Return None for key and section to indicate it's time to stop +            return None, self._delimiter, None +        # Anything else: ignore completely, just return it unchanged +        return None, match.group(), None + + +interpolation_engines = { +    'configparser': ConfigParserInterpolation, +    'template': TemplateInterpolation, +} + + +def __newobj__(cls, *args): +    # Hack for pickle +    return cls.__new__(cls, *args)  + +class Section(dict): +    """ +    A dictionary-like object that represents a section in a config file. +     +    It does string interpolation if the 'interpolation' attribute +    of the 'main' object is set to True. 
+     +    Interpolation is tried first from this object, then from the 'DEFAULT' +    section of this object, next from the parent and its 'DEFAULT' section, +    and so on until the main object is reached. +     +    A Section will behave like an ordered dictionary - following the +    order of the ``scalars`` and ``sections`` attributes. +    You can use this to change the order of members. +     +    Iteration follows the order: scalars, then sections. +    """ + +     +    def __setstate__(self, state): +        dict.update(self, state[0]) +        self.__dict__.update(state[1]) + +    def __reduce__(self): +        state = (dict(self), self.__dict__) +        return (__newobj__, (self.__class__,), state) +     +     +    def __init__(self, parent, depth, main, indict=None, name=None): +        """ +        * parent is the section above +        * depth is the depth level of this section +        * main is the main ConfigObj +        * indict is a dictionary to initialise the section with +        """ +        if indict is None: +            indict = {} +        dict.__init__(self) +        # used for nesting level *and* interpolation +        self.parent = parent +        # used for the interpolation attribute +        self.main = main +        # level of nesting depth of this Section +        self.depth = depth +        # purely for information +        self.name = name +        # +        self._initialise() +        # we do this explicitly so that __setitem__ is used properly +        # (rather than just passing to ``dict.__init__``) +        for entry, value in indict.iteritems(): +            self[entry] = value +             +             +    def _initialise(self): +        # the sequence of scalar values in this Section +        self.scalars = [] +        # the sequence of sections in this Section +        self.sections = [] +        # for comments :-) +        self.comments = {} +        self.inline_comments = {} +        # the configspec +        
self.configspec = None +        # for defaults +        self.defaults = [] +        self.default_values = {} +        self.extra_values = [] +        self._created = False + + +    def _interpolate(self, key, value): +        try: +            # do we already have an interpolation engine? +            engine = self._interpolation_engine +        except AttributeError: +            # not yet: first time running _interpolate(), so pick the engine +            name = self.main.interpolation +            if name == True:  # note that "if name:" would be incorrect here +                # backwards-compatibility: interpolation=True means use default +                name = DEFAULT_INTERPOLATION +            name = name.lower()  # so that "Template", "template", etc. all work +            class_ = interpolation_engines.get(name, None) +            if class_ is None: +                # invalid value for self.main.interpolation +                self.main.interpolation = False +                return value +            else: +                # save reference to engine so we don't have to do this again +                engine = self._interpolation_engine = class_(self) +        # let the engine do the actual work +        return engine.interpolate(key, value) + + +    def __getitem__(self, key): +        """Fetch the item and do string interpolation.""" +        val = dict.__getitem__(self, key) +        if self.main.interpolation:  +            if isinstance(val, basestring): +                return self._interpolate(key, val) +            if isinstance(val, list): +                def _check(entry): +                    if isinstance(entry, basestring): +                        return self._interpolate(key, entry) +                    return entry +                new = [_check(entry) for entry in val] +                if new != val: +                    return new +        return val + + +    def __setitem__(self, key, value, unrepr=False): +        """ +        Correctly 
set a value. +         +        Making dictionary values Section instances. +        (We have to special case 'Section' instances - which are also dicts) +         +        Keys must be strings. +        Values need only be strings (or lists of strings) if +        ``main.stringify`` is set. +         +        ``unrepr`` must be set when setting a value to a dictionary, without +        creating a new sub-section. +        """ +        if not isinstance(key, basestring): +            raise ValueError('The key "%s" is not a string.' % key) +         +        # add the comment +        if key not in self.comments: +            self.comments[key] = [] +            self.inline_comments[key] = '' +        # remove the entry from defaults +        if key in self.defaults: +            self.defaults.remove(key) +        # +        if isinstance(value, Section): +            if key not in self: +                self.sections.append(key) +            dict.__setitem__(self, key, value) +        elif isinstance(value, dict) and not unrepr: +            # First create the new depth level, +            # then create the section +            if key not in self: +                self.sections.append(key) +            new_depth = self.depth + 1 +            dict.__setitem__( +                self, +                key, +                Section( +                    self, +                    new_depth, +                    self.main, +                    indict=value, +                    name=key)) +        else: +            if key not in self: +                self.scalars.append(key) +            if not self.main.stringify: +                if isinstance(value, basestring): +                    pass +                elif isinstance(value, (list, tuple)): +                    for entry in value: +                        if not isinstance(entry, basestring): +                            raise TypeError('Value is not a string "%s".' 
% entry) +                else: +                    raise TypeError('Value is not a string "%s".' % value) +            dict.__setitem__(self, key, value) + + +    def __delitem__(self, key): +        """Remove items from the sequence when deleting.""" +        dict. __delitem__(self, key) +        if key in self.scalars: +            self.scalars.remove(key) +        else: +            self.sections.remove(key) +        del self.comments[key] +        del self.inline_comments[key] + + +    def get(self, key, default=None): +        """A version of ``get`` that doesn't bypass string interpolation.""" +        try: +            return self[key] +        except KeyError: +            return default + + +    def update(self, indict): +        """ +        A version of update that uses our ``__setitem__``. +        """ +        for entry in indict: +            self[entry] = indict[entry] + + +    def pop(self, key, default=MISSING): +        """ +        'D.pop(k[,d]) -> v, remove specified key and return the corresponding value. +        If key is not found, d is returned if given, otherwise KeyError is raised' +        """ +        try: +            val = self[key] +        except KeyError: +            if default is MISSING: +                raise +            val = default +        else: +            del self[key] +        return val + + +    def popitem(self): +        """Pops the first (key,val)""" +        sequence = (self.scalars + self.sections) +        if not sequence: +            raise KeyError(": 'popitem(): dictionary is empty'") +        key = sequence[0] +        val =  self[key] +        del self[key] +        return key, val + + +    def clear(self): +        """ +        A version of clear that also affects scalars/sections +        Also clears comments and configspec. 
+         +        Leaves other attributes alone : +            depth/main/parent are not affected +        """ +        dict.clear(self) +        self.scalars = [] +        self.sections = [] +        self.comments = {} +        self.inline_comments = {} +        self.configspec = None +        self.defaults = [] +        self.extra_values = [] + + +    def setdefault(self, key, default=None): +        """A version of setdefault that sets sequence if appropriate.""" +        try: +            return self[key] +        except KeyError: +            self[key] = default +            return self[key] + + +    def items(self): +        """D.items() -> list of D's (key, value) pairs, as 2-tuples""" +        return zip((self.scalars + self.sections), self.values()) + + +    def keys(self): +        """D.keys() -> list of D's keys""" +        return (self.scalars + self.sections) + + +    def values(self): +        """D.values() -> list of D's values""" +        return [self[key] for key in (self.scalars + self.sections)] + + +    def iteritems(self): +        """D.iteritems() -> an iterator over the (key, value) items of D""" +        return iter(self.items()) + + +    def iterkeys(self): +        """D.iterkeys() -> an iterator over the keys of D""" +        return iter((self.scalars + self.sections)) + +    __iter__ = iterkeys + + +    def itervalues(self): +        """D.itervalues() -> an iterator over the values of D""" +        return iter(self.values()) + + +    def __repr__(self): +        """x.__repr__() <==> repr(x)""" +        def _getval(key): +            try: +                return self[key] +            except MissingInterpolationOption: +                return dict.__getitem__(self, key) +        return '{%s}' % ', '.join([('%s: %s' % (repr(key), repr(_getval(key)))) +            for key in (self.scalars + self.sections)]) + +    __str__ = __repr__ +    __str__.__doc__ = "x.__str__() <==> str(x)" + + +    # Extra methods - not in a normal dictionary + +    
def dict(self): +        """ +        Return a deepcopy of self as a dictionary. +         +        All members that are ``Section`` instances are recursively turned to +        ordinary dictionaries - by calling their ``dict`` method. +         +        >>> n = a.dict() +        >>> n == a +        1 +        >>> n is a +        0 +        """ +        newdict = {} +        for entry in self: +            this_entry = self[entry] +            if isinstance(this_entry, Section): +                this_entry = this_entry.dict() +            elif isinstance(this_entry, list): +                # create a copy rather than a reference +                this_entry = list(this_entry) +            elif isinstance(this_entry, tuple): +                # create a copy rather than a reference +                this_entry = tuple(this_entry) +            newdict[entry] = this_entry +        return newdict + + +    def merge(self, indict): +        """ +        A recursive update - useful for merging config files. +         +        >>> a = '''[section1] +        ...     option1 = True +        ...     [[subsection]] +        ...     more_options = False +        ...     # end of file'''.splitlines() +        >>> b = '''# File is user.ini +        ...     [section1] +        ...     option1 = False +        ...     # end of file'''.splitlines() +        >>> c1 = ConfigObj(b) +        >>> c2 = ConfigObj(a) +        >>> c2.merge(c1) +        >>> c2 +        ConfigObj({'section1': {'option1': 'False', 'subsection': {'more_options': 'False'}}}) +        """ +        for key, val in indict.items(): +            if (key in self and isinstance(self[key], dict) and +                                isinstance(val, dict)): +                self[key].merge(val) +            else:    +                self[key] = val + + +    def rename(self, oldkey, newkey): +        """ +        Change a keyname to another, without changing position in sequence. 
+         +        Implemented so that transformations can be made on keys, +        as well as on values. (used by encode and decode) +         +        Also renames comments. +        """ +        if oldkey in self.scalars: +            the_list = self.scalars +        elif oldkey in self.sections: +            the_list = self.sections +        else: +            raise KeyError('Key "%s" not found.' % oldkey) +        pos = the_list.index(oldkey) +        # +        val = self[oldkey] +        dict.__delitem__(self, oldkey) +        dict.__setitem__(self, newkey, val) +        the_list.remove(oldkey) +        the_list.insert(pos, newkey) +        comm = self.comments[oldkey] +        inline_comment = self.inline_comments[oldkey] +        del self.comments[oldkey] +        del self.inline_comments[oldkey] +        self.comments[newkey] = comm +        self.inline_comments[newkey] = inline_comment + + +    def walk(self, function, raise_errors=True, +            call_on_sections=False, **keywargs): +        """ +        Walk every member and call a function on the keyword and value. +         +        Return a dictionary of the return values +         +        If the function raises an exception, raise the errror +        unless ``raise_errors=False``, in which case set the return value to +        ``False``. +         +        Any unrecognised keyword arguments you pass to walk, will be pased on +        to the function you pass in. +         +        Note: if ``call_on_sections`` is ``True`` then - on encountering a +        subsection, *first* the function is called for the *whole* subsection, +        and then recurses into it's members. This means your function must be +        able to handle strings, dictionaries and lists. This allows you +        to change the key of subsections as well as for ordinary members. The +        return value when called on the whole subsection has to be discarded. 
+         +        See  the encode and decode methods for examples, including functions. +         +        .. admonition:: caution +         +            You can use ``walk`` to transform the names of members of a section +            but you mustn't add or delete members. +         +        >>> config = '''[XXXXsection] +        ... XXXXkey = XXXXvalue'''.splitlines() +        >>> cfg = ConfigObj(config) +        >>> cfg +        ConfigObj({'XXXXsection': {'XXXXkey': 'XXXXvalue'}}) +        >>> def transform(section, key): +        ...     val = section[key] +        ...     newkey = key.replace('XXXX', 'CLIENT1') +        ...     section.rename(key, newkey) +        ...     if isinstance(val, (tuple, list, dict)): +        ...         pass +        ...     else: +        ...         val = val.replace('XXXX', 'CLIENT1') +        ...         section[newkey] = val +        >>> cfg.walk(transform, call_on_sections=True) +        {'CLIENT1section': {'CLIENT1key': None}} +        >>> cfg +        ConfigObj({'CLIENT1section': {'CLIENT1key': 'CLIENT1value'}}) +        """ +        out = {} +        # scalars first +        for i in range(len(self.scalars)): +            entry = self.scalars[i] +            try: +                val = function(self, entry, **keywargs) +                # bound again in case name has changed +                entry = self.scalars[i] +                out[entry] = val +            except Exception: +                if raise_errors: +                    raise +                else: +                    entry = self.scalars[i] +                    out[entry] = False +        # then sections +        for i in range(len(self.sections)): +            entry = self.sections[i] +            if call_on_sections: +                try: +                    function(self, entry, **keywargs) +                except Exception: +                    if raise_errors: +                        raise +                    else: +                        entry = 
self.sections[i] +                        out[entry] = False +                # bound again in case name has changed +                entry = self.sections[i] +            # previous result is discarded +            out[entry] = self[entry].walk( +                function, +                raise_errors=raise_errors, +                call_on_sections=call_on_sections, +                **keywargs) +        return out + + +    def as_bool(self, key): +        """ +        Accepts a key as input. The corresponding value must be a string or +        the objects (``True`` or 1) or (``False`` or 0). We allow 0 and 1 to +        retain compatibility with Python 2.2. +         +        If the string is one of  ``True``, ``On``, ``Yes``, or ``1`` it returns  +        ``True``. +         +        If the string is one of  ``False``, ``Off``, ``No``, or ``0`` it returns  +        ``False``. +         +        ``as_bool`` is not case sensitive. +         +        Any other input will raise a ``ValueError``. +         +        >>> a = ConfigObj() +        >>> a['a'] = 'fish' +        >>> a.as_bool('a') +        Traceback (most recent call last): +        ValueError: Value "fish" is neither True nor False +        >>> a['b'] = 'True' +        >>> a.as_bool('b') +        1 +        >>> a['b'] = 'off' +        >>> a.as_bool('b') +        0 +        """ +        val = self[key] +        if val == True: +            return True +        elif val == False: +            return False +        else: +            try: +                if not isinstance(val, basestring): +                    # TODO: Why do we raise a KeyError here? +                    raise KeyError() +                else: +                    return self.main._bools[val.lower()] +            except KeyError: +                raise ValueError('Value "%s" is neither True nor False' % val) + + +    def as_int(self, key): +        """ +        A convenience method which coerces the specified value to an integer. 
+         +        If the value is an invalid literal for ``int``, a ``ValueError`` will +        be raised. +         +        >>> a = ConfigObj() +        >>> a['a'] = 'fish' +        >>> a.as_int('a') +        Traceback (most recent call last): +        ValueError: invalid literal for int() with base 10: 'fish' +        >>> a['b'] = '1' +        >>> a.as_int('b') +        1 +        >>> a['b'] = '3.2' +        >>> a.as_int('b') +        Traceback (most recent call last): +        ValueError: invalid literal for int() with base 10: '3.2' +        """ +        return int(self[key]) + + +    def as_float(self, key): +        """ +        A convenience method which coerces the specified value to a float. +         +        If the value is an invalid literal for ``float``, a ``ValueError`` will +        be raised. +         +        >>> a = ConfigObj() +        >>> a['a'] = 'fish' +        >>> a.as_float('a') +        Traceback (most recent call last): +        ValueError: invalid literal for float(): fish +        >>> a['b'] = '1' +        >>> a.as_float('b') +        1.0 +        >>> a['b'] = '3.2' +        >>> a.as_float('b') +        3.2000000000000002 +        """ +        return float(self[key]) +     +     +    def as_list(self, key): +        """ +        A convenience method which fetches the specified value, guaranteeing +        that it is a list. +         +        >>> a = ConfigObj() +        >>> a['a'] = 1 +        >>> a.as_list('a') +        [1] +        >>> a['a'] = (1,) +        >>> a.as_list('a') +        [1] +        >>> a['a'] = [1] +        >>> a.as_list('a') +        [1] +        """ +        result = self[key] +        if isinstance(result, (tuple, list)): +            return list(result) +        return [result] +         + +    def restore_default(self, key): +        """ +        Restore (and return) default value for the specified key. 
+         +        This method will only work for a ConfigObj that was created +        with a configspec and has been validated. +         +        If there is no default value for this key, ``KeyError`` is raised. +        """ +        default = self.default_values[key] +        dict.__setitem__(self, key, default) +        if key not in self.defaults: +            self.defaults.append(key) +        return default + +     +    def restore_defaults(self): +        """ +        Recursively restore default values to all members +        that have them. +         +        This method will only work for a ConfigObj that was created +        with a configspec and has been validated. +         +        It doesn't delete or modify entries without default values. +        """ +        for key in self.default_values: +            self.restore_default(key) +             +        for section in self.sections: +            self[section].restore_defaults() + + +class ConfigObj(Section): +    """An object to read, create, and write config files.""" + +    _keyword = re.compile(r'''^ # line start +        (\s*)                   # indentation +        (                       # keyword +            (?:".*?")|          # double quotes +            (?:'.*?')|          # single quotes +            (?:[^'"=].*?)       # no quotes +        ) +        \s*=\s*                 # divider +        (.*)                    # value (including list values and comments) +        $   # line end +        ''', +        re.VERBOSE) + +    _sectionmarker = re.compile(r'''^ +        (\s*)                     # 1: indentation +        ((?:\[\s*)+)              # 2: section marker open +        (                         # 3: section name open +            (?:"\s*\S.*?\s*")|    # at least one non-space with double quotes +            (?:'\s*\S.*?\s*')|    # at least one non-space with single quotes +            (?:[^'"\s].*?)        
# at least one non-space unquoted +        )                         # section name close +        ((?:\s*\])+)              # 4: section marker close +        \s*(\#.*)?                # 5: optional comment +        $''', +        re.VERBOSE) + +    # this regexp pulls list values out as a single string +    # or single values and comments +    # FIXME: this regex adds a '' to the end of comma terminated lists +    #   workaround in ``_handle_value`` +    _valueexp = re.compile(r'''^ +        (?: +            (?: +                ( +                    (?: +                        (?: +                            (?:".*?")|              # double quotes +                            (?:'.*?')|              # single quotes +                            (?:[^'",\#][^,\#]*?)    # unquoted +                        ) +                        \s*,\s*                     # comma +                    )*      # match all list items ending in a comma (if any) +                ) +                ( +                    (?:".*?")|                      # double quotes +                    (?:'.*?')|                      # single quotes +                    (?:[^'",\#\s][^,]*?)|           # unquoted +                    (?:(?<!,))                      # Empty value +                )?          # last item in a list - or string value +            )| +            (,)             # alternatively a single comma - empty list +        ) +        \s*(\#.*)?          # optional comment +        $''', +        re.VERBOSE) + +    # use findall to get the members of a list value +    _listvalueexp = re.compile(r''' +        ( +            (?:".*?")|          # double quotes +            (?:'.*?')|          # single quotes +            (?:[^'",\#]?.*?)       
# unquoted +        ) +        \s*,\s*                 # comma +        ''', +        re.VERBOSE) + +    # this regexp is used for the value +    # when lists are switched off +    _nolistvalue = re.compile(r'''^ +        ( +            (?:".*?")|          # double quotes +            (?:'.*?')|          # single quotes +            (?:[^'"\#].*?)|     # unquoted +            (?:)                # Empty value +        ) +        \s*(\#.*)?              # optional comment +        $''', +        re.VERBOSE) + +    # regexes for finding triple quoted values on one line +    _single_line_single = re.compile(r"^'''(.*?)'''\s*(#.*)?$") +    _single_line_double = re.compile(r'^"""(.*?)"""\s*(#.*)?$') +    _multi_line_single = re.compile(r"^(.*?)'''\s*(#.*)?$") +    _multi_line_double = re.compile(r'^(.*?)"""\s*(#.*)?$') + +    _triple_quote = { +        "'''": (_single_line_single, _multi_line_single), +        '"""': (_single_line_double, _multi_line_double), +    } + +    # Used by the ``istrue`` Section method +    _bools = { +        'yes': True, 'no': False, +        'on': True, 'off': False, +        '1': True, '0': False, +        'true': True, 'false': False, +        } + + +    def __init__(self, infile=None, options=None, configspec=None, encoding=None, +                 interpolation=True, raise_errors=False, list_values=True, +                 create_empty=False, file_error=False, stringify=True, +                 indent_type=None, default_encoding=None, unrepr=False, +                 write_empty_values=False, _inspec=False): +        """ +        Parse a config file or create a config file object. 
+         +        ``ConfigObj(infile=None, configspec=None, encoding=None, +                    interpolation=True, raise_errors=False, list_values=True, +                    create_empty=False, file_error=False, stringify=True, +                    indent_type=None, default_encoding=None, unrepr=False, +                    write_empty_values=False, _inspec=False)`` +        """ +        self._inspec = _inspec +        # init the superclass +        Section.__init__(self, self, 0, self) +         +        infile = infile or [] +         +        _options = {'configspec': configspec, +                    'encoding': encoding, 'interpolation': interpolation, +                    'raise_errors': raise_errors, 'list_values': list_values, +                    'create_empty': create_empty, 'file_error': file_error, +                    'stringify': stringify, 'indent_type': indent_type, +                    'default_encoding': default_encoding, 'unrepr': unrepr, +                    'write_empty_values': write_empty_values} + +        if options is None: +            options = _options +        else: +            import warnings +            warnings.warn('Passing in an options dictionary to ConfigObj() is ' +                          'deprecated. Use **options instead.', +                          DeprecationWarning, stacklevel=2) +             +            # TODO: check the values too. +            for entry in options: +                if entry not in OPTION_DEFAULTS: +                    raise TypeError('Unrecognised option "%s".' % entry) +            for entry, value in OPTION_DEFAULTS.items(): +                if entry not in options: +                    options[entry] = value +                keyword_value = _options[entry] +                if value != keyword_value: +                    options[entry] = keyword_value +         +        # XXXX this ignores an explicit list_values = True in combination +        # with _inspec. 
The user should *never* do that anyway, but still... +        if _inspec: +            options['list_values'] = False +         +        self._initialise(options) +        configspec = options['configspec'] +        self._original_configspec = configspec +        self._load(infile, configspec) +         +         +    def _load(self, infile, configspec): +        if isinstance(infile, basestring): +            self.filename = infile +            if os.path.isfile(infile): +                h = open(infile, 'rb') +                infile = h.read() or [] +                h.close() +            elif self.file_error: +                # raise an error if the file doesn't exist +                raise IOError('Config file not found: "%s".' % self.filename) +            else: +                # file doesn't already exist +                if self.create_empty: +                    # this is a good test that the filename specified +                    # isn't impossible - like on a non-existent device +                    h = open(infile, 'w') +                    h.write('') +                    h.close() +                infile = [] +                 +        elif isinstance(infile, (list, tuple)): +            infile = list(infile) +             +        elif isinstance(infile, dict): +            # initialise self +            # the Section class handles creating subsections +            if isinstance(infile, ConfigObj): +                # get a copy of our ConfigObj +                def set_section(in_section, this_section): +                    for entry in in_section.scalars: +                        this_section[entry] = in_section[entry] +                    for section in in_section.sections: +                        this_section[section] = {} +                        set_section(in_section[section], this_section[section]) +                set_section(infile, self) +                 +            else: +                for entry in infile: +                    
self[entry] = infile[entry] +            del self._errors +             +            if configspec is not None: +                self._handle_configspec(configspec) +            else: +                self.configspec = None +            return +         +        elif getattr(infile, 'read', MISSING) is not MISSING: +            # This supports file like objects +            infile = infile.read() or [] +            # needs splitting into lines - but needs doing *after* decoding +            # in case it's not an 8 bit encoding +        else: +            raise TypeError('infile must be a filename, file like object, or list of lines.') +         +        if infile: +            # don't do it for the empty ConfigObj +            infile = self._handle_bom(infile) +            # infile is now *always* a list +            # +            # Set the newlines attribute (first line ending it finds) +            # and strip trailing '\n' or '\r' from lines +            for line in infile: +                if (not line) or (line[-1] not in ('\r', '\n', '\r\n')): +                    continue +                for end in ('\r\n', '\n', '\r'): +                    if line.endswith(end): +                        self.newlines = end +                        break +                break + +            infile = [line.rstrip('\r\n') for line in infile] +             +        self._parse(infile) +        # if we had any errors, now is the time to raise them +        if self._errors: +            info = "at line %s." 
% self._errors[0].line_number +            if len(self._errors) > 1: +                msg = "Parsing failed with several errors.\nFirst error %s" % info +                error = ConfigObjError(msg) +            else: +                error = self._errors[0] +            # set the errors attribute; it's a list of tuples: +            # (error_type, message, line_number) +            error.errors = self._errors +            # set the config attribute +            error.config = self +            raise error +        # delete private attributes +        del self._errors +         +        if configspec is None: +            self.configspec = None +        else: +            self._handle_configspec(configspec) +     +     +    def _initialise(self, options=None): +        if options is None: +            options = OPTION_DEFAULTS +             +        # initialise a few variables +        self.filename = None +        self._errors = [] +        self.raise_errors = options['raise_errors'] +        self.interpolation = options['interpolation'] +        self.list_values = options['list_values'] +        self.create_empty = options['create_empty'] +        self.file_error = options['file_error'] +        self.stringify = options['stringify'] +        self.indent_type = options['indent_type'] +        self.encoding = options['encoding'] +        self.default_encoding = options['default_encoding'] +        self.BOM = False +        self.newlines = None +        self.write_empty_values = options['write_empty_values'] +        self.unrepr = options['unrepr'] +         +        self.initial_comment = [] +        self.final_comment = [] +        self.configspec = None +         +        if self._inspec: +            self.list_values = False +         +        # Clear section attributes as well +        Section._initialise(self) +         +         +    def __repr__(self): +        def _getval(key): +            try: +                return self[key] +            except 
MissingInterpolationOption: +                return dict.__getitem__(self, key) +        return ('ConfigObj({%s})' %  +                ', '.join([('%s: %s' % (repr(key), repr(_getval(key))))  +                for key in (self.scalars + self.sections)])) +     +     +    def _handle_bom(self, infile): +        """ +        Handle any BOM, and decode if necessary. +         +        If an encoding is specified, that *must* be used - but the BOM should +        still be removed (and the BOM attribute set). +         +        (If the encoding is wrongly specified, then a BOM for an alternative +        encoding won't be discovered or removed.) +         +        If an encoding is not specified, UTF8 or UTF16 BOM will be detected and +        removed. The BOM attribute will be set. UTF16 will be decoded to +        unicode. +         +        NOTE: This method must not be called with an empty ``infile``. +         +        Specifying the *wrong* encoding is likely to cause a +        ``UnicodeDecodeError``. +         +        ``infile`` must always be returned as a list of lines, but may be +        passed in as a single string. +        """ +        if ((self.encoding is not None) and +            (self.encoding.lower() not in BOM_LIST)): +            # No need to check for a BOM +            # the encoding specified doesn't have one +            # just decode +            return self._decode(infile, self.encoding) +         +        if isinstance(infile, (list, tuple)): +            line = infile[0] +        else: +            line = infile +        if self.encoding is not None: +            # encoding explicitly supplied +            # And it could have an associated BOM +            # TODO: if encoding is just UTF16 - we ought to check for both +            # TODO: big endian and little endian versions. 
+            enc = BOM_LIST[self.encoding.lower()] +            if enc == 'utf_16': +                # For UTF16 we try big endian and little endian +                for BOM, (encoding, final_encoding) in BOMS.items(): +                    if not final_encoding: +                        # skip UTF8 +                        continue +                    if infile.startswith(BOM): +                        ### BOM discovered +                        ##self.BOM = True +                        # Don't need to remove BOM +                        return self._decode(infile, encoding) +                     +                # If we get this far, will *probably* raise a DecodeError +                # As it doesn't appear to start with a BOM +                return self._decode(infile, self.encoding) +             +            # Must be UTF8 +            BOM = BOM_SET[enc] +            if not line.startswith(BOM): +                return self._decode(infile, self.encoding) +             +            newline = line[len(BOM):] +             +            # BOM removed +            if isinstance(infile, (list, tuple)): +                infile[0] = newline +            else: +                infile = newline +            self.BOM = True +            return self._decode(infile, self.encoding) +         +        # No encoding specified - so we need to check for UTF8/UTF16 +        for BOM, (encoding, final_encoding) in BOMS.items(): +            if not line.startswith(BOM): +                continue +            else: +                # BOM discovered +                self.encoding = final_encoding +                if not final_encoding: +                    self.BOM = True +                    # UTF8 +                    # remove BOM +                    newline = line[len(BOM):] +                    if isinstance(infile, (list, tuple)): +                        infile[0] = newline +                    else: +                        infile = newline +                    # UTF8 - 
don't decode +                    if isinstance(infile, basestring): +                        return infile.splitlines(True) +                    else: +                        return infile +                # UTF16 - have to decode +                return self._decode(infile, encoding) +             +        # No BOM discovered and no encoding specified, just return +        if isinstance(infile, basestring): +            # infile read from a file will be a single string +            return infile.splitlines(True) +        return infile + + +    def _a_to_u(self, aString): +        """Decode ASCII strings to unicode if a self.encoding is specified.""" +        if self.encoding: +            return aString.decode('ascii') +        else: +            return aString + + +    def _decode(self, infile, encoding): +        """ +        Decode infile to unicode. Using the specified encoding. +         +        if is a string, it also needs converting to a list. +        """ +        if isinstance(infile, basestring): +            # can't be unicode +            # NOTE: Could raise a ``UnicodeDecodeError`` +            return infile.decode(encoding).splitlines(True) +        for i, line in enumerate(infile): +            if not isinstance(line, unicode): +                # NOTE: The isinstance test here handles mixed lists of unicode/string +                # NOTE: But the decode will break on any non-string values +                # NOTE: Or could raise a ``UnicodeDecodeError`` +                infile[i] = line.decode(encoding) +        return infile + + +    def _decode_element(self, line): +        """Decode element to unicode if necessary.""" +        if not self.encoding: +            return line +        if isinstance(line, str) and self.default_encoding: +            return line.decode(self.default_encoding) +        return line + + +    def _str(self, value): +        """ +        Used by ``stringify`` within validate, to turn non-string values +        into 
strings. +        """ +        if not isinstance(value, basestring): +            return str(value) +        else: +            return value + + +    def _parse(self, infile): +        """Actually parse the config file.""" +        temp_list_values = self.list_values +        if self.unrepr: +            self.list_values = False +             +        comment_list = [] +        done_start = False +        this_section = self +        maxline = len(infile) - 1 +        cur_index = -1 +        reset_comment = False +         +        while cur_index < maxline: +            if reset_comment: +                comment_list = [] +            cur_index += 1 +            line = infile[cur_index] +            sline = line.strip() +            # do we have anything on the line ? +            if not sline or sline.startswith('#'): +                reset_comment = False +                comment_list.append(line) +                continue +             +            if not done_start: +                # preserve initial comment +                self.initial_comment = comment_list +                comment_list = [] +                done_start = True +                 +            reset_comment = True +            # first we check if it's a section marker +            mat = self._sectionmarker.match(line) +            if mat is not None: +                # is a section line +                (indent, sect_open, sect_name, sect_close, comment) = mat.groups() +                if indent and (self.indent_type is None): +                    self.indent_type = indent +                cur_depth = sect_open.count('[') +                if cur_depth != sect_close.count(']'): +                    self._handle_error("Cannot compute the section depth at line %s.", +                                       NestingError, infile, cur_index) +                    continue +                 +                if cur_depth < this_section.depth: +                    # the new section is dropping back to a 
previous level +                    try: +                        parent = self._match_depth(this_section, +                                                   cur_depth).parent +                    except SyntaxError: +                        self._handle_error("Cannot compute nesting level at line %s.", +                                           NestingError, infile, cur_index) +                        continue +                elif cur_depth == this_section.depth: +                    # the new section is a sibling of the current section +                    parent = this_section.parent +                elif cur_depth == this_section.depth + 1: +                    # the new section is a child the current section +                    parent = this_section +                else: +                    self._handle_error("Section too nested at line %s.", +                                       NestingError, infile, cur_index) +                     +                sect_name = self._unquote(sect_name) +                if sect_name in parent: +                    self._handle_error('Duplicate section name at line %s.', +                                       DuplicateError, infile, cur_index) +                    continue +                 +                # create the new section +                this_section = Section( +                    parent, +                    cur_depth, +                    self, +                    name=sect_name) +                parent[sect_name] = this_section +                parent.inline_comments[sect_name] = comment +                parent.comments[sect_name] = comment_list +                continue +            # +            # it's not a section marker, +            # so it should be a valid ``key = value`` line +            mat = self._keyword.match(line) +            if mat is None: +                # it neither matched as a keyword +                # or a section marker +                self._handle_error( +                    
'Invalid line at line "%s".', +                    ParseError, infile, cur_index) +            else: +                # is a keyword value +                # value will include any inline comment +                (indent, key, value) = mat.groups() +                if indent and (self.indent_type is None): +                    self.indent_type = indent +                # check for a multiline value +                if value[:3] in ['"""', "'''"]: +                    try: +                        value, comment, cur_index = self._multiline( +                            value, infile, cur_index, maxline) +                    except SyntaxError: +                        self._handle_error( +                            'Parse error in value at line %s.', +                            ParseError, infile, cur_index) +                        continue +                    else: +                        if self.unrepr: +                            comment = '' +                            try: +                                value = unrepr(value) +                            except Exception, e: +                                if type(e) == UnknownType: +                                    msg = 'Unknown name or type in value at line %s.' +                                else: +                                    msg = 'Parse error in value at line %s.' +                                self._handle_error(msg, UnreprError, infile, +                                    cur_index) +                                continue +                else: +                    if self.unrepr: +                        comment = '' +                        try: +                            value = unrepr(value) +                        except Exception, e: +                            if isinstance(e, UnknownType): +                                msg = 'Unknown name or type in value at line %s.' 
+                            else: +                                msg = 'Parse error in value at line %s.' +                            self._handle_error(msg, UnreprError, infile, +                                cur_index) +                            continue +                    else: +                        # extract comment and lists +                        try: +                            (value, comment) = self._handle_value(value) +                        except SyntaxError: +                            self._handle_error( +                                'Parse error in value at line %s.', +                                ParseError, infile, cur_index) +                            continue +                # +                key = self._unquote(key) +                if key in this_section: +                    self._handle_error( +                        'Duplicate keyword name at line %s.', +                        DuplicateError, infile, cur_index) +                    continue +                # add the key. +                # we set unrepr because if we have got this far we will never +                # be creating a new section +                this_section.__setitem__(key, value, unrepr=True) +                this_section.inline_comments[key] = comment +                this_section.comments[key] = comment_list +                continue +        # +        if self.indent_type is None: +            # no indentation used, set the type accordingly +            self.indent_type = '' + +        # preserve the final comment +        if not self and not self.initial_comment: +            self.initial_comment = comment_list +        elif not reset_comment: +            self.final_comment = comment_list +        self.list_values = temp_list_values + + +    def _match_depth(self, sect, depth): +        """ +        Given a section and a depth level, walk back through the sections +        parents to see if the depth level matches a previous section. 
+         +        Return a reference to the right section, +        or raise a SyntaxError. +        """ +        while depth < sect.depth: +            if sect is sect.parent: +                # we've reached the top level already +                raise SyntaxError() +            sect = sect.parent +        if sect.depth == depth: +            return sect +        # shouldn't get here +        raise SyntaxError() + + +    def _handle_error(self, text, ErrorClass, infile, cur_index): +        """ +        Handle an error according to the error settings. +         +        Either raise the error or store it. +        The error will have occured at ``cur_index`` +        """ +        line = infile[cur_index] +        cur_index += 1 +        message = text % cur_index +        error = ErrorClass(message, cur_index, line) +        if self.raise_errors: +            # raise the error - parsing stops here +            raise error +        # store the error +        # reraise when parsing has finished +        self._errors.append(error) + + +    def _unquote(self, value): +        """Return an unquoted version of a value""" +        if not value: +            # should only happen during parsing of lists +            raise SyntaxError +        if (value[0] == value[-1]) and (value[0] in ('"', "'")): +            value = value[1:-1] +        return value + + +    def _quote(self, value, multiline=True): +        """ +        Return a safely quoted version of a value. +         +        Raise a ConfigObjError if the value cannot be safely quoted. +        If multiline is ``True`` (default) then use triple quotes +        if necessary. +         +        * Don't quote values that don't need it. +        * Recursively quote members of a list and return a comma joined list. +        * Multiline is ``False`` for lists. +        * Obey list syntax for empty and single member lists. 
+         +        If ``list_values=False`` then the value is only quoted if it contains +        a ``\\n`` (is multiline) or '#'. +         +        If ``write_empty_values`` is set, and the value is an empty string, it +        won't be quoted. +        """ +        if multiline and self.write_empty_values and value == '': +            # Only if multiline is set, so that it is used for values not +            # keys, and not values that are part of a list +            return '' +         +        if multiline and isinstance(value, (list, tuple)): +            if not value: +                return ',' +            elif len(value) == 1: +                return self._quote(value[0], multiline=False) + ',' +            return ', '.join([self._quote(val, multiline=False) +                for val in value]) +        if not isinstance(value, basestring): +            if self.stringify: +                value = str(value) +            else: +                raise TypeError('Value "%s" is not a string.' % value) + +        if not value: +            return '""' +         +        no_lists_no_quotes = not self.list_values and '\n' not in value and '#' not in value +        need_triple = multiline and ((("'" in value) and ('"' in value)) or ('\n' in value )) +        hash_triple_quote = multiline and not need_triple and ("'" in value) and ('"' in value) and ('#' in value) +        check_for_single = (no_lists_no_quotes or not need_triple) and not hash_triple_quote +         +        if check_for_single: +            if not self.list_values: +                # we don't quote if ``list_values=False`` +                quot = noquot +            # for normal values either single or double quotes will do +            elif '\n' in value: +                # will only happen if multiline is off - e.g. '\n' in key +                raise ConfigObjError('Value "%s" cannot be safely quoted.' 
% value) +            elif ((value[0] not in wspace_plus) and +                    (value[-1] not in wspace_plus) and +                    (',' not in value)): +                quot = noquot +            else: +                quot = self._get_single_quote(value) +        else: +            # if value has '\n' or "'" *and* '"', it will need triple quotes +            quot = self._get_triple_quote(value) +         +        if quot == noquot and '#' in value and self.list_values: +            quot = self._get_single_quote(value) +                 +        return quot % value +     +     +    def _get_single_quote(self, value): +        if ("'" in value) and ('"' in value): +            raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) +        elif '"' in value: +            quot = squot +        else: +            quot = dquot +        return quot +     +     +    def _get_triple_quote(self, value): +        if (value.find('"""') != -1) and (value.find("'''") != -1): +            raise ConfigObjError('Value "%s" cannot be safely quoted.' % value) +        if value.find('"""') == -1: +            quot = tdquot +        else: +            quot = tsquot  +        return quot + + +    def _handle_value(self, value): +        """ +        Given a value string, unquote, remove comment, +        handle lists. (including empty and single member lists) +        """ +        if self._inspec: +            # Parsing a configspec so don't handle comments +            return (value, '') +        # do we look for lists in values ? 
+        if not self.list_values: +            mat = self._nolistvalue.match(value) +            if mat is None: +                raise SyntaxError() +            # NOTE: we don't unquote here +            return mat.groups() +        # +        mat = self._valueexp.match(value) +        if mat is None: +            # the value is badly constructed, probably badly quoted, +            # or an invalid list +            raise SyntaxError() +        (list_values, single, empty_list, comment) = mat.groups() +        if (list_values == '') and (single is None): +            # change this if you want to accept empty values +            raise SyntaxError() +        # NOTE: note there is no error handling from here if the regex +        # is wrong: then incorrect values will slip through +        if empty_list is not None: +            # the single comma - meaning an empty list +            return ([], comment) +        if single is not None: +            # handle empty values +            if list_values and not single: +                # FIXME: the '' is a workaround because our regex now matches +                #   '' at the end of a list if it has a trailing comma +                single = None +            else: +                single = single or '""' +                single = self._unquote(single) +        if list_values == '': +            # not a list value +            return (single, comment) +        the_list = self._listvalueexp.findall(list_values) +        the_list = [self._unquote(val) for val in the_list] +        if single is not None: +            the_list += [single] +        return (the_list, comment) + + +    def _multiline(self, value, infile, cur_index, maxline): +        """Extract the value, where we are in a multiline situation.""" +        quot = value[:3] +        newvalue = value[3:] +        single_line = self._triple_quote[quot][0] +        multi_line = self._triple_quote[quot][1] +        mat = single_line.match(value) +        if mat is 
not None: +            retval = list(mat.groups()) +            retval.append(cur_index) +            return retval +        elif newvalue.find(quot) != -1: +            # somehow the triple quote is missing +            raise SyntaxError() +        # +        while cur_index < maxline: +            cur_index += 1 +            newvalue += '\n' +            line = infile[cur_index] +            if line.find(quot) == -1: +                newvalue += line +            else: +                # end of multiline, process it +                break +        else: +            # we've got to the end of the config, oops... +            raise SyntaxError() +        mat = multi_line.match(line) +        if mat is None: +            # a badly formed line +            raise SyntaxError() +        (value, comment) = mat.groups() +        return (newvalue + value, comment, cur_index) + + +    def _handle_configspec(self, configspec): +        """Parse the configspec.""" +        # FIXME: Should we check that the configspec was created with the  +        #        correct settings ? (i.e. ``list_values=False``) +        if not isinstance(configspec, ConfigObj): +            try: +                configspec = ConfigObj(configspec, +                                       raise_errors=True, +                                       file_error=True, +                                       _inspec=True) +            except ConfigObjError, e: +                # FIXME: Should these errors have a reference +                #        to the already parsed ConfigObj ? +                raise ConfigspecError('Parsing configspec failed: %s' % e) +            except IOError, e: +                raise IOError('Reading configspec failed: %s' % e) +         +        self.configspec = configspec +             + +         +    def _set_configspec(self, section, copy): +        """ +        Called by validate. 
Handles setting the configspec on subsections +        including sections to be validated by __many__ +        """ +        configspec = section.configspec +        many = configspec.get('__many__') +        if isinstance(many, dict): +            for entry in section.sections: +                if entry not in configspec: +                    section[entry].configspec = many +                     +        for entry in configspec.sections: +            if entry == '__many__': +                continue +            if entry not in section: +                section[entry] = {} +                section[entry]._created = True +                if copy: +                    # copy comments +                    section.comments[entry] = configspec.comments.get(entry, []) +                    section.inline_comments[entry] = configspec.inline_comments.get(entry, '') +                 +            # Could be a scalar when we expect a section +            if isinstance(section[entry], Section): +                section[entry].configspec = configspec[entry] +                         + +    def _write_line(self, indent_string, entry, this_entry, comment): +        """Write an individual line, for the write method""" +        # NOTE: the calls to self._quote here handles non-StringType values. 
+        if not self.unrepr: +            val = self._decode_element(self._quote(this_entry)) +        else: +            val = repr(this_entry) +        return '%s%s%s%s%s' % (indent_string, +                               self._decode_element(self._quote(entry, multiline=False)), +                               self._a_to_u(' = '), +                               val, +                               self._decode_element(comment)) + + +    def _write_marker(self, indent_string, depth, entry, comment): +        """Write a section marker line""" +        return '%s%s%s%s%s' % (indent_string, +                               self._a_to_u('[' * depth), +                               self._quote(self._decode_element(entry), multiline=False), +                               self._a_to_u(']' * depth), +                               self._decode_element(comment)) + + +    def _handle_comment(self, comment): +        """Deal with a comment.""" +        if not comment: +            return '' +        start = self.indent_type +        if not comment.startswith('#'): +            start += self._a_to_u(' # ') +        return (start + comment) + + +    # Public methods + +    def write(self, outfile=None, section=None): +        """ +        Write the current ConfigObj as a file +         +        tekNico: FIXME: use StringIO instead of real files +         +        >>> filename = a.filename +        >>> a.filename = 'test.ini' +        >>> a.write() +        >>> a.filename = filename +        >>> a == ConfigObj('test.ini', raise_errors=True) +        1 +        >>> import os +        >>> os.remove('test.ini') +        """ +        if self.indent_type is None: +            # this can be true if initialised from a dictionary +            self.indent_type = DEFAULT_INDENT_TYPE +             +        out = [] +        cs = self._a_to_u('#') +        csp = self._a_to_u('# ') +        if section is None: +            int_val = self.interpolation +            self.interpolation = 
False +            section = self +            for line in self.initial_comment: +                line = self._decode_element(line) +                stripped_line = line.strip() +                if stripped_line and not stripped_line.startswith(cs): +                    line = csp + line +                out.append(line) +                 +        indent_string = self.indent_type * section.depth +        for entry in (section.scalars + section.sections): +            if entry in section.defaults: +                # don't write out default values +                continue +            for comment_line in section.comments[entry]: +                comment_line = self._decode_element(comment_line.lstrip()) +                if comment_line and not comment_line.startswith(cs): +                    comment_line = csp + comment_line +                out.append(indent_string + comment_line) +            this_entry = section[entry] +            comment = self._handle_comment(section.inline_comments[entry]) +             +            if isinstance(this_entry, dict): +                # a section +                out.append(self._write_marker( +                    indent_string, +                    this_entry.depth, +                    entry, +                    comment)) +                out.extend(self.write(section=this_entry)) +            else: +                out.append(self._write_line( +                    indent_string, +                    entry, +                    this_entry, +                    comment)) +                 +        if section is self: +            for line in self.final_comment: +                line = self._decode_element(line) +                stripped_line = line.strip() +                if stripped_line and not stripped_line.startswith(cs): +                    line = csp + line +                out.append(line) +            self.interpolation = int_val +             +        if section is not self: +            return out +         +      
  if (self.filename is None) and (outfile is None): +            # output a list of lines +            # might need to encode +            # NOTE: This will *screw* UTF16, each line will start with the BOM +            if self.encoding: +                out = [l.encode(self.encoding) for l in out] +            if (self.BOM and ((self.encoding is None) or +                (BOM_LIST.get(self.encoding.lower()) == 'utf_8'))): +                # Add the UTF8 BOM +                if not out: +                    out.append('') +                out[0] = BOM_UTF8 + out[0] +            return out +         +        # Turn the list to a string, joined with correct newlines +        newline = self.newlines or os.linesep +        if (getattr(outfile, 'mode', None) is not None and outfile.mode == 'w' +            and sys.platform == 'win32' and newline == '\r\n'): +            # Windows specific hack to avoid writing '\r\r\n' +            newline = '\n' +        output = self._a_to_u(newline).join(out) +        if self.encoding: +            output = output.encode(self.encoding) +        if self.BOM and ((self.encoding is None) or match_utf8(self.encoding)): +            # Add the UTF8 BOM +            output = BOM_UTF8 + output +             +        if not output.endswith(newline): +            output += newline +        if outfile is not None: +            outfile.write(output) +        else: +            h = open(self.filename, 'wb') +            h.write(output) +            h.close() + + +    def validate(self, validator, preserve_errors=False, copy=False, +                 section=None): +        """ +        Test the ConfigObj against a configspec. +         +        It uses the ``validator`` object from *validate.py*. 
+         +        To run ``validate`` on the current ConfigObj, call: :: +         +            test = config.validate(validator) +         +        (Normally having previously passed in the configspec when the ConfigObj +        was created - you can dynamically assign a dictionary of checks to the +        ``configspec`` attribute of a section though). +         +        It returns ``True`` if everything passes, or a dictionary of +        pass/fails (True/False). If every member of a subsection passes, it +        will just have the value ``True``. (It also returns ``False`` if all +        members fail). +         +        In addition, it converts the values from strings to their native +        types if their checks pass (and ``stringify`` is set). +         +        If ``preserve_errors`` is ``True`` (``False`` is default) then instead +        of a marking a fail with a ``False``, it will preserve the actual +        exception object. This can contain info about the reason for failure. +        For example the ``VdtValueTooSmallError`` indicates that the value +        supplied was too small. If a value (or section) is missing it will +        still be marked as ``False``. +         +        You must have the validate module to use ``preserve_errors=True``. +         +        You can then use the ``flatten_errors`` function to turn your nested +        results dictionary into a flattened list of failures - useful for +        displaying meaningful error messages. 
+        """ +        if section is None: +            if self.configspec is None: +                raise ValueError('No configspec supplied.') +            if preserve_errors: +                # We do this once to remove a top level dependency on the validate module +                # Which makes importing configobj faster +                from validate import VdtMissingValue +                self._vdtMissingValue = VdtMissingValue +                 +            section = self + +            if copy: +                section.initial_comment = section.configspec.initial_comment +                section.final_comment = section.configspec.final_comment +                section.encoding = section.configspec.encoding +                section.BOM = section.configspec.BOM +                section.newlines = section.configspec.newlines +                section.indent_type = section.configspec.indent_type +             +        # +        # section.default_values.clear() #?? +        configspec = section.configspec +        self._set_configspec(section, copy) + +         +        def validate_entry(entry, spec, val, missing, ret_true, ret_false): +            section.default_values.pop(entry, None) +                 +            try: +                section.default_values[entry] = validator.get_default_value(configspec[entry]) +            except (KeyError, AttributeError, validator.baseErrorClass): +                # No default, bad default or validator has no 'get_default_value' +                # (e.g. 
SimpleVal) +                pass +             +            try: +                check = validator.check(spec, +                                        val, +                                        missing=missing +                                        ) +            except validator.baseErrorClass, e: +                if not preserve_errors or isinstance(e, self._vdtMissingValue): +                    out[entry] = False +                else: +                    # preserve the error +                    out[entry] = e +                    ret_false = False +                ret_true = False +            else: +                ret_false = False +                out[entry] = True +                if self.stringify or missing: +                    # if we are doing type conversion +                    # or the value is a supplied default +                    if not self.stringify: +                        if isinstance(check, (list, tuple)): +                            # preserve lists +                            check = [self._str(item) for item in check] +                        elif missing and check is None: +                            # convert the None from a default to a '' +                            check = '' +                        else: +                            check = self._str(check) +                    if (check != val) or missing: +                        section[entry] = check +                if not copy and missing and entry not in section.defaults: +                    section.defaults.append(entry) +            return ret_true, ret_false +         +        # +        out = {} +        ret_true = True +        ret_false = True +         +        unvalidated = [k for k in section.scalars if k not in configspec] +        incorrect_sections = [k for k in configspec.sections if k in section.scalars]         +        incorrect_scalars = [k for k in configspec.scalars if k in section.sections] +         +        for entry in 
configspec.scalars: +            if entry in ('__many__', '___many___'): +                # reserved names +                continue +            if (not entry in section.scalars) or (entry in section.defaults): +                # missing entries +                # or entries from defaults +                missing = True +                val = None +                if copy and entry not in section.scalars: +                    # copy comments +                    section.comments[entry] = ( +                        configspec.comments.get(entry, [])) +                    section.inline_comments[entry] = ( +                        configspec.inline_comments.get(entry, '')) +                # +            else: +                missing = False +                val = section[entry] +             +            ret_true, ret_false = validate_entry(entry, configspec[entry], val,  +                                                 missing, ret_true, ret_false) +         +        many = None +        if '__many__' in configspec.scalars: +            many = configspec['__many__'] +        elif '___many___' in configspec.scalars: +            many = configspec['___many___'] +         +        if many is not None: +            for entry in unvalidated: +                val = section[entry] +                ret_true, ret_false = validate_entry(entry, many, val, False, +                                                     ret_true, ret_false) +            unvalidated = [] + +        for entry in incorrect_scalars: +            ret_true = False +            if not preserve_errors: +                out[entry] = False +            else: +                ret_false = False +                msg = 'Value %r was provided as a section' % entry +                out[entry] = validator.baseErrorClass(msg) +        for entry in incorrect_sections: +            ret_true = False +            if not preserve_errors: +                out[entry] = False +            else: +                
ret_false = False +                msg = 'Section %r was provided as a single value' % entry +                out[entry] = validator.baseErrorClass(msg) +                 +        # Missing sections will have been created as empty ones when the +        # configspec was read. +        for entry in section.sections: +            # FIXME: this means DEFAULT is not copied in copy mode +            if section is self and entry == 'DEFAULT': +                continue +            if section[entry].configspec is None: +                unvalidated.append(entry) +                continue +            if copy: +                section.comments[entry] = configspec.comments.get(entry, []) +                section.inline_comments[entry] = configspec.inline_comments.get(entry, '') +            check = self.validate(validator, preserve_errors=preserve_errors, copy=copy, section=section[entry]) +            out[entry] = check +            if check == False: +                ret_true = False +            elif check == True: +                ret_false = False +            else: +                ret_true = False +         +        section.extra_values = unvalidated +        if preserve_errors and not section._created: +            # If the section wasn't created (i.e. it wasn't missing) +            # then we can't return False, we need to preserve errors +            ret_false = False +        # +        if ret_false and preserve_errors and out: +            # If we are preserving errors, but all +            # the failures are from missing sections / values +            # then we can return False. Otherwise there is a +            # real failure that we need to preserve. 
+            ret_false = not any(out.values()) +        if ret_true: +            return True +        elif ret_false: +            return False +        return out + + +    def reset(self): +        """Clear ConfigObj instance and restore to 'freshly created' state.""" +        self.clear() +        self._initialise() +        # FIXME: Should be done by '_initialise', but ConfigObj constructor (and reload) +        #        requires an empty dictionary +        self.configspec = None +        # Just to be sure ;-) +        self._original_configspec = None +         +         +    def reload(self): +        """ +        Reload a ConfigObj from file. +         +        This method raises a ``ReloadError`` if the ConfigObj doesn't have +        a filename attribute pointing to a file. +        """ +        if not isinstance(self.filename, basestring): +            raise ReloadError() + +        filename = self.filename +        current_options = {} +        for entry in OPTION_DEFAULTS: +            if entry == 'configspec': +                continue +            current_options[entry] = getattr(self, entry) +             +        configspec = self._original_configspec +        current_options['configspec'] = configspec +             +        self.clear() +        self._initialise(current_options) +        self._load(filename, configspec) +         + + +class SimpleVal(object): +    """ +    A simple validator. +    Can be used to check that all members expected are present. +     +    To use it, provide a configspec with all your members in (the value given +    will be ignored). Pass an instance of ``SimpleVal`` to the ``validate`` +    method of your ``ConfigObj``. ``validate`` will return ``True`` if all +    members are present, or a dictionary with True/False meaning +    present/missing. 
def flatten_errors(cfg, res, levels=None, results=None):
    """
    Flatten the nested outcome of ``ConfigObj.validate`` into a plain list.

    ``cfg`` is the ConfigObj that was validated and ``res`` is the value
    ``validate`` returned for it.  ``levels`` and ``results`` only exist to
    carry state between the recursive calls; callers should not pass them.

    The return value is a list of ``(sections, key, result)`` tuples, one per
    failure:

    * ``sections`` is the list of section names leading to the failure
      (empty for the top level);
    * ``key`` is the name of the failing member, or ``None`` when ``res``
      for a whole section was ``False`` or an exception;
    * ``result`` is the stored outcome, i.e. ``False`` or the exception
      object found in ``res``.

    Entries of ``res`` that compare equal to ``True`` are passes and are
    left out of the list.
    """
    if levels is None:
        # outermost call: start with fresh accumulators
        levels = []
        results = []

    if res == True:
        return results

    whole_section_failed = res == False or isinstance(res, Exception)
    if whole_section_failed:
        results.append((list(levels), None, res))
    else:
        for key, outcome in res.items():
            if outcome == True:
                continue
            if isinstance(cfg.get(key), dict):
                # descend into the subsection; the recursive call removes
                # ``key`` from ``levels`` again before it returns
                levels.append(key)
                flatten_errors(cfg[key], outcome, levels, results)
            else:
                results.append((list(levels), key, outcome))

    # this section is finished: step back up one level
    if levels:
        levels.pop()
    return results
+    """ +    out = [] +     +    out.extend([(_prepend, name) for name in conf.extra_values]) +    for name in conf.sections: +        if name not in conf.extra_values: +            out.extend(get_extra_values(conf[name], _prepend + (name,))) +    return out + + +"""*A programming language is a medium of expression.* - Paul Graham""" diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py new file mode 100644 index 00000000..fd4a4148 --- /dev/null +++ b/python/pkg/cdec/sa/__init__.py @@ -0,0 +1,4 @@ +from cdec.sa._sa import sym_fromstring,\ +        SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ +        HieroCachingRuleFactory, Sampler +from cdec.sa.extractor import GrammarExtractor diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py new file mode 100644 index 00000000..2a89243b --- /dev/null +++ b/python/pkg/cdec/sa/compile.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +import argparse +import os +import logging +import cdec.configobj +import cdec.sa + +MAX_PHRASE_LENGTH = 4 +def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2): +    lcp = cdec.sa.LCP(f_sa) +    stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True) +    precomp = cdec.sa.Precomputation(from_stats=stats, +            fsarray=f_sa, +            precompute_rank=rank1, +            precompute_secondary_rank=rank2, +            max_length=max_len, +            max_nonterminals=max_nt, +            train_max_initial_size=max_size, +            train_min_gap_size=min_gap) +    return precomp + +def main(): +    logging.basicConfig(level=logging.INFO) +    logger = logging.getLogger('cdec.sa.compile') +    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.') +    parser.add_argument('--maxnt', '-n', type=int, default=2, +                        help='Maximum number of non-terminal symbols') +    parser.add_argument('--maxlen', '-l', type=int, default=5, +                        
help='Maximum number of terminals') +    parser.add_argument('--maxsize', '-s', type=int, default=15, +                        help='Maximum rule span') +    parser.add_argument('--mingap', '-g', type=int, default=1, +                        help='Minimum gap size') +    parser.add_argument('--rank1', '-r1', type=int, default=100, +                        help='Number of pre-computed frequent patterns') +    parser.add_argument('--rank2', '-r2', type=int, default=10, +                        help='Number of pre-computed super-frequent patterns)') +    parser.add_argument('-c', '--config', default='/dev/stdout', +                        help='Output configuration') +    parser.add_argument('-f', '--source', +                        help='Source language corpus') +    parser.add_argument('-e', '--target', +                        help='Target language corpus') +    parser.add_argument('-b', '--bitext', +                        help='Parallel text (source ||| target)') +    parser.add_argument('-a', '--alignment', required=True, +                        help='Bitext word alignment') +    parser.add_argument('-o', '--output', required=True, +                        help='Output path') +    args = parser.parse_args() + +    if not ((args.source and args.target) or args.bitext): +        parser.error('a parallel corpus is required\n' +        '\tuse -f (source) with -e (target) or -b (bitext)') + +    param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2") +    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2) + +    if not os.path.exists(args.output): +        os.mkdir(args.output) + +    f_sa_bin = os.path.join(args.output, 'f.sa.bin') +    e_bin = os.path.join(args.output, 'e.bin') +    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params) +    precomp_bin = os.path.join(args.output, precomp_file) +    a_bin = os.path.join(args.output, 'a.bin') +    lex_bin = os.path.join(args.output, 'lex.bin') + 
def main():
    """
    Extract one grammar file per input sentence from a compiled corpus.

    Command line options:

    * ``-c/--config``: extractor configuration passed to
      ``cdec.sa.GrammarExtractor``;
    * ``-g/--grammars``: directory that receives the grammar files (created
      if it does not exist).

    Sentences are read from standard input, one per line.  For sentence
    number ``i`` (counting from 0) the rules returned by
    ``extractor.grammar`` are written to ``<grammars>/grammar.<i>``, one rule
    per line, and a ``<seg grammar="...">`` line carrying the absolute path
    of that file and the sentence is printed to standard output.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
    parser.add_argument('-c', '--config', required=True,
                        help='Extractor configuration')
    parser.add_argument('-g', '--grammars', required=True,
                        help='Grammar output path')
    args = parser.parse_args()

    # makedirs also creates missing parent directories, which mkdir refuses
    # to do for a nested output path.
    if not os.path.isdir(args.grammars):
        os.makedirs(args.grammars)

    extractor = cdec.sa.GrammarExtractor(args.config)
    for i, sentence in enumerate(sys.stdin):
        # Strip only the line terminator.  Slicing off the last character
        # unconditionally would eat a real character when the final line of
        # the input has no trailing newline.
        sentence = sentence.rstrip('\n')
        grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i))
        with open(grammar_file, 'w') as output:
            for rule in extractor.grammar(sentence):
                output.write(str(rule)+'\n')
        grammar_file = os.path.abspath(grammar_file)
        print('<seg grammar="{0}">{1}</seg>'.format(grammar_file, sentence))
number of symbols (both T and NT) allowed in a rule +                max_length=config['max_len'], +                # maximum number of nonterminals allowed in a rule (set >2 at your own risk) +                max_nonterminals=config['max_nt'], +                # maximum number of contiguous chunks of terminal symbols +                # in target-side RHS of a rule. +                max_target_chunks=config['max_nt']+1, +                # maximum number of target side symbols (both T and NT) allowed in a rule. +                max_target_length=MAX_INITIAL_SIZE, +                # minimum span of a nonterminal in the RHS of a rule in TEST DATA +                min_gap_size=1, +                # filename of file containing precomputed collocations +                precompute_file=config['precompute_file'], +                # maximum frequency rank of patterns used to compute triples (< 20) +                precompute_secondary_rank=config['rank2'], +                # maximum frequency rank of patterns used to compute collocations (< 300) +                precompute_rank=config['rank1'], +                # require extracted rules to have at least one aligned word +                require_aligned_terminal=True, +                # require each contiguous chunk of extracted rules +                # to have at least one aligned word +                require_aligned_chunks=False, +                # maximum span of a grammar rule extracted from TRAINING DATA +                train_max_initial_size=config['max_size'], +                # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA +                train_min_gap_size=config['min_gap'], +                # True if phrases should be tight, False otherwise (better but slower) +                tight_phrases=True, +                ) + +        # lexical weighting tables +        tt = cdec.sa.BiLex(from_binary=config['lex_file']) + +        self.models = (EgivenFCoherent, SampleCountF, CountEF,  +       
         MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) + +        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) +        edarray = cdec.sa.DataArray(from_binary=config['e_file']) + +        # lower=faster, higher=better; improvements level off above 200-300 range, +        # -1 = don't sample, use all data (VERY SLOW!) +        sampler = cdec.sa.Sampler(300, fsarray) + +        self.factory.configure(fsarray, edarray, sampler) + +    def grammar(self, sentence): +        if isinstance(sentence, unicode): +            sentence = sentence.encode('utf8') +        cnet = chain(('<s>',), sentence.split(), ('</s>',)) +        cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet) +        cnet = tuple(((word, None, 1), ) for word in cnet) +        return self.factory.input(cnet, self.models) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py new file mode 100644 index 00000000..325b9e13 --- /dev/null +++ b/python/pkg/cdec/sa/features.py @@ -0,0 +1,57 @@ +from __future__ import division +import math + +MAXSCORE = 99 + +def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) +    return -math.log10(paircount/fcount) + +def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): +    return math.log10(1 + paircount) + +def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): +    return math.log10(1 + fsample_count) + +def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): +    prob = paircount/fsample_count +    return -math.log10(prob) if prob > 0 else MAXSCORE + +def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): +    return -math.log10(fcount/fsample_count) + +def MaxLexEgivenF(ttable): +    def feature(fphrase, ephrase, paircount, fcount, fsample_count): +        fwords = fphrase.words +        fwords.append('NULL') +        def score(): +            for e in ephrase.words: +              maxScore = 
max(ttable.get_score(f, e, 0) for f in fwords) +              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE +        return sum(score()) +    return feature + +def MaxLexFgivenE(ttable): +    def feature(fphrase, ephrase, paircount, fcount, fsample_count): +        ewords = ephrase.words +        ewords.append('NULL') +        def score(): +            for f in fphrase.words: +              maxScore = max(ttable.get_score(f, e, 1) for e in ewords) +              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE +        return sum(score()) +    return feature + +def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): +    return (fcount == 1) + +def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): +    return (paircount == 1) + +def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): +    return (fcount > 1) + +def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): +    return (paircount > 1) + +def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): +    return (paircount > 0.01) diff --git a/python/pkg/cdec/score.py b/python/pkg/cdec/score.py new file mode 100644 index 00000000..22257774 --- /dev/null +++ b/python/pkg/cdec/score.py @@ -0,0 +1 @@ +from _cdec import BLEU, TER, CER, Metric | 
