kll/funcparserlib/lexer.py

# -*- coding: utf-8 -*-

# Copyright (c) 2008/2013 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

__all__ = ['make_tokenizer', 'Token', 'LexerError']

import re


class LexerError(Exception):
    """Error describing a position in the input that could not be tokenized.

    Attributes:
        place: ``(line, pos)`` pair locating the problem; both items are
            formatted as integers by ``__str__``.
        msg: text included (in double quotes) in the formatted message.
    """

    def __init__(self, place, msg):
        self.place, self.msg = place, msg

    def __str__(self):
        """Return ``cannot tokenize data: <line>,<pos>: "<msg>"``."""
        line, pos = self.place
        prefix = 'cannot tokenize data'
        return '%s: %d,%d: "%s"' % (prefix, line, pos, self.msg)


class Token(object):
    """A single lexical token.

    Attributes:
        type: the kind of the token (a string such as ``'NAME'``;
            ``pformat`` calls ``ljust`` on it).
        value: the text of the token.
        start: optional ``(line, pos)`` pair where the token begins.
        end: optional ``(line, pos)`` pair where the token ends.

    Two tokens compare equal when their ``type`` and ``value`` are equal;
    positions are ignored.
    """

    def __init__(self, type, value, start=None, end=None):
        # `type` shadows the builtin, but the name is part of the public
        # signature (callers may pass it by keyword), so it is kept.
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __eq__(self, other):
        """Compare by ``type`` and ``value``.

        Any object exposing both attributes is accepted. For anything else
        (``None``, plain strings, ...) ``NotImplemented`` is returned so that
        ``==`` evaluates to ``False`` instead of raising ``AttributeError``.
        """
        # FIXME: Case sensitivity is assumed here
        try:
            other_type, other_value = other.type, other.value
        except AttributeError:
            return NotImplemented
        return self.type == other_type and self.value == other_value

    def __ne__(self, other):
        """Inverse of ``__eq__``.

        Python 3 derives this automatically; Python 2 does not, so it is
        spelled out to keep ``!=`` consistent with ``==`` on both.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def _pos_str(self):
        """Return ``'sl,sp-el,ep:'``, or ``''`` when a position is missing."""
        if self.start is None or self.end is None:
            return ''
        sl, sp = self.start
        el, ep = self.end
        return '%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        # Without position info the string starts with a space; trim it.
        return s.strip()

    @property
    def name(self):
        """Alias for ``value``."""
        return self.value

    def pformat(self):
        """Return a column-aligned representation (position, type, value)."""
        return "%s %s '%s'" % (self._pos_str().ljust(20),
                                self.type.ljust(14),
                                self.value)


def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))

    Build a tokenizer from an ordered list of token specs.

    Args:
        specs: sequence of ``(token_type, re_args)`` pairs, where ``re_args``
            is the argument tuple for ``re.compile`` (a pattern, optionally
            followed by flags). Specs are tried in the given order and the
            first one that matches at the current position wins.

    Returns:
        A generator function taking the text to tokenize and lazily yielding
        ``Token`` objects. ``Token.start`` is a 1-based ``(line, column)``
        pair; ``Token.end`` is the ``(line, column)`` of the last character.

    Raises:
        LexerError: raised by the returned tokenizer (while iterating) when
            no spec matches a non-empty string at the current position. It
            carries the 1-based position and the text of the offending line.
    """
    # Compile every pattern once, up front, rather than on each match.
    compiled = [(name, re.compile(*args)) for name, args in specs]

    def match_specs(specs, text, i, position):
        """Return the Token for the first spec matching `text` at offset `i`."""
        line, pos = position
        for token_type, regexp in specs:
            m = regexp.match(text, i)
            # A zero-length match consumes no input: accepting it would make
            # the tokenizer emit empty tokens forever without advancing.
            if m is None or m.end() == i:
                continue
            value = m.group()
            nls = value.count('\n')
            if nls == 0:
                n_pos = pos + len(value)
            else:
                # Column restarts after the last newline inside the token.
                n_pos = len(value) - value.rfind('\n') - 1
            return Token(token_type, value,
                         (line, pos + 1), (line + nls, n_pos))
        # Lines are counted by '\n' only, so the offending line must be
        # located the same way; str.splitlines() also breaks on '\r', '\x0c',
        # '\u2028', ... and could point at the wrong line.
        errline = text.split('\n')[line - 1].rstrip('\r')
        raise LexerError((line, pos + 1), errline)

    # The parameter keeps its historical name `str` (shadowing the builtin)
    # so that existing keyword callers of the tokenizer keep working.
    def f(str):
        length = len(str)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(compiled, str, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f

# This is an example of a token spec. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
#   [1]: http://ostermiller.org/findcomment.html
#
# Each entry is `(token type, arguments for re.compile)`: a pattern string,
# optionally followed by regex flags. A tokenizer built by make_tokenizer()
# tries the entries from top to bottom and takes the first one that matches,
# so the order below is significant (e.g. REAL has to come before INT, and
# NL before SPACE, to ever be produced).
#
# NOTE(review): re.MULTILINE only changes the meaning of `^` and `$`, which
# the COMMENT patterns do not use; they cross line breaks through the
# explicit `[\r\n]` alternative instead.
_example_token_specs = [
    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
    ('COMMENT', (r'//.*',)),
    ('NL', (r'[\r\n]+',)),
    ('SPACE', (r'[ \t\r\n]+',)),
    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
    ('INT', (r'[0-9]+',)),
    ('INT', (r'\$[0-9A-Fa-f]+',)),
    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
    ('STRING', (r"'([^']|(''))*'",)),
    ('CHAR', (r'#[0-9]+',)),
    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
# Example usage (commented out):
#tokenize = make_tokenizer(_example_token_specs)
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`# -*- coding: utf-8 -*-`

			`# Copyright (c) 2008/2013 Andrey Vlasovskikh`
			`#`
			`# Permission is hereby granted, free of charge, to any person obtaining`
			`# a copy of this software and associated documentation files (the`
			`# "Software"), to deal in the Software without restriction, including`
			`# without limitation the rights to use, copy, modify, merge, publish,`
			`# distribute, sublicense, and/or sell copies of the Software, and to`
			`# permit persons to whom the Software is furnished to do so, subject to`
			`# the following conditions:`
			`#`
			`# The above copyright notice and this permission notice shall be included`
			`# in all copies or substantial portions of the Software.`
			`#`
			`# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY`
			`# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,`
			`# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE`
			`# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`

			`__all__ = ['make_tokenizer', 'Token', 'LexerError']`

			`import re`


			`class LexerError(Exception):`
			`def __init__(self, place, msg):`
			`self.place = place`
			`self.msg = msg`

			`def __str__(self):`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`s = 'cannot tokenize data'`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`line, pos = self.place`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00

			`class Token(object):`
			`def __init__(self, type, value, start=None, end=None):`
			`self.type = type`
			`self.value = value`
			`self.start = start`
			`self.end = end`

			`def __repr__(self):`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`return 'Token(%r, %r)' % (self.type, self.value)`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00
			`def __eq__(self, other):`
			`# FIXME: Case sensitivity is assumed here`
			`return self.type == other.type and self.value == other.value`

			`def _pos_str(self):`
			`if self.start is None or self.end is None:`
			`return ''`
			`else:`
			`sl, sp = self.start`
			`el, ep = self.end`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`return '%d,%d-%d,%d:' % (sl, sp, el, ep)`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00
			`def __str__(self):`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`return s.strip()`

			`@property`
			`def name(self):`
			`return self.value`

			`def pformat(self):`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`return "%s %s '%s'" % (self._pos_str().ljust(20),`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`self.type.ljust(14),`
			`self.value)`


			`def make_tokenizer(specs):`
			`"""[(str, (str, int?))] -> (str -> Iterable(Token))"""`

			`def compile_spec(spec):`
			`name, args = spec`
			`return name, re.compile(*args)`

			`compiled = [compile_spec(s) for s in specs]`

			`def match_specs(specs, str, i, position):`
			`line, pos = position`
			`for type, regexp in specs:`
			`m = regexp.match(str, i)`
			`if m is not None:`
			`value = m.group()`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`nls = value.count('\n')`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`n_line = line + nls`
			`if nls == 0:`
			`n_pos = pos + len(value)`
			`else:`
Fixing unicode strings in funcparserlib - Required to work with Cygwin Python3 2014-09-16 00:32:07 +00:00			`n_pos = len(value) - value.rfind('\n') - 1`
Initial source dump. - Not quite complete. - Most of the parser is done (excluding analog) for 0.3 of the KLL spec - Transformation and correlation isn't complete yet. - Backend generation for Kiibohd capabilties is complete. 2014-09-02 17:03:50 +00:00			`return Token(type, value, (line, pos + 1), (n_line, n_pos))`
			`else:`
			`errline = str.splitlines()[line - 1]`
			`raise LexerError((line, pos + 1), errline)`

			`def f(str):`
			`length = len(str)`
			`line, pos = 1, 0`
			`i = 0`
			`while i < length:`
			`t = match_specs(compiled, str, i, (line, pos))`
			`yield t`
			`line, pos = t.end`
			`i += len(t.value)`

			`return f`

			`# This is an example of a token spec. See also [this article][1] for a`
			# discussion of searching for multiline comments using regexps (including `*?`).
			`#`
			`# [1]: http://ostermiller.org/findcomment.html`
			`_example_token_specs = [`
			`('COMMENT', (r'\(\*(.\|[\r\n])*?\*\)', re.MULTILINE)),`
			`('COMMENT', (r'\{(.\|[\r\n])*?\}', re.MULTILINE)),`
			`('COMMENT', (r'//.*',)),`
			`('NL', (r'[\r\n]+',)),`
			`('SPACE', (r'[ \t\r\n]+',)),`
			`('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),`
			`('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),`
			`('INT', (r'[0-9]+',)),`
			`('INT', (r'\$[0-9A-Fa-f]+',)),`
			`('OP', (r'(\.\.)\|(<>)\|(<=)\|(>=)\|(:=)\|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),`
			`('STRING', (r"'([^']\|(''))*'",)),`
			`('CHAR', (r'#[0-9]+',)),`
			`('CHAR', (r'#\$[0-9A-Fa-f]+',)),`
			`]`
			`#tokenize = make_tokenizer(_example_token_specs)`