- # -*- coding: utf-8 -*-
-
- # Copyright (c) 2008/2013 Andrey Vlasovskikh
- #
- # Permission is hereby granted, free of charge, to any person obtaining
- # a copy of this software and associated documentation files (the
- # "Software"), to deal in the Software without restriction, including
- # without limitation the rights to use, copy, modify, merge, publish,
- # distribute, sublicense, and/or sell copies of the Software, and to
- # permit persons to whom the Software is furnished to do so, subject to
- # the following conditions:
- #
- # The above copyright notice and this permission notice shall be included
- # in all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- __all__ = ['make_tokenizer', 'Token', 'LexerError']
-
- import re
-
-
class LexerError(Exception):
    """Raised when no token specification matches the input.

    place -- a (line, position) pair pointing at the offending character
    msg   -- the source line on which tokenization failed
    """

    def __init__(self, place, msg):
        self.place = place
        self.msg = msg

    def __str__(self):
        line, pos = self.place
        return 'cannot tokenize data: %d,%d: "%s"' % (line, pos, self.msg)
-
-
class Token(object):
    """A token produced by the tokenizer.

    Attributes:
        type  -- the token type name, e.g. 'NAME' or 'OP'
        value -- the matched source text
        start -- optional (line, position) pair where the token starts
        end   -- optional (line, position) pair where the token ends

    Positions do not take part in equality: two tokens are equal when
    their type and value match, regardless of where they occurred.
    """

    def __init__(self, type, value, start=None, end=None):
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __eq__(self, other):
        # Bug fix: comparing against a non-Token (e.g. `token == None`)
        # used to raise AttributeError on `other.type`.  Returning
        # NotImplemented lets Python fall back to its default
        # (identity-based) comparison, which yields False.
        if not isinstance(other, Token):
            return NotImplemented
        # FIXME: Case sensitivity is assumed here
        return self.type == other.type and self.value == other.value

    def _pos_str(self):
        # Return 'sl,sp-el,ep:' when positions are known, '' otherwise.
        if self.start is None or self.end is None:
            return ''
        else:
            sl, sp = self.start
            el, ep = self.end
            return '%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        # strip() removes the leading space left when _pos_str() is empty.
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    @property
    def name(self):
        # Alias for the matched text.
        return self.value

    def pformat(self):
        """Return a fixed-width, column-aligned representation."""
        return "%s %s '%s'" % (self._pos_str().ljust(20),
                               self.type.ljust(14),
                               self.value)
-
-
def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))

    Compile a list of token specifications into a tokenizer function.
    Each spec is a pair of a token type name and the argument tuple for
    re.compile: (pattern,) or (pattern, flags).  Specs are tried in
    order and the first one that matches wins.
    """
    compiled = [(name, re.compile(*args)) for name, args in specs]

    def next_token(s, i, line, pos):
        # Match a single token at index i; (line, pos) is the current
        # 1-based line and 0-based column used for position reporting.
        for name, regexp in compiled:
            m = regexp.match(s, i)
            if m is None:
                continue
            value = m.group()
            newlines = value.count('\n')
            end_line = line + newlines
            if newlines:
                # Column restarts after the last newline in the token.
                end_pos = len(value) - value.rfind('\n') - 1
            else:
                end_pos = pos + len(value)
            return Token(name, value, (line, pos + 1), (end_line, end_pos))
        # Nothing matched: report the offending source line.
        errline = s.splitlines()[line - 1]
        raise LexerError((line, pos + 1), errline)

    def tokenize(s):
        line, pos, i = 1, 0, 0
        while i < len(s):
            t = next_token(s, i, line, pos)
            yield t
            line, pos = t.end
            i += len(t.value)

    return tokenize
-
# An example token specification for a Pascal-like language.  See also
# [this article][1] on matching multiline comments with regexps
# (including the non-greedy `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    # Comments: (* ... *), { ... }, and // to end of line
    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
    ('COMMENT', (r'//.*',)),
    # Whitespace
    ('NL', (r'[\r\n]+',)),
    ('SPACE', (r'[ \t\r\n]+',)),
    # Identifiers and literals
    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
    ('INT', (r'[0-9]+',)),
    ('INT', (r'\$[0-9A-Fa-f]+',)),  # hexadecimal, e.g. $FF
    # Operators and punctuation (multi-char forms listed first)
    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
    ('STRING', (r"'([^']|(''))*'",)),  # '' escapes a quote inside a string
    ('CHAR', (r'#[0-9]+',)),
    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
#tokenize = make_tokenizer(_example_token_specs)
|