KLL Compiler

lexer.py 4.3KB

# -*- coding: utf-8 -*-

# Copyright (c) 2008/2013 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

__all__ = ['make_tokenizer', 'Token', 'LexerError']

import re


class LexerError(Exception):
    def __init__(self, place, msg):
        self.place = place
        self.msg = msg

    def __str__(self):
        s = 'cannot tokenize data'
        line, pos = self.place
        return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)


class Token(object):
    def __init__(self, type, value, start=None, end=None):
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __eq__(self, other):
        # FIXME: Case sensitivity is assumed here
        return self.type == other.type and self.value == other.value

    def _pos_str(self):
        if self.start is None or self.end is None:
            return ''
        else:
            sl, sp = self.start
            el, ep = self.end
            return '%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    @property
    def name(self):
        return self.value

    def pformat(self):
        return "%s %s '%s'" % (self._pos_str().ljust(20),
                               self.type.ljust(14),
                               self.value)


def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))"""

    def compile_spec(spec):
        name, args = spec
        return name, re.compile(*args)

    compiled = [compile_spec(s) for s in specs]

    def match_specs(specs, str, i, position):
        line, pos = position
        for type, regexp in specs:
            m = regexp.match(str, i)
            if m is not None:
                value = m.group()
                nls = value.count('\n')
                n_line = line + nls
                if nls == 0:
                    n_pos = pos + len(value)
                else:
                    n_pos = len(value) - value.rfind('\n') - 1
                return Token(type, value, (line, pos + 1), (n_line, n_pos))
        else:
            errline = str.splitlines()[line - 1]
            raise LexerError((line, pos + 1), errline)

    def f(str):
        length = len(str)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(compiled, str, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f


# This is an example of a token spec. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
    ('COMMENT', (r'//.*',)),
    ('NL', (r'[\r\n]+',)),
    ('SPACE', (r'[ \t\r\n]+',)),
    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
    ('INT', (r'[0-9]+',)),
    ('INT', (r'\$[0-9A-Fa-f]+',)),
    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
    ('STRING', (r"'([^']|(''))*'",)),
    ('CHAR', (r'#[0-9]+',)),
    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
#tokenize = make_tokenizer(_example_token_specs)
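
The commented-out final line hints at the intended usage: build a tokenizer from a spec list, then iterate over the Token stream it yields. The following is a rough sketch, not part of lexer.py; the sample input string and the set of token types to skip are illustrative assumptions.

    if __name__ == '__main__':
        # Build a tokenizer from the example Pascal-like spec above.
        tokenize = make_tokenizer(_example_token_specs)
        source = "answer := 42; (* a Pascal-style assignment *)"
        # A parser typically ignores layout and comments, so filter them here.
        skip = ('SPACE', 'NL', 'COMMENT')
        for token in tokenize(source):
            if token.type not in skip:
                print(token.pformat())

Dropping SPACE, NL, and COMMENT tokens before handing the stream to a parser is a common pattern with a lexer like this; the (line, column) start/end positions stored on each Token exist so that later error reporting can point back into the source, just as LexerError does when no spec matches.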