123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409 |
- # -*- coding: utf-8 -*-
-
- # Copyright (c) 2008/2013 Andrey Vlasovskikh
- # Small Python 3 modifications by Jacob Alexander 2014
- #
- # Permission is hereby granted, free of charge, to any person obtaining
- # a copy of this software and associated documentation files (the
- # "Software"), to deal in the Software without restriction, including
- # without limitation the rights to use, copy, modify, merge, publish,
- # distribute, sublicense, and/or sell copies of the Software, and to
- # permit persons to whom the Software is furnished to do so, subject to
- # the following conditions:
- #
- # The above copyright notice and this permission notice shall be included
- # in all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- """A recursive descent parser library based on functional combinators.
-
- Basic combinators are taken from Harrison's book ["Introduction to Functional
- Programming"][1] and translated from ML into Python. See also [a Russian
- translation of the book][2].
-
- [1]: http://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/
- [2]: http://code.google.com/p/funprog-ru/
-
- A parser `p` is represented by a function of type:
-
- p :: Sequence(a), State -> (b, State)
-
- that takes as its input a sequence of tokens of arbitrary type `a` and a
- current parsing state and returns a pair of a parsed token of arbitrary type
- `b` and the new parsing state.
-
- The parsing state includes the current position in the sequence being parsed and
- the position of the rightmost token that has been consumed while parsing.
-
- Parser functions are wrapped into an object of the class `Parser`. This class
- implements custom operators `+` for sequential composition of parsers, `|` for
- choice composition, `>>` for transforming the result of parsing. The method
- `Parser.parse` provides an easier way for invoking a parser hiding details
- related to a parser state:
-
- Parser.parse :: Parser(a, b), Sequence(a) -> b
-
- Although this module is able to deal with sequences of any kind of objects, the
- recommended way of using it is applying a parser to a `Sequence(Token)`.
- `Token` objects are produced by a regexp-based tokenizer defined in
- `funcparserlib.lexer`. By using it this way you get more readable parsing error
- messages (as `Token` objects contain their position in the source file) and good
- separation of lexical and syntactic levels of the grammar. See examples for more
- info.
-
- Debug messages are emitted via a `logging.Logger` object named
- `"funcparserlib"`.
- """
-
# Public names exported by a star-import of this module.
__all__ = [
    'some',
    'a',
    'many',
    'pure',
    'finished',
    'maybe',
    'skip',
    'oneplus',
    'forward_decl',
    'NoParseError',
]
-
- import logging
-
# Global switch for parser tracing.  Parser.define() consults it when a
# parser is wrapped, and the primitive parsers consult it while running.
debug = False

# Every debug message of this module goes through this logger.
log = logging.getLogger('funcparserlib')
-
-
class Parser(object):
    """Wrapper that equips a raw parser function with combinator operators.

    The wrapped function has the type `Sequence(a), State -> (b, State)`.
    Instances can be combined with `+` (sequence), `|` (choice) and `>>`
    (transformation of the parsed value).
    """

    def __init__(self, p):
        """Wraps the parser function (or another parser) p into an object."""
        self.define(p)

    def named(self, name):
        """Sets the name used in the parsing log and returns this parser."""
        self.name = name
        return self

    def define(self, p):
        """Installs p as the parser wrapped by this object.

        p is either a plain function or an object with a `run` attribute
        (e.g. another Parser), in which case that attribute is used. The
        parser name is taken from `p.name` or, failing that, from the
        docstring of p.
        """
        target = getattr(p, 'run', p)
        if debug:
            # Keep the logging run() defined on the class and replace only
            # the implementation it delegates to.
            self._run = target
        else:
            # Shadow run() on the instance so that calls go straight to the
            # wrapped function.
            self.run = target
        self.named(getattr(p, 'name', p.__doc__))

    def run(self, tokens, s):
        """Sequence(a), State -> (b, State)

        Logs the parser name in debug mode and delegates to the wrapped
        parser.
        """
        if debug:
            message = 'trying %s' % self.name
            log.debug(message)
        return self._run(tokens, s)

    def _run(self, tokens, s):
        # Placeholder that is replaced by define() in debug mode.
        raise NotImplementedError('you must define() a parser')

    def parse(self, tokens):
        """Sequence(a) -> b

        Runs the parser from a fresh State and returns only the parsed value.

        On failure the NoParseError is re-raised with the token at the
        rightmost reached position (or '<EOF>' when that position is past the
        end of the input) appended to the message.
        """
        try:
            tree, _ = self.run(tokens, State())
        except NoParseError as e:
            rightmost = e.state.max
            if rightmost < len(tokens):
                tok = tokens[rightmost]
            else:
                tok = '<EOF>'
            raise NoParseError('%s: %s' % (e.msg, tok), e.state)
        return tree

    def __add__(self, other):
        """Parser(a, b), Parser(a, c) -> Parser(a, _Tuple(b, c))

        A sequential composition of parsers: runs this parser, then other
        from the state this parser ended in.

        NOTE: The result type is not always the one specified. Values wrapped
        in _Ignored are dropped, and a left value that is already a _Tuple is
        extended instead of nested. See also skip and >>.
        """

        def merge(left, right):
            kept = [v for v in (left, right) if not isinstance(v, _Ignored)]
            if not kept:
                # Both sides were ignored, so the pair is ignored as well.
                return _Ignored(())
            if len(kept) == 1:
                return kept[0]
            if isinstance(left, _Tuple):
                # Flatten chains such as a + b + c into a single tuple.
                return _Tuple(left + (right,))
            return _Tuple(kept)

        @Parser
        def _add(tokens, s):
            first, after_first = self.run(tokens, s)
            second, after_second = other.run(tokens, after_first)
            return merge(first, second), after_second

        # or in terms of bind and pure:
        # _add = self.bind(lambda x: other.bind(lambda y: pure(merge(x, y))))
        _add.name = '(%s , %s)' % (self.name, other.name)
        return _add

    def __or__(self, other):
        """Parser(a, b), Parser(a, c) -> Parser(a, b or c)

        A choice composition of two parsers: other is tried only when this
        parser raises NoParseError.

        NOTE: The exact result type is not given here; a statically typed
        language would use something like Either b c. See also +.
        """

        @Parser
        def _or(tokens, s):
            try:
                result = self.run(tokens, s)
            except NoParseError as e:
                # Restart from the original position but remember how far
                # the failed alternative got, for error reporting.
                retry_state = State(s.pos, e.state.max)
                return other.run(tokens, retry_state)
            return result

        _or.name = '(%s | %s)' % (self.name, other.name)
        return _or

    def __rshift__(self, f):
        """Parser(a, b), (b -> c) -> Parser(a, c)

        Returns a parser that applies f to the value parsed by this parser.
        It is useful for turning parsed values into nodes of a parse tree or
        an AST.
        """

        @Parser
        def _shift(tokens, s):
            value, rest = self.run(tokens, s)
            return f(value), rest

        # or in terms of bind and pure:
        # _shift = self.bind(lambda x: pure(f(x)))
        _shift.name = '(%s)' % (self.name,)
        return _shift

    def bind(self, f):
        """Parser(a, b), (b -> Parser(a, c)) -> Parser(a, c)

        NOTE: A monadic bind function. The parser returned by f for the
        parsed value is run from the state this parser ended in. Functions
        bind and pure make the Parser a Monad.
        """

        @Parser
        def _bind(tokens, s):
            value, rest = self.run(tokens, s)
            next_parser = f(value)
            return next_parser.run(tokens, rest)

        _bind.name = '(%s >>=)' % (self.name,)
        return _bind
-
-
class State(object):
    """A parsing state that is maintained basically for error reporting.

    Attributes:
        pos: the current position in the sequence being parsed.
        max: the position of the rightmost token that has been consumed
            while parsing.
    """

    def __init__(self, pos=0, max=0):
        self.pos = pos
        self.max = max

    def __str__(self):
        # str() exists on both Python 2 and 3; the former unicode() call
        # raised NameError on Python 3.
        return str((self.pos, self.max))

    def __repr__(self):
        return 'State(%r, %r)' % (self.pos, self.max)
-
-
class NoParseError(Exception):
    """Raised by a parser that cannot parse the input at the given state.

    Attributes:
        msg: a human-readable description of the failure.
        state: the parsing state at the point of failure (may be None).
    """

    def __init__(self, msg='', state=None):
        # Initialise the base class so that args and repr() carry the
        # failure details instead of being empty.
        super(NoParseError, self).__init__(msg, state)
        self.msg = msg
        self.state = state

    def __str__(self):
        return self.msg
-
-
- class _Tuple(tuple):
- pass
-
-
- class _Ignored(object):
- def __init__(self, value):
- self.value = value
-
- def __repr__(self):
- return '_Ignored(%s)' % repr(self.value)
-
-
@Parser
def finished(tokens, s):
    """Parser(a, None)

    Succeeds with None, leaving the state untouched, when the current
    position is at or past the end of the input. Raises NoParseError if any
    tokens are left unparsed.
    """
    if s.pos < len(tokens):
        raise NoParseError('should have reached <EOF>', s)
    return None, s


# Replace the docstring-derived default name with a short one.
finished.name = 'finished'
-
-
def many(p):
    """Parser(a, b) -> Parser(a, [b])

    Returns a parser that applies the parser p repeatedly for as long as it
    succeeds and returns the list of the parsed values (possibly empty).

    NOTE: The loop ends only when p raises NoParseError, so p has to fail
    eventually for the resulting parser to terminate.
    """

    @Parser
    def _many(tokens, s):
        # A loop instead of recursion, so that long inputs cannot overflow
        # the stack.
        values = []
        state = s
        try:
            while True:
                value, state = p.run(tokens, state)
                values.append(value)
        except NoParseError as e:
            # Resume after the last success, keeping the rightmost position
            # reached by the failed attempt.
            return values, State(state.pos, e.state.max)

    _many.name = '{ %s }' % p.name
    return _many
-
-
def some(pred):
    """(a -> bool) -> Parser(a, a)

    Returns a parser that consumes and returns the current token if it
    satisfies the predicate pred. The parser raises NoParseError when the
    input is exhausted or the token is rejected by pred.
    """

    @Parser
    def _some(tokens, s):
        if s.pos >= len(tokens):
            raise NoParseError('no tokens left in the stream', s)

        token = tokens[s.pos]
        if not pred(token):
            if debug:
                log.debug('failed "%s", state = %s' % (token, s))
            raise NoParseError('got unexpected token', s)

        next_pos = s.pos + 1
        advanced = State(next_pos, max(next_pos, s.max))
        if debug:
            log.debug('*matched* "%s", new state = %s' % (token, advanced))
        return token, advanced

    _some.name = '(some)'
    return _some
-
-
def a(value):
    """Eq(a) -> Parser(a, a)

    Returns a parser that parses a token that is equal to value. The parser
    name shows `value.name` when value has such an attribute and value
    itself otherwise.
    """
    label = getattr(value, 'name', value)

    def _equals(token):
        return token == value

    parser = some(_equals)
    return parser.named('(a "%s")' % (label,))
-
-
def pure(x):
    """b -> Parser(a, b)

    Returns a parser that consumes no tokens and always succeeds with the
    value x, leaving the parsing state unchanged.
    """

    @Parser
    def _pure(_tokens, s):
        return x, s

    return _pure.named('(pure %r)' % (x,))
-
-
def maybe(p):
    """Parser(a, b) -> Parser(a, b or None)

    Returns a parser that returns None if parsing with p fails.

    NOTE: In a statically typed language, the type Maybe b could be more
    appropriate.
    """
    optional = p | pure(None)
    return optional.named('[ %s ]' % (p.name,))
-
-
def skip(p):
    """Parser(a, b) -> Parser(a, _Ignored(b))

    Returns a parser whose result is wrapped in _Ignored, so that the
    combinator + leaves it out. It is useful for throwing away elements of
    concrete syntax (e. g. ",", ";").
    """
    ignoring = p >> _Ignored
    return ignoring
-
-
def oneplus(p):
    """Parser(a, b) -> Parser(a, [b])

    Returns a parser that applies the parser p one or more times and returns
    the list of the parsed values. It raises NoParseError if p fails on its
    first application.
    """
    rest_parser = many(p)

    @Parser
    def _oneplus(tokens, s):
        # Run p and many(p) directly instead of composing them with +: the
        # + combinator flattens _Tuple values and drops _Ignored ones, which
        # would corrupt the list whenever p itself yields such values.
        (first, s2) = p.run(tokens, s)
        (rest, s3) = rest_parser.run(tokens, s2)
        return [first] + rest, s3

    _oneplus.name = '(%s , { %s })' % (p.name, p.name)
    return _oneplus
-
-
def with_forward_decls(suspension):
    """(None -> Parser(a, b)) -> Parser(a, b)

    Returns a parser that obtains the real parser by calling suspension every
    time it is run. It is needed when some parsers contain forward references
    to parsers defined later and such references are cyclic. See examples for
    more details.
    """

    # NOTE: The inner function has no docstring on purpose: Parser would use
    # it as the parser name.
    def _lazy(tokens, s):
        parser = suspension()
        return parser.run(tokens, s)

    return Parser(_lazy)
-
-
def forward_decl():
    """None -> Parser(?, ?)

    Returns a placeholder parser to be used as a forward declaration. Running
    it raises NotImplementedError until it is given a real parser with
    define(), which can be done once all the parsers it depends on are
    available.
    """

    # NOTE: The inner function has no docstring on purpose: Parser would use
    # it as the parser name.
    def _undefined(tokens, s):
        raise NotImplementedError('you must define() a forward_decl somewhere')

    return Parser(_undefined)
-
-
# Run the doctests of this module when it is executed as a script.
if '__main__' == __name__:
    import doctest

    doctest.testmod()
|