2014-09-02 17:03:50 +00:00
# -*- coding: utf-8 -*-
# Copyright (c) 2008/2013 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Public API of this module. The entries must be the exact attribute names,
# otherwise a star-import of the module fails with AttributeError.
__all__ = ['make_tokenizer', 'Token', 'LexerError']
import re
class LexerError(Exception):
    """Error raised when the input cannot be tokenized.

    Attributes:
        place: ``(line, pos)`` pair locating the failure.
        msg: text quoted at the end of the formatted message.
    """

    def __init__(self, place, msg):
        # Forward the arguments to Exception so that ``args``, ``repr()``,
        # copying and pickling all see them; without this the exception is
        # rebuilt with no arguments and fails.
        super(LexerError, self).__init__(place, msg)
        self.place = place
        self.msg = msg

    def __str__(self):
        """Return ``cannot tokenize data: LINE,POS: "MSG"``."""
        s = 'cannot tokenize data'
        line, pos = self.place
        return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)
class Token(object):
    """A lexical token: a type tag, the matched text and optional positions.

    Attributes:
        type: name of the token kind.
        value: the token text.
        start: ``(line, pos)`` pair for the token start, or None.
        end: ``(line, pos)`` pair for the token end, or None.

    Equality and hashing consider only ``type`` and ``value``; positions are
    ignored.
    """

    def __init__(self, type, value, start=None, end=None):
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of failing
        # with AttributeError on a missing ``type``/``value`` attribute.
        if not isinstance(other, Token):
            return NotImplemented
        # FIXME: Case sensitivity is assumed here
        return self.type == other.type and self.value == other.value

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep them in sync.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Keep hashing consistent with the value-based ``__eq__``.
        return hash((self.type, self.value))

    def _pos_str(self):
        """Return ``'SL,SP-EL,EP:'``, or an empty string without positions."""
        if self.start is None or self.end is None:
            return ''
        sl, sp = self.start
        el, ep = self.end
        return '%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        # Drops the leading space left behind when there is no position.
        return s.strip()

    @property
    def name(self):
        """Alias for ``value``."""
        return self.value

    def pformat(self):
        """Return a column-aligned rendering: position, type, quoted value."""
        return "%s %s '%s'" % (self._pos_str().ljust(20),
                               self.type.ljust(14),
                               self.value)
def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))

    Build a tokenizer function from an ordered list of token specs.

    Each spec is a ``(name, args)`` pair where ``args`` is the tuple of
    arguments given to ``re.compile`` (a pattern and, optionally, flags).
    The returned function is a generator: at each position it tries the
    specs in order and yields a ``Token`` for the first one that matches a
    non-empty string. Lines are numbered from 1 and are delimited by line
    feed characters only. A token's ``start`` column is 1-based.

    Iterating the result raises ``LexerError`` when no spec matches at the
    current position; the error carries the position and the offending line.
    """
    compiled = [(name, re.compile(*args)) for name, args in specs]

    def match_specs(specs, text, i, position):
        line, pos = position
        for token_type, regexp in specs:
            m = regexp.match(text, i)
            # An empty match would never advance the cursor and the
            # tokenizer would yield the same token forever, so it is
            # treated as "no match" and the next spec is tried.
            if m is None or m.end() == i:
                continue
            value = m.group()
            nls = value.count('\n')
            if nls == 0:
                n_pos = pos + len(value)
            else:
                # Column restarts after the last line feed in the token.
                n_pos = len(value) - value.rfind('\n') - 1
            return Token(token_type, value,
                         (line, pos + 1), (line + nls, n_pos))
        # Lines are counted by line feed above, so the offending line is
        # located the same way (splitlines() also breaks on CR, FF, etc.
        # and could pick a different line). A trailing CR from CRLF input
        # is dropped from the message.
        errline = text.split('\n')[line - 1].rstrip('\r')
        raise LexerError((line, pos + 1), errline)

    # The parameter keeps its original name so keyword callers still work.
    def f(str):
        length = len(str)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(compiled, str, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f
# This is an example of a token spec. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    # Multiline comments use a lazy `*?` and an explicit [\r\n] alternative
    # to cross line breaks. re.MULTILINE only changes `^` and `$`, which
    # these patterns do not use, so the flag has no effect here.
    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
    ('COMMENT', (r'//.*',)),
    # Order matters: NL is listed before SPACE, which also accepts newlines.
    ('NL', (r'[\r\n]+',)),
    ('SPACE', (r'[ \t\r\n]+',)),
    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    # REAL precedes INT so that "1.5" is not split after the leading digits.
    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
    ('INT', (r'[0-9]+',)),
    ('INT', (r'\$[0-9A-Fa-f]+',)),
    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
    # A doubled quote inside a string stands for a literal quote.
    ('STRING', (r"'([^']|(''))*'",)),
    ('CHAR', (r'#[0-9]+',)),
    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
#tokenize = make_tokenizer(_example_token_specs)