This repository contains the code used to mirror other repositories, as well as the code being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone
class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        if allowed:
            message += '\n\nExpecting: %s\n' % allowed

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed
        self.considered_rules = considered_rules

class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        self = super(Token, cls).__new__(cls, value)
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __reduce__(self):
        # Arguments must be in the order that __new__ expects them
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column

                    break
            else:
                # No pattern matched at the current position
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}
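# Example of the "unless" mechanism implemented below (hypothetical terminals, not
# taken from any particular grammar): given a regexp token NAME matching /\w+/ and a
# string token IF for the literal "if", _create_unless() detects that "if" is fully
# matched by NAME's regexp, registers an UnlessCallback on NAME that re-types a
# matched "if" back to IF, and (when the flags are compatible) drops IF from the
# main matcher so that only NAME's pattern has to be tried.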
def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it succeeds.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

class Lexer:
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens

        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$END']
                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
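
For reference, a minimal usage sketch of the Lexer defined above. The package name "mypackage" is a placeholder for wherever this module lives, and the TokenDef/PatternRE/PatternStr constructor calls are assumptions about the companion .common module (terminal names and patterns are invented), so treat this as an illustration rather than a verbatim recipe:

from mypackage.lexer import Lexer                                # placeholder package name
from mypackage.common import TokenDef, PatternRE, PatternStr     # assumed constructors

tokens = [
    TokenDef('NUMBER', PatternRE(r'[0-9]+')),
    TokenDef('PLUS', PatternStr('+')),
    TokenDef('WS', PatternRE(r'[ \t]+')),
]
lexer = Lexer(tokens, ignore=['WS'])     # WS matches are consumed but never yielded
for tok in lexer.lex('1 + 23'):
    print(tok.type, repr(tok.value), tok.line, tok.column)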