This repository contains the code used to mirror other repositories, as well as the code that is being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone

class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        if allowed:
            message += '\n\nExpecting: %s\n' % allowed

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed

class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column
                    break
            else:
                # None of the compiled regexps matched at the current position
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

class Lexer:
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens

        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$END']
                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
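
Below is a minimal usage sketch of the Lexer class defined above, for orientation only; it is not part of the file. It assumes the file belongs to a package whose lexer and common modules are importable (the package name mypkg below is a placeholder), and it guesses that TokenDef, PatternStr and PatternRE from the sibling common module accept roughly TokenDef(name, pattern) and Pattern(value); those classes are not shown here, so treat the exact constructor calls as assumptions rather than documented API.

# Usage sketch. Assumptions: "mypkg" stands in for the real package name, and the
# TokenDef / PatternRE / PatternStr constructor signatures are guesses based on how
# the lexer code above uses them (.name, .pattern.to_regexp(), .priority, etc.).
from mypkg.lexer import Lexer, UnexpectedInput
from mypkg.common import TokenDef, PatternRE, PatternStr

token_defs = [
    TokenDef('NUMBER', PatternRE(r'[0-9]+')),   # regexp-based terminal
    TokenDef('PLUS', PatternStr('+')),          # literal terminal
    TokenDef('WS', PatternRE(r'[ \t]+')),       # whitespace, ignored below
]

lexer = Lexer(token_defs, ignore=['WS'])

try:
    for tok in lexer.lex('1 + 23'):
        # Each Token is a str subclass carrying .type, .line, .column and .pos_in_stream
        print(tok.type, repr(tok.value), tok.line, tok.column)
except UnexpectedInput as err:
    print('No token matched:', err)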