This repo contains the code used to mirror other repos, as well as the code being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import PatternStr, PatternRE, TokenDef

###{standalone

class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        if allowed:
            message += '\n\nExpecting: %s\n' % allowed

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed
        self.considered_rules = considered_rules

class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        self = super(Token, cls).__new__(cls, value)
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
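
# Illustrative sketch (hypothetical token type and values): since Token is a
# Str subclass, a token behaves like its text value while also carrying its
# type and position metadata.
#
#   tok = Token('NUMBER', '42', pos_in_stream=0, line=1, column=0)
#   assert tok == '42' and tok.type == 'NUMBER'
#   moved = Token.new_borrow_pos('INT', '42', tok)   # reuses tok's position info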

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos
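
# Illustrative sketch (hypothetical input): feeding text advances char_pos by
# its length; when the text contains a newline, `line` is incremented and
# `column` is measured from the character after the last '\n'.
#
#   lc = LineCounter()
#   lc.feed('ab\nc')                      # char_pos=4, line=2, column=1
#   lc.feed('xyz', test_newline=False)    # char_pos=7, column=4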

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break
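
# Illustrative sketch (hypothetical token names and patterns): _Lex only needs
# an object exposing `mres` and `callback`, which is what lets it serve both
# Lexer and ContextualLexer. The stand-in below exists purely for illustration.
#
#   class _FakeLexer:
#       mres = [(re.compile('(?P<INT>[0-9]+)|(?P<WS>[ ]+)'), {1: 'INT', 2: 'WS'})]
#       callback = {}
#
#   list(_Lex(_FakeLexer()).lex('12 34', newline_types=[], ignore_types=['WS']))
#   # -> [Token(INT, '12'), Token(INT, '34')]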

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback
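
# Illustrative sketch (the TokenDef/PatternStr/PatternRE constructor calls are
# assumed, not defined in this module): given a literal "if" token and a NAME
# regexp token that also matches "if", _create_unless drops the literal from
# the token list (it is "embedded" in NAME) and registers an UnlessCallback so
# that a NAME match whose whole text is "if" gets re-typed as IF.
#
#   name_tok = TokenDef('NAME', PatternRE('[a-z]+'))
#   if_tok   = TokenDef('IF', PatternStr('if'))
#   tokens, callback = _create_unless([name_tok, if_tok])
#   # tokens == [name_tok]; callback == {'NAME': UnlessCallback(...)}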

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module.
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)
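
# Illustrative sketch (the classes below are hypothetical stand-ins exposing
# only the `name` and `pattern.to_regexp()` interface that build_mres uses):
# every token becomes a named group in one alternation, and the accompanying
# dict maps group indices back to token names.
#
#   class _Pat:
#       def __init__(self, regexp): self._regexp = regexp
#       def to_regexp(self): return self._regexp
#
#   class _Def:
#       def __init__(self, name, regexp): self.name, self.pattern = name, _Pat(regexp)
#
#   build_mres([_Def('INT', '[0-9]+'), _Def('WS', '[ \t]+')])
#   # -> [(<compiled '(?P<INT>[0-9]+)|(?P<WS>[ \t]+)'>, {1: 'INT', 2: 'WS'})]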

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
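
# Illustrative examples (hypothetical regexps): '(\r?\n)+' and '(?s).*' are
# treated as possibly matching a newline, while '[0-9]+' is not, so only the
# first two would end up in Lexer.newline_types below.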

class Lexer:
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens
        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
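
# Illustrative usage sketch (the TokenDef/PatternRE constructor calls are
# assumed from how the objects are used above, not defined in this module):
#
#   defs = [TokenDef('INT', PatternRE('[0-9]+')),
#           TokenDef('WS', PatternRE('[ \t]+'))]
#   lexer = Lexer(defs, ignore=['WS'])
#   [t.type for t in lexer.lex('1 23')]   # -> ['INT', 'INT']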

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n.is_term and n.name != '$END']
                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None) # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
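
# Illustrative sketch (names are hypothetical): the consuming parser is expected
# to call set_parser_state() as it advances, since lex() re-reads parser_state
# after every yielded token and switches to that state's lexer.
#
#   ctx = ContextualLexer(token_defs, parser_states, ignore=['WS'])
#   ctx.set_parser_state(start_state)
#   for tok in ctx.lex(text):
#       ...   # feed tok to the parser, which updates ctx via set_parser_state()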