## Lexer Implementation

import re

from .utils import Str, classify
from .common import PatternStr, PatternRE, TokenDef
from .exceptions import UnexpectedCharacters, LexError

###{standalone
class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        self = super(Token, cls).__new__(cls, value)
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = None
        self.end_column = None
        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
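
# Note (illustrative, not from the original source): since Token subclasses Str,
# it compares equal to a plain string holding the same text, while two Tokens
# must additionally agree on their type, e.g.:
#
#     Token('NUMBER', '42') == '42'                 # True
#     Token('NUMBER', '42') == Token('INT', '42')   # False, types differ
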
class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
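
# Example (illustrative): positions are 1-based and updated incrementally as
# matched text is fed in:
#
#     lc = LineCounter()
#     lc.feed('ab\nc')                    # lc.line == 2, lc.column == 2, lc.char_pos == 4
#     lc.feed('d', test_newline=False)    # lc.column == 3
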
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state)
                break

        if t:
            t.end_line = line_ctr.line
            t.end_column = line_ctr.column
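
# Note: the generator re-reads self.lexer at the top of every loop iteration, so
# a caller (see ContextualLexer.lex below) can swap in a different lexer between
# yielded tokens without restarting the scan.
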
class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        return t
###}

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback
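
# Example (illustrative, the token names are made up): given a keyword defined as
# the string 'if' and an identifier defined as the regexp \w+, the keyword is
# fully matched by the identifier pattern, so it is attached to the identifier
# token as an UnlessCallback (which retypes matching identifiers back to the
# keyword) and, when its flags are a subset of the regexp's, it is removed from
# the main token list.
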
def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres


def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)
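
# Example (illustrative numbers): with 150 token patterns, a single alternation
# would need 150 named groups, exceeding the 100-group limit noted above. The
# first re.compile raises, so the function retries with max_size=75 and returns
# two compiled regexps, each paired with its group-index -> token-name mapping.
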
def _regexp_has_newline(r):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
    """
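
# Example (illustrative): _regexp_has_newline(r'\d+') is False, while
# _regexp_has_newline(r'(\r?\n)+') and _regexp_has_newline(r'[^"]*') are True
# (escaped newline and negated character class, respectively).
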
class Lexer:
    """Lexer interface

    Method Signatures:
        lex(self, stream) -> Iterator[Token]
        set_parser_state(self, state)        # Optional
    """
    set_parser_state = NotImplemented
    lex = NotImplemented

class TraditionalLexer(Lexer):
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except re.error:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens
        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
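
# Usage sketch (illustrative, not from the original module; the exact TokenDef,
# PatternRE and PatternStr constructor signatures are assumed from .common):
#
#     tokens = [TokenDef('INT', PatternRE(r'[0-9]+')),
#               TokenDef('PLUS', PatternStr('+')),
#               TokenDef('WS', PatternRE(r'[ \t]+'))]
#     lexer = TraditionalLexer(tokens, ignore=['WS'])
#     for tok in lexer.lex('1 + 2'):
#         print(tok.type, tok.value, tok.column)
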
class ContextualLexer(Lexer):
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state], self.parser_state)
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
            l.state = self.parser_state
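
# Note: the parser driving this lexer is expected to call set_parser_state()
# after consuming each token; lex() then picks up the lexer for the new state
# before matching the next token. The root lexer, built from the full token set,
# supplies the global newline and ignore types passed to _Lex.lex.
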