This repo contains the code used to mirror other repos, as well as the code that is being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone

class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed

class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
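# Illustrative note (added; not part of the original file): Token equality checks
# both the token type and the string value, but comparing against a plain string
# falls back to string equality alone:
#   Token('NAME', 'foo') == Token('NAME', 'foo')    # True
#   Token('NAME', 'foo') == Token('STRING', 'foo')  # False -- types differ
#   Token('NAME', 'foo') == 'foo'                   # True  -- plain str, type not checked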

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos
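# Illustrative note (added; not part of the original file): feeding a chunk of
# text advances char_pos by its length and recomputes line/column from the last
# newline seen, e.g.:
#   lc = LineCounter()
#   lc.feed('ab\nc')
#   # lc.char_pos == 4, lc.line == 2, lc.column == 1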

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    line_ctr.feed(value, type_ in newline_types)
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}
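# Note (added; not part of the original file): the ###{standalone ... ###} markers
# above presumably delimit the section that lark's standalone-parser generator
# copies verbatim into its generated output; the code below is not included there.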

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback
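# Illustrative note (added; not part of the original file): _create_unless (above)
# handles string terminals whose value is fully matched by a broader regexp
# terminal. For example, with a NAME terminal /\w+/ and a literal "if", the
# literal is removed from the terminal list and NAME gets an UnlessCallback that
# re-types a lexed 'if' from NAME back to the literal's own terminal.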

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module.
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it succeeds.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size // 2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
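# Illustrative note (added; not part of the original file): build_mres packs all
# terminal patterns into one big alternation of named groups, and _build_mres
# halves the chunk size whenever re.compile hits the group limit, so very large
# grammars simply yield several compiled regexps instead of one.
# _regexp_has_newline is a heuristic, e.g.:
#   _regexp_has_newline(r'\n')    # True  (escaped newline in the pattern)
#   _regexp_has_newline('(?s).')  # True  (dot-matches-all mode)
#   _regexp_has_newline('a+')     # False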

class Lexer:
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        self.tokens = tokens
        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$END']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
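
# Illustrative usage sketch (added; not part of the original file). The exact
# TokenDef/PatternStr/PatternRE constructors live in .common and are assumed here:
#
#   tokens = [
#       TokenDef('NUMBER', PatternRE(r'\d+')),
#       TokenDef('PLUS',   PatternStr('+')),
#       TokenDef('WS',     PatternRE(r'\s+')),
#   ]
#   lexer = Lexer(tokens, ignore=['WS'])
#   for tok in lexer.lex('1 + 2'):
#       print(tok.type, repr(tok.value), tok.line, tok.column)
#
# A ContextualLexer is built similarly, but also takes a mapping of parser states
# to the terminal names acceptable in each state, and set_parser_state() must be
# called from the outside as the parser advances.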