This repo contains code to mirror other repos. It also contains the code that is getting mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef


class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed


class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
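
# --- Illustrative sketch (addition, not part of the original file). ---
# Token keeps the matched text's str behaviour (assuming Str from .utils is a
# plain str subclass) while carrying type and position metadata, so a token
# compares equal to its raw text but unequal to a token of a different type.
def _example_token_usage():
    t = Token('NAME', 'foo', pos_in_stream=0, line=1, column=0)
    assert t == 'foo'                          # plain string comparison still works
    assert not (t == Token('NUMBER', 'foo'))   # same text, different type -> unequal
    return Token.new_borrow_pos('KEYWORD', 'foo', t)   # copies t's position info
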
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
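
# --- Illustrative sketch (addition, not part of the original file). ---
# _regexp_has_newline() is a heuristic: it flags patterns that can consume a
# newline, either literally, via an escaped \n, or via a DOTALL-mode '.'.
def _example_regexp_has_newline():
    assert _regexp_has_newline(r'(\r?\n)+')     # escaped newline in the source regexp
    assert _regexp_has_newline('(?s).*')        # DOTALL '.' can match a newline
    assert not _regexp_has_newline(r'[ \t]+')   # plain whitespace pattern
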
def _create_unless_callback(strs):
    mres = build_mres(strs, match_whole=True)
    def unless_callback(t):
        # if t in strs:
        #     t.type = strs[t]
        for mre, type_from_index in mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t
    return unless_callback
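
# --- Illustrative sketch (addition, not part of the original file). ---
# The "unless" callback retypes a token when its text is an exact, whole-string
# match for a literal that was swallowed by a broader regexp (e.g. a keyword
# matched by a NAME rule). The stand-in classes below are hypothetical; they
# provide only the attributes that build_mres() reads (.name, .pattern.to_regexp()).
def _example_unless_callback():
    class _StubPattern(object):
        def __init__(self, regexp):
            self.regexp = regexp
        def to_regexp(self):
            return self.regexp

    class _StubTokenDef(object):
        def __init__(self, name, regexp):
            self.name = name
            self.pattern = _StubPattern(regexp)

    retype_keywords = _create_unless_callback([_StubTokenDef('IF', 'if')])
    assert retype_keywords(Token('NAME', 'if')).type == 'IF'      # exact match: retyped
    assert retype_keywords(Token('NAME', 'iffy')).type == 'NAME'  # prefix only: unchanged
    return retype_keywords
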
def _create_unless(tokens):
    # Find literal-string tokens (e.g. keywords) that some regexp token matches
    # in full; drop them as standalone tokens (flagged ones are re-added after
    # the regexps) and register a callback on the regexp token that retypes
    # exact matches back to the literal's name.
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    delayed_strs = []
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                if strtok.pattern.flags:
                    delayed_strs.append(strtok)
                embedded_strs.add(strtok.name)
                unless.append(strtok)
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs] + delayed_strs

    return tokens, callback

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)
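
# --- Illustrative sketch (addition, not part of the original file). ---
# build_mres() joins every token pattern into one big named-group alternation
# (split across several regexps if Python's group limit is hit) and pairs each
# compiled regexp with a {group index: token name} map for fast dispatch.
# The stand-ins are hypothetical and expose only .name and .pattern.to_regexp().
def _example_build_mres():
    class _StubPattern(object):
        def __init__(self, regexp):
            self.regexp = regexp
        def to_regexp(self):
            return self.regexp

    class _StubTokenDef(object):
        def __init__(self, name, regexp):
            self.name = name
            self.pattern = _StubPattern(regexp)

    mres = build_mres([_StubTokenDef('INT', r'[0-9]+'), _StubTokenDef('NAME', r'[a-z]+')])
    mre, type_from_index = mres[0]
    m = mre.match('42x')
    assert m.group(0) == '42' and type_from_index[m.lastindex] == 'INT'
    return mres
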
class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        token_names = {t.name for t in tokens}
        for t in ignore:
            if t not in token_names:
                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        self.tokens = tokens

        self.mres = build_mres(tokens)

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    to_yield = type_ not in ignore_types

                    if to_yield:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        end_col = t.column + len(value)
                        if t.type in self.callback:
                            t = self.callback[t.type](t)

                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            last_newline_index = value.rindex(self.newline_char) + 1
                            col_start_pos = lex_pos + last_newline_index
                            end_col = len(value) - last_newline_index

                    if to_yield:
                        t.end_line = line
                        t.end_col = end_col
                        yield t

                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
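
# --- Usage sketch (addition, not part of the original file). ---
# A minimal end-to-end run of Lexer.lex(). To avoid guessing the real TokenDef
# constructor signature in .common, the stand-in below subclasses TokenDef only
# to satisfy the isinstance() check above, and sets just the attributes this
# module reads (.name, .priority, and a pattern with .value/.flags/.min_width/
# .max_width/.to_regexp()); treat it as a hypothetical sketch.
def _example_lexer_usage():
    class _StubPattern(object):
        def __init__(self, regexp):
            self.value = regexp
            self.flags = ()
            self.min_width = 1
            self.max_width = 1
        def to_regexp(self):
            return self.value

    class _StubTokenDef(TokenDef):
        def __init__(self, name, regexp):
            self.name = name
            self.pattern = _StubPattern(regexp)
            self.priority = 1

    tokens = [_StubTokenDef('NAME', r'[a-z]+'),
              _StubTokenDef('INT', r'[0-9]+'),
              _StubTokenDef('WS', r'[ ]+')]
    lexer = Lexer(tokens, ignore=['WS'])
    return [(t.type, t.value) for t in lexer.lex('abc 42')]
    # Expected: [('NAME', 'abc'), ('INT', '42')]
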
class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)

                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
                break
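
# --- Driving sketch (addition, not part of the original file). ---
# ContextualLexer.lex() looks up self.lexers[self.parser_state] afresh on every
# token, so the parser that owns it is expected to call set_parser_state() as it
# advances, roughly like this (the parser hooks named below are hypothetical):
#
#     stream = contextual_lexer.lex(text)
#     for token in stream:
#         parser.feed(token)                               # hypothetical parser API
#         contextual_lexer.set_parser_state(parser.state)  # keep lexer in sync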