This repo contains code to mirror other repos, as well as the code being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone

class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        if allowed:
            message += '\n\nExpecting: %s\n' % allowed

        super(UnexpectedInput, self).__init__(message)
        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed
        self.considered_rules = considered_rules

class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__

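# Added note (not in the original source): because Token subclasses Str from
# .utils, which behaves like str here, a token compares equal to its plain text,
# e.g. Token('NUMBER', '12') == '12' is True, while two Tokens with the same text
# but different types compare unequal (see __eq__ above).
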
class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos

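# Added note (not in the original source): a worked example of the bookkeeping above.
#   lc = LineCounter()
#   lc.feed("foo\nbar")
#   (lc.line, lc.column)              # -> (2, 3): one newline seen, 3 chars since it
#   lc.feed("!", test_newline=False)
#   (lc.line, lc.column)              # -> (2, 4)
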
class _Lex:
    "Built to serve both Lexer and ContextualLexer"

    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []  # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback

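# Added note (not in the original source): _create_unless() implements the classic
# keyword-vs-identifier trick. If a literal terminal such as "if" (a PatternStr) is
# fully matched by a regexp terminal such as NAME (a PatternRE over [a-z]+, say) and
# its flags are a subset of NAME's, the literal is dropped from the main token list
# and NAME gets an UnlessCallback (built with match_whole=True, i.e. anchored with
# '$') that re-labels an exact "if" match back to the literal's own terminal type.
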
def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module.
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively retries with fewer and fewer groups until it succeeds.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size // 2, match_whole)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)

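# Added note (not in the original source): build_mres() returns a list of
# (compiled_regex, {group_index: token_name}) pairs. For hypothetical terminals
# NUMBER over [0-9]+ and NAME over [a-z]+, the single entry compiles
# "(?P<NUMBER>[0-9]+)|(?P<NAME>[a-z]+)" together with {1: 'NUMBER', 2: 'NAME'},
# which is how _Lex.lex() turns a match's .lastindex back into a token type.
# The list has more than one entry only when the group limit forces a split.
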
def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

class Lexer:
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens
        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$END']
                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            # The parser advances its state while consuming tokens; pick up the
            # lexer for the (possibly new) state before matching the next token.
            l.lexer = self.lexers[self.parser_state]
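
The following is a minimal usage sketch, not part of the file above, showing how build_mres and _Lex fit together. The FakePattern, FakeTokenDef and FakeLexer stand-ins are hypothetical: they only supply the attributes the code above actually reads (name, pattern.to_regexp(), mres, callback), which avoids depending on the TokenDef/PatternRE constructors from .common. The import path assumes the module is installed as lark.lexer.

from lark.lexer import build_mres, _Lex

class FakePattern:
    # Stand-in for PatternStr/PatternRE: wraps a ready-made regexp string.
    def __init__(self, regexp):
        self.regexp = regexp
    def to_regexp(self):
        return self.regexp

class FakeTokenDef:
    # Stand-in for TokenDef: build_mres() only reads .name and .pattern.
    def __init__(self, name, regexp):
        self.name = name
        self.pattern = FakePattern(regexp)

class FakeLexer:
    # Stand-in lexer: _Lex only reads .mres and .callback.
    def __init__(self, mres):
        self.mres = mres
        self.callback = {}

terminals = [FakeTokenDef('NAME', '[a-z]+'),
             FakeTokenDef('NUMBER', '[0-9]+'),
             FakeTokenDef('EQUAL', '='),
             FakeTokenDef('WS', '[ \n]+')]

stream = "x = 12\ny = 34"
for tok in _Lex(FakeLexer(build_mres(terminals))).lex(stream, newline_types=['WS'], ignore_types=['WS']):
    print(tok.type, str(tok), tok.line, tok.column)

# Expected output (WS matches are skipped but still advance position, line and column):
#   NAME x 1 0
#   EQUAL = 1 2
#   NUMBER 12 1 4
#   NAME y 2 0
#   EQUAL = 2 2
#   NUMBER 34 2 4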