This repo contains code to mirror other repos, as well as the code being mirrored.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone
class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        if allowed:
            message += '\n\nExpecting: %s\n' % allowed

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed
        self.considered_rules = considered_rules

class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
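
# Illustrative note (not part of the original module): Token subclasses Str, so
# it compares equal to its text while carrying lexer metadata. A minimal sketch
# of that behaviour, with hypothetical token names:
#
#     t = Token('NUMBER', '3.14', pos_in_stream=0, line=1, column=0)
#     assert t == '3.14'            # plain string equality still holds
#     assert t.type == 'NUMBER'     # the token type rides along
#     t2 = Token.new_borrow_pos('FLOAT', '3.14', t)   # reuses t's position info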

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos
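
# Illustrative note (not part of the original module): LineCounter tracks the
# absolute character position plus a 1-based line and a 0-based column. Feeding
# text that contains a newline resets the column relative to the last '\n':
#
#     lc = LineCounter()
#     lc.feed('abc\nde')
#     # lc.char_pos == 6, lc.line == 2, lc.column == 2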

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column

                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break
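
# Note on the loop above: each compiled regex bundle (mre) is tried at the
# current position; ignored token types still advance the line counter (and may
# fire a callback) but are never yielded. The for/else only raises
# UnexpectedInput when no pattern matches before the end of the stream.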

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t

###}

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    tokens = [t for t in tokens if t not in embedded_strs]
    return tokens, callback
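
# Illustrative note (not part of the original module): _create_unless handles
# string terminals that are fully matched by a regex terminal, e.g. a literal
# "if" that is also a valid NAME under /[a-z]+/ (hypothetical names). Instead of
# compiling "if" as a separate pattern, an UnlessCallback is attached to NAME
# that re-checks each NAME match against the embedded literals and rewrites the
# token type when one matches the whole value, so 'if' lexes as the keyword
# while 'iffy' stays a NAME.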

def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
        tokens = tokens[max_size:]
    return mres
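
# Illustrative note (not part of the original module): the chunking above works
# around re's historical limit of 100 named groups per pattern (the
# AssertionError mentioned in the comment). With, say, 300 terminals the first
# compile fails, the recursion retries with max_size 150 and then 75, and the
# terminals end up split across several alternation patterns that _Lex tries in
# order at each position.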

def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

class Lexer:
    def __init__(self, tokens, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in tokens}

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.tokens = tokens

        self.mres = build_mres(tokens)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
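
# Illustrative note (not part of the original module): a Lexer is built from
# TokenDef objects defined in .common; the constructor arguments shown here are
# assumptions, only the .name/.pattern/.priority attributes are used above.
# A rough, hypothetical sketch of how it is driven:
#
#     tokens = [TokenDef('INT', PatternRE(r'[0-9]+')),
#               TokenDef('WS', PatternRE(r'\s+'))]
#     lexer = Lexer(tokens, ignore=['WS'])
#     for tok in lexer.lex('12 34'):
#         print(tok.type, tok.value, tok.line, tok.column)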

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None) # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state])
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
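
# Illustrative note (not part of the original module): ContextualLexer keeps one
# Lexer per parser state, restricted to the terminals that state can accept, so
# textually overlapping terminals only compete where the grammar allows both.
# The parser driving it is expected to call set_parser_state() as it advances;
# the lex() generator re-reads self.parser_state after every yielded token,
# which is why l.lexer is reassigned inside the loop.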