This repo contains code to mirror other repos. It also contains the code being mirrored.

219 lines
7.5 KiB

## Lexer Implementation

import re

from .utils import Str, classify, STRING_TYPE
from .common import is_terminal, PatternStr, PatternRE, TokenDef
class LexError(Exception):
    pass

class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)

        super(LexError, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        inst = cls(type_, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags
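# Tokens whose regexp can match a newline are tracked separately (see
# _regexp_has_newline below), so that lex() can keep its line and column
# counters correct when such a token is matched.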
def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
def _create_unless_callback(strs):
    def unless_callback(t):
        if t in strs:
            t.type = strs[t]
        return t
    return unless_callback
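# _create_unless (below) handles literal-string tokens that are also fully matched
# by some regexp token (e.g. a keyword that a general NAME pattern would swallow).
# Such literals are dropped from the token list, and the regexp token instead gets
# a callback that re-types an exact match back to the literal's token name.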
def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.value, s)
            if m and m.group(0) == s:
                embedded_strs.add(strtok.name)
                unless[s] = strtok.name
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs]
    return tokens, callback
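# Lexer compiles all token patterns into one big alternation regexp (split into
# several if Python's group limit is hit) and scans the stream by matching it
# repeatedly at the current position.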
class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))

        token_names = {t.name for t in tokens}
        for t in ignore:
            if t not in token_names:
                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        tokens.sort(key=lambda x: (x.pattern.priority, len(x.pattern.value)), reverse=True)

        self.tokens = tokens

        self.mres = self._build_mres(tokens, len(tokens))
    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size//2)

            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        if t.type in self.callback:
                            t = self.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
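# ContextualLexer builds one Lexer per parser state, restricted to the tokens that
# state can accept (plus `ignore` and `always_accept`). The parser is expected to
# keep set_parser_state() up to date while it consumes tokens, so lexing decisions
# can depend on the parsing context.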
class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)  # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
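
For orientation, here is a minimal usage sketch of the standalone Lexer. It is hypothetical: the constructors of TokenDef, PatternRE and PatternStr are assumed for illustration, since their actual definitions live in .common and are not shown in this file.

# Hypothetical sketch -- assumes TokenDef(name, pattern) and PatternRE/PatternStr(value)
# can be constructed like this; see .common for the real definitions.
tokens = [
    TokenDef('NUMBER', PatternRE(r'\d+')),
    TokenDef('PLUS',   PatternStr('+')),
    TokenDef('WS',     PatternRE(r'[ \t]+')),
]

lexer = Lexer(tokens, ignore=['WS'])
for tok in lexer.lex('1 + 22'):
    print(tok.type, repr(tok.value))
# Roughly: NUMBER '1', then PLUS '+', then NUMBER '22'; WS matches are consumed but never yielded.

A ContextualLexer is driven the same way, except that the parser must call set_parser_state() with its current state while it consumes tokens from lex().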