## Lexer Implementation

import re

from .utils import Str, classify, STRING_TYPE
from .common import is_terminal, PatternStr, PatternRE, TokenDef


class LexError(Exception):
    pass


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)

        super(LexError, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context

class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        inst = cls(type_, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
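
# Illustrative note (not executed): Token subclasses Str, which its use here suggests
# is a plain string type, so a token compares equal to its text while also carrying a
# type and stream position; e.g. a hypothetical
#   t = Token('NUMBER', '42', pos_in_stream=0)
# would satisfy t == '42' and t.type == 'NUMBER'. new_borrow_pos builds a new token
# that keeps the line/column/position of an existing one, which is useful when a
# callback rewrites a token's type or value.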

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
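
# Illustrative examples (assuming the raw regexp text is what gets passed in):
#   _regexp_has_newline(r'[0-9]+')    -> False
#   _regexp_has_newline(r'(\r?\n)+')  -> True   (contains a literal '\n' escape)
#   _regexp_has_newline('(?s).*')     -> True   ('(?s)' makes '.' match newlines)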

def _create_unless_callback(strs):
    def unless_callback(t):
        if t in strs:
            t.type = strs[t]
        return t
    return unless_callback

def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.value, s)
            if m and m.group(0) == s:
                embedded_strs.add(strtok.name)
                unless[s] = strtok.name
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs]
    return tokens, callback
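
# Example of the "unless" mechanism (illustrative only, hypothetical token names):
# if NAME has the regexp pattern r'\w+' and IF has the literal pattern 'if', the
# literal is fully matched by NAME's regexp, so IF is removed from the main token
# list and a callback is attached to NAME that retypes a matched 'if' back to IF.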

class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))
        token_names = {t.name for t in tokens}
        assert all(t in token_names for t in ignore)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        # Try higher-priority and longer literal patterns first
        tokens.sort(key=lambda x: (x.pattern.priority, len(x.pattern.value)), reverse=True)

        self.tokens = tokens

        self.mres = self._build_mres(tokens, len(tokens))

    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it's successful.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size//2)

            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
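
    # Background (hedged): on the Python versions this module targeted, re refused
    # patterns with more than 100 named groups and signaled it with AssertionError,
    # so something like
    #   re.compile('|'.join('(?P<T%d>x)' % i for i in range(200)))
    # would fail there (newer Pythons have raised the limit). _build_mres therefore
    # compiles the tokens in chunks of at most max_size patterns, halving the chunk
    # size until compilation succeeds; lex() then tries each compiled regexp in turn.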

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callback:
                            t = self.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # None of the compiled regexps matched at lex_pos
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
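
# Hypothetical usage sketch (not executed; assumes TokenDef/PatternRE/PatternStr
# from .common can be constructed as shown -- their real constructors live outside
# this file):
#   token_defs = [TokenDef('INT', PatternRE('[0-9]+')),
#                 TokenDef('PLUS', PatternStr('+')),
#                 TokenDef('WS', PatternRE('[ \t]+'))]
#   lexer = Lexer(token_defs, ignore=['WS'])
#   for tok in lexer.lex('1 + 2'):
#       print(tok.type, tok.value, tok.pos_in_stream)
#   # roughly: INT '1' 0, PLUS '+' 2, INT '2' 4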

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)  # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # None of the current state's regexps matched at lex_pos
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
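
# Hypothetical usage sketch (not executed): a ContextualLexer is built from the full
# token list plus a mapping of parser states to the token names each state accepts;
# the parser is expected to call set_parser_state() as it advances, so that lex()
# only tries the tokens valid in the current state, e.g.
#   states = {0: ['INT', 'WS'], 1: ['PLUS', 'WS']}
#   clexer = ContextualLexer(token_defs, states, ignore=['WS'])
#   clexer.set_parser_state(0)
#   stream = clexer.lex('1 + 2')   # the caller keeps updating parser_state while iterating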