This repo contains the code used to mirror other repos, as well as the code being mirrored. The mirrored file reproduced below is a lexer implementation.

## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal


class LexError(Exception):
    pass


class TokenDef(object):
    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.value)


class TokenDef__Str(TokenDef):
    def to_regexp(self):
        return re.escape(self.value)

    priority = 0


class TokenDef__Regexp(TokenDef):
    def to_regexp(self):
        return self.value

    priority = 1


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context


class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        inst = cls(type_, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)


class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)


def _create_unless_callback(strs):
    def unless_callback(t):
        if t in strs:
            t.type = strs[t]
        return t
    return unless_callback


def _create_unless(tokens):
    # String tokens that are fully matched by a regexp token don't get a pattern of
    # their own; instead, the regexp token gets a callback that re-types them.
    tokens_by_type = classify(tokens, type)
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(TokenDef__Regexp, []):
        unless = {}
        for strtok in tokens_by_type.get(TokenDef__Str, []):
            m = re.match(retok.value, strtok.value)
            if m and m.group(0) == strtok.value:
                embedded_strs.add(strtok.name)
                unless[strtok.value] = strtok.name
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs]
    return tokens, callback
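
To make the return value of _create_unless concrete, here is a small illustrative sketch. The NAME/IF token names and patterns are invented for the example; only the classes defined above are assumed:

# Illustrative sketch, not part of the module:
name = TokenDef__Regexp('NAME', r'[a-z]+')
kw_if = TokenDef__Str('IF', 'if')          # fully matched by NAME's regexp
tokens, callback = _create_unless([name, kw_if])
# tokens   -> [TokenDef__Regexp('NAME', '[a-z]+')]  (IF keeps no pattern of its own)
# callback -> {'NAME': <unless_callback>}; the callback re-types a Token whose
#             text is 'if' from NAME to IF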

class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens)

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.to_regexp())
            except:
                raise LexError("Cannot compile token: %s: %s" % (t.name, t.value))

        token_names = {t.name for t in tokens}
        assert all(t in token_names for t in ignore)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        # Regexps (priority 1) before plain strings; longer values first within the same priority
        tokens.sort(key=lambda x: (x.priority, len(x.value)), reverse=True)

        self.tokens = tokens

        self.mres = self._build_mres(tokens, len(tokens))

    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.to_regexp()) for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size//2)

            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callback:
                            t = self.callback[t.type](t)
                        yield t
                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No pattern matched at lex_pos
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
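
For orientation, a minimal usage sketch of the Lexer above. The token names and patterns (NUMBER, NAME, IF, WS) are made up for illustration; only the TokenDef and Lexer interfaces defined in this file are assumed:

# Illustrative sketch, not part of the module:
tokens = [
    TokenDef__Regexp('NUMBER', r'[0-9]+'),
    TokenDef__Regexp('NAME', r'[a-z]+'),
    TokenDef__Str('IF', 'if'),            # folded into NAME's callback by _create_unless
    TokenDef__Regexp('WS', r'[ \t]+'),
]
lexer = Lexer(tokens, ignore=['WS'])
for tok in lexer.lex('if x 42'):
    print(tok.type, repr(tok.value))
# Expected output (WS is skipped, 'if' is re-typed from NAME to IF):
# IF 'if'
# NAME 'x'
# NUMBER '42'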

class ContextualLexer:
    # Builds one Lexer per parser state, so each state only matches the tokens it
    # can actually accept. The parser must keep parser_state updated while lexing.
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)  # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
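
Finally, a hypothetical sketch of how ContextualLexer is meant to be driven. The states dict below stands in for a parser's state-to-accepted-terminals table, and the sketch assumes that is_terminal (imported from .common) treats all-uppercase names like these as terminals; both are assumptions made for the example, not guarantees from this file.

# Illustrative sketch, not part of the module:
tokens = [
    TokenDef__Regexp('NAME', r'[a-z]+'),
    TokenDef__Regexp('NUMBER', r'[0-9]+'),
    TokenDef__Regexp('WS', r'[ \t]+'),
]
states = {0: ['NAME'], 1: ['NUMBER']}     # hypothetical parser-state table
lexer = ContextualLexer(tokens, states, ignore=['WS'])
lexer.set_parser_state(0)                 # "Needs to be set on the outside"
stream = lexer.lex('x 42')
print(next(stream))                       # Token(NAME, 'x')
lexer.set_parser_state(1)                 # the parser advanced, so the next token
print(next(stream))                       # comes from the state-1 sub-lexer: Token(NUMBER, '42')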