This repo contains the code used to mirror other repos, as well as the code that is being mirrored.

253 lines · 8.7 KiB

```python
## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef


class LexError(Exception):
    pass


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column, allowed=None):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed


class Token(Str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__


class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
```
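`Token` subclasses `Str`, so a token compares and slices like its text while still carrying its type and position. Below is a minimal sketch of the same pattern, using the built-in `str` in place of `Str` (an assumption, since `Str` comes from `.utils` and isn't shown here); `MiniToken` is a made-up name for illustration.

```python
# Sketch only: built-in str stands in for .utils.Str.
class MiniToken(str):
    def __new__(cls, type_, value, line=None, column=None):
        inst = str.__new__(cls, value)   # the token *is* its text
        inst.type = type_
        inst.line = line
        inst.column = column
        return inst

    def __repr__(self):
        return 'MiniToken(%s, %r)' % (self.type, str(self))

t = MiniToken('NUMBER', '42', line=1, column=0)
assert t == '42'            # behaves like its string value
assert t.type == 'NUMBER'   # but still carries lexer metadata
```

The real `Token` additionally overrides `__eq__` so that tokens of different types never compare equal, while keeping `Str.__hash__`.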
```python
def _create_unless_callback(strs):
    mres = build_mres(strs, match_whole=True)
    def unless_callback(t):
        # if t in strs:
        #     t.type = strs[t]
        for mre, type_from_index in mres:
            m = mre.match(t.value)
            if m:
                value = m.group(0)
                t.type = type_from_index[m.lastindex]
                break
        return t
    return unless_callback


def _create_unless(tokens):
    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    delayed_strs = []
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                if strtok.pattern.flags:
                    delayed_strs.append(strtok)
                embedded_strs.add(strtok.name)
                unless.append(strtok)
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs] + delayed_strs

    return tokens, callback
```
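`_create_unless` looks for string-literal tokens that a regex token would also match (the classic keyword-vs-identifier collision) and, instead of keeping them as separate terminals, installs a post-match callback that retypes the token. A rough standalone illustration of that idea, using only the standard `re` module and invented terminal names:

```python
import re

# Invented terminals for the sketch: NAME is a broad regex; the keywords are
# literal strings that NAME would also match, so they are handled after the match.
NAME_RE = re.compile(r'[A-Za-z_]\w*')
KEYWORDS = {'if': 'IF', 'else': 'ELSE', 'while': 'WHILE'}

def lex_name(text, pos=0):
    m = NAME_RE.match(text, pos)
    if not m:
        return None
    value = m.group(0)
    # Same idea as the "unless" callback: retype the token when its text is
    # exactly one of the embedded literal strings.
    type_ = KEYWORDS.get(value, 'NAME')
    return type_, value

assert lex_name('while x') == ('WHILE', 'while')
assert lex_name('whale x') == ('NAME', 'whale')
```

The real callback matches against a combined, whole-string regex built by `build_mres(..., match_whole=True)` rather than a dict lookup.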
```python
def _build_mres(tokens, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(tokens, max_size//2, match_whole)

        mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
        tokens = tokens[max_size:]
    return mres


def build_mres(tokens, match_whole=False):
    return _build_mres(tokens, len(tokens), match_whole)
```
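`build_mres` joins every token pattern into one alternation of named groups; the inverted `groupindex` map then turns `m.lastindex` back into a token name. Because Python's `re` caps the number of groups per pattern (the comment above notes the limit was 100), `_build_mres` retries with smaller batches whenever compilation fails. A small demonstration of the named-group technique by itself, with invented token definitions:

```python
import re

# Invented token definitions, already ordered by priority.
token_defs = [('NUMBER', r'\d+'), ('NAME', r'[A-Za-z_]\w*'), ('PLUS', r'\+')]

# One alternation of named groups, just as _build_mres builds.
mre = re.compile('|'.join('(?P<%s>%s)' % (name, pat) for name, pat in token_defs))

# groupindex maps name -> group number; invert it so m.lastindex -> name.
type_from_index = {i: n for n, i in mre.groupindex.items()}

m = mre.match('foo + 1', 0)
assert type_from_index[m.lastindex] == 'NAME' and m.group(0) == 'foo'
```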
```python
class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

        token_names = {t.name for t in tokens}
        for t in ignore:
            if t not in token_names:
                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        self.tokens = tokens

        self.mres = build_mres(tokens)
```
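Before continuing with `Lexer.lex` below, note the `tokens.sort(...)` call above: Python's `re` alternation is first-match, not longest-match, so ordering terminals by priority and pattern width decides which one wins when several could match at the same position. A tiny illustration with two invented terminals:

```python
import re

# re alternation picks the first alternative that matches, not the longest one,
# which is why the lexer sorts wider/higher-priority patterns to the front.
bad = re.compile('(?P<EQ>=)|(?P<EQEQ>==)')
good = re.compile('(?P<EQEQ>==)|(?P<EQ>=)')

assert bad.match('==').lastgroup == 'EQ'     # '=' wins because it is listed first
assert good.match('==').lastgroup == 'EQEQ'  # longer pattern listed first wins
```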
```python
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    to_yield = type_ not in ignore_types

                    if to_yield:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        end_col = t.column + len(value)
                        if t.type in self.callback:
                            t = self.callback[t.type](t)

                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            last_newline_index = value.rindex(self.newline_char) + 1
                            col_start_pos = lex_pos + last_newline_index
                            end_col = len(value) - last_newline_index

                    if to_yield:
                        t.end_line = line
                        t.end_col = end_col
                        yield t

                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
```
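`Lexer.lex` scans the stream with `mre.match(stream, lex_pos)` and relies on Python's `for`/`else` to detect the position where no terminal matched. The sketch below strips that loop down to hard-coded patterns and `m.lastgroup` (the real code uses the `type_from_index` mapping and `Token` objects):

```python
import re

# A stripped-down version of the same scanning loop, on invented patterns.
mres = [re.compile('|'.join((r'(?P<NUMBER>\d+)', r'(?P<NAME>[A-Za-z_]\w*)',
                             r'(?P<PLUS>\+)', r'(?P<WS>[ \t]+)')))]
ignore = {'WS'}

def mini_lex(stream):
    pos = 0
    while True:
        for mre in mres:
            m = mre.match(stream, pos)      # anchored match at the current offset
            if m:
                if m.lastgroup not in ignore:
                    yield m.lastgroup, m.group(0)
                pos += len(m.group(0))
                break
        else:                               # no pattern matched at this position
            if pos < len(stream):
                raise ValueError('no token at position %d' % pos)
            break                           # clean end of input

assert list(mini_lex('x + 12')) == [('NAME', 'x'), ('PLUS', '+'), ('NUMBER', '12')]
```

This also shows why `__init__` rejects zero-width tokens: a zero-length match would never advance `lex_pos`, and the loop would spin forever.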
```python
class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)

                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
                break
```
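`ContextualLexer` builds one `Lexer` per parser state but keys them by `frozenset(accepts)`, so states that accept the same set of terminals share a single compiled lexer; the parser is expected to call `set_parser_state` as it consumes tokens. A sketch of just that sharing logic, with invented states and a placeholder in place of the real `Lexer` constructor:

```python
# Sketch of the lexer-sharing logic only; the states, terminal names, and
# make_lexer placeholder are invented for illustration.
states = {
    0: ['NAME', 'NUMBER'],
    1: ['NAME', 'NUMBER'],   # same accept set as state 0
    2: ['PLUS'],
}

def make_lexer(terminals):
    return ('lexer for', frozenset(terminals))   # stands in for Lexer(...)

lexer_by_tokens = {}
lexers = {}
for state, accepts in states.items():
    key = frozenset(accepts)
    if key not in lexer_by_tokens:
        lexer_by_tokens[key] = make_lexer(accepts)
    lexers[state] = lexer_by_tokens[key]

assert lexers[0] is lexers[1]       # identical accept sets share one lexer
assert lexers[0] is not lexers[2]
```

The point of the per-state restriction is that only terminals the parser can currently accept are even tried, which lets the contextual lexer resolve some token ambiguities a single global lexer cannot.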