## Lexer Implementation

import re

from .utils import Str, classify, get_regexp_width, Py36
from .exceptions import UnexpectedCharacters, LexError


class Pattern(object):
    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    def to_regexp(self):
        raise NotImplementedError()
    if Py36:
        # Python 3.6 changed the syntax for flags in regular expressions
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value
    else:
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value

class PatternStr(Pattern):
    def to_regexp(self):
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self):
        return len(self.value)
    max_width = min_width


class PatternRE(Pattern):
    def to_regexp(self):
        return self._get_flags(self.value)

    @property
    def min_width(self):
        return get_regexp_width(self.to_regexp())[0]

    @property
    def max_width(self):
        return get_regexp_width(self.to_regexp())[1]
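
# Illustrative note (not part of the original source): with flags, to_regexp() embeds them
# differently depending on the Python version detected via Py36. For example, a terminal
# pattern like PatternStr('for', flags=('i',)) would produce '(?i:for)' on Python 3.6+
# and '(?i)for' on earlier versions.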

class TerminalDef(object):
    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

###{standalone

class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        self = super(Token, cls).__new__(cls, value)
        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = None
        self.end_column = None
        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
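
# Illustrative note (not part of the original source): since Token subclasses Str, it
# compares equal to plain strings by value, while two Tokens of different types never
# compare equal:
#   Token('NAME', 'foo') == 'foo'                    -> True
#   Token('NAME', 'foo') == Token('STRING', 'foo')   -> False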

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
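
# Illustrative sketch (not part of the original source) of how feed() tracks position:
#   lc = LineCounter()
#   lc.feed('if x:\n')       # six chars consumed, one newline seen
#   (lc.line, lc.column)     # -> (2, 1): now at the start of line 2
#   lc.feed('    pass')
#   (lc.line, lc.column)     # -> (2, 9): column is 1-based, past the 8 chars just fed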

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()

        t = None
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    else:
                        if type_ in lexer.callback:
                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                            lexer.callback[type_](t)

                    line_ctr.feed(value, type_ in newline_types)
                    if t:
                        t.end_line = line_ctr.line
                        t.end_column = line_ctr.column

                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state)
                break

        if t:
            t.end_line = line_ctr.line
            t.end_column = line_ctr.column
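
# Illustrative note (not part of the original source): the matching loop re-reads
# self.lexer on every iteration, which is what allows ContextualLexer.lex() (below) to
# swap in a different per-parser-state lexer between yielded tokens.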

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        return t

###}

def _create_unless(terminals):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = [] # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback
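
# Illustrative sketch (not part of the original source) of the "unless" mechanism: given
# IF = PatternStr('if') and NAME = PatternRE('[a-z]+'), where NAME fully matches 'if' and
# IF's priority does not exceed NAME's, IF is folded into NAME's callback. The lexer then
# matches 'if' as NAME, and UnlessCallback re-labels the token as IF after the fact; IF is
# dropped from the standalone terminal list when its flags are a subset of NAME's.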

def _build_mres(terminals, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while terminals:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size // 2, match_whole)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres


def build_mres(terminals, match_whole=False):
    return _build_mres(terminals, len(terminals), match_whole)
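
# Illustrative note (not part of the original source): each element of the returned list
# is a (compiled_regex, {group_index: terminal_name}) pair. For terminals NUMBER and PLUS
# the compiled pattern would look like '(?P<NUMBER>[0-9]+)|(?P<PLUS>\+)', and the
# index->name dict is how _Lex turns m.lastindex back into a terminal type.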

def _regexp_has_newline(r):
    """Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
    """
    return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
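
# Illustrative examples (not part of the original source) for the heuristic above:
#   _regexp_has_newline('\\n')       -> True   (escaped newline in the pattern text)
#   _regexp_has_newline('[^"]+')     -> True   (negated class can match a newline)
#   _regexp_has_newline('(?s).+')    -> True   (dot-all makes '.' match newlines)
#   _regexp_has_newline('[a-z]+')    -> False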

class Lexer:
    """Lexer interface

    Method Signatures:
        lex(self, stream) -> Iterator[Token]

        set_parser_state(self, state)        # Optional
    """
    set_parser_state = NotImplemented
    lex = NotImplemented

class TraditionalLexer(Lexer):
    def __init__(self, terminals, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        terminals = list(terminals)

        # Sanitization
        for t in terminals:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in terminals}

        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

        terminals, self.callback = _create_unless(terminals)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            assert type_ not in self.callback
            self.callback[type_] = f

        self.terminals = terminals
        self.mres = build_mres(terminals)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

class ContextualLexer(Lexer):
    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state], self.parser_state)
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
            l.state = self.parser_state
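
# A minimal, illustrative demo (not part of the original module). It assumes this file
# lives at lark/lexer.py, so the relative imports resolve when run as `python -m lark.lexer`.
if __name__ == '__main__':
    demo_terminals = [
        TerminalDef('NUMBER', PatternRE('[0-9]+')),
        TerminalDef('PLUS', PatternStr('+')),
        TerminalDef('WS', PatternRE('[ \t]+')),
    ]
    # Whitespace is declared but ignored, so only NUMBER and PLUS tokens are yielded.
    demo_lexer = TraditionalLexer(demo_terminals, ignore=['WS'])
    for tok in demo_lexer.lex('1 + 2'):
        print(tok.type, repr(tok.value), tok.line, tok.column)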