## Lexer Implementation

import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone

class Pattern(Serialize):

    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    def to_regexp(self):
        raise NotImplementedError()

    if Py36:
        # Python 3.6 changed syntax for flags in regular expression
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value
    else:
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value
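    # e.g. flags={'i'} wraps 'abc' as '(?i:abc)' on Python 3.6+, and prefixes it
    # as '(?i)abc' on older versions; both make the pattern case-insensitive.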


class PatternStr(Pattern):
    __serialize_fields__ = 'value', 'flags'

    type = "str"

    def to_regexp(self):
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self):
        return len(self.value)
    max_width = min_width


class PatternRE(Pattern):
    __serialize_fields__ = 'value', 'flags', '_width'

    type = "re"

    def to_regexp(self):
        return self._get_flags(self.value)

    _width = None
    def _get_width(self):
        if self._width is None:
            self._width = get_regexp_width(self.to_regexp())
        return self._width

    @property
    def min_width(self):
        return self._get_width()[0]

    @property
    def max_width(self):
        return self._get_width()[1]


class TerminalDef(Serialize):
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)


class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None):
        try:
            self = super(Token, cls).__new__(cls, value)
        except UnicodeDecodeError:
            value = value.decode('latin1')
            self = super(Token, cls).__new__(cls, value)

        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = end_line
        self.end_column = end_column

        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__


class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
  103. """Consume a token and calculate the new line & column.
  104. As an optional optimization, set test_newline=False is token doesn't contain a newline.
  105. """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
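
# Example: feeding 'a\nbc' into a fresh LineCounter leaves char_pos=4, line=2
# and column=3 (the 1-based column of the next character to be read).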

class _Lex:
    "Built to serve both Lexer and ContextualLexer"

    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter()
        last_token = None

        while line_ctr.char_pos < len(stream):
            lexer = self.lexer
            res = lexer.match(stream, line_ctr.char_pos)
            if not res:
                allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

            value, type_ = res

            t = None
            if type_ not in ignore_types:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                if t.type in lexer.callback:
                    t = lexer.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                last_token = t
                yield t
            else:
                if type_ in lexer.callback:
                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    lexer.callback[type_](t)

            line_ctr.feed(value, type_ in newline_types)
            if t:
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column


class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        return t


class CallChain:
    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        return self.callback2(t) if self.cond(t2) else t2
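        # That is: run callback1 first; if cond() accepts its result, apply
        # callback2 to the original token, otherwise keep callback1's result.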


def _create_unless(terminals):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []  # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback


def _build_mres(terminals, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module.
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
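    # Returns a list of (compiled_pattern, {group_index: terminal_name}) pairs.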
    postfix = '$' if match_whole else ''
    mres = []
    while terminals:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size//2, match_whole)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres

def build_mres(terminals, match_whole=False):
    return _build_mres(terminals, len(terminals), match_whole)


def _regexp_has_newline(r):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
        - spaces (\s)
    """
    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
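# For example, _regexp_has_newline(r'\s+') and _regexp_has_newline(r'[^"]*') are
# truthy, while _regexp_has_newline(r'[a-z]+') is not.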

class Lexer(object):
    """Lexer interface

    Method Signatures:
        lex(self, stream) -> Iterator[Token]

        set_parser_state(self, state)   # Optional
    """
    set_parser_state = NotImplemented
    lex = NotImplemented


class TraditionalLexer(Lexer):
    def __init__(self, terminals, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        terminals = list(terminals)

        # Sanitization
        for t in terminals:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in terminals}

        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = user_callbacks
        self.build()

    def build(self):
        terminals, self.callback = _create_unless(self.terminals)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self.mres = build_mres(terminals)

    def match(self, stream, pos):
        for mre, type_from_index in self.mres:
            m = mre.match(stream, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


class ContextualLexer(Lexer):
    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state], self.parser_state)
        try:
            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
                yield x
                l.lexer = self.lexers[self.parser_state]
                l.state = self.parser_state
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
            # but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            root_match = self.root_lexer.match(stream, e.pos_in_stream)
            if not root_match:
                raise

            value, type_ = root_match
            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
            expected = {v for m, tfi in l.lexer.mres for v in tfi.values()}
            raise UnexpectedToken(t, expected)

###}
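
A minimal usage sketch follows; it is not part of the file above, and it assumes the module is importable as lark.lexer with its utils helpers available. The terminal names and patterns are hypothetical examples, chosen only to show how TerminalDef, PatternStr, PatternRE and TraditionalLexer fit together:

from lark.lexer import TerminalDef, PatternRE, PatternStr, TraditionalLexer

# Hypothetical terminals: an integer, a plus sign, and ignorable whitespace.
terminals = [
    TerminalDef('NUMBER', PatternRE(r'[0-9]+')),
    TerminalDef('PLUS', PatternStr('+')),
    TerminalDef('WS', PatternRE(r'[ \t]+')),
]

lexer = TraditionalLexer(terminals, ignore=['WS'])
for tok in lexer.lex('1 + 23'):
    print(tok.type, repr(tok.value), tok.line, tok.column)
# Expected output:
#   NUMBER '1' 1 1
#   PLUS '+' 1 3
#   NUMBER '23' 1 5

ContextualLexer is driven the same way, except that its constructor also needs the per-state 'states' mapping, and set_parser_state() must be called before (and is updated between) tokens.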