## Lexer Implementation

import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone
from copy import copy


class Pattern(Serialize):
    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    def to_regexp(self):
        raise NotImplementedError()

    if Py36:
        # Python 3.6 changed syntax for flags in regular expression
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value

    else:
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value
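
    # Illustration of the two branches above: for a single flag 'i' and value 'abc',
    # _get_flags() returns '(?i:abc)' on Python 3.6+ and '(?i)abc' on older versions.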


class PatternStr(Pattern):
    __serialize_fields__ = 'value', 'flags'

    type = "str"

    def to_regexp(self):
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self):
        return len(self.value)
    max_width = min_width


class PatternRE(Pattern):
    __serialize_fields__ = 'value', 'flags', '_width'

    type = "re"

    def to_regexp(self):
        return self._get_flags(self.value)

    _width = None
    def _get_width(self):
        if self._width is None:
            self._width = get_regexp_width(self.to_regexp())
        return self._width

    @property
    def min_width(self):
        return self._get_width()[0]

    @property
    def max_width(self):
        return self._get_width()[1]


class TerminalDef(Serialize):
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)


class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
        try:
            self = super(Token, cls).__new__(cls, value)
        except UnicodeDecodeError:
            value = value.decode('latin1')
            self = super(Token, cls).__new__(cls, value)

        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = end_line
        self.end_column = end_column
        self.end_pos = end_pos
        return self

    def update(self, type_=None, value=None):
        return Token.new_borrow_pos(
            type_ if type_ is not None else self.type,
            value if value is not None else self.value,
            self
        )

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return Str.__eq__(self, other)

    __hash__ = Str.__hash__
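
# For example (illustrative values): a Token compares like its string value unless both
# sides are Tokens with different types:
#     t = Token('NUMBER', '42', pos_in_stream=0, line=1, column=1)
#     t == '42'                  # True  -- falls back to plain string comparison
#     t == Token('NAME', '42')   # False -- same text, different terminal type
#     t.update(value='43')       # Token('NUMBER', '43') borrowing t's position info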


class LineCounter:
    def __init__(self, newline_char):
        self.newline_char = newline_char
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
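        # Example: starting from a fresh LineCounter('\n'), feed('a\nbc') leaves
        # char_pos=4, line=2 and column=3, i.e. the 1-based column of the next
        # character to be consumed on line 2.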


class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
        last_token = None

        while line_ctr.char_pos < len(stream):
            lexer = self.lexer
            res = lexer.match(stream, line_ctr.char_pos)
            if not res:
                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

            value, type_ = res

            if type_ not in ignore_types:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                line_ctr.feed(value, type_ in newline_types)
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in lexer.callback:
                    t = lexer.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                yield t
                last_token = t
            else:
                if type_ in lexer.callback:
                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    lexer.callback[type_](t2)
                line_ctr.feed(value, type_ in newline_types)


class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        return t


class CallChain:
    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        return self.callback2(t) if self.cond(t2) else t2


def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []  # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback
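
# For illustration (hypothetical grammar terminals): if IF is a PatternStr("if") and NAME is
# a PatternRE(r"\w+"), then "if" fully matches NAME's regexp, so _create_unless drops the
# separate IF terminal (assuming compatible flags) and installs an UnlessCallback on NAME
# that retags any NAME token whose text is exactly "if" back to IF.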


def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while terminals:
        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
        if use_bytes:
            pattern = pattern.encode('latin-1')
        try:
            mre = re_.compile(pattern, g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres


def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
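
# Sketch of the effect (assuming the group limit mentioned in the comment above): with,
# say, 250 terminals the first compile fails, _build_mres retries with max_size halved
# until a chunk compiles, and build_mres ends up returning a list of several
# (compiled_regexp, {group_index: terminal_name}) pairs that are tried in order.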


def _regexp_has_newline(r):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
        - spaces (\s)
    """
    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
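
# For example, _regexp_has_newline(r'\n') and _regexp_has_newline(r'[^"]*') are truthy,
# while _regexp_has_newline(r'[a-z]+') is not, so only the former kinds of terminals are
# registered as newline_types by TraditionalLexer below.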


class Lexer(object):
    """Lexer interface

    Method Signatures:
        lex(self, stream) -> Iterator[Token]
    """
    lex = NotImplemented


class TraditionalLexer(Lexer):
    def __init__(self, conf):
        terminals = list(conf.tokens)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        self.re = conf.re_module

        if not conf.skip_validation:
            # Sanitization
            for t in terminals:
                try:
                    self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
                except self.re.error:
                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

                if t.pattern.min_width == 0:
                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

            assert set(conf.ignore) <= {t.name for t in terminals}

        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(conf.ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes

        self._mres = None
        # self.build(g_regex_flags)

    def _build(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
    def mres(self):
        if self._mres is None:
            self._build()
        return self._mres

    def match(self, stream, pos):
        for mre, type_from_index in self.mres:
            m = mre.match(stream, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
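
    # Usage sketch (hedged): lark normally builds the `conf` object internally, but any
    # object exposing the attributes read in __init__ above works the same way, e.g.:
    #
    #     from types import SimpleNamespace
    #     conf = SimpleNamespace(
    #         tokens=[TerminalDef('NUMBER', PatternRE(r'\d+')),
    #                 TerminalDef('WS', PatternRE(r'\s+'))],
    #         re_module=re, ignore=['WS'], callbacks={}, g_regex_flags=0,
    #         use_bytes=False, skip_validation=False)
    #     lexer = TraditionalLexer(conf)
    #     [t.type for t in lexer.lex('12 34')]   # -> ['NUMBER', 'NUMBER']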


class ContextualLexer(Lexer):
    def __init__(self, conf, states, always_accept=()):
        terminals = list(conf.tokens)
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        trad_conf = copy(conf)
        trad_conf.tokens = terminals

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer_conf = copy(trad_conf)
                lexer_conf.tokens = state_tokens
                lexer = TraditionalLexer(lexer_conf)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        assert trad_conf.tokens is terminals
        self.root_lexer = TraditionalLexer(trad_conf)

    def lex(self, stream, get_parser_state):
        parser_state = get_parser_state()
        l = _Lex(self.lexers[parser_state], parser_state)
        try:
            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
                yield x
                parser_state = get_parser_state()
                l.lexer = self.lexers[parser_state]
                l.state = parser_state  # For debug only, no need to worry about multithreading

        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
            # but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            root_match = self.root_lexer.match(stream, e.pos_in_stream)
            if not root_match:
                raise

            value, type_ = root_match
            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
            raise UnexpectedToken(t, e.allowed, state=e.state)
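
    # Note on the loop above: after each token is yielded the parser has had a chance to
    # advance, so lex() re-queries get_parser_state() and swaps in the TraditionalLexer
    # built for that state before matching the next token; the except branch re-checks a
    # failed match against the root (all-terminals) lexer to report a friendlier error.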

###}