This repo contains code to mirror other repos. It also contains the code that is getting mirrored.

## Lexer Implementation

import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError

###{standalone

class Pattern(Serialize):
    __serialize_fields__ = 'value', 'flags'

    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    def to_regexp(self):
        raise NotImplementedError()

    if Py36:
        # Python 3.6 changed syntax for flags in regular expression
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value
    else:
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value

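# Editor's sketch (not part of the original file): _get_flags wraps a regexp
# fragment in its flags. Python 3.6 added the scoped (?flag:...) form and began
# deprecating global inline flags that don't appear at the start of a pattern,
# which matters once fragments are joined into one big alternation by
# build_mres below. A hypothetical call:
#
#     >>> Pattern('foo', flags=['i'])._get_flags('foo')
#     '(?i:foo)'      # on Python 3.6+; '(?i)foo' on older interpreters
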
class PatternStr(Pattern):
    def to_regexp(self):
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self):
        return len(self.value)
    max_width = min_width

class PatternRE(Pattern):
    def to_regexp(self):
        return self._get_flags(self.value)

    @property
    def min_width(self):
        return get_regexp_width(self.to_regexp())[0]

    @property
    def max_width(self):
        return get_regexp_width(self.to_regexp())[1]

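# Editor's sketch (not part of the original file): the two concrete patterns
# differ in escaping and in how their width is computed, assuming lark's
# get_regexp_width returns a (min, max) pair for a regexp:
#
#     >>> PatternStr('+').to_regexp()
#     '\\+'           # literals are re.escape()d before compilation
#     >>> PatternStr('+').min_width, PatternStr('+').max_width
#     (1, 1)          # a literal's width is simply its length
#     >>> PatternRE('[0-9]+').to_regexp()
#     '[0-9]+'        # regexps are taken verbatim
#     >>> PatternRE('[0-9]+').min_width
#     1               # lower bound reported by get_regexp_width
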
class TerminalDef(Serialize):
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None):
        try:
            self = super(Token, cls).__new__(cls, value)
        except UnicodeDecodeError:
            value = value.decode('latin1')
            self = super(Token, cls).__new__(cls, value)

        self.type = type_
        self.pos_in_stream = pos_in_stream
        self.value = value
        self.line = line
        self.column = column
        self.end_line = end_line
        self.end_column = end_column
        return self

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    __hash__ = Str.__hash__

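# Editor's sketch (not part of the original file): Token subclasses the string
# type, so it compares equal to a plain string by value, but two Tokens with
# different types never compare equal. new_borrow_pos copies position info from
# an existing token, which is how callbacks can rewrite a token without losing
# its location:
#
#     >>> t = Token('NUMBER', '42', pos_in_stream=0, line=1, column=1)
#     >>> t == '42'
#     True
#     >>> t == Token('STRING', '42')
#     False
#     >>> Token.new_borrow_pos('INT', '42', t).line
#     1
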
class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1

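# Editor's sketch (not part of the original file): LineCounter tracks the
# absolute position plus 1-based line/column. Feeding a chunk that ends in a
# newline lands on column 1 of the next line:
#
#     >>> lc = LineCounter()
#     >>> lc.feed('a = 1\n')
#     >>> lc.line, lc.column, lc.char_pos
#     (2, 1, 6)
#     >>> lc.feed('b', test_newline=False)   # safe: 'b' contains no newline
#     >>> lc.line, lc.column
#     (2, 2)
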
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter()

        while line_ctr.char_pos < len(stream):
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if not m:
                    continue
                t = None
                value = m.group(0)
                type_ = type_from_index[m.lastindex]
                if type_ not in ignore_types:
                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    if t.type in lexer.callback:
                        t = lexer.callback[t.type](t)
                        if not isinstance(t, Token):
                            raise ValueError("Callbacks must return a token (returned %r)" % t)
                    yield t
                else:
                    if type_ in lexer.callback:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        lexer.callback[type_](t)

                line_ctr.feed(value, type_ in newline_types)
                if t:
                    t.end_line = line_ctr.line
                    t.end_column = line_ctr.column
                break
            else:
                allowed = [v for m, tfi in lexer.mres for v in tfi.values()]
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state)

class UnlessCallback:
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        return t

class CallChain:
    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        return self.callback2(t) if self.cond(t2) else t2

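# Editor's note (not part of the original file): CallChain lets a user-supplied
# callback run after an internal one (typically an UnlessCallback), but only
# while cond still holds. TraditionalLexer below builds it as
# CallChain(unless_cb, user_cb, lambda t: t.type == type_), so the user callback
# is skipped when the UnlessCallback has already re-typed the token into a
# different terminal.
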
def _create_unless(terminals):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []  # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback

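# Editor's sketch (not part of the original file): _create_unless handles the
# classic keyword-vs-identifier overlap. Given a PatternRE terminal
# NAME=/[a-z]+/ and a PatternStr terminal IF='if', the literal 'if' is fully
# matched by NAME, so IF is dropped from the terminal list ("embedded") and
# NAME gets an UnlessCallback that re-types an exact 'if' match back to IF:
#
#     >>> NAME = TerminalDef('NAME', PatternRE('[a-z]+'))
#     >>> IF = TerminalDef('IF', PatternStr('if'))
#     >>> terminals, callbacks = _create_unless([NAME, IF])
#     >>> [t.name for t in terminals], sorted(callbacks)
#     (['NAME'], ['NAME'])
#     >>> callbacks['NAME'](Token('NAME', 'if')).type
#     'IF'
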
def _build_mres(terminals, max_size, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it succeeds.
    postfix = '$' if match_whole else ''
    mres = []
    while terminals:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size // 2, match_whole)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres


def build_mres(terminals, match_whole=False):
    return _build_mres(terminals, len(terminals), match_whole)

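# Editor's note (not part of the original file): every terminal becomes a named
# group in one big alternation, so the whole terminal set usually compiles to a
# single regexp. On the Python versions this code targets, re.compile rejects
# patterns with more than 100 named groups by raising a bare AssertionError, so
# with e.g. 250 terminals _build_mres falls back to several smaller regexps and
# _Lex.lex simply tries each (mre, type_from_index) pair in order.
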
def _regexp_has_newline(r):
    """Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
    """
    return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)

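# Editor's sketch (not part of the original file): this is a conservative,
# purely textual heuristic; false positives are harmless, they only cost an
# extra newline scan in LineCounter.feed:
#
#     >>> _regexp_has_newline(r'\n+')
#     True
#     >>> _regexp_has_newline('(?s).*')
#     True
#     >>> _regexp_has_newline('[0-9]+')
#     False
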
class Lexer(Serialize):
    """Lexer interface

    Method Signatures:
        lex(self, stream) -> Iterator[Token]

        set_parser_state(self, state)   # Optional
    """
    set_parser_state = NotImplemented
    lex = NotImplemented

class TraditionalLexer(Lexer):
    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
    __serialize_namespace__ = TerminalDef,

    def _deserialize(self):
        self.mres = build_mres(self.terminals)
        self.callback = {}  # TODO implement

    def __init__(self, terminals, ignore=(), user_callbacks={}):
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        terminals = list(terminals)

        # Sanitization
        for t in terminals:
            try:
                re.compile(t.pattern.to_regexp())
            except re.error:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in terminals}

        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        terminals, self.callback = _create_unless(terminals)
        assert all(self.callback.values())

        for type_, f in user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self.terminals = terminals
        self.mres = build_mres(terminals)

    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

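# Editor's sketch (not part of the original file): a minimal end-to-end use of
# the standalone lexer, using only the classes defined above:
#
#     >>> terminals = [
#     ...     TerminalDef('INT',  PatternRE('[0-9]+')),
#     ...     TerminalDef('PLUS', PatternStr('+')),
#     ...     TerminalDef('WS',   PatternRE('[ ]+')),
#     ... ]
#     >>> lexer = TraditionalLexer(terminals, ignore=['WS'])
#     >>> [(t.type, t.value) for t in lexer.lex('1 + 23')]
#     [('INT', '1'), ('PLUS', '+'), ('INT', '23')]
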
class ContextualLexer(Lexer):
    __serialize_fields__ = 'root_lexer', 'lexers'
    __serialize_namespace__ = TraditionalLexer,

    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)

        self.set_parser_state(None)  # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, stream):
        l = _Lex(self.lexers[self.parser_state], self.parser_state)
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            l.lexer = self.lexers[self.parser_state]
            l.state = self.parser_state

###}
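
# Editor's note (not part of the original file): ContextualLexer keeps one
# TraditionalLexer per parser state, each restricted to the terminals that
# state can actually accept (plus `ignore` and `always_accept`), which lets the
# same text tokenize differently depending on context. The parser drives it by
# calling set_parser_state() between tokens; lex() re-reads self.parser_state
# after every yield, so a state switch takes effect on the very next token.
# Here the root_lexer, built from the full terminal set, supplies the global
# newline_types/ignore_types passed into _Lex.lex.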