This repository contains the code used to mirror other repositories, as well as the code being mirrored.


## Lexer Implementation

import re

from .utils import Str, classify
from .common import is_terminal


class LexError(Exception):
    pass


class TokenDef(object):
    # Definition of a terminal: a name paired with a string or regexp value.
    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.value)
class TokenDef__Str(TokenDef):
    def to_regexp(self):
        return re.escape(self.value)

    priority = 0


class TokenDef__Regexp(TokenDef):
    def to_regexp(self):
        return self.value

    priority = 1
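
# A minimal sketch (not part of the original module) of how the two TokenDef
# flavors differ: string terminals are escaped into literal regexps, while
# regexp terminals pass through verbatim. The token names here are invented.
#
#     >>> TokenDef__Str('PLUS', '+').to_regexp()
#     '\\+'
#     >>> TokenDef__Regexp('INT', r'\d+').to_regexp()
#     '\\d+'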
class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)
        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context
class Token(Str):
    # A matched token: a string subclass that also carries type and position.
    def __new__(cls, type_, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        # Create a new token that inherits its position from an existing one.
        inst = cls(type_, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
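
# Usage sketch (illustrative, not in the original file; assumes Str from
# .utils behaves like a plain str subclass):
#
#     >>> t = Token('NAME', 'foo', 0); t.line = 1; t.column = 4
#     >>> t == 'foo'
#     True
#     >>> Token.new_borrow_pos('ALIAS', t, t).column
#     4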
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


def _regexp_has_newline(r):
    # Heuristic: can this regexp possibly match a newline?
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
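
# Illustrative check (not in the original file) of the newline heuristic:
#
#     >>> [_regexp_has_newline(p) for p in ['\n', r'\n', '(?s).', r'\d+']]
#     [True, True, True, False]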
def _create_unless_callback(strs):
    def unless_callback(t):
        if t in strs:
            t.type = strs[t]
        return t
    return unless_callback


def _create_unless(tokens):
    # When a string terminal (e.g. a keyword) is fully matched by some regexp
    # terminal (e.g. an identifier), drop the string terminal from the token
    # list and instead re-type matching tokens in a callback on the regexp
    # terminal.
    tokens_by_type = classify(tokens, type)
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(TokenDef__Regexp, []):
        unless = {}
        for strtok in tokens_by_type.get(TokenDef__Str, []):
            m = re.match(retok.value, strtok.value)
            if m and m.group(0) == strtok.value:
                embedded_strs.add(strtok.name)
                unless[strtok.value] = strtok.name
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs]
    return tokens, callback
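
# Sketch of the "unless" mechanism (token names invented for illustration):
# an 'if' keyword that is also a valid NAME is folded into the NAME terminal
# and recovered afterwards by the callback.
#
#     >>> toks, cb = _create_unless([TokenDef__Regexp('NAME', r'[a-z]+'),
#     ...                            TokenDef__Str('IF', 'if')])
#     >>> [t.name for t in toks], sorted(cb)
#     (['NAME'], ['NAME'])
#     >>> cb['NAME'](Token('NAME', 'if')).type
#     'IF'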
class Lexer(object):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens)

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.to_regexp())
            except re.error:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.value))

        token_names = {t.name for t in tokens}
        assert all(t in token_names for t in ignore)

        # Init
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
        self.ignore_types = [t for t in ignore]

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        # Higher-priority tokens first; ties broken by longer definitions.
        tokens.sort(key=lambda x: (x.priority, len(x.value)), reverse=True)

        self.tokens = tokens
        self.mres = self._build_mres(tokens, len(tokens))
    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module.
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it succeeds.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % (t.name, t.to_regexp()) for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size//2)

            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
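
    # Illustrative note (not part of the original file): each compiled regexp
    # is one big alternation of named groups, one group per token, e.g.
    #
    #     (?P<INT>\d+)|(?P<PLUS>\+)|(?P<WS>\s+)
    #
    # and the paired dict inverts mre.groupindex (here {1: 'INT', 2: 'PLUS',
    # 3: 'WS'}) so lex() can map m.lastindex back to a token type.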
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callback:
                            t = self.callback[t.type](t)
                        yield t
                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No pattern matched at lex_pos: either we're done, or the input is invalid.
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
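
# Usage sketch (not part of the original file; token names invented, and the
# behaviour assumes .utils.Str is a plain str subclass):
#
#     >>> lexer = Lexer([TokenDef__Regexp('INT', r'[0-9]+'),
#     ...                TokenDef__Regexp('WS', r'[ ]+')], ignore=['WS'])
#     >>> [(t.type, t.value) for t in lexer.lex('12 34')]
#     [('INT', '12'), ('INT', '34')]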
class ContextualLexer:
    # Builds one Lexer per parser state, each restricted to the tokens that
    # state can accept, and switches between them as the parser advances.
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)   # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

        self.set_parser_state(None)   # Needs to be set on the outside

    def set_parser_state(self, state):
        self.parser_state = state
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            # Re-select the lexer on every iteration: the parser may have
            # changed state since the last token.
            lexer = self.lexers[self.parser_state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t
                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
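
# Usage sketch (not part of the original file; state ids and token names are
# invented, and it assumes is_terminal() accepts the uppercase names used
# here). The parser is expected to call set_parser_state() between tokens.
#
#     >>> states = {0: ['INT'], 1: ['NAME']}
#     >>> clex = ContextualLexer([TokenDef__Regexp('INT', r'[0-9]+'),
#     ...                         TokenDef__Regexp('NAME', r'[a-z]+')], states)
#     >>> clex.set_parser_state(0)
#     >>> next(clex.lex('42'))
#     Token(INT, '42')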