This repo contains the code used to mirror other repos, as well as the code that is being mirrored.

```python
## Lexer Implementation

import re

from .utils import Str


class LexError(Exception):
    pass


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)

        super(UnexpectedInput, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context


class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type, value, borrow_t):
        # Build a token that reuses the position info of an existing token.
        inst = cls(type, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
```
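Since `Token` subclasses `Str`, a token behaves like its matched text while still carrying lexer metadata. A minimal sketch, assuming `Str` from `.utils` is a plain `str` subclass:

```python
t = Token('NUMBER', '42', pos_in_stream=0)
assert t == '42'            # compares like the matched text (Str assumed str-like)
assert t.type == 'NUMBER'   # but keeps its token type and stream position
print(repr(t))              # -> Token(NUMBER, '42')
```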
```python
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks

        self.token_types = list(token_names)
        self.type_index = {name: i for i, name in enumerate(self.token_types)}

        self.newline_types = [self.type_index[t[0]] for t in tokens
                              if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]

        self.mres = self._build_mres(tokens, len(tokens))
```
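The constructor takes `tokens` as `(name, pattern)` pairs, as the `%s: %s` formatting above and the `(?P<%s>%s)` join below imply. A hypothetical spec, for illustration only:

```python
token_spec = [
    ('NUMBER', r'\d+'),
    ('PLUS',   r'\+'),
    ('NL',     r'\n+'),   # spells '\n', so it is registered in newline_types
    ('WS',     r'[ \t]+'),
]
# lexer = Lexer(token_spec, callbacks={}, ignore=('WS',))
```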
```python
    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module.
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it succeeds.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size // 2)
            mres.append((mre, {i: self.type_index[n] for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
```
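The dispatch trick is easier to see in isolation: each chunk of token definitions becomes one alternation of named groups, and inverting `groupindex` maps a match back to its token name. A standalone sketch (the tokens and the chunk size of 2 are arbitrary):

```python
import re

tokens = [('NUMBER', r'\d+'), ('PLUS', r'\+'), ('WS', r'[ \t]+')]
chunk = 2
mres = [re.compile('|'.join('(?P<%s>%s)' % t for t in tokens[i:i + chunk]))
        for i in range(0, len(tokens), chunk)]

m = mres[0].match('42 + 1')
# m.lastindex is the number of the group that matched; groupindex maps
# names to numbers, so inverting it recovers the token name.
names = {num: name for name, num in mres[0].groupindex.items()}
assert names[m.lastindex] == 'NUMBER'
```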
```python
    def lex(self, stream):
        # Generator: scan the stream from the start, yielding Token objects
        # and tracking line/column as newlines are consumed.
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_num = type_from_index[m.lastindex]
                    if type_num not in ignore_types:
                        t = Token(self.token_types[type_num], value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t
                    if type_num in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No pattern matched at lex_pos: either we're done or the input is invalid.
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
```
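Putting it together, a hypothetical end-to-end run. The token names, input, and callback are illustrative, and the snippet assumes it executes inside the package so that `.utils.Str` resolves:

```python
lexer = Lexer(
    [('WORD', r'[a-z]+'), ('NL', r'\n'), ('WS', r'[ \t]+')],
    callbacks={'WORD': lambda t: Token.new_borrow_pos(t.type, t.upper(), t)},
    ignore=('WS',),
)
for tok in lexer.lex('hello world\nbye'):
    print(tok.type, repr(tok.value), 'line', tok.line, 'col', tok.column)
# WORD 'HELLO' line 1 col 0
# WORD 'WORLD' line 1 col 6
# NL '\n' line 1 col 11
# WORD 'BYE' line 2 col 1
```

Callbacks run before the token is yielded, so they can rewrite a token's value (here, upper-casing it) while `new_borrow_pos` preserves the original position info.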