This repo contains code to mirror other repos, as well as the code being mirrored.

## Lexer Implementation

import re

from .utils import Str


class LexError(Exception):
    pass


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)
        super(UnexpectedInput, self).__init__(message)
        self.line = line
        self.column = column
        self.context = context

class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type, value, borrow_t):
        inst = cls(type, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
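
# Note (assumption: Str from .utils is a plain str subclass): a Token compares
# equal to its text, e.g. Token('NUMBER', '42', 0) == '42', while .type and
# .pos_in_stream (plus .line/.column, once the lexer sets them) carry metadata.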

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags

class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization: every pattern must compile on its own, and everything
        # in `ignore` must name a defined token.
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks
        self.token_types = list(token_names)
        self.type_index = {name: i for i, name in enumerate(self.token_types)}
        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]
        self.mres = self._build_mres(tokens, len(tokens))

    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module.
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it succeeds.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size // 2)
            mres.append((mre, {i: self.type_index[n] for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
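
    # For illustration, the failure mode the except clause above guards against:
    #     re.compile('|'.join('(?P<g%d>x)' % i for i in range(200)))
    # trips the group limit and, on the Python versions this code targets,
    # raises AssertionError rather than a proper re.error.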

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_num = type_from_index[m.lastindex]
                    if type_num not in ignore_types:
                        t = Token(self.token_types[type_num], value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t
                    if type_num in newline_types:
                        # Keep line/column bookkeeping in sync across newlines
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No pattern in any batch matched at lex_pos
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
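
For reference, a minimal, hypothetical usage sketch. The token definitions and
input below are illustrative assumptions, not part of this file, and it assumes
the module is imported from within its package (so that `from .utils import Str`
resolves):

# --- hypothetical usage sketch, not part of lexer.py ---
tokens = [
    ('NUMBER',  r'\d+'),
    ('PLUS',    r'\+'),
    ('NEWLINE', r'\n'),
    ('WS',      r'[ \t]+'),
]
lexer = Lexer(tokens, callbacks={}, ignore=('WS',))
for tok in lexer.lex("1 + 2\n+ 3"):
    print(tok.type, repr(tok.value), tok.line, tok.column)

Since 'WS' is listed in ignore, whitespace is matched and consumed but never
yielded. The NEWLINE pattern spells out \n, so the lexer classifies it as
newline-bearing and advances its line and column counters when it fires.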