This repository contains code to mirror other repositories, as well as the code that is being mirrored.

## Lexer Implementation

import re

from .utils import Str


class LexError(Exception):
    pass


class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos + 5]
        message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)

        super(LexError, self).__init__(message)

        self.line = line
        self.column = column
        self.context = context


class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    @classmethod
    def new_borrow_pos(cls, type, value, borrow_t):
        inst = cls(type, value, borrow_t.pos_in_stream)
        inst.line = borrow_t.line
        inst.column = borrow_t.column
        return inst

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
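
# Illustrative only (these literal values are made up): a Token behaves like the
# string it wraps, with lexing metadata attached.
#
#     t = Token('INT', '42', pos_in_stream=0)
#     t.line, t.column = 1, 0
#     t2 = Token.new_borrow_pos('SIGNED_INT', '42', t)  # copies line/column/pos from t
#     repr(t2)  # -> "Token(SIGNED_INT, '42')"
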
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks

        self.token_types = list(token_names)
        self.type_index = {name: i for i, name in enumerate(self.token_types)}

        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]

        self.mres = self._build_mres(tokens, len(tokens))
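
    # Illustrative only: the constructor above expects `tokens` to be an iterable of
    # (name, pattern) pairs, for example
    #
    #     [('INT', r'[0-9]+'), ('PLUS', r'\+'), ('WS', r'[ \t]+')]
    #
    # `ignore` lists token names that are matched but not emitted (e.g. ('WS',)),
    # and `callbacks` maps a token name to a function applied to each such token.
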
    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module.
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it succeeds.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in tokens[:max_size]))
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size // 2)
            mres.append((mre, {i: self.type_index[n] for n, i in mre.groupindex.items()}))
            tokens = tokens[max_size:]
        return mres
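
    # Illustrative only: a quick way to see the limit _build_mres works around.
    # On Python versions that still enforce the cap, compiling a pattern with a
    # few hundred named groups fails with the AssertionError mentioned above;
    # more recent releases have relaxed this limit.
    #
    #     many = '|'.join('(?P<g%d>x)' % i for i in range(200))
    #     try:
    #         re.compile(many)
    #     except AssertionError as e:
    #         print('hit the group limit:', e)
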
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_num = type_from_index[m.lastindex]
                    if type_num not in ignore_types:
                        t = Token(self.token_types[type_num], value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t
                    if type_num in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
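

# Minimal end-to-end sketch, not part of the original module: the token names,
# patterns and input string below are made up for illustration.  Because of the
# relative import at the top of the file, run it as a module (python -m ...)
# rather than as a standalone script.
if __name__ == '__main__':
    demo_tokens = [
        ('INT', r'[0-9]+'),      # integers
        ('PLUS', r'\+'),         # the '+' operator
        ('NEWLINE', r'\n'),      # contains \n, so line counting kicks in
        ('WS', r'[ \t]+'),       # whitespace: matched but not emitted
    ]
    lexer = Lexer(demo_tokens, callbacks={}, ignore=('WS',))
    for tok in lexer.lex('1 + 2\n+ 30'):
        print(tok.line, tok.column, repr(tok))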