## Lexer Implementation

import re

try:
    from utils import Str
except ImportError:
    # `utils` is not included in this snippet; Str is presumably just a str
    # subclass that permits attribute assignment. Minimal stand-in:
    class Str(str):
        pass


class LexError(Exception):
    pass


class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    # Earlier version, kept for reference:
    # class Token(object):
    #     def __init__(self, type, value, lexpos):
    #         self.type = type
    #         self.value = value
    #         self.lexpos = lexpos

    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


LIMIT = 50  # Stupid named-groups limit in Python's re; keep each combined pattern under it
class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore

        # Sanitization: every token pattern must compile on its own, and
        # every name in `ignore` must refer to a defined token.
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init: sort patterns longest-first so the combined alternation
        # prefers longer token definitions.
        self.tokens = tokens
        self.callbacks = callbacks
        self.tokens.sort(key=lambda x: len(x[1]), reverse=True)

        # Compile the tokens in chunks of LIMIT named groups each, and
        # keep a group-index -> token-name mapping per chunk.
        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append({i: n for n, i in mre.groupindex.items()})
            x = x[LIMIT:]
    def lex(self, stream):
        lex_pos = 0
        while True:
            for i, mre in enumerate(self.mres):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    t = Token(type_, value, lex_pos)
                    if t.type in self.callbacks:
                        self.callbacks[t.type](t)
                    if t.type not in self.ignore:
                        yield t
                    lex_pos += len(value)
                    break
            else:
                # No chunk matched: either the input is unlexable here,
                # or we've consumed the whole stream.
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos + 5]
                    raise LexError("No token defined for: '%s' in %s" % (stream[lex_pos], context))
                break
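

# Usage sketch (not part of the original file): a minimal, hedged example of
# driving the Lexer above. The token names (NUMBER, PLUS, WS) and the sample
# input are illustrative assumptions, not definitions from this repo.
if __name__ == '__main__':
    tokens = [
        ('NUMBER', r'\d+'),
        ('PLUS', r'\+'),
        ('WS', r'\s+'),
    ]
    lexer = Lexer(tokens, callbacks={}, ignore=('WS',))
    for tok in lexer.lex('12 + 34'):
        print(repr(tok))
    # Expected output, roughly:
    #   Token(NUMBER, 12, 0)
    #   Token(PLUS, +, 3)
    #   Token(NUMBER, 34, 5)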