This repo contains code to mirror other repos, as well as the code being mirrored.


## Lexer Implementation

import re

from utils import Str


class LexError(Exception):
    pass


class Token(Str):
    # A matched token: compares like its string value, but also carries
    # the token type and its position in the input stream.
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    # Earlier, plain-object version kept for reference:
    # class Token(object):
    #     def __init__(self, type, value, lexpos):
    #         self.type = type
    #         self.value = value
    #         self.lexpos = lexpos

    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)
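
# A quick sketch of Token in use (assuming utils.Str is a str subclass
# that permits attribute assignment):
#
#   t = Token('NUMBER', '42', pos_in_stream=0)
#   assert t == '42'            # compares like a plain string
#   assert t.type == 'NUMBER'   # but carries the lexer metadata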

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


LIMIT = 50  # Stupid named groups limit in python re

class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'

        # Sanitization: every pattern must compile, and every ignored
        # name must refer to a declared token.
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks
        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)

        # Compile the token patterns in batches of at most LIMIT named
        # groups, producing one alternation regex per batch.
        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append({i: n for n, i in mre.groupindex.items()})
            x = x[LIMIT:]
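
    # Illustration of the batching above (numbers are hypothetical): with
    # 120 token definitions and LIMIT = 50, three alternation regexes are
    # compiled; self.name_from_index[k] maps a group index in batch k back
    # to its token name, so a match can be attributed after the fact.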

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        while True:
            for i, mre in enumerate(self.mres):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    if type_ not in self.ignore:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t
                    # Update line/column bookkeeping before advancing.
                    newlines = value.count(self.newline_char)
                    if newlines:
                        line += newlines
                        col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No regex matched at lex_pos: either we reached the end of
                # the stream, or the input contains text no token rule covers.
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
                    raise LexError("No token defined for: '%s' in %s at line %d"
                                   % (stream[lex_pos], context, line))
                break
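
A minimal usage sketch follows. The token names and patterns, and the stub Str class, are illustrative assumptions, not part of the file above; utils.Str is presumed to be a str subclass that allows attribute assignment, which is what Token relies on.

class Str(str):
    pass  # stand-in for utils.Str (substitute it before the definitions above)

tokens = [
    ('NUMBER', r'[0-9]+'),
    ('PLUS',   r'\+'),
    ('WS',     r'[ \t\n]+'),
]

lexer = Lexer(tokens, callbacks={}, ignore=('WS',))
for tok in lexer.lex('1 + 23\n+ 4'):
    print(tok.type, repr(tok.value), 'line', tok.line, 'col', tok.column)

# Prints something like:
#   NUMBER '1' line 1 col 0
#   PLUS '+' line 1 col 2
#   NUMBER '23' line 1 col 4
#   PLUS '+' line 2 col 1
#   NUMBER '4' line 2 col 3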