This repo contains code to mirror other repos, as well as the code being mirrored.
## Lexer Implementation

import re

from utils import Str


class LexError(Exception):
    pass


class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    # class Token(object):
    #     def __init__(self, type, value, lexpos):
    #         self.type = type
    #         self.value = value
    #         self.lexpos = lexpos

    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)
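# Example (hypothetical values, not from the repo): because Token subclasses a
# string type, a token compares equal to its text while still carrying the
# lexer's metadata:
#
#     t = Token('NUMBER', '42', pos_in_stream=0)
#     assert t == '42' and t.type == 'NUMBER'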
class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


LIMIT = 50  # Stay below Python re's limit of 100 named groups per pattern
class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks
        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)

        # Compile the patterns in chunks of LIMIT, one alternation regex per
        # chunk, remembering which group index belongs to which token name.
        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append({i: n for n, i in mre.groupindex.items()})
            x = x[LIMIT:]
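    # An illustration of the chunking (numbers are hypothetical): with 120
    # token definitions, __init__ compiles three alternation regexes covering
    # 50 + 50 + 20 patterns, and name_from_index[k] maps a group index inside
    # chunk k back to its token name.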
    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        while True:
            for i, mre in enumerate(self.mres):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    if type_ not in self.ignore:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t
                    # Track line/column positions across newlines in the match
                    newlines = value.count(self.newline_char)
                    if newlines:
                        line += newlines
                        col_start_pos = lex_pos + value.rindex(self.newline_char)
                    lex_pos += len(value)
                    break
            else:
                # No chunk matched: either we hit an unlexable character,
                # or we have consumed the whole stream.
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
                    raise LexError("No token defined for: '%s' in %s at line %d" % (stream[lex_pos], context, line))
                break
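For reference, a minimal usage sketch (not part of the file): the token names, patterns, and input below are invented for illustration, and it assumes utils.Str is an ordinary (unicode) string subclass so the module imports cleanly.

tokens = [
    ('NUMBER', r'\d+'),
    ('PLUS', r'\+'),
    ('WS', r'[ \t]+'),   # matched but never yielded, via ignore
]

lexer = Lexer(tokens, callbacks={}, ignore=('WS',))
for tok in lexer.lex(u'1 + 23'):
    print(repr(tok))
# Token(NUMBER, 1, 0)
# Token(PLUS, +, 2)
# Token(NUMBER, 23, 4)

A callbacks entry such as {'NUMBER': f} would let f transform each NUMBER token before it is yielded.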