This repo contains code to mirror other repos. It also contains the code that is getting mirrored.

import unicodedata
import os
from functools import reduce
from collections import deque

###{standalone
import sys, re
import logging

logger: logging.Logger = logging.getLogger("lark")
logger.addHandler(logging.StreamHandler())
# Set to the highest level, since we have some warnings amongst the code.
# By default, we should not output any log messages.
logger.setLevel(logging.CRITICAL)

NO_VALUE = object()

def classify(seq, key=None, value=None):
    # Groups the items of `seq` into a dict of lists, keyed by `key(item)`
    # (or the item itself), with values mapped through `value` if given.
    d = {}
    for item in seq:
        k = key(item) if (key is not None) else item
        v = value(item) if (value is not None) else item
        if k in d:
            d[k].append(v)
        else:
            d[k] = [v]
    return d
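
# Illustrative usage (our sketch, not part of the original module):
#
#   >>> classify(range(5), key=lambda n: n % 2)
#   {0: [0, 2, 4], 1: [1, 3]}
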
def _deserialize(data, namespace, memo):
    # Recursively rebuilds values produced by Serialize.serialize: dicts with
    # '__type__' become class instances, '@' entries are memoized references.
    if isinstance(data, dict):
        if '__type__' in data:  # Object
            class_ = namespace[data['__type__']]
            return class_.deserialize(data, memo)
        elif '@' in data:
            return memo[data['@']]
        return {key: _deserialize(value, namespace, memo) for key, value in data.items()}
    elif isinstance(data, list):
        return [_deserialize(value, namespace, memo) for value in data]
    return data

class Serialize(object):
    """Safe-ish serialization interface that doesn't rely on Pickle

    Attributes:
        __serialize_fields__ (List[str]): Fields (aka attributes) to serialize.
        __serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate.
            Should include all field types that aren't builtin types.
    """

    def memo_serialize(self, types_to_memoize):
        memo = SerializeMemoizer(types_to_memoize)
        return self.serialize(memo), memo.serialize()

    def serialize(self, memo=None):
        if memo and memo.in_types(self):
            return {'@': memo.memoized.get(self)}

        fields = getattr(self, '__serialize_fields__')
        res = {f: _serialize(getattr(self, f), memo) for f in fields}
        res['__type__'] = type(self).__name__
        if hasattr(self, '_serialize'):
            self._serialize(res, memo)
        return res

    @classmethod
    def deserialize(cls, data, memo):
        namespace = getattr(cls, '__serialize_namespace__', [])
        namespace = {c.__name__: c for c in namespace}

        fields = getattr(cls, '__serialize_fields__')

        if '@' in data:
            return memo[data['@']]

        inst = cls.__new__(cls)
        for f in fields:
            try:
                setattr(inst, f, _deserialize(data[f], namespace, memo))
            except KeyError as e:
                raise KeyError("Cannot find key for class", cls, e)

        if hasattr(inst, '_deserialize'):
            inst._deserialize()
        return inst
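
# Minimal usage sketch (our example; `Point` is a hypothetical subclass):
#
#   class Point(Serialize):
#       __serialize_fields__ = 'x', 'y'
#       def __init__(self, x, y):
#           self.x, self.y = x, y
#
#   >>> data = Point(1, 2).serialize()   # {'x': 1, 'y': 2, '__type__': 'Point'}
#   >>> p = Point.deserialize(data, {})
#   >>> p.x, p.y
#   (1, 2)
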
class SerializeMemoizer(Serialize):
    "A version of serialize that memoizes objects to reduce space"

    __serialize_fields__ = 'memoized',

    def __init__(self, types_to_memoize):
        self.types_to_memoize = tuple(types_to_memoize)
        self.memoized = Enumerator()

    def in_types(self, value):
        return isinstance(value, self.types_to_memoize)

    def serialize(self):
        return _serialize(self.memoized.reversed(), None)

    @classmethod
    def deserialize(cls, data, namespace, memo):
        return _deserialize(data, namespace, memo)

import types
from functools import wraps, partial

def smart_decorator(f, create_decorator):
    if isinstance(f, types.FunctionType):
        return wraps(f)(create_decorator(f, True))
    elif isinstance(f, (type, types.BuiltinFunctionType)):
        return wraps(f)(create_decorator(f, False))
    elif isinstance(f, types.MethodType):
        return wraps(f)(create_decorator(f.__func__, True))
    elif isinstance(f, partial):
        # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
        return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
    else:
        return create_decorator(f.__func__.__call__, True)
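
# Usage sketch (our assumption, not upstream code): `create_decorator(f, with_self)`
# receives the unwrapped callable, so a single decorator factory can handle plain
# functions, methods, builtins and partials uniformly.
#
#   def traced(f, with_self):
#       def wrapper(*args, **kwargs):
#           logger.debug('calling %r', f)
#           return f(*args, **kwargs)
#       return wrapper
#
#   add = smart_decorator(lambda a, b: a + b, traced)   # FunctionType branch
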
try:
    import regex  # type: ignore
except ImportError:
    regex = None

import sre_parse
import sre_constants

categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')

def get_regexp_width(expr):
    if regex:
        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`,
        # we replace them with a simple letter. This makes no difference, since we are
        # only trying to compute the possible lengths of the regexp match below.
        regexp_final = re.sub(categ_pattern, 'A', expr)
    else:
        if re.search(categ_pattern, expr):
            raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
        regexp_final = expr
    try:
        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
    except sre_constants.error:
        if not regex:
            raise ValueError(expr)
        else:
            # sre_parse does not support the new features in regex. To avoid failing
            # completely in that case, we manually test for the most important info
            # (whether the empty string is matched).
            c = regex.compile(regexp_final)
            if c.match('') is None:
                return 1, sre_constants.MAXREPEAT
            else:
                return 0, sre_constants.MAXREPEAT
###}
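
# Illustrative values (our sketch): get_regexp_width reports the minimum and
# maximum possible match lengths of a regexp.
#
#   >>> get_regexp_width(r'a{2,5}')
#   [2, 5]
#   >>> get_regexp_width(r'abc')
#   [3, 3]
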
_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)

def _test_unicode_category(s, categories):
    if len(s) != 1:
        return all(_test_unicode_category(char, categories) for char in s)
    return s == '_' or unicodedata.category(s) in categories

def is_id_continue(s):
    """
    Checks if all characters in `s` are alphanumeric characters (Unicode standard, so
    diacritics, Indic vowels, non-Latin numbers, etc. all pass). Synonymous with a Python
    `ID_CONTINUE` identifier. See PEP 3131 for details.
    """
    return _test_unicode_category(s, _ID_CONTINUE)

def is_id_start(s):
    """
    Checks if all characters in `s` are alphabetic characters (Unicode standard, so
    diacritics, Indic vowels, non-Latin letters, etc. all pass). Synonymous with a Python
    `ID_START` identifier. See PEP 3131 for details.
    """
    return _test_unicode_category(s, _ID_START)
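
# Examples (our sketch):
#
#   >>> is_id_start('état')
#   True
#   >>> is_id_start('2x')
#   False
#   >>> is_id_continue('x2')
#   True
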
def dedup_list(l):
    """Given a list (l), removes duplicates from it,
    preserving the original order of the list. Assumes that
    the list entries are hashable."""
    dedup = set()
    return [x for x in l if not (x in dedup or dedup.add(x))]
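
# Example (our sketch):
#
#   >>> dedup_list([1, 2, 1, 3, 2])
#   [1, 2, 3]
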
class Enumerator(Serialize):
    # Assigns a sequential integer id to each distinct item, in first-seen order.
    def __init__(self):
        self.enums = {}

    def get(self, item):
        if item not in self.enums:
            self.enums[item] = len(self.enums)
        return self.enums[item]

    def __len__(self):
        return len(self.enums)

    def reversed(self):
        r = {v: k for k, v in self.enums.items()}
        assert len(r) == len(self.enums)
        return r
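
# Example (our sketch): ids are stable once assigned.
#
#   >>> e = Enumerator()
#   >>> e.get('a'), e.get('b'), e.get('a')
#   (0, 1, 0)
#   >>> e.reversed()
#   {0: 'a', 1: 'b'}
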
def combine_alternatives(lists):
    """
    Accepts a list of alternatives, and enumerates all their possible concatenations.

    Examples:
        >>> combine_alternatives([range(2), [4,5]])
        [[0, 4], [0, 5], [1, 4], [1, 5]]

        >>> combine_alternatives(["abc", "xy", '$'])
        [['a', 'x', '$'], ['a', 'y', '$'], ['b', 'x', '$'], ['b', 'y', '$'], ['c', 'x', '$'], ['c', 'y', '$']]

        >>> combine_alternatives([])
        [[]]
    """
    if not lists:
        return [[]]
    assert all(l for l in lists), lists
    init = [[x] for x in lists[0]]
    return reduce(lambda a, b: [i + [j] for i in a for j in b], lists[1:], init)

try:
    import atomicwrites
except ImportError:
    atomicwrites = None  # type: ignore

class FS:
    exists = os.path.exists

    @staticmethod
    def open(name, mode="r", **kwargs):
        if atomicwrites and "w" in mode:
            return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs)
        else:
            return open(name, mode, **kwargs)
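
# Usage sketch (the path below is hypothetical): FS.open behaves like the
# builtin open, but writes atomically when `atomicwrites` is installed.
#
#   with FS.open('/tmp/grammar_cache.lark', 'w') as f:
#       f.write('...')
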
def isascii(s):
    """ str.isascii only exists in python3.7+ """
    try:
        return s.isascii()
    except AttributeError:
        try:
            s.encode('ascii')
            return True
        except (UnicodeDecodeError, UnicodeEncodeError):
            return False
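
# Examples (our sketch):
#
#   >>> isascii('hello')
#   True
#   >>> isascii('héllo')
#   False
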
class fzset(frozenset):
    def __repr__(self):
        return '{%s}' % ', '.join(map(repr, self))

def classify_bool(seq, pred):
    # Partitions `seq` into (elements where pred is true, elements where it is false).
    true_elems = []
    false_elems = []

    for elem in seq:
        if pred(elem):
            true_elems.append(elem)
        else:
            false_elems.append(elem)

    return true_elems, false_elems
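
# Example (our sketch):
#
#   >>> classify_bool(range(5), lambda n: n % 2 == 0)
#   ([0, 2, 4], [1, 3])
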
def bfs(initial, expand):
    # Generic breadth-first search: yields nodes reachable from `initial`,
    # using `expand(node)` to produce neighbors; each node is visited once.
    open_q = deque(list(initial))
    visited = set(open_q)
    while open_q:
        node = open_q.popleft()
        yield node
        for next_node in expand(node):
            if next_node not in visited:
                visited.add(next_node)
                open_q.append(next_node)
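
# Illustrative traversal (the graph is our assumption): nodes come out in
# breadth-first order, with duplicates suppressed.
#
#   graph = {1: [2, 3], 2: [4], 3: [4], 4: []}
#   >>> list(bfs([1], graph.__getitem__))
#   [1, 2, 3, 4]
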
def bfs_all_unique(initial, expand):
    "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions"
    open_q = deque(list(initial))
    while open_q:
        node = open_q.popleft()
        yield node
        open_q += expand(node)

def _serialize(value, memo):
    if isinstance(value, Serialize):
        return value.serialize(memo)
    elif isinstance(value, list):
        return [_serialize(elem, memo) for elem in value]
    elif isinstance(value, frozenset):
        return list(value)  # TODO reversible?
    elif isinstance(value, dict):
        return {key: _serialize(elem, memo) for key, elem in value.items()}
    # assert value is None or isinstance(value, (int, float, str, tuple)), value
    return value

def small_factors(n, max_factor):
    """
    Splits n up into smaller factors and summands <= max_factor.
    Returns a list of [(a, b), ...]
    so that the following code returns n:

        n = 1
        for a, b in values:
            n = n * a + b

    Currently, we also keep a + b <= max_factor, but that might change.
    """
    assert n >= 0
    assert max_factor > 2
    if n <= max_factor:
        return [(n, 0)]

    for a in range(max_factor, 1, -1):
        r, b = divmod(n, a)
        if a + b <= max_factor:
            return small_factors(r, max_factor) + [(a, b)]
    assert False, "Failed to factorize %s" % n
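
# Worked example (our sketch): small_factors(100, 10) returns [(10, 0), (10, 0)],
# and replaying the factors reconstructs n, since (1 * 10 + 0) * 10 + 0 == 100:
#
#   >>> n = 1
#   >>> for a, b in small_factors(100, 10):
#   ...     n = n * a + b
#   >>> n
#   100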