This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

345 lines
10 KiB

  1. import unicodedata
  2. import os
  3. from functools import reduce
  4. from collections import deque
  5. ###{standalone
  6. import sys, re
  7. import logging
  8. logger: logging.Logger = logging.getLogger("lark")
  9. logger.addHandler(logging.StreamHandler())
  10. # Set to highest level, since we have some warnings amongst the code
  11. # By default, we should not output any log messages
  12. logger.setLevel(logging.CRITICAL)
  13. NO_VALUE = object()
  14. def classify(seq, key=None, value=None):
  15. d = {}
  16. for item in seq:
  17. k = key(item) if (key is not None) else item
  18. v = value(item) if (value is not None) else item
  19. if k in d:
  20. d[k].append(v)
  21. else:
  22. d[k] = [v]
  23. return d
  24. def _deserialize(data, namespace, memo):
  25. if isinstance(data, dict):
  26. if '__type__' in data: # Object
  27. class_ = namespace[data['__type__']]
  28. return class_.deserialize(data, memo)
  29. elif '@' in data:
  30. return memo[data['@']]
  31. return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
  32. elif isinstance(data, list):
  33. return [_deserialize(value, namespace, memo) for value in data]
  34. return data
  35. class Serialize(object):
  36. """Safe-ish serialization interface that doesn't rely on Pickle
  37. Attributes:
  38. __serialize_fields__ (List[str]): Fields (aka attributes) to serialize.
  39. __serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate.
  40. Should include all field types that aren't builtin types.
  41. """
  42. def memo_serialize(self, types_to_memoize):
  43. memo = SerializeMemoizer(types_to_memoize)
  44. return self.serialize(memo), memo.serialize()
  45. def serialize(self, memo=None):
  46. if memo and memo.in_types(self):
  47. return {'@': memo.memoized.get(self)}
  48. fields = getattr(self, '__serialize_fields__')
  49. res = {f: _serialize(getattr(self, f), memo) for f in fields}
  50. res['__type__'] = type(self).__name__
  51. if hasattr(self, '_serialize'):
  52. self._serialize(res, memo)
  53. return res
  54. @classmethod
  55. def deserialize(cls, data, memo):
  56. namespace = getattr(cls, '__serialize_namespace__', [])
  57. namespace = {c.__name__:c for c in namespace}
  58. fields = getattr(cls, '__serialize_fields__')
  59. if '@' in data:
  60. return memo[data['@']]
  61. inst = cls.__new__(cls)
  62. for f in fields:
  63. try:
  64. setattr(inst, f, _deserialize(data[f], namespace, memo))
  65. except KeyError as e:
  66. raise KeyError("Cannot find key for class", cls, e)
  67. if hasattr(inst, '_deserialize'):
  68. inst._deserialize()
  69. return inst
  70. class SerializeMemoizer(Serialize):
  71. "A version of serialize that memoizes objects to reduce space"
  72. __serialize_fields__ = 'memoized',
  73. def __init__(self, types_to_memoize):
  74. self.types_to_memoize = tuple(types_to_memoize)
  75. self.memoized = Enumerator()
  76. def in_types(self, value):
  77. return isinstance(value, self.types_to_memoize)
  78. def serialize(self):
  79. return _serialize(self.memoized.reversed(), None)
  80. @classmethod
  81. def deserialize(cls, data, namespace, memo):
  82. return _deserialize(data, namespace, memo)
  83. import types
  84. from functools import wraps, partial
  85. def smart_decorator(f, create_decorator):
  86. if isinstance(f, types.FunctionType):
  87. return wraps(f)(create_decorator(f, True))
  88. elif isinstance(f, (type, types.BuiltinFunctionType)):
  89. return wraps(f)(create_decorator(f, False))
  90. elif isinstance(f, types.MethodType):
  91. return wraps(f)(create_decorator(f.__func__, True))
  92. elif isinstance(f, partial):
  93. # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
  94. return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))
  95. else:
  96. return create_decorator(f.__func__.__call__, True)
  97. try:
  98. import regex # type: ignore
  99. except ImportError:
  100. regex = None
  101. import sre_parse
  102. import sre_constants
  103. categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
  104. def get_regexp_width(expr):
  105. if regex:
  106. # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
  107. # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
  108. # match here below.
  109. regexp_final = re.sub(categ_pattern, 'A', expr)
  110. else:
  111. if re.search(categ_pattern, expr):
  112. raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
  113. regexp_final = expr
  114. try:
  115. return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
  116. except sre_constants.error:
  117. if not regex:
  118. raise ValueError(expr)
  119. else:
  120. # sre_parse does not support the new features in regex. To not completely fail in that case,
  121. # we manually test for the most important info (whether the empty string is matched)
  122. c = regex.compile(regexp_final)
  123. if c.match('') is None:
  124. return 1, sre_constants.MAXREPEAT
  125. else:
  126. return 0, sre_constants.MAXREPEAT
  127. ###}
  128. _ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
  129. _ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
  130. def _test_unicode_category(s, categories):
  131. if len(s) != 1:
  132. return all(_test_unicode_category(char, categories) for char in s)
  133. return s == '_' or unicodedata.category(s) in categories
  134. def is_id_continue(s):
  135. """
  136. Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin
  137. numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
  138. """
  139. return _test_unicode_category(s, _ID_CONTINUE)
  140. def is_id_start(s):
  141. """
  142. Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin
  143. numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
  144. """
  145. return _test_unicode_category(s, _ID_START)
  146. def dedup_list(l):
  147. """Given a list (l) will removing duplicates from the list,
  148. preserving the original order of the list. Assumes that
  149. the list entries are hashable."""
  150. dedup = set()
  151. return [x for x in l if not (x in dedup or dedup.add(x))]
  152. class Enumerator(Serialize):
  153. def __init__(self):
  154. self.enums = {}
  155. def get(self, item):
  156. if item not in self.enums:
  157. self.enums[item] = len(self.enums)
  158. return self.enums[item]
  159. def __len__(self):
  160. return len(self.enums)
  161. def reversed(self):
  162. r = {v: k for k, v in self.enums.items()}
  163. assert len(r) == len(self.enums)
  164. return r
  165. def combine_alternatives(lists):
  166. """
  167. Accepts a list of alternatives, and enumerates all their possible concatinations.
  168. Examples:
  169. >>> combine_alternatives([range(2), [4,5]])
  170. [[0, 4], [0, 5], [1, 4], [1, 5]]
  171. >>> combine_alternatives(["abc", "xy", '$'])
  172. [['a', 'x', '$'], ['a', 'y', '$'], ['b', 'x', '$'], ['b', 'y', '$'], ['c', 'x', '$'], ['c', 'y', '$']]
  173. >>> combine_alternatives([])
  174. [[]]
  175. """
  176. if not lists:
  177. return [[]]
  178. assert all(l for l in lists), lists
  179. init = [[x] for x in lists[0]]
  180. return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)
  181. try:
  182. import atomicwrites
  183. except ImportError:
  184. atomicwrites = None # type: ignore
  185. class FS:
  186. exists = os.path.exists
  187. @staticmethod
  188. def open(name, mode="r", **kwargs):
  189. if atomicwrites and "w" in mode:
  190. return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs)
  191. else:
  192. return open(name, mode, **kwargs)
  193. def isascii(s):
  194. """ str.isascii only exists in python3.7+ """
  195. try:
  196. return s.isascii()
  197. except AttributeError:
  198. try:
  199. s.encode('ascii')
  200. return True
  201. except (UnicodeDecodeError, UnicodeEncodeError):
  202. return False
  203. class fzset(frozenset):
  204. def __repr__(self):
  205. return '{%s}' % ', '.join(map(repr, self))
  206. def classify_bool(seq, pred):
  207. true_elems = []
  208. false_elems = []
  209. for elem in seq:
  210. if pred(elem):
  211. true_elems.append(elem)
  212. else:
  213. false_elems.append(elem)
  214. return true_elems, false_elems
  215. def bfs(initial, expand):
  216. open_q = deque(list(initial))
  217. visited = set(open_q)
  218. while open_q:
  219. node = open_q.popleft()
  220. yield node
  221. for next_node in expand(node):
  222. if next_node not in visited:
  223. visited.add(next_node)
  224. open_q.append(next_node)
  225. def bfs_all_unique(initial, expand):
  226. "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions"
  227. open_q = deque(list(initial))
  228. while open_q:
  229. node = open_q.popleft()
  230. yield node
  231. open_q += expand(node)
  232. def _serialize(value, memo):
  233. if isinstance(value, Serialize):
  234. return value.serialize(memo)
  235. elif isinstance(value, list):
  236. return [_serialize(elem, memo) for elem in value]
  237. elif isinstance(value, frozenset):
  238. return list(value) # TODO reversible?
  239. elif isinstance(value, dict):
  240. return {key:_serialize(elem, memo) for key, elem in value.items()}
  241. # assert value is None or isinstance(value, (int, float, str, tuple)), value
  242. return value
  243. def small_factors(n, max_factor):
  244. """
  245. Splits n up into smaller factors and summands <= max_factor.
  246. Returns a list of [(a, b), ...]
  247. so that the following code returns n:
  248. n = 1
  249. for a, b in values:
  250. n = n * a + b
  251. Currently, we also keep a + b <= max_factor, but that might change
  252. """
  253. assert n >= 0
  254. assert max_factor > 2
  255. if n <= max_factor:
  256. return [(n, 0)]
  257. for a in range(max_factor, 1, -1):
  258. r, b = divmod(n, a)
  259. if a + b <= max_factor:
  260. return small_factors(r, max_factor) + [(a, b)]
  261. assert False, "Failed to factorize %s" % n