f34ce41345
GitOrigin-RevId: b73c2221a46c13557b1b3be9c2070cc42cf01eb3
163 lines
6.2 KiB
Python
163 lines
6.2 KiB
Python
import json
|
|
import sys
|
|
|
|
from typing import Dict, Set
|
|
|
|
# this compresses MITM URL lists with Gradle-specific optimizations
|
|
# specifically, it splits each url into up to 3 parts - they will be
|
|
# concatenated like part1/part2.part3 or part1.part2
|
|
# part3 is simply always the file extension, but part1 and part2 is
|
|
# optimized using special heuristics
|
|
# additionally, if part2 ends with /a/b/{a}-{b}, the all occurences of
|
|
# /{a}/{b}/ are replaced with #
|
|
# finally, anything that ends with = is considered SHA256, anything that
|
|
# starts with http is considered a redirect URL, anything else is
|
|
# considered text
|
|
|
|
with open(sys.argv[1], "rt") as f:
|
|
data: dict = json.load(f)
|
|
|
|
new_data: Dict[str, Dict[str, Dict[str, dict]]] = {}
|
|
|
|
for url, info in data.items():
|
|
if url == "!version":
|
|
continue
|
|
ext, base = map(lambda x: x[::-1], url[::-1].split(".", 1))
|
|
if base.endswith(".tar"):
|
|
base = base[:-4]
|
|
ext = "tar." + ext
|
|
# special logic for Maven repos
|
|
if ext in ["jar", "pom", "module"]:
|
|
comps = base.split("/")
|
|
if "-" in comps[-1]:
|
|
# convert base/name/ver/name-ver into base#name/ver
|
|
|
|
filename = comps[-1]
|
|
name = comps[-3]
|
|
basever = comps[-2]
|
|
ver = basever
|
|
is_snapshot = ver.endswith("-SNAPSHOT")
|
|
if is_snapshot:
|
|
ver = ver.removesuffix("-SNAPSHOT")
|
|
if filename.startswith(f"{name}-{ver}"):
|
|
if is_snapshot:
|
|
if filename.startswith(f"{name}-{ver}-SNAPSHOT"):
|
|
ver += "-SNAPSHOT"
|
|
else:
|
|
ver += "-".join(
|
|
filename.removeprefix(f"{name}-{ver}").split("-")[:3]
|
|
)
|
|
comp_end = comps[-1].removeprefix(f"{name}-{ver}")
|
|
else:
|
|
ver, name, comp_end = None, None, None
|
|
if name and ver and (not comp_end or comp_end.startswith("-")):
|
|
base = "/".join(comps[:-1]) + "/"
|
|
base = base.replace(f"/{name}/{basever}/", "#")
|
|
base += f"{name}/{ver}"
|
|
if is_snapshot:
|
|
base += "/SNAPSHOT"
|
|
if comp_end:
|
|
base += "/" + comp_end[1:]
|
|
scheme, rest = base.split("/", 1)
|
|
if scheme not in new_data.keys():
|
|
new_data[scheme] = {}
|
|
if rest not in new_data[scheme].keys():
|
|
new_data[scheme][rest] = {}
|
|
if "hash" in info.keys():
|
|
new_data[scheme][rest][ext] = info["hash"]
|
|
elif "text" in info.keys() and ext == "xml":
|
|
# nix code in fetch-deps.nix will autogenerate metadata xml files groupId
|
|
# is part of the URL, but it can be tricky to parse as we don't know the
|
|
# exact repo base, so take it from the xml and pass it to nix
|
|
xml = "".join(info["text"].split())
|
|
new_data[scheme][rest][ext] = {
|
|
"groupId": xml.split("<groupId>")[1].split("</groupId>")[0],
|
|
}
|
|
if "<release>" in xml:
|
|
new_data[scheme][rest][ext]["release"] = xml.split("<release>")[1].split(
|
|
"</release>"
|
|
)[0]
|
|
if "<latest>" in xml:
|
|
latest = xml.split("<latest>")[1].split("</latest>")[0]
|
|
if latest != new_data[scheme][rest][ext].get("release"):
|
|
new_data[scheme][rest][ext]["latest"] = latest
|
|
if "<lastUpdated>" in xml:
|
|
new_data[scheme][rest][ext]["lastUpdated"] = xml.split("<lastUpdated>")[
|
|
1
|
|
].split("</lastUpdated>")[0]
|
|
else:
|
|
raise Exception("Unsupported key: " + repr(info))
|
|
|
|
# At this point, we have a map by part1 (initially the scheme), part2 (initially a
|
|
# slash-separated string without the scheme and with potential # substitution as
|
|
# seen above), extension.
|
|
# Now, push some segments from "part2" into "part1" like this:
|
|
# https # part1
|
|
# domain1/b # part2
|
|
# domain1/c
|
|
# domain2/a
|
|
# domain2/c
|
|
# ->
|
|
# https/domain1 # part1
|
|
# b # part2
|
|
# c
|
|
# https/domain2 # part1
|
|
# a # part2
|
|
# c
|
|
# This helps reduce the lockfile size because a Gradle project will usually use lots
|
|
# of files from a single Maven repo
|
|
|
|
data = new_data
|
|
changed = True
|
|
while changed:
|
|
changed = False
|
|
new_data = {}
|
|
for part1, info1 in data.items():
|
|
starts: Set[str] = set()
|
|
# by how many bytes the file size will be increased (roughly)
|
|
lose = 0
|
|
# by how many bytes the file size will be reduced (roughly)
|
|
win = 0
|
|
# how many different initial part2 segments there are
|
|
count = 0
|
|
for part2, info2 in info1.items():
|
|
if "/" not in part2:
|
|
# can't push a segment from part2 into part1
|
|
count = 0
|
|
break
|
|
st = part2.split("/", 1)[0]
|
|
if st not in starts:
|
|
lose += len(st) + 1
|
|
count += 1
|
|
starts.add(st)
|
|
win += len(st) + 1
|
|
if count == 0:
|
|
new_data[part1] = info1
|
|
continue
|
|
# only allow pushing part2 segments into path1 if *either*:
|
|
# - the domain isn't yet part of part1
|
|
# - the initial part2 segment is always the same
|
|
if count != 1 and "." in part1:
|
|
new_data[part1] = info1
|
|
continue
|
|
# some heuristics that may or may not work well (originally this was
|
|
# used when the above if wasn't here, but perhaps it's useless now)
|
|
lose += (count - 1) * max(0, len(part1) - 4)
|
|
if win > lose or ("." not in part1 and win >= lose):
|
|
changed = True
|
|
for part2, info2 in info1.items():
|
|
st, part3 = part2.split("/", 1)
|
|
new_part1 = part1 + "/" + st
|
|
if new_part1 not in new_data.keys():
|
|
new_data[new_part1] = {}
|
|
new_data[new_part1][part3] = info2
|
|
else:
|
|
new_data[part1] = info1
|
|
data = new_data
|
|
|
|
new_data["!comment"] = "This is a nixpkgs Gradle dependency lockfile. For more details, refer to the Gradle section in the nixpkgs manual." # type: ignore
|
|
new_data["!version"] = 1 # type: ignore
|
|
|
|
with open(sys.argv[2], "wt") as f:
|
|
json.dump(new_data, f, sort_keys=True, indent=1)
|
|
f.write("\n")
|