linux-firmware/check_whence.py
Emil Velikov ee8c336ab3 copy-firmware.sh: flesh out and fix dedup-firmware.sh
Flesh out the de-duplication logic in separate script. The copy-firmware.sh is
already complex enough and de-duplication doesn't really fit in there.

In the process we migrate away from the open-coded `ln --relative`. We also
avoid touching symlinks, which are not created by rdfind. Otherwise we end up
"fixing" the folder to folder symlinks (created earlier in the process) and
things explode.

As result we also get a few bonuses:
 - the COPYOPTS shell injection is gone - the variable was never used
 - people can dedup as separate step if/when they choose to do so

Aside: based on the noise in git log and around distros ... I'm wondering if
having the de-duplication as opt-in, would have been better. Is it too late to
change or the ship has sailed?

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
2024-10-10 14:33:32 +00:00

170 lines
5.5 KiB
Python
Executable File

#!/usr/bin/python3
import os, re, sys
from io import open
def list_whence():
with open("WHENCE", encoding="utf-8") as whence:
for line in whence:
match = re.match(r'(?:RawFile|File|Source):\s*"(.*)"', line)
if match:
yield match.group(1)
continue
match = re.match(r"(?:RawFile|File|Source):\s*(\S*)", line)
if match:
yield match.group(1)
continue
match = re.match(
r"Licen[cs]e: (?:.*\bSee (.*) for details\.?|(\S*))\n", line
)
if match:
if match.group(1):
for name in re.split(r", | and ", match.group(1)):
yield name
continue
if match.group(2):
# Just one word - may or may not be a filename
if not re.search(
r"unknown|distributable", match.group(2), re.IGNORECASE
):
yield match.group(2)
continue
def list_whence_files():
with open("WHENCE", encoding="utf-8") as whence:
for line in whence:
match = re.match(r"(?:RawFile|File):\s*(.*)", line)
if match:
yield match.group(1).replace(r"\ ", " ").replace('"', "")
continue
def list_links_list():
with open("WHENCE", encoding="utf-8") as whence:
for line in whence:
match = re.match(r"Link:\s*(.*)", line)
if match:
linkname, target = match.group(1).split("->")
linkname = linkname.strip().replace(r"\ ", " ").replace('"', "")
target = target.strip().replace(r"\ ", " ").replace('"', "")
# Link target is relative to the link
target = os.path.join(os.path.dirname(linkname), target)
target = os.path.normpath(target)
yield (linkname, target)
continue
def list_git():
with os.popen("git ls-files") as git_files:
for line in git_files:
yield line.rstrip("\n")
def main():
ret = 0
whence_list = list(list_whence())
whence_files = list(list_whence_files())
links_list = list(list_links_list())
whence_links = list(zip(*links_list))[0]
known_files = set(name for name in whence_list if not name.endswith("/")) | set(
[
".codespell.cfg",
".editorconfig",
".gitignore",
".gitlab-ci.yml",
".pre-commit-config.yaml",
"Dockerfile",
"Makefile",
"README.md",
"WHENCE",
"build_packages.py",
"check_whence.py",
"configure",
"contrib/process_linux_firmware.py",
"contrib/templates/debian.changelog",
"contrib/templates/debian.control",
"contrib/templates/debian.copyright",
"contrib/templates/rpm.spec",
"copy-firmware.sh",
"dedup-firmware.sh",
]
)
known_prefixes = set(name for name in whence_list if name.endswith("/"))
git_files = set(list_git())
for name in set(name for name in whence_files if name.endswith("/")):
sys.stderr.write("E: %s listed in WHENCE as File, but is directory\n" % name)
ret = 1
for name in set(name for name in whence_files if whence_files.count(name) > 1):
sys.stderr.write("E: %s listed in WHENCE twice\n" % name)
ret = 1
for name in set(link for link in whence_links if whence_links.count(link) > 1):
sys.stderr.write("E: %s listed in WHENCE twice\n" % name)
ret = 1
for name in set(file for file in whence_files if os.path.islink(file)):
sys.stderr.write("E: %s listed in WHENCE as File, but is a symlink\n" % name)
ret = 1
for name in set(link[0] for link in links_list if os.path.islink(link[0])):
sys.stderr.write("E: %s listed in WHENCE as Link, is in tree\n" % name)
ret = 1
invalid_targets = set(link[0] for link in links_list)
for link, target in sorted(links_list):
if target in invalid_targets:
sys.stderr.write(
"E: target %s of link %s is also a link\n" % (target, link)
)
ret = 1
for name in sorted(list(known_files - git_files)):
sys.stderr.write("E: %s listed in WHENCE does not exist\n" % name)
ret = 1
# A link can point to a file...
valid_targets = set(git_files)
# ... or to a directory
for target in set(valid_targets):
dirname = target
while True:
dirname = os.path.dirname(dirname)
if dirname == "":
break
valid_targets.add(dirname)
for link, target in sorted(links_list):
if target not in valid_targets:
sys.stderr.write(
"E: target %s of link %s in WHENCE" " does not exist\n" % (target, link)
)
ret = 1
for name in sorted(list(git_files - known_files)):
# Ignore subdirectory changelogs and GPG detached signatures
if name.endswith("/ChangeLog") or (
name.endswith(".asc") and name[:-4] in known_files
):
continue
# Ignore unknown files in known directories
for prefix in known_prefixes:
if name.startswith(prefix):
break
else:
sys.stderr.write("E: %s not listed in WHENCE\n" % name)
ret = 1
return ret
if __name__ == "__main__":
sys.exit(main())