Convert generate-NOTICE.py to Python 3, fix name.
Python module names should be lower case and not use hyphens (the
former is a convention, the latter is a requirement for importable
modules).
Also updates the shell script to always use Python 3 so we don't need
to maintain Python 2 compatibility.
Test: repo upload, in both a python 2 and python 3 virtualenv
Bug: None
Change-Id: I486e54a12686b4e528dc6c9c47af5c7a52a7b790
diff --git a/libc/tools/generate_notice.py b/libc/tools/generate_notice.py
new file mode 100755
index 0000000..e0e6b32
--- /dev/null
+++ b/libc/tools/generate_notice.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+# Run with directory arguments from any directory, with no special setup
+# required.
+
+import os
+from pathlib import Path
+import re
+import sys
+from typing import Sequence
+
+# When True, warn_verbose() forwards its messages to warn(); when False they
+# are dropped.
+VERBOSE = False
+
+# Cleaned-up copyright notice texts collected while scanning. A set, so a
+# notice that appears in several files is stored only once.
+copyrights = set()
+
+
+def warn(s):
+    """Write *s* to standard error as a single "warning: ..." line."""
+    message = "warning: %s\n" % s
+    sys.stderr.write(message)
+
+
+def warn_verbose(s):
+    """Forward *s* to warn(), but only while the VERBOSE flag is set."""
+    if not VERBOSE:
+        return
+    warn(s)
+
+
+def is_interesting(path_str: str) -> bool:
+    """Return whether the file at *path_str* should be scanned for notices.
+
+    The path is lower-cased before matching, so the check ignores case.
+    A file is skipped when its extension is on the skip list or when its
+    name is "notice", "readme" or "pylintrc".
+    """
+    skipped_suffixes = (
+        ".bp",
+        ".map",
+        ".md",
+        ".mk",
+        ".py",
+        ".pyc",
+        ".swp",
+        ".txt",
+    )
+    skipped_names = ("notice", "readme", "pylintrc")
+
+    candidate = Path(path_str.lower())
+    skip = (
+        candidate.suffix in skipped_suffixes
+        or candidate.name in skipped_names
+    )
+    return not skip
+
+
+def is_auto_generated(content):
+    """Return True if *content* contains one of the known generator markers.
+
+    The markers are matched as exact, case-sensitive substrings.
+    """
+    generator_markers = (
+        "Generated by gensyscalls.py",
+        "generated by genserv.py",
+        "This header was automatically generated from a Linux kernel header",
+    )
+    return any(marker in content for marker in generator_markers)
+
+
+def is_copyright_end(line: str, first_line_was_hash: bool) -> bool:
+    """Return True if *line* terminates the copyright header being read.
+
+    Args:
+        line: The candidate line.
+        first_line_was_hash: Whether the header's copyright line started
+            with "#". For such headers an empty line also ends the header.
+    """
+    if first_line_was_hash and len(line) == 0:
+        return True
+
+    terminators = (
+        " $FreeBSD: ",
+        "$Citrus$",
+        "$FreeBSD$",
+        "*/",
+        "From: @(#)",
+        # OpenBSD likes to say where stuff originally came from:
+        "Original version ID:",
+        "\t$Citrus: ",
+        "\t$NetBSD: ",
+        "\t$OpenBSD: ",
+        "\t@(#)",
+        "\tcitrus Id: ",
+        "\tfrom: @(#)",
+        "from OpenBSD:",
+    )
+    return any(marker in line for marker in terminators)
+
+
+def extract_copyright_at(lines: Sequence[str], i: int) -> int:
+    """Record the copyright header around ``lines[i]`` and say where to resume.
+
+    The header runs from the line after the nearest preceding "/*" (or from
+    ``lines[i]`` itself when that line starts with "#") up to, but not
+    including, the first line that is_copyright_end() accepts. Comment
+    decoration is stripped and the remaining text is added to the
+    module-level ``copyrights`` set. A header that ends up with no text at
+    all is not recorded.
+
+    Args:
+        lines: All lines of the file being scanned.
+        i: Index of the line that contains "Copyright".
+
+    Returns:
+        The index at which the caller should continue scanning. It is always
+        greater than the *i* passed in, even when ``lines[i]`` is itself a
+        terminating line, so a caller looping on the result makes progress.
+    """
+    first = i
+    first_line_was_hash = lines[first].startswith("#")
+
+    # Do we need to back up to find the start of the copyright header?
+    start = first
+    if not first_line_was_hash:
+        while start > 0 and "/*" not in lines[start - 1]:
+            start -= 1
+
+    # Read comment lines until we hit something that terminates a
+    # copyright header.
+    while i < len(lines) and not is_copyright_end(
+        lines[i], first_line_was_hash
+    ):
+        i += 1
+
+    end = i
+
+    # Trim trailing cruft.
+    trailing_cruft = {
+        " *", " * ===================================================="
+    }
+    while end > 0 and lines[end - 1] in trailing_cruft:
+        end -= 1
+
+    # Remove C/assembler comment formatting, pulling out just the text.
+    clean_lines = []
+    for line in lines[start:end]:
+        line = line.replace("\t", " ")
+        line = line.replace("/* ", "")
+        line = re.sub(r"^ \* ", "", line)
+        line = line.replace("** ", "")
+        line = line.replace("# ", "")
+        if "SPDX-License-Identifier:" in line:
+            continue
+        if line.startswith("++Copyright++"):
+            continue
+        line = line.replace("--Copyright--", "")
+        line = line.rstrip()
+        # These come last and take care of "blank" comment lines.
+        if line in {"#", " *", "**", "-"}:
+            line = ""
+        clean_lines.append(line)
+
+    # Trim blank lines from head and tail. Work with bounds-checked indices:
+    # the list can be empty or all blank (for example a one-line
+    # "/* Copyright ... */" comment, which terminates on its own line), and
+    # indexing element 0 unconditionally would raise IndexError.
+    head = 0
+    tail = len(clean_lines)
+    while head < tail and clean_lines[head] == "":
+        head += 1
+    while tail > head and clean_lines[tail - 1] == "":
+        tail -= 1
+    clean_lines = clean_lines[head:tail]
+
+    if clean_lines:
+        copyrights.add("\n".join(clean_lines))
+
+    # If lines[first] was itself a terminator, i never moved. Handing the
+    # same index back would make the caller match this line again forever.
+    return max(i, first + 1)
+
+
+def do_file(path: str) -> None:
+    """Collect the copyright notices found in the file at *path*.
+
+    The file is decoded as UTF-8, falling back to ISO-8859-1 (with a
+    warning) when that fails. Files of four lines or fewer, auto-generated
+    files, and files without the word "Copyright" are skipped; the last
+    case produces a warning unless the file mentions "public domain".
+
+    Args:
+        path: Path of the file to scan.
+    """
+    raw = Path(path).read_bytes()
+    try:
+        content = raw.decode("utf-8")
+    except UnicodeDecodeError:
+        warn("bad UTF-8 in %s" % path)
+        # ISO-8859-1 maps every byte to a character, so this cannot fail.
+        content = raw.decode("iso-8859-1")
+
+    lines = content.split("\n")
+
+    if len(lines) <= 4:
+        warn_verbose("ignoring short file %s" % path)
+        return
+
+    if is_auto_generated(content):
+        warn_verbose("ignoring auto-generated file %s" % path)
+        return
+
+    if "Copyright" not in content:
+        if "public domain" in content.lower():
+            warn_verbose("ignoring public domain file %s" % path)
+            return
+        warn('no copyright notice found in "%s" (%d lines)' %
+             (path, len(lines)))
+        return
+
+    # Manually iterate because extract_copyright_at tells us how many lines to
+    # skip.
+    i = 0
+    while i < len(lines):
+        if "Copyright" in lines[i] and "@(#) Copyright" not in lines[i]:
+            # Never resume at or before the line just handled. If the
+            # extractor returns the index it was given, the same line would
+            # match again and this loop would never finish.
+            i = max(extract_copyright_at(lines, i), i + 1)
+        else:
+            i += 1
+
+
+def do_dir(arg):
+    """Scan every interesting file below the directory *arg*.
+
+    The tree is walked top-down. ".git" directories are not entered, and
+    sub-directories and files are visited in sorted order so that warnings
+    come out in the same order on every run.
+
+    Args:
+        arg: Path of the directory to walk.
+    """
+    for directory, sub_directories, filenames in os.walk(arg):
+        # os.walk only honours changes made to this list in place. Rebinding
+        # the name to a sorted copy would leave the traversal order
+        # untouched, so prune ".git" and sort through slice assignment.
+        sub_directories[:] = sorted(
+            name for name in sub_directories if name != ".git"
+        )
+
+        for filename in sorted(filenames):
+            path = os.path.join(directory, filename)
+            if is_interesting(path):
+                do_file(path)
+
+
+def main() -> None:
+    """Scan the paths given on the command line and print the notices found.
+
+    With no arguments the current directory is scanned. Directories are
+    walked with do_dir(); any other argument is handed to do_file(). Each
+    collected notice is then printed in sorted order, followed by a blank
+    line, a 67-character dashed rule and another blank line.
+    """
+    targets = sys.argv[1:] or ["."]
+
+    for target in targets:
+        handler = do_dir if os.path.isdir(target) else do_file
+        handler(target)
+
+    separator = "-" * 67
+    for notice in sorted(copyrights):
+        print(notice)
+        print()
+        print(separator)
+        print()
+
+
+# Run main() only when this file is executed as a script, not when it is
+# imported as a module.
+if __name__ == "__main__":
+ main()