Convert generate-NOTICE.py to Python 3, fix name.

Python module names should be lower case and not use hyphens (the
former is a convention, the latter is a requirement for importable
modules).

Also updates the shell script to always use Python 3 so we don't need
to maintain Python 2 compatibility.

Test: repo upload, in both a Python 2 and a Python 3 virtualenv
Bug: None
Change-Id: I486e54a12686b4e528dc6c9c47af5c7a52a7b790
diff --git a/libc/tools/generate_notice.py b/libc/tools/generate_notice.py
new file mode 100755
index 0000000..e0e6b32
--- /dev/null
+++ b/libc/tools/generate_notice.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# Run with directory arguments from any directory, with no special setup
+# required.
+
+import os
+from pathlib import Path
+import re
+import sys
+from typing import Sequence
+
+VERBOSE = False  # When true, warn_verbose() forwards its messages to warn().
+
+copyrights = set()  # Distinct notice texts, added by extract_copyright_at().
+
+
+def warn(s):  # Warnings go to stderr; main() prints the notices on stdout.
+    print("warning: %s" % s, file=sys.stderr)
+
+
+def warn_verbose(s):  # Same as warn(), but silent unless VERBOSE is set.
+    if VERBOSE:
+        warn(s)
+
+
+def is_interesting(path_str: str) -> bool:
+    """Return True unless the path's suffix or file name is on a skip list.
+
+    The path is lower-cased first, so both checks are case-insensitive.
+    Only the final suffix counts (Path.suffix), e.g. ".txt" of "a.b.txt".
+    """
+    candidate = Path(path_str.lower())
+    # Suffixes of files that are never scanned.
+    skipped_suffixes = (
+        ".bp", ".map", ".md", ".mk",
+        ".py", ".pyc", ".swp", ".txt",
+    )
+    # Exact (lower-cased) file names that are never scanned.
+    skipped_names = ("notice", "readme", "pylintrc")
+    if candidate.suffix in skipped_suffixes:
+        return False
+    return candidate.name not in skipped_names
+
+
+def is_auto_generated(content):
+    """Return True if *content* contains a known generated-file banner."""
+    return any(banner in content for banner in (
+        "Generated by gensyscalls.py", "generated by genserv.py",
+        "This header was automatically generated from a Linux kernel header",
+    ))
+
+
+def is_copyright_end(line: str, first_line_was_hash: bool) -> bool:
+    """Return True if *line* terminates a copyright header.
+
+    A blank line ends a header that began with "#"; for any header, a
+    line containing one of the markers below ends it.
+    """
+    if first_line_was_hash and not line:
+        return True
+    terminators = (
+        " $FreeBSD: ",
+        "$Citrus$",
+        "$FreeBSD$",
+        "*/",
+        "From: @(#)",
+        # OpenBSD likes to say where stuff originally came from:
+        "Original version ID:",
+        "\t$Citrus: ",
+        "\t$NetBSD: ",
+        "\t$OpenBSD: ",
+        "\t@(#)",
+        "\tcitrus Id: ",
+        "\tfrom: @(#)",
+        "from OpenBSD:",
+    )
+    return any(marker in line for marker in terminators)
+
+
+def extract_copyright_at(lines: Sequence[str], i: int) -> int:
+    """Record the copyright header around lines[i] in the `copyrights` set.
+
+    Args:
+        lines: Lines of the file being scanned.
+        i: Index of a line mentioning "Copyright".
+
+    Returns:
+        Index at which to resume scanning; always greater than *i*, so the
+        caller's loop is guaranteed to make progress.
+    """
+    first_line_was_hash = lines[i].startswith("#")
+
+    # C-style headers: back up to just after the line that opens the comment.
+    start = origin = i
+    if not first_line_was_hash:
+        while start > 0 and "/*" not in lines[start - 1]:
+            start -= 1
+
+    # Advance to the line that terminates the copyright header.
+    while i < len(lines) and not is_copyright_end(lines[i], first_line_was_hash):
+        i += 1
+
+    # Trim trailing cruft.
+    cruft = {" *", " * ===================================================="}
+    end = i
+    while end > 0 and lines[end - 1] in cruft:
+        end -= 1
+
+    # Remove C/assembler comment formatting, pulling out just the text.
+    clean_lines = []
+    for line in lines[start:end]:
+        line = line.replace("\t", "    ")
+        line = line.replace("/* ", "")
+        line = re.sub(r"^ \* ", "", line)
+        line = line.replace("** ", "")
+        line = line.replace("# ", "")
+        if "SPDX-License-Identifier:" in line:
+            continue
+        if line.startswith("++Copyright++"):
+            continue
+        line = line.replace("--Copyright--", "")
+        line = line.rstrip()
+        # These come last and take care of "blank" comment lines.
+        if line in {"#", " *", "**", "-"}:
+            line = ""
+        clean_lines.append(line)
+
+    # Trim blank lines from head and tail; the list may end up empty.
+    while clean_lines and clean_lines[0] == "":
+        clean_lines.pop(0)
+    while clean_lines and clean_lines[-1] == "":
+        clean_lines.pop()
+    if clean_lines:  # Never record an empty notice.
+        copyrights.add("\n".join(clean_lines))
+
+    return max(i, origin + 1)
+
+
+def do_file(path: str) -> None:
+    """Scan one file and record every copyright header found in it.
+
+    Short, auto-generated and public-domain files are skipped; any other
+    file that lacks the word "Copyright" produces a warning.
+    """
+    raw = Path(path).read_bytes()
+    try:
+        content = raw.decode("utf-8")
+    except UnicodeDecodeError:
+        warn("bad UTF-8 in %s" % path)
+        # iso-8859-1 maps every byte to a character, so this cannot fail.
+        content = raw.decode("iso-8859-1")
+    lines = content.split("\n")
+    if len(lines) <= 4:
+        warn_verbose("ignoring short file %s" % path)
+        return
+    if is_auto_generated(content):
+        warn_verbose("ignoring auto-generated file %s" % path)
+        return
+    if "Copyright" not in content:
+        if "public domain" in content.lower():
+            warn_verbose("ignoring public domain file %s" % path)
+            return
+        warn('no copyright notice found in "%s" (%d lines)' % (path, len(lines)))
+        return
+
+    # extract_copyright_at returns where to resume; never let it stall the scan.
+    i = 0
+    while i < len(lines):
+        if "Copyright" in lines[i] and "@(#) Copyright" not in lines[i]:
+            i = max(extract_copyright_at(lines, i), i + 1)
+        else:
+            i += 1
+
+
+def do_dir(arg):
+    """Scan every interesting file under *arg*, visiting entries in sorted order."""
+    for directory, sub_directories, filenames in os.walk(arg):
+        # os.walk only honours in-place changes to the list it yields, so
+        # prune .git and sort via slice assignment rather than rebinding.
+        sub_directories[:] = sorted(d for d in sub_directories if d != ".git")
+        for filename in sorted(filenames):
+            path = os.path.join(directory, filename)
+            if is_interesting(path):
+                do_file(path)
+
+
+def main() -> None:
+    """Scan the paths given on the command line and print the notices found.
+
+    With no arguments the current directory is scanned.  Each distinct
+    notice is printed in sorted order, followed by a 67-dash separator.
+    """
+    for target in sys.argv[1:] or ["."]:
+        scan = do_dir if os.path.isdir(target) else do_file
+        scan(target)
+
+    separator = "-" * 67
+    for notice in sorted(copyrights):
+        print(notice)
+        print()
+        print(separator)
+        print()
+
+
+if __name__ == "__main__":  # Run only when executed as a script, not on import.
+    main()