lib/abi/abi_parser.py

#!/usr/bin/env python3
# pylint: disable=R0902,R0903,R0911,R0912,R0913,R0914,R0915,R0917,C0302
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
# SPDX-License-Identifier: GPL-2.0

"""
Parse ABI documentation and produce results from it.
"""

from argparse import Namespace
import logging
import os
import re

from pprint import pformat
from random import randrange, seed

# Import Python modules

from helpers import AbiDebug, ABI_DIR


class AbiParser:
    """Main class to parse ABI files"""

    TAGS = r"(what|where|date|kernelversion|contact|description|users)"
    XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)"

    def __init__(self, directory, logger=None,
                 enable_lineno=False, show_warnings=True, debug=0):
        """Stores arguments for the class and initialize class vars"""

        self.directory = directory
        self.enable_lineno = enable_lineno
        self.show_warnings = show_warnings
        self.debug = debug

        if not logger:
            self.log = logging.getLogger("get_abi")
        else:
            self.log = logger

        self.data = {}
        self.what_symbols = {}
        self.file_refs = {}
        self.what_refs = {}

        # Ignore files that contain such suffixes
        self.ignore_suffixes = (".rej", ".org", ".orig", ".bak", "~")

        # Regular expressions used on parser
        self.re_abi_dir = re.compile(r"(.*)" + ABI_DIR)
        self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I)
        self.re_valid = re.compile(self.TAGS)
        self.re_start_spc = re.compile(r"(\s*)(\S.*)")
        self.re_whitespace = re.compile(r"^\s+")

        # Regular used on print
        self.re_what = re.compile(r"(\/?(?:[\w\-]+\/?){1,2})")
        self.re_escape = re.compile(r"([\.\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])")
        self.re_unprintable = re.compile(r"([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff]+)")
        self.re_title_mark = re.compile(r"\n[\-\*\=\^\~]+\n")
        self.re_doc = re.compile(r"Documentation/(?!devicetree)(\S+)\.rst")
        self.re_abi = re.compile(r"(Documentation/ABI/)([\w\/\-]+)")
        self.re_xref_node = re.compile(self.XREF)

    def warn(self, fdata, msg, extra=None):
        """Displays a parse error if warning is enabled"""

        if not self.show_warnings:
            return

        msg = f"{fdata.fname}:{fdata.ln}: {msg}"
        if extra:
            msg += "\n\t\t" + extra

        self.log.warning(msg)

    def add_symbol(self, what, fname, ln=None, xref=None):
        """Create a reference table describing where each 'what' is located"""

        if what not in self.what_symbols:
            self.what_symbols[what] = {"file": {}}

        if fname not in self.what_symbols[what]["file"]:
            self.what_symbols[what]["file"][fname] = []

        if ln and ln not in self.what_symbols[what]["file"][fname]:
            self.what_symbols[what]["file"][fname].append(ln)

        if xref:
            self.what_symbols[what]["xref"] = xref

    def _parse_line(self, fdata, line):
        """Parse a single line of an ABI file"""

        new_what = False
        new_tag = False
        content = None

        match = self.re_tag.match(line)
        if match:
            new = match.group(1).lower()
            sep = match.group(2)
            content = match.group(3)

            match = self.re_valid.search(new)
            if match:
                new_tag = match.group(1)
            else:
                if fdata.tag == "description":
                    # New "tag" is actually part of description.
                    # Don't consider it a tag
                    new_tag = False
                elif fdata.tag != "":
                    self.warn(fdata, f"tag '{fdata.tag}' is invalid", line)

        if new_tag:
            # "where" is Invalid, but was a common mistake. Warn if found
            if new_tag == "where":
                self.warn(fdata, "tag 'Where' is invalid. Should be 'What:' instead")
                new_tag = "what"

            if new_tag == "what":
                fdata.space = None

                if content not in self.what_symbols:
                    self.add_symbol(what=content, fname=fdata.fname, ln=fdata.ln)

                if fdata.tag == "what":
                    fdata.what.append(content.strip("\n"))
                else:
                    if fdata.key:
                        if "description" not in self.data.get(fdata.key, {}):
                            self.warn(fdata, f"{fdata.key} doesn't have a description")

                        for w in fdata.what:
                            self.add_symbol(what=w, fname=fdata.fname,
                                            ln=fdata.what_ln, xref=fdata.key)

                    fdata.label = content
                    new_what = True

                    key = "abi_" + content.lower()
                    fdata.key = self.re_unprintable.sub("_", key).strip("_")

                    # Avoid duplicated keys but using a defined seed, to make
                    # the namespace identical if there aren't changes at the
                    # ABI symbols
                    seed(42)

                    while fdata.key in self.data:
                        char = randrange(0, 51) + ord("A")
                        if char > ord("Z"):
                            char += ord("a") - ord("Z") - 1

                        fdata.key += chr(char)

                    if fdata.key and fdata.key not in self.data:
                        self.data[fdata.key] = {
                            "what": [content],
                            "file": [fdata.file_ref],
                            "path": fdata.ftype,
                            "line_no": fdata.ln,
                        }

                    fdata.what = self.data[fdata.key]["what"]

                self.what_refs[content] = fdata.key
                fdata.tag = new_tag
                fdata.what_ln = fdata.ln

                if fdata.nametag["what"]:
                    t = (content, fdata.key)
                    if t not in fdata.nametag["symbols"]:
                        fdata.nametag["symbols"].append(t)

                return

            if fdata.tag and new_tag:
                fdata.tag = new_tag

                if new_what:
                    fdata.label = ""

                    if "description" in self.data[fdata.key]:
                        self.data[fdata.key]["description"] += "\n\n"

                    if fdata.file_ref not in self.data[fdata.key]["file"]:
                        self.data[fdata.key]["file"].append(fdata.file_ref)

                    if self.debug == AbiDebug.WHAT_PARSING:
                        self.log.debug("what: %s", fdata.what)

                if not fdata.what:
                    self.warn(fdata, "'What:' should come first:", line)
                    return

                if new_tag == "description":
                    fdata.space = None

                    if content:
                        sep = sep.replace(":", " ")

                        c = " " * len(new_tag) + sep + content
                        c = c.expandtabs()

                        match = self.re_start_spc.match(c)
                        if match:
                            # Preserve initial spaces for the first line
                            fdata.space = match.group(1)
                            content = match.group(2) + "\n"

                self.data[fdata.key][fdata.tag] = content

            return

        # Store any contents before tags at the database
        if not fdata.tag and "what" in fdata.nametag:
            fdata.nametag["description"] += line
            return

        if fdata.tag == "description":
            content = line.expandtabs()

            if self.re_whitespace.sub("", content) == "":
                self.data[fdata.key][fdata.tag] += "\n"
                return

            if fdata.space is None:
                match = self.re_start_spc.match(content)
                if match:
                    # Preserve initial spaces for the first line
                    fdata.space = match.group(1)

                    content = match.group(2) + "\n"
            else:
                if content.startswith(fdata.space):
                    content = content[len(fdata.space):]

                else:
                    fdata.space = ""

            if fdata.tag == "what":
                w = content.strip("\n")
                if w:
                    self.data[fdata.key][fdata.tag].append(w)
            else:
                self.data[fdata.key][fdata.tag] += content
            return

        content = line.strip()
        if fdata.tag:
            if fdata.tag == "what":
                w = content.strip("\n")
                if w:
                    self.data[fdata.key][fdata.tag].append(w)
            else:
                self.data[fdata.key][fdata.tag] += "\n" + content.rstrip("\n")
            return

        # Everything else is error
        if content:
            self.warn(fdata, "Unexpected content", line)

    def parse_readme(self, nametag, fname):
        """Parse ABI README file"""

        with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp:
            nametag["description"] = "```\n"
            for line in fp:
                nametag["description"] += "  " + line

            nametag["description"] += "```\n"

    def parse_file(self, fname, path, basename):
        """Parse a single file"""

        ref = f"abi_file_{path}_{basename}"
        ref = self.re_unprintable.sub("_", ref).strip("_")

        # Store per-file state into a namespace variable. This will be used
        # by the per-line parser state machine and by the warning function.
        fdata = Namespace

        fdata.fname = fname
        fdata.name = basename

        pos = fname.find(ABI_DIR)
        if pos > 0:
            f = fname[pos:]
        else:
            f = fname

        fdata.file_ref = (f, ref)
        self.file_refs[f] = ref

        fdata.ln = 0
        fdata.what_ln = 0
        fdata.tag = ""
        fdata.label = ""
        fdata.what = []
        fdata.key = None
        fdata.xrefs = None
        fdata.space = None
        fdata.ftype = path.split("/")[0]

        fdata.nametag = {}
        fdata.nametag["what"] = [f"File {path}/{basename}"]
        fdata.nametag["type"] = "File"
        fdata.nametag["path"] = fdata.ftype
        fdata.nametag["file"] = [fdata.file_ref]
        fdata.nametag["line_no"] = 1
        fdata.nametag["description"] = ""
        fdata.nametag["symbols"] = []

        self.data[ref] = fdata.nametag

        if self.debug & AbiDebug.WHAT_OPEN:
            self.log.debug("Opening file %s", fname)

        if basename == "README":
            self.parse_readme(fdata.nametag, fname)
            return

        with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp:
            for line in fp:
                fdata.ln += 1

                self._parse_line(fdata, line)

            if "description" in fdata.nametag:
                fdata.nametag["description"] = fdata.nametag["description"].lstrip("\n")

            if fdata.key:
                if "description" not in self.data.get(fdata.key, {}):
                    self.warn(fdata, f"{fdata.key} doesn't have a description")

                for w in fdata.what:
                    self.add_symbol(what=w, fname=fname, xref=fdata.key)

    def _parse_abi(self, root=None):
        """Internal function to parse documentation ABI recursively"""

        if not root:
            root = self.directory

        with os.scandir(root) as obj:
            for entry in obj:
                name = os.path.join(root, entry.name)

                if entry.is_dir():
                    self._parse_abi(name)
                    continue

                if not entry.is_file():
                    continue

                basename = os.path.basename(name)

                if basename.startswith("."):
                    continue

                if basename.endswith(self.ignore_suffixes):
                    continue

                path = self.re_abi_dir.sub("", os.path.dirname(name))

                self.parse_file(name, path, basename)

    def parse_abi(self, root=None):
        """Parse documentation ABI"""

        self._parse_abi(root)

        if self.debug & AbiDebug.DUMP_ABI_STRUCTS:
            self.log.debug(pformat(self.data))

    def desc_txt(self, desc):
        """Print description as found inside ABI files"""

        desc = desc.strip(" \t\n")

        return desc + "\n\n"

    def desc_rst(self, desc):
        """Enrich ReST output by creating cross-references"""

        # Remove title markups from the description
        # Having titles inside ABI files will only work if extra
        # care would be taken in order to strictly follow the same
        # level order for each markup.
        desc = self.re_title_mark.sub("\n\n", "\n" + desc)
        desc = desc.rstrip(" \t\n").lstrip("\n")

        # Python's regex performance for non-compiled expressions is a lot
        # than Perl, as Perl automatically caches them at their
        # first usage. Here, we'll need to do the same, as otherwise the
        # performance penalty is be high

        new_desc = ""
        for d in desc.split("\n"):
            if d == "":
                new_desc += "\n"
                continue

            # Use cross-references for doc files where needed
            d = self.re_doc.sub(r":doc:`/\1`", d)

            # Use cross-references for ABI generated docs where needed
            matches = self.re_abi.findall(d)
            for m in matches:
                abi = m[0] + m[1]

                xref = self.file_refs.get(abi)
                if not xref:
                    # This may happen if ABI is on a separate directory,
                    # like parsing ABI testing and symbol is at stable.
                    # The proper solution is to move this part of the code
                    # for it to be inside sphinx/kernel_abi.py
                    self.log.info("Didn't find ABI reference for '%s'", abi)
                else:
                    new = self.re_escape.sub(r"\\\1", m[1])
                    d = re.sub(fr"\b{abi}\b", f":ref:`{new} <{xref}>`", d)

            # Seek for cross reference symbols like /sys/...
            # Need to be careful to avoid doing it on a code block
            if d[0] not in [" ", "\t"]:
                matches = self.re_xref_node.findall(d)
                for m in matches:
                    # Finding ABI here is more complex due to wildcards
                    xref = self.what_refs.get(m)
                    if xref:
                        new = self.re_escape.sub(r"\\\1", m)
                        d = re.sub(fr"\b{m}\b", f":ref:`{new} <{xref}>`", d)

            new_desc += d + "\n"

        return new_desc + "\n\n"

    def doc(self, output_in_txt=False, show_symbols=True, show_file=True,
            filter_path=None):
        """Print ABI at stdout"""

        part = None
        for key, v in sorted(self.data.items(),
                             key=lambda x: (x[1].get("type", ""),
                                            x[1].get("what"))):

            wtype = v.get("type", "Symbol")
            file_ref = v.get("file")
            names = v.get("what", [""])

            if wtype == "File":
                if not show_file:
                    continue
            else:
                if not show_symbols:
                    continue

            if filter_path:
                if filter_path == "README":
                    if not names[0].endswith("README"):
                        continue
                else:
                    if v.get("path") != filter_path:
                        continue

            msg = ""

            if wtype != "File":
                cur_part = names[0]
                if cur_part.find("/") >= 0:
                    match = self.re_what.match(cur_part)
                    if match:
                        symbol = match.group(1).rstrip("/")
                        cur_part = "Symbols under " + symbol

                if cur_part and cur_part != part:
                    part = cur_part
                    msg += f"{part}\n{"-" * len(part)}\n\n"

                msg += f".. _{key}:\n\n"

                max_len = 0
                for i in range(0, len(names)):           # pylint: disable=C0200
                    names[i] = "**" + self.re_escape.sub(r"\\\1", names[i]) + "**"

                    max_len = max(max_len, len(names[i]))

                msg += "+-" + "-" * max_len + "-+\n"
                for name in names:
                    msg += f"| {name}" + " " * (max_len - len(name)) + " |\n"
                    msg += "+-" + "-" * max_len + "-+\n"
                msg += "\n"

            for ref in file_ref:
                if wtype == "File":
                    msg += f".. _{ref[1]}:\n\n"
                else:
                    base = os.path.basename(ref[0])
                    msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n"

            if wtype == "File":
                msg += f"{names[0]}\n{"-" * len(names[0])}\n\n"

            desc = v.get("description")
            if not desc and wtype != "File":
                msg += f"DESCRIPTION MISSING for {names[0]}\n\n"

            if desc:
                if output_in_txt:
                    msg += self.desc_txt(desc)
                else:
                    msg += self.desc_rst(desc)

            symbols = v.get("symbols")
            if symbols:
                msg += "Has the following ABI:\n\n"

                for w, label in symbols:
                    # Escape special chars from content
                    content = self.re_escape.sub(r"\\\1", w)

                    msg += f"- :ref:`{content} <{label}>`\n\n"

            users = v.get("users")
            if users and users.strip(" \t\n"):
                msg += f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n\n"

            ln = v.get("line_no", 1)

            yield (msg, file_ref[0][0], ln)

    def check_issues(self):
        """Warn about duplicated ABI entries"""

        for what, v in self.what_symbols.items():
            files = v.get("file")
            if not files:
                # Should never happen if the parser works properly
                self.log.warning("%s doesn't have a file associated", what)
                continue

            if len(files) == 1:
                continue

            f = []
            for fname, lines in sorted(files.items()):
                if not lines:
                    f.append(f"{fname}")
                elif len(lines) == 1:
                    f.append(f"{fname}:{lines[0]}")
                else:
                    f.append(f"{fname} lines {", ".join(str(x) for x in lines)}")

            self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f))

    def search_symbols(self, expr):
        """ Searches for ABI symbols """

        regex = re.compile(expr, re.I)

        found_keys = 0
        for t in sorted(self.data.items(), key=lambda x: [0]):
            v = t[1]

            wtype = v.get("type", "")
            if wtype == "File":
                continue

            for what in v.get("what", [""]):
                if regex.search(what):
                    found_keys += 1

                    kernelversion = v.get("kernelversion", "").strip(" \t\n")
                    date = v.get("date", "").strip(" \t\n")
                    contact = v.get("contact", "").strip(" \t\n")
                    users = v.get("users", "").strip(" \t\n")
                    desc = v.get("description", "").strip(" \t\n")

                    files = []
                    for f in v.get("file", ()):
                        files.append(f[0])

                    what = str(found_keys) + ". " + what
                    title_tag = "-" * len(what)

                    print(f"\n{what}\n{title_tag}\n")

                    if kernelversion:
                        print(f"Kernel version:\t\t{kernelversion}")

                    if date:
                        print(f"Date:\t\t\t{date}")

                    if contact:
                        print(f"Contact:\t\t{contact}")

                    if users:
                        print(f"Users:\t\t\t{users}")

                    print(f"Defined on file{'s'[:len(files) ^ 1]}:\t{", ".join(files)}")

                    if desc:
                        print(f"\n{desc.strip("\n")}\n")

        if not found_keys:
            print(f"Regular expression /{expr}/ not found.")