xref: /linux/tools/lib/python/kdoc/parse_data_structs.py (revision e68c84b9f3ba138878581a9f36a02c67d2ae20d4)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
4# pylint: disable=R0912,R0915
5
6"""
7Parse a source file or header, creating ReStructured Text cross references.
8
9It accepts an optional file to change the default symbol reference or to
10suppress symbols from the output.
11
It is capable of identifying ``define``, function, ``struct``, ``typedef``,
``enum`` and ``enum`` value symbols, creating cross-references for all of
them. It is also capable of distinguishing ``#define`` lines used for
specifying a Linux ioctl.
16
17The optional rules file contains a set of rules like::
18
19    ignore ioctl VIDIOC_ENUM_FMT
20    replace ioctl VIDIOC_DQBUF vidioc_qbuf
21    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
22"""
23
24import os
25import re
26import sys
27
28
29class ParseDataStructs:
30    """
31    Creates an enriched version of a Kernel header file with cross-links
32    to each C data structure type.
33
34    It is meant to allow having a more comprehensive documentation, where
35    uAPI headers will create cross-reference links to the code.
36
    It is capable of identifying ``define``, function, ``struct``, ``typedef``,
    ``enum`` and ``enum`` value symbols, creating cross-references for all of
    them. It is also capable of distinguishing ``#define`` lines used for
    specifying a Linux ioctl.
41
    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such a file contains a set of rules
    using the syntax below:
45
46    1. Ignore rules::
47
        ignore <type> <symbol>

       Removes the symbol from reference generation.
51
52    2. Replace rules::
53
54        replace <type> <old_symbol> <new_reference>
55
       Replaces old_symbol with a new reference. The new_reference can be:
57
58        - A simple symbol name;
59        - A full Sphinx reference.
60
61    3. Namespace rules::
62
63        namespace <namespace>
64
65       Sets C namespace to be used during cross-reference generation. Can
66       be overridden by replace rules.
67
68    On ignore and replace rules, ``<type>`` can be:
        - ``ioctl``: for defines whose value starts with ``_IO``, e.g. ioctl definitions
70        - ``define``: for other defines
71        - ``symbol``: for symbols defined within enums;
72        - ``typedef``: for typedefs;
73        - ``enum``: for the name of a non-anonymous enum;
74        - ``struct``: for structs.
75
76    Examples::
77
78        ignore define __LINUX_MEDIA_H
79        ignore ioctl VIDIOC_ENUM_FMT
80        replace ioctl VIDIOC_DQBUF vidioc_qbuf
81        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
82
83        namespace MC
84    """
85
    #: Parser regex with multiple ways to capture enums.
    #: Every pattern captures the enum name as group 1, so anonymous enums
    #: are not matched by any of them.
    RE_ENUMS = [
        # "enum <name> {"
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        # "enum <name>" alone on a line
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        # "typedef enum <name> {"
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        # "typedef enum <name>" alone on a line
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]

    #: Parser regex with multiple ways to capture structs.
    #: Every pattern captures the struct name as group 1. The name part
    #: needs at least two characters and, unlike the enum patterns, the
    #: name-only variants do not accept trailing whitespace.
    RE_STRUCTS = [
        # "struct <name> {"
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        # "struct <name>" alone on a line
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        # "typedef struct <name> {"
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        # "typedef struct <name>" alone on a line
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]
101
    # NOTE: the original code was written long before the Sphinx C domain
    # supported multiple namespaces. To avoid too much churn in the
    # existing hyperlinks, the code kept using "c:type" instead of the
    # right types. To change that, we need to change the types not only
    # here, but also at the uAPI media documentation.

    #: Dictionary containing C type identifiers to be transformed.
    #:
    #: Keys are the symbol types accepted by ``store_type()`` and by the
    #: exceptions file. For each type:
    #:
    #: - ``prefix`` / ``suffix``: text placed around the generated reference;
    #: - ``ref_type``: Sphinx role used to build the reference;
    #: - ``description``: section title used by ``gen_toc()``.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }
149
150    def __init__(self, debug: bool = False):
151        """Initialize internal vars"""
152        self.debug = debug
153        self.data = ""
154
155        self.symbols = {}
156
157        self.namespace = None
158        self.ignore = []
159        self.replace = []
160
161        for symbol_type in self.DEF_SYMBOL_TYPES:
162            self.symbols[symbol_type] = {}
163
164    def read_exceptions(self, fname: str):
165        """
166        Read an optional exceptions file, used to override defaults.
167        """
168
169        if not fname:
170            return
171
172        name = os.path.basename(fname)
173
174        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
175            for ln, line in enumerate(f):
176                ln += 1
177                line = line.strip()
178                if not line or line.startswith("#"):
179                    continue
180
181                # ignore rules
182                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
183
184                if match:
185                    self.ignore.append((ln, match.group(1), match.group(2)))
186                    continue
187
188                # replace rules
189                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
190                if match:
191                    self.replace.append((ln, match.group(1), match.group(2),
192                                         match.group(3)))
193                    continue
194
195                match = re.match(r"^namespace\s+(\S+)", line)
196                if match:
197                    self.namespace = match.group(1)
198                    continue
199
200                sys.exit(f"{name}:{ln}: invalid line: {line}")
201
202    def apply_exceptions(self):
203        """
204        Process exceptions file with rules to ignore or replace references.
205        """
206
207        # Handle ignore rules
208        for ln, c_type, symbol in self.ignore:
209            if c_type not in self.DEF_SYMBOL_TYPES:
210                sys.exit(f"{name}:{ln}: {c_type} is invalid")
211
212            d = self.symbols[c_type]
213            if symbol in d:
214                del d[symbol]
215
216        # Handle replace rules
217        for ln, c_type, old, new in self.replace:
218            if c_type not in self.DEF_SYMBOL_TYPES:
219                sys.exit(f"{name}:{ln}: {c_type} is invalid")
220
221            reftype = None
222
223            # Parse reference type when the type is specified
224
225            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
226            if match:
227                reftype = f":c:{match.group(1)}"
228                new = match.group(2)
229            else:
230                match = re.search(r"(\:ref)\:\`(.+)\`", new)
231                if match:
232                    reftype = match.group(1)
233                    new = match.group(2)
234
235            # If the replacement rule doesn't have a type, get default
236            if not reftype:
237                reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
238                if not reftype:
239                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")
240
241            new_ref = f"{reftype}:`{old} <{new}>`"
242
243            # Change self.symbols to use the replacement rule
244            if old in self.symbols[c_type]:
245                (_, ln) = self.symbols[c_type][old]
246                self.symbols[c_type][old] = (new_ref, ln)
247            else:
248                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")
249
250    def store_type(self, ln, symbol_type: str, symbol: str,
251                   ref_name: str = None, replace_underscores: bool = True):
252        """
253        Store a new symbol at self.symbols under symbol_type.
254
255        By default, underscores are replaced by ``-``.
256        """
257        defs = self.DEF_SYMBOL_TYPES[symbol_type]
258
259        prefix = defs.get("prefix", "")
260        suffix = defs.get("suffix", "")
261        ref_type = defs.get("ref_type")
262
263        # Determine ref_link based on symbol type
264        if ref_type or self.namespace:
265            if not ref_name:
266                ref_name = symbol.lower()
267
268            # c-type references don't support hash
269            if ref_type == ":ref" and replace_underscores:
270                ref_name = ref_name.replace("_", "-")
271
272            # C domain references may have namespaces
273            if ref_type.startswith(":c:"):
274                if self.namespace:
275                    ref_name = f"{self.namespace}.{ref_name}"
276
277            if ref_type:
278                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
279            else:
280                ref_link = f"`{symbol} <{ref_name}>`"
281        else:
282            ref_link = symbol
283
284        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)
285
286    def store_line(self, line):
287        """
288        Store a line at self.data, properly indented.
289        """
290        line = "    " + line.expandtabs()
291        self.data += line.rstrip(" ")
292
293    def parse_file(self, file_in: str, exceptions: str = None):
294        """
295        Read a C source file and get identifiers.
296        """
297        self.data = ""
298        is_enum = False
299        is_comment = False
300        multiline = ""
301
302        self.read_exceptions(exceptions)
303
304        with open(file_in, "r",
305                  encoding="utf-8", errors="backslashreplace") as f:
306            for line_no, line in enumerate(f):
307                self.store_line(line)
308                line = line.strip("\n")
309
310                # Handle continuation lines
311                if line.endswith(r"\\"):
312                    multiline += line[-1]
313                    continue
314
315                if multiline:
316                    line = multiline + line
317                    multiline = ""
318
319                # Handle comments. They can be multilined
320                if not is_comment:
321                    if re.search(r"/\*.*", line):
322                        is_comment = True
323                    else:
324                        # Strip C99-style comments
325                        line = re.sub(r"(//.*)", "", line)
326
327                if is_comment:
328                    if re.search(r".*\*/", line):
329                        is_comment = False
330                    else:
331                        multiline = line
332                        continue
333
334                # At this point, line variable may be a multilined statement,
335                # if lines end with \ or if they have multi-line comments
336                # With that, it can safely remove the entire comments,
337                # and there's no need to use re.DOTALL for the logic below
338
339                line = re.sub(r"(/\*.*\*/)", "", line)
340                if not line.strip():
341                    continue
342
343                # It can be useful for debug purposes to print the file after
344                # having comments stripped and multi-lines grouped.
345                if self.debug > 1:
346                    print(f"line {line_no + 1}: {line}")
347
348                # Now the fun begins: parse each type and store it.
349
350                # We opted for a two parsing logic here due to:
351                # 1. it makes easier to debug issues not-parsed symbols;
352                # 2. we want symbol replacement at the entire content, not
353                #    just when the symbol is detected.
354
355                if is_enum:
356                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
357                    if match:
358                        self.store_type(line_no, "symbol", match.group(1))
359                    if "}" in line:
360                        is_enum = False
361                    continue
362
363                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
364                if match:
365                    self.store_type(line_no, "ioctl", match.group(1),
366                                    replace_underscores=False)
367                    continue
368
369                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
370                if match:
371                    self.store_type(line_no, "define", match.group(1))
372                    continue
373
374                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
375                                 line)
376                if match:
377                    name = match.group(2).strip()
378                    symbol = match.group(3)
379                    self.store_type(line_no, "typedef", symbol, ref_name=name)
380                    continue
381
382                for re_enum in self.RE_ENUMS:
383                    match = re_enum.match(line)
384                    if match:
385                        self.store_type(line_no, "enum", match.group(1))
386                        is_enum = True
387                        break
388
389                for re_struct in self.RE_STRUCTS:
390                    match = re_struct.match(line)
391                    if match:
392                        self.store_type(line_no, "struct", match.group(1))
393                        break
394
395        self.apply_exceptions()
396
397    def debug_print(self):
398        """
399        Print debug information containing the replacement rules per symbol.
400        To make easier to check, group them per type.
401        """
402        if not self.debug:
403            return
404
405        for c_type, refs in self.symbols.items():
406            if not refs:  # Skip empty dictionaries
407                continue
408
409            print(f"{c_type}:")
410
411            for symbol, (ref, ln) in sorted(refs.items()):
412                print(f"  #{ln:<5d} {symbol} -> {ref}")
413
414            print()
415
416    def gen_output(self):
417        """Write the formatted output to a file."""
418
419        # Avoid extra blank lines
420        text = re.sub(r"\s+$", "", self.data) + "\n"
421        text = re.sub(r"\n\s+\n", "\n\n", text)
422
423        # Escape Sphinx special characters
424        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)
425
426        # Source uAPI files may have special notes. Use bold font for them
427        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)
428
429        # Delimiters to catch the entire symbol after escaped
430        start_delim = r"([ \n\t\(=\*\@])"
431        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"
432
433        # Process all reference types
434        for ref_dict in self.symbols.values():
435            for symbol, (replacement, _) in ref_dict.items():
436                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
437                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
438                              fr'\1{replacement}\2', text)
439
440        # Remove "\ " where not needed: before spaces and at the end of lines
441        text = re.sub(r"\\ ([\n ])", r"\1", text)
442        text = re.sub(r" \\ ", " ", text)
443
444        return text
445
446    def gen_toc(self):
447        """
448        Create a list of symbols to be part of a TOC contents table.
449        """
450        text = []
451
452        # Sort symbol types per description
453        symbol_descriptions = []
454        for k, v in self.DEF_SYMBOL_TYPES.items():
455            symbol_descriptions.append((v['description'], k))
456
457        symbol_descriptions.sort()
458
459        # Process each category
460        for description, c_type in symbol_descriptions:
461
462            refs = self.symbols[c_type]
463            if not refs:  # Skip empty categories
464                continue
465
466            text.append(f"{description}")
467            text.append("-" * len(description))
468            text.append("")
469
470            # Sort symbols alphabetically
471            for symbol, (ref, ln) in sorted(refs.items()):
472                text.append(f"- LINENO_{ln}: {ref}")
473
474            text.append("")  # Add empty line between categories
475
476        return "\n".join(text)
477
478    def write_output(self, file_in: str, file_out: str, toc: bool):
479        """
480        Write a ReST output file.
481        """
482
483        title = os.path.basename(file_in)
484
485        if toc:
486            text = self.gen_toc()
487        else:
488            text = self.gen_output()
489
490        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
491            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
492            f.write(f"{title}\n")
493            f.write("=" * len(title) + "\n\n")
494
495            if not toc:
496                f.write(".. parsed-literal::\n\n")
497
498            f.write(text)
499