xref: /linux/tools/docs/lib/parse_data_structs.py (revision 6093a688a07da07808f0122f9aa2a3eed250d853)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
4# pylint: disable=R0912,R0915
5
"""
Parse a source file or header, creating ReStructured Text cross references.

It accepts an optional file to change the default symbol reference or to
suppress symbols from the output.

It is capable of identifying defines, functions, structs, typedefs,
enums and enum symbols and of creating cross-references for all of them.
It is also capable of distinguishing #defines used for specifying a Linux
ioctl.

The optional rules file contains a set of rules like:

    ignore ioctl VIDIOC_ENUM_FMT
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""
23
24import os
25import re
26import sys
27
28
class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying defines, structs, typedefs, enums and
    enum symbols and of creating cross-references for all of them.
    It is also capable of distinguishing the #defines used for specifying
    a Linux ioctl.

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules:

        ignore <type> <symbol>

    Removes the symbol from reference generation.

    2. Replace rules:

        replace <type> <old_symbol> <new_reference>

    Replaces the reference generated for old_symbol with a new one.
    The new_reference can be:
        - A simple symbol name;
        - A full Sphinx reference.

    On both cases, <type> can be:
        - ioctl: for defines whose value starts with _IO, e.g. ioctl
          definitions;
        - define: for other defines;
        - symbol: for symbols defined within enums;
        - typedef: for typedefs;
        - enum: for the name of a non-anonymous enum;
        - struct: for structs.

    Examples:

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

    Typical usage is parse_file(), optionally process_exceptions(), and
    then either write_output() or gen_output()/gen_toc().
    """

    # Parser regexes with multiple ways to capture enums and structs
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # FIXME: the original code was written a long time before the Sphinx C
    # domain had multiple namespaces. To avoid too much churn at the
    # existing hyperlinks, the code kept using "c:type" instead of the
    # right types. To change that, we need to change the types not only
    # here, but also at the uAPI media documentation.
    #
    # For each symbol type:
    #   prefix/suffix: text placed around the generated reference;
    #   ref_type:      Sphinx role used for the reference;
    #   description:   section title used by gen_toc().
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """
        Initialize internal vars.

        debug enables debug_print(); a value greater than 1 also makes
        parse_file() print every logical line it parses.
        """
        self.debug = debug

        # Indented copy of the parsed file, filled by parse_file()
        self.data = ""

        # Maps symbol type -> {symbol name -> reference text}
        self.symbols = {symbol_type: {}
                        for symbol_type in self.DEF_SYMBOL_TYPES}

    def store_type(self, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Stores a new symbol at self.symbols under symbol_type.

        The stored value is the reference text, wrapped by the prefix and
        suffix defined for symbol_type at DEF_SYMBOL_TYPES.

        When ref_name is not given, the lowercase symbol is used as the
        reference target. For ":ref" references, underscores at the target
        are replaced by "-", unless replace_underscores is False.

        Raises KeyError if symbol_type is not a key of DEF_SYMBOL_TYPES.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        if not ref_type:
            # No Sphinx role for this type: keep the bare symbol
            ref_link = symbol
        elif symbol_type == "enum":
            # Enums are referenced by their own name, without a target
            ref_link = f"{ref_type}:`{symbol}`"
        else:
            if not ref_name:
                ref_name = symbol.lower()

            # c-type references don't support hash
            if ref_type == ":ref" and replace_underscores:
                ref_name = ref_name.replace("_", "-")

            ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"

        self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}"

    def store_line(self, line):
        """Stores a line at self.data, indented by four spaces"""
        line = "    " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str):
        """
        Reads a C source file and gets its identifiers.

        Every line is stored at self.data, and the defines, ioctls,
        typedefs, enums, enum symbols and structs found are stored at
        self.symbols via store_type().
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f, 1):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines: a single trailing backslash
                # means that the statement continues at the next line.
                # Accumulate the text without the backslash.
                if line.endswith("\\"):
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if re.search(r"/\*.*", line):
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if re.search(r".*\*/", line):
                        is_comment = False
                    else:
                        # Comment is still open: keep accumulating
                        multiline = line
                        continue

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below

                line = re.sub(r"(/\*.*\*/)", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {line_no}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.

                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type("symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type("ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type("define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type("typedef", symbol, ref_name=name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type("enum", match.group(1))
                        # An enum closed at its own opening line has no
                        # body at the following lines: don't enter enum
                        # mode, or unrelated lines would be stored as
                        # enum symbols until the next "}".
                        is_enum = "}" not in line
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type("struct", match.group(1))
                        break

    def process_exceptions(self, fname: str):
        """
        Process exceptions file with rules to ignore or replace references.

        Empty lines and lines starting with "#" are skipped. Does nothing
        if fname is empty.

        Exits via sys.exit() on a line that is neither a valid ignore nor
        a valid replace rule, or when the rule type is unknown. Prints a
        warning when a replace rule refers to an unknown symbol.
        """
        if not fname:
            return

        name = os.path.basename(fname)

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, 1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # Handle ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    c_type, symbol = match.groups()

                    if c_type not in self.DEF_SYMBOL_TYPES:
                        sys.exit(f"{name}:{ln}: {c_type} is invalid")

                    # Ignoring a symbol that was not found is not an error
                    self.symbols[c_type].pop(symbol, None)
                    continue

                # Handle replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if not match:
                    sys.exit(f"{name}:{ln}: invalid line: {line}")

                c_type, old, new = match.groups()

                if c_type not in self.DEF_SYMBOL_TYPES:
                    sys.exit(f"{name}:{ln}: {c_type} is invalid")

                reftype = None

                # Parse reference type when the type is specified

                match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new)
                if match:
                    reftype = f":c:{match.group(1)}"
                    new = match.group(2)
                else:
                    match = re.search(r"(\:ref)\:\`(.+)\`", new)
                    if match:
                        reftype = match.group(1)
                        new = match.group(2)

                # If the replacement rule doesn't have a type, get default
                if not reftype:
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                    if not reftype:
                        reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

                # Unlike store_type(), no prefix/suffix is added here
                new_ref = f"{reftype}:`{old} <{new}>`"

                # Change self.symbols to use the replacement rule
                if old in self.symbols[c_type]:
                    self.symbols[c_type][old] = new_ref
                else:
                    print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.

        Does nothing unless self.debug is set.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, ref in sorted(refs.items()):
                print(f"  {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the parsed file as text with cross-references.

        Takes the content stored at self.data, escapes the characters
        that are special to Sphinx and replaces each known symbol by
        its reference from self.symbols.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, replacement in ref_dict.items():
                # The text was already escaped, so escape the symbol the
                # same way before turning it into a regular expression
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              fr'\1{replacement}\2', text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a TOC table pointing to each symbol from the header.

        Returns the ReST text, with one section per non-empty symbol
        type, sorted by section description.
        """
        text = [
            ".. contents:: Table of Contents",
            "   :depth: 2",
            "   :local:",
            "",
        ]

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (v['description'], k) for k, v in self.DEF_SYMBOL_TYPES.items()
        )

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for _, ref in sorted(refs.items()):
                text.append(f"* :{ref}:")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write a ReST document to file_out.

        The document title is the basename of file_in. When toc is true,
        the body is the table produced by gen_toc(); otherwise, it is the
        cross-referenced content from gen_output(), placed inside a
        parsed-literal block.
        """
        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)
453