xref: /linux/tools/lib/python/kdoc/parse_data_structs.py (revision db6b35cffe59c619ea3772b21d7c7c8a7b885dc1)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
# pylint: disable=R0912,R0915

6"""
7Parse a source file or header, creating ReStructured Text cross references.
8
9It accepts an optional file to change the default symbol reference or to
10suppress symbols from the output.
11
12It is capable of identifying defines, functions, structs, typedefs,
13enums and enum symbols and create cross-references for all of them.
14It is also capable of distinguish #define used for specifying a Linux
15ioctl.
16
17The optional rules file contains a set of rules like:
18
19    ignore ioctl VIDIOC_ENUM_FMT
20    replace ioctl VIDIOC_DQBUF vidioc_qbuf
21    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
22"""

import os
import re
import sys


class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying defines, functions, structs, typedefs,
    enums and enum symbols, creating cross-references for all of them.
    It is also capable of distinguishing #defines used for specifying Linux
    ioctls.

    By default, it creates references for all symbols and defines, but it
    also allows parsing an exceptions file. Such a file contains a set of
    rules using the syntax below:

    1. Ignore rules:

        ignore <type> <symbol>

       Removes the symbol from reference generation.

    2. Replace rules:

        replace <type> <old_symbol> <new_reference>

       Replaces the reference for old_symbol with a new one. The
       new_reference can be:

        - A simple symbol name;
        - A full Sphinx reference.

    3. Namespace rules:

        namespace <namespace>

       Sets the C namespace to be used during cross-reference generation.
       It can be overridden by replace rules.

    On ignore and replace rules, <type> can be:
        - ioctl: for defines whose value starts with _IO (ioctl definitions);
        - define: for other defines;
        - symbol: for symbols defined within enums;
        - typedef: for typedefs;
        - enum: for the name of a non-anonymous enum;
        - struct: for structs.

    Examples:

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

        namespace MC
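
    A minimal usage sketch (hypothetical file names; the exceptions file is
    optional):

        parser = ParseDataStructs()
        parser.parse_file("include/uapi/linux/media.h",
                          "media.h.rst.exceptions")
        parser.write_output("include/uapi/linux/media.h",
                            "media.h.rst", toc=False)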
84    """
85
    # Parser regexes with multiple ways to capture enums and structs
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # FIXME: the original code was written a long time before the Sphinx C
    # domain gained support for multiple namespaces. To avoid too much churn
    # in the existing hyperlinks, the code kept using "c:type" instead of the
    # right types. To change that, we need to change the types not only
    # here, but also at the uAPI media documentation.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We call each definition inside an enum a "symbol"
119        "symbol": {
120            "prefix": "\\ ",
121            "suffix": "\\ ",
122            "ref_type": ":ref",
123            "description": "Enumeration values",
124        },
125        "typedef": {
126            "prefix": "\\ ",
127            "suffix": "\\ ",
128            "ref_type": ":c:type",
129            "description": "Type Definitions",
130        },
131        # This is the description of the enum itself
132        "enum": {
133            "prefix": "\\ ",
134            "suffix": "\\ ",
135            "ref_type": ":c:type",
136            "description": "Enumerations",
137        },
138        "struct": {
139            "prefix": "\\ ",
140            "suffix": "\\ ",
141            "ref_type": ":c:type",
142            "description": "Structures",
143        },
144    }
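
    # As an illustration (hypothetical symbols, no namespace set): a "struct"
    # like media_device_info uses ref_type ":c:type" and gets stored as
    #   "\ :c:type:`media_device_info <media_device_info>`\ "
    # while an "ioctl" like VIDIOC_QUERYCAP uses ":ref" and becomes
    #   "\ :ref:`VIDIOC_QUERYCAP <vidioc_querycap>`\ "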

    def __init__(self, debug: int = 0):
        """Initialize internal vars"""
        self.debug = debug
        self.data = ""

        self.symbols = {}

        self.namespace = None
        self.exceptions_file = None
        self.ignore = []
        self.replace = []

        for symbol_type in self.DEF_SYMBOL_TYPES:
            self.symbols[symbol_type] = {}
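
        # self.symbols ends up as a dict of dicts; a (hypothetical) entry
        # after parsing could look like:
        #   self.symbols["struct"]["media_device_info"] = (ref_markup, line_no)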

    def read_exceptions(self, fname: str):
        """Read an optional file with ignore/replace/namespace rules."""
        if not fname:
            return

        name = os.path.basename(fname)
        self.exceptions_file = name

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f):
                ln += 1
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)

                if match:
                    self.ignore.append((ln, match.group(1), match.group(2)))
                    continue

                # replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if match:
                    self.replace.append((ln, match.group(1), match.group(2),
                                         match.group(3)))
                    continue

                # namespace rules
                match = re.match(r"^namespace\s+(\S+)", line)
                if match:
                    self.namespace = match.group(1)
                    continue

                sys.exit(f"{name}:{ln}: invalid line: {line}")

    def apply_exceptions(self):
        """
        Process the exceptions file rules, ignoring or replacing references.
        """
        name = self.exceptions_file

        # Handle ignore rules
        for ln, c_type, symbol in self.ignore:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            d = self.symbols[c_type]
            if symbol in d:
                del d[symbol]

        # Handle replace rules
        for ln, c_type, old, new in self.replace:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            reftype = None

            # Parse the reference type when it is explicitly specified

            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
            if match:
                reftype = f":c:{match.group(1)}"
                new = match.group(2)
            else:
                match = re.search(r"(\:ref)\:\`(.+)\`", new)
                if match:
                    reftype = match.group(1)
                    new = match.group(2)

            # If the replacement rule doesn't have a type, use the default one
            if not reftype:
                reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                if not reftype:
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

            new_ref = f"{reftype}:`{old} <{new}>`"
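
            # For instance, the rule "replace ioctl VIDIOC_DQBUF vidioc_qbuf"
            # from the class docstring yields
            # new_ref = ":ref:`VIDIOC_DQBUF <vidioc_qbuf>`"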

            # Change self.symbols to use the replacement rule
            if old in self.symbols[c_type]:
                (_, ln) = self.symbols[c_type][old]
                self.symbols[c_type][old] = (new_ref, ln)
            else:
                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def store_type(self, ln, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Stores a new symbol at self.symbols under symbol_type.

        By default, underscores in :ref targets are replaced by "-".
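
        For example (hypothetical symbols, with no namespace set), a "define"
        named MEDIA_API_VERSION is stored as
        ":ref:`MEDIA_API_VERSION <media-api-version>`", while a "struct"
        named media_device_info is stored as
        ":c:type:`media_device_info <media_device_info>`", both wrapped by
        the "\\ " prefix and suffix from DEF_SYMBOL_TYPES.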
248        """
249        defs = self.DEF_SYMBOL_TYPES[symbol_type]
250
251        prefix = defs.get("prefix", "")
252        suffix = defs.get("suffix", "")
253        ref_type = defs.get("ref_type")
254
255        # Determine ref_link based on symbol type
256        if ref_type or self.namespace:
257            if not ref_name:
258                ref_name = symbol.lower()
259
260            # c-type references don't support hash
261            if ref_type == ":ref" and replace_underscores:
262                ref_name = ref_name.replace("_", "-")
263
264            # C domain references may have namespaces
265            if ref_type.startswith(":c:"):
266                if self.namespace:
267                    ref_name = f"{self.namespace}.{ref_name}"
268
269            if ref_type:
270                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
271            else:
272                ref_link = f"`{symbol} <{ref_name}>`"
273        else:
274            ref_link = symbol
275
276        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)

    def store_line(self, line):
        """Stores a line at self.data, properly indented"""
        line = "    " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str, exceptions: str = None):
        """Reads a C source file and gets its identifiers"""
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        self.read_exceptions(exceptions)

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines
                if line.endswith("\\"):
                    multiline += line[:-1]
                    continue
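                # e.g. a (hypothetical) line "#define MEDIA_ENT_F_BASE \" is
                # buffered above and rejoined with its continuation below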

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can span multiple lines
                if not is_comment:
                    if re.search(r"/\*.*", line):
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if re.search(r".*\*/", line):
                        is_comment = False
                    else:
                        multiline = line
                        continue

                # At this point, the line variable may hold a multi-line
                # statement, either because lines ended with \ or because they
                # contained multi-line comments. With that, the entire comments
                # can be safely removed, and there's no need to use re.DOTALL
                # for the logic below.

                line = re.sub(r"(/\*.*\*/)", "", line)
                if not line.strip():
                    continue

                # For debug purposes, it can be useful to print the file after
                # comments are stripped and multi-line statements are grouped.
                if self.debug > 1:
                    print(f"line {line_no + 1}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two-pass parsing logic here because:
                # 1. it makes it easier to debug issues with non-parsed symbols;
                # 2. we want symbol replacement on the entire content, not
                #    just where the symbol is detected.

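                # For example (hypothetical input), a line such as
                #   #define MEDIA_IOC_DEVICE_INFO _IOWR('|', 0x00, struct media_device_info)
                # is stored as an "ioctl", while "enum v4l2_buf_type {" starts
                # an enum whose members are stored as "symbol" entries on the
                # following iterations.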
                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type(line_no, "symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type(line_no, "ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type(line_no, "define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type(line_no, "typedef", symbol, ref_name=name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type(line_no, "enum", match.group(1))
                        is_enum = True
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type(line_no, "struct", match.group(1))
                        break

        self.apply_exceptions()

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make it easier to check, group them per type.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, (ref, ln) in sorted(refs.items()):
                print(f"  #{ln:<5d} {symbol} -> {ref}")

            print()

    def gen_output(self):
        """Generate the formatted, cross-referenced output text."""

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use a bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after it has been escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"
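
        # As an illustration (hypothetical symbol), in the escaped text a
        # token like "media\_device\_info" preceded by a space and followed
        # by a space or ";" is matched between the delimiters above and
        # swapped for its stored cross-reference below.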

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, (replacement, _) in ref_dict.items():
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              fr'\1{replacement}\2', text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a list of symbols to be part of a table of contents.
        """
        text = []

        # Sort symbol types per description
        symbol_descriptions = []
        for k, v in self.DEF_SYMBOL_TYPES.items():
            symbol_descriptions.append((v['description'], k))

        symbol_descriptions.sort()

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for symbol, (ref, ln) in sorted(refs.items()):
                text.append(f"- LINENO_{ln}: {ref}")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """Write the generated output (or TOC) to file_out."""
        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)
483