#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
# pylint: disable=R0912,R0915

"""
Parse a source file or header, creating ReStructured Text cross references.

It accepts an optional file to change the default symbol reference or to
suppress symbols from the output.

It is capable of identifying ``define``, function, ``struct``, ``typedef``
and ``enum`` declarations, as well as the symbols declared inside enums,
and creates cross-references for all of them.
It is also capable of distinguishing the #define statements used for
specifying a Linux ioctl.

The optional rules file contains a set of rules like::

    ignore ioctl VIDIOC_ENUM_FMT
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""

import os
import re
import sys


class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying ``define``, function, ``struct``,
    ``typedef`` and ``enum`` declarations, as well as the symbols declared
    inside enums, and creates cross-references for all of them.
    It is also capable of distinguishing the #define statements used for
    specifying a Linux ioctl.

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules::

        ignore <type> <symbol>

       Removes the symbol from reference generation.

    2. Replace rules::

        replace <type> <old_symbol> <new_reference>

       Replaces how old_symbol is referenced. The new_reference can be:

       - A simple symbol name;
       - A full Sphinx reference.

    3. Namespace rules::

        namespace <namespace>

       Sets C namespace to be used during cross-reference generation. Can
       be overridden by replace rules.

    On ignore and replace rules, ``<type>`` can be:
        - ``ioctl``: for defines that end with ``_IO*``, e.g. ioctl definitions
        - ``define``: for other defines
        - ``symbol``: for symbols defined within enums;
        - ``typedef``: for typedefs;
        - ``enum``: for the name of a non-anonymous enum;
        - ``struct``: for structs.

    Examples::

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

        namespace MC
    """

    #: Parser regex with multiple ways to capture enums.
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]

    #: Parser regex with multiple ways to capture structs.
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # NOTE: the original code was written a long time before the Sphinx C
    # domain gained support for multiple namespaces. To avoid too much
    # churn at the existing hyperlinks, the code kept using "c:type"
    # instead of the right types. To change that, we need to change the
    # types not only here, but also at the uAPI media documentation.

    #: Dictionary containing C type identifiers to be transformed.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """
        Initialize internal vars.

        ``debug`` enables debug_print(); values greater than 1 also make
        parse_file() print each statement after comment stripping.
        """
        self.debug = debug
        self.data = ""

        # One dictionary per symbol type: symbol -> (reference, line number)
        self.symbols = {symbol_type: {} for symbol_type in self.DEF_SYMBOL_TYPES}

        self.namespace = None
        self.ignore = []
        self.replace = []

        # Name of the exceptions file, used on diagnostic messages issued
        # by apply_exceptions(). Updated by read_exceptions().
        self._exceptions_name = "<exceptions>"

    def read_exceptions(self, fname: str):
        """
        Read an optional exceptions file, used to override defaults.

        Ignore and replace rules are appended to self.ignore and
        self.replace, together with the line number where they were found.
        A namespace rule sets self.namespace. Empty lines and lines
        starting with ``#`` are skipped.

        Exits the program via sys.exit() if a line matches no known rule.
        Does nothing when ``fname`` is empty or None.
        """

        if not fname:
            return

        name = os.path.basename(fname)

        # Keep the name around: apply_exceptions() needs it on its messages
        self._exceptions_name = name

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, start=1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    self.ignore.append((ln, match.group(1), match.group(2)))
                    continue

                # replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if match:
                    self.replace.append((ln, match.group(1), match.group(2),
                                         match.group(3)))
                    continue

                # namespace rules
                match = re.match(r"^namespace\s+(\S+)", line)
                if match:
                    self.namespace = match.group(1)
                    continue

                sys.exit(f"{name}:{ln}: invalid line: {line}")

    def apply_exceptions(self):
        """
        Process exceptions file with rules to ignore or replace references.

        Ignore rules drop the symbol from self.symbols. Replace rules
        rewrite the stored reference, keeping the line number where the
        symbol was found. A replace rule for a symbol that was not parsed
        only produces a warning.

        Exits the program via sys.exit() if a rule uses a type that is not
        a key of DEF_SYMBOL_TYPES.
        """
        name = self._exceptions_name

        # Handle ignore rules
        for ln, c_type, symbol in self.ignore:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            self.symbols[c_type].pop(symbol, None)

        # Handle replace rules
        for ln, c_type, old, new in self.replace:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            reftype = None

            # Parse reference type when the type is specified
            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
            if match:
                reftype = f":c:{match.group(1)}"
                new = match.group(2)
            else:
                match = re.search(r"(\:ref)\:\`(.+)\`", new)
                if match:
                    reftype = match.group(1)
                    new = match.group(2)

            # If the replacement rule doesn't have a type, get default
            if not reftype:
                reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                if not reftype:
                    # NOTE(review): no entry of DEF_SYMBOL_TYPES defines
                    # "real_type" today; kept for backward compatibility.
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

            new_ref = f"{reftype}:`{old} <{new}>`"

            # Change self.symbols to use the replacement rule
            if old in self.symbols[c_type]:
                (_, symbol_ln) = self.symbols[c_type][old]
                self.symbols[c_type][old] = (new_ref, symbol_ln)
            else:
                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def store_type(self, ln, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Store a new symbol at self.symbols under symbol_type.

        The stored value is a tuple with the cross-reference markup
        (wrapped by the type's prefix and suffix) and ``ln``.

        When ``ref_name`` is not given, the lower-case symbol name is used
        as the reference target. For ``:ref`` references, underscores at
        the target are replaced by ``-``, unless ``replace_underscores``
        is False. For C domain references, the target is prefixed by the
        namespace, if one is set.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        # Determine ref_link based on symbol type
        if ref_type or self.namespace:
            if not ref_name:
                ref_name = symbol.lower()

            # Only :ref labels get hyphens: C domain references must keep
            # the identifier as-is
            if ref_type == ":ref" and replace_underscores:
                ref_name = ref_name.replace("_", "-")

            # C domain references may have namespaces. ref_type can be
            # None here when only the namespace is set.
            if ref_type and ref_type.startswith(":c:") and self.namespace:
                ref_name = f"{self.namespace}.{ref_name}"

            if ref_type:
                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
            else:
                ref_link = f"`{symbol} <{ref_name}>`"
        else:
            ref_link = symbol

        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)

    def store_line(self, line):
        """
        Store a line at self.data, properly indented.

        Tabs are expanded and trailing spaces are removed.
        """
        line = " " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str, exceptions: str = None):
        """
        Read a C source file and get identifiers.

        Every line is stored at self.data for later output. Statements are
        then assembled (joining backslash-continued lines and multi-line
        comments), comments are stripped and the symbols found are stored
        via store_type(). Line numbers passed to store_type() are 0-based.

        If ``exceptions`` is given, it is read before parsing and its rules
        are applied after the whole file is parsed.
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        # Line where a backslash-continued statement has started, if any
        cont_start = None

        self.read_exceptions(exceptions)

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines: drop the trailing backslash
                # and keep accumulating until the statement is complete
                if line.endswith("\\"):
                    if cont_start is None:
                        cont_start = line_no
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if re.search(r"/\*.*", line):
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if re.search(r".*\*/", line):
                        is_comment = False
                    else:
                        multiline = line
                        continue

                # The statement is complete. Symbols from continued lines
                # are reported at the line where the statement started.
                symbol_ln = line_no if cont_start is None else cont_start
                cont_start = None

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below.
                # The match is non-greedy to preserve code placed between
                # two comments at the same line.
                line = re.sub(r"/\*.*?\*/", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {line_no + 1}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.

                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type(symbol_ln, "symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type(symbol_ln, "ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type(symbol_ln, "define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    ref_name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type(symbol_ln, "typedef", symbol,
                                    ref_name=ref_name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type(symbol_ln, "enum", match.group(1))
                        is_enum = True
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type(symbol_ln, "struct", match.group(1))
                        break

        self.apply_exceptions()

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.

        Does nothing unless self.debug is set.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, (ref, ln) in sorted(refs.items()):
                print(f" #{ln:<5d} {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the stored file contents with symbols replaced by
        cross-references.

        The text is first escaped, then each known symbol placed between
        the expected delimiters is replaced by its stored reference.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, (replacement, _) in ref_dict.items():
                # The text was already escaped, so the symbol needs the
                # same treatment before being used as a search pattern
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              fr'\1{replacement}\2', text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a list of symbols to be part of a TOC contents table.

        Returns a string with one section per non-empty symbol type,
        ordered by the type description. Inside each section, symbols are
        sorted by name and listed as ``- LINENO_<ln>: <reference>``.
        """
        text = []

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (v['description'], k) for k, v in self.DEF_SYMBOL_TYPES.items()
        )

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for _, (ref, ln) in sorted(refs.items()):
                text.append(f"- LINENO_{ln}: {ref}")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write a ReST output file.

        The title is the base name of ``file_in``. When ``toc`` is true,
        the body is the output of gen_toc(); otherwise it is the output of
        gen_output(), placed inside a ``parsed-literal`` block.
        """

        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)