#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
# pylint: disable=R0912,R0915

"""
Parse a source file or header, creating ReStructured Text cross references.

It accepts an optional file to change the default symbol reference or to
suppress symbols from the output.

It is capable of identifying defines, functions, structs, typedefs,
enums and enum symbols and creating cross-references for all of them.
It is also capable of distinguishing #define used for specifying a Linux
ioctl.

The optional rules file contains a set of rules like:

    ignore ioctl VIDIOC_ENUM_FMT
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""

import os
import re
import sys


class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying defines, functions, structs, typedefs,
    enums and enum symbols and creating cross-references for all of them.
    It is also capable of distinguishing #define used for specifying a
    Linux ioctl.

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules:

        ignore <type> <symbol>

    Removes the symbol from reference generation.

    2. Replace rules:

        replace <type> <old_symbol> <new_reference>

    Replaces how old_symbol is referenced with a new reference. The
    new_reference can be:

        - A simple symbol name;
        - A full Sphinx reference.

    3. Namespace rules

        namespace <namespace>

    Sets C namespace to be used during cross-reference generation. Can
    be overridden by replace rules.

    On ignore and replace rules, <type> can be:
        - ioctl: for defines that end with _IO*, e.g. ioctl definitions
        - define: for other defines
        - symbol: for symbols defined within enums;
        - typedef: for typedefs;
        - enum: for the name of a non-anonymous enum;
        - struct: for structs.

    Examples:

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

        namespace MC
    """

    # Parser regexes with multiple ways to capture enums and structs
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # FIXME: the original code was written a long time before Sphinx C
    # domain got support for multiple namespaces. To avoid too much churn
    # at the existing hyperlinks, the code kept using "c:type" instead of
    # the right types. To change that, we need to change the types not
    # only here, but also at the uAPI media documentation.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """Initialize internal vars"""
        self.debug = debug
        self.data = ""

        # One dictionary per symbol type: symbol -> (reference, line number)
        self.symbols = {}

        self.namespace = None
        self.ignore = []
        self.replace = []

        # Base name of the last exceptions file read. It is used only to
        # prefix the error and warning messages of apply_exceptions().
        self.exceptions_name = "<exceptions>"

        for symbol_type in self.DEF_SYMBOL_TYPES:
            self.symbols[symbol_type] = {}

    def read_exceptions(self, fname: str):
        """
        Read the rules from an exceptions file.

        Ignore and replace rules are appended to self.ignore and
        self.replace, together with the (1-based) line number where they
        were found; a namespace rule sets self.namespace. Empty lines and
        lines starting with "#" are skipped.

        Nothing is done when fname is empty or None. Any line that doesn't
        match a known rule terminates the program via sys.exit().
        """
        if not fname:
            return

        name = os.path.basename(fname)

        # Remember the file name, as the rules are only validated later on,
        # at apply_exceptions()
        self.exceptions_name = name

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, start=1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    self.ignore.append((ln, match.group(1), match.group(2)))
                    continue

                # replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if match:
                    self.replace.append((ln, match.group(1), match.group(2),
                                         match.group(3)))
                    continue

                # namespace rules
                match = re.match(r"^namespace\s+(\S+)", line)
                if match:
                    self.namespace = match.group(1)
                    continue

                sys.exit(f"{name}:{ln}: invalid line: {line}")

    def apply_exceptions(self):
        """
        Process exceptions file with rules to ignore or replace references.

        Ignore rules drop the symbol from self.symbols. Replace rules
        change the reference stored for the symbol, keeping the line number
        where the symbol was found. A rule with an unknown type terminates
        the program via sys.exit(); a replace rule for a symbol that was
        not found only prints a warning.
        """
        name = self.exceptions_name

        # Handle ignore rules
        for ln, c_type, symbol in self.ignore:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            # Ignoring a symbol that was never found is not an error
            self.symbols[c_type].pop(symbol, None)

        # Handle replace rules
        for ln, c_type, old, new in self.replace:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            reftype = None

            # Parse reference type when the type is specified
            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
            if match:
                reftype = f":c:{match.group(1)}"
                new = match.group(2)
            else:
                match = re.search(r"(\:ref)\:\`(.+)\`", new)
                if match:
                    reftype = match.group(1)
                    new = match.group(2)

            # If the replacement rule doesn't have a type, get default
            if not reftype:
                reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                if not reftype:
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

            new_ref = f"{reftype}:`{old} <{new}>`"

            # Change self.symbols to use the replacement rule
            if old in self.symbols[c_type]:
                # Keep the line where the symbol was found at the source
                (_, symbol_ln) = self.symbols[c_type][old]
                self.symbols[c_type][old] = (new_ref, symbol_ln)
            else:
                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def store_type(self, ln, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Stores a new symbol at self.symbols under symbol_type.

        The stored value is a tuple with the cross-reference text and ln.
        When ref_name is not given, the lower-case symbol name is used as
        the reference target.

        By default, underscores are replaced by "-" on ":ref" targets.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        # Determine ref_link based on symbol type
        if ref_type or self.namespace:
            if not ref_name:
                ref_name = symbol.lower()

            # c-type references don't support hash
            if ref_type == ":ref" and replace_underscores:
                ref_name = ref_name.replace("_", "-")

            # C domain references may have namespaces. ref_type may be
            # unset here when only a namespace was given.
            if ref_type and ref_type.startswith(":c:"):
                if self.namespace:
                    ref_name = f"{self.namespace}.{ref_name}"

            if ref_type:
                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
            else:
                ref_link = f"`{symbol} <{ref_name}>`"
        else:
            ref_link = symbol

        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)

    def store_line(self, line):
        """Stores a line at self.data, properly indented"""
        line = " " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str, exceptions: str = None):
        """
        Reads a C source file and get identifiers.

        Every line read is stored at self.data, and each define, ioctl,
        typedef, enum, enum symbol and struct found is stored at
        self.symbols together with its 0-based line number. The rules from
        the optional exceptions file are read before parsing and applied
        once the whole file was parsed.
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        # Line where a backslash-continued statement started, if any
        cont_start = None

        self.read_exceptions(exceptions)

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines: drop the trailing backslash
                # and keep accumulating until the statement is complete
                if line.endswith("\\"):
                    if cont_start is None:
                        cont_start = line_no
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if re.search(r"/\*.*", line):
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if re.search(r".*\*/", line):
                        is_comment = False
                    else:
                        multiline = line
                        continue

                # A symbol from a continued statement belongs to the line
                # where the statement started
                if cont_start is None:
                    symbol_ln = line_no
                else:
                    symbol_ln = cont_start
                    cont_start = None

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below

                line = re.sub(r"(/\*.*\*/)", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {symbol_ln + 1}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.

                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type(symbol_ln, "symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type(symbol_ln, "ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type(symbol_ln, "define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type(symbol_ln, "typedef", symbol,
                                    ref_name=name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type(symbol_ln, "enum", match.group(1))
                        is_enum = True
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type(symbol_ln, "struct", match.group(1))
                        break

        self.apply_exceptions()

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, (ref, ln) in sorted(refs.items()):
                print(f" #{ln:<5d} {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the text stored at self.data with Sphinx special characters
        escaped and with each known symbol replaced by its cross-reference.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, (replacement, _) in ref_dict.items():
                # The text was already escaped, so the symbol has to be
                # escaped the same way before being searched
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              fr'\1{replacement}\2', text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a list of symbols to be part of a TOC contents table
        """
        text = []

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (v['description'], k) for k, v in self.DEF_SYMBOL_TYPES.items()
        )

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for symbol, (ref, ln) in sorted(refs.items()):
                text.append(f"- LINENO_{ln}: {ref}")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write the ReST output to file_out, using the base name of file_in
        as the document title.

        When toc is true, the table of contents from gen_toc() is written;
        otherwise, the output of gen_output() is written as a
        parsed-literal block.
        """
        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)