#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems at the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all logged messages.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})
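
#
# Minimal usage sketch (assumption: standalone use; nothing in kernel-doc
# calls this helper). The logger name and the "foo.c: " prefix below are
# just examples.
#
def _set_log_example():
    """Install a prefixed logger for this module (illustrative only)."""
    parser_logger = logging.getLogger("kdoc_example")

    # After this call, messages logged by this module should come out
    # prefixed, e.g. "foo.c: Unexpected token ..."
    tokenizer_set_log(parser_logger, prefix="foo.c: ")
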
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
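
#
# Minimal usage sketch (assumption: standalone use; not part of the parser
# flow). The identifier "foo" is just an example.
#
def _ctoken_example():
    """Show CToken construction and enum/name conversions (illustrative)."""
    tok = CToken(CToken.NAME, "foo", pos=0)

    assert CToken.to_name(tok.kind) == "NAME"
    assert CToken.from_name("NAME") == CToken.NAME

    # Unknown names don't raise: they map to the error indicator
    assert CToken.from_name("NOT_A_KIND") == CToken.MISMATCH

    return tok
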
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different than .*, as it also catches \n
    #
    (CToken.COMMENT,  r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,   r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,     r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,   r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                      r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT,  r"(?:\s+;|;)"),

    (CToken.PUNC,     r"[,\.]"),

    (CToken.BEGIN,    r"[\[\(\{]"),

    (CToken.END,      r"[\]\)\}]"),

    (CToken.CPP,      r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,     r"#"),

    (CToken.OP,       r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                      r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,   r"\bstruct\b"),
    (CToken.UNION,    r"\bunion\b"),
    (CToken.ENUM,     r"\benum\b"),
    (CToken.TYPEDEF,  r"\btypedef\b"),

    (CToken.NAME,     r"[A-Za-z_]\w*"),

    (CToken.SPACE,    r"\s+"),

    (CToken.BACKREF,  r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST at module load time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
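
#
# Minimal usage sketch (assumption: standalone use). The combined scanner
# regex has one named group per token kind, so match.lastgroup tells which
# pattern fired. The C fragment below is just an example.
#
def _scanner_example():
    """List (kind name, lexeme) pairs for a tiny C fragment (illustrative)."""
    pairs = []

    for match in RE_SCANNER.finditer("int x = 10;"):
        pairs.append((match.lastgroup, match.group()))

    # The fragment should scan as NAME, SPACE, NAME, SPACE, OP, SPACE,
    # NUMBER and ENDSTMT
    return pairs
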
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles
    ``public:`` / ``private:`` markers, respecting nesting depth.
    """

    # This class is inspired by and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source`` and cache the resulting token list.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via its iterator,
        # we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it for completeness, as we
        # might end up re-using this tokenizer outside kernel-doc some day -
        # or we may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:                   # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:   # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                out += str(tok.value)
                continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
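
#
# Minimal usage sketch (assumption: standalone use). str() on a tokenizer
# drops comments and honours /* private: */ and /* public: */ markers, so
# hidden members should not show up in the rendered text. The struct below
# is just an example.
#
def _tokenizer_example():
    """Tokenize a small struct and render it back (illustrative only)."""
    src = "struct foo { int a; /* private: */ int hidden; /* public: */ int b; };"

    tokenizer = CTokenizer(src)

    # The rendered text should keep "a" and "b", but not "hidden"
    return str(tokenizer)
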
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all remaining delimiters.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokenizer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\n``

        It also accepts a ``+`` character at the highest backref. When used,
        it means in practice to ignore delimiters after it, being greedy.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, an error is
        logged. Please notice that, in C, square brackets don't have any
        separator inside them. Trying to use ``\1``..``\n`` for brackets
        also logs an error.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                if pos > self.max_group:
                    break

                continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """Build the replacement token list, expanding each backref."""
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokenizer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [new_tok]
            else:
                new.tokens += [tok]

        return new.tokens
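
#
# Minimal usage sketch (assumption: standalone use; normally CMatch.sub()
# drives this class). A replacement pattern made of backrefs selects
# arguments from a tokenized macro invocation. "MACRO", "first" and
# "second" are example identifiers.
#
def _token_args_example():
    """Pick the second argument of a macro call (illustrative only)."""
    args = CTokenArgs(r"\2")

    _, groups = args.groups(CTokenizer("MACRO(first, second)"))

    # groups[2] should hold the tokens of the second argument
    return "".join(tok.value for tok in groups[2]).strip()
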
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its normal re module, as several advanced
    regular expression features are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

    https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unbalanced delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return the remaining stuff.
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different from zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'
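
#
# Minimal usage sketch (assumption: standalone use). It finds and unwraps a
# struct_group()-style macro with balanced parentheses. The source line and
# the r"\2+" replacement are just examples; kernel-doc builds its own
# patterns on top of CMatch.
#
def _cmatch_example():
    """Search for struct_group(...) and strip the wrapper (illustrative)."""
    cmatch = CMatch(r"struct_group")

    src = "struct foo { struct_group(tag, int a; int b;); };"

    # search() should yield each balanced struct_group(...) block as a string
    matches = list(cmatch.search(src))

    # sub() keeps only the members; the "+" makes the last backref greedy,
    # so members containing extra "," delimiters would still be captured
    unwrapped = cmatch.sub(r"\2+", src)

    return matches, unwrapped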