1#!/usr/bin/env python3 2# SPDX-License-Identifier: GPL-2.0 3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. 4 5""" 6Regular expression ancillary classes. 7 8Those help caching regular expressions and do matching for kernel-doc. 9 10Please notice that the code here may rise exceptions to indicate bad 11usage inside kdoc to indicate problems at the replace pattern. 12 13Other errors are logged via log instance. 14""" 15 16import logging 17import re 18 19from copy import copy 20 21from .kdoc_re import KernRe 22 23log = logging.getLogger(__name__) 24 25 26class CToken(): 27 """ 28 Data class to define a C token. 29 """ 30 31 # Tokens that can be used by the parser. Works like an C enum. 32 33 COMMENT = 0 #: A standard C or C99 comment, including delimiter. 34 STRING = 1 #: A string, including quotation marks. 35 CHAR = 2 #: A character, including apostophes. 36 NUMBER = 3 #: A number. 37 PUNC = 4 #: A puntuation mark: / ``,`` / ``.``. 38 BEGIN = 5 #: A begin character: ``{`` / ``[`` / ``(``. 39 END = 6 #: A end character: ``}`` / ``]`` / ``)``. 40 CPP = 7 #: A preprocessor macro. 41 HASH = 8 #: The hash character - useful to handle other macros. 42 OP = 9 #: A C operator (add, subtract, ...). 43 STRUCT = 10 #: A ``struct`` keyword. 44 UNION = 11 #: An ``union`` keyword. 45 ENUM = 12 #: A ``struct`` keyword. 46 TYPEDEF = 13 #: A ``typedef`` keyword. 47 NAME = 14 #: A name. Can be an ID or a type. 48 SPACE = 15 #: Any space characters, including new lines 49 ENDSTMT = 16 #: End of an statement (``;``). 50 51 BACKREF = 17 #: Not a valid C sequence, but used at sub regex patterns. 52 53 MISMATCH = 255 #: an error indicator: should never happen in practice. 54 55 # Dict to convert from an enum interger into a string. 56 _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)} 57 58 # Dict to convert from string to an enum-like integer value. 59 _name_to_val = {k: v for v, k in _name_by_val.items()} 60 61 @staticmethod 62 def to_name(val): 63 """Convert from an integer value from CToken enum into a string""" 64 65 return CToken._name_by_val.get(val, f"UNKNOWN({val})") 66 67 @staticmethod 68 def from_name(name): 69 """Convert a string into a CToken enum value""" 70 if name in CToken._name_to_val: 71 return CToken._name_to_val[name] 72 73 return CToken.MISMATCH 74 75 76 def __init__(self, kind, value=None, pos=0, 77 brace_level=0, paren_level=0, bracket_level=0): 78 self.kind = kind 79 self.value = value 80 self.pos = pos 81 self.level = (bracket_level, paren_level, brace_level) 82 83 def __repr__(self): 84 name = self.to_name(self.kind) 85 if isinstance(self.value, str): 86 value = '"' + self.value + '"' 87 else: 88 value = self.value 89 90 return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})" 91 92#: Regexes to parse C code, transforming it into tokens. 93RE_SCANNER_LIST = [ 94 # 95 # Note that \s\S is different than .*, as it also catches \n 96 # 97 (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"), 98 99 (CToken.STRING, r'"(?:\\.|[^"\\])*"'), 100 (CToken.CHAR, r"'(?:\\.|[^'\\])'"), 101 102 (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|" 103 r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"), 104 105 (CToken.ENDSTMT, r"(?:\s+;|;)"), 106 107 (CToken.PUNC, r"[,\.]"), 108 109 (CToken.BEGIN, r"[\[\(\{]"), 110 111 (CToken.END, r"[\]\)\}]"), 112 113 (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"), 114 115 (CToken.HASH, r"#"), 116 117 (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%=" 118 r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"), 119 120 (CToken.STRUCT, r"\bstruct\b"), 121 (CToken.UNION, r"\bunion\b"), 122 (CToken.ENUM, r"\benum\b"), 123 (CToken.TYPEDEF, r"\btypedef\b"), 124 125 (CToken.NAME, r"[A-Za-z_]\w*"), 126 127 (CToken.SPACE, r"\s+"), 128 129 (CToken.BACKREF, r"\\\d+"), 130 131 (CToken.MISMATCH,r"."), 132] 133 134def fill_re_scanner(token_list): 135 """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex""" 136 re_tokens = [] 137 138 for kind, pattern in token_list: 139 name = CToken.to_name(kind) 140 re_tokens.append(f"(?P<{name}>{pattern})") 141 142 return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL) 143 144#: Handle C continuation lines. 145RE_CONT = KernRe(r"\\\n") 146 147RE_COMMENT_START = KernRe(r'/\*\s*') 148 149#: tokenizer regex. Will be filled at the first CTokenizer usage. 150RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST) 151 152 153class CTokenizer(): 154 """ 155 Scan C statements and definitions and produce tokens. 156 157 When converted to string, it drops comments and handle public/private 158 values, respecting depth. 159 """ 160 161 # This class is inspired and follows the basic concepts of: 162 # https://docs.python.org/3/library/re.html#writing-a-tokenizer 163 164 def __init__(self, source=None, log=None): 165 """ 166 Create a regular expression to handle RE_SCANNER_LIST. 167 168 While I generally don't like using regex group naming via: 169 (?P<name>...) 170 171 in this particular case, it makes sense, as we can pick the name 172 when matching a code via RE_SCANNER. 173 """ 174 175 self.tokens = [] 176 177 if not source: 178 return 179 180 if isinstance(source, list): 181 self.tokens = source 182 return 183 184 # 185 # While we could just use _tokenize directly via interator, 186 # As we'll need to use the tokenizer several times inside kernel-doc 187 # to handle macro transforms, cache the results on a list, as 188 # re-using it is cheaper than having to parse everytime. 189 # 190 for tok in self._tokenize(source): 191 self.tokens.append(tok) 192 193 def _tokenize(self, source): 194 """ 195 Iterator that parses ``source``, splitting it into tokens, as defined 196 at ``self.RE_SCANNER_LIST``. 197 198 The interactor returns a CToken class object. 199 """ 200 201 # Handle continuation lines. Note that kdoc_parser already has a 202 # logic to do that. Still, let's keep it for completeness, as we might 203 # end re-using this tokenizer outsize kernel-doc some day - or we may 204 # eventually remove from there as a future cleanup. 205 source = RE_CONT.sub("", source) 206 207 brace_level = 0 208 paren_level = 0 209 bracket_level = 0 210 211 for match in RE_SCANNER.finditer(source): 212 kind = CToken.from_name(match.lastgroup) 213 pos = match.start() 214 value = match.group() 215 216 if kind == CToken.MISMATCH: 217 log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'") 218 elif kind == CToken.BEGIN: 219 if value == '(': 220 paren_level += 1 221 elif value == '[': 222 bracket_level += 1 223 else: # value == '{' 224 brace_level += 1 225 226 elif kind == CToken.END: 227 if value == ')' and paren_level > 0: 228 paren_level -= 1 229 elif value == ']' and bracket_level > 0: 230 bracket_level -= 1 231 elif brace_level > 0: # value == '}' 232 brace_level -= 1 233 234 yield CToken(kind, value, pos, 235 brace_level, paren_level, bracket_level) 236 237 def __str__(self): 238 out="" 239 show_stack = [True] 240 241 for i, tok in enumerate(self.tokens): 242 if tok.kind == CToken.BEGIN: 243 show_stack.append(show_stack[-1]) 244 245 elif tok.kind == CToken.END: 246 prev = show_stack[-1] 247 if len(show_stack) > 1: 248 show_stack.pop() 249 250 if not prev and show_stack[-1]: 251 # 252 # Try to preserve indent 253 # 254 out += "\t" * (len(show_stack) - 1) 255 256 out += str(tok.value) 257 continue 258 259 elif tok.kind == CToken.COMMENT: 260 comment = RE_COMMENT_START.sub("", tok.value) 261 262 if comment.startswith("private:"): 263 show_stack[-1] = False 264 show = False 265 elif comment.startswith("public:"): 266 show_stack[-1] = True 267 268 continue 269 270 if not show_stack[-1]: 271 continue 272 273 if i < len(self.tokens) - 1: 274 next_tok = self.tokens[i + 1] 275 276 # Do some cleanups before ";" 277 278 if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT: 279 continue 280 281 if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind: 282 continue 283 284 out += str(tok.value) 285 286 return out 287 288 289class CTokenArgs: 290 """ 291 Ancillary class to help using backrefs from sub matches. 292 293 If the highest backref contain a "+" at the last element, 294 the logic will be greedy, picking all other delims. 295 296 This is needed to parse struct_group macros with end with ``MEMBERS...``. 297 """ 298 def __init__(self, sub_str): 299 self.sub_groups = set() 300 self.max_group = -1 301 self.greedy = None 302 303 for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str): 304 group = int(m.group(1)) 305 if m.group(2) == "+": 306 if self.greedy and self.greedy != group: 307 raise ValueError("There are multiple greedy patterns!") 308 self.greedy = group 309 310 self.sub_groups.add(group) 311 self.max_group = max(self.max_group, group) 312 313 if self.greedy: 314 if self.greedy != self.max_group: 315 raise ValueError("Greedy pattern is not the last one!") 316 317 sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str) 318 319 self.sub_str = sub_str 320 self.sub_tokeninzer = CTokenizer(sub_str) 321 322 def groups(self, new_tokenizer): 323 """ 324 Create replacement arguments for backrefs like: 325 326 ``\0``, ``\1``, ``\2``, ...``\n`` 327 328 It also accepts a ``+`` character to the highest backref. When used, 329 it means in practice to ignore delimins after it, being greedy. 330 331 The logic is smart enough to only go up to the maximum required 332 argument, even if there are more. 333 334 If there is a backref for an argument above the limit, it will 335 raise an exception. Please notice that, on C, square brackets 336 don't have any separator on it. Trying to use ``\1``..``\n`` for 337 brackets also raise an exception. 338 """ 339 340 level = (0, 0, 0) 341 342 if self.max_group < 0: 343 return level, [] 344 345 tokens = new_tokenizer.tokens 346 347 # 348 # Fill \0 with the full token contents 349 # 350 groups_list = [ [] ] 351 352 if 0 in self.sub_groups: 353 inner_level = 0 354 355 for i in range(0, len(tokens)): 356 tok = tokens[i] 357 358 if tok.kind == CToken.BEGIN: 359 inner_level += 1 360 361 # 362 # Discard first begin 363 # 364 if not groups_list[0]: 365 continue 366 elif tok.kind == CToken.END: 367 inner_level -= 1 368 if inner_level < 0: 369 break 370 371 if inner_level: 372 groups_list[0].append(tok) 373 374 if not self.max_group: 375 return level, groups_list 376 377 delim = None 378 379 # 380 # Ignore everything before BEGIN. The value of begin gives the 381 # delimiter to be used for the matches 382 # 383 for i in range(0, len(tokens)): 384 tok = tokens[i] 385 if tok.kind == CToken.BEGIN: 386 if tok.value == "{": 387 delim = ";" 388 elif tok.value == "(": 389 delim = "," 390 else: 391 self.log.error(fr"Can't handle \1..\n on {sub_str}") 392 393 level = tok.level 394 break 395 396 pos = 1 397 groups_list.append([]) 398 399 inner_level = 0 400 for i in range(i + 1, len(tokens)): 401 tok = tokens[i] 402 403 if tok.kind == CToken.BEGIN: 404 inner_level += 1 405 if tok.kind == CToken.END: 406 inner_level -= 1 407 if inner_level < 0: 408 break 409 410 if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value: 411 pos += 1 412 if self.greedy and pos > self.max_group: 413 pos -= 1 414 else: 415 groups_list.append([]) 416 417 if pos > self.max_group: 418 break 419 420 continue 421 422 groups_list[pos].append(tok) 423 424 if pos < self.max_group: 425 log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}") 426 427 return level, groups_list 428 429 def tokens(self, new_tokenizer): 430 level, groups = self.groups(new_tokenizer) 431 432 new = CTokenizer() 433 434 for tok in self.sub_tokeninzer.tokens: 435 if tok.kind == CToken.BACKREF: 436 group = int(tok.value[1:]) 437 438 for group_tok in groups[group]: 439 new_tok = copy(group_tok) 440 441 new_level = [0, 0, 0] 442 443 for i in range(0, len(level)): 444 new_level[i] = new_tok.level[i] + level[i] 445 446 new_tok.level = tuple(new_level) 447 448 new.tokens += [ new_tok ] 449 else: 450 new.tokens += [ tok ] 451 452 return new.tokens 453 454 455class CMatch: 456 """ 457 Finding nested delimiters is hard with regular expressions. It is 458 even harder on Python with its normal re module, as there are several 459 advanced regular expressions that are missing. 460 461 This is the case of this pattern:: 462 463 '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' 464 465 which is used to properly match open/close parentheses of the 466 string search STRUCT_GROUP(), 467 468 Add a class that counts pairs of delimiters, using it to match and 469 replace nested expressions. 470 471 The original approach was suggested by: 472 473 https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 474 475 Although I re-implemented it to make it more generic and match 3 types 476 of delimiters. The logic checks if delimiters are paired. If not, it 477 will ignore the search string. 478 """ 479 480 481 def __init__(self, regex, delim="("): 482 self.regex = KernRe("^" + regex + r"\b") 483 self.start_delim = delim 484 485 def _search(self, tokenizer): 486 """ 487 Finds paired blocks for a regex that ends with a delimiter. 488 489 The suggestion of using finditer to match pairs came from: 490 https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 491 but I ended using a different implementation to align all three types 492 of delimiters and seek for an initial regular expression. 493 494 The algorithm seeks for open/close paired delimiters and places them 495 into a stack, yielding a start/stop position of each match when the 496 stack is zeroed. 497 498 The algorithm should work fine for properly paired lines, but will 499 silently ignore end delimiters that precede a start delimiter. 500 This should be OK for kernel-doc parser, as unaligned delimiters 501 would cause compilation errors. So, we don't need to raise exceptions 502 to cover such issues. 503 """ 504 505 start = None 506 started = False 507 508 import sys 509 510 stack = [] 511 512 for i, tok in enumerate(tokenizer.tokens): 513 if start is None: 514 if tok.kind == CToken.NAME and self.regex.match(tok.value): 515 start = i 516 stack.append((start, tok.level)) 517 started = False 518 519 continue 520 521 if not started: 522 if tok.kind == CToken.SPACE: 523 continue 524 525 if tok.kind == CToken.BEGIN and tok.value == self.start_delim: 526 started = True 527 continue 528 529 # Name only token without BEGIN/END 530 if i > start: 531 i -= 1 532 yield start, i 533 start = None 534 535 if tok.kind == CToken.END and tok.level == stack[-1][1]: 536 start, level = stack.pop() 537 538 yield start, i 539 start = None 540 541 # 542 # If an END zeroing levels is not there, return remaining stuff 543 # This is meant to solve cases where the caller logic might be 544 # picking an incomplete block. 545 # 546 if start and stack: 547 if started: 548 s = str(tokenizer) 549 log.warning(f"can't find a final end at {s}") 550 551 yield start, len(tokenizer.tokens) 552 553 def search(self, source): 554 """ 555 This is similar to re.search: 556 557 It matches a regex that it is followed by a delimiter, 558 returning occurrences only if all delimiters are paired. 559 """ 560 561 if isinstance(source, CTokenizer): 562 tokenizer = source 563 is_token = True 564 else: 565 tokenizer = CTokenizer(source) 566 is_token = False 567 568 for start, end in self._search(tokenizer): 569 new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1]) 570 571 if is_token: 572 yield new_tokenizer 573 else: 574 yield str(new_tokenizer) 575 576 def sub(self, sub_str, source, count=0): 577 """ 578 This is similar to re.sub: 579 580 It matches a regex that it is followed by a delimiter, 581 replacing occurrences only if all delimiters are paired. 582 583 if the sub argument contains:: 584 585 r'\0' 586 587 it will work just like re: it places there the matched paired data 588 with the delimiter stripped. 589 590 If count is different than zero, it will replace at most count 591 items. 592 """ 593 if isinstance(source, CTokenizer): 594 is_token = True 595 tokenizer = source 596 else: 597 is_token = False 598 tokenizer = CTokenizer(source) 599 600 # Detect if sub_str contains sub arguments 601 602 args_match = CTokenArgs(sub_str) 603 604 new_tokenizer = CTokenizer() 605 pos = 0 606 n = 0 607 608 # 609 # NOTE: the code below doesn't consider overlays at sub. 610 # We may need to add some extra unit tests to check if those 611 # would cause problems. When replacing by "", this should not 612 # be a problem, but other transformations could be problematic 613 # 614 for start, end in self._search(tokenizer): 615 new_tokenizer.tokens += tokenizer.tokens[pos:start] 616 617 new = CTokenizer(tokenizer.tokens[start:end + 1]) 618 619 new_tokenizer.tokens += args_match.tokens(new) 620 621 pos = end + 1 622 623 n += 1 624 if count and n >= count: 625 break 626 627 new_tokenizer.tokens += tokenizer.tokens[pos:] 628 629 if not is_token: 630 return str(new_tokenizer) 631 632 return new_tokenizer 633 634 def __repr__(self): 635 """ 636 Returns a displayable version of the class init. 637 """ 638 639 return f'CMatch("{self.regex.regex.pattern}")' 640