#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please note that the code here may raise exceptions to indicate bad
usage inside kdoc, such as problems with a replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including newlines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string into an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
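
# A minimal usage sketch of the enum helpers above (illustrative only):
#
#     CToken.to_name(CToken.STRUCT)    # -> "STRUCT"
#     CToken.from_name("STRUCT")       # -> CToken.STRUCT
#     CToken.from_name("bogus")        # -> CToken.MISMATCH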

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different from ``.``, as it also matches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex."""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST at module load time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles
    public:/private: markers, respecting nesting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source``, caching the resulting token list.

        While I generally don't like using regex group naming via
        ``(?P<name>...)``, in this particular case it makes sense, as we
        can pick the token name when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator, we'll
        # need to use the tokenizer several times inside kernel-doc to
        # handle macro transforms. So, cache the results in a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)
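
    # A minimal usage sketch (illustrative only): tokenize a small C
    # fragment and dump the cached tokens.
    #
    #     for tok in CTokenizer("struct foo { int a; };").tokens:
    #         print(CToken.to_name(tok.kind), repr(tok.value), tok.level)
    #
    # Each token carries its nesting depth in tok.level, a
    # (bracket, paren, brace) tuple.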

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined in ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it here for completeness, as
        # we might end up re-using this tokenizer outside kernel-doc some
        # day - or we may eventually remove it from there as a future
        # cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:               # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif value == '}' and brace_level > 0:
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
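
# A minimal sketch of what the string conversion above does with
# public:/private: comment markers (illustrative only):
#
#     src = "struct foo { int a; /* private: */ int b; /* public: */ int c; };"
#     print(CTokenizer(src))
#
# would drop the comments and the ``int b;`` member, printing roughly
# "struct foo { int a;  int c; };".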

class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as several advanced
    regular expression features are missing there.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of a
    STRUCT_GROUP() search string.

    This class counts pairs of delimiters instead, using them to match
    and replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and match three
    types of delimiters. The logic checks if delimiters are paired. If
    they are not, it will ignore the search string.
    """

    # TODO: add a sub method

    def __init__(self, regex):
        self.regex = KernRe(regex)

    def _search(self, tokenizer):
        """
        Find paired blocks for a regex that is followed by a delimiter.

        The suggestion of using finditer to match pairs came from:

            https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

        but I ended up using a different implementation, in order to
        handle all three types of delimiters and to seek for an initial
        regular expression.

        The algorithm seeks paired open/close delimiters, placing them on
        a stack and yielding the start/stop position of each match once
        the stack is zeroed.

        The algorithm should work fine for properly paired sources, but
        it will silently ignore end delimiters that precede a start
        delimiter. This should be OK for the kernel-doc parser, as
        unpaired delimiters would cause compilation errors, so we don't
        need to raise exceptions to cover such issues.
        """

        start = None
        offset = -1
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started and tok.kind == CToken.BEGIN:
                started = True
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()
                offset = i

                yield CTokenizer(tokenizer.tokens[start:offset + 1])
                start = None

        #
        # If an END zeroing the levels is not there, return the remaining
        # tokens. This is meant to solve cases where the caller logic
        # might be picking an incomplete block.
        #
        if start is not None and offset < 0:
            log.warning("can't find an end delimiter")
            yield CTokenizer(tokenizer.tokens[start:])

    def search(self, source):
        """
        This is similar to re.search:

        it matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for new_tokenizer in self._search(tokenizer):
            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)
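
# A minimal usage sketch (illustrative only; "STRUCT_GROUP" is just an
# example identifier, not something defined by this module):
#
#     code = "STRUCT_GROUP(struct inner { int a; int b; }); int x;"
#     for block in CMatch(r"STRUCT_GROUP").search(code):
#         print(block)
#
# would print the STRUCT_GROUP(...) block up to the parenthesis that
# pairs with the opening one, leaving the trailing "; int x;" out.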