#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc to indicate problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.

    It stores the token kind (one of the enum-like integer constants
    below), the matched text, its position at the scanned string and the
    bracket/parenthesis/brace nesting levels where it was found.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: An ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # vars() here is the class namespace built so far; filtering on int
    # keeps only the enum constants above (dunder entries are strings).
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""

        # Unknown names fall back to MISMATCH, mirroring to_name()
        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store one scanned token.

        :param kind: one of the CToken enum-like integer constants.
        :param value: the matched text (or any payload for synthetic tokens).
        :param pos: start offset of the match inside the scanned string.
        :param brace_level: ``{`` nesting depth at this token.
        :param paren_level: ``(`` nesting depth at this token.
        :param bracket_level: ``[`` nesting depth at this token.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        # Note the tuple order: (bracket, paren, brace)
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"


#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    # Hex, octal, then decimal/float literals, with optional C suffixes
    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    # Multi-character operators must come before the single-char fallback
    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all: must stay last so any unmatched character is flagged
    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    Each (kind, pattern) pair becomes a named group ``(?P<NAME>...)``, so
    ``match.lastgroup`` directly identifies the token kind while scanning.
    """
    re_tokens = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                 for kind, pattern in token_list]

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines (backslash-newline).
RE_CONT = KernRe(r"\\\n")

#: Matches the start of a C comment, plus any trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source``, caching the resulting CToken list.

        :param source: a C code string to tokenize, or a pre-built list of
                       CToken objects (used as-is), or None/empty for an
                       empty tokenizer.
        :param log: unused; kept for backward compatibility with callers.

        While I generally don't like using regex group naming via
        ``(?P<name>...)``, in this particular case it makes sense, as we
        can pick the name when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator, as
        # we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at the module-level ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we
        # might end re-using this tokenizer outside kernel-doc some day -
        # or we may eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                # Lazy %-args: the message is only built when emitted
                log.error("Unexpected token '%s' on pos %s:\n\t'%s'",
                          value, pos, source)
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Render the cached tokens back to C code, dropping all comments
        and hiding the contents of scopes marked by a "private:" comment
        until a "public:" comment (or the end of the scope) is reached.
        """
        parts = []
        show_stack = [True]  # visibility flag, one entry per nesting depth

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # Inner scopes inherit the current visibility
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    parts.append("\t" * (len(show_stack) - 1))

                # Closing delimiters are always emitted
                parts.append(str(tok.value))
                continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                # public:/private: toggle visibility at the current depth;
                # the comment itself is always dropped from the output
                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            # Do some cleanups before ";"
            if i + 1 < len(self.tokens):
                next_tok = self.tokens[i + 1]
                semicolon_next = (next_tok.kind == CToken.PUNC and
                                  next_tok.value == ";")

                # Drop spaces that come right before a ";"
                if semicolon_next and tok.kind == CToken.SPACE:
                    continue

                # Collapse duplicated ";;" into a single ";"
                if (semicolon_next and tok.kind == CToken.PUNC and
                        tok.value == ";"):
                    continue

            parts.append(str(tok.value))

        return "".join(parts)