#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. to indicate problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken:
    """
    Data class to define a C token.

    Each instance stores the token ``kind`` (one of the enum-like integer
    constants below), its text ``value``, the ``pos`` offset where it was
    found and the nesting ``level`` as a
    ``(bracket_level, paren_level, brace_level)`` tuple.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # Names starting with an underscore are skipped: the class namespace
    # may also carry interpreter-provided integers (Python 3.13+ stores
    # ``__firstlineno__`` there), which are not tokens and could otherwise
    # shadow a real token value.
    _name_by_val = {v: k for k, v in dict(vars()).items()
                    if isinstance(v, int) and not k.startswith("_")}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """
        Convert an integer value from the CToken enum into a string.

        Returns ``UNKNOWN(<val>)`` if ``val`` is not a known token kind.
        """

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Returns ``CToken.MISMATCH`` if ``name`` is not a known token name.
        """

        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"


#: Regexes to parse C code, transforming it into tokens.
#:
#: The order matters: the entries are joined as regex alternatives, so the
#: first one that matches at a given position wins.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    #
    # Hexadecimal, octal and decimal/floating point. The decimal branch
    # accepts u/U as well, so constants like 1UL or 32U are kept as a
    # single token, just like their hexadecimal counterparts.
    #
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[uUfFlL]*"),

    # Must come before SPACE, as it also takes the spaces before ";"
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all: anything not matched above is reported as an error
    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    Each ``(kind, pattern)`` pair from ``token_list`` becomes a named group,
    using the token name as the group name. The groups are joined as
    alternatives, preserving the list order.
    """
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Match the start of a C comment, plus any spaces that follow it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once from RE_SCANNER_LIST when the module loads.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer:
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source`` and cache the resulting tokens.

        ``source`` can be:

        - a false value (``None``, empty string): no tokens are produced;
        - a list: it is used as-is as the token list (not copied);
        - anything else: handled as C source text to be tokenized
          via RE_SCANNER.

        ``log`` is the logger used to report unexpected tokens. If not
        specified, the module logger is used.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.
        """

        self.tokens = []

        # The ``log`` argument shadows the module-level logger here
        self.log = log if log is not None else logging.getLogger(__name__)

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator returns CToken class objects. The nesting levels stored
        on each token are the ones after processing it, so a begin character
        carries the already-incremented level and an end character the
        already-decremented one.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                self.log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Each closing delimiter only affects its own counter. An
                # unbalanced one is ignored: levels never go negative, and
                # a stray ")" or "]" must not change the brace level.
                #
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    @staticmethod
    def _is_semicolon(tok):
        """
        Return True if ``tok`` is a ``;``.

        The scanner emits ``;`` as ENDSTMT. A PUNC token whose value is
        ``;`` is accepted too, for token lists that were built by hand.
        """
        if tok.kind == CToken.ENDSTMT:
            return True

        return tok.kind == CToken.PUNC and tok.value == ";"

    def __str__(self):
        """
        Convert the tokens back into a string.

        Comments are dropped. A comment starting with ``private:`` hides
        everything that follows it on the current nesting depth, until a
        comment starting with ``public:`` or until the depth is closed.
        Nested depths inherit the visibility of their parent.
        """
        out = []

        # One visibility flag per open nesting depth
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]

                # Never pop the outermost level, even on unbalanced input
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Leaving a hidden region into a visible one: the spaces
                    # before the end character were dropped, so try to
                    # preserve indent
                    #
                    out.append("\t" * (len(show_stack) - 1))
                    out.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            # Do some cleanups before ";"
            if i < last and self._is_semicolon(self.tokens[i + 1]):
                if tok.kind == CToken.SPACE:
                    continue

                #
                # Collapse duplicated ";", except inside parentheses, where
                # each one is meaningful, like in "for (;;)".
                # level[1] is the parenthesis level.
                #
                if self._is_semicolon(tok) and tok.level[1] == 0:
                    continue

            if tok.kind == CToken.ENDSTMT:
                # The token value may contain the spaces before ";"
                out.append(";")
            else:
                out.append(str(tok.value))

        return "".join(out)