#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, pointing to problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set prefix on all message logs.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})


class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # Dunder names are skipped on purpose: the class namespace may carry
    # integer bookkeeping entries (Python 3.13 adds ``__firstlineno__``)
    # which are not tokens and could otherwise alias a token value.
    _name_by_val = {v: k for k, v in dict(vars()).items()
                    if isinstance(v, int) and not k.startswith("_")}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Unknown names are mapped to ``CToken.MISMATCH``.
        """
        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store the token ``kind``, its matched ``value`` and its ``pos``
        inside the source string.

        The three nesting depths are kept together at ``self.level`` as a
        ``(bracket_level, paren_level, brace_level)`` tuple.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"


#: Regexes to parse C code, transforming it into tokens.
#: The order matters: the first alternative that matches wins.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""

    # Each alternative becomes a named group, so that the token kind can be
    # recovered from match.lastgroup.
    re_tokens = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                 for kind, pattern in token_list]

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Matches the opening of a C comment, plus any spaces after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handle public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Fill ``self.tokens`` from ``source``.

        ``source`` can be:

        - ``None`` or empty: the token list starts empty;
        - a list of CToken: it is used as-is (not copied);
        - a string: it is parsed via RE_SCANNER.

        The ``log`` argument is accepted for interface compatibility, but
        it is not used here: messages go to the module-level logger.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator returns a CToken class object.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Levels never go below zero. Each delimiter only touches
                # its own counter, so an unpaired ")" or "]" can't close
                # a brace level by accident.
                #
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif value == '}' and brace_level > 0:
                    brace_level -= 1

            # BEGIN tokens carry the level after the increment, while END
            # tokens carry the level after the decrement.
            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Convert the tokens back into a string.

        Comments are dropped. A comment starting with ``private:`` hides
        everything after it at the current nesting depth, until a comment
        starting with ``public:`` or until the depth is closed.
        """
        parts = []

        # One visibility flag per nesting depth
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # A new depth inherits the visibility of its parent
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Leaving a private block into a public one:
                    # try to preserve indent
                    #
                    parts.append("\t" * (len(show_stack) - 1))
                    parts.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            parts.append(str(tok.value))

        return "".join(parts)


class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contain a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros with end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        r"""
        Parse ``sub_str``, collecting the backrefs (``\0``, ``\1``, ...)
        used on it.

        Raises ValueError if more than one backref is marked as greedy
        with ``+``, or if the greedy one is not the highest backref.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # Compare against None, as group 0 is a valid greedy group
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # "+" is not part of the replacement: drop it from the pattern
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character to the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        Arguments are split only at delimiters placed directly inside the
        outermost block: ``,`` for ``(`` and ``;`` for ``{``. Delimiters
        inside nested blocks belong to the argument that contains them.

        Please notice that, on C, square brackets don't have any separator
        on it. Trying to use ``\1``..``\n`` for brackets raises ValueError.

        If there are less arguments than the highest backref, an error is
        logged.

        Returns a ``(level, groups_list)`` tuple, where ``level`` is the
        level of the first BEGIN token and ``groups_list[n]`` has the
        tokens for ``\n``.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard only the outermost begin. Nested ones are
                    # part of the contents, even at its very beginning.
                    #
                    if inner_level == 1:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                # Level zero means outside the block (name, final END)
                if inner_level:
                    groups_list[0].append(tok)

            if not self.max_group:
                return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        begin_idx = len(tokens)

        for idx, tok in enumerate(tokens):
            if tok.kind != CToken.BEGIN:
                continue

            if tok.value == "{":
                delim = ";"
            elif tok.value == "(":
                delim = ","
            else:
                raise ValueError(fr"Can't handle \1..\n on {self.sub_str}")

            level = tok.level
            begin_idx = idx
            break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for tok in tokens[begin_idx + 1:]:
            if tok.kind == CToken.BEGIN:
                inner_level += 1
            elif tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            #
            # ENDSTMT may carry leading whitespaces, so strip the value
            # before comparing it with the delimiter.
            #
            is_delim = (inner_level == 0 and
                        tok.kind in (CToken.PUNC, CToken.ENDSTMT) and
                        tok.value.strip() == delim)

            if is_delim:
                pos += 1
                if self.greedy is not None and pos > self.max_group:
                    #
                    # Greedy: stay at the last group and fall through,
                    # keeping the delimiter as part of it.
                    #
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Return the token list for the replacement pattern, with each
        backref replaced by the tokens picked from ``new_tokenizer``.

        The tokens placed at backrefs are copies, with the level of the
        matched BEGIN added to their own level.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind != CToken.BACKREF:
                new.tokens.append(tok)
                continue

            # Backref value is a backslash followed by the group number
            group = int(tok.value[1:])

            for group_tok in groups[group]:
                new_tok = copy(group_tok)
                new_tok.level = tuple(cur + base
                                      for cur, base in zip(new_tok.level, level))

                new.tokens.append(new_tok)

        return new.tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        """
        ``regex`` is matched against NAME tokens, anchored at their start
        and followed by a word boundary. ``delim`` is the begin delimiter
        expected after the name.
        """
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _is_start(self, tok):
        """Check if ``tok`` is a NAME token matching the search regex."""
        return tok.kind == CToken.NAME and self.regex.match(tok.value)

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for a NAME token matching the regex, followed
        by the start delimiter, and yields the ``(start, end)`` token
        indexes once an END token with the same level of the NAME token
        is found. Both indexes are inclusive.

        A matching NAME which is not followed by the start delimiter is
        yielded alone, together with any spaces after it.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None            # Index of the NAME token being handled
        start_level = None      # Level of such token
        started = False         # True after the start delimiter is seen

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if self._is_start(tok):
                    start = i
                    start_level = tok.level
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name only token without BEGIN/END: it ends right
                # before the current token
                yield start, i - 1
                start = None

                # The current token may be the start of a new match
                if self._is_start(tok):
                    start = i
                    start_level = tok.level

                continue

            if tok.kind == CToken.END and tok.level == start_level:
                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # Check against None, as a match can start at the first token.
        #
        if start is not None:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is an iterator. It yields CTokenizer objects when ``source``
        is a CTokenizer. Otherwise, ``source`` is tokenized and strings
        are yielded.
        """

        is_token = isinstance(source, CTokenizer)
        tokenizer = source if is_token else CTokenizer(source)

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.

        Returns a CTokenizer when ``source`` is a CTokenizer. Otherwise,
        ``source`` is tokenized and a string is returned.
        """
        is_token = isinstance(source, CTokenizer)
        tokenizer = source if is_token else CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlays at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic
        #
        for start, end in self._search(tokenizer):
            # Keep what is between the previous match and this one
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'