#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, such as problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.

    Each instance stores:

    - ``kind``:  one of the enum-like integer constants defined below;
    - ``value``: the text matched for the token (or ``None``);
    - ``pos``:   the offset of the token at the parsed source;
    - ``level``: a ``(bracket, paren, brace)`` tuple with the nesting
      depth of each delimiter type at the token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # Only upper-case names are token kinds. Filtering on that keeps out
    # any integer-valued dunder entry that the interpreter may place at the
    # class namespace (for instance ``__firstlineno__`` on Python 3.13+).
    _name_by_val = {v: k for k, v in dict(vars()).items()
                    if isinstance(v, int) and k.isupper()}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """
        Convert an integer value from the CToken enum into a string.

        Returns ``UNKNOWN(<val>)`` if ``val`` is not a known token kind.
        """

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Returns ``CToken.MISMATCH`` if ``name`` is not a known token name.
        """

        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store the token kind, its text, its position and its nesting levels.

        Note that the levels are received in brace/paren/bracket order, but
        are stored at ``self.level`` as ``(bracket, paren, brace)``.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return a displayable version of the token, quoting string values."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    Each ``(kind, pattern)`` pair from ``token_list`` becomes a named group,
    whose name is the token name. The groups are joined as alternatives,
    so the order of ``token_list`` defines the matching priority.
    """
    re_tokens = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                 for kind, pattern in token_list]

    # NOTE(review): flags are passed positionally; confirm that the second
    # positional parameter of KernRe() is really the regex flags.
    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Start of a C comment, including any white spaces after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handle public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Fill ``self.tokens`` from ``source``.

        ``source`` can be:

        - empty or ``None``: the token list starts empty;
        - a list of CToken: it is used as-is (the list is not copied);
        - a string with C code: it is parsed via RE_SCANNER.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.

        The ``log`` argument is currently unused: messages are sent to the
        module-level logger.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields CToken class objects. Tokens that don't match
        any known pattern are logged as errors and yielded as
        ``CToken.MISMATCH``.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Each end delimiter can only decrement its own counter.
                # Levels never go below zero, so an unpaired end delimiter
                # is silently ignored instead of touching another counter.
                #
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Convert the tokens back into a string.

        Comments are dropped. Comments starting with ``private:`` hide
        everything after them at the current nesting level, until either
        a comment starting with ``public:`` or the end of the level.
        """
        out = []

        # One visibility flag per nesting level. A new level inherits the
        # visibility of its parent.
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Leaving a private level into a public one: the white
                    # spaces before the end delimiter were hidden, so
                    # try to preserve indent
                    #
                    out.append("\t" * (len(show_stack) - 1))
                    out.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out.append(str(tok.value))

        return "".join(out)


class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        r"""
        Parse the backrefs (``\0``, ``\1``, ..., optionally followed by
        ``+``) found at ``sub_str`` and tokenize the replacement string.

        Raises ValueError if more than one backref is marked as greedy or
        if the greedy backref is not the highest one.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # Compare against None, as group 0 is a valid (falsy) value
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # Drop the "+" markers, as they're not part of the output
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ...``\n``

        It also accepts a ``+`` character to the highest backref. When used,
        it means in practice to ignore delimiters after it, being greedy.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        Arguments are split only at delimiters placed at the top level of
        the matched block: delimiters inside nested parentheses, brackets
        or braces belong to the argument that contains them.

        If there is a backref for an argument above the limit, an error is
        logged and using such backref will raise an exception. Please
        notice that, on C, square brackets don't have any separator on it.
        Trying to use ``\1``..``\n`` for brackets raises ValueError.

        Returns a ``(level, groups_list)`` tuple, where ``level`` is the
        nesting level of the first begin delimiter and ``groups_list[n]``
        has the list of tokens for backref ``\n``.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0
            first_begin = True

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin. Use an explicit flag, as the
                    # content itself may start with a begin delimiter.
                    #
                    if first_begin:
                        first_begin = False
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        begin = len(tokens)
        for idx, tok in enumerate(tokens):
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    raise ValueError(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                begin = idx
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for tok in tokens[begin + 1:]:
            if tok.kind == CToken.BEGIN:
                inner_level += 1
            elif tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            #
            # NOTE(review): the ENDSTMT pattern also accepts white spaces
            # before ";". On such case, the token value differs from the
            # delimiter and it is handled as a normal token. Confirm if
            # this is the desired behavior.
            #
            is_delim = (inner_level == 0 and
                        tok.kind in (CToken.PUNC, CToken.ENDSTMT) and
                        tok.value == delim)

            if is_delim:
                pos += 1
                if self.greedy is not None and pos > self.max_group:
                    #
                    # Greedy: stay at the last group, keeping the delimiter
                    # as part of its contents
                    #
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Return the list of tokens of the replacement string, with each
        backref replaced by copies of the tokens of the corresponding
        group from ``new_tokenizer``.

        The level of the first begin delimiter is added to the level of
        each copied token.
        """
        level, groups = self.groups(new_tokenizer)

        new_tokens = []

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind != CToken.BACKREF:
                new_tokens.append(tok)
                continue

            # Backrefs have the form "\<number>"
            group = int(tok.value[1:])

            for group_tok in groups[group]:
                # Copy, as the original token is kept by the caller
                new_tok = copy(group_tok)
                new_tok.level = tuple(cur + base
                                      for cur, base in zip(new_tok.level, level))

                new_tokens.append(new_tok)

        return new_tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        """
        ``regex`` is matched against the start of each name token, and must
        end at a word boundary. ``delim`` is the begin delimiter expected
        after the name.
        """
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for a name token matching the regex. If it is
        followed by the start delimiter (white spaces in between are
        ignored), it seeks for the end delimiter which returns to the
        nesting level of the name. Otherwise, the match has just the name
        and any white spaces after it.

        It yields ``(start, end)`` token indexes, where ``end`` is inclusive.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        start_level = None
        started = False

        for i, tok in enumerate(tokenizer.tokens):
            if start is not None and not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                #
                # Name only token without BEGIN/END: the match ends just
                # before the current token, which is then checked below,
                # as it may be the start of a new match.
                #
                yield start, i - 1
                start = None

            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    start_level = tok.level
                    started = False

                continue

            if tok.kind == CToken.END and tok.level == start_level:
                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # Compare against None, as a match may start at the first token.
        #
        if start is not None:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        ``source`` can be either a string or a CTokenizer. Each occurrence
        is yielded with the same type as ``source``.
        """

        is_token = isinstance(source, CTokenizer)
        tokenizer = source if is_token else CTokenizer(source)

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.

        ``source`` can be either a string or a CTokenizer. The result has
        the same type as ``source``.
        """
        is_token = isinstance(source, CTokenizer)
        tokenizer = source if is_token else CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlays at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic
        #
        for start, end in self._search(tokenizer):
            # Keep what's between the previous match and this one
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'