#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc to indicate problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5      #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: An ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Note: vars() inside a class body returns the namespace built so far,
    # so this picks up all the integer "enum" members declared above.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store one token.

        :param kind: one of the CToken enum-like integer values;
        :param value: the matched text, if any;
        :param pos: position of the match at the source string;
        :param brace_level: ``{`` nesting depth at the token position;
        :param paren_level: ``(`` nesting depth at the token position;
        :param bracket_level: ``[`` nesting depth at the token position.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        # Keep the three depths as a single comparable tuple
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Print the token as a valid CToken constructor call"""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex. Will be filled at the first CTokenizer usage.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handle public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Create a regular expression to handle RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.

        ``source`` may be either a string to be tokenized or an
        already-built list of CToken objects to be wrapped as-is.

        NOTE(review): the ``log`` parameter is currently unused; it is
        kept to preserve the call signature for existing callers.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at the module-level ``RE_SCANNER_LIST``.

        The iterator yields a CToken class object for each match.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Unbalanced closers are ignored instead of going negative
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Return the tokens as C code, dropping comments and honoring
        ``private:`` / ``public:`` comment markers per nesting depth.
        """
        out = ""
        # One visibility flag per nesting depth; new scopes inherit it
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                out += str(tok.value)
                continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: add a sub method

    def __init__(self, regex):
        """Compile ``regex``, which matches the NAME token starting a block"""
        self.regex = KernRe(regex)

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        offset = -1
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                # Look for the NAME token that begins a candidate match
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            # Skip everything up to (and including) the first BEGIN token
            if not started and tok.kind == CToken.BEGIN:
                started = True
                continue

            # An END back at the level of the NAME token closes the block
            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()
                offset = i

                yield CTokenizer(tokenizer.tokens[start:offset + 1])
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff.
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # Compare against None explicitly: a match starting at token
        # index 0 would make a bare "if start" falsy and silently drop
        # the remaining tokens.
        #
        if start is not None and offset < 0:
            # Report via the module log instance, as the rest of the code does
            log.warning("can't find an end")
            yield CTokenizer(tokenizer.tokens[start:])

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        ``source`` may be either a string or a CTokenizer; the yielded
        matches are returned in the same representation.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for new_tokenizer in self._search(tokenizer):
            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)