1#!/usr/bin/env python3 2# SPDX-License-Identifier: GPL-2.0 3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. 4 5""" 6Regular expression ancillary classes. 7 8Those help caching regular expressions and do matching for kernel-doc. 9""" 10 11import re 12 13# Local cache for regular expressions 14re_cache = {} 15 16 17class KernRe: 18 """ 19 Helper class to simplify regex declaration and usage. 20 21 It calls re.compile for a given pattern. It also allows adding 22 regular expressions and define sub at class init time. 23 24 Regular expressions can be cached via an argument, helping to speedup 25 searches. 26 """ 27 28 def _add_regex(self, string, flags): 29 """ 30 Adds a new regex or reuses it from the cache. 31 """ 32 self.regex = re_cache.get(string, None) 33 if not self.regex: 34 self.regex = re.compile(string, flags=flags) 35 if self.cache: 36 re_cache[string] = self.regex 37 38 def __init__(self, string, cache=True, flags=0): 39 """ 40 Compile a regular expression and initialize internal vars. 41 """ 42 43 self.cache = cache 44 self.last_match = None 45 46 self._add_regex(string, flags) 47 48 def __str__(self): 49 """ 50 Return the regular expression pattern. 51 """ 52 return self.regex.pattern 53 54 def __repr__(self): 55 """ 56 Returns a displayable version of the class init. 57 """ 58 59 flag_map = { 60 re.IGNORECASE: "re.I", 61 re.MULTILINE: "re.M", 62 re.DOTALL: "re.S", 63 re.VERBOSE: "re.X", 64 } 65 66 flags = [] 67 for flag, name in flag_map.items(): 68 if self.regex.flags & flag: 69 flags.append(name) 70 71 flags_name = " | ".join(flags) 72 73 if flags_name: 74 return f'KernRe("{self.regex.pattern}", {flags_name})' 75 else: 76 return f'KernRe("{self.regex.pattern}")' 77 78 def __add__(self, other): 79 """ 80 Allows adding two regular expressions into one. 81 """ 82 83 return KernRe(str(self) + str(other), cache=self.cache or other.cache, 84 flags=self.regex.flags | other.regex.flags) 85 86 def match(self, string): 87 """ 88 Handles a re.match storing its results. 89 """ 90 91 self.last_match = self.regex.match(string) 92 return self.last_match 93 94 def search(self, string): 95 """ 96 Handles a re.search storing its results. 97 """ 98 99 self.last_match = self.regex.search(string) 100 return self.last_match 101 102 def finditer(self, string): 103 """ 104 Alias to re.finditer. 105 """ 106 107 return self.regex.finditer(string) 108 109 def findall(self, string): 110 """ 111 Alias to re.findall. 112 """ 113 114 return self.regex.findall(string) 115 116 def split(self, string): 117 """ 118 Alias to re.split. 119 """ 120 121 return self.regex.split(string) 122 123 def sub(self, sub, string, count=0): 124 """ 125 Alias to re.sub. 126 """ 127 128 return self.regex.sub(sub, string, count=count) 129 130 def group(self, num): 131 """ 132 Returns the group results of the last match. 133 """ 134 135 return self.last_match.group(num) 136 137 def groups(self): 138 """ 139 Returns the group results of the last match 140 """ 141 142 return self.last_match.groups() 143 144#: Nested delimited pairs (brackets and parenthesis) 145DELIMITER_PAIRS = { 146 '{': '}', 147 '(': ')', 148 '[': ']', 149} 150 151#: compiled delimiters 152RE_DELIM = KernRe(r'[\{\}\[\]\(\)]') 153 154 155class NestedMatch: 156 """ 157 Finding nested delimiters is hard with regular expressions. It is 158 even harder on Python with its normal re module, as there are several 159 advanced regular expressions that are missing. 160 161 This is the case of this pattern:: 162 163 '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' 164 165 which is used to properly match open/close parentheses of the 166 string search STRUCT_GROUP(), 167 168 Add a class that counts pairs of delimiters, using it to match and 169 replace nested expressions. 170 171 The original approach was suggested by: 172 173 https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 174 175 Although I re-implemented it to make it more generic and match 3 types 176 of delimiters. The logic checks if delimiters are paired. If not, it 177 will ignore the search string. 178 """ 179 180 # TODO: make NestedMatch handle multiple match groups 181 # 182 # Right now, regular expressions to match it are defined only up to 183 # the start delimiter, e.g.: 184 # 185 # \bSTRUCT_GROUP\( 186 # 187 # is similar to: STRUCT_GROUP\((.*)\) 188 # except that the content inside the match group is delimiter-aligned. 189 # 190 # The content inside parentheses is converted into a single replace 191 # group (e.g. r`\0'). 192 # 193 # It would be nice to change such definition to support multiple 194 # match groups, allowing a regex equivalent to: 195 # 196 # FOO\((.*), (.*), (.*)\) 197 # 198 # it is probably easier to define it not as a regular expression, but 199 # with some lexical definition like: 200 # 201 # FOO(arg1, arg2, arg3) 202 203 def __init__(self, regex): 204 self.regex = KernRe(regex) 205 206 def _search(self, line): 207 """ 208 Finds paired blocks for a regex that ends with a delimiter. 209 210 The suggestion of using finditer to match pairs came from: 211 https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 212 but I ended using a different implementation to align all three types 213 of delimiters and seek for an initial regular expression. 214 215 The algorithm seeks for open/close paired delimiters and places them 216 into a stack, yielding a start/stop position of each match when the 217 stack is zeroed. 218 219 The algorithm should work fine for properly paired lines, but will 220 silently ignore end delimiters that precede a start delimiter. 221 This should be OK for kernel-doc parser, as unaligned delimiters 222 would cause compilation errors. So, we don't need to raise exceptions 223 to cover such issues. 224 """ 225 226 stack = [] 227 228 for match_re in self.regex.finditer(line): 229 start = match_re.start() 230 offset = match_re.end() 231 string_char = None 232 escape = False 233 234 d = line[offset - 1] 235 if d not in DELIMITER_PAIRS: 236 continue 237 238 end = DELIMITER_PAIRS[d] 239 stack.append(end) 240 241 for match in RE_DELIM.finditer(line[offset:]): 242 pos = match.start() + offset 243 244 d = line[pos] 245 246 if escape: 247 escape = False 248 continue 249 250 if string_char: 251 if d == '\\': 252 escape = True 253 elif d == string_char: 254 string_char = None 255 256 continue 257 258 if d in ('"', "'"): 259 string_char = d 260 continue 261 262 if d in DELIMITER_PAIRS: 263 end = DELIMITER_PAIRS[d] 264 265 stack.append(end) 266 continue 267 268 # Does the end delimiter match what is expected? 269 if stack and d == stack[-1]: 270 stack.pop() 271 272 if not stack: 273 yield start, offset, pos + 1 274 break 275 276 def search(self, line): 277 """ 278 This is similar to re.search: 279 280 It matches a regex that it is followed by a delimiter, 281 returning occurrences only if all delimiters are paired. 282 """ 283 284 for t in self._search(line): 285 286 yield line[t[0]:t[2]] 287 288 def sub(self, sub, line, count=0): 289 """ 290 This is similar to re.sub: 291 292 It matches a regex that it is followed by a delimiter, 293 replacing occurrences only if all delimiters are paired. 294 295 if the sub argument contains:: 296 297 r'\0' 298 299 it will work just like re: it places there the matched paired data 300 with the delimiter stripped. 301 302 If count is different than zero, it will replace at most count 303 items. 304 """ 305 out = "" 306 307 cur_pos = 0 308 n = 0 309 310 for start, end, pos in self._search(line): 311 out += line[cur_pos:start] 312 313 # Value, ignoring start/end delimiters 314 value = line[end:pos - 1] 315 316 # replaces \0 at the sub string, if \0 is used there 317 new_sub = sub 318 new_sub = new_sub.replace(r'\0', value) 319 320 out += new_sub 321 322 # Drop end ';' if any 323 if pos < len(line) and line[pos] == ';': 324 pos += 1 325 326 cur_pos = pos 327 n += 1 328 329 if count and count >= n: 330 break 331 332 # Append the remaining string 333 l = len(line) 334 out += line[cur_pos:l] 335 336 return out 337 338 def __repr__(self): 339 """ 340 Returns a displayable version of the class init. 341 """ 342 343 return f'NestedMatch("{self.regex.regex.pattern}")' 344