#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for compiled regular expressions, indexed by (pattern, flags)
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache is indexed by both the pattern and the flags, as the
        same pattern compiled with different flags is a different regex.
        A cached entry is reused even when self.cache is False; the
        attribute only controls whether a newly compiled regex is stored.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        ``string`` is the pattern, ``cache`` tells if the compiled regex
        shall be stored at the module cache and ``flags`` are the ones
        accepted by re.compile.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Returns a displayable version of the class init.

        Only the pattern and the I/M/S/X flags are shown. The cache
        argument and any other flag are not part of the output.
        """

        flag_map = {
            re.IGNORECASE: "re.I",
            re.MULTILINE: "re.M",
            re.DOTALL: "re.S",
            re.VERBOSE: "re.X",
        }

        flags = [name for flag, name in flag_map.items()
                 if self.regex.flags & flag]

        flags_name = " | ".join(flags)

        if flags_name:
            return f'KernRe("{self.regex.pattern}", {flags_name})'

        return f'KernRe("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both are OR'ed.
        The result is cached if any of the operands is cached.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def finditer(self, string):
        """
        Alias to re.finditer.
        """

        return self.regex.finditer(string)

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It can only be used after a successful match() or search().
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match.

        It can only be used after a successful match() or search().
        """

        return self.last_match.groups()


#: Nested delimited pairs (brackets and parenthesis)
DELIMITER_PAIRS = {
    '{': '}',
    '(': ')',
    '[': ']',
}

#: compiled delimiters
RE_DELIM = KernRe(r'[\{\}\[\]\(\)]')

# Characters NestedMatch needs to inspect: the delimiters above, plus
# quotes and backslash, which are required to skip string literals.
_RE_NESTED_TOKEN = KernRe(r'[\{\}\[\]\(\)"\'\\]')


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a (start, offset, end) tuple for each match
        when the stack is zeroed, where:

        - start is the position where the regex match begins;
        - offset is the position right after the start delimiter;
        - end is the position right after the paired end delimiter.

        Delimiters inside single or double quoted strings are ignored,
        honoring backslash escapes inside them.

        Each regex match is evaluated independently: an occurrence whose
        delimiters are not paired is skipped without affecting the
        next ones.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at a start delimiter. The
            # offset check avoids wrapping to line[-1] on an empty match.
            if not offset or line[offset - 1] not in DELIMITER_PAIRS:
                continue

            # State is per match: leftovers from an unpaired occurrence
            # must not leak into the next one.
            stack = [DELIMITER_PAIRS[line[offset - 1]]]
            string_char = None
            escaped_pos = -1

            for match in _RE_NESTED_TOKEN.regex.finditer(line, offset):
                pos = match.start()
                d = line[pos]

                if string_char:
                    # Inside a string: only its closing quote matters
                    if pos == escaped_pos:
                        continue

                    if d == '\\':
                        # The char right after the backslash is escaped
                        escaped_pos = pos + 1
                    elif d == string_char:
                        string_char = None

                    continue

                if d in ('"', "'"):
                    string_char = d
                    continue

                if d in DELIMITER_PAIRS:
                    stack.append(DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                # Anything else (unexpected end delimiters, backslashes
                # outside strings) is silently ignored.
                if d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is a generator: it yields the text of each occurrence, from
        the beginning of the regex match up to the paired end delimiter.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        A ';' placed right after the end delimiter is dropped together
        with the replaced occurrence.

        Like re.sub, occurrences don't overlap: a match nested inside an
        already replaced one is not replaced again.

        If count is different than zero, it will replace at most count
        items.
        """
        out = []

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            # Nested inside a block that was already replaced
            if start < cur_pos:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            # Drop end ';' if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            if count and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)