#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for compiled regular expressions.
#
# Entries are keyed by (pattern, flags): the same pattern compiled with
# different flags is a different regular expression and must not share
# a cache slot.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache lookup uses both the pattern and the flags, so asking
        for the same pattern with different flags compiles a new regex
        instead of returning one with the wrong flags.

        The compiled regex is stored at the cache only if self.cache is
        true. A previously cached entry is reused either way.
        """

        # int() normalizes re.RegexFlag members and plain integers to
        # the same key.
        key = (string, int(flags))

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        string is the regex pattern, cache tells if the compiled regex
        shall be stored at the module cache and flags are passed as-is
        to re.compile.
        """

        self.cache = cache

        # Result of the last match() or search() call, used by group()
        # and groups()
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Returns a displayable version of the class init.

        Only the IGNORECASE, MULTILINE, DOTALL and VERBOSE flags are
        displayed.
        """

        flag_map = {
            re.IGNORECASE: "re.I",
            re.MULTILINE: "re.M",
            re.DOTALL: "re.S",
            re.VERBOSE: "re.X",
        }

        flags = [name for flag, name in flag_map.items()
                 if self.regex.flags & flag]

        flags_name = " | ".join(flags)

        if flags_name:
            return f'KernRe("{self.regex.pattern}", {flags_name})'
        else:
            return f'KernRe("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The new pattern is the concatenation of both patterns. Its flags
        are the union of both flags, and it is cached if either operand
        has caching enabled. The other operand must provide the cache
        and regex attributes, e.g. it should be another KernRe.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It requires a previous successful match() or search(): if the
        last one returned None (or none was done), this raises
        AttributeError.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match.

        It requires a previous successful match() or search(): if the
        last one returned None (or none was done), this raises
        AttributeError.
        """

        return self.last_match.groups()


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported start delimiter to its end delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any start or end delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The regex argument must provide finditer(), like a pattern
        compiled with re.compile. For each regex match whose last
        character is a start delimiter and whose delimiters are paired,
        it yields a tuple with:

        - the position where the regex match starts;
        - the position just after the start delimiter;
        - the position just after the paired end delimiter.

        Regex matches that don't end with a start delimiter, or whose
        start delimiter is never closed, yield nothing. Each regex match
        is checked on its own: an unpaired one doesn't affect the
        following ones.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # A zero-width match can't end with a delimiter. Skipping it
            # also avoids checking a character outside the match.
            if offset <= start:
                continue

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Use a new stack for each regex match, to prevent unpaired
            # delimiters from a previous match to affect this one.
            stack = [self.DELIMITER_PAIRS[d]]
            string_char = None
            escape = False

            # NOTE(review): RE_DELIM matches only bracket characters, so
            # d is never a quote nor a backslash here. Thus, the string
            # and escape handling below is never triggered, and
            # delimiters inside string literals are counted as well.
            for match in self.RE_DELIM.finditer(line, offset):
                pos = match.start()

                d = line[pos]

                if escape:
                    escape = False
                    continue

                if string_char:
                    if d == '\\':
                        escape = True
                    elif d == string_char:
                        string_char = None

                    continue

                if d in ('"', "'"):
                    string_char = d
                    continue

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                # If not, the end delimiter is silently ignored.
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is a generator: it yields each matched string, from the
        start of the regex match up to the paired end delimiter.
        """

        for start, _, stop in self._search(regex, line):
            yield line[start:stop]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it places there the matched paired data with the delimiter
        stripped. This is a plain string replacement: no other re
        escape sequence is handled at the sub string.

        A ';' that comes right after the end delimiter is dropped.

        Occurrences placed inside an already replaced one are not
        replaced.

        If count is different than zero, it will replace at most count
        items.
        """

        out = []

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            # Nested inside the previous replacement: its text was
            # already consumed
            if start < cur_pos:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            # Drop end ';' if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            if count and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)