#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for compiled regular expressions, keyed by (pattern, flags)
# so that the same pattern compiled with different flags does not collide.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Compile a new regex or reuse an identical one from the cache.

        The cache key includes the flags: the same pattern compiled with
        different flags must yield different compiled objects.
        """
        key = (string, flags)
        self.regex = re_cache.get(key, None)
        if not self.regex:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: regular expression pattern
        :param cache:  if True, keep the compiled regex in the module-level
                       cache for reuse by later instances
        :param flags:  flags passed verbatim to re.compile()
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The result is cached if either operand requested caching, and
        carries the union of both operands' flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.

        :param count: maximum number of replacements; 0 means replace all.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match.
        """

        return self.last_match.groups()


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    # the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #       FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #       FOO(arg1, arg2, arg3)

    # Maps each supported open delimiter to its closing counterpart
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single open or close delimiter of the three pairs above
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        Yields (start, offset, end) tuples where ``start`` is where the
        regex match begins, ``offset`` is the position right after the
        open delimiter, and ``end`` is the position right after the
        matching close delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Start a fresh stack for each regex match: leftovers from an
            # earlier unbalanced match must not leak into this one.
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):
            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replaces \1 at the sub string, if \1 is used there
            out += sub.replace(r'\1', value)

            # Drop end ';' if any, guarding against a match that ends
            # exactly at the end of the string
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after count replacements (count == 0 means replace all).
            # Note: the comparison must be n >= count; the reversed test
            # would stop after the first replacement for any count >= 1.
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out