#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for compiled regular expressions, keyed by (pattern, flags)
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache key includes the flags, so the same pattern compiled
        with different flags never shares a compiled object.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        string: the regular expression pattern;
        cache:  if True, stores the compiled pattern at the module cache;
        flags:  flags passed to re.compile.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Return a re.compile()-like representation of the pattern.
        """
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both are OR'ed.
        The result is cached if any of the operands is cached.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It requires a previous successful match() or search() call:
        as last_match is None otherwise, an AttributeError is raised.
        """

        return self.last_match.group(num)


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a (start, offset, end) tuple for each match
        when the stack is zeroed, where:

          - start is the position where the regex match begins;
          - offset is the position just after the start delimiter;
          - end is the position just after the paired end delimiter.

        Each regex match is evaluated with its own stack, so a match whose
        delimiters are not paired is ignored without affecting the
        following ones.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # An empty match at the beginning of the line has no previous
            # character: line[-1] would wrap (or fail on an empty line)
            if offset == 0:
                continue

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Start with a fresh stack: leftovers from a previous unpaired
            # match must not prevent this one from being closed
            stack = [self.DELIMITER_PAIRS[d]]

            # Scan from offset without copying the remaining of the line
            for match in self.RE_DELIM.finditer(line, offset):
                pos = match.start()

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        It yields the matched text, from the start of the regex up to
        the paired end delimiter.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        A ';' that immediately follows the end delimiter is dropped.

        Matches nested inside an already replaced occurrence are skipped,
        as their text was already consumed by the outer replacement.

        If count is different than zero, it will replace at most count
        items.
        """
        out = []

        cur_pos = 0
        last_end = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            # Nested inside the previous replaced block: already handled
            if start < last_end:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            last_end = pos

            # Drop end ';' if any. Slicing is used as the end delimiter
            # may be the last character of the line
            if line[pos:pos + 1] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            if count and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)