#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for regular expressions, keyed by (pattern, flags) so that
# the same pattern compiled with different flags gets distinct entries.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage,

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or re-use it from the cache.
        """

        # The cache key must include the flags: the same pattern compiled
        # with different flags yields different regex objects, and caching
        # by pattern alone would silently return the wrong one.
        key = (string, flags)

        self.regex = re_cache.get(key, None)
        if not self.regex:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match
        """

        return self.last_match.group(num)


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parenthesis of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter's aligned.
    #
    # The content inside parenthesis are converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to.
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and place them
        into a stack, yielding a start/stop position of each match  when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at an open delimiter;
            # skip matches that don't.
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Use a fresh stack for each match: leftover delimiters from a
            # previous unbalanced match must not leak into this one, or
            # later balanced matches would be silently dropped.
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what it is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if r'\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            new_sub = sub
            new_sub = new_sub.replace(r'\1', value)

            out += new_sub

            # Drop end ';' if any. Guard the index: the match may end
            # exactly at the end of the line.
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after count replacements (count == 0 replaces all).
            # Note: the test must be n >= count; the inverted comparison
            # would stop after the first replacement for any count >= 1.
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out