#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for regular expressions.
#
# Keys are (pattern string, flags) tuples: the same pattern compiled with
# different flags is a different regular expression and must not be shared.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or re-uses it from the cache.

        The cache lookup takes the flags into account, so that, for
        instance, a case-insensitive version of a pattern is never
        returned when a case-sensitive one was requested.
        """

        key = (string, flags)

        regex = re_cache.get(key)
        if regex is None:
            regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = regex

        self.regex = regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        ``string`` is the pattern, ``cache`` tells if the compiled regex
        shall be stored at the module-level cache and ``flags`` are the
        usual ``re`` flags.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both operands are
        OR-ed. The result is cached if either operand is cached.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match

        It relies on a previous successful match() or search() call:
        if the last one didn't match (or none was done), last_match is
        None and an AttributeError is raised.
        """

        return self.last_match.group(num)


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parenthesis of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter's aligned.
    #
    # The content inside parenthesis are converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to.
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported start delimiter to its end delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any start or end delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and place them
        into a stack, yielding a (start, offset, end) tuple for each match
        when the stack is zeroed, where:

          - start is the position where the regex match begins;
          - offset is the position right after the start delimiter;
          - end is the position right after the paired end delimiter.

        The stack is reset for every regex match, so an occurrence whose
        delimiters are not paired doesn't prevent later ones from being
        found.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that don't pair with the last
        start delimiter. This should be OK for kernel-doc parser, as
        unaligned delimiters would cause compilation errors. So, we don't
        need to raise exceptions to cover such issues.
        """

        pairs = self.DELIMITER_PAIRS

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex must end with a start delimiter. Check the matched
            # text itself, as an empty match has no last character.
            opener = match_re.group(0)[-1:]
            if opener not in pairs:
                continue

            # Each occurrence is checked on its own
            stack = [pairs[opener]]

            # Scan from the offset without copying the rest of the line
            for match in self.RE_DELIM.finditer(line, offset):
                pos = match.start()
                d = line[pos]

                if d in pairs:
                    stack.append(pairs[d])
                    continue

                # Does the end delimiter match what it is expected?
                if d == stack[-1]:
                    stack.pop()

                if not stack:
                    yield start, offset, pos + 1
                    break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        Each yielded string goes from the start of the regex match up to
        (and including) the paired end delimiter.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if r'\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        A ';' that immediately follows the end delimiter is dropped
        together with the replaced text.

        Like re.sub, replacements don't overlap: an occurrence nested
        inside an already replaced one is left as-is.

        If count is different than zero, it will replace at most count
        items.
        """
        out = []

        cur_pos = 0
        n = 0
        length = len(line)

        for start, end, pos in self._search(regex, line):
            # Skip occurrences inside a region that was already replaced
            if start < cur_pos:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            # Drop end ';' if any. The end delimiter may be the last
            # character of the line, so check the boundary first.
            if pos < length and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            if count and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)