xref: /linux/tools/lib/python/kdoc/kdoc_re.py (revision c991b7ef2fb658c186df56d16b3ebcd0afb555cc)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
11import re
12
# Local cache for compiled regular expressions.
#
# Keyed by (pattern, flags): keying on the flags as well prevents a
# pattern compiled without flags from being wrongly reused when the same
# pattern is later requested with different flags (e.g. re.IGNORECASE).
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Compile a new regex, or reuse an already-compiled one from the
        cache.

        :param string: regular expression pattern
        :param flags: re.compile() flags
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: regular expression pattern
        :param cache: if True, store the compiled pattern at the module
                      cache, so later instances with the same pattern and
                      flags will reuse it
        :param flags: re.compile() flags
        """

        self.cache = cache

        # Result of the most recent match()/search() call (None if the
        # last call didn't match, or if none was done yet)
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        flag_map = {
            re.IGNORECASE: "re.I",
            re.MULTILINE: "re.M",
            re.DOTALL: "re.S",
            re.VERBOSE: "re.X",
        }

        flags = [name for flag, name in flag_map.items()
                 if self.regex.flags & flag]

        flags_name = " | ".join(flags)

        if flags_name:
            return f'KernRe("{self.regex.pattern}", {flags_name})'

        return f'KernRe("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The resulting pattern is the concatenation of both patterns,
        flags are OR-ed, and the result is cached if either part is.
        """

        return KernRe(str(self) + str(other),
                      cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results at self.last_match.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results at self.last_match.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def finditer(self, string):
        """
        Alias to re.finditer.
        """

        return self.regex.finditer(string)

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match.
        """

        return self.last_match.groups()
143
#: Nested delimited pairs (brackets and parenthesis): maps each opening
#: delimiter to its closing counterpart
DELIMITER_PAIRS = {
    '{': '}',
    '(': ')',
    '[': ']',
}

#: compiled regex matching any single delimiter from DELIMITER_PAIRS
RE_DELIM = KernRe(r'[\{\}\[\]\(\)]')
153
154
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\0').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    #: Scanner used by _search(): besides the three delimiter pairs, it
    #: also matches quotes and backslash, so delimiters placed inside
    #: string literals can be skipped while counting pairs.
    _RE_SCAN = re.compile(r'[{}()\[\]"\'\\]')

    def __init__(self, regex):
        """
        Store the regex whose match is expected to end at an opening
        delimiter (e.g. r'\\bSTRUCT_GROUP\\(').
        """
        self.regex = KernRe(regex)

    def _search(self, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed. Delimiters inside single or double-quoted string
        literals (including escaped quotes) are ignored.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.

        Yields (start, offset, stop) tuples: start/stop delimit the whole
        match including delimiters; offset points just past the opening
        delimiter.
        """

        stack = []

        for match_re in self.regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # Quote char of the string literal being scanned, or None
            string_char = None

            # Position of a character escaped by a backslash inside a
            # string literal (-1 when no escape is pending)
            skip_pos = -1

            # The regex is expected to end at an opening delimiter
            d = line[offset - 1]
            if d not in DELIMITER_PAIRS:
                continue

            stack.append(DELIMITER_PAIRS[d])

            for match in self._RE_SCAN.finditer(line[offset:]):
                pos = match.start() + offset
                d = line[pos]

                # Character escaped by a preceding backslash: skip it.
                # Only the char immediately after the backslash counts.
                if pos == skip_pos:
                    continue

                if string_char:
                    if d == '\\':
                        skip_pos = pos + 1
                    elif d == string_char:
                        string_char = None
                    continue

                if d in ('"', "'"):
                    string_char = d
                    continue

                if d == '\\':
                    # Backslash outside a string literal: ignore it
                    continue

                if d in DELIMITER_PAIRS:
                    stack.append(DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        yielding occurrences (with delimiters included) only if all
        delimiters are paired.
        """

        for start, _, stop in self._search(line):
            yield line[start:stop]

    def sub(self, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replace \0 at the sub string with the delimited content,
            # mimicking a re backreference
            out += sub.replace(r'\0', value)

            # Drop end ';' if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after 'count' replacements (0 means replace all).
            # NOTE: this used to test 'count >= n', which stopped after
            # the very first replacement for any non-zero count.
            if count and n >= count:
                break

        # Append the remaining (unmatched) tail of the line
        out += line[cur_pos:]

        return out

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'NestedMatch("{self.regex.regex.pattern}")'
344