xref: /linux/tools/lib/python/kdoc/kdoc_re.py (revision 95a9429cc6d31371575793ab7beb94bf3e7a2f92)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""

import re

# Local cache for regular expressions
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile() for a given pattern. It also allows
    concatenating regular expressions via the '+' operator and
    running substitutions on the compiled pattern.

    Regular expressions can be cached via an argument, helping to
    speed up searches.
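
    Example (an illustrative sketch; the patterns are hypothetical, not
    taken from kernel-doc itself)::

        r = KernRe(r"(\w+)\s*=") + KernRe(r"\s*(\d+)")
        if r.search("foo = 42"):
            name, value = r.groups()    # -> ("foo", "42")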
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.
        """
        # Key the cache on (pattern, flags), as the same pattern compiled
        # with different flags must not share a compiled regex
        self.regex = re_cache.get((string, flags))
        if not self.regex:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[(string, flags)] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Returns a displayable version of the class instantiation.
        """

        flag_map = {
            re.IGNORECASE: "re.I",
            re.MULTILINE: "re.M",
            re.DOTALL: "re.S",
            re.VERBOSE: "re.X",
        }

        flags = []
        for flag, name in flag_map.items():
            if self.regex.flags & flag:
                flags.append(name)

        flags_name = " | ".join(flags)

        if flags_name:
            return f'KernRe("{self.regex.pattern}", {flags_name})'
        else:
            return f'KernRe("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Concatenates two regular expressions into one.
        """
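        # For example (hypothetical patterns), KernRe(r"\w+") + KernRe(r"\d+")
        # behaves like KernRe(r"\w+\d+"), with the flags of both sides OR-ed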
        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match(), storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search(), storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall().
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split().
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub().
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match.
        """

        return self.last_match.groups()


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regex features are missing, e.g. atomic groups and
    recursion.

    This is the case of this PCRE pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of a
    STRUCT_GROUP() occurrence.

    This class counts pairs of delimiters instead, using that to match
    and replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although it was re-implemented here to be more generic and to match
    three types of delimiters. The logic checks if delimiters are paired.
    If not, it will ignore the search string.
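
    Example (an illustrative sketch; the input line is hypothetical)::

        nested = NestedMatch()
        regex = re.compile(r"\bSTRUCT_GROUP\(")
        line = "STRUCT_GROUP(int a; int b;); int c;"

        list(nested.search(regex, line))
        # -> ['STRUCT_GROUP(int a; int b;)']

        nested.sub(regex, r"\1", line)
        # -> 'int a; int b; int c;'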
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r'\1').
    #
    # It would be nice to change such a definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # It is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation, to align all three
        types of delimiters and to seek for an initial regular expression.

        The algorithm searches for open/close paired delimiters and places
        them into a stack, yielding the start/stop positions of each match
        when the stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise
        exceptions to cover such issues.
        """
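
        # A hypothetical trace, for regex r'\bFOO\(' and line
        # 'FOO(a, (b), c); x':
        #
        #   '(' of FOO(   -> stack == [')']
        #   '(' before b  -> stack == [')', ')']
        #   ')' after b   -> stack == [')']
        #   ')' after c   -> stack == []   -> yield (0, 4, 14)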

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Reset the stack at each match, so that an unpaired earlier
            # match can't leak delimiters into this one
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiters stripped.

        If count is non-zero, it will replace at most count items.
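
        Example (an illustrative sketch; the input line is hypothetical)::

            nested.sub(re.compile(r'\bFOO\('), r'\1', 'FOO(bar); baz;')
            # -> 'bar baz;'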
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value between the delimiters, with the delimiters stripped
            value = line[end:pos - 1]

            # Replace \1 in the sub string, if present
            new_sub = sub.replace(r'\1', value)

            out += new_sub

            # Drop the ending ';', if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after replacing at most count items
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
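

# A minimal usage sketch (illustrative only; the patterns and input lines
# are hypothetical, not taken from the kernel-doc parser):
if __name__ == "__main__":
    r = KernRe(r"(\w+)\s*=") + KernRe(r"\s*(\d+)")
    if r.search("foo = 42"):
        print(r.groups())                    # -> ('foo', '42')

    nested = NestedMatch()
    regex = re.compile(r"\bSTRUCT_GROUP\(")
    line = "STRUCT_GROUP(int a; int b;); int c;"
    print(list(nested.search(regex, line)))  # -> ['STRUCT_GROUP(int a; int b;)']
    print(nested.sub(regex, r"\1", line))    # -> int a; int b; int c;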
307