#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""

import re

# Local cache for compiled regular expressions
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile() for a given pattern. It also allows
    concatenating two regular expressions with the "+" operator and
    doing substitutions via the sub() method.

    Compiled regular expressions can be cached via an argument, helping
    to speed up searches.
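
    A minimal usage sketch (hypothetical pattern and string)::

        r = KernRe('([a-z]+) = ([a-z]+)')
        if r.search('foo = bar'):
            key, val = r.group(1), r.group(2)   # 'foo', 'bar'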
26    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.
        """

        # The flags are part of the cache key, as the same pattern
        # string compiled with different flags yields a different regex
        self.regex = re_cache.get((string, flags), None)
        if not self.regex:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[(string, flags)] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows concatenating two regular expressions into one.
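
        For instance (illustrative), KernRe('foo') + KernRe('bar')
        behaves like KernRe('foobar'), with the flags of both sides
        OR'ed together and caching enabled if either side caches.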
60        """
61
62        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
63                  flags=self.regex.flags | other.regex.flags)
64
    def match(self, string):
        """
        Handles a re.match, storing its result at last_match.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search, storing its result at last_match.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the given group of the last match.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns all groups of the last match.
        """

        return self.last_match.groups()


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regular expression features (such as recursive patterns)
    are missing from it.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses when
    searching for STRUCT_GROUP().

    This class counts pairs of delimiters, using them to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although it was re-implemented here to make it more generic and to
    match three types of delimiters. The logic checks that delimiters
    are paired; if they are not, the search string is ignored.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, the regular expressions that this class matches are
    # defined only up to the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r'\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # It is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation, in order to
        align all three types of delimiters and to seek for an initial
        regular expression.

        The algorithm looks for open/close paired delimiters, placing
        them on a stack and yielding the start/stop positions of each
        match once the stack is emptied.

        The algorithm should work fine for properly paired lines, but
        will silently ignore end delimiters that precede a start
        delimiter. This should be OK for the kernel-doc parser, as
        unpaired delimiters would cause compilation errors, so there is
        no need to raise exceptions to cover such issues.
        """

        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):
            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\1'

        it works just like re.sub: the matched paired data, with the
        delimiters stripped, is placed there.

        If count is nonzero, at most count occurrences are replaced.
        """
254        out = ""
255
256        cur_pos = 0
257        n = 0
258
259        for start, end, pos in self._search(regex, line):
260            out += line[cur_pos:start]
261
262            # Value, ignoring start/end delimiters
263            value = line[end:pos - 1]
264
265            # replaces \1 at the sub string, if \1 is used there
266            new_sub = sub
267            new_sub = new_sub.replace(r'\1', value)
268
269            out += new_sub
270
271            # Drop end ';' if any
272            if line[pos] == ';':
273                pos += 1
274
275            cur_pos = pos
276            n += 1
277
278            if count and count >= n:
279                break
280
281        # Append the remaining string
282        l = len(line)
283        out += line[cur_pos:l]
284
285        return out
286