xref: /linux/tools/lib/python/kdoc/kdoc_re.py (revision 37a93dd5c49b5fda807fd204edf2547c3493319c)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
import re

# Local cache for regular expressions
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile() for a given pattern and allows concatenating
    regular expressions with the ``+`` operator.

    Compiled patterns can be cached via the ``cache`` argument, helping
    to speed up searches.
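
    A minimal usage sketch (illustrative only; the pattern is made up)::

        >>> r = KernRe(r'([a-z]+)')
        >>> bool(r.search('   foo'))
        True
        >>> r.group(1)
        'foo'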
26    """
27
28    def _add_regex(self, string, flags):
29        """
30        Adds a new regex or reuses it from the cache.
31        """
32        self.regex = re_cache.get(string, None)
33        if not self.regex:
34            self.regex = re.compile(string, flags=flags)
35            if self.cache:
36                re_cache[string] = self.regex
37
    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Concatenate two regular expressions into one.
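
        A hedged sketch (patterns are made up)::

            >>> str(KernRe('[0-9]+') + KernRe('[a-z]+'))
            '[0-9]+[a-z]+'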
60        """
61
62        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
63                  flags=self.regex.flags | other.regex.flags)
64
    def match(self, string):
        """
        Run re.match(), storing its result.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Run re.search(), storing its result.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.
        """

        return self.last_match.group(num)


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with the standard re module, which lacks
    several advanced regex features, like atomic groups and recursive
    subpatterns.

    This is the case of this Perl-compatible pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of a
    STRUCT_GROUP() macro invocation.

    This class instead counts pairs of delimiters, using the count to
    match and replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although it was re-implemented here to make it more generic and to
    match three types of delimiters. The logic checks that delimiters
    are paired; if they are not, the search string is ignored.
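
    A minimal usage sketch (illustrative only; FOO is a made-up macro)::

        >>> nm = NestedMatch()
        >>> regex = re.compile('FOO[(]')
        >>> list(nm.search(regex, 'FOO(a, (b), c);'))
        ['FOO(a, (b), c)']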
133    """
134
    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r'\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # It is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Find paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer() to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but a different implementation is used here, in order to handle
        all three types of delimiters and to seek for an initial regular
        expression.

        The algorithm seeks open/close paired delimiters, placing them
        on a stack and yielding the start/stop positions of each match
        once the stack is emptied.

        This works fine for properly paired lines, but it silently
        ignores end delimiters that precede a start delimiter. That
        should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors, so there is no need to raise
        exceptions for such issues.
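
        A hedged sketch of the yielded tuples, which hold the match
        start, the position right after the open delimiter, and the
        position right after the close delimiter (FOO is made up)::

            >>> list(NestedMatch()._search(re.compile('FOO[(]'), 'FOO(bar);'))
            [(0, 4, 8)]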
184        """
185
186        stack = []
187
188        for match_re in regex.finditer(line):
189            start = match_re.start()
190            offset = match_re.end()
191
192            d = line[offset - 1]
193            if d not in self.DELIMITER_PAIRS:
194                continue
195
196            end = self.DELIMITER_PAIRS[d]
197            stack.append(end)
198
199            for match in self.RE_DELIM.finditer(line[offset:]):
200                pos = match.start() + offset
201
202                d = line[pos]
203
204                if d in self.DELIMITER_PAIRS:
205                    end = self.DELIMITER_PAIRS[d]
206
207                    stack.append(end)
208                    continue
209
210                # Does the end delimiter match what is expected?
211                if stack and d == stack[-1]:
212                    stack.pop()
213
214                    if not stack:
215                        yield start, offset, pos + 1
216                        break
217
    def search(self, regex, line):
        """
        Similar to re.search():

        It matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):
            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        Similar to re.sub():

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\1'

        it will work just like re: the matched paired data, with its
        delimiters stripped, is placed there.

        If count is nonzero, at most count replacements are done.
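
        A hedged usage sketch (illustrative only; FOO is a made-up macro)::

            >>> NestedMatch().sub(re.compile('FOO[(]'), r'\1', 'FOO(a, (b), c);')
            'a, (b), c'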
246        """
247        out = ""
248
249        cur_pos = 0
250        n = 0
251
252        for start, end, pos in self._search(regex, line):
253            out += line[cur_pos:start]
254
255            # Value, ignoring start/end delimiters
256            value = line[end:pos - 1]
257
258            # replaces \1 at the sub string, if \1 is used there
259            new_sub = sub
260            new_sub = new_sub.replace(r'\1', value)
261
262            out += new_sub
263
264            # Drop end ';' if any
265            if line[pos] == ';':
266                pos += 1
267
268            cur_pos = pos
269            n += 1
270
271            if count and count >= n:
272                break
273
274        # Append the remaining string
275        l = len(line)
276        out += line[cur_pos:l]
277
278        return out
279