#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.
"""
import re

# Module-level cache of compiled regular expressions, shared by all
# KernRe instances created with cache=True (see KernRe._add_regex).
re_cache = {}
15
16
class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows concatenating
    regular expressions with ``+`` and doing sub at class init time.

    Regular expressions can be cached via an argument, helping to speed up
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Compile a new regex or re-use one from the cache.

        The cache is keyed on (pattern, flags): the same pattern text
        compiled with different flags produces regexes with different
        behavior, so they must not share a cache slot.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: the regular expression pattern
        :param cache:  if True, store the compiled regex in re_cache
        :param flags:  flags passed as-is to re.compile()
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Concatenate two regular expressions into a new KernRe.

        The result is cached if either operand requested caching, and
        carries the union of both operands' compile flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handle a re.match, storing its result in self.last_match.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handle a re.search, storing its result in self.last_match.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Return group ``num`` of the last stored match.

        Raises AttributeError if match()/search() was never called or
        the last call did not match (last_match is None).
        """

        return self.last_match.group(num)
105
106
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP().

    This class counts pairs of delimiters, using that to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although re-implemented to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter's aligned.
    #
    # The content inside parenthesis are converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to.
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported opening delimiter to its closing counterpart
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single delimiter character, either opening or closing
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Find paired blocks for a regex that ends with a delimiter.

        Yields a (start, offset, end) tuple per properly paired match:
          - start:  index of the first char matched by ``regex``;
          - offset: index right after the opening delimiter;
          - end:    index right after the matching closing delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but this ended using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters, placing them
        into a stack and yielding a start/stop position of each match
        when the stack is zeroed.

        The algorithm works fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unaligned
        delimiters would cause compilation errors, so there's no need to
        raise exceptions to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end on an opening delimiter;
            # skip candidates where it doesn't
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Start each candidate with a fresh stack: stale entries
            # left by a previous unbalanced candidate must not suppress
            # later, properly paired matches
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what it is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, yielding
        occurrences (as strings) only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If r'\1' is used in ``sub``, it works just like re: it is
        replaced by the matched paired data with the delimiters stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replaces \1 at the sub string, if \1 is used there
            out += sub.replace(r'\1', value)

            # Drop end ';' if any, unless the match ends the line
            # (indexing line[pos] unguarded would raise IndexError there)
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Honor "at most count replacements" for nonzero count.
            # (The inverted test ``count >= n`` would stop after the
            # first replacement for any nonzero count.)
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
271