xref: /linux/tools/lib/python/kdoc/kdoc_re.py (revision 134468b0e2043efec4bd25dc6bcef238358a8111)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
11import re
12
# Local cache for regular expressions.
#
# Entries are keyed by the (pattern, flags) pair, so the same pattern
# compiled with different flags never shares a cache slot.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.

    The result of the most recent match() or search() call is stored at
    ``last_match``; group() and groups() read from it.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache lookup uses both the pattern and the flags: keying on
        the pattern alone would hand back a regex compiled with different
        flags than the ones requested by the caller.

        The compiled regex is stored at ``self.regex``. It is only added
        to the cache when ``self.cache`` is true.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        ``string`` is the regex pattern, ``cache`` tells if the compiled
        regex shall be stored at the module cache and ``flags`` are the
        ``re`` module flags used to compile it.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Returns a displayable version of the class init.

        Only the I, M, S and X flags are displayed.
        """

        flag_map = {
            re.IGNORECASE: "re.I",
            re.MULTILINE: "re.M",
            re.DOTALL: "re.S",
            re.VERBOSE: "re.X",
        }

        flags = [name for flag, name in flag_map.items()
                 if self.regex.flags & flag]

        flags_name = " | ".join(flags)

        if flags_name:
            return f'KernRe("{self.regex.pattern}", {flags_name})'

        return f'KernRe("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both operands are
        OR-ed. The result is cached if either operand asked for caching.
        ``other`` is expected to be another KernRe instance.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It requires a previous successful match() or search(): when
        ``last_match`` is None, this raises AttributeError.
        """

        return self.last_match.group(num)

    def groups(self):
        """
        Returns the group results of the last match

        It requires a previous successful match() or search(): when
        ``last_match`` is None, this raises AttributeError.
        """

        return self.last_match.groups()
136
137
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.

    Delimiters placed inside single or double quoted strings are not
    counted.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Kept as-is for backward compatibility: matches only the delimiters
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    # Characters that matter while scanning for the closing delimiter:
    # the delimiters themselves, plus quotes and backslashes, which are
    # needed to skip delimiters placed inside string literals.
    RE_SCAN = re.compile(r'[\{\}\[\]\(\)"\'\\]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        For each regex match, the algorithm seeks for open/close paired
        delimiters and places them into a stack, yielding a tuple with:

            (match start, position after the start delimiter,
             position after the end delimiter)

        when the stack is zeroed. Each regex match uses its own stack, so
        an unpaired occurrence doesn't affect the next ones.

        Delimiters inside quoted strings are ignored. Inside a string, a
        backslash escapes the character right after it.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # An empty match at the beginning of the line has no delimiter
            # before it: line[-1] would wrongly pick the last character
            if offset == 0:
                continue

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Closing delimiters that are still expected, innermost last
            stack = [self.DELIMITER_PAIRS[d]]

            string_char = None
            escaped_pos = -1

            for match in self.RE_SCAN.finditer(line, offset):
                pos = match.start()
                d = line[pos]

                if string_char:
                    # Inside a string: only look for its end
                    if pos == escaped_pos:
                        # Character right after a backslash is a literal
                        continue

                    if d == '\\':
                        escaped_pos = pos + 1
                    elif d == string_char:
                        string_char = None

                    continue

                if d in ('"', "'"):
                    string_char = d
                    continue

                if d == '\\':
                    # A backslash outside a string is not a delimiter
                    continue

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is a generator: it yields the matched text of each occurrence,
        from the start of the regex match up to the end delimiter.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        A ';' placed right after the end delimiter is dropped.

        Like re.sub, replacements don't overlap: an occurrence located
        inside an already replaced one is left as part of its value.

        If count is different than zero, it will replace at most count
        items.
        """
        out = []

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            # Skip occurrences nested inside the previous replacement,
            # as otherwise cur_pos would move backwards
            if start < cur_pos:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            # Drop end ';' if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop once the requested number of replacements was done
            if count > 0 and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)
324        return out
325