xref: /linux/scripts/lib/kdoc/kdoc_re.py (revision 3e443d167327b10966166c1953631936547b03d0)
1e31fd36dSMauro Carvalho Chehab#!/usr/bin/env python3
2e31fd36dSMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3e31fd36dSMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4e31fd36dSMauro Carvalho Chehab
"""
Regular expression ancillary classes.

These help cache regular expressions and perform matching for kernel-doc.
"""
10e31fd36dSMauro Carvalho Chehab
11e31fd36dSMauro Carvalho Chehabimport re
12e31fd36dSMauro Carvalho Chehab
# Local cache for regular expressions.
#
# Entries are keyed by the (pattern, flags) pair: the same pattern string
# compiled with different flags is a different regular expression, so the
# pattern alone is not a safe key.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.

    The result of the last match() or search() call is kept at
    ``last_match``, so that group() can be used right after testing the
    match result.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or re-use it from the cache.

        The cache lookup uses both the pattern and the flags, so a pattern
        previously cached with other flags is never returned by mistake.
        A newly compiled regex is stored at the cache only if ``self.cache``
        is true.
        """

        key = (string, flags)

        regex = re_cache.get(key)
        if regex is None:
            regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = regex

        self.regex = regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        ``string`` is the pattern, ``cache`` tells if the compiled regex
        should be stored at the module-level cache and ``flags`` are the
        ``re`` module flags used to compile it.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both expressions
        are combined. The result is cached if any of the operands asked
        for caching.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match

        It must be called only after a successful match() or search():
        ``last_match`` is None otherwise.
        """

        return self.last_match.group(num)
108e31fd36dSMauro Carvalho Chehab
109e31fd36dSMauro Carvalho Chehab
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parenthesis of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter's aligned.
    #
    # The content inside parenthesis are converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to.
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported start delimiter to its end delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single start or end delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and place them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        For each match, it yields a (start, offset, end) tuple, where:

        - start is the position where the regex match begins;
        - offset is the position just after the start delimiter;
        - end is the position just after the paired end delimiter.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.

        Matches whose start delimiter is never closed are silently skipped.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # An empty match at the beginning of the line has no previous
            # character: line[-1] would wrongly pick the last one (or
            # raise IndexError on an empty line).
            if offset == 0:
                continue

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Use a fresh stack for each match: delimiters left unpaired by
            # a previous match must not affect the pairing of this one.
            stack = [self.DELIMITER_PAIRS[d]]

            # Scan from offset without slicing, so positions are absolute
            for match in self.RE_DELIM.finditer(line, offset):
                pos = match.start()
                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what it is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        It is a generator: each item is the matched text, from the start
        of the regex match up to (and including) the paired end delimiter.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if r'\\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        A ';' that immediately follows the end delimiter is dropped
        together with the match.

        If count is greater than zero, it will replace at most count
        items.
        """

        parts = []

        cur_pos = 0
        n = 0
        line_len = len(line)

        for start, end, pos in self._search(regex, line):
            # _search() also reports matches nested inside a previous one.
            # Such text was already consumed by the previous replacement,
            # so skip it, just like re.sub ignores overlapping matches.
            if start < cur_pos:
                continue

            parts.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            parts.append(sub.replace(r'\1', value))

            # Drop end ';' if any. The end delimiter may be the last
            # character of the line, so check the boundary first.
            if pos < line_len and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop once the requested number of replacements was done
            if count > 0 and n >= count:
                break

        # Append the remaining string
        parts.append(line[cur_pos:])

        return "".join(parts)
274