xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 50b87bb41e48127ec43a35f9302abb4c63ca6cc9)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please note that the code here may raise exceptions to indicate bad
usage inside kdoc, i.e. problems with the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor directive.
    HASH = 8        #: The hash character, useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an identifier or a type.
    SPACE = 15      #: Any space characters, including newlines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator; should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}
    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string."""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value."""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

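# A minimal illustration of the enum/name mapping and of CToken's repr
# (hypothetical usage, not part of kernel-doc itself):
#
#   CToken.to_name(CToken.ENUM)          -> "ENUM"
#   CToken.from_name("ENUM")             -> CToken.ENUM
#   CToken.from_name("NOT_A_TOKEN")      -> CToken.MISMATCH
#   repr(CToken(CToken.NAME, "foo", 4))  -> 'CToken(CToken.NAME, "foo", 4, (0, 0, 0))'
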
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different from .*, as it also matches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]
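
# Note that the order of the list above matters: the generated alternation
# tries the patterns left to right, so keyword patterns like STRUCT need to
# come before the catch-all NAME pattern, and MISMATCH acts as a last resort.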

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
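
# The result is a single alternation of named groups, roughly:
#
#   (?P<COMMENT>//[^\n]*|/\*[\s\S]*?\*/)|(?P<STRING>"(?:\\.|[^"\\])*")|...
#
# so the name of the matching group identifies the token kind.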

#: Handle C continuation lines, removing backslash-newline pairs.
RE_CONT = KernRe(r"\\\n")

#: Matches the beginning of a C comment, including trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST when the module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
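
# A hypothetical peek at how the scanner is consumed: finditer() yields
# standard match objects whose lastgroup names the token kind, e.g.:
#
#   for m in RE_SCANNER.finditer("42;"):
#       print(m.lastgroup, m.group())   # -> NUMBER 42, then ENDSTMT ;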


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles
    ``public:``/``private:`` markers, respecting nesting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source``, caching the resulting tokens.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator, we'll
        # need to use the tokenizer several times inside kernel-doc to
        # handle macro transforms. So, cache the results in a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

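    # Hypothetical usage showing the two accepted source types: a C string,
    # or an already-built token list (e.g. a slice from another tokenizer):
    #
    #   toks = CTokenizer("int x;")
    #   sub  = CTokenizer(toks.tokens[2:])
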
    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens as defined
        by ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has logic
        # to do that. Still, let's keep it for completeness, as we might end
        # up re-using this tokenizer outside kernel-doc some day, or we may
        # eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' at pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Ignore unbalanced closers, so levels never go negative
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

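    # Illustration of the level tracking above (hypothetical input): when
    # tokenizing "f(a[i])", the "[" token carries level (1, 1, 0), i.e.
    # (bracket, paren, brace), while the closing ")" is back at (0, 0, 0).
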
    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


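# A sketch of the public/private filtering done by CTokenizer.__str__()
# (hypothetical input; members after a "private:" comment are dropped):
#
#   str(CTokenizer("struct s { int a; /* private: */ int b; };"))
#       -> 'struct s { int a; };'
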
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several advanced
    regular expression features are missing.

    This is the case for this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match 3
    types of delimiters. The logic checks whether delimiters are paired;
    if not, it will ignore the search string.
    """

    # TODO: add a sub method

    def __init__(self, regex):
        self.regex = KernRe(regex)

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and to seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters, placing them on a
        stack and yielding the start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unbalanced
        delimiters would cause compilation errors. So, we don't need to
        raise exceptions to cover such issues.
        """

        start = None
        offset = -1
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started and tok.kind == CToken.BEGIN:
                started = True
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, _ = stack.pop()
                offset = i

                yield CTokenizer(tokenizer.tokens[start:offset + 1])
                start = None

        #
        # If there's no END token zeroing the levels, return the remaining
        # stuff. This is meant to solve cases where the caller logic might
        # be picking an incomplete block.
        #
        if start is not None and offset < 0:
            log.warning("can't find an end")
            yield CTokenizer(tokenizer.tokens[start:])

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, returning
        occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for new_tokenizer in self._search(tokenizer):
            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)
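
# A sketch of CMatch in action (hypothetical input): find the balanced
# STRUCT_GROUP(...) block, ignoring everything after the paired ")":
#
#   cm = CMatch(r"STRUCT_GROUP")
#   list(cm.search("STRUCT_GROUP(int a; int b;);"))
#       -> ['STRUCT_GROUP(int a; int b;)']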
394