xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 024e200e2a89d71dceff7d1bff8ae77b145726e0)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9
10Please notice that the code here may rise exceptions to indicate bad
11usage inside kdoc to indicate problems at the replace pattern.
12
13Other errors are logged via log instance.
14"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all message logs.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})

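# A minimal usage sketch (illustrative only; the logger name "kdoc" and the
# prefix "struct foo: " are made-up examples):
#
#     tokenizer_set_log(logging.getLogger("kdoc"), prefix="struct foo: ")
#
# After this call, messages issued through this module's ``log`` instance
# should come out as e.g. "struct foo: Unexpected token ...".
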
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

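# Illustrative example: a NAME token for a made-up identifier "foo" at
# position 0 would be represented as
#
#     CToken(CToken.NAME, "foo", 0, (0, 0, 0))
#
# where the last tuple is the (bracket, parenthesis, brace) nesting level.
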
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different from .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH,r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Match the start of a C comment, including any following whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST when this module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


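# For illustration: the scanner built above is a single alternation of named
# groups, roughly
#
#     (?P<COMMENT>//[^\n]*|/\*[\s\S]*?\*/)|(?P<STRING>"(?:\\.|[^"\\])*")|...
#
# so match.lastgroup directly gives the CToken kind name of each match.
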
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source`` using the RE_SCANNER regular expression.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via an iterator,
        # we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results in a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        in ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it for completeness, as we might
        # end up re-using this tokenizer outside kernel-doc some day - or we
        # may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' at pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

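    # Illustrative sketch (the C snippet is a made-up example): tokenizing
    #
    #     CTokenizer("int x = 1;").tokens
    #
    # should produce CToken objects whose kinds are, in order: NAME("int"),
    # SPACE, NAME("x"), SPACE, OP("="), SPACE, NUMBER("1"), ENDSTMT(";"),
    # all at nesting level (0, 0, 0).
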
    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out

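# Illustrative sketch (the C snippet is a made-up example): when converted to
# a string, comments are dropped and members after a "private:" marker are
# hidden, so
#
#     str(CTokenizer("struct s { int a; /* private: */ int b; };"))
#
# should evaluate to "struct s { int a; };".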

class CTokenArgs:
    """
    Ancillary class to help using backrefs on sub matches.

    If the highest backref contains a "+" at its end,
    the logic will be greedy, picking up all remaining delimiters.

    This is needed to parse struct_group macros, which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokenizer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ...``\n``

        It also accepts a ``+`` character after the highest backref. When
        used, it means in practice that delimiters after it are ignored,
        i.e. the match is greedy.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, an error
        will be logged. Please notice that, in C, square brackets don't
        have any separator inside them, so trying to use ``\1``..``\n``
        for brackets is also reported as an error.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Expand the sub pattern, replacing each backref with the matching
        group tokens from ``new_tokenizer``, and return the new token list.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokenizer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [ new_tok ]
            else:
                new.tokens += [ tok ]

        return new.tokens

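# Illustrative sketch (the macro name "PAIR" and its arguments are made-up
# examples): with a comma-delimited match, \1 and \2 pick the first and
# second argument, so
#
#     args = CTokenArgs(r"\2 = \1")
#     str(CTokenizer(args.tokens(CTokenizer("PAIR(a,b)"))))
#
# should evaluate to "b = a".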

class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regular expression features are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses when
    searching for STRUCT_GROUP().

    This class counts pairs of delimiters, using them to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match three
    types of delimiters. The logic checks if delimiters are paired. If not,
    it will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex match that is followed by a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unbalanced delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token, without a BEGIN/END block
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If there is no END that zeroes the level, return the remaining
        # tokens. This is meant to solve cases where the caller logic might
        # be picking an incomplete block.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

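    # Illustrative sketch (the "foo" call is a made-up example): searching for
    # "foo" followed by "(" yields each balanced call, including nested
    # parentheses, so
    #
    #     list(CMatch(r"foo").search("foo(a, bar(b), c); x;"))
    #
    # should evaluate to ["foo(a, bar(b), c)"].
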
    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiters stripped.

        If count is different from zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches when
        # substituting. We may need to add some extra unit tests to check
        # if those would cause problems. When replacing by "", this should
        # not be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

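    # Illustrative sketch (the "foo" call is a made-up example): \0 expands to
    # the contents of the matched paired block, so
    #
    #     CMatch(r"foo").sub(r"BAR(\0)", "foo(a, b); x;")
    #
    # should evaluate to "BAR(a, b); x;".
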
    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'
