#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do the matching for kernel-doc.

Please note that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems at the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
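
    Illustrative usage (the logger name and prefix are arbitrary)::

        tokenizer_set_log(logging.getLogger("kdoc"), prefix="c_lex: ")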
29    """
30    global log
31
32    class PrefixAdapter(logging.LoggerAdapter):
33        """
34        Ancillary class to set prefix on all message logs.
35        """
36        def process(self, msg, kwargs):
37            return f"{prefix}{msg}", kwargs
38
39    # Wrap the provided logger in our adapter
40    log = PrefixAdapter(logger, {"prefix": prefix})
41
class CToken:
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

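#
# Illustrative example (a sketch, not from the original docs): a comma
# found at offset 10, one level deep inside parentheses, could be built
# and printed as:
#
#   tok = CToken(CToken.PUNC, ",", 10, paren_level=1)
#   repr(tok)   # -> 'CToken(CToken.PUNC, ",", 10, (0, 1, 0))'
#
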
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S]* is different from .*, as it also matches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once from RE_SCANNER_LIST at module load time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
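
#
# Illustrative example (a sketch of the scanner's output): feeding
# "int x = 5;" to RE_SCANNER.finditer() produces, in order:
#
#   NAME("int"), SPACE, NAME("x"), SPACE, OP("="), SPACE,
#   NUMBER("5"), ENDSTMT(";")
#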


class CTokenizer:
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles
    public/private markers, respecting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Create a tokenizer for ``source``, using the RE_SCANNER regex.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator,
        # we'll need to run the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results in a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we
        # might end up re-using this tokenizer outside kernel-doc some day -
        # or we may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' at pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indentation
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out

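#
# Illustrative usage (a sketch; exact whitespace depends on the input):
# converting a tokenizer back to a string drops comments and honors the
# "/* private: */" and "/* public: */" markers:
#
#   t = CTokenizer("struct a { /* private: */ int x; /* public: */ int y; };")
#   str(t)    # -> roughly "struct a { int y; };"
#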

class CTokenArgs:
    r"""
    Ancillary class to help using backrefs from sub matches.

    If the highest backref has a ``+`` appended (e.g. ``\4+``), the
    logic will be greedy, picking up all remaining delimited groups.

    This is needed to parse struct_group macros which end with
    ``MEMBERS...``.
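
    Example (illustrative; the macro is just for context): with a replace
    pattern like ``\2+``, ``\2`` greedily absorbs the second and all
    following arguments of something like ``struct_group(a, b, c, d)``.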
313    """
314    def __init__(self, sub_str):
315        self.sub_groups = set()
316        self.max_group = -1
317        self.greedy = None
318
319        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
320            group = int(m.group(1))
321            if m.group(2) == "+":
322                if self.greedy and self.greedy != group:
323                    raise ValueError("There are multiple greedy patterns!")
324                self.greedy = group
325
326            self.sub_groups.add(group)
327            self.max_group = max(self.max_group, group)
328
329        if self.greedy:
330            if self.greedy != self.max_group:
331                raise ValueError("Greedy pattern is not the last one!")
332
333            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)
334
335        self.sub_str = sub_str
336        self.sub_tokeninzer = CTokenizer(sub_str)
337
    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character after the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        log an error. Please notice that, in C, square brackets don't
        have any separator inside them. Trying to use ``\1``..``\n`` for
        brackets also logs an error.
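
        Example (illustrative): for ``FOO(a, b, c)`` and a replace pattern
        using backrefs up to ``\2``, group ``\1`` holds the tokens of ``a``
        and ``\2`` those of ``b``; ``c`` is never collected.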
355        """
356
357        level = (0, 0, 0)
358
359        if self.max_group < 0:
360            return level, []
361
362        tokens = new_tokenizer.tokens
363
364        #
365        # Fill \0 with the full token contents
366        #
367        groups_list = [ [] ]
368
369        if 0 in self.sub_groups:
370            inner_level = 0
371
372            for i in range(0, len(tokens)):
373                tok = tokens[i]
374
375                if tok.kind == CToken.BEGIN:
376                    inner_level += 1
377
378                    #
379                    # Discard first begin
380                    #
381                    if not groups_list[0]:
382                        continue
383                elif tok.kind == CToken.END:
384                    inner_level -= 1
385                    if inner_level < 0:
386                        break
387
388                if inner_level:
389                    groups_list[0].append(tok)
390
391        if not self.max_group:
392            return level, groups_list
393
394        delim = None
395
396        #
397        # Ignore everything before BEGIN. The value of begin gives the
398        # delimiter to be used for the matches
399        #
400        for i in range(0, len(tokens)):
401            tok = tokens[i]
402            if tok.kind == CToken.BEGIN:
403                if tok.value == "{":
404                    delim = ";"
405                elif tok.value == "(":
406                    delim = ","
407                else:
408                    self.log.error(fr"Can't handle \1..\n on {sub_str}")
409
410                level = tok.level
411                break
412
413        pos = 1
414        groups_list.append([])
415
416        inner_level = 0
417        for i in range(i + 1, len(tokens)):
418            tok = tokens[i]
419
420            if tok.kind == CToken.BEGIN:
421                inner_level += 1
422            if tok.kind == CToken.END:
423                inner_level -= 1
424                if inner_level < 0:
425                    break
426
427            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
428                pos += 1
429                if self.greedy and pos > self.max_group:
430                    pos -= 1
431                else:
432                    groups_list.append([])
433
434                    if pos > self.max_group:
435                        break
436
437                    continue
438
439            groups_list[pos].append(tok)
440
441        if pos < self.max_group:
442            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")
443
444        return level, groups_list
445
    def tokens(self, new_tokenizer):
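        """
        Expand the replace pattern against ``new_tokenizer``: each backref
        is substituted by its matched group, with nesting levels adjusted,
        and the resulting token list is returned.
        """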
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokenizer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens.append(new_tok)
            else:
                new.tokens.append(tok)

        return new.tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its plain re module, as several advanced
    regular expression features are missing there.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match 3
    types of delimiters. The logic checks if delimiters are paired. If
    not, it will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END: drop the stacked
                # entry and yield just the name part
                if i > start:
                    i -= 1
                yield start, i
                stack.pop()
                start = None
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, _ = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return the remaining stuff.
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
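
        Example (illustrative)::

            for m in CMatch(r"STRUCT_GROUP").search("STRUCT_GROUP(a, b);"):
                print(m)    # -> "STRUCT_GROUP(a, b)"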
576        """
577
578        if isinstance(source, CTokenizer):
579            tokenizer = source
580            is_token = True
581        else:
582            tokenizer = CTokenizer(source)
583            is_token = False
584
585        for start, end in self._search(tokenizer):
586            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])
587
588            if is_token:
589                yield new_tokenizer
590            else:
591                yield str(new_tokenizer)
592
    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub_str argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is nonzero, at most count occurrences are replaced.
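
        Example (illustrative)::

            cm = CMatch(r"STRUCT_GROUP")
            cm.sub(r"\0", "STRUCT_GROUP(a, b);")    # -> "a, b;"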
609        """
610        if isinstance(source, CTokenizer):
611            is_token = True
612            tokenizer = source
613        else:
614            is_token = False
615            tokenizer = CTokenizer(source)
616
617        # Detect if sub_str contains sub arguments
618
619        args_match = CTokenArgs(sub_str)
620
621        new_tokenizer = CTokenizer()
622        pos = 0
623        n = 0
624
625        #
626        # NOTE: the code below doesn't consider overlays at sub.
627        # We may need to add some extra unit tests to check if those
628        # would cause problems. When replacing by "", this should not
629        # be a problem, but other transformations could be problematic
630        #
631        for start, end in self._search(tokenizer):
632            new_tokenizer.tokens += tokenizer.tokens[pos:start]
633
634            new = CTokenizer(tokenizer.tokens[start:end + 1])
635
636            new_tokenizer.tokens += args_match.tokens(new)
637
638            pos = end + 1
639
640            n += 1
641            if count and n >= count:
642                break
643
644        new_tokenizer.tokens += tokenizer.tokens[pos:]
645
646        if not is_token:
647            return str(new_tokenizer)
648
649        return new_tokenizer
650
    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'