xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 9aaeb817ef4f794d1dbb8736332a64b5dae9521c)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9
10Please notice that the code here may rise exceptions to indicate bad
11usage inside kdoc to indicate problems at the replace pattern.
12
13Other errors are logged via log instance.
14"""
15
16import logging
17import re
18
19from copy import copy
20
21from .kdoc_re import KernRe
22
23log = logging.getLogger(__name__)
24
25
26class CToken():
27    """
28    Data class to define a C token.
29    """
30
31    # Tokens that can be used by the parser. Works like an C enum.
32
33    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
34    STRING = 1      #: A string, including quotation marks.
35    CHAR = 2        #: A character, including apostophes.
36    NUMBER = 3      #: A number.
37    PUNC = 4        #: A puntuation mark: / ``,`` / ``.``.
38    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
39    END = 6         #: A end character: ``}`` / ``]`` / ``)``.
40    CPP = 7         #: A preprocessor macro.
41    HASH = 8        #: The hash character - useful to handle other macros.
42    OP = 9          #: A C operator (add, subtract, ...).
43    STRUCT = 10     #: A ``struct`` keyword.
44    UNION = 11      #: An ``union`` keyword.
45    ENUM = 12       #: A ``struct`` keyword.
46    TYPEDEF = 13    #: A ``typedef`` keyword.
47    NAME = 14       #: A name. Can be an ID or a type.
48    SPACE = 15      #: Any space characters, including new lines
49    ENDSTMT = 16    #: End of an statement (``;``).
50
51    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.
52
53    MISMATCH = 255  #: an error indicator: should never happen in practice.
54
55    # Dict to convert from an enum interger into a string.
56    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}
57
58    # Dict to convert from string to an enum-like integer value.
59    _name_to_val = {k: v for v, k in _name_by_val.items()}
60
61    @staticmethod
62    def to_name(val):
63        """Convert from an integer value from CToken enum into a string"""
64
65        return CToken._name_by_val.get(val, f"UNKNOWN({val})")
66
67    @staticmethod
68    def from_name(name):
69        """Convert a string into a CToken enum value"""
70        if name in CToken._name_to_val:
71            return CToken._name_to_val[name]
72
73        return CToken.MISMATCH
74
75
76    def __init__(self, kind, value=None, pos=0,
77                 brace_level=0, paren_level=0, bracket_level=0):
78        self.kind = kind
79        self.value = value
80        self.pos = pos
81        self.level = (bracket_level, paren_level, brace_level)
82
83    def __repr__(self):
84        name = self.to_name(self.kind)
85        if isinstance(self.value, str):
86            value = '"' + self.value + '"'
87        else:
88            value = self.value
89
90        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
91
92#: Regexes to parse C code, transforming it into tokens.
93RE_SCANNER_LIST = [
94    #
95    # Note that \s\S is different than .*, as it also catches \n
96    #
97    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),
98
99    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
100    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),
101
102    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
103                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),
104
105    (CToken.ENDSTMT, r"(?:\s+;|;)"),
106
107    (CToken.PUNC,    r"[,\.]"),
108
109    (CToken.BEGIN,   r"[\[\(\{]"),
110
111    (CToken.END,     r"[\]\)\}]"),
112
113    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),
114
115    (CToken.HASH,    r"#"),
116
117    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
118                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),
119
120    (CToken.STRUCT,  r"\bstruct\b"),
121    (CToken.UNION,   r"\bunion\b"),
122    (CToken.ENUM,    r"\benum\b"),
123    (CToken.TYPEDEF, r"\btypedef\b"),
124
125    (CToken.NAME,    r"[A-Za-z_]\w*"),
126
127    (CToken.SPACE,   r"\s+"),
128
129    (CToken.BACKREF, r"\\\d+"),
130
131    (CToken.MISMATCH,r"."),
132]
133
134def fill_re_scanner(token_list):
135    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
136    re_tokens = []
137
138    for kind, pattern in token_list:
139        name = CToken.to_name(kind)
140        re_tokens.append(f"(?P<{name}>{pattern})")
141
142    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
143
144#: Handle C continuation lines.
145RE_CONT = KernRe(r"\\\n")
146
147RE_COMMENT_START = KernRe(r'/\*\s*')
148
149#: tokenizer regex. Will be filled at the first CTokenizer usage.
150RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
151
152
153class CTokenizer():
154    """
155    Scan C statements and definitions and produce tokens.
156
157    When converted to string, it drops comments and handle public/private
158    values, respecting depth.
159    """
160
161    # This class is inspired and follows the basic concepts of:
162    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer
163
164    def __init__(self, source=None, log=None):
165        """
166        Create a regular expression to handle RE_SCANNER_LIST.
167
168        While I generally don't like using regex group naming via:
169            (?P<name>...)
170
171        in this particular case, it makes sense, as we can pick the name
172        when matching a code via RE_SCANNER.
173        """
174
175        self.tokens = []
176
177        if not source:
178            return
179
180        if isinstance(source, list):
181            self.tokens = source
182            return
183
184        #
185        # While we could just use _tokenize directly via interator,
186        # As we'll need to use the tokenizer several times inside kernel-doc
187        # to handle macro transforms, cache the results on a list, as
188        # re-using it is cheaper than having to parse everytime.
189        #
190        for tok in self._tokenize(source):
191            self.tokens.append(tok)
192
193    def _tokenize(self, source):
194        """
195        Iterator that parses ``source``, splitting it into tokens, as defined
196        at ``self.RE_SCANNER_LIST``.
197
198        The interactor returns a CToken class object.
199        """
200
201        # Handle continuation lines. Note that kdoc_parser already has a
202        # logic to do that. Still, let's keep it for completeness, as we might
203        # end re-using this tokenizer outsize kernel-doc some day - or we may
204        # eventually remove from there as a future cleanup.
205        source = RE_CONT.sub("", source)
206
207        brace_level = 0
208        paren_level = 0
209        bracket_level = 0
210
211        for match in RE_SCANNER.finditer(source):
212            kind = CToken.from_name(match.lastgroup)
213            pos = match.start()
214            value = match.group()
215
216            if kind == CToken.MISMATCH:
217                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
218            elif kind == CToken.BEGIN:
219                if value == '(':
220                    paren_level += 1
221                elif value == '[':
222                    bracket_level += 1
223                else:  # value == '{'
224                    brace_level += 1
225
226            elif kind == CToken.END:
227                if value == ')' and paren_level > 0:
228                    paren_level -= 1
229                elif value == ']' and bracket_level > 0:
230                    bracket_level -= 1
231                elif brace_level > 0:    # value == '}'
232                    brace_level -= 1
233
234            yield CToken(kind, value, pos,
235                         brace_level, paren_level, bracket_level)
236
237    def __str__(self):
238        out=""
239        show_stack = [True]
240
241        for i, tok in enumerate(self.tokens):
242            if tok.kind == CToken.BEGIN:
243                show_stack.append(show_stack[-1])
244
245            elif tok.kind == CToken.END:
246                prev = show_stack[-1]
247                if len(show_stack) > 1:
248                    show_stack.pop()
249
250                if not prev and show_stack[-1]:
251                    #
252                    # Try to preserve indent
253                    #
254                    out += "\t" * (len(show_stack) - 1)
255
256                    out += str(tok.value)
257                    continue
258
259            elif tok.kind == CToken.COMMENT:
260                comment = RE_COMMENT_START.sub("", tok.value)
261
262                if comment.startswith("private:"):
263                    show_stack[-1] = False
264                    show = False
265                elif comment.startswith("public:"):
266                    show_stack[-1] = True
267
268                continue
269
270            if not show_stack[-1]:
271                continue
272
273            if i < len(self.tokens) - 1:
274                next_tok = self.tokens[i + 1]
275
276                # Do some cleanups before ";"
277
278                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
279                    continue
280
281                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
282                    continue
283
284            out += str(tok.value)
285
286        return out
287
288
289class CTokenArgs:
290    """
291    Ancillary class to help using backrefs from sub matches.
292
293    If the highest backref contain a "+" at the last element,
294    the logic will be greedy, picking all other delims.
295
296    This is needed to parse struct_group macros with end with ``MEMBERS...``.
297    """
298    def __init__(self, sub_str):
299        self.sub_groups = set()
300        self.max_group = -1
301        self.greedy = None
302
303        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
304            group = int(m.group(1))
305            if m.group(2) == "+":
306                if self.greedy and self.greedy != group:
307                    raise ValueError("There are multiple greedy patterns!")
308                self.greedy = group
309
310            self.sub_groups.add(group)
311            self.max_group = max(self.max_group, group)
312
313        if self.greedy:
314            if self.greedy != self.max_group:
315                raise ValueError("Greedy pattern is not the last one!")
316
317            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)
318
319        self.sub_str = sub_str
320        self.sub_tokeninzer = CTokenizer(sub_str)
321
322    def groups(self, new_tokenizer):
323        """
324        Create replacement arguments for backrefs like:
325
326        ``\0``, ``\1``, ``\2``, ...``\n``
327
328        It also accepts a ``+`` character to the highest backref. When used,
329        it means in practice to ignore delimins after it, being greedy.
330
331        The logic is smart enough to only go up to the maximum required
332        argument, even if there are more.
333
334        If there is a backref for an argument above the limit, it will
335        raise an exception. Please notice that, on C, square brackets
336        don't have any separator on it. Trying to use ``\1``..``\n`` for
337        brackets also raise an exception.
338        """
339
340        level = (0, 0, 0)
341
342        if self.max_group < 0:
343            return level, []
344
345        tokens = new_tokenizer.tokens
346
347        #
348        # Fill \0 with the full token contents
349        #
350        groups_list = [ [] ]
351
352        if 0 in self.sub_groups:
353            inner_level = 0
354
355            for i in range(0, len(tokens)):
356                tok = tokens[i]
357
358                if tok.kind == CToken.BEGIN:
359                    inner_level += 1
360
361                    #
362                    # Discard first begin
363                    #
364                    if not groups_list[0]:
365                        continue
366                elif tok.kind == CToken.END:
367                    inner_level -= 1
368                    if inner_level < 0:
369                        break
370
371                if inner_level:
372                    groups_list[0].append(tok)
373
374        if not self.max_group:
375            return level, groups_list
376
377        delim = None
378
379        #
380        # Ignore everything before BEGIN. The value of begin gives the
381        # delimiter to be used for the matches
382        #
383        for i in range(0, len(tokens)):
384            tok = tokens[i]
385            if tok.kind == CToken.BEGIN:
386                if tok.value == "{":
387                    delim = ";"
388                elif tok.value == "(":
389                    delim = ","
390                else:
391                    self.log.error(fr"Can't handle \1..\n on {sub_str}")
392
393                level = tok.level
394                break
395
396        pos = 1
397        groups_list.append([])
398
399        inner_level = 0
400        for i in range(i + 1, len(tokens)):
401            tok = tokens[i]
402
403            if tok.kind == CToken.BEGIN:
404                inner_level += 1
405            if tok.kind == CToken.END:
406                inner_level -= 1
407                if inner_level < 0:
408                    break
409
410            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
411                pos += 1
412                if self.greedy and pos > self.max_group:
413                    pos -= 1
414                else:
415                    groups_list.append([])
416
417                    if pos > self.max_group:
418                        break
419
420                    continue
421
422            groups_list[pos].append(tok)
423
424        if pos < self.max_group:
425            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")
426
427        return level, groups_list
428
429    def tokens(self, new_tokenizer):
430        level, groups = self.groups(new_tokenizer)
431
432        new = CTokenizer()
433
434        for tok in self.sub_tokeninzer.tokens:
435            if tok.kind == CToken.BACKREF:
436                group = int(tok.value[1:])
437
438                for group_tok in groups[group]:
439                    new_tok = copy(group_tok)
440
441                    new_level = [0, 0, 0]
442
443                    for i in range(0, len(level)):
444                        new_level[i] = new_tok.level[i] + level[i]
445
446                    new_tok.level = tuple(new_level)
447
448                    new.tokens += [ new_tok ]
449            else:
450                new.tokens += [ tok ]
451
452        return new.tokens
453
454
455class CMatch:
456    """
457    Finding nested delimiters is hard with regular expressions. It is
458    even harder on Python with its normal re module, as there are several
459    advanced regular expressions that are missing.
460
461    This is the case of this pattern::
462
463            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
464
465    which is used to properly match open/close parentheses of the
466    string search STRUCT_GROUP(),
467
468    Add a class that counts pairs of delimiters, using it to match and
469    replace nested expressions.
470
471    The original approach was suggested by:
472
473        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
474
475    Although I re-implemented it to make it more generic and match 3 types
476    of delimiters. The logic checks if delimiters are paired. If not, it
477    will ignore the search string.
478    """
479
480
481    def __init__(self, regex, delim="("):
482        self.regex = KernRe("^" + regex + r"\b")
483        self.start_delim = delim
484
485    def _search(self, tokenizer):
486        """
487        Finds paired blocks for a regex that ends with a delimiter.
488
489        The suggestion of using finditer to match pairs came from:
490        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
491        but I ended using a different implementation to align all three types
492        of delimiters and seek for an initial regular expression.
493
494        The algorithm seeks for open/close paired delimiters and places them
495        into a stack, yielding a start/stop position of each match when the
496        stack is zeroed.
497
498        The algorithm should work fine for properly paired lines, but will
499        silently ignore end delimiters that precede a start delimiter.
500        This should be OK for kernel-doc parser, as unaligned delimiters
501        would cause compilation errors. So, we don't need to raise exceptions
502        to cover such issues.
503        """
504
505        start = None
506        started = False
507
508        import sys
509
510        stack = []
511
512        for i, tok in enumerate(tokenizer.tokens):
513            if start is None:
514                if tok.kind == CToken.NAME and self.regex.match(tok.value):
515                    start = i
516                    stack.append((start, tok.level))
517                    started = False
518
519                continue
520
521            if not started:
522                if tok.kind == CToken.SPACE:
523                    continue
524
525                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
526                    started = True
527                    continue
528
529                # Name only token without BEGIN/END
530                if i > start:
531                    i -= 1
532                yield start, i
533                start = None
534
535            if tok.kind == CToken.END and tok.level == stack[-1][1]:
536                start, level = stack.pop()
537
538                yield start, i
539                start = None
540
541        #
542        # If an END zeroing levels is not there, return remaining stuff
543        # This is meant to solve cases where the caller logic might be
544        # picking an incomplete block.
545        #
546        if start and stack:
547            if started:
548                s = str(tokenizer)
549                log.warning(f"can't find a final end at {s}")
550
551            yield start, len(tokenizer.tokens)
552
553    def search(self, source):
554        """
555        This is similar to re.search:
556
557        It matches a regex that it is followed by a delimiter,
558        returning occurrences only if all delimiters are paired.
559        """
560
561        if isinstance(source, CTokenizer):
562            tokenizer = source
563            is_token = True
564        else:
565            tokenizer = CTokenizer(source)
566            is_token = False
567
568        for start, end in self._search(tokenizer):
569            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])
570
571            if is_token:
572                yield new_tokenizer
573            else:
574                yield str(new_tokenizer)
575
576    def sub(self, sub_str, source, count=0):
577        """
578        This is similar to re.sub:
579
580        It matches a regex that it is followed by a delimiter,
581        replacing occurrences only if all delimiters are paired.
582
583        if the sub argument contains::
584
585            r'\0'
586
587        it will work just like re: it places there the matched paired data
588        with the delimiter stripped.
589
590        If count is different than zero, it will replace at most count
591        items.
592        """
593        if isinstance(source, CTokenizer):
594            is_token = True
595            tokenizer = source
596        else:
597            is_token = False
598            tokenizer = CTokenizer(source)
599
600        # Detect if sub_str contains sub arguments
601
602        args_match = CTokenArgs(sub_str)
603
604        new_tokenizer = CTokenizer()
605        pos = 0
606        n = 0
607
608        #
609        # NOTE: the code below doesn't consider overlays at sub.
610        # We may need to add some extra unit tests to check if those
611        # would cause problems. When replacing by "", this should not
612        # be a problem, but other transformations could be problematic
613        #
614        for start, end in self._search(tokenizer):
615            new_tokenizer.tokens += tokenizer.tokens[pos:start]
616
617            new = CTokenizer(tokenizer.tokens[start:end + 1])
618
619            new_tokenizer.tokens += args_match.tokens(new)
620
621            pos = end + 1
622
623            n += 1
624            if count and n >= count:
625                break
626
627        new_tokenizer.tokens += tokenizer.tokens[pos:]
628
629        if not is_token:
630            return str(new_tokenizer)
631
632        return new_tokenizer
633
634    def __repr__(self):
635        """
636        Returns a displayable version of the class init.
637        """
638
639        return f'CMatch("{self.regex.regex.pattern}")'
640