xref: /linux/tools/lib/python/kdoc/c_lex.py (revision df50e848f67523195ee0b4c6d2c01823e36a15e7)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9
Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, such as problems at the replace pattern.
12
13Other errors are logged via log instance.
14"""
15
16import logging
17import re
18
19from .kdoc_re import KernRe
20
21log = logging.getLogger(__name__)
22
23
class CToken():
    """
    Data class to define a C token.

    Each instance stores the token ``kind`` (one of the enum-like integer
    constants below), its ``value`` (the matched text), the ``pos`` where
    it was found and a ``level`` tuple with the nesting depth, ordered as
    ``(bracket_level, paren_level, brace_level)``.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # Names starting with an underscore are skipped on purpose: the class
    # namespace may carry interpreter-provided integers (Python 3.13+ stores
    # ``__firstlineno__`` there), which are not token kinds.
    _name_by_val = {v: k for k, v in dict(vars()).items()
                    if isinstance(v, int) and not k.startswith("_")}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """
        Convert an integer value from the CToken enum into a string.

        Values that don't belong to the enum are returned as
        ``UNKNOWN(<val>)``.
        """

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Names that don't belong to the enum return ``CToken.MISMATCH``.
        """

        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store the token data.

        Note that the nesting depths are packed at ``self.level`` in a
        different order than the arguments:
        ``(bracket_level, paren_level, brace_level)``.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return a constructor-like representation, useful for debugging."""
        name = self.to_name(self.kind)

        # String values are shown between double quotes, without escaping
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
89
#: Regexes to parse C code, transforming it into tokens.
#:
#: The entries are joined, in this order, into a single alternation, so the
#: first pattern that matches at a given position wins. Keep the more
#: specific patterns before the generic ones.
RE_SCANNER_LIST = [
    #
    # [\s\S] matches any character, including \n, no matter what regex
    # flags are used. Comments come before OP, as otherwise "/" would be
    # taken as a division operator.
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    #
    # Hexadecimal, octal and decimal/float constants. All of them accept
    # unsigned and long suffixes, so things like ``1UL`` are a single token.
    #
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlLuU]*"),

    # Must come before SPACE: whitespaces just before ";" are part of it
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    # Must come before HASH, which catches any other "#"
    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    # Longer operators first, as otherwise "<<=" would be split as "<<", "="
    (CToken.OP,      r"<<=|>>=|\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    # Keywords must come before NAME, which would also match them
    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all for a single character. Must be the last one.
    (CToken.MISMATCH,r"."),
]
131
def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    Each ``(kind, pattern)`` pair from ``token_list`` becomes a named group,
    whose name is the string returned by ``CToken.to_name(kind)``. The
    groups are joined with ``|``, in the same order as ``token_list``, and
    the result is compiled via ``KernRe`` with ``re.MULTILINE`` and
    ``re.DOTALL`` flags.
    """
    alternatives = "|".join(f"(?P<{CToken.to_name(kind)}>{pattern})"
                            for kind, pattern in token_list)

    flags = re.MULTILINE | re.DOTALL

    return KernRe(alternatives, flags)
141
#: Handle C continuation lines: a backslash immediately followed by a newline.
RE_CONT = KernRe(r"\\\n")

#: Match the ``/*`` that opens a C comment, plus any whitespace after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex. Built once, when this module is loaded, from
#: RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
149
150
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handle public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source``, caching the result at ``self.tokens``.

        :param source: either a string with the C code to be tokenized or
                       a list of already-parsed tokens, which is stored
                       as-is (it is not copied). When empty or ``None``,
                       the token list is empty.
        :param log:    optional logger used to report unexpected tokens.
                       When not specified, the module logger is used.
        """

        self.tokens = []

        #
        # The ``log`` argument shadows the module-level logger here, so
        # pick the default one by name.
        #
        self.log = log if log is not None else logging.getLogger(__name__)

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator returns a CToken class object for every match,
        including ``CToken.MISMATCH`` ones, which are also logged as errors.

        The nesting levels stored on each token are the ones *after*
        handling the token itself: a begin character carries the already
        incremented depth, and an end character the already decremented one.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                self.log.error("Unexpected token '%s' on pos %s:\n\t'%s'",
                               value, pos, source)
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Pick the counter by the character itself: an unbalanced
                # ")" or "]" shall not touch the depth of the other
                # delimiters. Counters never go below zero.
                #
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Convert the token list back into a string.

        Comments are dropped. A comment starting with ``private:`` hides
        everything that follows it at the current nesting depth (and at
        the inner ones), until a comment starting with ``public:`` or
        the end character that closes the current depth.
        """

        out = []

        #
        # One visibility flag per nesting depth. Each begin character
        # inherits the visibility of the depth where it is located.
        #
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # The block ended while hidden, but the outer depth
                    # is visible: output the end character, trying to
                    # preserve indent
                    #
                    out.append("\t" * (len(show_stack) - 1))
                    out.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                #
                # Do some cleanups before ";": drop spaces and duplicated
                # end of statements. Note that ";" is tokenized as
                # ENDSTMT, and not as PUNC.
                #
                if (next_tok.kind == CToken.ENDSTMT and
                        tok.kind in (CToken.SPACE, CToken.ENDSTMT)):
                    continue

            out.append(str(tok.value))

        return "".join(out)
293