xref: /linux/tools/lib/python/kdoc/c_lex.py (revision df50e848f67523195ee0b4c6d2c01823e36a15e7)
1*df50e848SMauro Carvalho Chehab#!/usr/bin/env python3
2*df50e848SMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3*df50e848SMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4*df50e848SMauro Carvalho Chehab
5*df50e848SMauro Carvalho Chehab"""
6*df50e848SMauro Carvalho ChehabRegular expression ancillary classes.
7*df50e848SMauro Carvalho Chehab
8*df50e848SMauro Carvalho ChehabThose help caching regular expressions and do matching for kernel-doc.
9*df50e848SMauro Carvalho Chehab
Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, signaling problems with the replace pattern.
12*df50e848SMauro Carvalho Chehab
13*df50e848SMauro Carvalho ChehabOther errors are logged via log instance.
14*df50e848SMauro Carvalho Chehab"""
15*df50e848SMauro Carvalho Chehab
16*df50e848SMauro Carvalho Chehabimport logging
17*df50e848SMauro Carvalho Chehabimport re
18*df50e848SMauro Carvalho Chehab
19*df50e848SMauro Carvalho Chehabfrom .kdoc_re import KernRe
20*df50e848SMauro Carvalho Chehab
21*df50e848SMauro Carvalho Chehablog = logging.getLogger(__name__)
22*df50e848SMauro Carvalho Chehab
23*df50e848SMauro Carvalho Chehab
class CToken():
    """
    Data class to define a C token.

    A token carries its kind (one of the enum-like integers below), the
    matched text, its position in the source and the nesting levels at
    the point where it was produced.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Built from the class namespace at class-creation time, so it picks
    # up exactly the int constants defined above.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        # Unknown values are reported, not raised, to keep callers simple
        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        # Unknown names map to the error indicator
        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store a token.

        :param kind: one of the CToken enum-like integers above.
        :param value: the matched source text (usually a string).
        :param pos: offset of the match within the scanned source.
        :param brace_level: ``{}`` nesting depth at this token.
        :param paren_level: ``()`` nesting depth at this token.
        :param bracket_level: ``[]`` nesting depth at this token.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        # Levels are packed as (bracket, paren, brace) - note the order
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return a representation that mirrors the constructor call."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
89*df50e848SMauro Carvalho Chehab
#: Regexes to parse C code, transforming it into tokens.
#: Order matters: alternatives are tried left to right, so more specific
#: patterns must come before more general ones (CPP before HASH, keywords
#: before NAME, MISMATCH last as a single-character catch-all).
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    # Hex, then octal, then decimal/float literals, with C suffixes
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # Statement end, optionally absorbing whitespace before the ';'
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    # Preprocessor directives; must precede HASH to win the match
    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    # Multi-character operators first, then single-character ones
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    # Not C: backreferences like \1, used by kdoc sub regex patterns
    (CToken.BACKREF, r"\\\d+"),

    # Catch-all for any single character not matched above
    (CToken.MISMATCH,r"."),
]
131*df50e848SMauro Carvalho Chehab
def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""

    # Wrap each pattern in a named group so a match can be mapped back
    # to its CToken kind via match.lastgroup.
    named_groups = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                    for kind, pattern in token_list]

    return KernRe("|".join(named_groups), re.MULTILINE | re.DOTALL)
141*df50e848SMauro Carvalho Chehab
#: Handle C continuation lines (a backslash followed by a newline).
RE_CONT = KernRe(r"\\\n")

#: Matches a C comment opener plus any whitespace right after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
149*df50e848SMauro Carvalho Chehab
150*df50e848SMauro Carvalho Chehab
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source`` and cache the resulting CToken list.

        :param source: either a C source string to be tokenized, or an
            already-built list of CToken objects to be used as-is.
        :param log: unused here; kept for backward compatibility with
            callers that pass a logger.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Each closer only decrements its own counter, clamped at
                # zero, so a stray ')' or ']' on unbalanced input can no
                # longer fall through and corrupt the brace level.
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Return the token stream as C code, dropping comments and hiding
        everything between a ``private:`` comment and either a matching
        ``public:`` comment or the end of the enclosing block.
        """
        out = ""

        # Visibility flag per nesting depth; the top level starts visible
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # A new scope inherits the current visibility
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                # Comments never make it into the output
                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"
                #
                # NOTE(review): the scanner emits ';' as ENDSTMT, not
                # PUNC, so these PUNC-';' checks may never match tokens
                # produced by _tokenize - confirm whether they should be
                # testing ENDSTMT instead.

                if (tok.kind == CToken.SPACE and
                    next_tok.kind == CToken.PUNC and
                    next_tok.value == ";"):

                    continue

                if (tok.kind == CToken.PUNC and
                    tok.value == ";" and
                    next_tok.kind == CToken.PUNC and
                    next_tok.value == ";"):

                    continue

            out += str(tok.value)

        return out
293