xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 024e200e2a89d71dceff7d1bff8ae77b145726e0)
1df50e848SMauro Carvalho Chehab#!/usr/bin/env python3
2df50e848SMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3df50e848SMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4df50e848SMauro Carvalho Chehab
5df50e848SMauro Carvalho Chehab"""
6df50e848SMauro Carvalho ChehabRegular expression ancillary classes.
7df50e848SMauro Carvalho Chehab
8df50e848SMauro Carvalho ChehabThose help caching regular expressions and do matching for kernel-doc.
9df50e848SMauro Carvalho Chehab
Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems at the replace pattern.
12df50e848SMauro Carvalho Chehab
13df50e848SMauro Carvalho ChehabOther errors are logged via log instance.
14df50e848SMauro Carvalho Chehab"""
15df50e848SMauro Carvalho Chehab
16df50e848SMauro Carvalho Chehabimport logging
17df50e848SMauro Carvalho Chehabimport re
18df50e848SMauro Carvalho Chehab
199aaeb817SMauro Carvalho Chehabfrom copy import copy
209aaeb817SMauro Carvalho Chehab
21df50e848SMauro Carvalho Chehabfrom .kdoc_re import KernRe
22df50e848SMauro Carvalho Chehab
23df50e848SMauro Carvalho Chehablog = logging.getLogger(__name__)
24df50e848SMauro Carvalho Chehab
def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.

    :param logger: logging.Logger instance to be wrapped.
    :param prefix: string prepended to every emitted log message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set prefix on all message logs.
        """
        def process(self, msg, kwargs):
            # Read the prefix from the adapter's extra dict: the original
            # code passed {"prefix": prefix} but then ignored it, relying
            # on the closure variable instead. Behavior is identical, but
            # now the extra dict is actually the source of truth.
            return f"{self.extra['prefix']}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})
41df50e848SMauro Carvalho Chehab
class CToken():
    """
    Data class to define a C token.

    Each token stores its kind (one of the enum-like constants below),
    the matched text, its position inside the scanned string, and the
    bracket/parenthesis/brace nesting levels where it was found.
    """

    # Tokens that can be used by the parser. Works like an C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: / ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: A end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Note: vars() here is the class namespace as seen while the class
    # body is being executed, so only the int constants above are caught.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """Store the token kind, matched value, position and nest levels."""
        self.kind = kind
        self.value = value
        self.pos = pos
        # Nesting levels are packed as (bracket, paren, brace)
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return a representation that can rebuild an equivalent token."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
107df50e848SMauro Carvalho Chehab
#: Regexes to parse C code, transforming it into tokens.
#: Order matters: the patterns are joined with ``|`` by fill_re_scanner(),
#: so earlier entries take precedence over later ones.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    # Hex and octal (with integer suffixes), then decimal/float numbers
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # A ";", optionally absorbing the whitespace that precedes it
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    # Must come before HASH so "#define" & friends win over a bare "#"
    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    # Multi-character operators first, then the single-character ones
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all: any single character not matched above is a mismatch
    (CToken.MISMATCH,r"."),
]
149df50e848SMauro Carvalho Chehab
def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""

    # Build one named group per token kind and join everything with "|",
    # so match.lastgroup identifies which token type matched.
    named_groups = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                    for kind, pattern in token_list]

    return KernRe("|".join(named_groups), re.MULTILINE | re.DOTALL)
159df50e848SMauro Carvalho Chehab
#: Handle C continuation lines (a backslash followed by a newline).
RE_CONT = KernRe(r"\\\n")

#: Matches the opening of a C comment, plus any whitespace after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex. Built once here, at module import time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
167df50e848SMauro Carvalho Chehab
168df50e848SMauro Carvalho Chehab
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source``, caching the resulting CToken list.

        :param source: a string with C code to be tokenized, or a list of
            already-parsed CToken objects that is reused as-is.
        :param log: unused. Kept only to stay backward compatible with
            callers that still pass a logger here.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at ``RE_SCANNER_LIST``.

        The iterator yields CToken class objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Never let levels go negative on unbalanced input
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Return the C code for the cached tokens, dropping comments and
        honoring ``private:`` / ``public:`` comment markers per nesting
        depth.
        """
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # A new scope inherits the visibility of its parent
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                # Emit the closing delimiter of a hidden scope when the
                # enclosing scope is itself visible
                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";": drop the space that precedes
                # it and collapse repeated ";" into a single one

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
303f1cf9f7cSMauro Carvalho Chehab
304f1cf9f7cSMauro Carvalho Chehab
class CTokenArgs:
    r"""
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        r"""
        Parse the backref patterns (``\1``, ``\2+``, ...) in ``sub_str``,
        recording which groups are used, the highest group number, and
        the optional greedy group.

        Raises ValueError when more than one greedy pattern is present,
        or when the greedy pattern is not the highest backref.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # Strip the "+" marker, keeping only the plain backref
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        # NOTE(review): attribute keeps its historical spelling
        # ("tokeninzer") to avoid breaking any external user.
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ...``\n``

        It also accepts a ``+`` character at the highest backref. When used,
        it means in practice to ignore delims after it, being greedy.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, an error is
        logged. Please notice that, on C, square brackets don't have any
        separator on them, so trying to use ``\1``..``\n`` for brackets
        also logs an error.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    # Bug fix: this used "self.log" (nonexistent attribute)
                    # and "sub_str" (undefined name here), crashing instead
                    # of reporting the problem.
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    # Greedy mode: keep filling the last group
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Expand the substitution pattern against ``new_tokenizer``: backref
        tokens are replaced by the matched groups (with nesting levels
        shifted to where the match happened); other tokens pass through.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    # Shift the copied token's nesting to the match's level
                    new_tok.level = tuple(new_tok.level[i] + level[i]
                                          for i in range(len(level)))

                    new.tokens.append(new_tok)
            else:
                new.tokens.append(tok)

        return new.tokens
4699aaeb817SMauro Carvalho Chehab
4709aaeb817SMauro Carvalho Chehab
471f1cf9f7cSMauro Carvalho Chehabclass CMatch:
472f1cf9f7cSMauro Carvalho Chehab    """
473f1cf9f7cSMauro Carvalho Chehab    Finding nested delimiters is hard with regular expressions. It is
474f1cf9f7cSMauro Carvalho Chehab    even harder on Python with its normal re module, as there are several
475f1cf9f7cSMauro Carvalho Chehab    advanced regular expressions that are missing.
476f1cf9f7cSMauro Carvalho Chehab
477f1cf9f7cSMauro Carvalho Chehab    This is the case of this pattern::
478f1cf9f7cSMauro Carvalho Chehab
479f1cf9f7cSMauro Carvalho Chehab            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
480f1cf9f7cSMauro Carvalho Chehab
481f1cf9f7cSMauro Carvalho Chehab    which is used to properly match open/close parentheses of the
482f1cf9f7cSMauro Carvalho Chehab    string search STRUCT_GROUP(),
483f1cf9f7cSMauro Carvalho Chehab
484f1cf9f7cSMauro Carvalho Chehab    Add a class that counts pairs of delimiters, using it to match and
485f1cf9f7cSMauro Carvalho Chehab    replace nested expressions.
486f1cf9f7cSMauro Carvalho Chehab
487f1cf9f7cSMauro Carvalho Chehab    The original approach was suggested by:
488f1cf9f7cSMauro Carvalho Chehab
489f1cf9f7cSMauro Carvalho Chehab        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
490f1cf9f7cSMauro Carvalho Chehab
491f1cf9f7cSMauro Carvalho Chehab    Although I re-implemented it to make it more generic and match 3 types
492f1cf9f7cSMauro Carvalho Chehab    of delimiters. The logic checks if delimiters are paired. If not, it
493f1cf9f7cSMauro Carvalho Chehab    will ignore the search string.
494f1cf9f7cSMauro Carvalho Chehab    """
495f1cf9f7cSMauro Carvalho Chehab
496f1cf9f7cSMauro Carvalho Chehab
    def __init__(self, regex, delim="("):
        """
        Store the search pattern and its block delimiter.

        :param regex: regular expression identifying the name to search;
            it is anchored at the start and terminated by a word boundary.
        :param delim: the open delimiter that starts the nested block.
        """
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim
500f1cf9f7cSMauro Carvalho Chehab
501f1cf9f7cSMauro Carvalho Chehab    def _search(self, tokenizer):
502f1cf9f7cSMauro Carvalho Chehab        """
503f1cf9f7cSMauro Carvalho Chehab        Finds paired blocks for a regex that ends with a delimiter.
504f1cf9f7cSMauro Carvalho Chehab
505f1cf9f7cSMauro Carvalho Chehab        The suggestion of using finditer to match pairs came from:
506f1cf9f7cSMauro Carvalho Chehab        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
507f1cf9f7cSMauro Carvalho Chehab        but I ended using a different implementation to align all three types
508f1cf9f7cSMauro Carvalho Chehab        of delimiters and seek for an initial regular expression.
509f1cf9f7cSMauro Carvalho Chehab
510f1cf9f7cSMauro Carvalho Chehab        The algorithm seeks for open/close paired delimiters and places them
511f1cf9f7cSMauro Carvalho Chehab        into a stack, yielding a start/stop position of each match when the
512f1cf9f7cSMauro Carvalho Chehab        stack is zeroed.
513f1cf9f7cSMauro Carvalho Chehab
514f1cf9f7cSMauro Carvalho Chehab        The algorithm should work fine for properly paired lines, but will
515f1cf9f7cSMauro Carvalho Chehab        silently ignore end delimiters that precede a start delimiter.
516f1cf9f7cSMauro Carvalho Chehab        This should be OK for kernel-doc parser, as unaligned delimiters
517f1cf9f7cSMauro Carvalho Chehab        would cause compilation errors. So, we don't need to raise exceptions
518f1cf9f7cSMauro Carvalho Chehab        to cover such issues.
519f1cf9f7cSMauro Carvalho Chehab        """
520f1cf9f7cSMauro Carvalho Chehab
521f1cf9f7cSMauro Carvalho Chehab        start = None
522f1cf9f7cSMauro Carvalho Chehab        started = False
523f1cf9f7cSMauro Carvalho Chehab
524f1cf9f7cSMauro Carvalho Chehab        import sys
525f1cf9f7cSMauro Carvalho Chehab
526f1cf9f7cSMauro Carvalho Chehab        stack = []
527f1cf9f7cSMauro Carvalho Chehab
528f1cf9f7cSMauro Carvalho Chehab        for i, tok in enumerate(tokenizer.tokens):
529f1cf9f7cSMauro Carvalho Chehab            if start is None:
530f1cf9f7cSMauro Carvalho Chehab                if tok.kind == CToken.NAME and self.regex.match(tok.value):
531f1cf9f7cSMauro Carvalho Chehab                    start = i
532f1cf9f7cSMauro Carvalho Chehab                    stack.append((start, tok.level))
533f1cf9f7cSMauro Carvalho Chehab                    started = False
534f1cf9f7cSMauro Carvalho Chehab
535f1cf9f7cSMauro Carvalho Chehab                continue
536f1cf9f7cSMauro Carvalho Chehab
5379aaeb817SMauro Carvalho Chehab            if not started:
5389aaeb817SMauro Carvalho Chehab                if tok.kind == CToken.SPACE:
5399aaeb817SMauro Carvalho Chehab                    continue
5409aaeb817SMauro Carvalho Chehab
5419aaeb817SMauro Carvalho Chehab                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
542f1cf9f7cSMauro Carvalho Chehab                    started = True
543f1cf9f7cSMauro Carvalho Chehab                    continue
544f1cf9f7cSMauro Carvalho Chehab
5459aaeb817SMauro Carvalho Chehab                # Name only token without BEGIN/END
5469aaeb817SMauro Carvalho Chehab                if i > start:
5479aaeb817SMauro Carvalho Chehab                    i -= 1
5489aaeb817SMauro Carvalho Chehab                yield start, i
5499aaeb817SMauro Carvalho Chehab                start = None
5509aaeb817SMauro Carvalho Chehab
551f1cf9f7cSMauro Carvalho Chehab            if tok.kind == CToken.END and tok.level == stack[-1][1]:
552f1cf9f7cSMauro Carvalho Chehab                start, level = stack.pop()
553f1cf9f7cSMauro Carvalho Chehab
5549aaeb817SMauro Carvalho Chehab                yield start, i
555f1cf9f7cSMauro Carvalho Chehab                start = None
556f1cf9f7cSMauro Carvalho Chehab
557f1cf9f7cSMauro Carvalho Chehab        #
558f1cf9f7cSMauro Carvalho Chehab        # If an END zeroing levels is not there, return remaining stuff
559f1cf9f7cSMauro Carvalho Chehab        # This is meant to solve cases where the caller logic might be
560f1cf9f7cSMauro Carvalho Chehab        # picking an incomplete block.
561f1cf9f7cSMauro Carvalho Chehab        #
5629aaeb817SMauro Carvalho Chehab        if start and stack:
5639aaeb817SMauro Carvalho Chehab            if started:
5649aaeb817SMauro Carvalho Chehab                s = str(tokenizer)
5659aaeb817SMauro Carvalho Chehab                log.warning(f"can't find a final end at {s}")
5669aaeb817SMauro Carvalho Chehab
5679aaeb817SMauro Carvalho Chehab            yield start, len(tokenizer.tokens)
568f1cf9f7cSMauro Carvalho Chehab
569f1cf9f7cSMauro Carvalho Chehab    def search(self, source):
570f1cf9f7cSMauro Carvalho Chehab        """
571f1cf9f7cSMauro Carvalho Chehab        This is similar to re.search:
572f1cf9f7cSMauro Carvalho Chehab
573f1cf9f7cSMauro Carvalho Chehab        It matches a regex that it is followed by a delimiter,
574f1cf9f7cSMauro Carvalho Chehab        returning occurrences only if all delimiters are paired.
575f1cf9f7cSMauro Carvalho Chehab        """
576f1cf9f7cSMauro Carvalho Chehab
577f1cf9f7cSMauro Carvalho Chehab        if isinstance(source, CTokenizer):
578f1cf9f7cSMauro Carvalho Chehab            tokenizer = source
579f1cf9f7cSMauro Carvalho Chehab            is_token = True
580f1cf9f7cSMauro Carvalho Chehab        else:
581f1cf9f7cSMauro Carvalho Chehab            tokenizer = CTokenizer(source)
582f1cf9f7cSMauro Carvalho Chehab            is_token = False
583f1cf9f7cSMauro Carvalho Chehab
5849aaeb817SMauro Carvalho Chehab        for start, end in self._search(tokenizer):
5859aaeb817SMauro Carvalho Chehab            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])
5869aaeb817SMauro Carvalho Chehab
587f1cf9f7cSMauro Carvalho Chehab            if is_token:
588f1cf9f7cSMauro Carvalho Chehab                yield new_tokenizer
589f1cf9f7cSMauro Carvalho Chehab            else:
590f1cf9f7cSMauro Carvalho Chehab                yield str(new_tokenizer)
5919aaeb817SMauro Carvalho Chehab
5929aaeb817SMauro Carvalho Chehab    def sub(self, sub_str, source, count=0):
5939aaeb817SMauro Carvalho Chehab        """
5949aaeb817SMauro Carvalho Chehab        This is similar to re.sub:
5959aaeb817SMauro Carvalho Chehab
5969aaeb817SMauro Carvalho Chehab        It matches a regex that it is followed by a delimiter,
5979aaeb817SMauro Carvalho Chehab        replacing occurrences only if all delimiters are paired.
5989aaeb817SMauro Carvalho Chehab
5999aaeb817SMauro Carvalho Chehab        if the sub argument contains::
6009aaeb817SMauro Carvalho Chehab
6019aaeb817SMauro Carvalho Chehab            r'\0'
6029aaeb817SMauro Carvalho Chehab
6039aaeb817SMauro Carvalho Chehab        it will work just like re: it places there the matched paired data
6049aaeb817SMauro Carvalho Chehab        with the delimiter stripped.
6059aaeb817SMauro Carvalho Chehab
6069aaeb817SMauro Carvalho Chehab        If count is different than zero, it will replace at most count
6079aaeb817SMauro Carvalho Chehab        items.
6089aaeb817SMauro Carvalho Chehab        """
6099aaeb817SMauro Carvalho Chehab        if isinstance(source, CTokenizer):
6109aaeb817SMauro Carvalho Chehab            is_token = True
6119aaeb817SMauro Carvalho Chehab            tokenizer = source
6129aaeb817SMauro Carvalho Chehab        else:
6139aaeb817SMauro Carvalho Chehab            is_token = False
6149aaeb817SMauro Carvalho Chehab            tokenizer = CTokenizer(source)
6159aaeb817SMauro Carvalho Chehab
6169aaeb817SMauro Carvalho Chehab        # Detect if sub_str contains sub arguments
6179aaeb817SMauro Carvalho Chehab
6189aaeb817SMauro Carvalho Chehab        args_match = CTokenArgs(sub_str)
6199aaeb817SMauro Carvalho Chehab
6209aaeb817SMauro Carvalho Chehab        new_tokenizer = CTokenizer()
6219aaeb817SMauro Carvalho Chehab        pos = 0
6229aaeb817SMauro Carvalho Chehab        n = 0
6239aaeb817SMauro Carvalho Chehab
6249aaeb817SMauro Carvalho Chehab        #
6259aaeb817SMauro Carvalho Chehab        # NOTE: the code below doesn't consider overlays at sub.
6269aaeb817SMauro Carvalho Chehab        # We may need to add some extra unit tests to check if those
6279aaeb817SMauro Carvalho Chehab        # would cause problems. When replacing by "", this should not
6289aaeb817SMauro Carvalho Chehab        # be a problem, but other transformations could be problematic
6299aaeb817SMauro Carvalho Chehab        #
6309aaeb817SMauro Carvalho Chehab        for start, end in self._search(tokenizer):
6319aaeb817SMauro Carvalho Chehab            new_tokenizer.tokens += tokenizer.tokens[pos:start]
6329aaeb817SMauro Carvalho Chehab
6339aaeb817SMauro Carvalho Chehab            new = CTokenizer(tokenizer.tokens[start:end + 1])
6349aaeb817SMauro Carvalho Chehab
6359aaeb817SMauro Carvalho Chehab            new_tokenizer.tokens += args_match.tokens(new)
6369aaeb817SMauro Carvalho Chehab
6379aaeb817SMauro Carvalho Chehab            pos = end + 1
6389aaeb817SMauro Carvalho Chehab
6399aaeb817SMauro Carvalho Chehab            n += 1
6409aaeb817SMauro Carvalho Chehab            if count and n >= count:
6419aaeb817SMauro Carvalho Chehab                break
6429aaeb817SMauro Carvalho Chehab
6439aaeb817SMauro Carvalho Chehab        new_tokenizer.tokens += tokenizer.tokens[pos:]
6449aaeb817SMauro Carvalho Chehab
6459aaeb817SMauro Carvalho Chehab        if not is_token:
6469aaeb817SMauro Carvalho Chehab            return str(new_tokenizer)
6479aaeb817SMauro Carvalho Chehab
6489aaeb817SMauro Carvalho Chehab        return new_tokenizer
6499aaeb817SMauro Carvalho Chehab
6509aaeb817SMauro Carvalho Chehab    def __repr__(self):
6519aaeb817SMauro Carvalho Chehab        """
6529aaeb817SMauro Carvalho Chehab        Returns a displayable version of the class init.
6539aaeb817SMauro Carvalho Chehab        """
6549aaeb817SMauro Carvalho Chehab
6559aaeb817SMauro Carvalho Chehab        return f'CMatch("{self.regex.regex.pattern}")'
656