xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 8c0b7c0d3c0e640b3ebb7f1f648ea322e56c227a)
1df50e848SMauro Carvalho Chehab#!/usr/bin/env python3
2df50e848SMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3df50e848SMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4df50e848SMauro Carvalho Chehab
5df50e848SMauro Carvalho Chehab"""
6df50e848SMauro Carvalho ChehabRegular expression ancillary classes.
7df50e848SMauro Carvalho Chehab
8df50e848SMauro Carvalho ChehabThose help caching regular expressions and do matching for kernel-doc.
9df50e848SMauro Carvalho Chehab
Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, i.e. to flag problems at the replace pattern.
12df50e848SMauro Carvalho Chehab
13df50e848SMauro Carvalho ChehabOther errors are logged via log instance.
14df50e848SMauro Carvalho Chehab"""
15df50e848SMauro Carvalho Chehab
16df50e848SMauro Carvalho Chehabimport logging
17df50e848SMauro Carvalho Chehabimport re
18df50e848SMauro Carvalho Chehab
199aaeb817SMauro Carvalho Chehabfrom copy import copy
209aaeb817SMauro Carvalho Chehab
21df50e848SMauro Carvalho Chehabfrom .kdoc_re import KernRe
22df50e848SMauro Carvalho Chehab
23df50e848SMauro Carvalho Chehablog = logging.getLogger(__name__)
24df50e848SMauro Carvalho Chehab
def tokenizer_set_log(logger, prefix=""):
    """
    Install *logger* as this module's logger, wrapped in a
    LoggerAdapter that prepends *prefix* to every message.
    """
    global log

    class _PrefixAdapter(logging.LoggerAdapter):
        """Ancillary adapter that prepends a fixed prefix to each message."""

        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # From now on, all module-level logging goes through the adapter
    log = _PrefixAdapter(logger, {"prefix": prefix})
41df50e848SMauro Carvalho Chehab
class CToken():
    """
    Data class to define a C token.

    Each instance carries the token kind (one of the enum-like class
    constants below), its literal value, the position where it was
    found and the (bracket, paren, brace) nesting level.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5      #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Note: the outermost iterable of the comprehension is evaluated in
    # the class body scope, so vars() only sees the constants above.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value (MISMATCH if unknown)"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """Store kind, value and position; pack the nesting levels."""
        self.kind = kind
        self.value = value
        self.pos = pos
        # Note the level tuple order is (bracket, paren, brace), which is
        # the reverse of the constructor argument order.
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return a debug string showing kind name, value, pos and level."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
107df50e848SMauro Carvalho Chehab
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    # Order matters: the patterns are joined with "|" by fill_re_scanner()
    # and Python alternation picks the first match, so more specific
    # patterns come first and MISMATCH must stay as the last entry.
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # Optional whitespace before ";" is folded into the token
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    # Plain "#" that is not part of a recognized preprocessor directive
    (CToken.HASH,    r"#"),

    # Multi-character operators must be listed before single-char ones
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    # Not real C: used only when tokenizing substitution patterns
    (CToken.BACKREF, r"\\\d+"),

    # Catch-all for any single character not matched above
    (CToken.MISMATCH,r"."),
]
149df50e848SMauro Carvalho Chehab
def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    joined = "|".join(f"(?P<{CToken.to_name(kind)}>{pattern})"
                      for kind, pattern in token_list)

    return KernRe(joined, re.MULTILINE | re.DOTALL)
159df50e848SMauro Carvalho Chehab
#: Handle C continuation lines (a backslash followed by a newline).
RE_CONT = KernRe(r"\\\n")

#: Matches the opening of a C comment, including any trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex. Built once here, at module load time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
167df50e848SMauro Carvalho Chehab
168df50e848SMauro Carvalho Chehab
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles
    ``public:`` / ``private:`` comment markers, respecting depth.
    """

    # This class is inspired by and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize *source* using the module-level RE_SCANNER regex.

        :param source: a C code string, an already-built list of CToken
            objects (adopted as-is), or None/empty for an empty tokenizer.
        :param log: unused; kept for backward compatibility with existing
            callers (module-level logging is used instead).

        While I generally don't like using regex group naming via
        ``(?P<name>...)``, in this particular case it makes sense, as we
        can pick the name when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside
        # kernel-doc to handle macro transforms, cache the results on a
        # list, as re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at the module-level ``RE_SCANNER_LIST``.

        Yields CToken objects, tracking the brace/paren/bracket
        nesting level of each token.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we
        # might end re-using this tokenizer outside kernel-doc some day -
        # or we may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Levels never go negative: unmatched end delimiters are
                # silently ignored (they would be C compile errors anyway)
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Render the cached tokens back as C code, dropping comments and
        omitting blocks marked with a ``private:`` comment (until a
        ``public:`` comment re-enables output at that depth).
        """
        out = ""

        # One visibility flag per nesting depth; top level is visible
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # Inner blocks inherit the current visibility
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                # Emit the closing delimiter of a hidden block when the
                # outer scope is visible, so the output stays balanced
                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                # Comments themselves are never emitted
                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";": drop spaces immediately
                # preceding it and collapse repeated ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
303f1cf9f7cSMauro Carvalho Chehab
304f1cf9f7cSMauro Carvalho Chehab
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a ``+`` at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with
    ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        """
        Parse *sub_str*, collecting the ``\\<number>`` backrefs it uses.

        :param sub_str: replace pattern possibly containing backrefs.
        :raises ValueError: if more than one greedy (``+``) backref is
            used, or if the greedy backref is not the highest one.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # Compare against None, not truthiness: group 0 is a
                # valid backref number and would otherwise be ignored
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # Strip the "+" marker, leaving a plain backref behind
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        # NOTE: attribute name has a typo ("tokeninzer"); kept as-is for
        # backward compatibility with existing users.
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character to the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there are fewer arguments than the highest backref, an error
        is logged. Please notice that, on C, square brackets don't have
        any separator on them; trying to use ``\1``..``\n`` for
        brackets also logs an error.

        Returns a (level, groups_list) tuple, where level is the
        (bracket, paren, brace) level where the argument list started
        and groups_list[n] holds the tokens for backref ``\n``.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        # i starts at -1 so the argument scan below degrades gracefully
        # (empty range) when tokens is empty.
        #
        i = -1
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    # Square brackets have no argument separator in C
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy is not None and pos > self.max_group:
                    # Greedy: keep accumulating into the last group,
                    # delimiter included
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Return a new token list built from the substitution pattern,
        with each backref replaced by the matched argument tokens from
        *new_tokenizer*, adjusting their nesting levels by the level
        where the match happened.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    # Shift the copied token's level by the base level
                    new_tok.level = tuple(new_tok.level[i] + level[i]
                                          for i in range(len(level)))

                    new.tokens.append(new_tok)
            else:
                new.tokens.append(tok)

        return new.tokens
4709aaeb817SMauro Carvalho Chehab
4719aaeb817SMauro Carvalho Chehab
472f1cf9f7cSMauro Carvalho Chehabclass CMatch:
473f1cf9f7cSMauro Carvalho Chehab    """
474f1cf9f7cSMauro Carvalho Chehab    Finding nested delimiters is hard with regular expressions. It is
475f1cf9f7cSMauro Carvalho Chehab    even harder on Python with its normal re module, as there are several
476f1cf9f7cSMauro Carvalho Chehab    advanced regular expressions that are missing.
477f1cf9f7cSMauro Carvalho Chehab
478f1cf9f7cSMauro Carvalho Chehab    This is the case of this pattern::
479f1cf9f7cSMauro Carvalho Chehab
480f1cf9f7cSMauro Carvalho Chehab            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
481f1cf9f7cSMauro Carvalho Chehab
482f1cf9f7cSMauro Carvalho Chehab    which is used to properly match open/close parentheses of the
483f1cf9f7cSMauro Carvalho Chehab    string search STRUCT_GROUP(),
484f1cf9f7cSMauro Carvalho Chehab
485f1cf9f7cSMauro Carvalho Chehab    Add a class that counts pairs of delimiters, using it to match and
486f1cf9f7cSMauro Carvalho Chehab    replace nested expressions.
487f1cf9f7cSMauro Carvalho Chehab
488f1cf9f7cSMauro Carvalho Chehab    The original approach was suggested by:
489f1cf9f7cSMauro Carvalho Chehab
490f1cf9f7cSMauro Carvalho Chehab        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
491f1cf9f7cSMauro Carvalho Chehab
492f1cf9f7cSMauro Carvalho Chehab    Although I re-implemented it to make it more generic and match 3 types
493f1cf9f7cSMauro Carvalho Chehab    of delimiters. The logic checks if delimiters are paired. If not, it
494f1cf9f7cSMauro Carvalho Chehab    will ignore the search string.
495f1cf9f7cSMauro Carvalho Chehab    """
496f1cf9f7cSMauro Carvalho Chehab
497f1cf9f7cSMauro Carvalho Chehab
4989aaeb817SMauro Carvalho Chehab    def __init__(self, regex, delim="("):
4999aaeb817SMauro Carvalho Chehab        self.regex = KernRe("^" + regex + r"\b")
5009aaeb817SMauro Carvalho Chehab        self.start_delim = delim
501f1cf9f7cSMauro Carvalho Chehab
502f1cf9f7cSMauro Carvalho Chehab    def _search(self, tokenizer):
503f1cf9f7cSMauro Carvalho Chehab        """
504f1cf9f7cSMauro Carvalho Chehab        Finds paired blocks for a regex that ends with a delimiter.
505f1cf9f7cSMauro Carvalho Chehab
506f1cf9f7cSMauro Carvalho Chehab        The suggestion of using finditer to match pairs came from:
507f1cf9f7cSMauro Carvalho Chehab        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
508f1cf9f7cSMauro Carvalho Chehab        but I ended using a different implementation to align all three types
509f1cf9f7cSMauro Carvalho Chehab        of delimiters and seek for an initial regular expression.
510f1cf9f7cSMauro Carvalho Chehab
511f1cf9f7cSMauro Carvalho Chehab        The algorithm seeks for open/close paired delimiters and places them
512f1cf9f7cSMauro Carvalho Chehab        into a stack, yielding a start/stop position of each match when the
513f1cf9f7cSMauro Carvalho Chehab        stack is zeroed.
514f1cf9f7cSMauro Carvalho Chehab
515f1cf9f7cSMauro Carvalho Chehab        The algorithm should work fine for properly paired lines, but will
516f1cf9f7cSMauro Carvalho Chehab        silently ignore end delimiters that precede a start delimiter.
517f1cf9f7cSMauro Carvalho Chehab        This should be OK for kernel-doc parser, as unaligned delimiters
518f1cf9f7cSMauro Carvalho Chehab        would cause compilation errors. So, we don't need to raise exceptions
519f1cf9f7cSMauro Carvalho Chehab        to cover such issues.
520f1cf9f7cSMauro Carvalho Chehab        """
521f1cf9f7cSMauro Carvalho Chehab
522f1cf9f7cSMauro Carvalho Chehab        start = None
523f1cf9f7cSMauro Carvalho Chehab        started = False
524f1cf9f7cSMauro Carvalho Chehab
525f1cf9f7cSMauro Carvalho Chehab        import sys
526f1cf9f7cSMauro Carvalho Chehab
527f1cf9f7cSMauro Carvalho Chehab        stack = []
528f1cf9f7cSMauro Carvalho Chehab
529f1cf9f7cSMauro Carvalho Chehab        for i, tok in enumerate(tokenizer.tokens):
530f1cf9f7cSMauro Carvalho Chehab            if start is None:
531f1cf9f7cSMauro Carvalho Chehab                if tok.kind == CToken.NAME and self.regex.match(tok.value):
532f1cf9f7cSMauro Carvalho Chehab                    start = i
533f1cf9f7cSMauro Carvalho Chehab                    stack.append((start, tok.level))
534f1cf9f7cSMauro Carvalho Chehab                    started = False
535f1cf9f7cSMauro Carvalho Chehab
536f1cf9f7cSMauro Carvalho Chehab                continue
537f1cf9f7cSMauro Carvalho Chehab
5389aaeb817SMauro Carvalho Chehab            if not started:
5399aaeb817SMauro Carvalho Chehab                if tok.kind == CToken.SPACE:
5409aaeb817SMauro Carvalho Chehab                    continue
5419aaeb817SMauro Carvalho Chehab
5429aaeb817SMauro Carvalho Chehab                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
543f1cf9f7cSMauro Carvalho Chehab                    started = True
544f1cf9f7cSMauro Carvalho Chehab                    continue
545f1cf9f7cSMauro Carvalho Chehab
5469aaeb817SMauro Carvalho Chehab                # Name only token without BEGIN/END
5479aaeb817SMauro Carvalho Chehab                if i > start:
5489aaeb817SMauro Carvalho Chehab                    i -= 1
5499aaeb817SMauro Carvalho Chehab                yield start, i
5509aaeb817SMauro Carvalho Chehab                start = None
5519aaeb817SMauro Carvalho Chehab
552f1cf9f7cSMauro Carvalho Chehab            if tok.kind == CToken.END and tok.level == stack[-1][1]:
553f1cf9f7cSMauro Carvalho Chehab                start, level = stack.pop()
554f1cf9f7cSMauro Carvalho Chehab
5559aaeb817SMauro Carvalho Chehab                yield start, i
556f1cf9f7cSMauro Carvalho Chehab                start = None
557f1cf9f7cSMauro Carvalho Chehab
558f1cf9f7cSMauro Carvalho Chehab        #
559f1cf9f7cSMauro Carvalho Chehab        # If an END zeroing levels is not there, return remaining stuff
560f1cf9f7cSMauro Carvalho Chehab        # This is meant to solve cases where the caller logic might be
561f1cf9f7cSMauro Carvalho Chehab        # picking an incomplete block.
562f1cf9f7cSMauro Carvalho Chehab        #
5639aaeb817SMauro Carvalho Chehab        if start and stack:
5649aaeb817SMauro Carvalho Chehab            if started:
5659aaeb817SMauro Carvalho Chehab                s = str(tokenizer)
5669aaeb817SMauro Carvalho Chehab                log.warning(f"can't find a final end at {s}")
5679aaeb817SMauro Carvalho Chehab
5689aaeb817SMauro Carvalho Chehab            yield start, len(tokenizer.tokens)
569f1cf9f7cSMauro Carvalho Chehab
570f1cf9f7cSMauro Carvalho Chehab    def search(self, source):
571f1cf9f7cSMauro Carvalho Chehab        """
572f1cf9f7cSMauro Carvalho Chehab        This is similar to re.search:
573f1cf9f7cSMauro Carvalho Chehab
574f1cf9f7cSMauro Carvalho Chehab        It matches a regex that it is followed by a delimiter,
575f1cf9f7cSMauro Carvalho Chehab        returning occurrences only if all delimiters are paired.
576f1cf9f7cSMauro Carvalho Chehab        """
577f1cf9f7cSMauro Carvalho Chehab
578f1cf9f7cSMauro Carvalho Chehab        if isinstance(source, CTokenizer):
579f1cf9f7cSMauro Carvalho Chehab            tokenizer = source
580f1cf9f7cSMauro Carvalho Chehab            is_token = True
581f1cf9f7cSMauro Carvalho Chehab        else:
582f1cf9f7cSMauro Carvalho Chehab            tokenizer = CTokenizer(source)
583f1cf9f7cSMauro Carvalho Chehab            is_token = False
584f1cf9f7cSMauro Carvalho Chehab
5859aaeb817SMauro Carvalho Chehab        for start, end in self._search(tokenizer):
5869aaeb817SMauro Carvalho Chehab            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])
5879aaeb817SMauro Carvalho Chehab
588f1cf9f7cSMauro Carvalho Chehab            if is_token:
589f1cf9f7cSMauro Carvalho Chehab                yield new_tokenizer
590f1cf9f7cSMauro Carvalho Chehab            else:
591f1cf9f7cSMauro Carvalho Chehab                yield str(new_tokenizer)
5929aaeb817SMauro Carvalho Chehab
5939aaeb817SMauro Carvalho Chehab    def sub(self, sub_str, source, count=0):
5949aaeb817SMauro Carvalho Chehab        """
5959aaeb817SMauro Carvalho Chehab        This is similar to re.sub:
5969aaeb817SMauro Carvalho Chehab
5979aaeb817SMauro Carvalho Chehab        It matches a regex that it is followed by a delimiter,
5989aaeb817SMauro Carvalho Chehab        replacing occurrences only if all delimiters are paired.
5999aaeb817SMauro Carvalho Chehab
6009aaeb817SMauro Carvalho Chehab        if the sub argument contains::
6019aaeb817SMauro Carvalho Chehab
6029aaeb817SMauro Carvalho Chehab            r'\0'
6039aaeb817SMauro Carvalho Chehab
6049aaeb817SMauro Carvalho Chehab        it will work just like re: it places there the matched paired data
6059aaeb817SMauro Carvalho Chehab        with the delimiter stripped.
6069aaeb817SMauro Carvalho Chehab
6079aaeb817SMauro Carvalho Chehab        If count is different than zero, it will replace at most count
6089aaeb817SMauro Carvalho Chehab        items.
6099aaeb817SMauro Carvalho Chehab        """
6109aaeb817SMauro Carvalho Chehab        if isinstance(source, CTokenizer):
6119aaeb817SMauro Carvalho Chehab            is_token = True
6129aaeb817SMauro Carvalho Chehab            tokenizer = source
6139aaeb817SMauro Carvalho Chehab        else:
6149aaeb817SMauro Carvalho Chehab            is_token = False
6159aaeb817SMauro Carvalho Chehab            tokenizer = CTokenizer(source)
6169aaeb817SMauro Carvalho Chehab
6179aaeb817SMauro Carvalho Chehab        # Detect if sub_str contains sub arguments
6189aaeb817SMauro Carvalho Chehab
6199aaeb817SMauro Carvalho Chehab        args_match = CTokenArgs(sub_str)
6209aaeb817SMauro Carvalho Chehab
6219aaeb817SMauro Carvalho Chehab        new_tokenizer = CTokenizer()
6229aaeb817SMauro Carvalho Chehab        pos = 0
6239aaeb817SMauro Carvalho Chehab        n = 0
6249aaeb817SMauro Carvalho Chehab
6259aaeb817SMauro Carvalho Chehab        #
6269aaeb817SMauro Carvalho Chehab        # NOTE: the code below doesn't consider overlays at sub.
6279aaeb817SMauro Carvalho Chehab        # We may need to add some extra unit tests to check if those
6289aaeb817SMauro Carvalho Chehab        # would cause problems. When replacing by "", this should not
6299aaeb817SMauro Carvalho Chehab        # be a problem, but other transformations could be problematic
6309aaeb817SMauro Carvalho Chehab        #
6319aaeb817SMauro Carvalho Chehab        for start, end in self._search(tokenizer):
6329aaeb817SMauro Carvalho Chehab            new_tokenizer.tokens += tokenizer.tokens[pos:start]
6339aaeb817SMauro Carvalho Chehab
6349aaeb817SMauro Carvalho Chehab            new = CTokenizer(tokenizer.tokens[start:end + 1])
6359aaeb817SMauro Carvalho Chehab
6369aaeb817SMauro Carvalho Chehab            new_tokenizer.tokens += args_match.tokens(new)
6379aaeb817SMauro Carvalho Chehab
6389aaeb817SMauro Carvalho Chehab            pos = end + 1
6399aaeb817SMauro Carvalho Chehab
6409aaeb817SMauro Carvalho Chehab            n += 1
6419aaeb817SMauro Carvalho Chehab            if count and n >= count:
6429aaeb817SMauro Carvalho Chehab                break
6439aaeb817SMauro Carvalho Chehab
6449aaeb817SMauro Carvalho Chehab        new_tokenizer.tokens += tokenizer.tokens[pos:]
6459aaeb817SMauro Carvalho Chehab
6469aaeb817SMauro Carvalho Chehab        if not is_token:
6479aaeb817SMauro Carvalho Chehab            return str(new_tokenizer)
6489aaeb817SMauro Carvalho Chehab
6499aaeb817SMauro Carvalho Chehab        return new_tokenizer
6509aaeb817SMauro Carvalho Chehab
6519aaeb817SMauro Carvalho Chehab    def __repr__(self):
6529aaeb817SMauro Carvalho Chehab        """
6539aaeb817SMauro Carvalho Chehab        Returns a displayable version of the class init.
6549aaeb817SMauro Carvalho Chehab        """
6559aaeb817SMauro Carvalho Chehab
6569aaeb817SMauro Carvalho Chehab        return f'CMatch("{self.regex.regex.pattern}")'
657