#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, i.e. problems with the replace pattern.

Other errors are logged via the log instance.
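
Example (illustrative sketch of typical usage; the C snippet is arbitrary)::

    code = "int a; /* some comment */ int b;"
    print(str(CTokenizer(code)))    # comments are dropped from the output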
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.
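
    Example (illustrative; the output format follows ``__repr__`` below)::

        tok = CToken(CToken.NAME, "foo", pos=0)
        repr(tok)   # 'CToken(CToken.NAME, "foo", 0, (0, 0, 0))'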
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including newlines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different from .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
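
#
# The resulting pattern is a single alternation of named groups, roughly
# (illustrative, abbreviated):
#
#     (?P<COMMENT>//[^\n]*|/\*[\s\S]*?\*/)|(?P<STRING>"(?:\\.|[^"\\])*")|...
#
# so match.lastgroup names the token kind of each finditer() match.
#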

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST at import time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles public/private
    values, respecting depth.
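
    Example (illustrative)::

        src = "struct s { int a; /* private: */ int b; };"
        tok = CTokenizer(src)
        str(tok)    # roughly "struct s { int a; };" (comment and private
                    # member dropped)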
    """

    # This class is inspired by and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Create a regular expression to handle RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator, we'll
        # need to use the tokenizer several times inside kernel-doc to
        # handle macro transforms. So, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields ``CToken`` objects.
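
        A yielded token looks like this (illustrative)::

            CToken(CToken.NAME, "foo", 12, (0, 1, 0))

        where the last tuple holds the (bracket, paren, brace) nesting levels.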
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it for completeness, as we might
        # end up re-using this tokenizer outside kernel-doc some day - or we
        # may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


class CTokenArgs:
    r"""
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a ``+`` at its end, the logic will be
    greedy, picking all remaining delimiters.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
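
    Example (illustrative; ``source`` stands for any C code containing the
    macro)::

        # r"\2+" picks the second ','-delimited argument of the macro and,
        # being greedy, everything that follows it:
        CMatch(r"struct_group").sub(r"\2+", source)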
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\n``

        It also accepts a ``+`` character appended to the highest backref.
        When used, it means in practice that delimiters after it are ignored,
        making the match greedy.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, an error is
        reported. Please notice that, in C, square brackets don't have any
        separator inside them. Trying to use ``\1``..``\n`` for brackets also
        results in an error.
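
        Example (illustrative)::

            args = CTokenArgs(r"\1 \2+")
            level, groups = args.groups(CTokenizer("FOO(a, b, c)"))
            # groups[1] holds the tokens of "a"; groups[2], being greedy,
            # holds roughly "b, c"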
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard the first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches.
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """Return the sub pattern tokens, with backrefs replaced by groups."""
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [new_tok]
            else:
                new.tokens += [tok]

        return new.tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regular expression features are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    STRUCT_GROUP() string search.

    This class counts pairs of delimiters, using that to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match three
    types of delimiters. The logic checks if delimiters are paired; if not,
    it will ignore the search string.
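
    Example (illustrative; ``STRUCT_GROUP`` is just a placeholder name)::

        cm = CMatch(r"STRUCT_GROUP")
        for found in cm.search("STRUCT_GROUP(int a; int b;); int c;"):
            print(found)    # "STRUCT_GROUP(int a; int b;)", parentheses paired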
    """


    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that is followed by a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation, to align all three
        types of delimiters and to seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters and places them
        into a stack, yielding the start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token, without BEGIN/END
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing the levels is not there, return the remaining
        # tokens. This is meant to solve cases where the caller logic might
        # be picking an incomplete block.
        #
        if start and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
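
        Example (illustrative; nested parentheses are kept paired)::

            cm = CMatch(r"BUILD_BUG_ON")
            src = "BUILD_BUG_ON(sizeof(x) > 4); int y;"
            list(cm.search(src))    # roughly ["BUILD_BUG_ON(sizeof(x) > 4)"]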
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: the matched paired data, with the
        delimiters stripped, is placed there.

        If ``count`` is not zero, it will replace at most ``count``
        items.
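
        Example (illustrative; ``struct_group`` is a macro-like placeholder)::

            cm = CMatch(r"struct_group")
            src = "struct_group(tag, int a; int b;); int c;"
            cm.sub(r"\2+", src)     # roughly "int a; int b; int c;"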
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer


    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'