xref: /linux/tools/lib/python/kdoc/c_lex.py (revision f1cf9f7cd66f1f90c4c3beb0885b6f7771e1b419)
1df50e848SMauro Carvalho Chehab#!/usr/bin/env python3
2df50e848SMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3df50e848SMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4df50e848SMauro Carvalho Chehab
"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, i.e. problems with the replace pattern.

Other errors are logged via the log instance.
"""
15df50e848SMauro Carvalho Chehab
import logging
import re
import sys

from .kdoc_re import KernRe
20df50e848SMauro Carvalho Chehab
21df50e848SMauro Carvalho Chehablog = logging.getLogger(__name__)
22df50e848SMauro Carvalho Chehab
23df50e848SMauro Carvalho Chehab
class CToken():
    """
    Data class to define a C token.

    Each token carries its kind (one of the enum-like integers below),
    its text value, its position on the scanned string and the
    bracket/paren/brace nesting levels where it was found.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: An ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # vars() here is the class namespace built so far, so only the int
    # constants above pass the isinstance() filter.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Unknown names map to CToken.MISMATCH.
        """
        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """Store kind, value, position and nesting levels of a token."""
        self.kind = kind
        self.value = value
        self.pos = pos
        # Levels are packed as a tuple so tokens at the same nesting
        # depth compare equal with a single "==".
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Return an eval-like representation, quoting string values."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
89df50e848SMauro Carvalho Chehab
#: Regexes to parse C code, transforming it into tokens.
#: Order matters: earlier alternatives win when patterns overlap.
RE_SCANNER_LIST = [
    # Comments first. Note that [\s\S] also matches "\n", unlike ".".
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    # Quoted literals, with backslash escapes allowed inside.
    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    # Hex, octal, then decimal/float numbers, with C-style suffixes.
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # Statement terminator, optionally preceded by blanks.
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    # Open/close delimiter characters.
    (CToken.BEGIN,   r"[\[\(\{]"),
    (CToken.END,     r"[\]\)\}]"),

    # Preprocessor directives, then a bare hash for other macros.
    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),
    (CToken.HASH,    r"#"),

    # Multi-character operators must come before single-character ones.
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    # Keywords the kernel-doc parser cares about.
    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    # Regex back-references: not valid C, but present in sub patterns.
    (CToken.BACKREF, r"\\\d+"),

    # Catch-all: anything left over is a scanning error.
    (CToken.MISMATCH, r"."),
]
131df50e848SMauro Carvalho Chehab
def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    Each ``(kind, pattern)`` pair becomes a named group
    ``(?P<NAME>pattern)``, where NAME is the CToken enum name, so that
    ``match.lastgroup`` identifies the token kind after a finditer() scan.
    """
    # Join all alternatives in list order: first match wins on overlaps.
    re_tokens = "|".join(f"(?P<{CToken.to_name(kind)}>{pattern})"
                         for kind, pattern in token_list)

    return KernRe(re_tokens, re.MULTILINE | re.DOTALL)
141df50e848SMauro Carvalho Chehab
#: Handle C continuation lines (backslash-newline pairs are removed).
RE_CONT = KernRe(r"\\\n")

#: Matches the start of a C comment, including trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built once at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
149df50e848SMauro Carvalho Chehab
150df50e848SMauro Carvalho Chehab
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles
    ``private:`` / ``public:`` markers, respecting nesting depth.
    """

    # This class is inspired by and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Tokenize ``source`` into a cached list of CToken objects.

        ``source`` may be:
          - None or empty: create an empty tokenizer;
          - a list of CToken: adopt it as-is, without re-parsing;
          - a string: parse it via RE_SCANNER.

        ``log`` is accepted for interface compatibility but is currently
        unused; errors are reported via the module-level logger.

        While I generally don't like using regex group naming via
        ``(?P<name>...)``, in this particular case it makes sense, as we
        can pick the name when matching code via RE_SCANNER.
        """

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at ``RE_SCANNER_LIST``.

        Yields CToken objects, each annotated with the bracket, paren and
        brace nesting levels in effect after the token was seen.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we
        # might end up re-using this tokenizer outside kernel-doc some day -
        # or we may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                # Lazy %-style args: the string is only built if emitted
                log.error("Unexpected token '%s' on pos %s:\n\t'%s'",
                          value, pos, source)
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # The "> 0" guards keep stray close delimiters from
                # driving levels negative
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Re-emit the token stream as text, dropping comments, hiding
        regions marked ``private:`` and collapsing blanks before ``;``.
        """
        out = ""

        # One visibility flag per nesting depth; BEGIN pushes a copy of
        # the current flag, END pops it. "private:"/"public:" comments
        # toggle the flag of the current depth.
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
285*f1cf9f7cSMauro Carvalho Chehab
286*f1cf9f7cSMauro Carvalho Chehab
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: add a sub method

    def __init__(self, regex):
        """Compile the start-anchor ``regex`` used to find block openers."""
        self.regex = KernRe(regex)

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None        # token index of the current NAME match, or None
        offset = -1         # token index of the last matched END, -1 if none
        started = False     # True after the first BEGIN following the NAME

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                # Seek the next NAME token matching the search regex
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started and tok.kind == CToken.BEGIN:
                started = True
                continue

            # A matching END is one at the same nesting level the NAME
            # was found at: that closes the whole block
            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, _level = stack.pop()
                offset = i

                yield CTokenizer(tokenizer.tokens[start:offset + 1])
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff.
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # NOTE: start may legitimately be token index 0, so compare
        # against None instead of relying on truthiness.
        #
        if start is not None and offset < 0:
            print("WARNING: can't find an end", file=sys.stderr)
            yield CTokenizer(tokenizer.tokens[start:])

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        ``source`` may be a CTokenizer (yields CTokenizer matches) or a
        string (yields the matches converted back to strings).
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for new_tokenizer in self._search(tokenizer):
            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)
394