xref: /linux/scripts/clang-tools/gen_compile_commands.py (revision 2ddcf4962c1834a14340a1f50afafc3276c015bd)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3#
4# Copyright (C) Google LLC, 2018
5#
6# Author: Tom Roeder <tmroeder@google.com>
7#
8"""A tool for generating compile_commands.json in the Linux kernel."""
9
10import argparse
11import json
12import logging
13import os
14import re
15import subprocess
16import sys
17
# Default path of the generated compilation database (see --output).
_DEFAULT_OUTPUT = 'compile_commands.json'
# Default logging threshold (see --log_level).
_DEFAULT_LOG_LEVEL = 'WARNING'

# File-name regex for .cmd files: a leading dot, anything, then ".cmd".
_FILENAME_PATTERN = r'^\..*\.cmd$'
# Regex applied to the first line of a .cmd file. It matches
# "cmd_<object>.o := <command> <source>" (optionally prefixed with "saved")
# and captures everything before the source file as 'command_prefix' and the
# trailing .c/.S path as 'file_path'.
_LINE_PATTERN = r'^(saved)?cmd_[^ ]*\.o := (?P<command_prefix>.* )(?P<file_path>[^ ]*\.[cS]) *(;|$)'
# Values accepted by --log_level.
_VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

# Pre-compiled regexes for better performance
# Any #include directive; group 1 is the name between <...> or "...".
_INCLUDE_PATTERN = re.compile(r'^\s*#\s*include\s*[<"]([^>"]*)[>"]')
# A quoted #include of a .c file with nothing but whitespace after it;
# group 1 is the quoted path.
_C_INCLUDE_PATTERN = re.compile(r'^\s*#\s*include\s*"([^"]*\.c)"\s*$')
# Compiled form of _FILENAME_PATTERN.
_FILENAME_MATCHER = re.compile(_FILENAME_PATTERN)

# Names of directories, directly under a searched directory, whose contents
# are not scanned for .cmd files.
# The tools/ directory adopts a different build system, and produces .cmd
# files in a different format. Do not support it.
_EXCLUDE_DIRS = ['.git', 'Documentation', 'include', 'tools']
33
def parse_arguments():
    """Build the command-line parser and parse the process arguments.

    Returns:
        A 5-tuple, in this order:
        log_level: A logging level name used to filter log output.
        directory: The work directory where the objects were built, passed
            through os.path.realpath().
        output: Where to write the compile-commands JSON file.
        ar: Command used for parsing .a archives.
        paths: The list of files/directories to handle to find .cmd files.
            When none are given on the command line this is a one-element
            list holding the directory exactly as it was passed to -d.
    """
    parser = argparse.ArgumentParser(
        description='Creates a compile_commands.json database from kernel .cmd files')

    parser.add_argument(
        '-d', '--directory', type=str, default='.',
        help=('specify the output directory used for the kernel build '
              '(defaults to the working directory)'))

    parser.add_argument(
        '-o', '--output', type=str, default=_DEFAULT_OUTPUT,
        help='path to the output command database (defaults to {})'.format(
            _DEFAULT_OUTPUT))

    parser.add_argument(
        '--log_level', choices=_VALID_LOG_LEVELS, default=_DEFAULT_LOG_LEVEL,
        help='the level of log messages to produce (defaults to {})'.format(
            _DEFAULT_LOG_LEVEL))

    parser.add_argument(
        '-a', '--ar', type=str, default='llvm-ar',
        help='command used for parsing .a archives')

    parser.add_argument(
        'paths', type=str, nargs='*',
        help=('directories to search or files to parse '
              '(files should be *.o, *.a, or modules.order). '
              'If nothing is specified, the current directory is searched'))

    options = parser.parse_args()

    # An empty positional list means "search the build directory itself".
    search_paths = options.paths or [options.directory]

    return (options.log_level,
            os.path.realpath(options.directory),
            options.output,
            options.ar,
            search_paths)
77
78
def cmdfiles_in_dir(directory):
    """Lazily list the .cmd files located below a directory.

    The tree is walked top-down. The directories named in _EXCLUDE_DIRS
    that sit directly under `directory` are skipped together with
    everything beneath them.

    Args:
        directory: The directory to search for .cmd files.

    Yields:
        The path to a .cmd file.
    """
    skipped = {os.path.join(directory, name) for name in _EXCLUDE_DIRS}

    for current, subdirs, files in os.walk(directory, topdown=True):
        if current in skipped:
            # Emptying the list in place keeps os.walk from descending.
            del subdirs[:]
            continue

        yield from (os.path.join(current, name)
                    for name in files
                    if _FILENAME_MATCHER.match(name))
102
103
def to_cmdfile(path):
    """Return the path of the .cmd file used for the given build artifact.

    The .cmd file lives next to the artifact and is named after it with a
    leading dot and a trailing ".cmd" (foo.o -> .foo.o.cmd).

    Args:
        path: Path of the build artifact.

    Returns:
        The path to the corresponding .cmd file.
    """
    # Named `parent`, not `dir`, so the builtin dir() is not shadowed.
    parent, name = os.path.split(path)
    return os.path.join(parent, '.' + name + '.cmd')
115
116
def cmdfiles_for_a(archive, ar):
    """Lazily list the .cmd files of the members of an archive.

    Runs `<ar> -t <archive>` and maps every whitespace-separated token of
    its output to the matching .cmd path.

    Args:
        archive: The archive to parse
        ar: Command used to list the archive members

    Yields:
        The path to every .cmd file found
    """
    listing = subprocess.check_output([ar, '-t', archive]).decode()
    for member in listing.split():
        yield to_cmdfile(member)
130
131
def cmdfiles_for_modorder(modorder):
    """Lazily list the .cmd files of the objects behind modules.order.

    Every line of modules.order must name a module object ending in ".o".
    The sibling ".mod" file of that object is read, and each of its lines
    is mapped to the matching .cmd path.

    Args:
        modorder: The modules.order file to parse

    Yields:
        The path to every .cmd file found
    """
    with open(modorder) as order_file:
        for entry in order_file:
            module_obj = entry.rstrip()
            stem, suffix = os.path.splitext(module_obj)
            if suffix != '.o':
                sys.exit('{}: module path must end with .o'.format(module_obj))

            # <stem>.mod lists the objects that compose the module.
            with open(stem + '.mod') as mod_file:
                for member in mod_file:
                    yield to_cmdfile(member.rstrip())
155
156
def extract_includes_from_file(source_file, root_directory):
    """Extract the headers named by #include directives in a C file.

    Directives that name a .c file are skipped. A quoted include that exists
    next to the source file is returned as a path relative to
    root_directory; every other include is returned as written.

    Args:
        source_file: Path to the source .c file to analyze
        root_directory: Root directory for resolving relative paths

    Returns:
        List of header files (without quotes/brackets), in file order. The
        list is empty when the file cannot be opened, and holds whatever was
        collected so far if reading fails part-way through.
    """
    includes = []
    src_dir = os.path.dirname(source_file)

    try:
        # Kernel sources are not guaranteed to be valid UTF-8. Strict
        # decoding would raise UnicodeDecodeError (a ValueError, not an
        # OSError) and abort the whole entry, so undecodable bytes are
        # replaced instead; #include lines themselves are plain ASCII.
        with open(source_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                # Match both #include "header.h" and #include <header.h>.
                match = _INCLUDE_PATTERN.match(line)
                if not match:
                    continue

                header = match.group(1)
                # Skip including other .c files to avoid circular includes.
                if header.endswith('.c'):
                    continue

                # Look at the directive's own opening delimiter, not at any
                # quote on the line, so '#include <x.h> /* "y" */' is still
                # treated as an angle-bracket include.
                if line[match.start(1) - 1] == '"':
                    header_path = os.path.join(src_dir, header)
                    if os.path.exists(header_path):
                        header = os.path.relpath(header_path, root_directory)

                includes.append(header)
    except OSError:
        # Best effort: a missing or unreadable file contributes no more
        # headers.
        pass

    return includes
198
199
def find_included_c_files(source_file, root_directory):
    """Find .c files that are included by the given source file.

    Only quoted includes of a .c file with nothing else on the line are
    considered (#include "foo.c"). Relative names are resolved against the
    directory of source_file.

    Args:
        source_file: Path to the source .c file
        root_directory: Root directory of the build; accepted for symmetry
            with extract_includes_from_file() but not used here.

    Yields:
        Normalized paths to the included .c files that exist on disk. Nothing
        is yielded when source_file cannot be opened.
    """
    src_dir = os.path.dirname(source_file)

    try:
        # Kernel sources are not guaranteed to be valid UTF-8. Strict
        # decoding would raise UnicodeDecodeError (a ValueError, not an
        # OSError) in the middle of the caller's loop, so undecodable bytes
        # are replaced instead; #include lines themselves are plain ASCII.
        with open(source_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                match = _C_INCLUDE_PATTERN.match(line.strip())
                if not match:
                    continue

                # os.path.join() returns an absolute second component
                # unchanged, so absolute includes need no special case.
                candidate = os.path.normpath(
                    os.path.join(src_dir, match.group(1)))

                if os.path.exists(candidate):
                    yield candidate
    except OSError:
        # Best effort: a missing or unreadable file yields nothing more.
        return
234
235
def process_line(root_directory, command_prefix, file_path):
    """Extracts information from a .cmd line and creates entries from it.

    Args:
        root_directory: The directory that was searched for .cmd files. Usually
            used directly in the "directory" entry in compile_commands.json.
        command_prefix: The extracted command line, up to the last element.
        file_path: The .c file from the end of the extracted command.
            Usually relative to root_directory, but sometimes absolute.

    Returns:
        A list of entries to append to compile_commands. The first entry
        describes the source file itself; one more entry follows for every
        existing .c file that the source file includes with #include "x.c".

    Raises:
        ValueError: Could not find the extracted file based on file_path and
            root_directory or file_directory.
    """
    # The .cmd files are intended to be included directly by Make, so they
    # escape the pound sign '#' as '$(pound)'. The compile_commands.json file
    # is not interpreted by Make, so this code replaces the escaped version
    # with '#'.
    prefix = command_prefix.replace('$(pound)', '#')

    # Return the canonical path, eliminating any symbolic links encountered in the path.
    abs_path = os.path.realpath(os.path.join(root_directory, file_path))
    if not os.path.exists(abs_path):
        raise ValueError('File %s not found' % abs_path)

    # Entry for the main source file.
    entries = [{
        'directory': root_directory,
        'file': abs_path,
        'command': prefix + file_path,
    }]

    included_c_files = list(find_included_c_files(abs_path, root_directory))
    if not included_c_files:
        # Common case: nothing else to describe, so the parent is not parsed
        # a second time for its headers.
        return entries

    # The parent's headers are the same for every included .c file, so they
    # are collected once rather than once per included file. They are passed
    # with -include so the included file sees the parent's macros.
    parent_includes = extract_includes_from_file(abs_path, root_directory)
    extra_includes = ''.join(' -include ' + inc for inc in parent_includes)

    for included_c_file in included_c_files:
        # Each included .c file gets the parent's flags but is compiled
        # directly, addressed relative to root_directory.
        rel_path = os.path.relpath(included_c_file, root_directory)

        entries.append({
            'directory': root_directory,
            'file': included_c_file,
            'command': prefix + extra_includes + ' ' + rel_path,
        })
        logging.debug('Added entry for included file: %s', included_c_file)

    return entries
305
306
def main():
    """Collect compile commands from .cmd files and write the JSON database.

    Each requested path is expanded to a stream of .cmd files. The first
    line of every .cmd file is matched against _LINE_PATTERN, and matching
    lines are turned into entries. The entries are written to the output
    file sorted by their "file" value.
    """
    log_level, directory, output, ar, paths = parse_arguments()

    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=getattr(logging, log_level))

    line_matcher = re.compile(_LINE_PATTERN)
    compile_commands = []

    for path in paths:
        # A directory is searched recursively for .cmd files. An archive
        # (built-in objects are linked via vmlinux.a) and modules.order
        # (which lists the modules) are expanded to the .cmd files of their
        # objects. Anything else is rejected.
        if os.path.isdir(path):
            cmdfiles = cmdfiles_in_dir(path)
        elif path.endswith('.a'):
            cmdfiles = cmdfiles_for_a(path, ar)
        elif path.endswith('modules.order'):
            cmdfiles = cmdfiles_for_modorder(path)
        else:
            sys.exit('{}: unknown file type'.format(path))

        for cmdfile in cmdfiles:
            # Only the first line of a .cmd file is inspected.
            with open(cmdfile, 'rt') as f:
                first_line = f.readline()

            matched = line_matcher.match(first_line)
            if not matched:
                continue

            try:
                compile_commands.extend(
                    process_line(directory,
                                 matched.group('command_prefix'),
                                 matched.group('file_path')))
            except ValueError as err:
                logging.info('Could not add line from %s: %s',
                             cmdfile, err)

    compile_commands.sort(key=lambda x: x["file"])
    with open(output, 'wt') as f:
        json.dump(compile_commands, f, indent=2, sort_keys=True)
346
347
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
350