#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) Google LLC, 2018
#
# Author: Tom Roeder <tmroeder@google.com>
#
"""A tool for generating compile_commands.json in the Linux kernel."""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

_DEFAULT_OUTPUT = 'compile_commands.json'
_DEFAULT_LOG_LEVEL = 'WARNING'

_FILENAME_PATTERN = r'^\..*\.cmd$'
_LINE_PATTERN = r'^(saved)?cmd_[^ ]*\.o := (?P<command_prefix>.* )(?P<file_path>[^ ]*\.[cS]) *(;|$)'
_VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

# Pre-compiled regexes for better performance
_INCLUDE_PATTERN = re.compile(r'^\s*#\s*include\s*[<"]([^>"]*)[>"]')
_C_INCLUDE_PATTERN = re.compile(r'^\s*#\s*include\s*"([^"]*\.c)"\s*$')
_FILENAME_MATCHER = re.compile(_FILENAME_PATTERN)

# The tools/ directory adopts a different build system, and produces .cmd
# files in a different format. Do not support it.
_EXCLUDE_DIRS = ['.git', 'Documentation', 'include', 'tools']


def parse_arguments():
    """Sets up and parses command-line arguments.

    Returns:
        log_level: A logging level to filter log output.
        directory: The work directory where the objects were built.
        ar: Command used for parsing .a archives.
        output: Where to write the compile-commands JSON file.
        paths: The list of files/directories to handle to find .cmd files.
    """
    usage = 'Creates a compile_commands.json database from kernel .cmd files'
    parser = argparse.ArgumentParser(description=usage)

    directory_help = ('specify the output directory used for the kernel build '
                      '(defaults to the working directory)')
    parser.add_argument('-d', '--directory', type=str, default='.',
                        help=directory_help)

    output_help = ('path to the output command database (defaults to ' +
                   _DEFAULT_OUTPUT + ')')
    parser.add_argument('-o', '--output', type=str, default=_DEFAULT_OUTPUT,
                        help=output_help)

    log_level_help = ('the level of log messages to produce (defaults to ' +
                      _DEFAULT_LOG_LEVEL + ')')
    parser.add_argument('--log_level', choices=_VALID_LOG_LEVELS,
                        default=_DEFAULT_LOG_LEVEL, help=log_level_help)

    ar_help = 'command used for parsing .a archives'
    parser.add_argument('-a', '--ar', type=str, default='llvm-ar', help=ar_help)

    paths_help = ('directories to search or files to parse '
                  '(files should be *.o, *.a, or modules.order). '
                  'If nothing is specified, the current directory is searched')
    parser.add_argument('paths', type=str, nargs='*', help=paths_help)

    args = parser.parse_args()

    return (args.log_level,
            os.path.realpath(args.directory),
            args.output,
            args.ar,
            # Fall back to the build directory when no path was given.
            args.paths or [args.directory])


def cmdfiles_in_dir(directory):
    """Generate the iterator of .cmd files found under the directory.

    Walk under the given directory, and yield every .cmd file found.

    Args:
        directory: The directory to search for .cmd files.

    Yields:
        The path to a .cmd file.
    """
    exclude_dirs = {os.path.join(directory, d) for d in _EXCLUDE_DIRS}

    for dirpath, dirnames, filenames in os.walk(directory, topdown=True):
        # Prune unwanted directories: with topdown=True, emptying dirnames
        # in place stops os.walk from descending any further.
        if dirpath in exclude_dirs:
            dirnames[:] = []
            continue

        for filename in filenames:
            if _FILENAME_MATCHER.match(filename):
                yield os.path.join(dirpath, filename)


def to_cmdfile(path):
    """Return the path of .cmd file used for the given build artifact

    Args:
        path: file path

    Returns:
        The path to .cmd file
    """
    dirname, base = os.path.split(path)
    return os.path.join(dirname, '.' + base + '.cmd')


def cmdfiles_for_a(archive, ar):
    """Generate the iterator of .cmd files associated with the archive.

    Parse the given archive, and yield every .cmd file used to build it.

    Args:
        archive: The archive to parse
        ar: Command used to list the archive members (run as `ar -t archive`)

    Yields:
        The path to every .cmd file found
    """
    for obj in subprocess.check_output([ar, '-t', archive]).decode().split():
        yield to_cmdfile(obj)


def cmdfiles_for_modorder(modorder):
    """Generate the iterator of .cmd files associated with the modules.order.

    Parse the given modules.order, and yield every .cmd file used to build the
    contained modules. Exits the program if a listed module does not end
    with .o.

    Args:
        modorder: The modules.order file to parse

    Yields:
        The path to every .cmd file found
    """
    with open(modorder) as f:
        for line in f:
            obj = line.rstrip()
            base, ext = os.path.splitext(obj)
            if ext != '.o':
                sys.exit('{}: module path must end with .o'.format(obj))
            mod = base + '.mod'
            # Read from *.mod, to get a list of objects that compose the module.
            with open(mod) as m:
                for mod_line in m:
                    yield to_cmdfile(mod_line.rstrip())


def _open_source(path):
    """Open a source file for scanning its #include directives.

    Undecodable bytes are replaced instead of raising UnicodeDecodeError.
    That exception is a ValueError subclass, so it would otherwise be caught
    by the ValueError handler in main() and drop the whole entry.
    """
    return open(path, 'r', encoding='utf-8', errors='replace')


def extract_includes_from_file(source_file, root_directory):
    """Extract #include statements from a C file.

    Includes of other .c files are skipped. A quoted include that exists next
    to source_file is returned relative to root_directory; every other include
    is returned exactly as written.

    Args:
        source_file: Path to the source .c file to analyze
        root_directory: Root directory for resolving relative paths

    Returns:
        List of header files that should be included (without quotes/brackets).
        The list is empty if the file cannot be read.
    """
    includes = []
    src_dir = os.path.dirname(source_file)

    try:
        with _open_source(source_file) as f:
            for line in f:
                # Match both #include "header.h" and #include <header.h>.
                match = _INCLUDE_PATTERN.match(line.strip())
                if not match:
                    continue

                header = match.group(1)
                # Skip including other .c files to avoid circular includes.
                if header.endswith('.c'):
                    continue

                # Decide quoted vs. system from the closing delimiter of the
                # directive itself, so a quote character in a trailing
                # comment cannot misclassify <header.h>.
                if match.group(0).endswith('"'):
                    header_path = os.path.join(src_dir, header)
                    if os.path.exists(header_path):
                        header = os.path.relpath(header_path, root_directory)

                includes.append(header)
    except OSError as err:
        # Best effort: an unreadable file contributes no extra includes.
        logging.debug('Could not read includes from %s: %s', source_file, err)

    return includes


def find_included_c_files(source_file, root_directory):
    """Find .c files that are included by the given source file.

    Only lines consisting solely of an #include "*.c" directive are
    considered, and only files that exist on disk are reported.

    Args:
        source_file: Path to the source .c file
        root_directory: Root directory for resolving relative paths
            (currently unused; kept for interface compatibility)

    Yields:
        Normalized paths to included .c files, resolved relative to the
        directory of source_file unless the include path is absolute
    """
    src_dir = os.path.dirname(source_file)

    try:
        with _open_source(source_file) as f:
            for line in f:
                match = _C_INCLUDE_PATTERN.match(line.strip())
                if not match:
                    continue

                # os.path.join() returns the second argument unchanged when
                # it is absolute, so absolute includes need no special case.
                included_file = os.path.normpath(
                    os.path.join(src_dir, match.group(1)))

                if os.path.exists(included_file):
                    yield included_file
    except OSError as err:
        # Best effort: an unreadable file yields no included sources.
        logging.debug('Could not scan %s for included .c files: %s',
                      source_file, err)


def process_line(root_directory, command_prefix, file_path):
    """Extracts information from a .cmd line and creates entries from it.

    Args:
        root_directory: The directory that was searched for .cmd files. Usually
            used directly in the "directory" entry in compile_commands.json.
        command_prefix: The extracted command line, up to the last element.
        file_path: The .c file from the end of the extracted command.
            Usually relative to root_directory, but sometimes absolute.

    Returns:
        A list of entries to append to compile_commands (may include multiple
        entries if the source file includes other .c files).

    Raises:
        ValueError: Could not find the extracted file based on file_path and
            root_directory or file_directory.
    """
    # The .cmd files are intended to be included directly by Make, so they
    # escape the pound sign '#' as '$(pound)'. The compile_commands.json file
    # is not interpreted by Make, so this code replaces the escaped version
    # with '#'.
    prefix = command_prefix.replace('$(pound)', '#')

    # Return the canonical path, eliminating any symbolic links encountered in the path.
    abs_path = os.path.realpath(os.path.join(root_directory, file_path))
    if not os.path.exists(abs_path):
        raise ValueError('File %s not found' % abs_path)

    # Entry for the main source file.
    entries = [{
        'directory': root_directory,
        'file': abs_path,
        'command': prefix + file_path,
    }]

    included_c_files = list(find_included_c_files(abs_path, root_directory))
    if not included_c_files:
        return entries

    # The parent's headers are the same for every included .c file, so they
    # are extracted once rather than once per included file.
    parent_includes = extract_includes_from_file(abs_path, root_directory)
    extra_includes = ''
    if parent_includes:
        extra_includes = ' ' + ' '.join('-include ' + inc for inc in parent_includes)

    # For included .c files, create a compilation command that:
    # 1. Uses the same compilation flags as the parent file
    # 2. But compiles the included file directly (not the parent)
    # 3. Includes necessary headers from the parent file for proper macro resolution
    for included_c_file in included_c_files:
        # Convert absolute path to relative for the command.
        rel_path = os.path.relpath(included_c_file, root_directory)
        entries.append({
            'directory': root_directory,
            'file': included_c_file,
            'command': prefix + extra_includes + ' ' + rel_path,
        })
        logging.debug('Added entry for included file: %s', included_c_file)

    return entries


def main():
    """Walks through the directory and finds and parses .cmd files."""
    log_level, directory, output, ar, paths = parse_arguments()

    level = getattr(logging, log_level)
    logging.basicConfig(format='%(levelname)s: %(message)s', level=level)

    line_matcher = re.compile(_LINE_PATTERN)

    compile_commands = []

    for path in paths:
        # If 'path' is a directory, handle all .cmd files under it.
        # Otherwise, handle .cmd files associated with the file.
        # built-in objects are linked via vmlinux.a
        # Modules are listed in modules.order.
        if os.path.isdir(path):
            cmdfiles = cmdfiles_in_dir(path)
        elif path.endswith('.a'):
            cmdfiles = cmdfiles_for_a(path, ar)
        elif path.endswith('modules.order'):
            cmdfiles = cmdfiles_for_modorder(path)
        else:
            sys.exit('{}: unknown file type'.format(path))

        for cmdfile in cmdfiles:
            # Only the first line of a .cmd file is matched against the
            # command pattern.
            with open(cmdfile, 'rt') as f:
                result = line_matcher.match(f.readline())
            if not result:
                continue

            try:
                entries = process_line(directory, result.group('command_prefix'),
                                       result.group('file_path'))
            except ValueError as err:
                logging.info('Could not add line from %s: %s',
                             cmdfile, err)
            else:
                compile_commands.extend(entries)

    with open(output, 'wt') as f:
        json.dump(sorted(compile_commands, key=lambda x: x["file"]), f, indent=2, sort_keys=True)


if __name__ == '__main__':
    main()