xref: /linux/kernel/module/stats.c (revision 0ea5c948cb64bab5bc7a5516774eb8536f05aa0d)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * Debugging module statistics.
4   *
5   * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
6   */
7  
8  #include <linux/module.h>
9  #include <uapi/linux/module.h>
10  #include <linux/string.h>
11  #include <linux/printk.h>
12  #include <linux/slab.h>
13  #include <linux/list.h>
14  #include <linux/debugfs.h>
15  #include <linux/rculist.h>
16  #include <linux/math.h>
17  
18  #include "internal.h"
19  
20  /**
21   * DOC: module debugging statistics overview
22   *
23   * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
24   * are useful to monitor and root cause memory pressure issues with module
25   * loading. These statistics are useful to allow us to improve production
26   * workloads.
27   *
28   * The current module debugging statistics supported help keep track of module
29   * loading failures to enable improvements either for kernel module auto-loading
30   * usage (request_module()) or interactions with userspace. Statistics are
31   * provided to track all possible failures in the finit_module() path and memory
32   * wasted in this process space.  Each of the failure counters are associated
33   * to a type of module loading failure which is known to incur a certain amount
34   * of memory allocation loss. In the worst case loading a module will fail after
35   * a 3 step memory allocation process:
36   *
37   *   a) memory allocated with kernel_read_file_from_fd()
38   *   b) module decompression processes the file read from
39   *      kernel_read_file_from_fd(), and vmap() is used to map
40   *      the decompressed module to a new local buffer which represents
41   *      a copy of the decompressed module passed from userspace. The buffer
42   *      from kernel_read_file_from_fd() is freed right away.
43   *   c) layout_and_allocate() allocates space for the final resting
44   *      place where we would keep the module if it were to be processed
45   *      successfully.
46   *
47   * If a failure occurs after these three different allocations only one
48   * counter will be incremented with the summation of the allocated bytes freed
49   * incurred during this failure. Likewise, if module loading failed only after
50   * step b) a separate counter is used and incremented for the bytes freed and
51   * not used during both of those allocations.
52   *
53   * Virtual memory space can be limited, for example on x86 virtual memory size
54   * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
55   * memory allocations when possible. These module debugging statistics help
56   * to evaluate how much memory is being wasted on bootup due to module loading
57   * failures.
58   *
59   * All counters are designed to be incremental. Atomic counters are used so as to
60   * remain simple and avoid delays and deadlocks.
61   */
62  
63  /**
64   * DOC: dup_failed_modules - tracks duplicate failed modules
65   *
66   * Linked list of modules which failed to be loaded because an already existing
67   * module with the same name was already being processed or already loaded.
68   * The finit_module() system call incurs heavy virtual memory allocations. In
69   * the worst case an finit_module() system call can end up allocating virtual
70   * memory 3 times:
71   *
72   *   1) kernel_read_file_from_fd() call uses vmalloc()
73   *   2) optional module decompression uses vmap()
74   *   3) layout_and_allocate() can use vzalloc() or an arch specific variation of
75   *      vmalloc to deal with ELF sections requiring special permissions
76   *
77   * In practice on a typical boot today most finit_module() calls fail due to
78   * the module with the same name already being loaded or about to be processed.
79   * All virtual memory allocated to these failed modules will be freed with
80   * no functional use.
81   *
82   * To help with this the dup_failed_modules allows us to track modules which
83   * failed to load due to the fact that a module was already loaded or being
84   * processed.  There are only two points at which we can fail such calls,
85   * we list them below along with the number of virtual memory allocation
86   * calls:
87   *
88   *   a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
89   *	layout_and_allocate().
90   *	- with module decompression: 2 virtual memory allocation calls
91   *	- without module decompression: 1 virtual memory allocation calls
92   *   b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
93   *   	- with module decompression 3 virtual memory allocation calls
94   *   	- without module decompression 2 virtual memory allocation calls
95   *
96   * We should strive to get this list to be as small as possible. If this list
97   * is not empty it is a reflection of possible work or optimizations possible
98   * either in-kernel or in userspace.
99   */
100  static LIST_HEAD(dup_failed_modules);
101  
102  /**
103   * DOC: module statistics debugfs counters
104   *
105   * The total amount of wasted virtual memory allocation space during module
106   * loading can be computed by adding the total from the summation:
107   *
108   *   * @invalid_kread_bytes +
109   *     @invalid_decompress_bytes +
110   *     @invalid_becoming_bytes +
111   *     @invalid_mod_bytes
112   *
113   * The following debugfs counters are available to inspect module loading
114   * failures:
115   *
116   *   * total_mod_size: total bytes ever used by all modules we've dealt with on
117   *     this system
118   *   * total_text_size: total bytes of the .text and .init.text ELF section
119   *     sizes we've dealt with on this system
120   *   * invalid_kread_bytes: bytes allocated and then freed on failures which
121   *     happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
122   *     uses vmalloc(). These should typically not happen unless your system is
123   *     under memory pressure.
124   *   * invalid_decompress_bytes: number of bytes allocated and freed due to
125   *     memory allocations in the module decompression path that use vmap().
126   *     These typically should not happen unless your system is under memory
127   *     pressure.
128   *   * invalid_becoming_bytes: total number of bytes allocated and freed used
129   *     to read the kernel module userspace wants us to read before we
130   *     promote it to be processed to be added to our @modules linked list. These
131   *     failures can happen if we had a check in between a successful kernel_read_file_from_fd()
132   *     call and right before we allocate our private memory for the module
133   *     which would be kept if the module is successfully loaded. The most common
134   *     reason for this failure is when userspace is racing to load a module
135   *     which it does not yet see loaded. The first module to succeed in
136   *     add_unformed_module() will add a module to our &modules list and
137   *     subsequent loads of modules with the same name will error out at the
138   *     end of early_mod_check(). The check for module_patient_check_exists()
139   *     at the end of early_mod_check() prevents duplicate allocations
140   *     on layout_and_allocate() for modules already being processed. These
141   *     duplicate failed modules are non-fatal, however they typically are
142   *     indicative of userspace not seeing a module in userspace loaded yet and
143   *     unnecessarily trying to load a module before the kernel even has a chance
144   *     to begin to process prior requests. Although duplicate failures can be
145   *     non-fatal, we should try to reduce vmalloc() pressure proactively, so
146   *     ideally after boot this will be as close to 0 as possible.  If module
147   *     decompression was used we also add to this counter the cost of the
148   *     initial kernel_read_file_from_fd() of the compressed module. If module
149   *     decompression was not used the value represents the total allocated and
150   *     freed bytes in kernel_read_file_from_fd() calls for these type of
151   *     failures. These failures can occur because:
152   *
153   *    * module_sig_check() - module signature checks
154   *    * elf_validity_cache_copy() - some ELF validation issue
155   *    * early_mod_check():
156   *
157   *      * blacklisting
158   *      * failed to rewrite section headers
159   *      * version magic
160   *      * live patch requirements didn't check out
161   *      * the module was detected as being already present
162   *
163   *   * invalid_mod_bytes: these are the total number of bytes allocated and
164   *     freed due to failures after we did all the sanity checks of the module
165   *     which userspace passed to us and after our first check that the module
166   *     is unique.  A module can still fail to load if we detect the module is
167   *     loaded after we allocate space for it with layout_and_allocate(), we do
168   *     this check right before processing the module as live and run its
169   *     initialization routines. Note that if you have a failure of this type it
170   *     also means the respective kernel_read_file_from_fd() memory space was
171   *     also freed and not used, and so we increment this counter with twice
172   *     the size of the module. Additionally if you used module decompression
173   *     the size of the compressed module is also added to this counter.
174   *
175   *  * modcount: how many modules we've loaded in our kernel life time
176   *  * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
177   *  * failed_decompress: how many failed module decompression attempts we've had.
178   *    These really should not happen unless your compression / decompression
179   *    might be broken.
180   *  * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
181   *    it and before we allocate memory for it with layout_and_allocate(). This
182   *    counter is never incremented if you manage to validate the module and
183   *    call layout_and_allocate() for it.
184   *  * failed_load_modules: how many modules failed once we've allocated our
185   *    private space for our module using layout_and_allocate(). These failures
186   *    should hopefully mostly be dealt with already. Races in theory could
187   *    still exist here, but it would just mean the kernel had started processing
188   *    two threads concurrently up to early_mod_check() and one thread won.
189   *    These failures are good signs the kernel or userspace is doing something
190   *    seriously stupid or that could be improved. We should strive to fix these,
191   *    but it is perhaps not easy to fix them. A recent example are the modules
192   *    requests incurred for frequency modules, a separate module request was
193   *    being issued for each CPU on a system.
194   */
195  
196  atomic_long_t total_mod_size;
197  atomic_long_t total_text_size;
198  atomic_long_t invalid_kread_bytes;
199  atomic_long_t invalid_decompress_bytes;
200  static atomic_long_t invalid_becoming_bytes;
201  static atomic_long_t invalid_mod_bytes;
202  atomic_t modcount;
203  atomic_t failed_kreads;
204  atomic_t failed_decompress;
205  static atomic_t failed_becoming;
206  static atomic_t failed_load_modules;
207  
mod_fail_to_str(struct mod_fail_load * mod_fail)208  static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
209  {
210  	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
211  	    test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
212  		return "Becoming & Load";
213  	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
214  		return "Becoming";
215  	if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
216  		return "Load";
217  	return "Bug-on-stats";
218  }
219  
mod_stat_bump_invalid(struct load_info * info,int flags)220  void mod_stat_bump_invalid(struct load_info *info, int flags)
221  {
222  	atomic_long_add(info->len * 2, &invalid_mod_bytes);
223  	atomic_inc(&failed_load_modules);
224  #if defined(CONFIG_MODULE_DECOMPRESS)
225  	if (flags & MODULE_INIT_COMPRESSED_FILE)
226  		atomic_long_add(info->compressed_len, &invalid_mod_bytes);
227  #endif
228  }
229  
mod_stat_bump_becoming(struct load_info * info,int flags)230  void mod_stat_bump_becoming(struct load_info *info, int flags)
231  {
232  	atomic_inc(&failed_becoming);
233  	atomic_long_add(info->len, &invalid_becoming_bytes);
234  #if defined(CONFIG_MODULE_DECOMPRESS)
235  	if (flags & MODULE_INIT_COMPRESSED_FILE)
236  		atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
237  #endif
238  }
239  
try_add_failed_module(const char * name,enum fail_dup_mod_reason reason)240  int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
241  {
242  	struct mod_fail_load *mod_fail;
243  
244  	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
245  				lockdep_is_held(&module_mutex)) {
246  		if (!strcmp(mod_fail->name, name)) {
247  			atomic_long_inc(&mod_fail->count);
248  			__set_bit(reason, &mod_fail->dup_fail_mask);
249  			goto out;
250  		}
251  	}
252  
253  	mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
254  	if (!mod_fail)
255  		return -ENOMEM;
256  	memcpy(mod_fail->name, name, strlen(name));
257  	__set_bit(reason, &mod_fail->dup_fail_mask);
258  	atomic_long_inc(&mod_fail->count);
259  	list_add_rcu(&mod_fail->list, &dup_failed_modules);
260  out:
261  	return 0;
262  }
263  
264  /*
265   * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
266   * 112 module prints within 8k.
267   *
268   * 1024 + (64*112) = 8k
269   */
270  #define MAX_PREAMBLE 1024
271  #define MAX_FAILED_MOD_PRINT 112
272  #define MAX_BYTES_PER_MOD 64
read_file_mod_stats(struct file * file,char __user * user_buf,size_t count,loff_t * ppos)273  static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
274  				   size_t count, loff_t *ppos)
275  {
276  	struct mod_fail_load *mod_fail;
277  	unsigned int len, size, count_failed = 0;
278  	char *buf;
279  	int ret;
280  	u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
281  	unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
282  		idecompress_bytes, imod_bytes, total_virtual_lost;
283  
284  	live_mod_count = atomic_read(&modcount);
285  	fkreads = atomic_read(&failed_kreads);
286  	fdecompress = atomic_read(&failed_decompress);
287  	fbecoming = atomic_read(&failed_becoming);
288  	floads = atomic_read(&failed_load_modules);
289  
290  	total_size = atomic_long_read(&total_mod_size);
291  	text_size = atomic_long_read(&total_text_size);
292  	ikread_bytes = atomic_long_read(&invalid_kread_bytes);
293  	idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
294  	ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
295  	imod_bytes = atomic_long_read(&invalid_mod_bytes);
296  
297  	total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
298  
299  	size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
300  				  (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
301  	buf = kzalloc(size, GFP_KERNEL);
302  	if (buf == NULL)
303  		return -ENOMEM;
304  
305  	/* The beginning of our debug preamble */
306  	len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
307  
308  	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
309  
310  	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
311  			 fdecompress);
312  	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
313  
314  	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
315  
316  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
317  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
318  
319  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
320  
321  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
322  			 idecompress_bytes);
323  
324  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
325  
326  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
327  
328  	len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
329  
330  	if (live_mod_count && total_size) {
331  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
332  				 DIV_ROUND_UP(total_size, live_mod_count));
333  	}
334  
335  	if (live_mod_count && text_size) {
336  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
337  				 DIV_ROUND_UP(text_size, live_mod_count));
338  	}
339  
340  	/*
341  	 * We use WARN_ON_ONCE() for the counters to ensure we always have parity
342  	 * for keeping tabs on a type of failure with one type of byte counter.
343  	 * The counters for imod_bytes does not increase for fkreads failures
344  	 * for example, and so on.
345  	 */
346  
347  	WARN_ON_ONCE(ikread_bytes && !fkreads);
348  	if (fkreads && ikread_bytes) {
349  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
350  				 DIV_ROUND_UP(ikread_bytes, fkreads));
351  	}
352  
353  	WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
354  	if (fbecoming && ibecoming_bytes) {
355  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
356  				 DIV_ROUND_UP(ibecoming_bytes, fbecoming));
357  	}
358  
359  	WARN_ON_ONCE(idecompress_bytes && !fdecompress);
360  	if (fdecompress && idecompress_bytes) {
361  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
362  				 DIV_ROUND_UP(idecompress_bytes, fdecompress));
363  	}
364  
365  	WARN_ON_ONCE(imod_bytes && !floads);
366  	if (floads && imod_bytes) {
367  		len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
368  				 DIV_ROUND_UP(imod_bytes, floads));
369  	}
370  
371  	/* End of our debug preamble header. */
372  
373  	/* Catch when we've gone beyond our expected preamble */
374  	WARN_ON_ONCE(len >= MAX_PREAMBLE);
375  
376  	if (list_empty(&dup_failed_modules))
377  		goto out;
378  
379  	len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
380  	len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
381  			 "Module-name", "How-many-times", "Reason");
382  	mutex_lock(&module_mutex);
383  
384  
385  	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
386  		if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
387  			goto out_unlock;
388  		len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
389  				 atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
390  	}
391  out_unlock:
392  	mutex_unlock(&module_mutex);
393  out:
394  	ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
395  	kfree(buf);
396  	return ret;
397  }
398  #undef MAX_PREAMBLE
399  #undef MAX_FAILED_MOD_PRINT
400  #undef MAX_BYTES_PER_MOD
401  
402  static const struct file_operations fops_mod_stats = {
403  	.read = read_file_mod_stats,
404  	.open = simple_open,
405  	.owner = THIS_MODULE,
406  	.llseek = default_llseek,
407  };
408  
409  #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
410  #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
module_stats_init(void)411  static int __init module_stats_init(void)
412  {
413  	mod_debug_add_ulong(total_mod_size);
414  	mod_debug_add_ulong(total_text_size);
415  	mod_debug_add_ulong(invalid_kread_bytes);
416  	mod_debug_add_ulong(invalid_decompress_bytes);
417  	mod_debug_add_ulong(invalid_becoming_bytes);
418  	mod_debug_add_ulong(invalid_mod_bytes);
419  
420  	mod_debug_add_atomic(modcount);
421  	mod_debug_add_atomic(failed_kreads);
422  	mod_debug_add_atomic(failed_decompress);
423  	mod_debug_add_atomic(failed_becoming);
424  	mod_debug_add_atomic(failed_load_modules);
425  
426  	debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
427  
428  	return 0;
429  }
430  #undef mod_debug_add_ulong
431  #undef mod_debug_add_atomic
432  module_init(module_stats_init);
433