124b2e73fSBreno Leitao // SPDX-License-Identifier: GPL-2.0 224b2e73fSBreno Leitao 324b2e73fSBreno Leitao /* 424b2e73fSBreno Leitao * Test module for stress and performance analysis of workqueue. 524b2e73fSBreno Leitao * 624b2e73fSBreno Leitao * Benchmarks queue_work() throughput on an unbound workqueue to measure 724b2e73fSBreno Leitao * pool->lock contention under different affinity scope configurations 824b2e73fSBreno Leitao * (e.g., cache vs cache_shard). 924b2e73fSBreno Leitao * 1024b2e73fSBreno Leitao * The affinity scope is changed between runs via the workqueue's sysfs 1124b2e73fSBreno Leitao * affinity_scope attribute (WQ_SYSFS). 1224b2e73fSBreno Leitao * 1324b2e73fSBreno Leitao * Copyright (c) 2026 Meta Platforms, Inc. and affiliates 1424b2e73fSBreno Leitao * Copyright (c) 2026 Breno Leitao <leitao@debian.org> 1524b2e73fSBreno Leitao * 1624b2e73fSBreno Leitao */ 1724b2e73fSBreno Leitao #include <linux/init.h> 1824b2e73fSBreno Leitao #include <linux/kernel.h> 1924b2e73fSBreno Leitao #include <linux/module.h> 2024b2e73fSBreno Leitao #include <linux/workqueue.h> 2124b2e73fSBreno Leitao #include <linux/kthread.h> 2224b2e73fSBreno Leitao #include <linux/moduleparam.h> 2324b2e73fSBreno Leitao #include <linux/completion.h> 2424b2e73fSBreno Leitao #include <linux/atomic.h> 2524b2e73fSBreno Leitao #include <linux/slab.h> 2624b2e73fSBreno Leitao #include <linux/ktime.h> 2724b2e73fSBreno Leitao #include <linux/cpumask.h> 2824b2e73fSBreno Leitao #include <linux/sched.h> 2924b2e73fSBreno Leitao #include <linux/sort.h> 3024b2e73fSBreno Leitao #include <linux/fs.h> 3124b2e73fSBreno Leitao 3224b2e73fSBreno Leitao #define WQ_NAME "bench_wq" 3324b2e73fSBreno Leitao #define SCOPE_PATH "/sys/bus/workqueue/devices/" WQ_NAME "/affinity_scope" 3424b2e73fSBreno Leitao 3524b2e73fSBreno Leitao static int nr_threads; 3624b2e73fSBreno Leitao module_param(nr_threads, int, 0444); 3724b2e73fSBreno Leitao MODULE_PARM_DESC(nr_threads, 3824b2e73fSBreno Leitao "Number of threads to spawn (default: 0 = num_online_cpus())"); 3924b2e73fSBreno Leitao 4024b2e73fSBreno Leitao static int wq_items = 50000; 4124b2e73fSBreno Leitao module_param(wq_items, int, 0444); 4224b2e73fSBreno Leitao MODULE_PARM_DESC(wq_items, 4324b2e73fSBreno Leitao "Number of work items each thread queues (default: 50000)"); 4424b2e73fSBreno Leitao 4524b2e73fSBreno Leitao static struct workqueue_struct *bench_wq; 4624b2e73fSBreno Leitao static atomic_t threads_done; 4724b2e73fSBreno Leitao static DECLARE_COMPLETION(start_comp); 4824b2e73fSBreno Leitao static DECLARE_COMPLETION(all_done_comp); 4924b2e73fSBreno Leitao 5024b2e73fSBreno Leitao struct thread_ctx { 5124b2e73fSBreno Leitao struct completion work_done; 5224b2e73fSBreno Leitao struct work_struct work; 5324b2e73fSBreno Leitao u64 *latencies; 5424b2e73fSBreno Leitao int cpu; 5524b2e73fSBreno Leitao int items; 5624b2e73fSBreno Leitao }; 5724b2e73fSBreno Leitao 5824b2e73fSBreno Leitao static void bench_work_fn(struct work_struct *work) 5924b2e73fSBreno Leitao { 6024b2e73fSBreno Leitao struct thread_ctx *ctx = container_of(work, struct thread_ctx, work); 6124b2e73fSBreno Leitao 6224b2e73fSBreno Leitao complete(&ctx->work_done); 6324b2e73fSBreno Leitao } 6424b2e73fSBreno Leitao 6524b2e73fSBreno Leitao static int bench_kthread_fn(void *data) 6624b2e73fSBreno Leitao { 6724b2e73fSBreno Leitao struct thread_ctx *ctx = data; 6824b2e73fSBreno Leitao ktime_t t_start, t_end; 6924b2e73fSBreno Leitao int i; 7024b2e73fSBreno Leitao 7124b2e73fSBreno Leitao /* Wait for all threads to be ready */ 7224b2e73fSBreno Leitao wait_for_completion(&start_comp); 7324b2e73fSBreno Leitao 7424b2e73fSBreno Leitao if (kthread_should_stop()) 7524b2e73fSBreno Leitao return 0; 7624b2e73fSBreno Leitao 7724b2e73fSBreno Leitao for (i = 0; i < ctx->items; i++) { 7824b2e73fSBreno Leitao reinit_completion(&ctx->work_done); 7924b2e73fSBreno Leitao INIT_WORK(&ctx->work, bench_work_fn); 8024b2e73fSBreno Leitao 8124b2e73fSBreno Leitao t_start = ktime_get(); 8224b2e73fSBreno Leitao queue_work(bench_wq, &ctx->work); 8324b2e73fSBreno Leitao t_end = ktime_get(); 8424b2e73fSBreno Leitao 8524b2e73fSBreno Leitao ctx->latencies[i] = ktime_to_ns(ktime_sub(t_end, t_start)); 8624b2e73fSBreno Leitao wait_for_completion(&ctx->work_done); 8724b2e73fSBreno Leitao } 8824b2e73fSBreno Leitao 8924b2e73fSBreno Leitao if (atomic_dec_and_test(&threads_done)) 9024b2e73fSBreno Leitao complete(&all_done_comp); 9124b2e73fSBreno Leitao 9224b2e73fSBreno Leitao /* 9324b2e73fSBreno Leitao * Wait for kthread_stop() so the module text isn't freed 9424b2e73fSBreno Leitao * while we're still executing. 9524b2e73fSBreno Leitao */ 9624b2e73fSBreno Leitao while (!kthread_should_stop()) 9724b2e73fSBreno Leitao schedule(); 9824b2e73fSBreno Leitao 9924b2e73fSBreno Leitao return 0; 10024b2e73fSBreno Leitao } 10124b2e73fSBreno Leitao 10224b2e73fSBreno Leitao static int cmp_u64(const void *a, const void *b) 10324b2e73fSBreno Leitao { 10424b2e73fSBreno Leitao u64 va = *(const u64 *)a; 10524b2e73fSBreno Leitao u64 vb = *(const u64 *)b; 10624b2e73fSBreno Leitao 10724b2e73fSBreno Leitao if (va < vb) 10824b2e73fSBreno Leitao return -1; 10924b2e73fSBreno Leitao if (va > vb) 11024b2e73fSBreno Leitao return 1; 11124b2e73fSBreno Leitao return 0; 11224b2e73fSBreno Leitao } 11324b2e73fSBreno Leitao 11424b2e73fSBreno Leitao static int __init set_affn_scope(const char *scope) 11524b2e73fSBreno Leitao { 11624b2e73fSBreno Leitao struct file *f; 11724b2e73fSBreno Leitao loff_t pos = 0; 11824b2e73fSBreno Leitao ssize_t ret; 11924b2e73fSBreno Leitao 12024b2e73fSBreno Leitao f = filp_open(SCOPE_PATH, O_WRONLY, 0); 12124b2e73fSBreno Leitao if (IS_ERR(f)) { 12224b2e73fSBreno Leitao pr_err("test_workqueue: open %s failed: %ld\n", 12324b2e73fSBreno Leitao SCOPE_PATH, PTR_ERR(f)); 12424b2e73fSBreno Leitao return PTR_ERR(f); 12524b2e73fSBreno Leitao } 12624b2e73fSBreno Leitao 12724b2e73fSBreno Leitao ret = kernel_write(f, scope, strlen(scope), &pos); 12824b2e73fSBreno Leitao filp_close(f, NULL); 12924b2e73fSBreno Leitao 13024b2e73fSBreno Leitao if (ret < 0) { 13124b2e73fSBreno Leitao pr_err("test_workqueue: write '%s' failed: %zd\n", scope, ret); 13224b2e73fSBreno Leitao return ret; 13324b2e73fSBreno Leitao } 13424b2e73fSBreno Leitao 13524b2e73fSBreno Leitao return 0; 13624b2e73fSBreno Leitao } 13724b2e73fSBreno Leitao 13824b2e73fSBreno Leitao static int __init run_bench(int n_threads, const char *scope, const char *label) 13924b2e73fSBreno Leitao { 14024b2e73fSBreno Leitao struct task_struct **tasks; 14124b2e73fSBreno Leitao unsigned long total_items; 14224b2e73fSBreno Leitao struct thread_ctx *ctxs; 14324b2e73fSBreno Leitao u64 *all_latencies; 14424b2e73fSBreno Leitao ktime_t start, end; 14524b2e73fSBreno Leitao int cpu, i, j, ret; 14624b2e73fSBreno Leitao s64 elapsed_us; 14724b2e73fSBreno Leitao 14824b2e73fSBreno Leitao ret = set_affn_scope(scope); 14924b2e73fSBreno Leitao if (ret) 15024b2e73fSBreno Leitao return ret; 15124b2e73fSBreno Leitao 15224b2e73fSBreno Leitao ctxs = kcalloc(n_threads, sizeof(*ctxs), GFP_KERNEL); 15324b2e73fSBreno Leitao if (!ctxs) 15424b2e73fSBreno Leitao return -ENOMEM; 15524b2e73fSBreno Leitao 15624b2e73fSBreno Leitao tasks = kcalloc(n_threads, sizeof(*tasks), GFP_KERNEL); 15724b2e73fSBreno Leitao if (!tasks) { 15824b2e73fSBreno Leitao kfree(ctxs); 15924b2e73fSBreno Leitao return -ENOMEM; 16024b2e73fSBreno Leitao } 16124b2e73fSBreno Leitao 16224b2e73fSBreno Leitao total_items = (unsigned long)n_threads * wq_items; 16324b2e73fSBreno Leitao all_latencies = kvmalloc_array(total_items, sizeof(u64), GFP_KERNEL); 16424b2e73fSBreno Leitao if (!all_latencies) { 16524b2e73fSBreno Leitao kfree(tasks); 16624b2e73fSBreno Leitao kfree(ctxs); 16724b2e73fSBreno Leitao return -ENOMEM; 16824b2e73fSBreno Leitao } 16924b2e73fSBreno Leitao 17024b2e73fSBreno Leitao /* Allocate per-thread latency arrays */ 17124b2e73fSBreno Leitao for (i = 0; i < n_threads; i++) { 17224b2e73fSBreno Leitao ctxs[i].latencies = kvmalloc_array(wq_items, sizeof(u64), 17324b2e73fSBreno Leitao GFP_KERNEL); 17424b2e73fSBreno Leitao if (!ctxs[i].latencies) { 17524b2e73fSBreno Leitao while (--i >= 0) 17624b2e73fSBreno Leitao kvfree(ctxs[i].latencies); 17724b2e73fSBreno Leitao kvfree(all_latencies); 17824b2e73fSBreno Leitao kfree(tasks); 17924b2e73fSBreno Leitao kfree(ctxs); 18024b2e73fSBreno Leitao return -ENOMEM; 18124b2e73fSBreno Leitao } 18224b2e73fSBreno Leitao } 18324b2e73fSBreno Leitao 18424b2e73fSBreno Leitao atomic_set(&threads_done, n_threads); 18524b2e73fSBreno Leitao reinit_completion(&all_done_comp); 18624b2e73fSBreno Leitao reinit_completion(&start_comp); 18724b2e73fSBreno Leitao 18824b2e73fSBreno Leitao /* Create kthreads, each bound to a different online CPU */ 18924b2e73fSBreno Leitao i = 0; 19024b2e73fSBreno Leitao for_each_online_cpu(cpu) { 19124b2e73fSBreno Leitao if (i >= n_threads) 19224b2e73fSBreno Leitao break; 19324b2e73fSBreno Leitao 19424b2e73fSBreno Leitao ctxs[i].cpu = cpu; 19524b2e73fSBreno Leitao ctxs[i].items = wq_items; 19624b2e73fSBreno Leitao init_completion(&ctxs[i].work_done); 19724b2e73fSBreno Leitao 19824b2e73fSBreno Leitao tasks[i] = kthread_create(bench_kthread_fn, &ctxs[i], 19924b2e73fSBreno Leitao "wq_bench/%d", cpu); 20024b2e73fSBreno Leitao if (IS_ERR(tasks[i])) { 20124b2e73fSBreno Leitao ret = PTR_ERR(tasks[i]); 20224b2e73fSBreno Leitao pr_err("test_workqueue: failed to create kthread %d: %d\n", 20324b2e73fSBreno Leitao i, ret); 20424b2e73fSBreno Leitao /* Unblock threads waiting on start_comp before stopping them */ 20524b2e73fSBreno Leitao complete_all(&start_comp); 20624b2e73fSBreno Leitao while (--i >= 0) 20724b2e73fSBreno Leitao kthread_stop(tasks[i]); 20824b2e73fSBreno Leitao goto out_free; 20924b2e73fSBreno Leitao } 21024b2e73fSBreno Leitao 21124b2e73fSBreno Leitao kthread_bind(tasks[i], cpu); 21224b2e73fSBreno Leitao wake_up_process(tasks[i]); 21324b2e73fSBreno Leitao i++; 21424b2e73fSBreno Leitao } 21524b2e73fSBreno Leitao 21624b2e73fSBreno Leitao /* Start timing and release all threads */ 21724b2e73fSBreno Leitao start = ktime_get(); 21824b2e73fSBreno Leitao complete_all(&start_comp); 21924b2e73fSBreno Leitao 22024b2e73fSBreno Leitao /* Wait for all threads to finish the benchmark */ 22124b2e73fSBreno Leitao wait_for_completion(&all_done_comp); 22224b2e73fSBreno Leitao 22324b2e73fSBreno Leitao /* Drain any remaining work */ 22424b2e73fSBreno Leitao flush_workqueue(bench_wq); 22524b2e73fSBreno Leitao 22624b2e73fSBreno Leitao /* Ensure all kthreads have fully exited before module memory is freed */ 22724b2e73fSBreno Leitao for (i = 0; i < n_threads; i++) 22824b2e73fSBreno Leitao kthread_stop(tasks[i]); 22924b2e73fSBreno Leitao 23024b2e73fSBreno Leitao end = ktime_get(); 23124b2e73fSBreno Leitao elapsed_us = ktime_us_delta(end, start); 23224b2e73fSBreno Leitao 23324b2e73fSBreno Leitao /* Merge all per-thread latencies and sort for percentile calculation */ 23424b2e73fSBreno Leitao j = 0; 23524b2e73fSBreno Leitao for (i = 0; i < n_threads; i++) { 23624b2e73fSBreno Leitao memcpy(&all_latencies[j], ctxs[i].latencies, 23724b2e73fSBreno Leitao wq_items * sizeof(u64)); 23824b2e73fSBreno Leitao j += wq_items; 23924b2e73fSBreno Leitao } 24024b2e73fSBreno Leitao 24124b2e73fSBreno Leitao sort(all_latencies, total_items, sizeof(u64), cmp_u64, NULL); 24224b2e73fSBreno Leitao 24324b2e73fSBreno Leitao pr_info("test_workqueue: %-16s %llu items/sec\tp50=%llu\tp90=%llu\tp95=%llu ns\n", 24424b2e73fSBreno Leitao label, 245*c6890f36SArnd Bergmann elapsed_us ? div_u64(total_items * 1000000ULL, elapsed_us) : 0, 24624b2e73fSBreno Leitao all_latencies[total_items * 50 / 100], 24724b2e73fSBreno Leitao all_latencies[total_items * 90 / 100], 24824b2e73fSBreno Leitao all_latencies[total_items * 95 / 100]); 24924b2e73fSBreno Leitao 25024b2e73fSBreno Leitao ret = 0; 25124b2e73fSBreno Leitao out_free: 25224b2e73fSBreno Leitao for (i = 0; i < n_threads; i++) 25324b2e73fSBreno Leitao kvfree(ctxs[i].latencies); 25424b2e73fSBreno Leitao kvfree(all_latencies); 25524b2e73fSBreno Leitao kfree(tasks); 25624b2e73fSBreno Leitao kfree(ctxs); 25724b2e73fSBreno Leitao 25824b2e73fSBreno Leitao return ret; 25924b2e73fSBreno Leitao } 26024b2e73fSBreno Leitao 26124b2e73fSBreno Leitao static const char * const bench_scopes[] = { 26224b2e73fSBreno Leitao "cpu", "smt", "cache_shard", "cache", "numa", "system", 26324b2e73fSBreno Leitao }; 26424b2e73fSBreno Leitao 26524b2e73fSBreno Leitao static int __init test_workqueue_init(void) 26624b2e73fSBreno Leitao { 26724b2e73fSBreno Leitao int n_threads = min(nr_threads ?: num_online_cpus(), num_online_cpus()); 26824b2e73fSBreno Leitao int i; 26924b2e73fSBreno Leitao 27024b2e73fSBreno Leitao if (wq_items <= 0) { 27124b2e73fSBreno Leitao pr_err("test_workqueue: wq_items must be > 0\n"); 27224b2e73fSBreno Leitao return -EINVAL; 27324b2e73fSBreno Leitao } 27424b2e73fSBreno Leitao 27524b2e73fSBreno Leitao bench_wq = alloc_workqueue(WQ_NAME, WQ_UNBOUND | WQ_SYSFS, 0); 27624b2e73fSBreno Leitao if (!bench_wq) 27724b2e73fSBreno Leitao return -ENOMEM; 27824b2e73fSBreno Leitao 27924b2e73fSBreno Leitao pr_info("test_workqueue: running %d threads, %d items/thread\n", 28024b2e73fSBreno Leitao n_threads, wq_items); 28124b2e73fSBreno Leitao 28224b2e73fSBreno Leitao for (i = 0; i < ARRAY_SIZE(bench_scopes); i++) 28324b2e73fSBreno Leitao run_bench(n_threads, bench_scopes[i], bench_scopes[i]); 28424b2e73fSBreno Leitao 28524b2e73fSBreno Leitao destroy_workqueue(bench_wq); 28624b2e73fSBreno Leitao 28724b2e73fSBreno Leitao /* Return -EAGAIN so the module doesn't stay loaded after the benchmark */ 28824b2e73fSBreno Leitao return -EAGAIN; 28924b2e73fSBreno Leitao } 29024b2e73fSBreno Leitao 29124b2e73fSBreno Leitao module_init(test_workqueue_init); 29224b2e73fSBreno Leitao MODULE_AUTHOR("Breno Leitao <leitao@debian.org>"); 29324b2e73fSBreno Leitao MODULE_DESCRIPTION("Stress/performance benchmark for workqueue subsystem"); 29424b2e73fSBreno Leitao MODULE_LICENSE("GPL"); 295