// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002 Ingo Molnar
 *
 * Derived from the taskqueue/keventd code by:
 *   David Woodhouse <dwmw2@infradead.org>
 *   Andrew Morton
 *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *   Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010 SUSE Linux Products GmbH
 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism. Work items are
 * executed in process context. The worker pool is shared and
 * automatically managed. There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

enum worker_pool_flags {
        /*
         * worker_pool flags
         *
         * A bound pool is either associated or disassociated with its CPU.
         * While associated (!DISASSOCIATED), all workers are bound to the
         * CPU and none has %WORKER_UNBOUND set and concurrency management
         * is in effect.
         *
         * While DISASSOCIATED, the cpu may be offline and all workers have
         * %WORKER_UNBOUND set and concurrency management disabled, and may
         * be executing on any CPU. The pool behaves as an unbound one.
         *
         * Note that DISASSOCIATED should be flipped only while holding
         * wq_pool_attach_mutex to avoid changing binding state while
         * worker_attach_to_pool() is in progress.
         *
         * As there can only be one concurrent BH execution context per CPU, a
         * BH pool is per-CPU and always DISASSOCIATED.
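         *
         * For example (illustrative sketch; my_bh_work and my_bh_func are
         * hypothetical names), a BH work item is queued like any other work
         * item but executes in softirq context on the queueing CPU:
         *
         *      INIT_WORK(&my_bh_work, my_bh_func);
         *      queue_work(system_bh_wq, &my_bh_work);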
         */
        POOL_BH                 = 1 << 0,       /* is a BH pool */
        POOL_MANAGER_ACTIVE     = 1 << 1,       /* being managed */
        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
        POOL_BH_DRAINING        = 1 << 3,       /* draining after CPU offline */
};

enum worker_flags {
        /* worker flags */
        WORKER_DIE              = 1 << 1,       /* die die die */
        WORKER_IDLE             = 1 << 2,       /* is idle */
        WORKER_PREP             = 1 << 3,       /* preparing to run works */
        WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
        WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
        WORKER_REBOUND          = 1 << 8,       /* worker was rebound */

        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_CPU_INTENSIVE |
                                  WORKER_UNBOUND | WORKER_REBOUND,
};

enum work_cancel_flags {
        WORK_CANCEL_DELAYED     = 1 << 0,       /* canceling a delayed_work */
        WORK_CANCEL_DISABLE     = 1 << 1,       /* canceling to disable */
};

enum wq_internal_consts {
        NR_STD_WORKER_POOLS     = 2,            /* # standard pools per cpu */

        UNBOUND_POOL_HASH_ORDER = 6,            /* hashed by pool->attrs */
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */

        MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT     = 300 * HZ,     /* keep idle ones for 5 mins */

        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
                                                /* call for help after 10ms
                                                   (min two ticks) */
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
        CREATE_COOLDOWN         = HZ,           /* time to breathe after fail */

        RESCUER_BATCH           = 16,           /* process items per turn */

        /*
         * Rescue workers are used only in emergencies and shared by
         * all cpus. Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL      = MIN_NICE,
        HIGHPRI_NICE_LEVEL      = MIN_NICE,

        WQ_NAME_LEN             = 32,
        WORKER_ID_LEN           = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
};

/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 */
#define BH_WORKER_JIFFIES       msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS      10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected. Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected. Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes. RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes. RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
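 *
 * For example (illustrative only), a "PL:" field such as workqueue_freezing
 * is only written like
 *
 *      mutex_lock(&wq_pool_mutex);
 *      workqueue_freezing = true;
 *      mutex_unlock(&wq_pool_mutex);
 *
 * while a "PR:" field may additionally be read under rcu_read_lock().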
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
        raw_spinlock_t          lock;           /* the pool lock */
        int                     cpu;            /* I: the associated cpu */
        int                     node;           /* I: the associated node ID */
        int                     id;             /* I: pool ID */
        unsigned int            flags;          /* L: flags */

        unsigned long           watchdog_ts;    /* L: watchdog timestamp */
        bool                    cpu_stall;      /* WD: stalled cpu bound pool */

        /*
         * The counter is incremented in a process context on the associated CPU
         * w/ preemption disabled, and decremented or reset in the same context
         * but w/ pool->lock held. The readers grab pool->lock and are
         * guaranteed to see if the counter reached zero.
         */
        int                     nr_running;

        struct list_head        worklist;       /* L: list of pending works */

        int                     nr_workers;     /* L: total number of workers */
        int                     nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;      /* L: list of idle workers */
        struct timer_list       idle_timer;     /* L: worker idle timeout */
        struct work_struct      idle_cull_work; /* L: worker idle cleanup */

        struct timer_list       mayday_timer;   /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker           *manager;       /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */

        struct ida              worker_ida;     /* worker IDs for task name */

        struct workqueue_attrs  *attrs;         /* I: worker attributes */
        struct hlist_node       hash_node;      /* PL: unbound_pool_hash node */
        int                     refcnt;         /* PL: refcnt for unbound pools */
#ifdef CONFIG_PREEMPT_RT
        spinlock_t              cb_lock;        /* BH worker cancel lock */
#endif
        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head         rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
        PWQ_STAT_STARTED,       /* work items started execution */
        PWQ_STAT_COMPLETED,     /* work items completed execution */
        PWQ_STAT_CPU_TIME,      /* total CPU time consumed */
        PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
        PWQ_STAT_CM_WAKEUP,     /* concurrency-management worker wakeups */
        PWQ_STAT_REPATRIATED,   /* unbound workers brought back into scope */
        PWQ_STAT_MAYDAY,        /* maydays to rescuer */
        PWQ_STAT_RESCUED,       /* linked work items executed by rescuer */

        PWQ_NR_STATS,
};

/*
 * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned to two to the power of
 * the number of flag bits.
 */
struct pool_workqueue {
        struct worker_pool      *pool;          /* I: the associated pool */
        struct workqueue_struct *wq;            /* I: the owning workqueue */
        int                     work_color;     /* L: current color */
        int                     flush_color;    /* L: flushing color */
        int                     refcnt;         /* L: reference count */
        int                     nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works */
        bool                    plugged;        /* L: execution suspended */

        /*
         * nr_active management and WORK_STRUCT_INACTIVE:
         *
         * When pwq->nr_active >= max_active, new work item is queued to
         * pwq->inactive_works instead of pool->worklist and marked with
         * WORK_STRUCT_INACTIVE.
         *
         * All work items marked with WORK_STRUCT_INACTIVE do not participate in
         * nr_active and all work items in pwq->inactive_works are marked with
         * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
         * in pwq->inactive_works. Some of them are ready to run in
         * pool->worklist or worker->scheduled. Those work items are all struct
         * wq_barrier, which is used for flush_work() and should not participate
         * in nr_active. A non-barrier work item is marked with
         * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
         */
        int                     nr_active;      /* L: nr of active works */
        struct list_head        inactive_works; /* L: inactive works */
        struct list_head        pending_node;   /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;      /* WR: node on wq->pwqs */
        struct list_head        mayday_node;    /* MD: node on wq->maydays */
        struct work_struct      mayday_cursor;  /* L: cursor on pool->worklist */

        u64                     stats[PWQ_NR_STATS];

        /*
         * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
         * and pwq_release_workfn() for details. pool_workqueue itself is also
         * RCU protected so that the first pwq can be determined without
         * grabbing wq->mutex.
         */
        struct kthread_work     release_work;
        struct rcu_head         rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
        struct list_head        list;           /* WQ: list of flushers */
        int                     flush_color;    /* WQ: flush color waiting for */
        struct completion       done;           /* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
struct wq_node_nr_active {
        int                     max;            /* per-node max_active */
        atomic_t                nr;             /* per-node nr_active */
        raw_spinlock_t          lock;           /* nests inside pool locks */
        struct list_head        pending_pwqs;   /* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue. It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
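 *
 * From a queueing client's point of view the flow looks like the following
 * sketch (illustrative only; "my_wq", "my_work" and my_work_fn() are
 * hypothetical names):
 *
 *      static void my_work_fn(struct work_struct *work) { ... }
 *      static DECLARE_WORK(my_work, my_work_fn);
 *
 *      my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 0);
 *      if (my_wq)
 *              queue_work(my_wq, &my_work);
 *
 * Each queue_work() is relayed through one of the pool_workqueues below to
 * the backing worker_pool.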
339 */ 340 struct workqueue_struct { 341 struct list_head pwqs; /* WR: all pwqs of this wq */ 342 struct list_head list; /* PR: list of all workqueues */ 343 344 struct mutex mutex; /* protects this wq */ 345 int work_color; /* WQ: current work color */ 346 int flush_color; /* WQ: current flush color */ 347 atomic_t nr_pwqs_to_flush; /* flush in progress */ 348 struct wq_flusher *first_flusher; /* WQ: first flusher */ 349 struct list_head flusher_queue; /* WQ: flush waiters */ 350 struct list_head flusher_overflow; /* WQ: flush overflow list */ 351 352 struct list_head maydays; /* MD: pwqs requesting rescue */ 353 struct worker *rescuer; /* MD: rescue worker */ 354 355 int nr_drainers; /* WQ: drain in progress */ 356 357 /* See alloc_workqueue() function comment for info on min/max_active */ 358 int max_active; /* WO: max active works */ 359 int min_active; /* WO: min active works */ 360 int saved_max_active; /* WQ: saved max_active */ 361 int saved_min_active; /* WQ: saved min_active */ 362 363 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ 364 struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ 365 366 #ifdef CONFIG_SYSFS 367 struct wq_device *wq_dev; /* I: for sysfs interface */ 368 #endif 369 #ifdef CONFIG_LOCKDEP 370 char *lock_name; 371 struct lock_class_key key; 372 struct lockdep_map __lockdep_map; 373 struct lockdep_map *lockdep_map; 374 #endif 375 char name[WQ_NAME_LEN]; /* I: workqueue name */ 376 377 /* 378 * Destruction of workqueue_struct is RCU protected to allow walking 379 * the workqueues list without grabbing wq_pool_mutex. 380 * This is used to dump all workqueues from sysrq. 381 */ 382 struct rcu_head rcu; 383 384 /* hot fields used during command issue, aligned to cacheline */ 385 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ 386 struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */ 387 struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ 388 }; 389 390 /* 391 * Each pod type describes how CPUs should be grouped for unbound workqueues. 392 * See the comment above workqueue_attrs->affn_scope. 393 */ 394 struct wq_pod_type { 395 int nr_pods; /* number of pods */ 396 cpumask_var_t *pod_cpus; /* pod -> cpus */ 397 int *pod_node; /* pod -> node */ 398 int *cpu_pod; /* cpu -> pod */ 399 }; 400 401 struct work_offq_data { 402 u32 pool_id; 403 u32 disable; 404 u32 flags; 405 }; 406 407 static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { 408 [WQ_AFFN_DFL] = "default", 409 [WQ_AFFN_CPU] = "cpu", 410 [WQ_AFFN_SMT] = "smt", 411 [WQ_AFFN_CACHE] = "cache", 412 [WQ_AFFN_NUMA] = "numa", 413 [WQ_AFFN_SYSTEM] = "system", 414 }; 415 416 /* 417 * Per-cpu work items which run for longer than the following threshold are 418 * automatically considered CPU intensive and excluded from concurrency 419 * management to prevent them from noticeably delaying other per-cpu work items. 420 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. 421 * The actual value is initialized in wq_cpu_intensive_thresh_init(). 
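 *
 * For example (illustrative; the value is arbitrary), the threshold can be
 * overridden on the kernel command line or at runtime:
 *
 *      workqueue.cpu_intensive_thresh_us=20000
 *      echo 20000 > /sys/module/workqueue/parameters/cpu_intensive_thresh_us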
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                  /* can kworkers be created yet? */
static bool wq_topo_initialized __read_mostly = false;

static struct kmem_cache *pwq_cache;

static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);     /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);     /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);           /* PR: list of all workqueues */
static bool workqueue_freezing;         /* PL: have wqs started freezing? */

/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
static cpumask_var_t wq_online_cpumask;

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* to further constrain wq_unbound_cpumask by a cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed. The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
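 *
 * Callers which genuinely depend on a particular CPU should not rely on
 * queueing locality but request it explicitly, e.g. (illustrative):
 *
 *      queue_work_on(cpu, wq, work);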
476 */ 477 #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU 478 static bool wq_debug_force_rr_cpu = true; 479 #else 480 static bool wq_debug_force_rr_cpu = false; 481 #endif 482 module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); 483 484 /* to raise softirq for the BH worker pools on other CPUs */ 485 static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); 486 487 /* the BH worker pools */ 488 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); 489 490 /* the per-cpu worker pools */ 491 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); 492 493 static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ 494 495 /* PL: hash of all unbound pools keyed by pool->attrs */ 496 static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); 497 498 /* I: attributes used when instantiating standard unbound pools on demand */ 499 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 500 501 /* I: attributes used when instantiating ordered pools on demand */ 502 static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; 503 504 /* 505 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a 506 * process context while holding a pool lock. Bounce to a dedicated kthread 507 * worker to avoid A-A deadlocks. 508 */ 509 static struct kthread_worker *pwq_release_worker __ro_after_init; 510 511 struct workqueue_struct *system_wq __ro_after_init; 512 EXPORT_SYMBOL(system_wq); 513 struct workqueue_struct *system_percpu_wq __ro_after_init; 514 EXPORT_SYMBOL(system_percpu_wq); 515 struct workqueue_struct *system_highpri_wq __ro_after_init; 516 EXPORT_SYMBOL_GPL(system_highpri_wq); 517 struct workqueue_struct *system_long_wq __ro_after_init; 518 EXPORT_SYMBOL_GPL(system_long_wq); 519 struct workqueue_struct *system_unbound_wq __ro_after_init; 520 EXPORT_SYMBOL_GPL(system_unbound_wq); 521 struct workqueue_struct *system_dfl_wq __ro_after_init; 522 EXPORT_SYMBOL_GPL(system_dfl_wq); 523 struct workqueue_struct *system_freezable_wq __ro_after_init; 524 EXPORT_SYMBOL_GPL(system_freezable_wq); 525 struct workqueue_struct *system_power_efficient_wq __ro_after_init; 526 EXPORT_SYMBOL_GPL(system_power_efficient_wq); 527 struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; 528 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); 529 struct workqueue_struct *system_bh_wq; 530 EXPORT_SYMBOL_GPL(system_bh_wq); 531 struct workqueue_struct *system_bh_highpri_wq; 532 EXPORT_SYMBOL_GPL(system_bh_highpri_wq); 533 534 static int worker_thread(void *__worker); 535 static void workqueue_sysfs_unregister(struct workqueue_struct *wq); 536 static void show_pwq(struct pool_workqueue *pwq); 537 static void show_one_worker_pool(struct worker_pool *pool); 538 539 #define CREATE_TRACE_POINTS 540 #include <trace/events/workqueue.h> 541 542 #define assert_rcu_or_pool_mutex() \ 543 RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ 544 !lockdep_is_held(&wq_pool_mutex), \ 545 "RCU or wq_pool_mutex should be held") 546 547 #define for_each_bh_worker_pool(pool, cpu) \ 548 for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ 549 (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 550 (pool)++) 551 552 #define for_each_cpu_worker_pool(pool, cpu) \ 553 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 554 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 555 (pool)++) 556 557 /** 558 * for_each_pool - 
iterate through all worker_pools in the system 559 * @pool: iteration cursor 560 * @pi: integer used for iteration 561 * 562 * This must be called either with wq_pool_mutex held or RCU read 563 * locked. If the pool needs to be used beyond the locking in effect, the 564 * caller is responsible for guaranteeing that the pool stays online. 565 * 566 * The if/else clause exists only for the lockdep assertion and can be 567 * ignored. 568 */ 569 #define for_each_pool(pool, pi) \ 570 idr_for_each_entry(&worker_pool_idr, pool, pi) \ 571 if (({ assert_rcu_or_pool_mutex(); false; })) { } \ 572 else 573 574 /** 575 * for_each_pool_worker - iterate through all workers of a worker_pool 576 * @worker: iteration cursor 577 * @pool: worker_pool to iterate workers of 578 * 579 * This must be called with wq_pool_attach_mutex. 580 * 581 * The if/else clause exists only for the lockdep assertion and can be 582 * ignored. 583 */ 584 #define for_each_pool_worker(worker, pool) \ 585 list_for_each_entry((worker), &(pool)->workers, node) \ 586 if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ 587 else 588 589 /** 590 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue 591 * @pwq: iteration cursor 592 * @wq: the target workqueue 593 * 594 * This must be called either with wq->mutex held or RCU read locked. 595 * If the pwq needs to be used beyond the locking in effect, the caller is 596 * responsible for guaranteeing that the pwq stays online. 597 * 598 * The if/else clause exists only for the lockdep assertion and can be 599 * ignored. 600 */ 601 #define for_each_pwq(pwq, wq) \ 602 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ 603 lockdep_is_held(&(wq->mutex))) 604 605 #ifdef CONFIG_DEBUG_OBJECTS_WORK 606 607 static const struct debug_obj_descr work_debug_descr; 608 609 static void *work_debug_hint(void *addr) 610 { 611 return ((struct work_struct *) addr)->func; 612 } 613 614 static bool work_is_static_object(void *addr) 615 { 616 struct work_struct *work = addr; 617 618 return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); 619 } 620 621 /* 622 * fixup_init is called when: 623 * - an active object is initialized 624 */ 625 static bool work_fixup_init(void *addr, enum debug_obj_state state) 626 { 627 struct work_struct *work = addr; 628 629 switch (state) { 630 case ODEBUG_STATE_ACTIVE: 631 cancel_work_sync(work); 632 debug_object_init(work, &work_debug_descr); 633 return true; 634 default: 635 return false; 636 } 637 } 638 639 /* 640 * fixup_free is called when: 641 * - an active object is freed 642 */ 643 static bool work_fixup_free(void *addr, enum debug_obj_state state) 644 { 645 struct work_struct *work = addr; 646 647 switch (state) { 648 case ODEBUG_STATE_ACTIVE: 649 cancel_work_sync(work); 650 debug_object_free(work, &work_debug_descr); 651 return true; 652 default: 653 return false; 654 } 655 } 656 657 static const struct debug_obj_descr work_debug_descr = { 658 .name = "work_struct", 659 .debug_hint = work_debug_hint, 660 .is_static_object = work_is_static_object, 661 .fixup_init = work_fixup_init, 662 .fixup_free = work_fixup_free, 663 }; 664 665 static inline void debug_work_activate(struct work_struct *work) 666 { 667 debug_object_activate(work, &work_debug_descr); 668 } 669 670 static inline void debug_work_deactivate(struct work_struct *work) 671 { 672 debug_object_deactivate(work, &work_debug_descr); 673 } 674 675 void __init_work(struct work_struct *work, int onstack) 676 { 677 if (onstack) 678 debug_object_init_on_stack(work, 
&work_debug_descr); 679 else 680 debug_object_init(work, &work_debug_descr); 681 } 682 EXPORT_SYMBOL_GPL(__init_work); 683 684 void destroy_work_on_stack(struct work_struct *work) 685 { 686 debug_object_free(work, &work_debug_descr); 687 } 688 EXPORT_SYMBOL_GPL(destroy_work_on_stack); 689 690 void destroy_delayed_work_on_stack(struct delayed_work *work) 691 { 692 timer_destroy_on_stack(&work->timer); 693 debug_object_free(&work->work, &work_debug_descr); 694 } 695 EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); 696 697 #else 698 static inline void debug_work_activate(struct work_struct *work) { } 699 static inline void debug_work_deactivate(struct work_struct *work) { } 700 #endif 701 702 /** 703 * worker_pool_assign_id - allocate ID and assign it to @pool 704 * @pool: the pool pointer of interest 705 * 706 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned 707 * successfully, -errno on failure. 708 */ 709 static int worker_pool_assign_id(struct worker_pool *pool) 710 { 711 int ret; 712 713 lockdep_assert_held(&wq_pool_mutex); 714 715 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, 716 GFP_KERNEL); 717 if (ret >= 0) { 718 pool->id = ret; 719 return 0; 720 } 721 return ret; 722 } 723 724 static struct pool_workqueue __rcu ** 725 unbound_pwq_slot(struct workqueue_struct *wq, int cpu) 726 { 727 if (cpu >= 0) 728 return per_cpu_ptr(wq->cpu_pwq, cpu); 729 else 730 return &wq->dfl_pwq; 731 } 732 733 /* @cpu < 0 for dfl_pwq */ 734 static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) 735 { 736 return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), 737 lockdep_is_held(&wq_pool_mutex) || 738 lockdep_is_held(&wq->mutex)); 739 } 740 741 /** 742 * unbound_effective_cpumask - effective cpumask of an unbound workqueue 743 * @wq: workqueue of interest 744 * 745 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which 746 * is masked with wq_unbound_cpumask to determine the effective cpumask. The 747 * default pwq is always mapped to the pool with the current effective cpumask. 748 */ 749 static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) 750 { 751 return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; 752 } 753 754 static unsigned int work_color_to_flags(int color) 755 { 756 return color << WORK_STRUCT_COLOR_SHIFT; 757 } 758 759 static int get_work_color(unsigned long work_data) 760 { 761 return (work_data >> WORK_STRUCT_COLOR_SHIFT) & 762 ((1 << WORK_STRUCT_COLOR_BITS) - 1); 763 } 764 765 static int work_next_color(int color) 766 { 767 return (color + 1) % WORK_NR_COLORS; 768 } 769 770 static unsigned long pool_offq_flags(struct worker_pool *pool) 771 { 772 return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; 773 } 774 775 /* 776 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data 777 * contain the pointer to the queued pwq. Once execution starts, the flag 778 * is cleared and the high bits contain OFFQ flags and pool ID. 779 * 780 * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() 781 * can be used to set the pwq, pool or clear work->data. These functions should 782 * only be called while the work is owned - ie. while the PENDING bit is set. 783 * 784 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq 785 * corresponding to a work. Pool is available once the work has been 786 * queued anywhere after initialization until it is sync canceled. pwq is 787 * available only while the work item is queued. 
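 *
 * Rough layout sketch (illustrative; see include/linux/workqueue.h for the
 * authoritative bit definitions):
 *
 *      while queued:   [ pwq pointer          | WORK_STRUCT_* flag bits ]
 *      off queue:      [ pool ID | OFFQ flags | WORK_STRUCT_* flag bits ]
 *
 * get_work_pwq() below, for example, simply masks off the flag bits to
 * recover the pwq pointer while %WORK_STRUCT_PWQ is set.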
 */
static inline void set_work_data(struct work_struct *work, unsigned long data)
{
        WARN_ON_ONCE(!work_pending(work));
        atomic_long_set(&work->data, data | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long flags)
{
        set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING |
                      WORK_STRUCT_PWQ | flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id, unsigned long flags)
{
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      WORK_STRUCT_PENDING | flags);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id, unsigned long flags)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      flags);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards. This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work. E.g. consider this case:
         *
         *   CPU#0                            CPU#1
         *   ----------------------------     --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4  }                               set_..._and_clear_pending() {
         * 5                                    set_work_data() # clear bit
         * 6                                    smp_mb()
         * 7                                  work->current_func() {
         * 8                                    LOAD event_indicated
         *                                    }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1. If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in the hope that CPU#1 will eventually
         * finish the queued @work. Meanwhile CPU#1 does not see
         * event_indicated is set, because the speculative LOAD was executed
         * before the actual STORE.
         */
        smp_mb();
}

static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (data & WORK_STRUCT_PWQ)
                return work_struct_pwq(data);
        else
                return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and allow read
 * access under RCU read lock. As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect. If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with. %NULL if none.
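 *
 * A typical call pattern in this file looks like the following sketch
 * (illustrative, details omitted):
 *
 *      rcu_read_lock();
 *      pool = get_work_pool(work);
 *      if (pool) {
 *              raw_spin_lock_irq(&pool->lock);
 *              ...
 *              raw_spin_unlock_irq(&pool->lock);
 *      }
 *      rcu_read_unlock();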
881 */ 882 static struct worker_pool *get_work_pool(struct work_struct *work) 883 { 884 unsigned long data = atomic_long_read(&work->data); 885 int pool_id; 886 887 assert_rcu_or_pool_mutex(); 888 889 if (data & WORK_STRUCT_PWQ) 890 return work_struct_pwq(data)->pool; 891 892 pool_id = data >> WORK_OFFQ_POOL_SHIFT; 893 if (pool_id == WORK_OFFQ_POOL_NONE) 894 return NULL; 895 896 return idr_find(&worker_pool_idr, pool_id); 897 } 898 899 static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) 900 { 901 return (v >> shift) & ((1U << bits) - 1); 902 } 903 904 static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) 905 { 906 WARN_ON_ONCE(data & WORK_STRUCT_PWQ); 907 908 offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, 909 WORK_OFFQ_POOL_BITS); 910 offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, 911 WORK_OFFQ_DISABLE_BITS); 912 offqd->flags = data & WORK_OFFQ_FLAG_MASK; 913 } 914 915 static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) 916 { 917 return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | 918 ((unsigned long)offqd->flags); 919 } 920 921 /* 922 * Policy functions. These define the policies on how the global worker 923 * pools are managed. Unless noted otherwise, these functions assume that 924 * they're being called with pool->lock held. 925 */ 926 927 /* 928 * Need to wake up a worker? Called from anything but currently 929 * running workers. 930 * 931 * Note that, because unbound workers never contribute to nr_running, this 932 * function will always return %true for unbound pools as long as the 933 * worklist isn't empty. 934 */ 935 static bool need_more_worker(struct worker_pool *pool) 936 { 937 return !list_empty(&pool->worklist) && !pool->nr_running; 938 } 939 940 /* Can I start working? Called from busy but !running workers. */ 941 static bool may_start_working(struct worker_pool *pool) 942 { 943 return pool->nr_idle; 944 } 945 946 /* Do I need to keep working? Called from currently running workers. */ 947 static bool keep_working(struct worker_pool *pool) 948 { 949 return !list_empty(&pool->worklist) && (pool->nr_running <= 1); 950 } 951 952 /* Do we need a new worker? Called from manager. */ 953 static bool need_to_create_worker(struct worker_pool *pool) 954 { 955 return need_more_worker(pool) && !may_start_working(pool); 956 } 957 958 /* Do we have too many workers and should some go away? */ 959 static bool too_many_workers(struct worker_pool *pool) 960 { 961 bool managing = pool->flags & POOL_MANAGER_ACTIVE; 962 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 963 int nr_busy = pool->nr_workers - nr_idle; 964 965 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 966 } 967 968 /** 969 * worker_set_flags - set worker flags and adjust nr_running accordingly 970 * @worker: self 971 * @flags: flags to set 972 * 973 * Set @flags in @worker->flags and adjust nr_running accordingly. 974 */ 975 static inline void worker_set_flags(struct worker *worker, unsigned int flags) 976 { 977 struct worker_pool *pool = worker->pool; 978 979 lockdep_assert_held(&pool->lock); 980 981 /* If transitioning into NOT_RUNNING, adjust nr_running. 
*/ 982 if ((flags & WORKER_NOT_RUNNING) && 983 !(worker->flags & WORKER_NOT_RUNNING)) { 984 pool->nr_running--; 985 } 986 987 worker->flags |= flags; 988 } 989 990 /** 991 * worker_clr_flags - clear worker flags and adjust nr_running accordingly 992 * @worker: self 993 * @flags: flags to clear 994 * 995 * Clear @flags in @worker->flags and adjust nr_running accordingly. 996 */ 997 static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 998 { 999 struct worker_pool *pool = worker->pool; 1000 unsigned int oflags = worker->flags; 1001 1002 lockdep_assert_held(&pool->lock); 1003 1004 worker->flags &= ~flags; 1005 1006 /* 1007 * If transitioning out of NOT_RUNNING, increment nr_running. Note 1008 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask 1009 * of multiple flags, not a single flag. 1010 */ 1011 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 1012 if (!(worker->flags & WORKER_NOT_RUNNING)) 1013 pool->nr_running++; 1014 } 1015 1016 /* Return the first idle worker. Called with pool->lock held. */ 1017 static struct worker *first_idle_worker(struct worker_pool *pool) 1018 { 1019 if (unlikely(list_empty(&pool->idle_list))) 1020 return NULL; 1021 1022 return list_first_entry(&pool->idle_list, struct worker, entry); 1023 } 1024 1025 /** 1026 * worker_enter_idle - enter idle state 1027 * @worker: worker which is entering idle state 1028 * 1029 * @worker is entering idle state. Update stats and idle timer if 1030 * necessary. 1031 * 1032 * LOCKING: 1033 * raw_spin_lock_irq(pool->lock). 1034 */ 1035 static void worker_enter_idle(struct worker *worker) 1036 { 1037 struct worker_pool *pool = worker->pool; 1038 1039 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || 1040 WARN_ON_ONCE(!list_empty(&worker->entry) && 1041 (worker->hentry.next || worker->hentry.pprev))) 1042 return; 1043 1044 /* can't use worker_set_flags(), also called from create_worker() */ 1045 worker->flags |= WORKER_IDLE; 1046 pool->nr_idle++; 1047 worker->last_active = jiffies; 1048 1049 /* idle_list is LIFO */ 1050 list_add(&worker->entry, &pool->idle_list); 1051 1052 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) 1053 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1054 1055 /* Sanity check nr_running. */ 1056 WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); 1057 } 1058 1059 /** 1060 * worker_leave_idle - leave idle state 1061 * @worker: worker which is leaving idle state 1062 * 1063 * @worker is leaving idle state. Update stats. 1064 * 1065 * LOCKING: 1066 * raw_spin_lock_irq(pool->lock). 1067 */ 1068 static void worker_leave_idle(struct worker *worker) 1069 { 1070 struct worker_pool *pool = worker->pool; 1071 1072 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) 1073 return; 1074 worker_clr_flags(worker, WORKER_IDLE); 1075 pool->nr_idle--; 1076 list_del_init(&worker->entry); 1077 } 1078 1079 /** 1080 * find_worker_executing_work - find worker which is executing a work 1081 * @pool: pool of interest 1082 * @work: work to find worker for 1083 * 1084 * Find a worker which is executing @work on @pool by searching 1085 * @pool->busy_hash which is keyed by the address of @work. For a worker 1086 * to match, its current execution should match the address of @work and 1087 * its work function. This is to avoid unwanted dependency between 1088 * unrelated work executions through a work item being recycled while still 1089 * being executed. 1090 * 1091 * This is a bit tricky. 
A work item may be freed once its execution 1092 * starts and nothing prevents the freed area from being recycled for 1093 * another work item. If the same work item address ends up being reused 1094 * before the original execution finishes, workqueue will identify the 1095 * recycled work item as currently executing and make it wait until the 1096 * current execution finishes, introducing an unwanted dependency. 1097 * 1098 * This function checks the work item address and work function to avoid 1099 * false positives. Note that this isn't complete as one may construct a 1100 * work function which can introduce dependency onto itself through a 1101 * recycled work item. Well, if somebody wants to shoot oneself in the 1102 * foot that badly, there's only so much we can do, and if such deadlock 1103 * actually occurs, it should be easy to locate the culprit work function. 1104 * 1105 * CONTEXT: 1106 * raw_spin_lock_irq(pool->lock). 1107 * 1108 * Return: 1109 * Pointer to worker which is executing @work if found, %NULL 1110 * otherwise. 1111 */ 1112 static struct worker *find_worker_executing_work(struct worker_pool *pool, 1113 struct work_struct *work) 1114 { 1115 struct worker *worker; 1116 1117 hash_for_each_possible(pool->busy_hash, worker, hentry, 1118 (unsigned long)work) 1119 if (worker->current_work == work && 1120 worker->current_func == work->func) 1121 return worker; 1122 1123 return NULL; 1124 } 1125 1126 static void mayday_cursor_func(struct work_struct *work) 1127 { 1128 /* should not be processed, only for marking position */ 1129 BUG(); 1130 } 1131 1132 /** 1133 * move_linked_works - move linked works to a list 1134 * @work: start of series of works to be scheduled 1135 * @head: target list to append @work to 1136 * @nextp: out parameter for nested worklist walking 1137 * 1138 * Schedule linked works starting from @work to @head. Work series to be 1139 * scheduled starts at @work and includes any consecutive work with 1140 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on 1141 * @nextp. 1142 * 1143 * CONTEXT: 1144 * raw_spin_lock_irq(pool->lock). 1145 */ 1146 static void move_linked_works(struct work_struct *work, struct list_head *head, 1147 struct work_struct **nextp) 1148 { 1149 struct work_struct *n; 1150 1151 /* 1152 * Linked worklist will always end before the end of the list, 1153 * use NULL for list head. 1154 */ 1155 list_for_each_entry_safe_from(work, n, NULL, entry) { 1156 list_move_tail(&work->entry, head); 1157 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) 1158 break; 1159 } 1160 1161 /* 1162 * If we're already inside safe list traversal and have moved 1163 * multiple works to the scheduled queue, the next position 1164 * needs to be updated. 1165 */ 1166 if (nextp) 1167 *nextp = n; 1168 } 1169 1170 /** 1171 * assign_work - assign a work item and its linked work items to a worker 1172 * @work: work to assign 1173 * @worker: worker to assign to 1174 * @nextp: out parameter for nested worklist walking 1175 * 1176 * Assign @work and its linked work items to @worker. If @work is already being 1177 * executed by another worker in the same pool, it'll be punted there. 1178 * 1179 * If @nextp is not NULL, it's updated to point to the next work of the last 1180 * scheduled work. This allows assign_work() to be nested inside 1181 * list_for_each_entry_safe(). 1182 * 1183 * Returns %true if @work was successfully assigned to @worker. %false if @work 1184 * was punted to another worker already executing it. 
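 *
 * Illustrative caller sketch (simplified; locking and filtering details
 * omitted):
 *
 *      struct work_struct *work, *n;
 *
 *      list_for_each_entry_safe(work, n, &pool->worklist, entry) {
 *              if (assign_work(work, worker, &n))
 *                      process_scheduled_works(worker);
 *      }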
1185 */ 1186 static bool assign_work(struct work_struct *work, struct worker *worker, 1187 struct work_struct **nextp) 1188 { 1189 struct worker_pool *pool = worker->pool; 1190 struct worker *collision; 1191 1192 lockdep_assert_held(&pool->lock); 1193 1194 /* The cursor work should not be processed */ 1195 if (unlikely(work->func == mayday_cursor_func)) { 1196 /* only worker_thread() can possibly take this branch */ 1197 WARN_ON_ONCE(worker->rescue_wq); 1198 if (nextp) 1199 *nextp = list_next_entry(work, entry); 1200 list_del_init(&work->entry); 1201 return false; 1202 } 1203 1204 /* 1205 * A single work shouldn't be executed concurrently by multiple workers. 1206 * __queue_work() ensures that @work doesn't jump to a different pool 1207 * while still running in the previous pool. Here, we should ensure that 1208 * @work is not executed concurrently by multiple workers from the same 1209 * pool. Check whether anyone is already processing the work. If so, 1210 * defer the work to the currently executing one. 1211 */ 1212 collision = find_worker_executing_work(pool, work); 1213 if (unlikely(collision)) { 1214 move_linked_works(work, &collision->scheduled, nextp); 1215 return false; 1216 } 1217 1218 move_linked_works(work, &worker->scheduled, nextp); 1219 return true; 1220 } 1221 1222 static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) 1223 { 1224 int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; 1225 1226 return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; 1227 } 1228 1229 static void kick_bh_pool(struct worker_pool *pool) 1230 { 1231 #ifdef CONFIG_SMP 1232 /* see drain_dead_softirq_workfn() for BH_DRAINING */ 1233 if (unlikely(pool->cpu != smp_processor_id() && 1234 !(pool->flags & POOL_BH_DRAINING))) { 1235 irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); 1236 return; 1237 } 1238 #endif 1239 if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) 1240 raise_softirq_irqoff(HI_SOFTIRQ); 1241 else 1242 raise_softirq_irqoff(TASKLET_SOFTIRQ); 1243 } 1244 1245 /** 1246 * kick_pool - wake up an idle worker if necessary 1247 * @pool: pool to kick 1248 * 1249 * @pool may have pending work items. Wake up worker if necessary. Returns 1250 * whether a worker was woken up. 1251 */ 1252 static bool kick_pool(struct worker_pool *pool) 1253 { 1254 struct worker *worker = first_idle_worker(pool); 1255 struct task_struct *p; 1256 1257 lockdep_assert_held(&pool->lock); 1258 1259 if (!need_more_worker(pool) || !worker) 1260 return false; 1261 1262 if (pool->flags & POOL_BH) { 1263 kick_bh_pool(pool); 1264 return true; 1265 } 1266 1267 p = worker->task; 1268 1269 #ifdef CONFIG_SMP 1270 /* 1271 * Idle @worker is about to execute @work and waking up provides an 1272 * opportunity to migrate @worker at a lower cost by setting the task's 1273 * wake_cpu field. Let's see if we want to move @worker to improve 1274 * execution locality. 1275 * 1276 * We're waking the worker that went idle the latest and there's some 1277 * chance that @worker is marked idle but hasn't gone off CPU yet. If 1278 * so, setting the wake_cpu won't do anything. As this is a best-effort 1279 * optimization and the race window is narrow, let's leave as-is for 1280 * now. If this becomes pronounced, we can skip over workers which are 1281 * still on cpu when picking an idle worker. 1282 * 1283 * If @pool has non-strict affinity, @worker might have ended up outside 1284 * its affinity scope. Repatriate. 
         */
        if (!pool->attrs->affn_strict &&
            !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
                struct work_struct *work = list_first_entry(&pool->worklist,
                                                struct work_struct, entry);
                int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
                                                          cpu_online_mask);
                if (wake_cpu < nr_cpu_ids) {
                        p->wake_cpu = wake_cpu;
                        get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
                }
        }
#endif
        wake_up_process(p);
        return true;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and reports them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
#define WCI_MAX_ENTS 128

struct wci_ent {
        work_func_t             func;
        atomic64_t              cnt;
        struct hlist_node       hash_node;
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];
static int wci_nr_ents;
static DEFINE_RAW_SPINLOCK(wci_lock);
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

static struct wci_ent *wci_find_ent(work_func_t func)
{
        struct wci_ent *ent;

        hash_for_each_possible_rcu(wci_hash, ent, hash_node,
                                   (unsigned long)func) {
                if (ent->func == func)
                        return ent;
        }
        return NULL;
}

static void wq_cpu_intensive_report(work_func_t func)
{
        struct wci_ent *ent;

restart:
        ent = wci_find_ent(func);
        if (ent) {
                u64 cnt;

                /*
                 * Start reporting from the warning_thresh and back off
                 * exponentially.
                 */
                cnt = atomic64_inc_return_relaxed(&ent->cnt);
                if (wq_cpu_intensive_warning_thresh &&
                    cnt >= wq_cpu_intensive_warning_thresh &&
                    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
                        printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
                                        ent->func, wq_cpu_intensive_thresh_us,
                                        atomic64_read(&ent->cnt));
                return;
        }

        /*
         * @func is a new violation. Allocate a new entry for it. If wci_ents[]
         * is exhausted, something went really wrong and we probably made enough
         * noise already.
1368 */ 1369 if (wci_nr_ents >= WCI_MAX_ENTS) 1370 return; 1371 1372 raw_spin_lock(&wci_lock); 1373 1374 if (wci_nr_ents >= WCI_MAX_ENTS) { 1375 raw_spin_unlock(&wci_lock); 1376 return; 1377 } 1378 1379 if (wci_find_ent(func)) { 1380 raw_spin_unlock(&wci_lock); 1381 goto restart; 1382 } 1383 1384 ent = &wci_ents[wci_nr_ents++]; 1385 ent->func = func; 1386 atomic64_set(&ent->cnt, 0); 1387 hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); 1388 1389 raw_spin_unlock(&wci_lock); 1390 1391 goto restart; 1392 } 1393 1394 #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1395 static void wq_cpu_intensive_report(work_func_t func) {} 1396 #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1397 1398 /** 1399 * wq_worker_running - a worker is running again 1400 * @task: task waking up 1401 * 1402 * This function is called when a worker returns from schedule() 1403 */ 1404 void wq_worker_running(struct task_struct *task) 1405 { 1406 struct worker *worker = kthread_data(task); 1407 1408 if (!READ_ONCE(worker->sleeping)) 1409 return; 1410 1411 /* 1412 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check 1413 * and the nr_running increment below, we may ruin the nr_running reset 1414 * and leave with an unexpected pool->nr_running == 1 on the newly unbound 1415 * pool. Protect against such race. 1416 */ 1417 preempt_disable(); 1418 if (!(worker->flags & WORKER_NOT_RUNNING)) 1419 worker->pool->nr_running++; 1420 preempt_enable(); 1421 1422 /* 1423 * CPU intensive auto-detection cares about how long a work item hogged 1424 * CPU without sleeping. Reset the starting timestamp on wakeup. 1425 */ 1426 worker->current_at = worker->task->se.sum_exec_runtime; 1427 1428 WRITE_ONCE(worker->sleeping, 0); 1429 } 1430 1431 /** 1432 * wq_worker_sleeping - a worker is going to sleep 1433 * @task: task going to sleep 1434 * 1435 * This function is called from schedule() when a busy worker is 1436 * going to sleep. 1437 */ 1438 void wq_worker_sleeping(struct task_struct *task) 1439 { 1440 struct worker *worker = kthread_data(task); 1441 struct worker_pool *pool; 1442 1443 /* 1444 * Rescuers, which may not have all the fields set up like normal 1445 * workers, also reach here, let's not access anything before 1446 * checking NOT_RUNNING. 1447 */ 1448 if (worker->flags & WORKER_NOT_RUNNING) 1449 return; 1450 1451 pool = worker->pool; 1452 1453 /* Return if preempted before wq_worker_running() was reached */ 1454 if (READ_ONCE(worker->sleeping)) 1455 return; 1456 1457 WRITE_ONCE(worker->sleeping, 1); 1458 raw_spin_lock_irq(&pool->lock); 1459 1460 /* 1461 * Recheck in case unbind_workers() preempted us. We don't 1462 * want to decrement nr_running after the worker is unbound 1463 * and nr_running has been reset. 1464 */ 1465 if (worker->flags & WORKER_NOT_RUNNING) { 1466 raw_spin_unlock_irq(&pool->lock); 1467 return; 1468 } 1469 1470 pool->nr_running--; 1471 if (kick_pool(pool)) 1472 worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1473 1474 raw_spin_unlock_irq(&pool->lock); 1475 } 1476 1477 /** 1478 * wq_worker_tick - a scheduler tick occurred while a kworker is running 1479 * @task: task currently running 1480 * 1481 * Called from sched_tick(). We're in the IRQ context and the current 1482 * worker's fields which follow the 'K' locking rule can be accessed safely. 
1483 */ 1484 void wq_worker_tick(struct task_struct *task) 1485 { 1486 struct worker *worker = kthread_data(task); 1487 struct pool_workqueue *pwq = worker->current_pwq; 1488 struct worker_pool *pool = worker->pool; 1489 1490 if (!pwq) 1491 return; 1492 1493 pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; 1494 1495 if (!wq_cpu_intensive_thresh_us) 1496 return; 1497 1498 /* 1499 * If the current worker is concurrency managed and hogged the CPU for 1500 * longer than wq_cpu_intensive_thresh_us, it's automatically marked 1501 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. 1502 * 1503 * Set @worker->sleeping means that @worker is in the process of 1504 * switching out voluntarily and won't be contributing to 1505 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also 1506 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to 1507 * double decrements. The task is releasing the CPU anyway. Let's skip. 1508 * We probably want to make this prettier in the future. 1509 */ 1510 if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || 1511 worker->task->se.sum_exec_runtime - worker->current_at < 1512 wq_cpu_intensive_thresh_us * NSEC_PER_USEC) 1513 return; 1514 1515 raw_spin_lock(&pool->lock); 1516 1517 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 1518 wq_cpu_intensive_report(worker->current_func); 1519 pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; 1520 1521 if (kick_pool(pool)) 1522 pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1523 1524 raw_spin_unlock(&pool->lock); 1525 } 1526 1527 /** 1528 * wq_worker_last_func - retrieve worker's last work function 1529 * @task: Task to retrieve last work function of. 1530 * 1531 * Determine the last function a worker executed. This is called from 1532 * the scheduler to get a worker's last known identity. 1533 * 1534 * CONTEXT: 1535 * raw_spin_lock_irq(rq->lock) 1536 * 1537 * This function is called during schedule() when a kworker is going 1538 * to sleep. It's used by psi to identify aggregation workers during 1539 * dequeuing, to allow periodic aggregation to shut-off when that 1540 * worker is the last task in the system or cgroup to go to sleep. 1541 * 1542 * As this function doesn't involve any workqueue-related locking, it 1543 * only returns stable values when called from inside the scheduler's 1544 * queuing and dequeuing paths, when @task, which must be a kworker, 1545 * is guaranteed to not be processing any works. 1546 * 1547 * Return: 1548 * The last work function %current executed as a worker, NULL if it 1549 * hasn't executed any work yet. 1550 */ 1551 work_func_t wq_worker_last_func(struct task_struct *task) 1552 { 1553 struct worker *worker = kthread_data(task); 1554 1555 return worker->last_func; 1556 } 1557 1558 /** 1559 * wq_node_nr_active - Determine wq_node_nr_active to use 1560 * @wq: workqueue of interest 1561 * @node: NUMA node, can be %NUMA_NO_NODE 1562 * 1563 * Determine wq_node_nr_active to use for @wq on @node. Returns: 1564 * 1565 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. 1566 * 1567 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. 1568 * 1569 * - Otherwise, node_nr_active[@node]. 
1570 */ 1571 static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, 1572 int node) 1573 { 1574 if (!(wq->flags & WQ_UNBOUND)) 1575 return NULL; 1576 1577 if (node == NUMA_NO_NODE) 1578 node = nr_node_ids; 1579 1580 return wq->node_nr_active[node]; 1581 } 1582 1583 /** 1584 * wq_update_node_max_active - Update per-node max_actives to use 1585 * @wq: workqueue to update 1586 * @off_cpu: CPU that's going down, -1 if a CPU is not going down 1587 * 1588 * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is 1589 * distributed among nodes according to the proportions of numbers of online 1590 * cpus. The result is always between @wq->min_active and max_active. 1591 */ 1592 static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) 1593 { 1594 struct cpumask *effective = unbound_effective_cpumask(wq); 1595 int min_active = READ_ONCE(wq->min_active); 1596 int max_active = READ_ONCE(wq->max_active); 1597 int total_cpus, node; 1598 1599 lockdep_assert_held(&wq->mutex); 1600 1601 if (!wq_topo_initialized) 1602 return; 1603 1604 if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) 1605 off_cpu = -1; 1606 1607 total_cpus = cpumask_weight_and(effective, cpu_online_mask); 1608 if (off_cpu >= 0) 1609 total_cpus--; 1610 1611 /* If all CPUs of the wq get offline, use the default values */ 1612 if (unlikely(!total_cpus)) { 1613 for_each_node(node) 1614 wq_node_nr_active(wq, node)->max = min_active; 1615 1616 wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; 1617 return; 1618 } 1619 1620 for_each_node(node) { 1621 int node_cpus; 1622 1623 node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); 1624 if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) 1625 node_cpus--; 1626 1627 wq_node_nr_active(wq, node)->max = 1628 clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), 1629 min_active, max_active); 1630 } 1631 1632 wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; 1633 } 1634 1635 /** 1636 * get_pwq - get an extra reference on the specified pool_workqueue 1637 * @pwq: pool_workqueue to get 1638 * 1639 * Obtain an extra reference on @pwq. The caller should guarantee that 1640 * @pwq has positive refcnt and be holding the matching pool->lock. 1641 */ 1642 static void get_pwq(struct pool_workqueue *pwq) 1643 { 1644 lockdep_assert_held(&pwq->pool->lock); 1645 WARN_ON_ONCE(pwq->refcnt <= 0); 1646 pwq->refcnt++; 1647 } 1648 1649 /** 1650 * put_pwq - put a pool_workqueue reference 1651 * @pwq: pool_workqueue to put 1652 * 1653 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its 1654 * destruction. The caller should be holding the matching pool->lock. 1655 */ 1656 static void put_pwq(struct pool_workqueue *pwq) 1657 { 1658 lockdep_assert_held(&pwq->pool->lock); 1659 if (likely(--pwq->refcnt)) 1660 return; 1661 /* 1662 * @pwq can't be released under pool->lock, bounce to a dedicated 1663 * kthread_worker to avoid A-A deadlocks. 1664 */ 1665 kthread_queue_work(pwq_release_worker, &pwq->release_work); 1666 } 1667 1668 /** 1669 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock 1670 * @pwq: pool_workqueue to put (can be %NULL) 1671 * 1672 * put_pwq() with locking. This function also allows %NULL @pwq. 1673 */ 1674 static void put_pwq_unlocked(struct pool_workqueue *pwq) 1675 { 1676 if (pwq) { 1677 /* 1678 * As both pwqs and pools are RCU protected, the 1679 * following lock operations are safe. 
1680 */ 1681 raw_spin_lock_irq(&pwq->pool->lock); 1682 put_pwq(pwq); 1683 raw_spin_unlock_irq(&pwq->pool->lock); 1684 } 1685 } 1686 1687 static bool pwq_is_empty(struct pool_workqueue *pwq) 1688 { 1689 return !pwq->nr_active && list_empty(&pwq->inactive_works); 1690 } 1691 1692 static void __pwq_activate_work(struct pool_workqueue *pwq, 1693 struct work_struct *work) 1694 { 1695 unsigned long *wdb = work_data_bits(work); 1696 1697 WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); 1698 trace_workqueue_activate_work(work); 1699 if (list_empty(&pwq->pool->worklist)) 1700 pwq->pool->watchdog_ts = jiffies; 1701 move_linked_works(work, &pwq->pool->worklist, NULL); 1702 __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); 1703 } 1704 1705 static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) 1706 { 1707 int max = READ_ONCE(nna->max); 1708 int old = atomic_read(&nna->nr); 1709 1710 do { 1711 if (old >= max) 1712 return false; 1713 } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); 1714 1715 return true; 1716 } 1717 1718 /** 1719 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq 1720 * @pwq: pool_workqueue of interest 1721 * @fill: max_active may have increased, try to increase concurrency level 1722 * 1723 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is 1724 * successfully obtained. %false otherwise. 1725 */ 1726 static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) 1727 { 1728 struct workqueue_struct *wq = pwq->wq; 1729 struct worker_pool *pool = pwq->pool; 1730 struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); 1731 bool obtained = false; 1732 1733 lockdep_assert_held(&pool->lock); 1734 1735 if (!nna) { 1736 /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ 1737 obtained = pwq->nr_active < READ_ONCE(wq->max_active); 1738 goto out; 1739 } 1740 1741 if (unlikely(pwq->plugged)) 1742 return false; 1743 1744 /* 1745 * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is 1746 * already waiting on $nna, pwq_dec_nr_active() will maintain the 1747 * concurrency level. Don't jump the line. 1748 * 1749 * We need to ignore the pending test after max_active has increased as 1750 * pwq_dec_nr_active() can only maintain the concurrency level but not 1751 * increase it. This is indicated by @fill. 1752 */ 1753 if (!list_empty(&pwq->pending_node) && likely(!fill)) 1754 goto out; 1755 1756 obtained = tryinc_node_nr_active(nna); 1757 if (obtained) 1758 goto out; 1759 1760 /* 1761 * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs 1762 * and try again. The smp_mb() is paired with the implied memory barrier 1763 * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either 1764 * we see the decremented $nna->nr or they see non-empty 1765 * $nna->pending_pwqs. 1766 */ 1767 raw_spin_lock(&nna->lock); 1768 1769 if (list_empty(&pwq->pending_node)) 1770 list_add_tail(&pwq->pending_node, &nna->pending_pwqs); 1771 else if (likely(!fill)) 1772 goto out_unlock; 1773 1774 smp_mb(); 1775 1776 obtained = tryinc_node_nr_active(nna); 1777 1778 /* 1779 * If @fill, @pwq might have already been pending. Being spuriously 1780 * pending in cold paths doesn't affect anything. Let's leave it be. 
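 *
 * node_activate_pending_pwq() drops pwqs which turn out to have no
 * inactive work items, so a stale entry here is eventually cleaned up
 * and doesn't affect correctness.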
1781 */ 1782 if (obtained && likely(!fill)) 1783 list_del_init(&pwq->pending_node); 1784 1785 out_unlock: 1786 raw_spin_unlock(&nna->lock); 1787 out: 1788 if (obtained) 1789 pwq->nr_active++; 1790 return obtained; 1791 } 1792 1793 /** 1794 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq 1795 * @pwq: pool_workqueue of interest 1796 * @fill: max_active may have increased, try to increase concurrency level 1797 * 1798 * Activate the first inactive work item of @pwq if available and allowed by 1799 * max_active limit. 1800 * 1801 * Returns %true if an inactive work item has been activated. %false if no 1802 * inactive work item is found or max_active limit is reached. 1803 */ 1804 static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) 1805 { 1806 struct work_struct *work = 1807 list_first_entry_or_null(&pwq->inactive_works, 1808 struct work_struct, entry); 1809 1810 if (work && pwq_tryinc_nr_active(pwq, fill)) { 1811 __pwq_activate_work(pwq, work); 1812 return true; 1813 } else { 1814 return false; 1815 } 1816 } 1817 1818 /** 1819 * unplug_oldest_pwq - unplug the oldest pool_workqueue 1820 * @wq: workqueue_struct where its oldest pwq is to be unplugged 1821 * 1822 * This function should only be called for ordered workqueues where only the 1823 * oldest pwq is unplugged, the others are plugged to suspend execution to 1824 * ensure proper work item ordering:: 1825 * 1826 * dfl_pwq --------------+ [P] - plugged 1827 * | 1828 * v 1829 * pwqs -> A -> B [P] -> C [P] (newest) 1830 * | | | 1831 * 1 3 5 1832 * | | | 1833 * 2 4 6 1834 * 1835 * When the oldest pwq is drained and removed, this function should be called 1836 * to unplug the next oldest one to start its work item execution. Note that 1837 * pwq's are linked into wq->pwqs with the oldest first, so the first one in 1838 * the list is the oldest. 1839 */ 1840 static void unplug_oldest_pwq(struct workqueue_struct *wq) 1841 { 1842 struct pool_workqueue *pwq; 1843 1844 lockdep_assert_held(&wq->mutex); 1845 1846 /* Caller should make sure that pwqs isn't empty before calling */ 1847 pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, 1848 pwqs_node); 1849 raw_spin_lock_irq(&pwq->pool->lock); 1850 if (pwq->plugged) { 1851 pwq->plugged = false; 1852 if (pwq_activate_first_inactive(pwq, true)) 1853 kick_pool(pwq->pool); 1854 } 1855 raw_spin_unlock_irq(&pwq->pool->lock); 1856 } 1857 1858 /** 1859 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active 1860 * @nna: wq_node_nr_active to activate a pending pwq for 1861 * @caller_pool: worker_pool the caller is locking 1862 * 1863 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. 1864 * @caller_pool may be unlocked and relocked to lock other worker_pools. 1865 */ 1866 static void node_activate_pending_pwq(struct wq_node_nr_active *nna, 1867 struct worker_pool *caller_pool) 1868 { 1869 struct worker_pool *locked_pool = caller_pool; 1870 struct pool_workqueue *pwq; 1871 struct work_struct *work; 1872 1873 lockdep_assert_held(&caller_pool->lock); 1874 1875 raw_spin_lock(&nna->lock); 1876 retry: 1877 pwq = list_first_entry_or_null(&nna->pending_pwqs, 1878 struct pool_workqueue, pending_node); 1879 if (!pwq) 1880 goto out_unlock; 1881 1882 /* 1883 * If @pwq is for a different pool than @locked_pool, we need to lock 1884 * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock 1885 * / lock dance. For that, we also need to release @nna->lock as it's 1886 * nested inside pool locks. 
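 *
 * Once both locks are reacquired, the list may have changed, so restart
 * the walk from the head of $nna->pending_pwqs.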
1887 */ 1888 if (pwq->pool != locked_pool) { 1889 raw_spin_unlock(&locked_pool->lock); 1890 locked_pool = pwq->pool; 1891 if (!raw_spin_trylock(&locked_pool->lock)) { 1892 raw_spin_unlock(&nna->lock); 1893 raw_spin_lock(&locked_pool->lock); 1894 raw_spin_lock(&nna->lock); 1895 goto retry; 1896 } 1897 } 1898 1899 /* 1900 * $pwq may not have any inactive work items due to e.g. cancellations. 1901 * Drop it from pending_pwqs and see if there's another one. 1902 */ 1903 work = list_first_entry_or_null(&pwq->inactive_works, 1904 struct work_struct, entry); 1905 if (!work) { 1906 list_del_init(&pwq->pending_node); 1907 goto retry; 1908 } 1909 1910 /* 1911 * Acquire an nr_active count and activate the inactive work item. If 1912 * $pwq still has inactive work items, rotate it to the end of the 1913 * pending_pwqs so that we round-robin through them. This means that 1914 * inactive work items are not activated in queueing order which is fine 1915 * given that there has never been any ordering across different pwqs. 1916 */ 1917 if (likely(tryinc_node_nr_active(nna))) { 1918 pwq->nr_active++; 1919 __pwq_activate_work(pwq, work); 1920 1921 if (list_empty(&pwq->inactive_works)) 1922 list_del_init(&pwq->pending_node); 1923 else 1924 list_move_tail(&pwq->pending_node, &nna->pending_pwqs); 1925 1926 /* if activating a foreign pool, make sure it's running */ 1927 if (pwq->pool != caller_pool) 1928 kick_pool(pwq->pool); 1929 } 1930 1931 out_unlock: 1932 raw_spin_unlock(&nna->lock); 1933 if (locked_pool != caller_pool) { 1934 raw_spin_unlock(&locked_pool->lock); 1935 raw_spin_lock(&caller_pool->lock); 1936 } 1937 } 1938 1939 /** 1940 * pwq_dec_nr_active - Retire an active count 1941 * @pwq: pool_workqueue of interest 1942 * 1943 * Decrement @pwq's nr_active and try to activate the first inactive work item. 1944 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. 1945 */ 1946 static void pwq_dec_nr_active(struct pool_workqueue *pwq) 1947 { 1948 struct worker_pool *pool = pwq->pool; 1949 struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); 1950 1951 lockdep_assert_held(&pool->lock); 1952 1953 /* 1954 * @pwq->nr_active should be decremented for both percpu and unbound 1955 * workqueues. 1956 */ 1957 pwq->nr_active--; 1958 1959 /* 1960 * For a percpu workqueue, it's simple. Just need to kick the first 1961 * inactive work item on @pwq itself. 1962 */ 1963 if (!nna) { 1964 pwq_activate_first_inactive(pwq, false); 1965 return; 1966 } 1967 1968 /* 1969 * If @pwq is for an unbound workqueue, it's more complicated because 1970 * multiple pwqs and pools may be sharing the nr_active count. When a 1971 * pwq needs to wait for an nr_active count, it puts itself on 1972 * $nna->pending_pwqs. The following atomic_dec_return()'s implied 1973 * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to 1974 * guarantee that either we see non-empty pending_pwqs or they see 1975 * decremented $nna->nr. 1976 * 1977 * $nna->max may change as CPUs come online/offline and @pwq->wq's 1978 * max_active gets updated. However, it is guaranteed to be equal to or 1979 * larger than @pwq->wq->min_active which is above zero unless freezing. 1980 * This maintains the forward progress guarantee. 
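 *
 * IOW, pending_pwqs needs to be consulted only when this decrement
 * takes $nna->nr below $nna->max, i.e. when the node regains headroom
 * for another active work item.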
1981 */ 1982 if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) 1983 return; 1984 1985 if (!list_empty(&nna->pending_pwqs)) 1986 node_activate_pending_pwq(nna, pool); 1987 } 1988 1989 /** 1990 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight 1991 * @pwq: pwq of interest 1992 * @work_data: work_data of work which left the queue 1993 * 1994 * A work either has completed or is removed from pending queue, 1995 * decrement nr_in_flight of its pwq and handle workqueue flushing. 1996 * 1997 * NOTE: 1998 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock 1999 * and thus should be called after all other state updates for the in-flight 2000 * work item is complete. 2001 * 2002 * CONTEXT: 2003 * raw_spin_lock_irq(pool->lock). 2004 */ 2005 static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) 2006 { 2007 int color = get_work_color(work_data); 2008 2009 if (!(work_data & WORK_STRUCT_INACTIVE)) 2010 pwq_dec_nr_active(pwq); 2011 2012 pwq->nr_in_flight[color]--; 2013 2014 /* is flush in progress and are we at the flushing tip? */ 2015 if (likely(pwq->flush_color != color)) 2016 goto out_put; 2017 2018 /* are there still in-flight works? */ 2019 if (pwq->nr_in_flight[color]) 2020 goto out_put; 2021 2022 /* this pwq is done, clear flush_color */ 2023 pwq->flush_color = -1; 2024 2025 /* 2026 * If this was the last pwq, wake up the first flusher. It 2027 * will handle the rest. 2028 */ 2029 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 2030 complete(&pwq->wq->first_flusher->done); 2031 out_put: 2032 put_pwq(pwq); 2033 } 2034 2035 /** 2036 * try_to_grab_pending - steal work item from worklist and disable irq 2037 * @work: work item to steal 2038 * @cflags: %WORK_CANCEL_ flags 2039 * @irq_flags: place to store irq state 2040 * 2041 * Try to grab PENDING bit of @work. This function can handle @work in any 2042 * stable state - idle, on timer or on worklist. 2043 * 2044 * Return: 2045 * 2046 * ======== ================================================================ 2047 * 1 if @work was pending and we successfully stole PENDING 2048 * 0 if @work was idle and we claimed PENDING 2049 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry 2050 * ======== ================================================================ 2051 * 2052 * Note: 2053 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 2054 * interrupted while holding PENDING and @work off queue, irq must be 2055 * disabled on entry. This, combined with delayed_work->timer being 2056 * irqsafe, ensures that we return -EAGAIN for finite short period of time. 2057 * 2058 * On successful return, >= 0, irq is disabled and the caller is 2059 * responsible for releasing it using local_irq_restore(*@irq_flags). 2060 * 2061 * This function is safe to call from any context including IRQ handler. 2062 */ 2063 static int try_to_grab_pending(struct work_struct *work, u32 cflags, 2064 unsigned long *irq_flags) 2065 { 2066 struct worker_pool *pool; 2067 struct pool_workqueue *pwq; 2068 2069 local_irq_save(*irq_flags); 2070 2071 /* try to steal the timer if it exists */ 2072 if (cflags & WORK_CANCEL_DELAYED) { 2073 struct delayed_work *dwork = to_delayed_work(work); 2074 2075 /* 2076 * dwork->timer is irqsafe. If timer_delete() fails, it's 2077 * guaranteed that the timer is not queued anywhere and not 2078 * running on the local CPU. 
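 *
 * Conversely, a successful timer_delete() means the timer can no
 * longer queue @work through delayed_work_timer_fn() and the PENDING
 * bit set at queueing time is now owned by the caller.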
2079 */ 2080 if (likely(timer_delete(&dwork->timer))) 2081 return 1; 2082 } 2083 2084 /* try to claim PENDING the normal way */ 2085 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) 2086 return 0; 2087 2088 rcu_read_lock(); 2089 /* 2090 * The queueing is in progress, or it is already queued. Try to 2091 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2092 */ 2093 pool = get_work_pool(work); 2094 if (!pool) 2095 goto fail; 2096 2097 raw_spin_lock(&pool->lock); 2098 /* 2099 * work->data is guaranteed to point to pwq only while the work 2100 * item is queued on pwq->wq, and both updating work->data to point 2101 * to pwq on queueing and to pool on dequeueing are done under 2102 * pwq->pool->lock. This in turn guarantees that, if work->data 2103 * points to pwq which is associated with a locked pool, the work 2104 * item is currently queued on that pool. 2105 */ 2106 pwq = get_work_pwq(work); 2107 if (pwq && pwq->pool == pool) { 2108 unsigned long work_data = *work_data_bits(work); 2109 2110 debug_work_deactivate(work); 2111 2112 /* 2113 * A cancelable inactive work item must be in the 2114 * pwq->inactive_works since a queued barrier can't be 2115 * canceled (see the comments in insert_wq_barrier()). 2116 * 2117 * An inactive work item cannot be deleted directly because 2118 * it might have linked barrier work items which, if left 2119 * on the inactive_works list, will confuse pwq->nr_active 2120 * management later on and cause stall. Move the linked 2121 * barrier work items to the worklist when deleting the grabbed 2122 * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that 2123 * it doesn't participate in nr_active management in later 2124 * pwq_dec_nr_in_flight(). 2125 */ 2126 if (work_data & WORK_STRUCT_INACTIVE) 2127 move_linked_works(work, &pwq->pool->worklist, NULL); 2128 2129 list_del_init(&work->entry); 2130 2131 /* 2132 * work->data points to pwq iff queued. Let's point to pool. As 2133 * this destroys work->data needed by the next step, stash it. 2134 */ 2135 set_work_pool_and_keep_pending(work, pool->id, 2136 pool_offq_flags(pool)); 2137 2138 /* must be the last step, see the function comment */ 2139 pwq_dec_nr_in_flight(pwq, work_data); 2140 2141 raw_spin_unlock(&pool->lock); 2142 rcu_read_unlock(); 2143 return 1; 2144 } 2145 raw_spin_unlock(&pool->lock); 2146 fail: 2147 rcu_read_unlock(); 2148 local_irq_restore(*irq_flags); 2149 return -EAGAIN; 2150 } 2151 2152 /** 2153 * work_grab_pending - steal work item from worklist and disable irq 2154 * @work: work item to steal 2155 * @cflags: %WORK_CANCEL_ flags 2156 * @irq_flags: place to store IRQ state 2157 * 2158 * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer 2159 * or on worklist. 2160 * 2161 * Can be called from any context. IRQ is disabled on return with IRQ state 2162 * stored in *@irq_flags. The caller is responsible for re-enabling it using 2163 * local_irq_restore(). 2164 * 2165 * Returns %true if @work was pending. %false if idle. 
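 *
 * A typical caller pattern (cf. mod_delayed_work_on() below) is roughly:
 *
 *	unsigned long irq_flags;
 *
 *	work_grab_pending(work, WORK_CANCEL_DELAYED, &irq_flags);
 *	... @work is off all queues and its PENDING bit is owned ...
 *	local_irq_restore(irq_flags);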
2166 */ 2167 static bool work_grab_pending(struct work_struct *work, u32 cflags, 2168 unsigned long *irq_flags) 2169 { 2170 int ret; 2171 2172 while (true) { 2173 ret = try_to_grab_pending(work, cflags, irq_flags); 2174 if (ret >= 0) 2175 return ret; 2176 cpu_relax(); 2177 } 2178 } 2179 2180 /** 2181 * insert_work - insert a work into a pool 2182 * @pwq: pwq @work belongs to 2183 * @work: work to insert 2184 * @head: insertion point 2185 * @extra_flags: extra WORK_STRUCT_* flags to set 2186 * 2187 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to 2188 * work_struct flags. 2189 * 2190 * CONTEXT: 2191 * raw_spin_lock_irq(pool->lock). 2192 */ 2193 static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, 2194 struct list_head *head, unsigned int extra_flags) 2195 { 2196 debug_work_activate(work); 2197 2198 /* record the work call stack in order to print it in KASAN reports */ 2199 kasan_record_aux_stack(work); 2200 2201 /* we own @work, set data and link */ 2202 set_work_pwq(work, pwq, extra_flags); 2203 list_add_tail(&work->entry, head); 2204 get_pwq(pwq); 2205 } 2206 2207 /* 2208 * Test whether @work is being queued from another work executing on the 2209 * same workqueue. 2210 */ 2211 static bool is_chained_work(struct workqueue_struct *wq) 2212 { 2213 struct worker *worker; 2214 2215 worker = current_wq_worker(); 2216 /* 2217 * Return %true iff I'm a worker executing a work item on @wq. If 2218 * I'm @worker, it's safe to dereference it without locking. 2219 */ 2220 return worker && worker->current_pwq->wq == wq; 2221 } 2222 2223 /* 2224 * When queueing an unbound work item to a wq, prefer local CPU if allowed 2225 * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to 2226 * avoid perturbing sensitive tasks. 2227 */ 2228 static int wq_select_unbound_cpu(int cpu) 2229 { 2230 int new_cpu; 2231 2232 if (likely(!wq_debug_force_rr_cpu)) { 2233 if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) 2234 return cpu; 2235 } else { 2236 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); 2237 } 2238 2239 new_cpu = __this_cpu_read(wq_rr_cpu_last); 2240 new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); 2241 if (unlikely(new_cpu >= nr_cpu_ids)) 2242 return cpu; 2243 __this_cpu_write(wq_rr_cpu_last, new_cpu); 2244 2245 return new_cpu; 2246 } 2247 2248 static void __queue_work(int cpu, struct workqueue_struct *wq, 2249 struct work_struct *work) 2250 { 2251 struct pool_workqueue *pwq; 2252 struct worker_pool *last_pool, *pool; 2253 unsigned int work_flags; 2254 unsigned int req_cpu = cpu; 2255 2256 /* 2257 * While a work item is PENDING && off queue, a task trying to 2258 * steal the PENDING will busy-loop waiting for it to either get 2259 * queued or lose PENDING. Grabbing PENDING and queueing should 2260 * happen with IRQ disabled. 2261 */ 2262 lockdep_assert_irqs_disabled(); 2263 2264 /* 2265 * For a draining wq, only works from the same workqueue are 2266 * allowed. The __WQ_DESTROYING helps to spot the issue that 2267 * queues a new work item to a wq after destroy_workqueue(wq). 
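 *
 * Work items queued by a work item already running on the same
 * workqueue (see is_chained_work()) stay allowed so that draining can
 * make forward progress.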
2268 */ 2269 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && 2270 WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", 2271 work->func, wq->name))) { 2272 return; 2273 } 2274 rcu_read_lock(); 2275 retry: 2276 /* pwq which will be used unless @work is executing elsewhere */ 2277 if (req_cpu == WORK_CPU_UNBOUND) { 2278 if (wq->flags & WQ_UNBOUND) 2279 cpu = wq_select_unbound_cpu(raw_smp_processor_id()); 2280 else 2281 cpu = raw_smp_processor_id(); 2282 } 2283 2284 pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); 2285 pool = pwq->pool; 2286 2287 /* 2288 * If @work was previously on a different pool, it might still be 2289 * running there, in which case the work needs to be queued on that 2290 * pool to guarantee non-reentrancy. 2291 * 2292 * For ordered workqueue, work items must be queued on the newest pwq 2293 * for accurate order management. Guaranteed order also guarantees 2294 * non-reentrancy. See the comments above unplug_oldest_pwq(). 2295 */ 2296 last_pool = get_work_pool(work); 2297 if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { 2298 struct worker *worker; 2299 2300 raw_spin_lock(&last_pool->lock); 2301 2302 worker = find_worker_executing_work(last_pool, work); 2303 2304 if (worker && worker->current_pwq->wq == wq) { 2305 pwq = worker->current_pwq; 2306 pool = pwq->pool; 2307 WARN_ON_ONCE(pool != last_pool); 2308 } else { 2309 /* meh... not running there, queue here */ 2310 raw_spin_unlock(&last_pool->lock); 2311 raw_spin_lock(&pool->lock); 2312 } 2313 } else { 2314 raw_spin_lock(&pool->lock); 2315 } 2316 2317 /* 2318 * pwq is determined and locked. For unbound pools, we could have raced 2319 * with pwq release and it could already be dead. If its refcnt is zero, 2320 * repeat pwq selection. Note that unbound pwqs never die without 2321 * another pwq replacing it in cpu_pwq or while work items are executing 2322 * on it, so the retrying is guaranteed to make forward-progress. 2323 */ 2324 if (unlikely(!pwq->refcnt)) { 2325 if (wq->flags & WQ_UNBOUND) { 2326 raw_spin_unlock(&pool->lock); 2327 cpu_relax(); 2328 goto retry; 2329 } 2330 /* oops */ 2331 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", 2332 wq->name, cpu); 2333 } 2334 2335 /* pwq determined, queue */ 2336 trace_workqueue_queue_work(req_cpu, pwq, work); 2337 2338 if (WARN_ON(!list_empty(&work->entry))) 2339 goto out; 2340 2341 pwq->nr_in_flight[pwq->work_color]++; 2342 work_flags = work_color_to_flags(pwq->work_color); 2343 2344 /* 2345 * Limit the number of concurrently active work items to max_active. 2346 * @work must also queue behind existing inactive work items to maintain 2347 * ordering when max_active changes. See wq_adjust_max_active(). 
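 *
 * IOW, @work goes directly onto pool->worklist only when nothing is
 * already parked in inactive_works and an nr_active count can be
 * obtained; otherwise it is queued as inactive.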
2348 */ 2349 if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { 2350 if (list_empty(&pool->worklist)) 2351 pool->watchdog_ts = jiffies; 2352 2353 trace_workqueue_activate_work(work); 2354 insert_work(pwq, work, &pool->worklist, work_flags); 2355 kick_pool(pool); 2356 } else { 2357 work_flags |= WORK_STRUCT_INACTIVE; 2358 insert_work(pwq, work, &pwq->inactive_works, work_flags); 2359 } 2360 2361 out: 2362 raw_spin_unlock(&pool->lock); 2363 rcu_read_unlock(); 2364 } 2365 2366 static bool clear_pending_if_disabled(struct work_struct *work) 2367 { 2368 unsigned long data = *work_data_bits(work); 2369 struct work_offq_data offqd; 2370 2371 if (likely((data & WORK_STRUCT_PWQ) || 2372 !(data & WORK_OFFQ_DISABLE_MASK))) 2373 return false; 2374 2375 work_offqd_unpack(&offqd, data); 2376 set_work_pool_and_clear_pending(work, offqd.pool_id, 2377 work_offqd_pack_flags(&offqd)); 2378 return true; 2379 } 2380 2381 /** 2382 * queue_work_on - queue work on specific cpu 2383 * @cpu: CPU number to execute work on 2384 * @wq: workqueue to use 2385 * @work: work to queue 2386 * 2387 * We queue the work to a specific CPU, the caller must ensure it 2388 * can't go away. Callers that fail to ensure that the specified 2389 * CPU cannot go away will execute on a randomly chosen CPU. 2390 * But note well that callers specifying a CPU that never has been 2391 * online will get a splat. 2392 * 2393 * Return: %false if @work was already on a queue, %true otherwise. 2394 */ 2395 bool queue_work_on(int cpu, struct workqueue_struct *wq, 2396 struct work_struct *work) 2397 { 2398 bool ret = false; 2399 unsigned long irq_flags; 2400 2401 local_irq_save(irq_flags); 2402 2403 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && 2404 !clear_pending_if_disabled(work)) { 2405 __queue_work(cpu, wq, work); 2406 ret = true; 2407 } 2408 2409 local_irq_restore(irq_flags); 2410 return ret; 2411 } 2412 EXPORT_SYMBOL(queue_work_on); 2413 2414 /** 2415 * select_numa_node_cpu - Select a CPU based on NUMA node 2416 * @node: NUMA node ID that we want to select a CPU from 2417 * 2418 * This function will attempt to find a "random" cpu available on a given 2419 * node. If there are no CPUs available on the given node it will return 2420 * WORK_CPU_UNBOUND indicating that we should just schedule to any 2421 * available CPU if we need to schedule this work. 2422 */ 2423 static int select_numa_node_cpu(int node) 2424 { 2425 int cpu; 2426 2427 /* Delay binding to CPU if node is not valid or online */ 2428 if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) 2429 return WORK_CPU_UNBOUND; 2430 2431 /* Use local node/cpu if we are already there */ 2432 cpu = raw_smp_processor_id(); 2433 if (node == cpu_to_node(cpu)) 2434 return cpu; 2435 2436 /* Use "random" otherwise know as "first" online CPU of node */ 2437 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); 2438 2439 /* If CPU is valid return that, otherwise just defer */ 2440 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; 2441 } 2442 2443 /** 2444 * queue_work_node - queue work on a "random" cpu for a given NUMA node 2445 * @node: NUMA node that we are targeting the work for 2446 * @wq: workqueue to use 2447 * @work: work to queue 2448 * 2449 * We queue the work to a "random" CPU within a given NUMA node. The basic 2450 * idea here is to provide a way to somehow associate work with a given 2451 * NUMA node. 2452 * 2453 * This function will only make a best effort attempt at getting this onto 2454 * the right NUMA node. 
If no node is requested or the requested node is 2455 * offline then we just fall back to standard queue_work behavior. 2456 * 2457 * Currently the "random" CPU ends up being the first available CPU in the 2458 * intersection of cpu_online_mask and the cpumask of the node, unless we 2459 * are running on the node. In that case we just use the current CPU. 2460 * 2461 * Return: %false if @work was already on a queue, %true otherwise. 2462 */ 2463 bool queue_work_node(int node, struct workqueue_struct *wq, 2464 struct work_struct *work) 2465 { 2466 unsigned long irq_flags; 2467 bool ret = false; 2468 2469 /* 2470 * This current implementation is specific to unbound workqueues. 2471 * Specifically we only return the first available CPU for a given 2472 * node instead of cycling through individual CPUs within the node. 2473 * 2474 * If this is used with a per-cpu workqueue then the logic in 2475 * workqueue_select_cpu_near would need to be updated to allow for 2476 * some round robin type logic. 2477 */ 2478 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); 2479 2480 local_irq_save(irq_flags); 2481 2482 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && 2483 !clear_pending_if_disabled(work)) { 2484 int cpu = select_numa_node_cpu(node); 2485 2486 __queue_work(cpu, wq, work); 2487 ret = true; 2488 } 2489 2490 local_irq_restore(irq_flags); 2491 return ret; 2492 } 2493 EXPORT_SYMBOL_GPL(queue_work_node); 2494 2495 void delayed_work_timer_fn(struct timer_list *t) 2496 { 2497 struct delayed_work *dwork = timer_container_of(dwork, t, timer); 2498 2499 /* should have been called from irqsafe timer with irq already off */ 2500 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 2501 } 2502 EXPORT_SYMBOL(delayed_work_timer_fn); 2503 2504 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 2505 struct delayed_work *dwork, unsigned long delay) 2506 { 2507 struct timer_list *timer = &dwork->timer; 2508 struct work_struct *work = &dwork->work; 2509 2510 WARN_ON_ONCE(!wq); 2511 WARN_ON_ONCE(timer->function != delayed_work_timer_fn); 2512 WARN_ON_ONCE(timer_pending(timer)); 2513 WARN_ON_ONCE(!list_empty(&work->entry)); 2514 2515 /* 2516 * If @delay is 0, queue @dwork->work immediately. This is for 2517 * both optimization and correctness. The earliest @timer can 2518 * expire is on the closest next tick and delayed_work users depend 2519 * on that there's no such delay when @delay is 0. 2520 */ 2521 if (!delay) { 2522 __queue_work(cpu, wq, &dwork->work); 2523 return; 2524 } 2525 2526 WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu)); 2527 dwork->wq = wq; 2528 dwork->cpu = cpu; 2529 timer->expires = jiffies + delay; 2530 2531 if (housekeeping_enabled(HK_TYPE_TIMER)) { 2532 /* If the current cpu is a housekeeping cpu, use it. */ 2533 cpu = smp_processor_id(); 2534 if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) 2535 cpu = housekeeping_any_cpu(HK_TYPE_TIMER); 2536 add_timer_on(timer, cpu); 2537 } else { 2538 if (likely(cpu == WORK_CPU_UNBOUND)) 2539 add_timer_global(timer); 2540 else 2541 add_timer_on(timer, cpu); 2542 } 2543 } 2544 2545 /** 2546 * queue_delayed_work_on - queue work on specific CPU after delay 2547 * @cpu: CPU number to execute work on 2548 * @wq: workqueue to use 2549 * @dwork: work to queue 2550 * @delay: number of jiffies to wait before queueing 2551 * 2552 * We queue the delayed_work to a specific CPU, for non-zero delays the 2553 * caller must ensure it is online and can't go away. 
Callers that fail 2554 * to ensure this, may get @dwork->timer queued to an offlined CPU and 2555 * this will prevent queueing of @dwork->work unless the offlined CPU 2556 * becomes online again. 2557 * 2558 * Return: %false if @work was already on a queue, %true otherwise. If 2559 * @delay is zero and @dwork is idle, it will be scheduled for immediate 2560 * execution. 2561 */ 2562 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 2563 struct delayed_work *dwork, unsigned long delay) 2564 { 2565 struct work_struct *work = &dwork->work; 2566 bool ret = false; 2567 unsigned long irq_flags; 2568 2569 /* read the comment in __queue_work() */ 2570 local_irq_save(irq_flags); 2571 2572 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && 2573 !clear_pending_if_disabled(work)) { 2574 __queue_delayed_work(cpu, wq, dwork, delay); 2575 ret = true; 2576 } 2577 2578 local_irq_restore(irq_flags); 2579 return ret; 2580 } 2581 EXPORT_SYMBOL(queue_delayed_work_on); 2582 2583 /** 2584 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 2585 * @cpu: CPU number to execute work on 2586 * @wq: workqueue to use 2587 * @dwork: work to queue 2588 * @delay: number of jiffies to wait before queueing 2589 * 2590 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, 2591 * modify @dwork's timer so that it expires after @delay. If @delay is 2592 * zero, @work is guaranteed to be scheduled immediately regardless of its 2593 * current state. 2594 * 2595 * Return: %false if @dwork was idle and queued, %true if @dwork was 2596 * pending and its timer was modified. 2597 * 2598 * This function is safe to call from any context including IRQ handler. 2599 * See try_to_grab_pending() for details. 2600 */ 2601 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, 2602 struct delayed_work *dwork, unsigned long delay) 2603 { 2604 unsigned long irq_flags; 2605 bool ret; 2606 2607 ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); 2608 2609 if (!clear_pending_if_disabled(&dwork->work)) 2610 __queue_delayed_work(cpu, wq, dwork, delay); 2611 2612 local_irq_restore(irq_flags); 2613 return ret; 2614 } 2615 EXPORT_SYMBOL_GPL(mod_delayed_work_on); 2616 2617 static void rcu_work_rcufn(struct rcu_head *rcu) 2618 { 2619 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); 2620 2621 /* read the comment in __queue_work() */ 2622 local_irq_disable(); 2623 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); 2624 local_irq_enable(); 2625 } 2626 2627 /** 2628 * queue_rcu_work - queue work after a RCU grace period 2629 * @wq: workqueue to use 2630 * @rwork: work to queue 2631 * 2632 * Return: %false if @rwork was already pending, %true otherwise. Note 2633 * that a full RCU grace period is guaranteed only after a %true return. 2634 * While @rwork is guaranteed to be executed after a %false return, the 2635 * execution may happen before a full RCU grace period has passed. 2636 */ 2637 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) 2638 { 2639 struct work_struct *work = &rwork->work; 2640 2641 /* 2642 * rcu_work can't be canceled or disabled. Warn if the user reached 2643 * inside @rwork and disabled the inner work. 
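 *
 * If that happens, the WARN fires and @rwork is not queued; the
 * function returns %false as if it were already pending.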
2644 */ 2645 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && 2646 !WARN_ON_ONCE(clear_pending_if_disabled(work))) { 2647 rwork->wq = wq; 2648 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); 2649 return true; 2650 } 2651 2652 return false; 2653 } 2654 EXPORT_SYMBOL(queue_rcu_work); 2655 2656 static struct worker *alloc_worker(int node) 2657 { 2658 struct worker *worker; 2659 2660 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); 2661 if (worker) { 2662 INIT_LIST_HEAD(&worker->entry); 2663 INIT_LIST_HEAD(&worker->scheduled); 2664 INIT_LIST_HEAD(&worker->node); 2665 /* on creation a worker is in !idle && prep state */ 2666 worker->flags = WORKER_PREP; 2667 } 2668 return worker; 2669 } 2670 2671 static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) 2672 { 2673 if (pool->cpu < 0 && pool->attrs->affn_strict) 2674 return pool->attrs->__pod_cpumask; 2675 else 2676 return pool->attrs->cpumask; 2677 } 2678 2679 /** 2680 * worker_attach_to_pool() - attach a worker to a pool 2681 * @worker: worker to be attached 2682 * @pool: the target pool 2683 * 2684 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and 2685 * cpu-binding of @worker are kept coordinated with the pool across 2686 * cpu-[un]hotplugs. 2687 */ 2688 static void worker_attach_to_pool(struct worker *worker, 2689 struct worker_pool *pool) 2690 { 2691 mutex_lock(&wq_pool_attach_mutex); 2692 2693 /* 2694 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable 2695 * across this function. See the comments above the flag definition for 2696 * details. BH workers are, while per-CPU, always DISASSOCIATED. 2697 */ 2698 if (pool->flags & POOL_DISASSOCIATED) { 2699 worker->flags |= WORKER_UNBOUND; 2700 } else { 2701 WARN_ON_ONCE(pool->flags & POOL_BH); 2702 kthread_set_per_cpu(worker->task, pool->cpu); 2703 } 2704 2705 if (worker->rescue_wq) 2706 set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); 2707 2708 list_add_tail(&worker->node, &pool->workers); 2709 worker->pool = pool; 2710 2711 mutex_unlock(&wq_pool_attach_mutex); 2712 } 2713 2714 static void unbind_worker(struct worker *worker) 2715 { 2716 lockdep_assert_held(&wq_pool_attach_mutex); 2717 2718 kthread_set_per_cpu(worker->task, -1); 2719 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) 2720 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); 2721 else 2722 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); 2723 } 2724 2725 2726 static void detach_worker(struct worker *worker) 2727 { 2728 lockdep_assert_held(&wq_pool_attach_mutex); 2729 2730 unbind_worker(worker); 2731 list_del(&worker->node); 2732 } 2733 2734 /** 2735 * worker_detach_from_pool() - detach a worker from its pool 2736 * @worker: worker which is attached to its pool 2737 * 2738 * Undo the attachment done in worker_attach_to_pool(). The calling 2739 * worker must not access the pool after detaching unless it holds 2740 * another reference to the pool.
2741 */ 2742 static void worker_detach_from_pool(struct worker *worker) 2743 { 2744 struct worker_pool *pool = worker->pool; 2745 2746 /* there is one permanent BH worker per CPU which should never detach */ 2747 WARN_ON_ONCE(pool->flags & POOL_BH); 2748 2749 mutex_lock(&wq_pool_attach_mutex); 2750 detach_worker(worker); 2751 worker->pool = NULL; 2752 mutex_unlock(&wq_pool_attach_mutex); 2753 2754 /* clear leftover flags without pool->lock after it is detached */ 2755 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); 2756 } 2757 2758 static int format_worker_id(char *buf, size_t size, struct worker *worker, 2759 struct worker_pool *pool) 2760 { 2761 if (worker->rescue_wq) 2762 return scnprintf(buf, size, "kworker/R-%s", 2763 worker->rescue_wq->name); 2764 2765 if (pool) { 2766 if (pool->cpu >= 0) 2767 return scnprintf(buf, size, "kworker/%d:%d%s", 2768 pool->cpu, worker->id, 2769 pool->attrs->nice < 0 ? "H" : ""); 2770 else 2771 return scnprintf(buf, size, "kworker/u%d:%d", 2772 pool->id, worker->id); 2773 } else { 2774 return scnprintf(buf, size, "kworker/dying"); 2775 } 2776 } 2777 2778 /** 2779 * create_worker - create a new workqueue worker 2780 * @pool: pool the new worker will belong to 2781 * 2782 * Create and start a new worker which is attached to @pool. 2783 * 2784 * CONTEXT: 2785 * Might sleep. Does GFP_KERNEL allocations. 2786 * 2787 * Return: 2788 * Pointer to the newly created worker. 2789 */ 2790 static struct worker *create_worker(struct worker_pool *pool) 2791 { 2792 struct worker *worker; 2793 int id; 2794 2795 /* ID is needed to determine kthread name */ 2796 id = ida_alloc(&pool->worker_ida, GFP_KERNEL); 2797 if (id < 0) { 2798 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", 2799 ERR_PTR(id)); 2800 return NULL; 2801 } 2802 2803 worker = alloc_worker(pool->node); 2804 if (!worker) { 2805 pr_err_once("workqueue: Failed to allocate a worker\n"); 2806 goto fail; 2807 } 2808 2809 worker->id = id; 2810 2811 if (!(pool->flags & POOL_BH)) { 2812 char id_buf[WORKER_ID_LEN]; 2813 2814 format_worker_id(id_buf, sizeof(id_buf), worker, pool); 2815 worker->task = kthread_create_on_node(worker_thread, worker, 2816 pool->node, "%s", id_buf); 2817 if (IS_ERR(worker->task)) { 2818 if (PTR_ERR(worker->task) == -EINTR) { 2819 pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", 2820 id_buf); 2821 } else { 2822 pr_err_once("workqueue: Failed to create a worker thread: %pe", 2823 worker->task); 2824 } 2825 goto fail; 2826 } 2827 2828 set_user_nice(worker->task, pool->attrs->nice); 2829 kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); 2830 } 2831 2832 /* successful, attach the worker to the pool */ 2833 worker_attach_to_pool(worker, pool); 2834 2835 /* start the newly created worker */ 2836 raw_spin_lock_irq(&pool->lock); 2837 2838 worker->pool->nr_workers++; 2839 worker_enter_idle(worker); 2840 2841 /* 2842 * @worker is waiting on a completion in kthread() and will trigger hung 2843 * check if not woken up soon. As kick_pool() is noop if @pool is empty, 2844 * wake it up explicitly. 
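 *
 * BH workers have no kthread backing them (worker->task stays %NULL),
 * hence the check below.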
2845 */ 2846 if (worker->task) 2847 wake_up_process(worker->task); 2848 2849 raw_spin_unlock_irq(&pool->lock); 2850 2851 return worker; 2852 2853 fail: 2854 ida_free(&pool->worker_ida, id); 2855 kfree(worker); 2856 return NULL; 2857 } 2858 2859 static void detach_dying_workers(struct list_head *cull_list) 2860 { 2861 struct worker *worker; 2862 2863 list_for_each_entry(worker, cull_list, entry) 2864 detach_worker(worker); 2865 } 2866 2867 static void reap_dying_workers(struct list_head *cull_list) 2868 { 2869 struct worker *worker, *tmp; 2870 2871 list_for_each_entry_safe(worker, tmp, cull_list, entry) { 2872 list_del_init(&worker->entry); 2873 kthread_stop_put(worker->task); 2874 kfree(worker); 2875 } 2876 } 2877 2878 /** 2879 * set_worker_dying - Tag a worker for destruction 2880 * @worker: worker to be destroyed 2881 * @list: transfer worker away from its pool->idle_list and into list 2882 * 2883 * Tag @worker for destruction and adjust @pool stats accordingly. The worker 2884 * should be idle. 2885 * 2886 * CONTEXT: 2887 * raw_spin_lock_irq(pool->lock). 2888 */ 2889 static void set_worker_dying(struct worker *worker, struct list_head *list) 2890 { 2891 struct worker_pool *pool = worker->pool; 2892 2893 lockdep_assert_held(&pool->lock); 2894 lockdep_assert_held(&wq_pool_attach_mutex); 2895 2896 /* sanity check frenzy */ 2897 if (WARN_ON(worker->current_work) || 2898 WARN_ON(!list_empty(&worker->scheduled)) || 2899 WARN_ON(!(worker->flags & WORKER_IDLE))) 2900 return; 2901 2902 pool->nr_workers--; 2903 pool->nr_idle--; 2904 2905 worker->flags |= WORKER_DIE; 2906 2907 list_move(&worker->entry, list); 2908 2909 /* get an extra task struct reference for later kthread_stop_put() */ 2910 get_task_struct(worker->task); 2911 } 2912 2913 /** 2914 * idle_worker_timeout - check if some idle workers can now be deleted. 2915 * @t: The pool's idle_timer that just expired 2916 * 2917 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in 2918 * worker_leave_idle(), as a worker flicking between idle and active while its 2919 * pool is at the too_many_workers() tipping point would cause too much timer 2920 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let 2921 * it expire and re-evaluate things from there. 2922 */ 2923 static void idle_worker_timeout(struct timer_list *t) 2924 { 2925 struct worker_pool *pool = timer_container_of(pool, t, idle_timer); 2926 bool do_cull = false; 2927 2928 if (work_pending(&pool->idle_cull_work)) 2929 return; 2930 2931 raw_spin_lock_irq(&pool->lock); 2932 2933 if (too_many_workers(pool)) { 2934 struct worker *worker; 2935 unsigned long expires; 2936 2937 /* idle_list is kept in LIFO order, check the last one */ 2938 worker = list_last_entry(&pool->idle_list, struct worker, entry); 2939 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 2940 do_cull = !time_before(jiffies, expires); 2941 2942 if (!do_cull) 2943 mod_timer(&pool->idle_timer, expires); 2944 } 2945 raw_spin_unlock_irq(&pool->lock); 2946 2947 if (do_cull) 2948 queue_work(system_dfl_wq, &pool->idle_cull_work); 2949 } 2950 2951 /** 2952 * idle_cull_fn - cull workers that have been idle for too long. 2953 * @work: the pool's work for handling these idle workers 2954 * 2955 * This goes through a pool's idle workers and gets rid of those that have been 2956 * idle for at least IDLE_WORKER_TIMEOUT seconds. 2957 * 2958 * We don't want to disturb isolated CPUs because of a pcpu kworker being 2959 * culled, so this also resets worker affinity. 
This requires a sleepable 2960 * context, hence the split between timer callback and work item. 2961 */ 2962 static void idle_cull_fn(struct work_struct *work) 2963 { 2964 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); 2965 LIST_HEAD(cull_list); 2966 2967 /* 2968 * Grabbing wq_pool_attach_mutex here ensures an already-running worker 2969 * cannot proceed beyond set_pf_worker() in its self-destruct path. 2970 * This is required as a previously-preempted worker could run after 2971 * set_worker_dying() has happened but before detach_dying_workers() did. 2972 */ 2973 mutex_lock(&wq_pool_attach_mutex); 2974 raw_spin_lock_irq(&pool->lock); 2975 2976 while (too_many_workers(pool)) { 2977 struct worker *worker; 2978 unsigned long expires; 2979 2980 worker = list_last_entry(&pool->idle_list, struct worker, entry); 2981 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 2982 2983 if (time_before(jiffies, expires)) { 2984 mod_timer(&pool->idle_timer, expires); 2985 break; 2986 } 2987 2988 set_worker_dying(worker, &cull_list); 2989 } 2990 2991 raw_spin_unlock_irq(&pool->lock); 2992 detach_dying_workers(&cull_list); 2993 mutex_unlock(&wq_pool_attach_mutex); 2994 2995 reap_dying_workers(&cull_list); 2996 } 2997 2998 static void send_mayday(struct pool_workqueue *pwq) 2999 { 3000 struct workqueue_struct *wq = pwq->wq; 3001 3002 lockdep_assert_held(&wq_mayday_lock); 3003 3004 if (!wq->rescuer) 3005 return; 3006 3007 /* mayday mayday mayday */ 3008 if (list_empty(&pwq->mayday_node)) { 3009 /* 3010 * If @pwq is for an unbound wq, its base ref may be put at 3011 * any time due to an attribute change. Pin @pwq until the 3012 * rescuer is done with it. 3013 */ 3014 get_pwq(pwq); 3015 list_add_tail(&pwq->mayday_node, &wq->maydays); 3016 wake_up_process(wq->rescuer->task); 3017 pwq->stats[PWQ_STAT_MAYDAY]++; 3018 } 3019 } 3020 3021 static void pool_mayday_timeout(struct timer_list *t) 3022 { 3023 struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); 3024 struct work_struct *work; 3025 3026 raw_spin_lock_irq(&pool->lock); 3027 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ 3028 3029 if (need_to_create_worker(pool)) { 3030 /* 3031 * We've been trying to create a new worker but 3032 * haven't been successful. We might be hitting an 3033 * allocation deadlock. Send distress signals to 3034 * rescuers. 3035 */ 3036 list_for_each_entry(work, &pool->worklist, entry) 3037 send_mayday(get_work_pwq(work)); 3038 } 3039 3040 raw_spin_unlock(&wq_mayday_lock); 3041 raw_spin_unlock_irq(&pool->lock); 3042 3043 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 3044 } 3045 3046 /** 3047 * maybe_create_worker - create a new worker if necessary 3048 * @pool: pool to create a new worker for 3049 * 3050 * Create a new worker for @pool if necessary. @pool is guaranteed to 3051 * have at least one idle worker on return from this function. If 3052 * creating a new worker takes longer than MAYDAY_INITIAL_TIMEOUT, mayday is 3053 * sent to all rescuers with works scheduled on @pool to resolve 3054 * possible allocation deadlock. 3055 * 3056 * On return, need_to_create_worker() is guaranteed to be %false and 3057 * may_start_working() %true. 3058 * 3059 * LOCKING: 3060 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 3061 * multiple times. Does GFP_KERNEL allocations. Called only from 3062 * manager.
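 *
 * NOTE:
 * The mayday timer is armed before the first creation attempt, so
 * rescuers can be summoned even if create_worker() itself is stuck in
 * an allocation.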
3063 */ 3064 static void maybe_create_worker(struct worker_pool *pool) 3065 __releases(&pool->lock) 3066 __acquires(&pool->lock) 3067 { 3068 restart: 3069 raw_spin_unlock_irq(&pool->lock); 3070 3071 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 3072 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 3073 3074 while (true) { 3075 if (create_worker(pool) || !need_to_create_worker(pool)) 3076 break; 3077 3078 schedule_timeout_interruptible(CREATE_COOLDOWN); 3079 3080 if (!need_to_create_worker(pool)) 3081 break; 3082 } 3083 3084 timer_delete_sync(&pool->mayday_timer); 3085 raw_spin_lock_irq(&pool->lock); 3086 /* 3087 * This is necessary even after a new worker was just successfully 3088 * created as @pool->lock was dropped and the new worker might have 3089 * already become busy. 3090 */ 3091 if (need_to_create_worker(pool)) 3092 goto restart; 3093 } 3094 3095 #ifdef CONFIG_PREEMPT_RT 3096 static void worker_lock_callback(struct worker_pool *pool) 3097 { 3098 spin_lock(&pool->cb_lock); 3099 } 3100 3101 static void worker_unlock_callback(struct worker_pool *pool) 3102 { 3103 spin_unlock(&pool->cb_lock); 3104 } 3105 3106 static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) 3107 { 3108 spin_lock(&pool->cb_lock); 3109 spin_unlock(&pool->cb_lock); 3110 } 3111 3112 #else 3113 3114 static void worker_lock_callback(struct worker_pool *pool) { } 3115 static void worker_unlock_callback(struct worker_pool *pool) { } 3116 static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } 3117 3118 #endif 3119 3120 /** 3121 * manage_workers - manage worker pool 3122 * @worker: self 3123 * 3124 * Assume the manager role and manage the worker pool @worker belongs 3125 * to. At any given time, there can be only zero or one manager per 3126 * pool. The exclusion is handled automatically by this function. 3127 * 3128 * The caller can safely start processing works on false return. On 3129 * true return, it's guaranteed that need_to_create_worker() is false 3130 * and may_start_working() is true. 3131 * 3132 * CONTEXT: 3133 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 3134 * multiple times. Does GFP_KERNEL allocations. 3135 * 3136 * Return: 3137 * %false if the pool doesn't need management and the caller can safely 3138 * start processing works, %true if management function was performed and 3139 * the conditions that the caller verified before calling the function may 3140 * no longer be true. 3141 */ 3142 static bool manage_workers(struct worker *worker) 3143 { 3144 struct worker_pool *pool = worker->pool; 3145 3146 if (pool->flags & POOL_MANAGER_ACTIVE) 3147 return false; 3148 3149 pool->flags |= POOL_MANAGER_ACTIVE; 3150 pool->manager = worker; 3151 3152 maybe_create_worker(pool); 3153 3154 pool->manager = NULL; 3155 pool->flags &= ~POOL_MANAGER_ACTIVE; 3156 rcuwait_wake_up(&manager_wait); 3157 return true; 3158 } 3159 3160 /** 3161 * process_one_work - process single work 3162 * @worker: self 3163 * @work: work to process 3164 * 3165 * Process @work. This function contains all the logics necessary to 3166 * process a single work including synchronization against and 3167 * interaction with other workers on the same cpu, queueing and 3168 * flushing. As long as context requirement is met, any worker can 3169 * call this function to process a work. 3170 * 3171 * CONTEXT: 3172 * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 
3173 */ 3174 static void process_one_work(struct worker *worker, struct work_struct *work) 3175 __releases(&pool->lock) 3176 __acquires(&pool->lock) 3177 { 3178 struct pool_workqueue *pwq = get_work_pwq(work); 3179 struct worker_pool *pool = worker->pool; 3180 unsigned long work_data; 3181 int lockdep_start_depth, rcu_start_depth; 3182 bool bh_draining = pool->flags & POOL_BH_DRAINING; 3183 #ifdef CONFIG_LOCKDEP 3184 /* 3185 * It is permissible to free the struct work_struct from 3186 * inside the function that is called from it, this we need to 3187 * take into account for lockdep too. To avoid bogus "held 3188 * lock freed" warnings as well as problems when looking into 3189 * work->lockdep_map, make a copy and use that here. 3190 */ 3191 struct lockdep_map lockdep_map; 3192 3193 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 3194 #endif 3195 /* ensure we're on the correct CPU */ 3196 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && 3197 raw_smp_processor_id() != pool->cpu); 3198 3199 /* claim and dequeue */ 3200 debug_work_deactivate(work); 3201 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); 3202 worker->current_work = work; 3203 worker->current_func = work->func; 3204 worker->current_pwq = pwq; 3205 if (worker->task) 3206 worker->current_at = worker->task->se.sum_exec_runtime; 3207 work_data = *work_data_bits(work); 3208 worker->current_color = get_work_color(work_data); 3209 3210 /* 3211 * Record wq name for cmdline and debug reporting, may get 3212 * overridden through set_worker_desc(). 3213 */ 3214 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); 3215 3216 list_del_init(&work->entry); 3217 3218 /* 3219 * CPU intensive works don't participate in concurrency management. 3220 * They're the scheduler's responsibility. This takes @worker out 3221 * of concurrency management and the next code block will chain 3222 * execution of the pending work items. 3223 */ 3224 if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) 3225 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 3226 3227 /* 3228 * Kick @pool if necessary. It's always noop for per-cpu worker pools 3229 * since nr_running would always be >= 1 at this point. This is used to 3230 * chain execution of the pending work items for WORKER_NOT_RUNNING 3231 * workers such as the UNBOUND and CPU_INTENSIVE ones. 3232 */ 3233 kick_pool(pool); 3234 3235 /* 3236 * Record the last pool and clear PENDING which should be the last 3237 * update to @work. Also, do this inside @pool->lock so that 3238 * PENDING and queued state changes happen together while IRQ is 3239 * disabled. 3240 */ 3241 set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); 3242 3243 pwq->stats[PWQ_STAT_STARTED]++; 3244 raw_spin_unlock_irq(&pool->lock); 3245 3246 rcu_start_depth = rcu_preempt_depth(); 3247 lockdep_start_depth = lockdep_depth(current); 3248 /* see drain_dead_softirq_workfn() */ 3249 if (!bh_draining) 3250 lock_map_acquire(pwq->wq->lockdep_map); 3251 lock_map_acquire(&lockdep_map); 3252 /* 3253 * Strictly speaking we should mark the invariant state without holding 3254 * any locks, that is, before these two lock_map_acquire()'s. 3255 * 3256 * However, that would result in: 3257 * 3258 * A(W1) 3259 * WFC(C) 3260 * A(W1) 3261 * C(C) 3262 * 3263 * Which would create W1->C->W1 dependencies, even though there is no 3264 * actual deadlock possible. 
There are two solutions, using a 3265 * read-recursive acquire on the work(queue) 'locks', but this will then 3266 * hit the lockdep limitation on recursive locks, or simply discard 3267 * these locks. 3268 * 3269 * AFAICT there is no possible deadlock scenario between the 3270 * flush_work() and complete() primitives (except for single-threaded 3271 * workqueues), so hiding them isn't a problem. 3272 */ 3273 lockdep_invariant_state(true); 3274 trace_workqueue_execute_start(work); 3275 worker->current_func(work); 3276 /* 3277 * While we must be careful to not use "work" after this, the trace 3278 * point will only record its address. 3279 */ 3280 trace_workqueue_execute_end(work, worker->current_func); 3281 3282 lock_map_release(&lockdep_map); 3283 if (!bh_draining) 3284 lock_map_release(pwq->wq->lockdep_map); 3285 3286 if (unlikely((worker->task && in_atomic()) || 3287 lockdep_depth(current) != lockdep_start_depth || 3288 rcu_preempt_depth() != rcu_start_depth)) { 3289 pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" 3290 " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", 3291 current->comm, task_pid_nr(current), preempt_count(), 3292 lockdep_start_depth, lockdep_depth(current), 3293 rcu_start_depth, rcu_preempt_depth(), 3294 worker->current_func); 3295 debug_show_held_locks(current); 3296 dump_stack(); 3297 } 3298 3299 /* 3300 * The following prevents a kworker from hogging CPU on !PREEMPTION 3301 * kernels, where a requeueing work item waiting for something to 3302 * happen could deadlock with stop_machine as such work item could 3303 * indefinitely requeue itself while all other CPUs are trapped in 3304 * stop_machine. At the same time, report a quiescent RCU state so 3305 * the same condition doesn't freeze RCU. 3306 */ 3307 if (worker->task) 3308 cond_resched(); 3309 3310 raw_spin_lock_irq(&pool->lock); 3311 3312 pwq->stats[PWQ_STAT_COMPLETED]++; 3313 3314 /* 3315 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked 3316 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than 3317 * wq_cpu_intensive_thresh_us. Clear it. 3318 */ 3319 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 3320 3321 /* tag the worker for identification in schedule() */ 3322 worker->last_func = worker->current_func; 3323 3324 /* we're done with it, release */ 3325 hash_del(&worker->hentry); 3326 worker->current_work = NULL; 3327 worker->current_func = NULL; 3328 worker->current_pwq = NULL; 3329 worker->current_color = INT_MAX; 3330 3331 /* must be the last step, see the function comment */ 3332 pwq_dec_nr_in_flight(pwq, work_data); 3333 } 3334 3335 /** 3336 * process_scheduled_works - process scheduled works 3337 * @worker: self 3338 * 3339 * Process all scheduled works. Please note that the scheduled list 3340 * may change while processing a work, so this function repeatedly 3341 * fetches a work from the top and executes it. 3342 * 3343 * CONTEXT: 3344 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 3345 * multiple times. 
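 *
 * Work items land on @worker->scheduled through assign_work(), together
 * with any barrier work items linked to them.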
3346 */ 3347 static void process_scheduled_works(struct worker *worker) 3348 { 3349 struct work_struct *work; 3350 bool first = true; 3351 3352 while ((work = list_first_entry_or_null(&worker->scheduled, 3353 struct work_struct, entry))) { 3354 if (first) { 3355 worker->pool->watchdog_ts = jiffies; 3356 first = false; 3357 } 3358 process_one_work(worker, work); 3359 } 3360 } 3361 3362 static void set_pf_worker(bool val) 3363 { 3364 mutex_lock(&wq_pool_attach_mutex); 3365 if (val) 3366 current->flags |= PF_WQ_WORKER; 3367 else 3368 current->flags &= ~PF_WQ_WORKER; 3369 mutex_unlock(&wq_pool_attach_mutex); 3370 } 3371 3372 /** 3373 * worker_thread - the worker thread function 3374 * @__worker: self 3375 * 3376 * The worker thread function. All workers belong to a worker_pool - 3377 * either a per-cpu one or dynamic unbound one. These workers process all 3378 * work items regardless of their specific target workqueue. The only 3379 * exception is work items which belong to workqueues with a rescuer which 3380 * will be explained in rescuer_thread(). 3381 * 3382 * Return: 0 3383 */ 3384 static int worker_thread(void *__worker) 3385 { 3386 struct worker *worker = __worker; 3387 struct worker_pool *pool = worker->pool; 3388 3389 /* tell the scheduler that this is a workqueue worker */ 3390 set_pf_worker(true); 3391 woke_up: 3392 raw_spin_lock_irq(&pool->lock); 3393 3394 /* am I supposed to die? */ 3395 if (unlikely(worker->flags & WORKER_DIE)) { 3396 raw_spin_unlock_irq(&pool->lock); 3397 set_pf_worker(false); 3398 /* 3399 * The worker is dead and PF_WQ_WORKER is cleared, worker->pool 3400 * shouldn't be accessed, reset it to NULL in case otherwise. 3401 */ 3402 worker->pool = NULL; 3403 ida_free(&pool->worker_ida, worker->id); 3404 return 0; 3405 } 3406 3407 worker_leave_idle(worker); 3408 recheck: 3409 /* no more worker necessary? */ 3410 if (!need_more_worker(pool)) 3411 goto sleep; 3412 3413 /* do we need to manage? */ 3414 if (unlikely(!may_start_working(pool)) && manage_workers(worker)) 3415 goto recheck; 3416 3417 /* 3418 * ->scheduled list can only be filled while a worker is 3419 * preparing to process a work or actually processing it. 3420 * Make sure nobody diddled with it while I was sleeping. 3421 */ 3422 WARN_ON_ONCE(!list_empty(&worker->scheduled)); 3423 3424 /* 3425 * Finish PREP stage. We're guaranteed to have at least one idle 3426 * worker or that someone else has already assumed the manager 3427 * role. This is where @worker starts participating in concurrency 3428 * management if applicable and concurrency management is restored 3429 * after being rebound. See rebind_workers() for details. 3430 */ 3431 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); 3432 3433 do { 3434 struct work_struct *work = 3435 list_first_entry(&pool->worklist, 3436 struct work_struct, entry); 3437 3438 if (assign_work(work, worker, NULL)) 3439 process_scheduled_works(worker); 3440 } while (keep_working(pool)); 3441 3442 worker_set_flags(worker, WORKER_PREP); 3443 sleep: 3444 /* 3445 * pool->lock is held and there's no work to process and no need to 3446 * manage, sleep. Workers are woken up only while holding 3447 * pool->lock or from local cpu, so setting the current state 3448 * before releasing pool->lock is enough to prevent losing any 3449 * event. 
3450 */ 3451 worker_enter_idle(worker); 3452 __set_current_state(TASK_IDLE); 3453 raw_spin_unlock_irq(&pool->lock); 3454 schedule(); 3455 goto woke_up; 3456 } 3457 3458 static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) 3459 { 3460 struct worker_pool *pool = pwq->pool; 3461 struct work_struct *cursor = &pwq->mayday_cursor; 3462 struct work_struct *work, *n; 3463 3464 /* have work items to rescue? */ 3465 if (!pwq->nr_active) 3466 return false; 3467 3468 /* need rescue? */ 3469 if (!need_to_create_worker(pool)) { 3470 /* 3471 * The pool has idle workers and doesn't need the rescuer, so it 3472 * could simply return false here. 3473 * 3474 * However, the memory pressure might not be fully relieved. 3475 * In PERCPU pool with concurrency enabled, having idle workers 3476 * does not necessarily mean memory pressure is gone; it may 3477 * simply mean regular workers have woken up, completed their 3478 * work, and gone idle again due to concurrency limits. 3479 * 3480 * In this case, those working workers may later sleep again, 3481 * the pool may run out of idle workers, and it will have to 3482 * allocate new ones and wait for the timer to send mayday, 3483 * causing unnecessary delay - especially if memory pressure 3484 * was never resolved throughout. 3485 * 3486 * Do more work if memory pressure is still on to reduce 3487 * relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though 3488 * not precisely, unless there are other PWQs needing help. 3489 */ 3490 if (!(pool->flags & POOL_MANAGER_ACTIVE) || 3491 !list_empty(&pwq->wq->maydays)) 3492 return false; 3493 } 3494 3495 /* search from the start or cursor if available */ 3496 if (list_empty(&cursor->entry)) 3497 work = list_first_entry(&pool->worklist, struct work_struct, entry); 3498 else 3499 work = list_next_entry(cursor, entry); 3500 3501 /* find the next work item to rescue */ 3502 list_for_each_entry_safe_from(work, n, &pool->worklist, entry) { 3503 if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) { 3504 pwq->stats[PWQ_STAT_RESCUED]++; 3505 /* put the cursor for next search */ 3506 list_move_tail(&cursor->entry, &n->entry); 3507 return true; 3508 } 3509 } 3510 3511 return false; 3512 } 3513 3514 /** 3515 * rescuer_thread - the rescuer thread function 3516 * @__rescuer: self 3517 * 3518 * Workqueue rescuer thread function. There's one rescuer for each 3519 * workqueue which has WQ_MEM_RECLAIM set. 3520 * 3521 * Regular work processing on a pool may block trying to create a new 3522 * worker which uses GFP_KERNEL allocation which has slight chance of 3523 * developing into deadlock if some works currently on the same queue 3524 * need to be processed to satisfy the GFP_KERNEL allocation. This is 3525 * the problem rescuer solves. 3526 * 3527 * When such condition is possible, the pool summons rescuers of all 3528 * workqueues which have works queued on the pool and let them process 3529 * those works so that forward progress can be guaranteed. 3530 * 3531 * This should happen rarely. 3532 * 3533 * Return: 0 3534 */ 3535 static int rescuer_thread(void *__rescuer) 3536 { 3537 struct worker *rescuer = __rescuer; 3538 struct workqueue_struct *wq = rescuer->rescue_wq; 3539 bool should_stop; 3540 3541 set_user_nice(current, RESCUER_NICE_LEVEL); 3542 3543 /* 3544 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it 3545 * doesn't participate in concurrency management. 
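 *
 * For reference, a rescuer only exists for workqueues created with
 * %WQ_MEM_RECLAIM (illustrative sketch, hypothetical names):
 *
 *        wq = alloc_workqueue("my_reclaim_wq", WQ_MEM_RECLAIM, 0);
 *        queue_work(wq, &my_work);    // this rescuer guarantees forward
 *                                     // progress under memory pressure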
3546 */ 3547 set_pf_worker(true); 3548 repeat: 3549 set_current_state(TASK_IDLE); 3550 3551 /* 3552 * By the time the rescuer is requested to stop, the workqueue 3553 * shouldn't have any work pending, but @wq->maydays may still have 3554 * pwq(s) queued. This can happen by non-rescuer workers consuming 3555 * all the work items before the rescuer got to them. Go through 3556 * @wq->maydays processing before acting on should_stop so that the 3557 * list is always empty on exit. 3558 */ 3559 should_stop = kthread_should_stop(); 3560 3561 /* see whether any pwq is asking for help */ 3562 raw_spin_lock_irq(&wq_mayday_lock); 3563 3564 while (!list_empty(&wq->maydays)) { 3565 struct pool_workqueue *pwq = list_first_entry(&wq->maydays, 3566 struct pool_workqueue, mayday_node); 3567 struct worker_pool *pool = pwq->pool; 3568 unsigned int count = 0; 3569 3570 __set_current_state(TASK_RUNNING); 3571 list_del_init(&pwq->mayday_node); 3572 3573 raw_spin_unlock_irq(&wq_mayday_lock); 3574 3575 worker_attach_to_pool(rescuer, pool); 3576 3577 raw_spin_lock_irq(&pool->lock); 3578 3579 WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); 3580 3581 while (assign_rescuer_work(pwq, rescuer)) { 3582 process_scheduled_works(rescuer); 3583 3584 /* 3585 * If the per-turn work item limit is reached and other 3586 * PWQs are in mayday, requeue mayday for this PWQ and 3587 * let the rescuer handle the other PWQs first. 3588 */ 3589 if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) && 3590 pwq->nr_active && need_to_create_worker(pool)) { 3591 raw_spin_lock(&wq_mayday_lock); 3592 send_mayday(pwq); 3593 raw_spin_unlock(&wq_mayday_lock); 3594 break; 3595 } 3596 } 3597 3598 /* The cursor can not be left behind without the rescuer watching it. */ 3599 if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node)) 3600 list_del_init(&pwq->mayday_cursor.entry); 3601 3602 /* 3603 * Leave this pool. Notify regular workers; otherwise, we end up 3604 * with 0 concurrency and stalling the execution. 3605 */ 3606 kick_pool(pool); 3607 3608 raw_spin_unlock_irq(&pool->lock); 3609 3610 worker_detach_from_pool(rescuer); 3611 3612 /* 3613 * Put the reference grabbed by send_mayday(). @pool might 3614 * go away any time after it. 3615 */ 3616 put_pwq_unlocked(pwq); 3617 3618 raw_spin_lock_irq(&wq_mayday_lock); 3619 } 3620 3621 raw_spin_unlock_irq(&wq_mayday_lock); 3622 3623 if (should_stop) { 3624 __set_current_state(TASK_RUNNING); 3625 set_pf_worker(false); 3626 return 0; 3627 } 3628 3629 /* rescuers should never participate in concurrency management */ 3630 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 3631 schedule(); 3632 goto repeat; 3633 } 3634 3635 static void bh_worker(struct worker *worker) 3636 { 3637 struct worker_pool *pool = worker->pool; 3638 int nr_restarts = BH_WORKER_RESTARTS; 3639 unsigned long end = jiffies + BH_WORKER_JIFFIES; 3640 3641 worker_lock_callback(pool); 3642 raw_spin_lock_irq(&pool->lock); 3643 worker_leave_idle(worker); 3644 3645 /* 3646 * This function follows the structure of worker_thread(). See there for 3647 * explanations on each step. 
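 *
 * BH pools back workqueues created with %WQ_BH, e.g. system_bh_wq. An
 * illustrative way a work item ends up here (hypothetical names):
 *
 *        INIT_WORK(&my_work, my_fn);
 *        queue_work(system_bh_wq, &my_work);  // my_fn() runs in softirq
 *                                             // context via bh_worker()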
3648 */ 3649 if (!need_more_worker(pool)) 3650 goto done; 3651 3652 WARN_ON_ONCE(!list_empty(&worker->scheduled)); 3653 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); 3654 3655 do { 3656 struct work_struct *work = 3657 list_first_entry(&pool->worklist, 3658 struct work_struct, entry); 3659 3660 if (assign_work(work, worker, NULL)) 3661 process_scheduled_works(worker); 3662 } while (keep_working(pool) && 3663 --nr_restarts && time_before(jiffies, end)); 3664 3665 worker_set_flags(worker, WORKER_PREP); 3666 done: 3667 worker_enter_idle(worker); 3668 kick_pool(pool); 3669 raw_spin_unlock_irq(&pool->lock); 3670 worker_unlock_callback(pool); 3671 } 3672 3673 /* 3674 * TODO: Convert all tasklet users to workqueue and use softirq directly. 3675 * 3676 * This is currently called from tasklet[_hi]action() and thus is also called 3677 * whenever there are tasklets to run. Let's do an early exit if there's nothing 3678 * queued. Once conversion from tasklet is complete, the need_more_worker() test 3679 * can be dropped. 3680 * 3681 * After full conversion, we'll add worker->softirq_action, directly use the 3682 * softirq action and obtain the worker pointer from the softirq_action pointer. 3683 */ 3684 void workqueue_softirq_action(bool highpri) 3685 { 3686 struct worker_pool *pool = 3687 &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; 3688 if (need_more_worker(pool)) 3689 bh_worker(list_first_entry(&pool->workers, struct worker, node)); 3690 } 3691 3692 struct wq_drain_dead_softirq_work { 3693 struct work_struct work; 3694 struct worker_pool *pool; 3695 struct completion done; 3696 }; 3697 3698 static void drain_dead_softirq_workfn(struct work_struct *work) 3699 { 3700 struct wq_drain_dead_softirq_work *dead_work = 3701 container_of(work, struct wq_drain_dead_softirq_work, work); 3702 struct worker_pool *pool = dead_work->pool; 3703 bool repeat; 3704 3705 /* 3706 * @pool's CPU is dead and we want to execute its still pending work 3707 * items from this BH work item which is running on a different CPU. As 3708 * its CPU is dead, @pool can't be kicked and, as work execution path 3709 * will be nested, a lockdep annotation needs to be suppressed. Mark 3710 * @pool with %POOL_BH_DRAINING for the special treatments. 3711 */ 3712 raw_spin_lock_irq(&pool->lock); 3713 pool->flags |= POOL_BH_DRAINING; 3714 raw_spin_unlock_irq(&pool->lock); 3715 3716 bh_worker(list_first_entry(&pool->workers, struct worker, node)); 3717 3718 raw_spin_lock_irq(&pool->lock); 3719 pool->flags &= ~POOL_BH_DRAINING; 3720 repeat = need_more_worker(pool); 3721 raw_spin_unlock_irq(&pool->lock); 3722 3723 /* 3724 * bh_worker() might hit consecutive execution limit and bail. If there 3725 * still are pending work items, reschedule self and return so that we 3726 * don't hog this CPU's BH. 3727 */ 3728 if (repeat) { 3729 if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) 3730 queue_work(system_bh_highpri_wq, work); 3731 else 3732 queue_work(system_bh_wq, work); 3733 } else { 3734 complete(&dead_work->done); 3735 } 3736 } 3737 3738 /* 3739 * @cpu is dead. Drain the remaining BH work items on the current CPU. It's 3740 * possible to allocate dead_work per CPU and avoid flushing. However, then we 3741 * have to worry about draining overlapping with CPU coming back online or 3742 * nesting (one CPU's dead_work queued on another CPU which is also dead and so 3743 * on). Let's keep it simple and drain them synchronously. These are BH work 3744 * items which shouldn't be requeued on the same pool. Shouldn't take long. 
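 *
 * The drain below is an instance of the usual "run once and wait" idiom
 * built from an on-stack work item and a completion (illustrative sketch,
 * hypothetical names; the work function ends with complete(&o.done)):
 *
 *        struct once { struct work_struct work; struct completion done; } o;
 *
 *        INIT_WORK_ONSTACK(&o.work, once_fn);
 *        init_completion(&o.done);
 *        queue_work(some_wq, &o.work);
 *        wait_for_completion(&o.done);
 *        destroy_work_on_stack(&o.work);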
3745 */ 3746 void workqueue_softirq_dead(unsigned int cpu) 3747 { 3748 int i; 3749 3750 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 3751 struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; 3752 struct wq_drain_dead_softirq_work dead_work; 3753 3754 if (!need_more_worker(pool)) 3755 continue; 3756 3757 INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); 3758 dead_work.pool = pool; 3759 init_completion(&dead_work.done); 3760 3761 if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) 3762 queue_work(system_bh_highpri_wq, &dead_work.work); 3763 else 3764 queue_work(system_bh_wq, &dead_work.work); 3765 3766 wait_for_completion(&dead_work.done); 3767 destroy_work_on_stack(&dead_work.work); 3768 } 3769 } 3770 3771 /** 3772 * check_flush_dependency - check for flush dependency sanity 3773 * @target_wq: workqueue being flushed 3774 * @target_work: work item being flushed (NULL for workqueue flushes) 3775 * @from_cancel: are we called from the work cancel path 3776 * 3777 * %current is trying to flush the whole @target_wq or @target_work on it. 3778 * If this is not the cancel path (which implies work being flushed is either 3779 * already running, or will not be at all), check if @target_wq doesn't have 3780 * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running 3781 * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- 3782 * progress guarantee leading to a deadlock. 3783 */ 3784 static void check_flush_dependency(struct workqueue_struct *target_wq, 3785 struct work_struct *target_work, 3786 bool from_cancel) 3787 { 3788 work_func_t target_func; 3789 struct worker *worker; 3790 3791 if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) 3792 return; 3793 3794 worker = current_wq_worker(); 3795 target_func = target_work ? target_work->func : NULL; 3796 3797 WARN_ONCE(current->flags & PF_MEMALLOC, 3798 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", 3799 current->pid, current->comm, target_wq->name, target_func); 3800 WARN_ONCE(worker && ((worker->current_pwq->wq->flags & 3801 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), 3802 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", 3803 worker->current_pwq->wq->name, worker->current_func, 3804 target_wq->name, target_func); 3805 } 3806 3807 struct wq_barrier { 3808 struct work_struct work; 3809 struct completion done; 3810 struct task_struct *task; /* purely informational */ 3811 }; 3812 3813 static void wq_barrier_func(struct work_struct *work) 3814 { 3815 struct wq_barrier *barr = container_of(work, struct wq_barrier, work); 3816 complete(&barr->done); 3817 } 3818 3819 /** 3820 * insert_wq_barrier - insert a barrier work 3821 * @pwq: pwq to insert barrier into 3822 * @barr: wq_barrier to insert 3823 * @target: target work to attach @barr to 3824 * @worker: worker currently executing @target, NULL if @target is not executing 3825 * 3826 * @barr is linked to @target such that @barr is completed only after 3827 * @target finishes execution. Please note that the ordering 3828 * guarantee is observed only with respect to @target and on the local 3829 * cpu. 3830 * 3831 * Currently, a queued barrier can't be canceled. This is because 3832 * try_to_grab_pending() can't determine whether the work to be 3833 * grabbed is at the head of the queue and thus can't clear LINKED 3834 * flag of the previous work while there must be a valid next work 3835 * after a work with LINKED flag set. 
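 *
 * An illustrative snapshot after flushing a still-queued work item A
 * (hypothetical, details elided):
 *
 *        pool->worklist:  A [LINKED] -> barrier(A) -> B -> ...
 *
 * The LINKED flag on A tells the dequeueing paths that the entry
 * following A is a barrier which has to be moved and completed together
 * with A, which is why a barrier in the middle can't simply be removed.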
3836 * 3837 * Note that when @worker is non-NULL, @target may be modified 3838 * underneath us, so we can't reliably determine pwq from @target. 3839 * 3840 * CONTEXT: 3841 * raw_spin_lock_irq(pool->lock). 3842 */ 3843 static void insert_wq_barrier(struct pool_workqueue *pwq, 3844 struct wq_barrier *barr, 3845 struct work_struct *target, struct worker *worker) 3846 { 3847 static __maybe_unused struct lock_class_key bh_key, thr_key; 3848 unsigned int work_flags = 0; 3849 unsigned int work_color; 3850 struct list_head *head; 3851 3852 /* 3853 * debugobject calls are safe here even with pool->lock locked 3854 * as we know for sure that this will not trigger any of the 3855 * checks and call back into the fixup functions where we 3856 * might deadlock. 3857 * 3858 * BH and threaded workqueues need separate lockdep keys to avoid 3859 * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} 3860 * usage". 3861 */ 3862 INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, 3863 (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); 3864 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 3865 3866 init_completion_map(&barr->done, &target->lockdep_map); 3867 3868 barr->task = current; 3869 3870 /* The barrier work item does not participate in nr_active. */ 3871 work_flags |= WORK_STRUCT_INACTIVE; 3872 3873 /* 3874 * If @target is currently being executed, schedule the 3875 * barrier to the worker; otherwise, put it after @target. 3876 */ 3877 if (worker) { 3878 head = worker->scheduled.next; 3879 work_color = worker->current_color; 3880 } else { 3881 unsigned long *bits = work_data_bits(target); 3882 3883 head = target->entry.next; 3884 /* there can already be other linked works, inherit and set */ 3885 work_flags |= *bits & WORK_STRUCT_LINKED; 3886 work_color = get_work_color(*bits); 3887 __set_bit(WORK_STRUCT_LINKED_BIT, bits); 3888 } 3889 3890 pwq->nr_in_flight[work_color]++; 3891 work_flags |= work_color_to_flags(work_color); 3892 3893 insert_work(pwq, &barr->work, head, work_flags); 3894 } 3895 3896 /** 3897 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing 3898 * @wq: workqueue being flushed 3899 * @flush_color: new flush color, < 0 for no-op 3900 * @work_color: new work color, < 0 for no-op 3901 * 3902 * Prepare pwqs for workqueue flushing. 3903 * 3904 * If @flush_color is non-negative, flush_color on all pwqs should be 3905 * -1. If no pwq has in-flight commands at the specified color, all 3906 * pwq->flush_color's stay at -1 and %false is returned. If any pwq 3907 * has in flight commands, its pwq->flush_color is set to 3908 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq 3909 * wakeup logic is armed and %true is returned. 3910 * 3911 * The caller should have initialized @wq->first_flusher prior to 3912 * calling this function with non-negative @flush_color. If 3913 * @flush_color is negative, no flush color update is done and %false 3914 * is returned. 3915 * 3916 * If @work_color is non-negative, all pwqs should have the same 3917 * work_color which is previous to @work_color and all will be 3918 * advanced to @work_color. 3919 * 3920 * CONTEXT: 3921 * mutex_lock(wq->mutex). 3922 * 3923 * Return: 3924 * %true if @flush_color >= 0 and there's something to flush. %false 3925 * otherwise. 
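 *
 * For reference, __flush_workqueue() below uses both modes:
 *
 *        flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color);
 *        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
 *
 * the first to start flushing a color and advance the work color in one
 * go, the second to only advance the work color for flushers still
 * waiting in queue.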
3926 */
3927 static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
3928 int flush_color, int work_color)
3929 {
3930 bool wait = false;
3931 struct pool_workqueue *pwq;
3932 struct worker_pool *current_pool = NULL;
3933
3934 if (flush_color >= 0) {
3935 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
3936 atomic_set(&wq->nr_pwqs_to_flush, 1);
3937 }
3938
3939 /*
3940 * For unbound workqueue, pwqs will map to only a few pools.
3941 * Most of the time, pwqs within the same pool will be linked
3942 * sequentially to wq->pwqs by cpu index. So in the majority
3943 * of pwq iters, the pool is the same, only doing lock/unlock
3944 * if the pool has changed. This can largely reduce expensive
3945 * lock operations.
3946 */
3947 for_each_pwq(pwq, wq) {
3948 if (current_pool != pwq->pool) {
3949 if (likely(current_pool))
3950 raw_spin_unlock_irq(&current_pool->lock);
3951 current_pool = pwq->pool;
3952 raw_spin_lock_irq(&current_pool->lock);
3953 }
3954
3955 if (flush_color >= 0) {
3956 WARN_ON_ONCE(pwq->flush_color != -1);
3957
3958 if (pwq->nr_in_flight[flush_color]) {
3959 pwq->flush_color = flush_color;
3960 atomic_inc(&wq->nr_pwqs_to_flush);
3961 wait = true;
3962 }
3963 }
3964
3965 if (work_color >= 0) {
3966 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
3967 pwq->work_color = work_color;
3968 }
3969
3970 }
3971
3972 if (current_pool)
3973 raw_spin_unlock_irq(&current_pool->lock);
3974
3975 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
3976 complete(&wq->first_flusher->done);
3977
3978 return wait;
3979 }
3980
3981 static void touch_wq_lockdep_map(struct workqueue_struct *wq)
3982 {
3983 #ifdef CONFIG_LOCKDEP
3984 if (unlikely(!wq->lockdep_map))
3985 return;
3986
3987 if (wq->flags & WQ_BH)
3988 local_bh_disable();
3989
3990 lock_map_acquire(wq->lockdep_map);
3991 lock_map_release(wq->lockdep_map);
3992
3993 if (wq->flags & WQ_BH)
3994 local_bh_enable();
3995 #endif
3996 }
3997
3998 static void touch_work_lockdep_map(struct work_struct *work,
3999 struct workqueue_struct *wq)
4000 {
4001 #ifdef CONFIG_LOCKDEP
4002 if (wq->flags & WQ_BH)
4003 local_bh_disable();
4004
4005 lock_map_acquire(&work->lockdep_map);
4006 lock_map_release(&work->lockdep_map);
4007
4008 if (wq->flags & WQ_BH)
4009 local_bh_enable();
4010 #endif
4011 }
4012
4013 /**
4014 * __flush_workqueue - ensure that any scheduled work has run to completion.
4015 * @wq: workqueue to flush
4016 *
4017 * This function sleeps until all work items which were queued on entry
4018 * have finished execution, but it is not livelocked by new incoming ones.
4019 */
4020 void __flush_workqueue(struct workqueue_struct *wq)
4021 {
4022 struct wq_flusher this_flusher = {
4023 .list = LIST_HEAD_INIT(this_flusher.list),
4024 .flush_color = -1,
4025 .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
4026 };
4027 int next_color;
4028
4029 if (WARN_ON(!wq_online))
4030 return;
4031
4032 touch_wq_lockdep_map(wq);
4033
4034 mutex_lock(&wq->mutex);
4035
4036 /*
4037 * Start-to-wait phase
4038 */
4039 next_color = work_next_color(wq->work_color);
4040
4041 if (next_color != wq->flush_color) {
4042 /*
4043 * Color space is not full. The current work_color
4044 * becomes our flush_color and work_color is advanced
4045 * by one.
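 *
 * For example (illustrative numbers): with wq->work_color == 3, this
 * flusher waits for color 3 while newly queued work items are tagged
 * with color 4; the flush completes once nr_in_flight[3] drops to zero
 * on every pwq.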
4046 */ 4047 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); 4048 this_flusher.flush_color = wq->work_color; 4049 wq->work_color = next_color; 4050 4051 if (!wq->first_flusher) { 4052 /* no flush in progress, become the first flusher */ 4053 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 4054 4055 wq->first_flusher = &this_flusher; 4056 4057 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, 4058 wq->work_color)) { 4059 /* nothing to flush, done */ 4060 wq->flush_color = next_color; 4061 wq->first_flusher = NULL; 4062 goto out_unlock; 4063 } 4064 } else { 4065 /* wait in queue */ 4066 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); 4067 list_add_tail(&this_flusher.list, &wq->flusher_queue); 4068 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 4069 } 4070 } else { 4071 /* 4072 * Oops, color space is full, wait on overflow queue. 4073 * The next flush completion will assign us 4074 * flush_color and transfer to flusher_queue. 4075 */ 4076 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 4077 } 4078 4079 check_flush_dependency(wq, NULL, false); 4080 4081 mutex_unlock(&wq->mutex); 4082 4083 wait_for_completion(&this_flusher.done); 4084 4085 /* 4086 * Wake-up-and-cascade phase 4087 * 4088 * First flushers are responsible for cascading flushes and 4089 * handling overflow. Non-first flushers can simply return. 4090 */ 4091 if (READ_ONCE(wq->first_flusher) != &this_flusher) 4092 return; 4093 4094 mutex_lock(&wq->mutex); 4095 4096 /* we might have raced, check again with mutex held */ 4097 if (wq->first_flusher != &this_flusher) 4098 goto out_unlock; 4099 4100 WRITE_ONCE(wq->first_flusher, NULL); 4101 4102 WARN_ON_ONCE(!list_empty(&this_flusher.list)); 4103 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 4104 4105 while (true) { 4106 struct wq_flusher *next, *tmp; 4107 4108 /* complete all the flushers sharing the current flush color */ 4109 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { 4110 if (next->flush_color != wq->flush_color) 4111 break; 4112 list_del_init(&next->list); 4113 complete(&next->done); 4114 } 4115 4116 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && 4117 wq->flush_color != work_next_color(wq->work_color)); 4118 4119 /* this flush_color is finished, advance by one */ 4120 wq->flush_color = work_next_color(wq->flush_color); 4121 4122 /* one color has been freed, handle overflow queue */ 4123 if (!list_empty(&wq->flusher_overflow)) { 4124 /* 4125 * Assign the same color to all overflowed 4126 * flushers, advance work_color and append to 4127 * flusher_queue. This is the start-to-wait 4128 * phase for these overflowed flushers. 4129 */ 4130 list_for_each_entry(tmp, &wq->flusher_overflow, list) 4131 tmp->flush_color = wq->work_color; 4132 4133 wq->work_color = work_next_color(wq->work_color); 4134 4135 list_splice_tail_init(&wq->flusher_overflow, 4136 &wq->flusher_queue); 4137 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 4138 } 4139 4140 if (list_empty(&wq->flusher_queue)) { 4141 WARN_ON_ONCE(wq->flush_color != wq->work_color); 4142 break; 4143 } 4144 4145 /* 4146 * Need to flush more colors. Make the next flusher 4147 * the new first flusher and arm pwqs. 4148 */ 4149 WARN_ON_ONCE(wq->flush_color == wq->work_color); 4150 WARN_ON_ONCE(wq->flush_color != next->flush_color); 4151 4152 list_del_init(&next->list); 4153 wq->first_flusher = next; 4154 4155 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) 4156 break; 4157 4158 /* 4159 * Meh... 
this color is already done, clear first 4160 * flusher and repeat cascading. 4161 */ 4162 wq->first_flusher = NULL; 4163 } 4164 4165 out_unlock: 4166 mutex_unlock(&wq->mutex); 4167 } 4168 EXPORT_SYMBOL(__flush_workqueue); 4169 4170 /** 4171 * drain_workqueue - drain a workqueue 4172 * @wq: workqueue to drain 4173 * 4174 * Wait until the workqueue becomes empty. While draining is in progress, 4175 * only chain queueing is allowed. IOW, only currently pending or running 4176 * work items on @wq can queue further work items on it. @wq is flushed 4177 * repeatedly until it becomes empty. The number of flushing is determined 4178 * by the depth of chaining and should be relatively short. Whine if it 4179 * takes too long. 4180 */ 4181 void drain_workqueue(struct workqueue_struct *wq) 4182 { 4183 unsigned int flush_cnt = 0; 4184 struct pool_workqueue *pwq; 4185 4186 /* 4187 * __queue_work() needs to test whether there are drainers, is much 4188 * hotter than drain_workqueue() and already looks at @wq->flags. 4189 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. 4190 */ 4191 mutex_lock(&wq->mutex); 4192 if (!wq->nr_drainers++) 4193 wq->flags |= __WQ_DRAINING; 4194 mutex_unlock(&wq->mutex); 4195 reflush: 4196 __flush_workqueue(wq); 4197 4198 mutex_lock(&wq->mutex); 4199 4200 for_each_pwq(pwq, wq) { 4201 bool drained; 4202 4203 raw_spin_lock_irq(&pwq->pool->lock); 4204 drained = pwq_is_empty(pwq); 4205 raw_spin_unlock_irq(&pwq->pool->lock); 4206 4207 if (drained) 4208 continue; 4209 4210 if (++flush_cnt == 10 || 4211 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 4212 pr_warn("workqueue %s: %s() isn't complete after %u tries\n", 4213 wq->name, __func__, flush_cnt); 4214 4215 mutex_unlock(&wq->mutex); 4216 goto reflush; 4217 } 4218 4219 if (!--wq->nr_drainers) 4220 wq->flags &= ~__WQ_DRAINING; 4221 mutex_unlock(&wq->mutex); 4222 } 4223 EXPORT_SYMBOL_GPL(drain_workqueue); 4224 4225 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 4226 bool from_cancel) 4227 { 4228 struct worker *worker = NULL; 4229 struct worker_pool *pool; 4230 struct pool_workqueue *pwq; 4231 struct workqueue_struct *wq; 4232 4233 rcu_read_lock(); 4234 pool = get_work_pool(work); 4235 if (!pool) { 4236 rcu_read_unlock(); 4237 return false; 4238 } 4239 4240 raw_spin_lock_irq(&pool->lock); 4241 /* see the comment in try_to_grab_pending() with the same code */ 4242 pwq = get_work_pwq(work); 4243 if (pwq) { 4244 if (unlikely(pwq->pool != pool)) 4245 goto already_gone; 4246 } else { 4247 worker = find_worker_executing_work(pool, work); 4248 if (!worker) 4249 goto already_gone; 4250 pwq = worker->current_pwq; 4251 } 4252 4253 wq = pwq->wq; 4254 check_flush_dependency(wq, work, from_cancel); 4255 4256 insert_wq_barrier(pwq, barr, work, worker); 4257 raw_spin_unlock_irq(&pool->lock); 4258 4259 touch_work_lockdep_map(work, wq); 4260 4261 /* 4262 * Force a lock recursion deadlock when using flush_work() inside a 4263 * single-threaded or rescuer equipped workqueue. 4264 * 4265 * For single threaded workqueues the deadlock happens when the work 4266 * is after the work issuing the flush_work(). For rescuer equipped 4267 * workqueues the deadlock happens when the rescuer stalls, blocking 4268 * forward progress. 
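 *
 * An illustrative way to hit the single-threaded case (hypothetical
 * names, not code from this file):
 *
 *        static struct workqueue_struct *ordered_wq;  // max_active == 1
 *        static struct work_struct a_work, b_work;    // both queued on it
 *
 *        static void a_fn(struct work_struct *work)
 *        {
 *                flush_work(&b_work);  // b_work sits behind a_work and
 *        }                             // can never start: deadlock
 *
 * Touching the wq lockdep map below turns such a hang into a lockdep
 * recursion report instead of a silent stall.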
4269 */ 4270 if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer)) 4271 touch_wq_lockdep_map(wq); 4272 4273 rcu_read_unlock(); 4274 return true; 4275 already_gone: 4276 raw_spin_unlock_irq(&pool->lock); 4277 rcu_read_unlock(); 4278 return false; 4279 } 4280 4281 static bool __flush_work(struct work_struct *work, bool from_cancel) 4282 { 4283 struct wq_barrier barr; 4284 4285 if (WARN_ON(!wq_online)) 4286 return false; 4287 4288 if (WARN_ON(!work->func)) 4289 return false; 4290 4291 if (!start_flush_work(work, &barr, from_cancel)) 4292 return false; 4293 4294 /* 4295 * start_flush_work() returned %true. If @from_cancel is set, we know 4296 * that @work must have been executing during start_flush_work() and 4297 * can't currently be queued. Its data must contain OFFQ bits. If @work 4298 * was queued on a BH workqueue, we also know that it was running in the 4299 * BH context and thus can be busy-waited. 4300 */ 4301 if (from_cancel) { 4302 unsigned long data = *work_data_bits(work); 4303 4304 if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && 4305 (data & WORK_OFFQ_BH)) { 4306 /* 4307 * On RT, prevent a live lock when %current preempted 4308 * soft interrupt processing by blocking on lock which 4309 * is owned by the thread invoking the callback. 4310 */ 4311 while (!try_wait_for_completion(&barr.done)) { 4312 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 4313 struct worker_pool *pool; 4314 4315 guard(rcu)(); 4316 pool = get_work_pool(work); 4317 if (pool) 4318 workqueue_callback_cancel_wait_running(pool); 4319 } else { 4320 cpu_relax(); 4321 } 4322 } 4323 goto out_destroy; 4324 } 4325 } 4326 4327 wait_for_completion(&barr.done); 4328 4329 out_destroy: 4330 destroy_work_on_stack(&barr.work); 4331 return true; 4332 } 4333 4334 /** 4335 * flush_work - wait for a work to finish executing the last queueing instance 4336 * @work: the work to flush 4337 * 4338 * Wait until @work has finished execution. @work is guaranteed to be idle 4339 * on return if it hasn't been requeued since flush started. 4340 * 4341 * Return: 4342 * %true if flush_work() waited for the work to finish execution, 4343 * %false if it was already idle. 4344 */ 4345 bool flush_work(struct work_struct *work) 4346 { 4347 might_sleep(); 4348 return __flush_work(work, false); 4349 } 4350 EXPORT_SYMBOL_GPL(flush_work); 4351 4352 /** 4353 * flush_delayed_work - wait for a dwork to finish executing the last queueing 4354 * @dwork: the delayed work to flush 4355 * 4356 * Delayed timer is cancelled and the pending work is queued for 4357 * immediate execution. Like flush_work(), this function only 4358 * considers the last queueing instance of @dwork. 4359 * 4360 * Return: 4361 * %true if flush_work() waited for the work to finish execution, 4362 * %false if it was already idle. 4363 */ 4364 bool flush_delayed_work(struct delayed_work *dwork) 4365 { 4366 local_irq_disable(); 4367 if (timer_delete_sync(&dwork->timer)) 4368 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 4369 local_irq_enable(); 4370 return flush_work(&dwork->work); 4371 } 4372 EXPORT_SYMBOL(flush_delayed_work); 4373 4374 /** 4375 * flush_rcu_work - wait for a rwork to finish executing the last queueing 4376 * @rwork: the rcu work to flush 4377 * 4378 * Return: 4379 * %true if flush_rcu_work() waited for the work to finish execution, 4380 * %false if it was already idle. 
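 *
 * Illustrative usage of the rcu_work API (hypothetical names):
 *
 *        INIT_RCU_WORK(&my_rwork, my_fn);
 *        queue_rcu_work(system_wq, &my_rwork);  // my_fn() runs after an
 *                                               // RCU grace period
 *        ...
 *        flush_rcu_work(&my_rwork);             // waits for both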
4381 */ 4382 bool flush_rcu_work(struct rcu_work *rwork) 4383 { 4384 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { 4385 rcu_barrier(); 4386 flush_work(&rwork->work); 4387 return true; 4388 } else { 4389 return flush_work(&rwork->work); 4390 } 4391 } 4392 EXPORT_SYMBOL(flush_rcu_work); 4393 4394 static void work_offqd_disable(struct work_offq_data *offqd) 4395 { 4396 const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; 4397 4398 if (likely(offqd->disable < max)) 4399 offqd->disable++; 4400 else 4401 WARN_ONCE(true, "workqueue: work disable count overflowed\n"); 4402 } 4403 4404 static void work_offqd_enable(struct work_offq_data *offqd) 4405 { 4406 if (likely(offqd->disable > 0)) 4407 offqd->disable--; 4408 else 4409 WARN_ONCE(true, "workqueue: work disable count underflowed\n"); 4410 } 4411 4412 static bool __cancel_work(struct work_struct *work, u32 cflags) 4413 { 4414 struct work_offq_data offqd; 4415 unsigned long irq_flags; 4416 int ret; 4417 4418 ret = work_grab_pending(work, cflags, &irq_flags); 4419 4420 work_offqd_unpack(&offqd, *work_data_bits(work)); 4421 4422 if (cflags & WORK_CANCEL_DISABLE) 4423 work_offqd_disable(&offqd); 4424 4425 set_work_pool_and_clear_pending(work, offqd.pool_id, 4426 work_offqd_pack_flags(&offqd)); 4427 local_irq_restore(irq_flags); 4428 return ret; 4429 } 4430 4431 static bool __cancel_work_sync(struct work_struct *work, u32 cflags) 4432 { 4433 bool ret; 4434 4435 ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); 4436 4437 if (*work_data_bits(work) & WORK_OFFQ_BH) 4438 WARN_ON_ONCE(in_hardirq()); 4439 else 4440 might_sleep(); 4441 4442 /* 4443 * Skip __flush_work() during early boot when we know that @work isn't 4444 * executing. This allows canceling during early boot. 4445 */ 4446 if (wq_online) 4447 __flush_work(work, true); 4448 4449 if (!(cflags & WORK_CANCEL_DISABLE)) 4450 enable_work(work); 4451 4452 return ret; 4453 } 4454 4455 /* 4456 * See cancel_delayed_work() 4457 */ 4458 bool cancel_work(struct work_struct *work) 4459 { 4460 return __cancel_work(work, 0); 4461 } 4462 EXPORT_SYMBOL(cancel_work); 4463 4464 /** 4465 * cancel_work_sync - cancel a work and wait for it to finish 4466 * @work: the work to cancel 4467 * 4468 * Cancel @work and wait for its execution to finish. This function can be used 4469 * even if the work re-queues itself or migrates to another workqueue. On return 4470 * from this function, @work is guaranteed to be not pending or executing on any 4471 * CPU as long as there aren't racing enqueues. 4472 * 4473 * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. 4474 * Use cancel_delayed_work_sync() instead. 4475 * 4476 * Must be called from a sleepable context if @work was last queued on a non-BH 4477 * workqueue. Can also be called from non-hardirq atomic contexts including BH 4478 * if @work was last queued on a BH workqueue. 4479 * 4480 * Returns %true if @work was pending, %false otherwise. 4481 */ 4482 bool cancel_work_sync(struct work_struct *work) 4483 { 4484 return __cancel_work_sync(work, 0); 4485 } 4486 EXPORT_SYMBOL_GPL(cancel_work_sync); 4487 4488 /** 4489 * cancel_delayed_work - cancel a delayed work 4490 * @dwork: delayed_work to cancel 4491 * 4492 * Kill off a pending delayed_work. 4493 * 4494 * Return: %true if @dwork was pending and canceled; %false if it wasn't 4495 * pending. 4496 * 4497 * Note: 4498 * The work callback function may still be running on return, unless 4499 * it returns %true and the work doesn't re-arm itself. 
Explicitly flush or 4500 * use cancel_delayed_work_sync() to wait on it. 4501 * 4502 * This function is safe to call from any context including IRQ handler. 4503 */ 4504 bool cancel_delayed_work(struct delayed_work *dwork) 4505 { 4506 return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED); 4507 } 4508 EXPORT_SYMBOL(cancel_delayed_work); 4509 4510 /** 4511 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 4512 * @dwork: the delayed work cancel 4513 * 4514 * This is cancel_work_sync() for delayed works. 4515 * 4516 * Return: 4517 * %true if @dwork was pending, %false otherwise. 4518 */ 4519 bool cancel_delayed_work_sync(struct delayed_work *dwork) 4520 { 4521 return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED); 4522 } 4523 EXPORT_SYMBOL(cancel_delayed_work_sync); 4524 4525 /** 4526 * disable_work - Disable and cancel a work item 4527 * @work: work item to disable 4528 * 4529 * Disable @work by incrementing its disable count and cancel it if currently 4530 * pending. As long as the disable count is non-zero, any attempt to queue @work 4531 * will fail and return %false. The maximum supported disable depth is 2 to the 4532 * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. 4533 * 4534 * Can be called from any context. Returns %true if @work was pending, %false 4535 * otherwise. 4536 */ 4537 bool disable_work(struct work_struct *work) 4538 { 4539 return __cancel_work(work, WORK_CANCEL_DISABLE); 4540 } 4541 EXPORT_SYMBOL_GPL(disable_work); 4542 4543 /** 4544 * disable_work_sync - Disable, cancel and drain a work item 4545 * @work: work item to disable 4546 * 4547 * Similar to disable_work() but also wait for @work to finish if currently 4548 * executing. 4549 * 4550 * Must be called from a sleepable context if @work was last queued on a non-BH 4551 * workqueue. Can also be called from non-hardirq atomic contexts including BH 4552 * if @work was last queued on a BH workqueue. 4553 * 4554 * Returns %true if @work was pending, %false otherwise. 4555 */ 4556 bool disable_work_sync(struct work_struct *work) 4557 { 4558 return __cancel_work_sync(work, WORK_CANCEL_DISABLE); 4559 } 4560 EXPORT_SYMBOL_GPL(disable_work_sync); 4561 4562 /** 4563 * enable_work - Enable a work item 4564 * @work: work item to enable 4565 * 4566 * Undo disable_work[_sync]() by decrementing @work's disable count. @work can 4567 * only be queued if its disable count is 0. 4568 * 4569 * Can be called from any context. Returns %true if the disable count reached 0. 4570 * Otherwise, %false. 4571 */ 4572 bool enable_work(struct work_struct *work) 4573 { 4574 struct work_offq_data offqd; 4575 unsigned long irq_flags; 4576 4577 work_grab_pending(work, 0, &irq_flags); 4578 4579 work_offqd_unpack(&offqd, *work_data_bits(work)); 4580 work_offqd_enable(&offqd); 4581 set_work_pool_and_clear_pending(work, offqd.pool_id, 4582 work_offqd_pack_flags(&offqd)); 4583 local_irq_restore(irq_flags); 4584 4585 return !offqd.disable; 4586 } 4587 EXPORT_SYMBOL_GPL(enable_work); 4588 4589 /** 4590 * disable_delayed_work - Disable and cancel a delayed work item 4591 * @dwork: delayed work item to disable 4592 * 4593 * disable_work() for delayed work items. 
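 *
 * Illustrative pairing (hypothetical name):
 *
 *        disable_delayed_work(&my_dwork);  // cancel if pending and block
 *                                          // further queueing attempts
 *        ...
 *        enable_delayed_work(&my_dwork);   // queueing is allowed again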
4594 */ 4595 bool disable_delayed_work(struct delayed_work *dwork) 4596 { 4597 return __cancel_work(&dwork->work, 4598 WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); 4599 } 4600 EXPORT_SYMBOL_GPL(disable_delayed_work); 4601 4602 /** 4603 * disable_delayed_work_sync - Disable, cancel and drain a delayed work item 4604 * @dwork: delayed work item to disable 4605 * 4606 * disable_work_sync() for delayed work items. 4607 */ 4608 bool disable_delayed_work_sync(struct delayed_work *dwork) 4609 { 4610 return __cancel_work_sync(&dwork->work, 4611 WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); 4612 } 4613 EXPORT_SYMBOL_GPL(disable_delayed_work_sync); 4614 4615 /** 4616 * enable_delayed_work - Enable a delayed work item 4617 * @dwork: delayed work item to enable 4618 * 4619 * enable_work() for delayed work items. 4620 */ 4621 bool enable_delayed_work(struct delayed_work *dwork) 4622 { 4623 return enable_work(&dwork->work); 4624 } 4625 EXPORT_SYMBOL_GPL(enable_delayed_work); 4626 4627 /** 4628 * schedule_on_each_cpu - execute a function synchronously on each online CPU 4629 * @func: the function to call 4630 * 4631 * schedule_on_each_cpu() executes @func on each online CPU using the 4632 * system workqueue and blocks until all CPUs have completed. 4633 * schedule_on_each_cpu() is very slow. 4634 * 4635 * Return: 4636 * 0 on success, -errno on failure. 4637 */ 4638 int schedule_on_each_cpu(work_func_t func) 4639 { 4640 int cpu; 4641 struct work_struct __percpu *works; 4642 4643 works = alloc_percpu(struct work_struct); 4644 if (!works) 4645 return -ENOMEM; 4646 4647 cpus_read_lock(); 4648 4649 for_each_online_cpu(cpu) { 4650 struct work_struct *work = per_cpu_ptr(works, cpu); 4651 4652 INIT_WORK(work, func); 4653 schedule_work_on(cpu, work); 4654 } 4655 4656 for_each_online_cpu(cpu) 4657 flush_work(per_cpu_ptr(works, cpu)); 4658 4659 cpus_read_unlock(); 4660 free_percpu(works); 4661 return 0; 4662 } 4663 4664 /** 4665 * execute_in_process_context - reliably execute the routine with user context 4666 * @fn: the function to execute 4667 * @ew: guaranteed storage for the execute work structure (must 4668 * be available when the work executes) 4669 * 4670 * Executes the function immediately if process context is available, 4671 * otherwise schedules the function for delayed execution. 4672 * 4673 * Return: 0 - function was executed 4674 * 1 - function was scheduled for execution 4675 */ 4676 int execute_in_process_context(work_func_t fn, struct execute_work *ew) 4677 { 4678 if (!in_interrupt()) { 4679 fn(&ew->work); 4680 return 0; 4681 } 4682 4683 INIT_WORK(&ew->work, fn); 4684 schedule_work(&ew->work); 4685 4686 return 1; 4687 } 4688 EXPORT_SYMBOL_GPL(execute_in_process_context); 4689 4690 /** 4691 * free_workqueue_attrs - free a workqueue_attrs 4692 * @attrs: workqueue_attrs to free 4693 * 4694 * Undo alloc_workqueue_attrs(). 4695 */ 4696 void free_workqueue_attrs(struct workqueue_attrs *attrs) 4697 { 4698 if (attrs) { 4699 free_cpumask_var(attrs->cpumask); 4700 free_cpumask_var(attrs->__pod_cpumask); 4701 kfree(attrs); 4702 } 4703 } 4704 4705 /** 4706 * alloc_workqueue_attrs - allocate a workqueue_attrs 4707 * 4708 * Allocate a new workqueue_attrs, initialize with default settings and 4709 * return it. 4710 * 4711 * Return: The allocated new workqueue_attr on success. %NULL on failure. 
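 *
 * Illustrative usage together with apply_workqueue_attrs() (hypothetical
 * names, error handling trimmed):
 *
 *        attrs = alloc_workqueue_attrs();
 *        attrs->nice = -10;
 *        cpumask_copy(attrs->cpumask, my_cpus);
 *        apply_workqueue_attrs(my_unbound_wq, attrs);
 *        free_workqueue_attrs(attrs);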
4712 */ 4713 struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) 4714 { 4715 struct workqueue_attrs *attrs; 4716 4717 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 4718 if (!attrs) 4719 goto fail; 4720 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) 4721 goto fail; 4722 if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) 4723 goto fail; 4724 4725 cpumask_copy(attrs->cpumask, cpu_possible_mask); 4726 attrs->affn_scope = WQ_AFFN_DFL; 4727 return attrs; 4728 fail: 4729 free_workqueue_attrs(attrs); 4730 return NULL; 4731 } 4732 4733 static void copy_workqueue_attrs(struct workqueue_attrs *to, 4734 const struct workqueue_attrs *from) 4735 { 4736 to->nice = from->nice; 4737 cpumask_copy(to->cpumask, from->cpumask); 4738 cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); 4739 to->affn_strict = from->affn_strict; 4740 4741 /* 4742 * Unlike hash and equality test, copying shouldn't ignore wq-only 4743 * fields as copying is used for both pool and wq attrs. Instead, 4744 * get_unbound_pool() explicitly clears the fields. 4745 */ 4746 to->affn_scope = from->affn_scope; 4747 to->ordered = from->ordered; 4748 } 4749 4750 /* 4751 * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the 4752 * comments in 'struct workqueue_attrs' definition. 4753 */ 4754 static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) 4755 { 4756 attrs->affn_scope = WQ_AFFN_NR_TYPES; 4757 attrs->ordered = false; 4758 if (attrs->affn_strict) 4759 cpumask_copy(attrs->cpumask, cpu_possible_mask); 4760 } 4761 4762 /* hash value of the content of @attr */ 4763 static u32 wqattrs_hash(const struct workqueue_attrs *attrs) 4764 { 4765 u32 hash = 0; 4766 4767 hash = jhash_1word(attrs->nice, hash); 4768 hash = jhash_1word(attrs->affn_strict, hash); 4769 hash = jhash(cpumask_bits(attrs->__pod_cpumask), 4770 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); 4771 if (!attrs->affn_strict) 4772 hash = jhash(cpumask_bits(attrs->cpumask), 4773 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); 4774 return hash; 4775 } 4776 4777 /* content equality test */ 4778 static bool wqattrs_equal(const struct workqueue_attrs *a, 4779 const struct workqueue_attrs *b) 4780 { 4781 if (a->nice != b->nice) 4782 return false; 4783 if (a->affn_strict != b->affn_strict) 4784 return false; 4785 if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) 4786 return false; 4787 if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) 4788 return false; 4789 return true; 4790 } 4791 4792 /* Update @attrs with actually available CPUs */ 4793 static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, 4794 const cpumask_t *unbound_cpumask) 4795 { 4796 /* 4797 * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If 4798 * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to 4799 * @unbound_cpumask. 
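 *
 * For example (illustrative masks): with @attrs->cpumask == 0xf0 and
 * @unbound_cpumask == 0x0f the intersection is empty, so the effective
 * mask falls back to 0x0f instead of ending up unusable and empty.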
4800 */ 4801 cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); 4802 if (unlikely(cpumask_empty(attrs->cpumask))) 4803 cpumask_copy(attrs->cpumask, unbound_cpumask); 4804 } 4805 4806 /* find wq_pod_type to use for @attrs */ 4807 static const struct wq_pod_type * 4808 wqattrs_pod_type(const struct workqueue_attrs *attrs) 4809 { 4810 enum wq_affn_scope scope; 4811 struct wq_pod_type *pt; 4812 4813 /* to synchronize access to wq_affn_dfl */ 4814 lockdep_assert_held(&wq_pool_mutex); 4815 4816 if (attrs->affn_scope == WQ_AFFN_DFL) 4817 scope = wq_affn_dfl; 4818 else 4819 scope = attrs->affn_scope; 4820 4821 pt = &wq_pod_types[scope]; 4822 4823 if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && 4824 likely(pt->nr_pods)) 4825 return pt; 4826 4827 /* 4828 * Before workqueue_init_topology(), only SYSTEM is available which is 4829 * initialized in workqueue_init_early(). 4830 */ 4831 pt = &wq_pod_types[WQ_AFFN_SYSTEM]; 4832 BUG_ON(!pt->nr_pods); 4833 return pt; 4834 } 4835 4836 /** 4837 * init_worker_pool - initialize a newly zalloc'd worker_pool 4838 * @pool: worker_pool to initialize 4839 * 4840 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. 4841 * 4842 * Return: 0 on success, -errno on failure. Even on failure, all fields 4843 * inside @pool proper are initialized and put_unbound_pool() can be called 4844 * on @pool safely to release it. 4845 */ 4846 static int init_worker_pool(struct worker_pool *pool) 4847 { 4848 raw_spin_lock_init(&pool->lock); 4849 pool->id = -1; 4850 pool->cpu = -1; 4851 pool->node = NUMA_NO_NODE; 4852 pool->flags |= POOL_DISASSOCIATED; 4853 pool->watchdog_ts = jiffies; 4854 INIT_LIST_HEAD(&pool->worklist); 4855 INIT_LIST_HEAD(&pool->idle_list); 4856 hash_init(pool->busy_hash); 4857 4858 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); 4859 INIT_WORK(&pool->idle_cull_work, idle_cull_fn); 4860 4861 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); 4862 4863 INIT_LIST_HEAD(&pool->workers); 4864 4865 ida_init(&pool->worker_ida); 4866 INIT_HLIST_NODE(&pool->hash_node); 4867 pool->refcnt = 1; 4868 #ifdef CONFIG_PREEMPT_RT 4869 spin_lock_init(&pool->cb_lock); 4870 #endif 4871 4872 /* shouldn't fail above this point */ 4873 pool->attrs = alloc_workqueue_attrs(); 4874 if (!pool->attrs) 4875 return -ENOMEM; 4876 4877 wqattrs_clear_for_pool(pool->attrs); 4878 4879 return 0; 4880 } 4881 4882 #ifdef CONFIG_LOCKDEP 4883 static void wq_init_lockdep(struct workqueue_struct *wq) 4884 { 4885 char *lock_name; 4886 4887 lockdep_register_key(&wq->key); 4888 lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); 4889 if (!lock_name) 4890 lock_name = wq->name; 4891 4892 wq->lock_name = lock_name; 4893 wq->lockdep_map = &wq->__lockdep_map; 4894 lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); 4895 } 4896 4897 static void wq_unregister_lockdep(struct workqueue_struct *wq) 4898 { 4899 if (wq->lockdep_map != &wq->__lockdep_map) 4900 return; 4901 4902 lockdep_unregister_key(&wq->key); 4903 } 4904 4905 static void wq_free_lockdep(struct workqueue_struct *wq) 4906 { 4907 if (wq->lockdep_map != &wq->__lockdep_map) 4908 return; 4909 4910 if (wq->lock_name != wq->name) 4911 kfree(wq->lock_name); 4912 } 4913 #else 4914 static void wq_init_lockdep(struct workqueue_struct *wq) 4915 { 4916 } 4917 4918 static void wq_unregister_lockdep(struct workqueue_struct *wq) 4919 { 4920 } 4921 4922 static void wq_free_lockdep(struct workqueue_struct *wq) 4923 { 4924 } 4925 #endif 4926 4927 static void 
free_node_nr_active(struct wq_node_nr_active **nna_ar) 4928 { 4929 int node; 4930 4931 for_each_node(node) { 4932 kfree(nna_ar[node]); 4933 nna_ar[node] = NULL; 4934 } 4935 4936 kfree(nna_ar[nr_node_ids]); 4937 nna_ar[nr_node_ids] = NULL; 4938 } 4939 4940 static void init_node_nr_active(struct wq_node_nr_active *nna) 4941 { 4942 nna->max = WQ_DFL_MIN_ACTIVE; 4943 atomic_set(&nna->nr, 0); 4944 raw_spin_lock_init(&nna->lock); 4945 INIT_LIST_HEAD(&nna->pending_pwqs); 4946 } 4947 4948 /* 4949 * Each node's nr_active counter will be accessed mostly from its own node and 4950 * should be allocated in the node. 4951 */ 4952 static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) 4953 { 4954 struct wq_node_nr_active *nna; 4955 int node; 4956 4957 for_each_node(node) { 4958 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); 4959 if (!nna) 4960 goto err_free; 4961 init_node_nr_active(nna); 4962 nna_ar[node] = nna; 4963 } 4964 4965 /* [nr_node_ids] is used as the fallback */ 4966 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); 4967 if (!nna) 4968 goto err_free; 4969 init_node_nr_active(nna); 4970 nna_ar[nr_node_ids] = nna; 4971 4972 return 0; 4973 4974 err_free: 4975 free_node_nr_active(nna_ar); 4976 return -ENOMEM; 4977 } 4978 4979 static void rcu_free_wq(struct rcu_head *rcu) 4980 { 4981 struct workqueue_struct *wq = 4982 container_of(rcu, struct workqueue_struct, rcu); 4983 4984 if (wq->flags & WQ_UNBOUND) 4985 free_node_nr_active(wq->node_nr_active); 4986 4987 wq_free_lockdep(wq); 4988 free_percpu(wq->cpu_pwq); 4989 free_workqueue_attrs(wq->unbound_attrs); 4990 kfree(wq); 4991 } 4992 4993 static void rcu_free_pool(struct rcu_head *rcu) 4994 { 4995 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 4996 4997 ida_destroy(&pool->worker_ida); 4998 free_workqueue_attrs(pool->attrs); 4999 kfree(pool); 5000 } 5001 5002 /** 5003 * put_unbound_pool - put a worker_pool 5004 * @pool: worker_pool to put 5005 * 5006 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU 5007 * safe manner. get_unbound_pool() calls this function on its failure path 5008 * and this function should be able to release pools which went through, 5009 * successfully or not, init_worker_pool(). 5010 * 5011 * Should be called with wq_pool_mutex held. 5012 */ 5013 static void put_unbound_pool(struct worker_pool *pool) 5014 { 5015 struct worker *worker; 5016 LIST_HEAD(cull_list); 5017 5018 lockdep_assert_held(&wq_pool_mutex); 5019 5020 if (--pool->refcnt) 5021 return; 5022 5023 /* sanity checks */ 5024 if (WARN_ON(!(pool->cpu < 0)) || 5025 WARN_ON(!list_empty(&pool->worklist))) 5026 return; 5027 5028 /* release id and unhash */ 5029 if (pool->id >= 0) 5030 idr_remove(&worker_pool_idr, pool->id); 5031 hash_del(&pool->hash_node); 5032 5033 /* 5034 * Become the manager and destroy all workers. This prevents 5035 * @pool's workers from blocking on attach_mutex. We're the last 5036 * manager and @pool gets freed with the flag set. 5037 * 5038 * Having a concurrent manager is quite unlikely to happen as we can 5039 * only get here with 5040 * pwq->refcnt == pool->refcnt == 0 5041 * which implies no work queued to the pool, which implies no worker can 5042 * become the manager. 
However a worker could have taken the role of 5043 * manager before the refcnts dropped to 0, since maybe_create_worker() 5044 * drops pool->lock 5045 */ 5046 while (true) { 5047 rcuwait_wait_event(&manager_wait, 5048 !(pool->flags & POOL_MANAGER_ACTIVE), 5049 TASK_UNINTERRUPTIBLE); 5050 5051 mutex_lock(&wq_pool_attach_mutex); 5052 raw_spin_lock_irq(&pool->lock); 5053 if (!(pool->flags & POOL_MANAGER_ACTIVE)) { 5054 pool->flags |= POOL_MANAGER_ACTIVE; 5055 break; 5056 } 5057 raw_spin_unlock_irq(&pool->lock); 5058 mutex_unlock(&wq_pool_attach_mutex); 5059 } 5060 5061 while ((worker = first_idle_worker(pool))) 5062 set_worker_dying(worker, &cull_list); 5063 WARN_ON(pool->nr_workers || pool->nr_idle); 5064 raw_spin_unlock_irq(&pool->lock); 5065 5066 detach_dying_workers(&cull_list); 5067 5068 mutex_unlock(&wq_pool_attach_mutex); 5069 5070 reap_dying_workers(&cull_list); 5071 5072 /* shut down the timers */ 5073 timer_delete_sync(&pool->idle_timer); 5074 cancel_work_sync(&pool->idle_cull_work); 5075 timer_delete_sync(&pool->mayday_timer); 5076 5077 /* RCU protected to allow dereferences from get_work_pool() */ 5078 call_rcu(&pool->rcu, rcu_free_pool); 5079 } 5080 5081 /** 5082 * get_unbound_pool - get a worker_pool with the specified attributes 5083 * @attrs: the attributes of the worker_pool to get 5084 * 5085 * Obtain a worker_pool which has the same attributes as @attrs, bump the 5086 * reference count and return it. If there already is a matching 5087 * worker_pool, it will be used; otherwise, this function attempts to 5088 * create a new one. 5089 * 5090 * Should be called with wq_pool_mutex held. 5091 * 5092 * Return: On success, a worker_pool with the same attributes as @attrs. 5093 * On failure, %NULL. 5094 */ 5095 static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 5096 { 5097 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; 5098 u32 hash = wqattrs_hash(attrs); 5099 struct worker_pool *pool; 5100 int pod, node = NUMA_NO_NODE; 5101 5102 lockdep_assert_held(&wq_pool_mutex); 5103 5104 /* do we already have a matching pool? */ 5105 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 5106 if (wqattrs_equal(pool->attrs, attrs)) { 5107 pool->refcnt++; 5108 return pool; 5109 } 5110 } 5111 5112 /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ 5113 for (pod = 0; pod < pt->nr_pods; pod++) { 5114 if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { 5115 node = pt->pod_node[pod]; 5116 break; 5117 } 5118 } 5119 5120 /* nope, create a new one */ 5121 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); 5122 if (!pool || init_worker_pool(pool) < 0) 5123 goto fail; 5124 5125 pool->node = node; 5126 copy_workqueue_attrs(pool->attrs, attrs); 5127 wqattrs_clear_for_pool(pool->attrs); 5128 5129 if (worker_pool_assign_id(pool) < 0) 5130 goto fail; 5131 5132 /* create and start the initial worker */ 5133 if (wq_online && !create_worker(pool)) 5134 goto fail; 5135 5136 /* install */ 5137 hash_add(unbound_pool_hash, &pool->hash_node, hash); 5138 5139 return pool; 5140 fail: 5141 if (pool) 5142 put_unbound_pool(pool); 5143 return NULL; 5144 } 5145 5146 /* 5147 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero 5148 * refcnt and needs to be destroyed. 
5149 */
5150 static void pwq_release_workfn(struct kthread_work *work)
5151 {
5152 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
5153 release_work);
5154 struct workqueue_struct *wq = pwq->wq;
5155 struct worker_pool *pool = pwq->pool;
5156 bool is_last = false;
5157
5158 /*
5159 * When @pwq is not linked, it doesn't hold any reference to
5160 * @wq, and @wq must not be accessed.
5161 */
5162 if (!list_empty(&pwq->pwqs_node)) {
5163 mutex_lock(&wq->mutex);
5164 list_del_rcu(&pwq->pwqs_node);
5165 is_last = list_empty(&wq->pwqs);
5166
5167 /*
5168 * For an ordered workqueue with a plugged dfl_pwq, restart it now.
5169 */
5170 if (!is_last && (wq->flags & __WQ_ORDERED))
5171 unplug_oldest_pwq(wq);
5172
5173 mutex_unlock(&wq->mutex);
5174 }
5175
5176 if (wq->flags & WQ_UNBOUND) {
5177 mutex_lock(&wq_pool_mutex);
5178 put_unbound_pool(pool);
5179 mutex_unlock(&wq_pool_mutex);
5180 }
5181
5182 if (!list_empty(&pwq->pending_node)) {
5183 struct wq_node_nr_active *nna =
5184 wq_node_nr_active(pwq->wq, pwq->pool->node);
5185
5186 raw_spin_lock_irq(&nna->lock);
5187 list_del_init(&pwq->pending_node);
5188 raw_spin_unlock_irq(&nna->lock);
5189 }
5190
5191 kfree_rcu(pwq, rcu);
5192
5193 /*
5194 * If we're the last pwq going away, @wq is already dead and no one
5195 * is gonna access it anymore. Schedule RCU free.
5196 */
5197 if (is_last) {
5198 wq_unregister_lockdep(wq);
5199 call_rcu(&wq->rcu, rcu_free_wq);
5200 }
5201 }
5202
5203 /* initialize newly allocated @pwq which is associated with @wq and @pool */
5204 static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
5205 struct worker_pool *pool)
5206 {
5207 BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);
5208
5209 memset(pwq, 0, sizeof(*pwq));
5210
5211 pwq->pool = pool;
5212 pwq->wq = wq;
5213 pwq->flush_color = -1;
5214 pwq->refcnt = 1;
5215 INIT_LIST_HEAD(&pwq->inactive_works);
5216 INIT_LIST_HEAD(&pwq->pending_node);
5217 INIT_LIST_HEAD(&pwq->pwqs_node);
5218 INIT_LIST_HEAD(&pwq->mayday_node);
5219 kthread_init_work(&pwq->release_work, pwq_release_workfn);
5220
5221 /*
5222 * Set up the dummy cursor work with a valid function and a valid get_work_pwq().
5223 *
5224 * The cursor work should only be on pwq->pool->worklist, and
5225 * should not be treated as a processable work item.
5226 *
5227 * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
5228 * surprising to kernel debugging tools and reviewers.
5229 */ 5230 INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func); 5231 atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq | 5232 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE); 5233 } 5234 5235 /* sync @pwq with the current state of its associated wq and link it */ 5236 static void link_pwq(struct pool_workqueue *pwq) 5237 { 5238 struct workqueue_struct *wq = pwq->wq; 5239 5240 lockdep_assert_held(&wq->mutex); 5241 5242 /* may be called multiple times, ignore if already linked */ 5243 if (!list_empty(&pwq->pwqs_node)) 5244 return; 5245 5246 /* set the matching work_color */ 5247 pwq->work_color = wq->work_color; 5248 5249 /* link in @pwq */ 5250 list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); 5251 } 5252 5253 /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ 5254 static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, 5255 const struct workqueue_attrs *attrs) 5256 { 5257 struct worker_pool *pool; 5258 struct pool_workqueue *pwq; 5259 5260 lockdep_assert_held(&wq_pool_mutex); 5261 5262 pool = get_unbound_pool(attrs); 5263 if (!pool) 5264 return NULL; 5265 5266 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); 5267 if (!pwq) { 5268 put_unbound_pool(pool); 5269 return NULL; 5270 } 5271 5272 init_pwq(pwq, wq, pool); 5273 return pwq; 5274 } 5275 5276 static void apply_wqattrs_lock(void) 5277 { 5278 mutex_lock(&wq_pool_mutex); 5279 } 5280 5281 static void apply_wqattrs_unlock(void) 5282 { 5283 mutex_unlock(&wq_pool_mutex); 5284 } 5285 5286 /** 5287 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod 5288 * @attrs: the wq_attrs of the default pwq of the target workqueue 5289 * @cpu: the target CPU 5290 * 5291 * Calculate the cpumask a workqueue with @attrs should use on @pod. 5292 * The result is stored in @attrs->__pod_cpumask. 5293 * 5294 * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled 5295 * and @pod has online CPUs requested by @attrs, the returned cpumask is the 5296 * intersection of the possible CPUs of @pod and @attrs->cpumask. 5297 * 5298 * The caller is responsible for ensuring that the cpumask of @pod stays stable. 5299 */ 5300 static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) 5301 { 5302 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 5303 int pod = pt->cpu_pod[cpu]; 5304 5305 /* calculate possible CPUs in @pod that @attrs wants */ 5306 cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); 5307 /* does @pod have any online CPUs @attrs wants? 
*/ 5308 if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { 5309 cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); 5310 return; 5311 } 5312 } 5313 5314 /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ 5315 static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, 5316 int cpu, struct pool_workqueue *pwq) 5317 { 5318 struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); 5319 struct pool_workqueue *old_pwq; 5320 5321 lockdep_assert_held(&wq_pool_mutex); 5322 lockdep_assert_held(&wq->mutex); 5323 5324 /* link_pwq() can handle duplicate calls */ 5325 link_pwq(pwq); 5326 5327 old_pwq = rcu_access_pointer(*slot); 5328 rcu_assign_pointer(*slot, pwq); 5329 return old_pwq; 5330 } 5331 5332 /* context to store the prepared attrs & pwqs before applying */ 5333 struct apply_wqattrs_ctx { 5334 struct workqueue_struct *wq; /* target workqueue */ 5335 struct workqueue_attrs *attrs; /* attrs to apply */ 5336 struct list_head list; /* queued for batching commit */ 5337 struct pool_workqueue *dfl_pwq; 5338 struct pool_workqueue *pwq_tbl[]; 5339 }; 5340 5341 /* free the resources after success or abort */ 5342 static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) 5343 { 5344 if (ctx) { 5345 int cpu; 5346 5347 for_each_possible_cpu(cpu) 5348 put_pwq_unlocked(ctx->pwq_tbl[cpu]); 5349 put_pwq_unlocked(ctx->dfl_pwq); 5350 5351 free_workqueue_attrs(ctx->attrs); 5352 5353 kfree(ctx); 5354 } 5355 } 5356 5357 /* allocate the attrs and pwqs for later installation */ 5358 static struct apply_wqattrs_ctx * 5359 apply_wqattrs_prepare(struct workqueue_struct *wq, 5360 const struct workqueue_attrs *attrs, 5361 const cpumask_var_t unbound_cpumask) 5362 { 5363 struct apply_wqattrs_ctx *ctx; 5364 struct workqueue_attrs *new_attrs; 5365 int cpu; 5366 5367 lockdep_assert_held(&wq_pool_mutex); 5368 5369 if (WARN_ON(attrs->affn_scope < 0 || 5370 attrs->affn_scope >= WQ_AFFN_NR_TYPES)) 5371 return ERR_PTR(-EINVAL); 5372 5373 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); 5374 5375 new_attrs = alloc_workqueue_attrs(); 5376 if (!ctx || !new_attrs) 5377 goto out_free; 5378 5379 /* 5380 * If something goes wrong during CPU up/down, we'll fall back to 5381 * the default pwq covering whole @attrs->cpumask. Always create 5382 * it even if we don't use it immediately. 5383 */ 5384 copy_workqueue_attrs(new_attrs, attrs); 5385 wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); 5386 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); 5387 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); 5388 if (!ctx->dfl_pwq) 5389 goto out_free; 5390 5391 for_each_possible_cpu(cpu) { 5392 if (new_attrs->ordered) { 5393 ctx->dfl_pwq->refcnt++; 5394 ctx->pwq_tbl[cpu] = ctx->dfl_pwq; 5395 } else { 5396 wq_calc_pod_cpumask(new_attrs, cpu); 5397 ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); 5398 if (!ctx->pwq_tbl[cpu]) 5399 goto out_free; 5400 } 5401 } 5402 5403 /* save the user configured attrs and sanitize it. */ 5404 copy_workqueue_attrs(new_attrs, attrs); 5405 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); 5406 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); 5407 ctx->attrs = new_attrs; 5408 5409 /* 5410 * For initialized ordered workqueues, there should only be one pwq 5411 * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution 5412 * of newly queued work items until execution of older work items in 5413 * the old pwq's have completed. 
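 *
 * The plugged dfl_pwq is restarted from pwq_release_workfn(), via
 * unplug_oldest_pwq(), once the old pwq has been released.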
5414 */ 5415 if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)) 5416 ctx->dfl_pwq->plugged = true; 5417 5418 ctx->wq = wq; 5419 return ctx; 5420 5421 out_free: 5422 free_workqueue_attrs(new_attrs); 5423 apply_wqattrs_cleanup(ctx); 5424 return ERR_PTR(-ENOMEM); 5425 } 5426 5427 /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ 5428 static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) 5429 { 5430 int cpu; 5431 5432 /* all pwqs have been created successfully, let's install'em */ 5433 mutex_lock(&ctx->wq->mutex); 5434 5435 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); 5436 5437 /* save the previous pwqs and install the new ones */ 5438 for_each_possible_cpu(cpu) 5439 ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, 5440 ctx->pwq_tbl[cpu]); 5441 ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); 5442 5443 /* update node_nr_active->max */ 5444 wq_update_node_max_active(ctx->wq, -1); 5445 5446 mutex_unlock(&ctx->wq->mutex); 5447 } 5448 5449 static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, 5450 const struct workqueue_attrs *attrs) 5451 { 5452 struct apply_wqattrs_ctx *ctx; 5453 5454 /* only unbound workqueues can change attributes */ 5455 if (WARN_ON(!(wq->flags & WQ_UNBOUND))) 5456 return -EINVAL; 5457 5458 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); 5459 if (IS_ERR(ctx)) 5460 return PTR_ERR(ctx); 5461 5462 /* the ctx has been prepared successfully, let's commit it */ 5463 apply_wqattrs_commit(ctx); 5464 apply_wqattrs_cleanup(ctx); 5465 5466 return 0; 5467 } 5468 5469 /** 5470 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue 5471 * @wq: the target workqueue 5472 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() 5473 * 5474 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps 5475 * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that 5476 * work items are affine to the pod it was issued on. Older pwqs are released as 5477 * in-flight work items finish. Note that a work item which repeatedly requeues 5478 * itself back-to-back will stay on its current pwq. 5479 * 5480 * Performs GFP_KERNEL allocations. 5481 * 5482 * Return: 0 on success and -errno on failure. 5483 */ 5484 int apply_workqueue_attrs(struct workqueue_struct *wq, 5485 const struct workqueue_attrs *attrs) 5486 { 5487 int ret; 5488 5489 mutex_lock(&wq_pool_mutex); 5490 ret = apply_workqueue_attrs_locked(wq, attrs); 5491 mutex_unlock(&wq_pool_mutex); 5492 5493 return ret; 5494 } 5495 5496 /** 5497 * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug 5498 * @wq: the target workqueue 5499 * @cpu: the CPU to update the pwq slot for 5500 * 5501 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and 5502 * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. 5503 * 5504 * 5505 * If pod affinity can't be adjusted due to memory allocation failure, it falls 5506 * back to @wq->dfl_pwq which may not be optimal but is always correct. 5507 * 5508 * Note that when the last allowed CPU of a pod goes offline for a workqueue 5509 * with a cpumask spanning multiple pods, the workers which were already 5510 * executing the work items for the workqueue will lose their CPU affinity and 5511 * may execute on any CPU. This is similar to how per-cpu workqueues behave on 5512 * CPU_DOWN. 
If a workqueue user wants strict affinity, it's the user's 5513 * responsibility to flush the work item from CPU_DOWN_PREPARE. 5514 */ 5515 static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) 5516 { 5517 struct pool_workqueue *old_pwq = NULL, *pwq; 5518 struct workqueue_attrs *target_attrs; 5519 5520 lockdep_assert_held(&wq_pool_mutex); 5521 5522 if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) 5523 return; 5524 5525 /* 5526 * We don't wanna alloc/free wq_attrs for each wq for each CPU. 5527 * Let's use a preallocated one. The following buf is protected by 5528 * CPU hotplug exclusion. 5529 */ 5530 target_attrs = unbound_wq_update_pwq_attrs_buf; 5531 5532 copy_workqueue_attrs(target_attrs, wq->unbound_attrs); 5533 wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); 5534 5535 /* nothing to do if the target cpumask matches the current pwq */ 5536 wq_calc_pod_cpumask(target_attrs, cpu); 5537 if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) 5538 return; 5539 5540 /* create a new pwq */ 5541 pwq = alloc_unbound_pwq(wq, target_attrs); 5542 if (!pwq) { 5543 pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", 5544 wq->name); 5545 goto use_dfl_pwq; 5546 } 5547 5548 /* Install the new pwq. */ 5549 mutex_lock(&wq->mutex); 5550 old_pwq = install_unbound_pwq(wq, cpu, pwq); 5551 goto out_unlock; 5552 5553 use_dfl_pwq: 5554 mutex_lock(&wq->mutex); 5555 pwq = unbound_pwq(wq, -1); 5556 raw_spin_lock_irq(&pwq->pool->lock); 5557 get_pwq(pwq); 5558 raw_spin_unlock_irq(&pwq->pool->lock); 5559 old_pwq = install_unbound_pwq(wq, cpu, pwq); 5560 out_unlock: 5561 mutex_unlock(&wq->mutex); 5562 put_pwq_unlocked(old_pwq); 5563 } 5564 5565 static int alloc_and_link_pwqs(struct workqueue_struct *wq) 5566 { 5567 bool highpri = wq->flags & WQ_HIGHPRI; 5568 int cpu, ret; 5569 5570 lockdep_assert_held(&wq_pool_mutex); 5571 5572 wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); 5573 if (!wq->cpu_pwq) 5574 goto enomem; 5575 5576 if (!(wq->flags & WQ_UNBOUND)) { 5577 struct worker_pool __percpu *pools; 5578 5579 if (wq->flags & WQ_BH) 5580 pools = bh_worker_pools; 5581 else 5582 pools = cpu_worker_pools; 5583 5584 for_each_possible_cpu(cpu) { 5585 struct pool_workqueue **pwq_p; 5586 struct worker_pool *pool; 5587 5588 pool = &(per_cpu_ptr(pools, cpu)[highpri]); 5589 pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); 5590 5591 *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, 5592 pool->node); 5593 if (!*pwq_p) 5594 goto enomem; 5595 5596 init_pwq(*pwq_p, wq, pool); 5597 5598 mutex_lock(&wq->mutex); 5599 link_pwq(*pwq_p); 5600 mutex_unlock(&wq->mutex); 5601 } 5602 return 0; 5603 } 5604 5605 if (wq->flags & __WQ_ORDERED) { 5606 struct pool_workqueue *dfl_pwq; 5607 5608 ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); 5609 /* there should only be single pwq for ordering guarantee */ 5610 dfl_pwq = rcu_access_pointer(wq->dfl_pwq); 5611 WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || 5612 wq->pwqs.prev != &dfl_pwq->pwqs_node), 5613 "ordering guarantee broken for workqueue %s\n", wq->name); 5614 } else { 5615 ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); 5616 } 5617 5618 return ret; 5619 5620 enomem: 5621 if (wq->cpu_pwq) { 5622 for_each_possible_cpu(cpu) { 5623 struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); 5624 5625 if (pwq) 5626 kmem_cache_free(pwq_cache, pwq); 5627 } 5628 free_percpu(wq->cpu_pwq); 5629 wq->cpu_pwq = NULL; 5630 } 5631 return -ENOMEM; 5632 } 5633 5634 static int 
wq_clamp_max_active(int max_active, unsigned int flags, 5635 const char *name) 5636 { 5637 if (max_active < 1 || max_active > WQ_MAX_ACTIVE) 5638 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", 5639 max_active, name, 1, WQ_MAX_ACTIVE); 5640 5641 return clamp_val(max_active, 1, WQ_MAX_ACTIVE); 5642 } 5643 5644 /* 5645 * Workqueues which may be used during memory reclaim should have a rescuer 5646 * to guarantee forward progress. 5647 */ 5648 static int init_rescuer(struct workqueue_struct *wq) 5649 { 5650 struct worker *rescuer; 5651 char id_buf[WORKER_ID_LEN]; 5652 int ret; 5653 5654 lockdep_assert_held(&wq_pool_mutex); 5655 5656 if (!(wq->flags & WQ_MEM_RECLAIM)) 5657 return 0; 5658 5659 rescuer = alloc_worker(NUMA_NO_NODE); 5660 if (!rescuer) { 5661 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", 5662 wq->name); 5663 return -ENOMEM; 5664 } 5665 5666 rescuer->rescue_wq = wq; 5667 format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); 5668 5669 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); 5670 if (IS_ERR(rescuer->task)) { 5671 ret = PTR_ERR(rescuer->task); 5672 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", 5673 wq->name, ERR_PTR(ret)); 5674 kfree(rescuer); 5675 return ret; 5676 } 5677 5678 wq->rescuer = rescuer; 5679 5680 /* initial cpumask is consistent with the detached rescuer and unbind_worker() */ 5681 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) 5682 kthread_bind_mask(rescuer->task, wq_unbound_cpumask); 5683 else 5684 kthread_bind_mask(rescuer->task, cpu_possible_mask); 5685 5686 wake_up_process(rescuer->task); 5687 5688 return 0; 5689 } 5690 5691 /** 5692 * wq_adjust_max_active - update a wq's max_active to the current setting 5693 * @wq: target workqueue 5694 * 5695 * If @wq isn't freezing, set @wq->max_active to the saved_max_active and 5696 * activate inactive work items accordingly. If @wq is freezing, clear 5697 * @wq->max_active to zero. 5698 */ 5699 static void wq_adjust_max_active(struct workqueue_struct *wq) 5700 { 5701 bool activated; 5702 int new_max, new_min; 5703 5704 lockdep_assert_held(&wq->mutex); 5705 5706 if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { 5707 new_max = 0; 5708 new_min = 0; 5709 } else { 5710 new_max = wq->saved_max_active; 5711 new_min = wq->saved_min_active; 5712 } 5713 5714 if (wq->max_active == new_max && wq->min_active == new_min) 5715 return; 5716 5717 /* 5718 * Update @wq->max/min_active and then kick inactive work items if more 5719 * active work items are allowed. This doesn't break work item ordering 5720 * because new work items are always queued behind existing inactive 5721 * work items if there are any. 5722 */ 5723 WRITE_ONCE(wq->max_active, new_max); 5724 WRITE_ONCE(wq->min_active, new_min); 5725 5726 if (wq->flags & WQ_UNBOUND) 5727 wq_update_node_max_active(wq, -1); 5728 5729 if (new_max == 0) 5730 return; 5731 5732 /* 5733 * Round-robin through pwq's activating the first inactive work item 5734 * until max_active is filled. 
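 * Activating one item per pwq per pass, instead of draining each pwq
 * completely before moving on, spreads the newly allowed activations
 * evenly across all pwqs of the workqueue.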
5735 */ 5736 do { 5737 struct pool_workqueue *pwq; 5738 5739 activated = false; 5740 for_each_pwq(pwq, wq) { 5741 unsigned long irq_flags; 5742 5743 /* can be called during early boot w/ irq disabled */ 5744 raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); 5745 if (pwq_activate_first_inactive(pwq, true)) { 5746 activated = true; 5747 kick_pool(pwq->pool); 5748 } 5749 raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); 5750 } 5751 } while (activated); 5752 } 5753 5754 __printf(1, 0) 5755 static struct workqueue_struct *__alloc_workqueue(const char *fmt, 5756 unsigned int flags, 5757 int max_active, va_list args) 5758 { 5759 struct workqueue_struct *wq; 5760 size_t wq_size; 5761 int name_len; 5762 5763 if (flags & WQ_BH) { 5764 if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) 5765 return NULL; 5766 if (WARN_ON_ONCE(max_active)) 5767 return NULL; 5768 } 5769 5770 /* see the comment above the definition of WQ_POWER_EFFICIENT */ 5771 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) 5772 flags |= WQ_UNBOUND; 5773 5774 /* allocate wq and format name */ 5775 if (flags & WQ_UNBOUND) 5776 wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); 5777 else 5778 wq_size = sizeof(*wq); 5779 5780 wq = kzalloc_noprof(wq_size, GFP_KERNEL); 5781 if (!wq) 5782 return NULL; 5783 5784 if (flags & WQ_UNBOUND) { 5785 wq->unbound_attrs = alloc_workqueue_attrs_noprof(); 5786 if (!wq->unbound_attrs) 5787 goto err_free_wq; 5788 } 5789 5790 name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); 5791 5792 if (name_len >= WQ_NAME_LEN) 5793 pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", 5794 wq->name); 5795 5796 if (flags & WQ_BH) { 5797 /* 5798 * BH workqueues always share a single execution context per CPU 5799 * and don't impose any max_active limit. 5800 */ 5801 max_active = INT_MAX; 5802 } else { 5803 max_active = max_active ?: WQ_DFL_ACTIVE; 5804 max_active = wq_clamp_max_active(max_active, flags, wq->name); 5805 } 5806 5807 /* init wq */ 5808 wq->flags = flags; 5809 wq->max_active = max_active; 5810 wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); 5811 wq->saved_max_active = wq->max_active; 5812 wq->saved_min_active = wq->min_active; 5813 mutex_init(&wq->mutex); 5814 atomic_set(&wq->nr_pwqs_to_flush, 0); 5815 INIT_LIST_HEAD(&wq->pwqs); 5816 INIT_LIST_HEAD(&wq->flusher_queue); 5817 INIT_LIST_HEAD(&wq->flusher_overflow); 5818 INIT_LIST_HEAD(&wq->maydays); 5819 5820 INIT_LIST_HEAD(&wq->list); 5821 5822 if (flags & WQ_UNBOUND) { 5823 if (alloc_node_nr_active(wq->node_nr_active) < 0) 5824 goto err_free_wq; 5825 } 5826 5827 /* 5828 * wq_pool_mutex protects the workqueues list, allocations of PWQs, 5829 * and the global freeze state. 5830 */ 5831 apply_wqattrs_lock(); 5832 5833 if (alloc_and_link_pwqs(wq) < 0) 5834 goto err_unlock_free_node_nr_active; 5835 5836 mutex_lock(&wq->mutex); 5837 wq_adjust_max_active(wq); 5838 mutex_unlock(&wq->mutex); 5839 5840 list_add_tail_rcu(&wq->list, &workqueues); 5841 5842 if (wq_online && init_rescuer(wq) < 0) 5843 goto err_unlock_destroy; 5844 5845 apply_wqattrs_unlock(); 5846 5847 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) 5848 goto err_destroy; 5849 5850 return wq; 5851 5852 err_unlock_free_node_nr_active: 5853 apply_wqattrs_unlock(); 5854 /* 5855 * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, 5856 * flushing the pwq_release_worker ensures that the pwq_release_workfn() 5857 * completes before calling kfree(wq). 
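 * The flush is needed only for unbound workqueues; per-cpu pwqs are
 * freed synchronously by alloc_and_link_pwqs() itself on failure.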
5858 */ 5859 if (wq->flags & WQ_UNBOUND) { 5860 kthread_flush_worker(pwq_release_worker); 5861 free_node_nr_active(wq->node_nr_active); 5862 } 5863 err_free_wq: 5864 free_workqueue_attrs(wq->unbound_attrs); 5865 kfree(wq); 5866 return NULL; 5867 err_unlock_destroy: 5868 apply_wqattrs_unlock(); 5869 err_destroy: 5870 destroy_workqueue(wq); 5871 return NULL; 5872 } 5873 5874 __printf(1, 4) 5875 struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, 5876 unsigned int flags, 5877 int max_active, ...) 5878 { 5879 struct workqueue_struct *wq; 5880 va_list args; 5881 5882 va_start(args, max_active); 5883 wq = __alloc_workqueue(fmt, flags, max_active, args); 5884 va_end(args); 5885 if (!wq) 5886 return NULL; 5887 5888 wq_init_lockdep(wq); 5889 5890 return wq; 5891 } 5892 EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); 5893 5894 #ifdef CONFIG_LOCKDEP 5895 __printf(1, 5) 5896 struct workqueue_struct * 5897 alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, 5898 int max_active, struct lockdep_map *lockdep_map, ...) 5899 { 5900 struct workqueue_struct *wq; 5901 va_list args; 5902 5903 va_start(args, lockdep_map); 5904 wq = __alloc_workqueue(fmt, flags, max_active, args); 5905 va_end(args); 5906 if (!wq) 5907 return NULL; 5908 5909 wq->lockdep_map = lockdep_map; 5910 5911 return wq; 5912 } 5913 EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); 5914 #endif 5915 5916 static bool pwq_busy(struct pool_workqueue *pwq) 5917 { 5918 int i; 5919 5920 for (i = 0; i < WORK_NR_COLORS; i++) 5921 if (pwq->nr_in_flight[i]) 5922 return true; 5923 5924 if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) 5925 return true; 5926 if (!pwq_is_empty(pwq)) 5927 return true; 5928 5929 return false; 5930 } 5931 5932 /** 5933 * destroy_workqueue - safely terminate a workqueue 5934 * @wq: target workqueue 5935 * 5936 * Safely destroy a workqueue. All work currently pending will be done first. 5937 * 5938 * This function does NOT guarantee that non-pending work that has been 5939 * submitted with queue_delayed_work() and similar functions will be done 5940 * before destroying the workqueue. The fundamental problem is that, currently, 5941 * the workqueue has no way of accessing non-pending delayed_work. delayed_work 5942 * is only linked on the timer-side. All delayed_work must, therefore, be 5943 * canceled before calling this function. 5944 * 5945 * TODO: It would be better if the problem described above wouldn't exist and 5946 * destroy_workqueue() would cleanly cancel all pending and non-pending 5947 * delayed_work. 5948 */ 5949 void destroy_workqueue(struct workqueue_struct *wq) 5950 { 5951 struct pool_workqueue *pwq; 5952 int cpu; 5953 5954 /* 5955 * Remove it from sysfs first so that sanity check failure doesn't 5956 * lead to sysfs name conflicts. 5957 */ 5958 workqueue_sysfs_unregister(wq); 5959 5960 /* mark the workqueue destruction is in progress */ 5961 mutex_lock(&wq->mutex); 5962 wq->flags |= __WQ_DESTROYING; 5963 mutex_unlock(&wq->mutex); 5964 5965 /* drain it before proceeding with destruction */ 5966 drain_workqueue(wq); 5967 5968 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ 5969 if (wq->rescuer) { 5970 /* rescuer will empty maydays list before exiting */ 5971 kthread_stop(wq->rescuer->task); 5972 kfree(wq->rescuer); 5973 wq->rescuer = NULL; 5974 } 5975 5976 /* 5977 * Sanity checks - grab all the locks so that we wait for all 5978 * in-flight operations which may do put_pwq(). 
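 *
 * If a pwq still turns out to be busy here, destruction is aborted
 * below and @wq is intentionally left allocated; freeing it with work
 * still in flight would be far worse than the leak.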
5979 */ 5980 mutex_lock(&wq_pool_mutex); 5981 mutex_lock(&wq->mutex); 5982 for_each_pwq(pwq, wq) { 5983 raw_spin_lock_irq(&pwq->pool->lock); 5984 if (WARN_ON(pwq_busy(pwq))) { 5985 pr_warn("%s: %s has the following busy pwq\n", 5986 __func__, wq->name); 5987 show_pwq(pwq); 5988 raw_spin_unlock_irq(&pwq->pool->lock); 5989 mutex_unlock(&wq->mutex); 5990 mutex_unlock(&wq_pool_mutex); 5991 show_one_workqueue(wq); 5992 return; 5993 } 5994 raw_spin_unlock_irq(&pwq->pool->lock); 5995 } 5996 mutex_unlock(&wq->mutex); 5997 5998 /* 5999 * wq list is used to freeze wq, remove from list after 6000 * flushing is complete in case freeze races us. 6001 */ 6002 list_del_rcu(&wq->list); 6003 mutex_unlock(&wq_pool_mutex); 6004 6005 /* 6006 * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq 6007 * to put the base refs. @wq will be auto-destroyed from the last 6008 * pwq_put. RCU read lock prevents @wq from going away from under us. 6009 */ 6010 rcu_read_lock(); 6011 6012 for_each_possible_cpu(cpu) { 6013 put_pwq_unlocked(unbound_pwq(wq, cpu)); 6014 RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); 6015 } 6016 6017 put_pwq_unlocked(unbound_pwq(wq, -1)); 6018 RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); 6019 6020 rcu_read_unlock(); 6021 } 6022 EXPORT_SYMBOL_GPL(destroy_workqueue); 6023 6024 /** 6025 * workqueue_set_max_active - adjust max_active of a workqueue 6026 * @wq: target workqueue 6027 * @max_active: new max_active value. 6028 * 6029 * Set max_active of @wq to @max_active. See the alloc_workqueue() function 6030 * comment. 6031 * 6032 * CONTEXT: 6033 * Don't call from IRQ context. 6034 */ 6035 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 6036 { 6037 /* max_active doesn't mean anything for BH workqueues */ 6038 if (WARN_ON(wq->flags & WQ_BH)) 6039 return; 6040 /* disallow meddling with max_active for ordered workqueues */ 6041 if (WARN_ON(wq->flags & __WQ_ORDERED)) 6042 return; 6043 6044 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 6045 6046 mutex_lock(&wq->mutex); 6047 6048 wq->saved_max_active = max_active; 6049 if (wq->flags & WQ_UNBOUND) 6050 wq->saved_min_active = min(wq->saved_min_active, max_active); 6051 6052 wq_adjust_max_active(wq); 6053 6054 mutex_unlock(&wq->mutex); 6055 } 6056 EXPORT_SYMBOL_GPL(workqueue_set_max_active); 6057 6058 /** 6059 * workqueue_set_min_active - adjust min_active of an unbound workqueue 6060 * @wq: target unbound workqueue 6061 * @min_active: new min_active value 6062 * 6063 * Set min_active of an unbound workqueue. Unlike other types of workqueues, an 6064 * unbound workqueue is not guaranteed to be able to process max_active 6065 * interdependent work items. Instead, an unbound workqueue is guaranteed to be 6066 * able to process min_active number of interdependent work items which is 6067 * %WQ_DFL_MIN_ACTIVE by default. 6068 * 6069 * Use this function to adjust the min_active value between 0 and the current 6070 * max_active. 
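 *
 * A minimal usage sketch; the workqueue name and the depth of 16 are
 * made up for illustration and not taken from an in-tree user:
 *
 *	wq = alloc_workqueue("foo_wq", WQ_UNBOUND, 0);
 *	if (wq)
 *		workqueue_set_min_active(wq, 16);
 *
 * This guarantees that up to 16 interdependent work items queued on
 * "foo_wq" can make forward progress concurrently.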
6071 */ 6072 void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) 6073 { 6074 /* min_active is only meaningful for non-ordered unbound workqueues */ 6075 if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != 6076 WQ_UNBOUND)) 6077 return; 6078 6079 mutex_lock(&wq->mutex); 6080 wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); 6081 wq_adjust_max_active(wq); 6082 mutex_unlock(&wq->mutex); 6083 } 6084 6085 /** 6086 * current_work - retrieve %current task's work struct 6087 * 6088 * Determine if %current task is a workqueue worker and what it's working on. 6089 * Useful to find out the context that the %current task is running in. 6090 * 6091 * Return: work struct if %current task is a workqueue worker, %NULL otherwise. 6092 */ 6093 struct work_struct *current_work(void) 6094 { 6095 struct worker *worker = current_wq_worker(); 6096 6097 return worker ? worker->current_work : NULL; 6098 } 6099 EXPORT_SYMBOL(current_work); 6100 6101 /** 6102 * current_is_workqueue_rescuer - is %current workqueue rescuer? 6103 * 6104 * Determine whether %current is a workqueue rescuer. Can be used from 6105 * work functions to determine whether it's being run off the rescuer task. 6106 * 6107 * Return: %true if %current is a workqueue rescuer. %false otherwise. 6108 */ 6109 bool current_is_workqueue_rescuer(void) 6110 { 6111 struct worker *worker = current_wq_worker(); 6112 6113 return worker && worker->rescue_wq; 6114 } 6115 6116 /** 6117 * workqueue_congested - test whether a workqueue is congested 6118 * @cpu: CPU in question 6119 * @wq: target workqueue 6120 * 6121 * Test whether @wq's cpu workqueue for @cpu is congested. There is 6122 * no synchronization around this function and the test result is 6123 * unreliable and only useful as advisory hints or for debugging. 6124 * 6125 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. 6126 * 6127 * With the exception of ordered workqueues, all workqueues have per-cpu 6128 * pool_workqueues, each with its own congested state. A workqueue being 6129 * congested on one CPU doesn't mean that the workqueue is contested on any 6130 * other CPUs. 6131 * 6132 * Return: 6133 * %true if congested, %false otherwise. 6134 */ 6135 bool workqueue_congested(int cpu, struct workqueue_struct *wq) 6136 { 6137 struct pool_workqueue *pwq; 6138 bool ret; 6139 6140 preempt_disable(); 6141 6142 if (cpu == WORK_CPU_UNBOUND) 6143 cpu = smp_processor_id(); 6144 6145 pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); 6146 ret = !list_empty(&pwq->inactive_works); 6147 6148 preempt_enable(); 6149 6150 return ret; 6151 } 6152 EXPORT_SYMBOL_GPL(workqueue_congested); 6153 6154 /** 6155 * work_busy - test whether a work is currently pending or running 6156 * @work: the work to be tested 6157 * 6158 * Test whether @work is currently pending or running. There is no 6159 * synchronization around this function and the test result is 6160 * unreliable and only useful as advisory hints or for debugging. 6161 * 6162 * Return: 6163 * OR'd bitmask of WORK_BUSY_* bits. 
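 *
 * A minimal sketch of advisory use in a driver debug path; my_dev and
 * its reset_work member are hypothetical:
 *
 *	if (work_busy(&my_dev->reset_work) & WORK_BUSY_RUNNING)
 *		dev_dbg(my_dev->dev, "reset work already running\n");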
6164 */ 6165 unsigned int work_busy(struct work_struct *work) 6166 { 6167 struct worker_pool *pool; 6168 unsigned long irq_flags; 6169 unsigned int ret = 0; 6170 6171 if (work_pending(work)) 6172 ret |= WORK_BUSY_PENDING; 6173 6174 rcu_read_lock(); 6175 pool = get_work_pool(work); 6176 if (pool) { 6177 raw_spin_lock_irqsave(&pool->lock, irq_flags); 6178 if (find_worker_executing_work(pool, work)) 6179 ret |= WORK_BUSY_RUNNING; 6180 raw_spin_unlock_irqrestore(&pool->lock, irq_flags); 6181 } 6182 rcu_read_unlock(); 6183 6184 return ret; 6185 } 6186 EXPORT_SYMBOL_GPL(work_busy); 6187 6188 /** 6189 * set_worker_desc - set description for the current work item 6190 * @fmt: printf-style format string 6191 * @...: arguments for the format string 6192 * 6193 * This function can be called by a running work function to describe what 6194 * the work item is about. If the worker task gets dumped, this 6195 * information will be printed out together to help debugging. The 6196 * description can be at most WORKER_DESC_LEN including the trailing '\0'. 6197 */ 6198 void set_worker_desc(const char *fmt, ...) 6199 { 6200 struct worker *worker = current_wq_worker(); 6201 va_list args; 6202 6203 if (worker) { 6204 va_start(args, fmt); 6205 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); 6206 va_end(args); 6207 } 6208 } 6209 EXPORT_SYMBOL_GPL(set_worker_desc); 6210 6211 /** 6212 * print_worker_info - print out worker information and description 6213 * @log_lvl: the log level to use when printing 6214 * @task: target task 6215 * 6216 * If @task is a worker and currently executing a work item, print out the 6217 * name of the workqueue being serviced and worker description set with 6218 * set_worker_desc() by the currently executing work item. 6219 * 6220 * This function can be safely called on any task as long as the 6221 * task_struct itself is accessible. While safe, this function isn't 6222 * synchronized and may print out mixups or garbages of limited length. 6223 */ 6224 void print_worker_info(const char *log_lvl, struct task_struct *task) 6225 { 6226 work_func_t *fn = NULL; 6227 char name[WQ_NAME_LEN] = { }; 6228 char desc[WORKER_DESC_LEN] = { }; 6229 struct pool_workqueue *pwq = NULL; 6230 struct workqueue_struct *wq = NULL; 6231 struct worker *worker; 6232 6233 if (!(task->flags & PF_WQ_WORKER)) 6234 return; 6235 6236 /* 6237 * This function is called without any synchronization and @task 6238 * could be in any state. Be careful with dereferences. 6239 */ 6240 worker = kthread_probe_data(task); 6241 6242 /* 6243 * Carefully copy the associated workqueue's workfn, name and desc. 6244 * Keep the original last '\0' in case the original is garbage. 
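 * copy_from_kernel_nofault() is used for every dereference so that a
 * worker which is concurrently exiting, or a corrupted pointer, cannot
 * fault the task that is dumping the information.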
6245 */ 6246 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); 6247 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); 6248 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); 6249 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); 6250 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); 6251 6252 if (fn || name[0] || desc[0]) { 6253 printk("%sWorkqueue: %s %ps", log_lvl, name, fn); 6254 if (strcmp(name, desc)) 6255 pr_cont(" (%s)", desc); 6256 pr_cont("\n"); 6257 } 6258 } 6259 6260 static void pr_cont_pool_info(struct worker_pool *pool) 6261 { 6262 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); 6263 if (pool->node != NUMA_NO_NODE) 6264 pr_cont(" node=%d", pool->node); 6265 pr_cont(" flags=0x%x", pool->flags); 6266 if (pool->flags & POOL_BH) 6267 pr_cont(" bh%s", 6268 pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); 6269 else 6270 pr_cont(" nice=%d", pool->attrs->nice); 6271 } 6272 6273 static void pr_cont_worker_id(struct worker *worker) 6274 { 6275 struct worker_pool *pool = worker->pool; 6276 6277 if (pool->flags & WQ_BH) 6278 pr_cont("bh%s", 6279 pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); 6280 else 6281 pr_cont("%d%s", task_pid_nr(worker->task), 6282 worker->rescue_wq ? "(RESCUER)" : ""); 6283 } 6284 6285 struct pr_cont_work_struct { 6286 bool comma; 6287 work_func_t func; 6288 long ctr; 6289 }; 6290 6291 static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) 6292 { 6293 if (!pcwsp->ctr) 6294 goto out_record; 6295 if (func == pcwsp->func) { 6296 pcwsp->ctr++; 6297 return; 6298 } 6299 if (pcwsp->ctr == 1) 6300 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); 6301 else 6302 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); 6303 pcwsp->ctr = 0; 6304 out_record: 6305 if ((long)func == -1L) 6306 return; 6307 pcwsp->comma = comma; 6308 pcwsp->func = func; 6309 pcwsp->ctr = 1; 6310 } 6311 6312 static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) 6313 { 6314 if (work->func == wq_barrier_func) { 6315 struct wq_barrier *barr; 6316 6317 barr = container_of(work, struct wq_barrier, work); 6318 6319 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 6320 pr_cont("%s BAR(%d)", comma ? "," : "", 6321 task_pid_nr(barr->task)); 6322 } else { 6323 if (!comma) 6324 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 6325 pr_cont_work_flush(comma, work->func, pcwsp); 6326 } 6327 } 6328 6329 static void show_pwq(struct pool_workqueue *pwq) 6330 { 6331 struct pr_cont_work_struct pcws = { .ctr = 0, }; 6332 struct worker_pool *pool = pwq->pool; 6333 struct work_struct *work; 6334 struct worker *worker; 6335 bool has_in_flight = false, has_pending = false; 6336 int bkt; 6337 6338 pr_info(" pwq %d:", pool->id); 6339 pr_cont_pool_info(pool); 6340 6341 pr_cont(" active=%d refcnt=%d%s\n", 6342 pwq->nr_active, pwq->refcnt, 6343 !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); 6344 6345 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 6346 if (worker->current_pwq == pwq) { 6347 has_in_flight = true; 6348 break; 6349 } 6350 } 6351 if (has_in_flight) { 6352 bool comma = false; 6353 6354 pr_info(" in-flight:"); 6355 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 6356 if (worker->current_pwq != pwq) 6357 continue; 6358 6359 pr_cont(" %s", comma ? 
"," : ""); 6360 pr_cont_worker_id(worker); 6361 pr_cont(":%ps", worker->current_func); 6362 list_for_each_entry(work, &worker->scheduled, entry) 6363 pr_cont_work(false, work, &pcws); 6364 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 6365 comma = true; 6366 } 6367 pr_cont("\n"); 6368 } 6369 6370 list_for_each_entry(work, &pool->worklist, entry) { 6371 if (get_work_pwq(work) == pwq) { 6372 has_pending = true; 6373 break; 6374 } 6375 } 6376 if (has_pending) { 6377 bool comma = false; 6378 6379 pr_info(" pending:"); 6380 list_for_each_entry(work, &pool->worklist, entry) { 6381 if (get_work_pwq(work) != pwq) 6382 continue; 6383 6384 pr_cont_work(comma, work, &pcws); 6385 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 6386 } 6387 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 6388 pr_cont("\n"); 6389 } 6390 6391 if (!list_empty(&pwq->inactive_works)) { 6392 bool comma = false; 6393 6394 pr_info(" inactive:"); 6395 list_for_each_entry(work, &pwq->inactive_works, entry) { 6396 pr_cont_work(comma, work, &pcws); 6397 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 6398 } 6399 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 6400 pr_cont("\n"); 6401 } 6402 } 6403 6404 /** 6405 * show_one_workqueue - dump state of specified workqueue 6406 * @wq: workqueue whose state will be printed 6407 */ 6408 void show_one_workqueue(struct workqueue_struct *wq) 6409 { 6410 struct pool_workqueue *pwq; 6411 bool idle = true; 6412 unsigned long irq_flags; 6413 6414 for_each_pwq(pwq, wq) { 6415 if (!pwq_is_empty(pwq)) { 6416 idle = false; 6417 break; 6418 } 6419 } 6420 if (idle) /* Nothing to print for idle workqueue */ 6421 return; 6422 6423 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); 6424 6425 for_each_pwq(pwq, wq) { 6426 raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); 6427 if (!pwq_is_empty(pwq)) { 6428 /* 6429 * Defer printing to avoid deadlocks in console 6430 * drivers that queue work while holding locks 6431 * also taken in their write paths. 6432 */ 6433 printk_deferred_enter(); 6434 show_pwq(pwq); 6435 printk_deferred_exit(); 6436 } 6437 raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); 6438 /* 6439 * We could be printing a lot from atomic context, e.g. 6440 * sysrq-t -> show_all_workqueues(). Avoid triggering 6441 * hard lockup. 6442 */ 6443 touch_nmi_watchdog(); 6444 } 6445 6446 } 6447 6448 /** 6449 * show_one_worker_pool - dump state of specified worker pool 6450 * @pool: worker pool whose state will be printed 6451 */ 6452 static void show_one_worker_pool(struct worker_pool *pool) 6453 { 6454 struct worker *worker; 6455 bool first = true; 6456 unsigned long irq_flags; 6457 unsigned long hung = 0; 6458 6459 raw_spin_lock_irqsave(&pool->lock, irq_flags); 6460 if (pool->nr_workers == pool->nr_idle) 6461 goto next_pool; 6462 6463 /* How long the first pending work is waiting for a worker. */ 6464 if (!list_empty(&pool->worklist)) 6465 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; 6466 6467 /* 6468 * Defer printing to avoid deadlocks in console drivers that 6469 * queue work while holding locks also taken in their write 6470 * paths. 6471 */ 6472 printk_deferred_enter(); 6473 pr_info("pool %d:", pool->id); 6474 pr_cont_pool_info(pool); 6475 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); 6476 if (pool->manager) 6477 pr_cont(" manager: %d", 6478 task_pid_nr(pool->manager->task)); 6479 list_for_each_entry(worker, &pool->idle_list, entry) { 6480 pr_cont(" %s", first ? 
"idle: " : ""); 6481 pr_cont_worker_id(worker); 6482 first = false; 6483 } 6484 pr_cont("\n"); 6485 printk_deferred_exit(); 6486 next_pool: 6487 raw_spin_unlock_irqrestore(&pool->lock, irq_flags); 6488 /* 6489 * We could be printing a lot from atomic context, e.g. 6490 * sysrq-t -> show_all_workqueues(). Avoid triggering 6491 * hard lockup. 6492 */ 6493 touch_nmi_watchdog(); 6494 6495 } 6496 6497 /** 6498 * show_all_workqueues - dump workqueue state 6499 * 6500 * Called from a sysrq handler and prints out all busy workqueues and pools. 6501 */ 6502 void show_all_workqueues(void) 6503 { 6504 struct workqueue_struct *wq; 6505 struct worker_pool *pool; 6506 int pi; 6507 6508 rcu_read_lock(); 6509 6510 pr_info("Showing busy workqueues and worker pools:\n"); 6511 6512 list_for_each_entry_rcu(wq, &workqueues, list) 6513 show_one_workqueue(wq); 6514 6515 for_each_pool(pool, pi) 6516 show_one_worker_pool(pool); 6517 6518 rcu_read_unlock(); 6519 } 6520 6521 /** 6522 * show_freezable_workqueues - dump freezable workqueue state 6523 * 6524 * Called from try_to_freeze_tasks() and prints out all freezable workqueues 6525 * still busy. 6526 */ 6527 void show_freezable_workqueues(void) 6528 { 6529 struct workqueue_struct *wq; 6530 6531 rcu_read_lock(); 6532 6533 pr_info("Showing freezable workqueues that are still busy:\n"); 6534 6535 list_for_each_entry_rcu(wq, &workqueues, list) { 6536 if (!(wq->flags & WQ_FREEZABLE)) 6537 continue; 6538 show_one_workqueue(wq); 6539 } 6540 6541 rcu_read_unlock(); 6542 } 6543 6544 /* used to show worker information through /proc/PID/{comm,stat,status} */ 6545 void wq_worker_comm(char *buf, size_t size, struct task_struct *task) 6546 { 6547 /* stabilize PF_WQ_WORKER and worker pool association */ 6548 mutex_lock(&wq_pool_attach_mutex); 6549 6550 if (task->flags & PF_WQ_WORKER) { 6551 struct worker *worker = kthread_data(task); 6552 struct worker_pool *pool = worker->pool; 6553 int off; 6554 6555 off = format_worker_id(buf, size, worker, pool); 6556 6557 if (pool) { 6558 raw_spin_lock_irq(&pool->lock); 6559 /* 6560 * ->desc tracks information (wq name or 6561 * set_worker_desc()) for the latest execution. If 6562 * current, prepend '+', otherwise '-'. 6563 */ 6564 if (worker->desc[0] != '\0') { 6565 if (worker->current_work) 6566 scnprintf(buf + off, size - off, "+%s", 6567 worker->desc); 6568 else 6569 scnprintf(buf + off, size - off, "-%s", 6570 worker->desc); 6571 } 6572 raw_spin_unlock_irq(&pool->lock); 6573 } 6574 } else { 6575 strscpy(buf, task->comm, size); 6576 } 6577 6578 mutex_unlock(&wq_pool_attach_mutex); 6579 } 6580 6581 #ifdef CONFIG_SMP 6582 6583 /* 6584 * CPU hotplug. 6585 * 6586 * There are two challenges in supporting CPU hotplug. Firstly, there 6587 * are a lot of assumptions on strong associations among work, pwq and 6588 * pool which make migrating pending and scheduled works very 6589 * difficult to implement without impacting hot paths. Secondly, 6590 * worker pools serve mix of short, long and very long running works making 6591 * blocked draining impractical. 6592 * 6593 * This is solved by allowing the pools to be disassociated from the CPU 6594 * running as an unbound one and allowing it to be reattached later if the 6595 * cpu comes back online. 
6596 */ 6597 6598 static void unbind_workers(int cpu) 6599 { 6600 struct worker_pool *pool; 6601 struct worker *worker; 6602 6603 for_each_cpu_worker_pool(pool, cpu) { 6604 mutex_lock(&wq_pool_attach_mutex); 6605 raw_spin_lock_irq(&pool->lock); 6606 6607 /* 6608 * We've blocked all attach/detach operations. Make all workers 6609 * unbound and set DISASSOCIATED. Before this, all workers 6610 * must be on the cpu. After this, they may become diasporas. 6611 * And the preemption disabled section in their sched callbacks 6612 * are guaranteed to see WORKER_UNBOUND since the code here 6613 * is on the same cpu. 6614 */ 6615 for_each_pool_worker(worker, pool) 6616 worker->flags |= WORKER_UNBOUND; 6617 6618 pool->flags |= POOL_DISASSOCIATED; 6619 6620 /* 6621 * The handling of nr_running in sched callbacks are disabled 6622 * now. Zap nr_running. After this, nr_running stays zero and 6623 * need_more_worker() and keep_working() are always true as 6624 * long as the worklist is not empty. This pool now behaves as 6625 * an unbound (in terms of concurrency management) pool which 6626 * are served by workers tied to the pool. 6627 */ 6628 pool->nr_running = 0; 6629 6630 /* 6631 * With concurrency management just turned off, a busy 6632 * worker blocking could lead to lengthy stalls. Kick off 6633 * unbound chain execution of currently pending work items. 6634 */ 6635 kick_pool(pool); 6636 6637 raw_spin_unlock_irq(&pool->lock); 6638 6639 for_each_pool_worker(worker, pool) 6640 unbind_worker(worker); 6641 6642 mutex_unlock(&wq_pool_attach_mutex); 6643 } 6644 } 6645 6646 /** 6647 * rebind_workers - rebind all workers of a pool to the associated CPU 6648 * @pool: pool of interest 6649 * 6650 * @pool->cpu is coming online. Rebind all workers to the CPU. 6651 */ 6652 static void rebind_workers(struct worker_pool *pool) 6653 { 6654 struct worker *worker; 6655 6656 lockdep_assert_held(&wq_pool_attach_mutex); 6657 6658 /* 6659 * Restore CPU affinity of all workers. As all idle workers should 6660 * be on the run-queue of the associated CPU before any local 6661 * wake-ups for concurrency management happen, restore CPU affinity 6662 * of all workers first and then clear UNBOUND. As we're called 6663 * from CPU_ONLINE, the following shouldn't fail. 6664 */ 6665 for_each_pool_worker(worker, pool) { 6666 kthread_set_per_cpu(worker->task, pool->cpu); 6667 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 6668 pool_allowed_cpus(pool)) < 0); 6669 } 6670 6671 raw_spin_lock_irq(&pool->lock); 6672 6673 pool->flags &= ~POOL_DISASSOCIATED; 6674 6675 for_each_pool_worker(worker, pool) { 6676 unsigned int worker_flags = worker->flags; 6677 6678 /* 6679 * We want to clear UNBOUND but can't directly call 6680 * worker_clr_flags() or adjust nr_running. Atomically 6681 * replace UNBOUND with another NOT_RUNNING flag REBOUND. 6682 * @worker will clear REBOUND using worker_clr_flags() when 6683 * it initiates the next execution cycle thus restoring 6684 * concurrency management. Note that when or whether 6685 * @worker clears REBOUND doesn't affect correctness. 6686 * 6687 * WRITE_ONCE() is necessary because @worker->flags may be 6688 * tested without holding any lock in 6689 * wq_worker_running(). Without it, NOT_RUNNING test may 6690 * fail incorrectly leading to premature concurrency 6691 * management operations. 
6692 */ 6693 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); 6694 worker_flags |= WORKER_REBOUND; 6695 worker_flags &= ~WORKER_UNBOUND; 6696 WRITE_ONCE(worker->flags, worker_flags); 6697 } 6698 6699 raw_spin_unlock_irq(&pool->lock); 6700 } 6701 6702 /** 6703 * restore_unbound_workers_cpumask - restore cpumask of unbound workers 6704 * @pool: unbound pool of interest 6705 * @cpu: the CPU which is coming up 6706 * 6707 * An unbound pool may end up with a cpumask which doesn't have any online 6708 * CPUs. When a worker of such pool get scheduled, the scheduler resets 6709 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any 6710 * online CPU before, cpus_allowed of all its workers should be restored. 6711 */ 6712 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) 6713 { 6714 static cpumask_t cpumask; 6715 struct worker *worker; 6716 6717 lockdep_assert_held(&wq_pool_attach_mutex); 6718 6719 /* is @cpu allowed for @pool? */ 6720 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 6721 return; 6722 6723 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); 6724 6725 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 6726 for_each_pool_worker(worker, pool) 6727 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); 6728 } 6729 6730 int workqueue_prepare_cpu(unsigned int cpu) 6731 { 6732 struct worker_pool *pool; 6733 6734 for_each_cpu_worker_pool(pool, cpu) { 6735 if (pool->nr_workers) 6736 continue; 6737 if (!create_worker(pool)) 6738 return -ENOMEM; 6739 } 6740 return 0; 6741 } 6742 6743 int workqueue_online_cpu(unsigned int cpu) 6744 { 6745 struct worker_pool *pool; 6746 struct workqueue_struct *wq; 6747 int pi; 6748 6749 mutex_lock(&wq_pool_mutex); 6750 6751 cpumask_set_cpu(cpu, wq_online_cpumask); 6752 6753 for_each_pool(pool, pi) { 6754 /* BH pools aren't affected by hotplug */ 6755 if (pool->flags & POOL_BH) 6756 continue; 6757 6758 mutex_lock(&wq_pool_attach_mutex); 6759 if (pool->cpu == cpu) 6760 rebind_workers(pool); 6761 else if (pool->cpu < 0) 6762 restore_unbound_workers_cpumask(pool, cpu); 6763 mutex_unlock(&wq_pool_attach_mutex); 6764 } 6765 6766 /* update pod affinity of unbound workqueues */ 6767 list_for_each_entry(wq, &workqueues, list) { 6768 struct workqueue_attrs *attrs = wq->unbound_attrs; 6769 6770 if (attrs) { 6771 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 6772 int tcpu; 6773 6774 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) 6775 unbound_wq_update_pwq(wq, tcpu); 6776 6777 mutex_lock(&wq->mutex); 6778 wq_update_node_max_active(wq, -1); 6779 mutex_unlock(&wq->mutex); 6780 } 6781 } 6782 6783 mutex_unlock(&wq_pool_mutex); 6784 return 0; 6785 } 6786 6787 int workqueue_offline_cpu(unsigned int cpu) 6788 { 6789 struct workqueue_struct *wq; 6790 6791 /* unbinding per-cpu workers should happen on the local CPU */ 6792 if (WARN_ON(cpu != smp_processor_id())) 6793 return -1; 6794 6795 unbind_workers(cpu); 6796 6797 /* update pod affinity of unbound workqueues */ 6798 mutex_lock(&wq_pool_mutex); 6799 6800 cpumask_clear_cpu(cpu, wq_online_cpumask); 6801 6802 list_for_each_entry(wq, &workqueues, list) { 6803 struct workqueue_attrs *attrs = wq->unbound_attrs; 6804 6805 if (attrs) { 6806 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 6807 int tcpu; 6808 6809 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) 6810 unbound_wq_update_pwq(wq, tcpu); 6811 6812 mutex_lock(&wq->mutex); 6813 wq_update_node_max_active(wq, cpu); 6814 mutex_unlock(&wq->mutex); 6815 } 6816 } 6817 
mutex_unlock(&wq_pool_mutex); 6818 6819 return 0; 6820 } 6821 6822 struct work_for_cpu { 6823 struct work_struct work; 6824 long (*fn)(void *); 6825 void *arg; 6826 long ret; 6827 }; 6828 6829 static void work_for_cpu_fn(struct work_struct *work) 6830 { 6831 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); 6832 6833 wfc->ret = wfc->fn(wfc->arg); 6834 } 6835 6836 /** 6837 * work_on_cpu_key - run a function in thread context on a particular cpu 6838 * @cpu: the cpu to run on 6839 * @fn: the function to run 6840 * @arg: the function arg 6841 * @key: The lock class key for lock debugging purposes 6842 * 6843 * It is up to the caller to ensure that the cpu doesn't go offline. 6844 * The caller must not hold any locks which would prevent @fn from completing. 6845 * 6846 * Return: The value @fn returns. 6847 */ 6848 long work_on_cpu_key(int cpu, long (*fn)(void *), 6849 void *arg, struct lock_class_key *key) 6850 { 6851 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 6852 6853 INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); 6854 schedule_work_on(cpu, &wfc.work); 6855 flush_work(&wfc.work); 6856 destroy_work_on_stack(&wfc.work); 6857 return wfc.ret; 6858 } 6859 EXPORT_SYMBOL_GPL(work_on_cpu_key); 6860 #endif /* CONFIG_SMP */ 6861 6862 #ifdef CONFIG_FREEZER 6863 6864 /** 6865 * freeze_workqueues_begin - begin freezing workqueues 6866 * 6867 * Start freezing workqueues. After this function returns, all freezable 6868 * workqueues will queue new works to their inactive_works list instead of 6869 * pool->worklist. 6870 * 6871 * CONTEXT: 6872 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 6873 */ 6874 void freeze_workqueues_begin(void) 6875 { 6876 struct workqueue_struct *wq; 6877 6878 mutex_lock(&wq_pool_mutex); 6879 6880 WARN_ON_ONCE(workqueue_freezing); 6881 workqueue_freezing = true; 6882 6883 list_for_each_entry(wq, &workqueues, list) { 6884 mutex_lock(&wq->mutex); 6885 wq_adjust_max_active(wq); 6886 mutex_unlock(&wq->mutex); 6887 } 6888 6889 mutex_unlock(&wq_pool_mutex); 6890 } 6891 6892 /** 6893 * freeze_workqueues_busy - are freezable workqueues still busy? 6894 * 6895 * Check whether freezing is complete. This function must be called 6896 * between freeze_workqueues_begin() and thaw_workqueues(). 6897 * 6898 * CONTEXT: 6899 * Grabs and releases wq_pool_mutex. 6900 * 6901 * Return: 6902 * %true if some freezable workqueues are still busy. %false if freezing 6903 * is complete. 6904 */ 6905 bool freeze_workqueues_busy(void) 6906 { 6907 bool busy = false; 6908 struct workqueue_struct *wq; 6909 struct pool_workqueue *pwq; 6910 6911 mutex_lock(&wq_pool_mutex); 6912 6913 WARN_ON_ONCE(!workqueue_freezing); 6914 6915 list_for_each_entry(wq, &workqueues, list) { 6916 if (!(wq->flags & WQ_FREEZABLE)) 6917 continue; 6918 /* 6919 * nr_active is monotonically decreasing. It's safe 6920 * to peek without lock. 6921 */ 6922 rcu_read_lock(); 6923 for_each_pwq(pwq, wq) { 6924 WARN_ON_ONCE(pwq->nr_active < 0); 6925 if (pwq->nr_active) { 6926 busy = true; 6927 rcu_read_unlock(); 6928 goto out_unlock; 6929 } 6930 } 6931 rcu_read_unlock(); 6932 } 6933 out_unlock: 6934 mutex_unlock(&wq_pool_mutex); 6935 return busy; 6936 } 6937 6938 /** 6939 * thaw_workqueues - thaw workqueues 6940 * 6941 * Thaw workqueues. Normal queueing is restored and all collected 6942 * frozen works are transferred to their respective pool worklists. 6943 * 6944 * CONTEXT: 6945 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
6946 */ 6947 void thaw_workqueues(void) 6948 { 6949 struct workqueue_struct *wq; 6950 6951 mutex_lock(&wq_pool_mutex); 6952 6953 if (!workqueue_freezing) 6954 goto out_unlock; 6955 6956 workqueue_freezing = false; 6957 6958 /* restore max_active and repopulate worklist */ 6959 list_for_each_entry(wq, &workqueues, list) { 6960 mutex_lock(&wq->mutex); 6961 wq_adjust_max_active(wq); 6962 mutex_unlock(&wq->mutex); 6963 } 6964 6965 out_unlock: 6966 mutex_unlock(&wq_pool_mutex); 6967 } 6968 #endif /* CONFIG_FREEZER */ 6969 6970 static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) 6971 { 6972 LIST_HEAD(ctxs); 6973 int ret = 0; 6974 struct workqueue_struct *wq; 6975 struct apply_wqattrs_ctx *ctx, *n; 6976 6977 lockdep_assert_held(&wq_pool_mutex); 6978 6979 list_for_each_entry(wq, &workqueues, list) { 6980 if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING)) 6981 continue; 6982 6983 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); 6984 if (IS_ERR(ctx)) { 6985 ret = PTR_ERR(ctx); 6986 break; 6987 } 6988 6989 list_add_tail(&ctx->list, &ctxs); 6990 } 6991 6992 list_for_each_entry_safe(ctx, n, &ctxs, list) { 6993 if (!ret) 6994 apply_wqattrs_commit(ctx); 6995 apply_wqattrs_cleanup(ctx); 6996 } 6997 6998 if (!ret) { 6999 int cpu; 7000 struct worker_pool *pool; 7001 struct worker *worker; 7002 7003 mutex_lock(&wq_pool_attach_mutex); 7004 cpumask_copy(wq_unbound_cpumask, unbound_cpumask); 7005 /* rescuer needs to respect cpumask changes when it is not attached */ 7006 list_for_each_entry(wq, &workqueues, list) { 7007 if (wq->rescuer && !wq->rescuer->pool) 7008 unbind_worker(wq->rescuer); 7009 } 7010 /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */ 7011 for_each_possible_cpu(cpu) { 7012 for_each_cpu_worker_pool(pool, cpu) { 7013 if (!(pool->flags & POOL_DISASSOCIATED)) 7014 continue; 7015 for_each_pool_worker(worker, pool) 7016 unbind_worker(worker); 7017 } 7018 } 7019 mutex_unlock(&wq_pool_attach_mutex); 7020 } 7021 return ret; 7022 } 7023 7024 /** 7025 * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update 7026 * @hk: the new housekeeping cpumask 7027 * 7028 * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such 7029 * that the effective unbound affinity is the intersection of the new housekeeping 7030 * with the requested affinity set via nohz_full=/isolcpus= or sysfs. 7031 * 7032 * Return: 0 on success and -errno on failure. 7033 */ 7034 int workqueue_unbound_housekeeping_update(const struct cpumask *hk) 7035 { 7036 cpumask_var_t cpumask; 7037 int ret = 0; 7038 7039 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 7040 return -ENOMEM; 7041 7042 mutex_lock(&wq_pool_mutex); 7043 7044 /* 7045 * If the operation fails, it will fall back to 7046 * wq_requested_unbound_cpumask which is initially set to 7047 * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten 7048 * by any subsequent write to workqueue/cpumask sysfs file. 
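 *
 * In other words, if the new housekeeping mask shares no CPU with the
 * requested mask, the requested mask is used unmodified instead of
 * leaving the unbound cpumask empty.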
7049 */ 7050 if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk)) 7051 cpumask_copy(cpumask, wq_requested_unbound_cpumask); 7052 if (!cpumask_equal(cpumask, wq_unbound_cpumask)) 7053 ret = workqueue_apply_unbound_cpumask(cpumask); 7054 7055 /* Save the current isolated cpumask & export it via sysfs */ 7056 if (!ret) 7057 cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk); 7058 7059 mutex_unlock(&wq_pool_mutex); 7060 free_cpumask_var(cpumask); 7061 return ret; 7062 } 7063 7064 static int parse_affn_scope(const char *val) 7065 { 7066 int i; 7067 7068 for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { 7069 if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) 7070 return i; 7071 } 7072 return -EINVAL; 7073 } 7074 7075 static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) 7076 { 7077 struct workqueue_struct *wq; 7078 int affn, cpu; 7079 7080 affn = parse_affn_scope(val); 7081 if (affn < 0) 7082 return affn; 7083 if (affn == WQ_AFFN_DFL) 7084 return -EINVAL; 7085 7086 cpus_read_lock(); 7087 mutex_lock(&wq_pool_mutex); 7088 7089 wq_affn_dfl = affn; 7090 7091 list_for_each_entry(wq, &workqueues, list) { 7092 for_each_online_cpu(cpu) 7093 unbound_wq_update_pwq(wq, cpu); 7094 } 7095 7096 mutex_unlock(&wq_pool_mutex); 7097 cpus_read_unlock(); 7098 7099 return 0; 7100 } 7101 7102 static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) 7103 { 7104 return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); 7105 } 7106 7107 static const struct kernel_param_ops wq_affn_dfl_ops = { 7108 .set = wq_affn_dfl_set, 7109 .get = wq_affn_dfl_get, 7110 }; 7111 7112 module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); 7113 7114 #ifdef CONFIG_SYSFS 7115 /* 7116 * Workqueues with WQ_SYSFS flag set is visible to userland via 7117 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the 7118 * following attributes. 7119 * 7120 * per_cpu RO bool : whether the workqueue is per-cpu or unbound 7121 * max_active RW int : maximum number of in-flight work items 7122 * 7123 * Unbound workqueues have the following extra attributes. 
7124 * 7125 * nice RW int : nice value of the workers 7126 * cpumask RW mask : bitmask of allowed CPUs for the workers 7127 * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) 7128 * affinity_strict RW bool : worker CPU affinity is strict 7129 */ 7130 struct wq_device { 7131 struct workqueue_struct *wq; 7132 struct device dev; 7133 }; 7134 7135 static struct workqueue_struct *dev_to_wq(struct device *dev) 7136 { 7137 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 7138 7139 return wq_dev->wq; 7140 } 7141 7142 static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, 7143 char *buf) 7144 { 7145 struct workqueue_struct *wq = dev_to_wq(dev); 7146 7147 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 7148 } 7149 static DEVICE_ATTR_RO(per_cpu); 7150 7151 static ssize_t max_active_show(struct device *dev, 7152 struct device_attribute *attr, char *buf) 7153 { 7154 struct workqueue_struct *wq = dev_to_wq(dev); 7155 7156 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 7157 } 7158 7159 static ssize_t max_active_store(struct device *dev, 7160 struct device_attribute *attr, const char *buf, 7161 size_t count) 7162 { 7163 struct workqueue_struct *wq = dev_to_wq(dev); 7164 int val; 7165 7166 if (sscanf(buf, "%d", &val) != 1 || val <= 0) 7167 return -EINVAL; 7168 7169 workqueue_set_max_active(wq, val); 7170 return count; 7171 } 7172 static DEVICE_ATTR_RW(max_active); 7173 7174 static struct attribute *wq_sysfs_attrs[] = { 7175 &dev_attr_per_cpu.attr, 7176 &dev_attr_max_active.attr, 7177 NULL, 7178 }; 7179 ATTRIBUTE_GROUPS(wq_sysfs); 7180 7181 static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, 7182 char *buf) 7183 { 7184 struct workqueue_struct *wq = dev_to_wq(dev); 7185 int written; 7186 7187 mutex_lock(&wq->mutex); 7188 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); 7189 mutex_unlock(&wq->mutex); 7190 7191 return written; 7192 } 7193 7194 /* prepare workqueue_attrs for sysfs store operations */ 7195 static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) 7196 { 7197 struct workqueue_attrs *attrs; 7198 7199 lockdep_assert_held(&wq_pool_mutex); 7200 7201 attrs = alloc_workqueue_attrs(); 7202 if (!attrs) 7203 return NULL; 7204 7205 copy_workqueue_attrs(attrs, wq->unbound_attrs); 7206 return attrs; 7207 } 7208 7209 static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, 7210 const char *buf, size_t count) 7211 { 7212 struct workqueue_struct *wq = dev_to_wq(dev); 7213 struct workqueue_attrs *attrs; 7214 int ret = -ENOMEM; 7215 7216 apply_wqattrs_lock(); 7217 7218 attrs = wq_sysfs_prep_attrs(wq); 7219 if (!attrs) 7220 goto out_unlock; 7221 7222 if (sscanf(buf, "%d", &attrs->nice) == 1 && 7223 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) 7224 ret = apply_workqueue_attrs_locked(wq, attrs); 7225 else 7226 ret = -EINVAL; 7227 7228 out_unlock: 7229 apply_wqattrs_unlock(); 7230 free_workqueue_attrs(attrs); 7231 return ret ?: count; 7232 } 7233 7234 static ssize_t wq_cpumask_show(struct device *dev, 7235 struct device_attribute *attr, char *buf) 7236 { 7237 struct workqueue_struct *wq = dev_to_wq(dev); 7238 int written; 7239 7240 mutex_lock(&wq->mutex); 7241 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", 7242 cpumask_pr_args(wq->unbound_attrs->cpumask)); 7243 mutex_unlock(&wq->mutex); 7244 return written; 7245 } 7246 7247 static ssize_t wq_cpumask_store(struct device *dev, 7248 struct device_attribute *attr, 
7249 const char *buf, size_t count) 7250 { 7251 struct workqueue_struct *wq = dev_to_wq(dev); 7252 struct workqueue_attrs *attrs; 7253 int ret = -ENOMEM; 7254 7255 apply_wqattrs_lock(); 7256 7257 attrs = wq_sysfs_prep_attrs(wq); 7258 if (!attrs) 7259 goto out_unlock; 7260 7261 ret = cpumask_parse(buf, attrs->cpumask); 7262 if (!ret) 7263 ret = apply_workqueue_attrs_locked(wq, attrs); 7264 7265 out_unlock: 7266 apply_wqattrs_unlock(); 7267 free_workqueue_attrs(attrs); 7268 return ret ?: count; 7269 } 7270 7271 static ssize_t wq_affn_scope_show(struct device *dev, 7272 struct device_attribute *attr, char *buf) 7273 { 7274 struct workqueue_struct *wq = dev_to_wq(dev); 7275 int written; 7276 7277 mutex_lock(&wq->mutex); 7278 if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) 7279 written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", 7280 wq_affn_names[WQ_AFFN_DFL], 7281 wq_affn_names[wq_affn_dfl]); 7282 else 7283 written = scnprintf(buf, PAGE_SIZE, "%s\n", 7284 wq_affn_names[wq->unbound_attrs->affn_scope]); 7285 mutex_unlock(&wq->mutex); 7286 7287 return written; 7288 } 7289 7290 static ssize_t wq_affn_scope_store(struct device *dev, 7291 struct device_attribute *attr, 7292 const char *buf, size_t count) 7293 { 7294 struct workqueue_struct *wq = dev_to_wq(dev); 7295 struct workqueue_attrs *attrs; 7296 int affn, ret = -ENOMEM; 7297 7298 affn = parse_affn_scope(buf); 7299 if (affn < 0) 7300 return affn; 7301 7302 apply_wqattrs_lock(); 7303 attrs = wq_sysfs_prep_attrs(wq); 7304 if (attrs) { 7305 attrs->affn_scope = affn; 7306 ret = apply_workqueue_attrs_locked(wq, attrs); 7307 } 7308 apply_wqattrs_unlock(); 7309 free_workqueue_attrs(attrs); 7310 return ret ?: count; 7311 } 7312 7313 static ssize_t wq_affinity_strict_show(struct device *dev, 7314 struct device_attribute *attr, char *buf) 7315 { 7316 struct workqueue_struct *wq = dev_to_wq(dev); 7317 7318 return scnprintf(buf, PAGE_SIZE, "%d\n", 7319 wq->unbound_attrs->affn_strict); 7320 } 7321 7322 static ssize_t wq_affinity_strict_store(struct device *dev, 7323 struct device_attribute *attr, 7324 const char *buf, size_t count) 7325 { 7326 struct workqueue_struct *wq = dev_to_wq(dev); 7327 struct workqueue_attrs *attrs; 7328 int v, ret = -ENOMEM; 7329 7330 if (sscanf(buf, "%d", &v) != 1) 7331 return -EINVAL; 7332 7333 apply_wqattrs_lock(); 7334 attrs = wq_sysfs_prep_attrs(wq); 7335 if (attrs) { 7336 attrs->affn_strict = (bool)v; 7337 ret = apply_workqueue_attrs_locked(wq, attrs); 7338 } 7339 apply_wqattrs_unlock(); 7340 free_workqueue_attrs(attrs); 7341 return ret ?: count; 7342 } 7343 7344 static struct device_attribute wq_sysfs_unbound_attrs[] = { 7345 __ATTR(nice, 0644, wq_nice_show, wq_nice_store), 7346 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), 7347 __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), 7348 __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), 7349 __ATTR_NULL, 7350 }; 7351 7352 static const struct bus_type wq_subsys = { 7353 .name = "workqueue", 7354 .dev_groups = wq_sysfs_groups, 7355 }; 7356 7357 /** 7358 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask 7359 * @cpumask: the cpumask to set 7360 * 7361 * The low-level workqueues cpumask is a global cpumask that limits 7362 * the affinity of all unbound workqueues. This function checks the @cpumask, 7363 * applies it to all unbound workqueues and updates all their pwqs. 7364 * 7365 * Return: 0 - Success 7366 * -EINVAL - Invalid @cpumask 7367 * -ENOMEM - Failed to allocate memory for attrs or pwqs.
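 *
 * A minimal usage sketch (illustrative only; it mirrors what cpumask_store()
 * below does when userspace writes the global "cpumask" sysfs file; the "f"
 * mask selecting CPUs 0-3 is hypothetical):
 *
 *	cpumask_var_t mask;
 *
 *	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
 *		return -ENOMEM;
 *	ret = cpumask_parse("f", mask);
 *	if (!ret)
 *		ret = workqueue_set_unbound_cpumask(mask);
 *	free_cpumask_var(mask);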
7368 */ 7369 static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) 7370 { 7371 int ret = -EINVAL; 7372 7373 /* 7374 * Not excluding isolated cpus on purpose. 7375 * If the user wishes to include them, we allow that. 7376 */ 7377 cpumask_and(cpumask, cpumask, cpu_possible_mask); 7378 if (!cpumask_empty(cpumask)) { 7379 ret = 0; 7380 apply_wqattrs_lock(); 7381 if (!cpumask_equal(cpumask, wq_unbound_cpumask)) 7382 ret = workqueue_apply_unbound_cpumask(cpumask); 7383 if (!ret) 7384 cpumask_copy(wq_requested_unbound_cpumask, cpumask); 7385 apply_wqattrs_unlock(); 7386 } 7387 7388 return ret; 7389 } 7390 7391 static ssize_t __wq_cpumask_show(struct device *dev, 7392 struct device_attribute *attr, char *buf, cpumask_var_t mask) 7393 { 7394 int written; 7395 7396 mutex_lock(&wq_pool_mutex); 7397 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); 7398 mutex_unlock(&wq_pool_mutex); 7399 7400 return written; 7401 } 7402 7403 static ssize_t cpumask_requested_show(struct device *dev, 7404 struct device_attribute *attr, char *buf) 7405 { 7406 return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); 7407 } 7408 static DEVICE_ATTR_RO(cpumask_requested); 7409 7410 static ssize_t cpumask_isolated_show(struct device *dev, 7411 struct device_attribute *attr, char *buf) 7412 { 7413 return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); 7414 } 7415 static DEVICE_ATTR_RO(cpumask_isolated); 7416 7417 static ssize_t cpumask_show(struct device *dev, 7418 struct device_attribute *attr, char *buf) 7419 { 7420 return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); 7421 } 7422 7423 static ssize_t cpumask_store(struct device *dev, 7424 struct device_attribute *attr, const char *buf, size_t count) 7425 { 7426 cpumask_var_t cpumask; 7427 int ret; 7428 7429 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 7430 return -ENOMEM; 7431 7432 ret = cpumask_parse(buf, cpumask); 7433 if (!ret) 7434 ret = workqueue_set_unbound_cpumask(cpumask); 7435 7436 free_cpumask_var(cpumask); 7437 return ret ? ret : count; 7438 } 7439 static DEVICE_ATTR_RW(cpumask); 7440 7441 static struct attribute *wq_sysfs_cpumask_attrs[] = { 7442 &dev_attr_cpumask.attr, 7443 &dev_attr_cpumask_requested.attr, 7444 &dev_attr_cpumask_isolated.attr, 7445 NULL, 7446 }; 7447 ATTRIBUTE_GROUPS(wq_sysfs_cpumask); 7448 7449 static int __init wq_sysfs_init(void) 7450 { 7451 return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); 7452 } 7453 core_initcall(wq_sysfs_init); 7454 7455 static void wq_device_release(struct device *dev) 7456 { 7457 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 7458 7459 kfree(wq_dev); 7460 } 7461 7462 /** 7463 * workqueue_sysfs_register - make a workqueue visible in sysfs 7464 * @wq: the workqueue to register 7465 * 7466 * Expose @wq in sysfs under /sys/bus/workqueue/devices. 7467 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set 7468 * which is the preferred method. 7469 * 7470 * Workqueue user should use this function directly iff it wants to apply 7471 * workqueue_attrs before making the workqueue visible in sysfs; otherwise, 7472 * apply_workqueue_attrs() may race against userland updating the 7473 * attributes. 7474 * 7475 * Return: 0 on success, -errno on failure. 7476 */ 7477 int workqueue_sysfs_register(struct workqueue_struct *wq) 7478 { 7479 struct wq_device *wq_dev; 7480 int ret; 7481 7482 /* 7483 * Adjusting max_active breaks ordering guarantee. Disallow exposing 7484 * ordered workqueues. 
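 * For example, an alloc_ordered_workqueue() user relies on max_active == 1 for
 * strict FIFO execution; if such a workqueue were exposed here, a userspace
 * write of max_active > 1 would silently allow its work items to run
 * concurrently.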
7485 */ 7486 if (WARN_ON(wq->flags & __WQ_ORDERED)) 7487 return -EINVAL; 7488 7489 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); 7490 if (!wq_dev) 7491 return -ENOMEM; 7492 7493 wq_dev->wq = wq; 7494 wq_dev->dev.bus = &wq_subsys; 7495 wq_dev->dev.release = wq_device_release; 7496 dev_set_name(&wq_dev->dev, "%s", wq->name); 7497 7498 /* 7499 * unbound_attrs are created separately. Suppress uevent until 7500 * everything is ready. 7501 */ 7502 dev_set_uevent_suppress(&wq_dev->dev, true); 7503 7504 ret = device_register(&wq_dev->dev); 7505 if (ret) { 7506 put_device(&wq_dev->dev); 7507 wq->wq_dev = NULL; 7508 return ret; 7509 } 7510 7511 if (wq->flags & WQ_UNBOUND) { 7512 struct device_attribute *attr; 7513 7514 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { 7515 ret = device_create_file(&wq_dev->dev, attr); 7516 if (ret) { 7517 device_unregister(&wq_dev->dev); 7518 wq->wq_dev = NULL; 7519 return ret; 7520 } 7521 } 7522 } 7523 7524 dev_set_uevent_suppress(&wq_dev->dev, false); 7525 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 7526 return 0; 7527 } 7528 7529 /** 7530 * workqueue_sysfs_unregister - undo workqueue_sysfs_register() 7531 * @wq: the workqueue to unregister 7532 * 7533 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 7534 */ 7535 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) 7536 { 7537 struct wq_device *wq_dev = wq->wq_dev; 7538 7539 if (!wq->wq_dev) 7540 return; 7541 7542 wq->wq_dev = NULL; 7543 device_unregister(&wq_dev->dev); 7544 } 7545 #else /* CONFIG_SYSFS */ 7546 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } 7547 #endif /* CONFIG_SYSFS */ 7548 7549 /* 7550 * Workqueue watchdog. 7551 * 7552 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal 7553 * flush dependency, a concurrency managed work item which stays RUNNING 7554 * indefinitely. Workqueue stalls can be very difficult to debug as the 7555 * usual warning mechanisms don't trigger and internal workqueue state is 7556 * largely opaque. 7557 * 7558 * Workqueue watchdog monitors all worker pools periodically and dumps 7559 * state if some pools failed to make forward progress for a while where 7560 * forward progress is defined as the first item on ->worklist changing. 7561 * 7562 * This mechanism is controlled through the kernel parameter 7563 * "workqueue.watchdog_thresh" which can be updated at runtime through the 7564 * corresponding sysfs parameter file. 7565 */ 7566 #ifdef CONFIG_WQ_WATCHDOG 7567 7568 static unsigned long wq_watchdog_thresh = 30; 7569 static struct timer_list wq_watchdog_timer; 7570 7571 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; 7572 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; 7573 7574 static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC; 7575 module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); 7576 7577 static unsigned int wq_panic_on_stall_time; 7578 module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644); 7579 MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)"); 7580 7581 /* 7582 * Show workers that might prevent the processing of pending work items. 7583 * The only candidates are CPU-bound workers in the running state. 7584 * Pending work items should be handled by another idle worker 7585 * in all other situations. 
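 * (If a busy worker sleeps, concurrency management wakes another worker to take
 * over the pending work, so only a worker hogging the CPU in TASK_RUNNING state
 * can starve the rest of the pool.)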
7586 */ 7587 static void show_cpu_pool_hog(struct worker_pool *pool) 7588 { 7589 struct worker *worker; 7590 unsigned long irq_flags; 7591 int bkt; 7592 7593 raw_spin_lock_irqsave(&pool->lock, irq_flags); 7594 7595 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 7596 if (task_is_running(worker->task)) { 7597 /* 7598 * Defer printing to avoid deadlocks in console 7599 * drivers that queue work while holding locks 7600 * also taken in their write paths. 7601 */ 7602 printk_deferred_enter(); 7603 7604 pr_info("pool %d:\n", pool->id); 7605 sched_show_task(worker->task); 7606 7607 printk_deferred_exit(); 7608 } 7609 } 7610 7611 raw_spin_unlock_irqrestore(&pool->lock, irq_flags); 7612 } 7613 7614 static void show_cpu_pools_hogs(void) 7615 { 7616 struct worker_pool *pool; 7617 int pi; 7618 7619 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); 7620 7621 rcu_read_lock(); 7622 7623 for_each_pool(pool, pi) { 7624 if (pool->cpu_stall) 7625 show_cpu_pool_hog(pool); 7626 7627 } 7628 7629 rcu_read_unlock(); 7630 } 7631 7632 /* 7633 * It triggers a panic in two scenarios: when the total number of stalls 7634 * exceeds a threshold, and when a stall lasts longer than 7635 * wq_panic_on_stall_time 7636 */ 7637 static void panic_on_wq_watchdog(unsigned int stall_time_sec) 7638 { 7639 static unsigned int wq_stall; 7640 7641 if (wq_panic_on_stall) { 7642 wq_stall++; 7643 if (wq_stall >= wq_panic_on_stall) 7644 panic("workqueue: %u stall(s) exceeded threshold %u\n", 7645 wq_stall, wq_panic_on_stall); 7646 } 7647 7648 if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time) 7649 panic("workqueue: stall lasted %us, exceeding threshold %us\n", 7650 stall_time_sec, wq_panic_on_stall_time); 7651 } 7652 7653 static void wq_watchdog_reset_touched(void) 7654 { 7655 int cpu; 7656 7657 wq_watchdog_touched = jiffies; 7658 for_each_possible_cpu(cpu) 7659 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; 7660 } 7661 7662 static void wq_watchdog_timer_fn(struct timer_list *unused) 7663 { 7664 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; 7665 unsigned int max_stall_time = 0; 7666 bool lockup_detected = false; 7667 bool cpu_pool_stall = false; 7668 unsigned long now = jiffies; 7669 struct worker_pool *pool; 7670 unsigned int stall_time; 7671 int pi; 7672 7673 if (!thresh) 7674 return; 7675 7676 for_each_pool(pool, pi) { 7677 unsigned long pool_ts, touched, ts; 7678 7679 pool->cpu_stall = false; 7680 if (list_empty(&pool->worklist)) 7681 continue; 7682 7683 /* 7684 * If a virtual machine is stopped by the host it can look to 7685 * the watchdog like a stall. 7686 */ 7687 kvm_check_and_clear_guest_paused(); 7688 7689 /* get the latest of pool and touched timestamps */ 7690 if (pool->cpu >= 0) 7691 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); 7692 else 7693 touched = READ_ONCE(wq_watchdog_touched); 7694 pool_ts = READ_ONCE(pool->watchdog_ts); 7695 7696 if (time_after(pool_ts, touched)) 7697 ts = pool_ts; 7698 else 7699 ts = touched; 7700 7701 /* did we stall? 
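 (i.e. did more than wq_watchdog_thresh seconds pass since the later of the last first-pending-item change (pool->watchdog_ts) and the last watchdog touch?)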
*/ 7702 if (time_after(now, ts + thresh)) { 7703 lockup_detected = true; 7704 stall_time = jiffies_to_msecs(now - pool_ts) / 1000; 7705 max_stall_time = max(max_stall_time, stall_time); 7706 if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { 7707 pool->cpu_stall = true; 7708 cpu_pool_stall = true; 7709 } 7710 pr_emerg("BUG: workqueue lockup - pool"); 7711 pr_cont_pool_info(pool); 7712 pr_cont(" stuck for %us!\n", stall_time); 7713 } 7714 7715 7716 } 7717 7718 if (lockup_detected) 7719 show_all_workqueues(); 7720 7721 if (cpu_pool_stall) 7722 show_cpu_pools_hogs(); 7723 7724 if (lockup_detected) 7725 panic_on_wq_watchdog(max_stall_time); 7726 7727 wq_watchdog_reset_touched(); 7728 mod_timer(&wq_watchdog_timer, jiffies + thresh); 7729 } 7730 7731 notrace void wq_watchdog_touch(int cpu) 7732 { 7733 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; 7734 unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); 7735 unsigned long now = jiffies; 7736 7737 if (cpu >= 0) 7738 per_cpu(wq_watchdog_touched_cpu, cpu) = now; 7739 else 7740 WARN_ONCE(1, "%s should be called with valid CPU", __func__); 7741 7742 /* Don't unnecessarily store to global cacheline */ 7743 if (time_after(now, touch_ts + thresh / 4)) 7744 WRITE_ONCE(wq_watchdog_touched, jiffies); 7745 } 7746 7747 static void wq_watchdog_set_thresh(unsigned long thresh) 7748 { 7749 wq_watchdog_thresh = 0; 7750 timer_delete_sync(&wq_watchdog_timer); 7751 7752 if (thresh) { 7753 wq_watchdog_thresh = thresh; 7754 wq_watchdog_reset_touched(); 7755 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); 7756 } 7757 } 7758 7759 static int wq_watchdog_param_set_thresh(const char *val, 7760 const struct kernel_param *kp) 7761 { 7762 unsigned long thresh; 7763 int ret; 7764 7765 ret = kstrtoul(val, 0, &thresh); 7766 if (ret) 7767 return ret; 7768 7769 if (system_percpu_wq) 7770 wq_watchdog_set_thresh(thresh); 7771 else 7772 wq_watchdog_thresh = thresh; 7773 7774 return 0; 7775 } 7776 7777 static const struct kernel_param_ops wq_watchdog_thresh_ops = { 7778 .set = wq_watchdog_param_set_thresh, 7779 .get = param_get_ulong, 7780 }; 7781 7782 module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 7783 0644); 7784 7785 static void wq_watchdog_init(void) 7786 { 7787 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); 7788 wq_watchdog_set_thresh(wq_watchdog_thresh); 7789 } 7790 7791 #else /* CONFIG_WQ_WATCHDOG */ 7792 7793 static inline void wq_watchdog_init(void) { } 7794 7795 #endif /* CONFIG_WQ_WATCHDOG */ 7796 7797 static void bh_pool_kick_normal(struct irq_work *irq_work) 7798 { 7799 raise_softirq_irqoff(TASKLET_SOFTIRQ); 7800 } 7801 7802 static void bh_pool_kick_highpri(struct irq_work *irq_work) 7803 { 7804 raise_softirq_irqoff(HI_SOFTIRQ); 7805 } 7806 7807 static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) 7808 { 7809 if (!cpumask_intersects(wq_unbound_cpumask, mask)) { 7810 pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", 7811 cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); 7812 return; 7813 } 7814 7815 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); 7816 } 7817 7818 static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice) 7819 { 7820 BUG_ON(init_worker_pool(pool)); 7821 pool->cpu = cpu; 7822 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); 7823 cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); 7824 pool->attrs->nice = nice; 7825 
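/* per-CPU pools are strictly affine to their single CPU */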
pool->attrs->affn_strict = true; 7826 pool->node = cpu_to_node(cpu); 7827 7828 /* alloc pool ID */ 7829 mutex_lock(&wq_pool_mutex); 7830 BUG_ON(worker_pool_assign_id(pool)); 7831 mutex_unlock(&wq_pool_mutex); 7832 } 7833 7834 /** 7835 * workqueue_init_early - early init for workqueue subsystem 7836 * 7837 * This is the first step of three-staged workqueue subsystem initialization and 7838 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are 7839 * up. It sets up all the data structures and system workqueues and allows early 7840 * boot code to create workqueues and queue/cancel work items. Actual work item 7841 * execution starts only after kthreads can be created and scheduled right 7842 * before early initcalls. 7843 */ 7844 void __init workqueue_init_early(void) 7845 { 7846 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; 7847 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 7848 void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, 7849 bh_pool_kick_highpri }; 7850 int i, cpu; 7851 7852 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 7853 7854 BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); 7855 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 7856 BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); 7857 BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); 7858 7859 cpumask_copy(wq_online_cpumask, cpu_online_mask); 7860 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); 7861 restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); 7862 restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); 7863 if (!cpumask_empty(&wq_cmdline_cpumask)) 7864 restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); 7865 7866 cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); 7867 cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, 7868 housekeeping_cpumask(HK_TYPE_DOMAIN)); 7869 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 7870 7871 unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); 7872 BUG_ON(!unbound_wq_update_pwq_attrs_buf); 7873 7874 /* 7875 * If nohz_full is enabled, set power efficient workqueue as unbound. 7876 * This allows workqueue items to be moved to HK CPUs. 
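 * (wq_power_efficient makes WQ_POWER_EFFICIENT workqueues unbound, so their
 * work items follow wq_unbound_cpumask, which is restricted to the
 * housekeeping CPUs above.)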
7877 */ 7878 if (housekeeping_enabled(HK_TYPE_TICK)) 7879 wq_power_efficient = true; 7880 7881 /* initialize WQ_AFFN_SYSTEM pods */ 7882 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); 7883 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); 7884 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); 7885 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); 7886 7887 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); 7888 7889 pt->nr_pods = 1; 7890 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); 7891 pt->pod_node[0] = NUMA_NO_NODE; 7892 pt->cpu_pod[0] = 0; 7893 7894 /* initialize BH and CPU pools */ 7895 for_each_possible_cpu(cpu) { 7896 struct worker_pool *pool; 7897 7898 i = 0; 7899 for_each_bh_worker_pool(pool, cpu) { 7900 init_cpu_worker_pool(pool, cpu, std_nice[i]); 7901 pool->flags |= POOL_BH; 7902 init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]); 7903 i++; 7904 } 7905 7906 i = 0; 7907 for_each_cpu_worker_pool(pool, cpu) 7908 init_cpu_worker_pool(pool, cpu, std_nice[i++]); 7909 } 7910 7911 /* create default unbound and ordered wq attrs */ 7912 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 7913 struct workqueue_attrs *attrs; 7914 7915 BUG_ON(!(attrs = alloc_workqueue_attrs())); 7916 attrs->nice = std_nice[i]; 7917 unbound_std_wq_attrs[i] = attrs; 7918 7919 /* 7920 * An ordered wq should have only one pwq as ordering is 7921 * guaranteed by max_active which is enforced by pwqs. 7922 */ 7923 BUG_ON(!(attrs = alloc_workqueue_attrs())); 7924 attrs->nice = std_nice[i]; 7925 attrs->ordered = true; 7926 ordered_wq_attrs[i] = attrs; 7927 } 7928 7929 system_wq = alloc_workqueue("events", WQ_PERCPU, 0); 7930 system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); 7931 system_highpri_wq = alloc_workqueue("events_highpri", 7932 WQ_HIGHPRI | WQ_PERCPU, 0); 7933 system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); 7934 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); 7935 system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); 7936 system_freezable_wq = alloc_workqueue("events_freezable", 7937 WQ_FREEZABLE | WQ_PERCPU, 0); 7938 system_power_efficient_wq = alloc_workqueue("events_power_efficient", 7939 WQ_POWER_EFFICIENT | WQ_PERCPU, 0); 7940 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", 7941 WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); 7942 system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); 7943 system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", 7944 WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); 7945 BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || 7946 !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || 7947 !system_power_efficient_wq || 7948 !system_freezable_power_efficient_wq || 7949 !system_bh_wq || !system_bh_highpri_wq); 7950 } 7951 7952 static void __init wq_cpu_intensive_thresh_init(void) 7953 { 7954 unsigned long thresh; 7955 unsigned long bogo; 7956 7957 pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); 7958 BUG_ON(IS_ERR(pwq_release_worker)); 7959 7960 /* if the user set it to a specific value, keep it */ 7961 if (wq_cpu_intensive_thresh_us != ULONG_MAX) 7962 return; 7963 7964 /* 7965 * The default of 10ms is derived from the fact that most modern (as of 7966 * 2023) processors can do a lot in 10ms and that it's just below what 7967 * most consider human-perceivable. 
However, the kernel also runs on much 7968 * slower CPUs, including microcontrollers, where the threshold is way 7969 * too low. 7970 * 7971 * Let's scale the threshold up to 1 second if BogoMIPS is below 4000. 7972 * This is by no means accurate but it doesn't have to be. The mechanism 7973 * is still useful even when the threshold is fully scaled up. Also, as 7974 * the reports are usually applicable to everyone, a few machines 7975 * operating with longer thresholds won't significantly diminish their 7976 * usefulness. 7977 */ 7978 thresh = 10 * USEC_PER_MSEC; 7979 7980 /* see init/calibrate.c for lpj -> BogoMIPS calculation */ 7981 bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); 7982 if (bogo < 4000) 7983 thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); 7984 7985 pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", 7986 loops_per_jiffy, bogo, thresh); 7987 7988 wq_cpu_intensive_thresh_us = thresh; 7989 } 7990 7991 /** 7992 * workqueue_init - bring workqueue subsystem fully online 7993 * 7994 * This is the second step of three-staged workqueue subsystem initialization 7995 * and is invoked as soon as kthreads can be created and scheduled. Workqueues have 7996 * been created and work items queued on them, but there are no kworkers 7997 * executing the work items yet. Populate the worker pools with the initial 7998 * workers and enable future kworker creations. 7999 */ 8000 void __init workqueue_init(void) 8001 { 8002 struct workqueue_struct *wq; 8003 struct worker_pool *pool; 8004 int cpu, bkt; 8005 8006 wq_cpu_intensive_thresh_init(); 8007 8008 mutex_lock(&wq_pool_mutex); 8009 8010 /* 8011 * Per-cpu pools created earlier could be missing node hint. Fix them 8012 * up. Also, create a rescuer for workqueues that requested it. 8013 */ 8014 for_each_possible_cpu(cpu) { 8015 for_each_bh_worker_pool(pool, cpu) 8016 pool->node = cpu_to_node(cpu); 8017 for_each_cpu_worker_pool(pool, cpu) 8018 pool->node = cpu_to_node(cpu); 8019 } 8020 8021 list_for_each_entry(wq, &workqueues, list) { 8022 WARN(init_rescuer(wq), 8023 "workqueue: failed to create early rescuer for %s", 8024 wq->name); 8025 } 8026 8027 mutex_unlock(&wq_pool_mutex); 8028 8029 /* 8030 * Create the initial workers. A BH pool has one pseudo worker that 8031 * represents the shared BH execution context and thus doesn't get 8032 * affected by hotplug events. Create the BH pseudo workers for all 8033 * possible CPUs here. 8034 */ 8035 for_each_possible_cpu(cpu) 8036 for_each_bh_worker_pool(pool, cpu) 8037 BUG_ON(!create_worker(pool)); 8038 8039 for_each_online_cpu(cpu) { 8040 for_each_cpu_worker_pool(pool, cpu) { 8041 pool->flags &= ~POOL_DISASSOCIATED; 8042 BUG_ON(!create_worker(pool)); 8043 } 8044 } 8045 8046 hash_for_each(unbound_pool_hash, bkt, pool, hash_node) 8047 BUG_ON(!create_worker(pool)); 8048 8049 wq_online = true; 8050 wq_watchdog_init(); 8051 } 8052 8053 /* 8054 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to 8055 * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique 8056 * and consecutive pod ID. The rest of @pt is initialized accordingly.
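 *
 * For example, on a hypothetical 4-CPU system where CPUs 0-1 share a pod and
 * CPUs 2-3 share another, the result is nr_pods == 2, cpu_pod[] == {0, 0, 1, 1},
 * pod_cpus[0] == CPUs 0-1 and pod_cpus[1] == CPUs 2-3.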
8057 */ 8058 static void __init init_pod_type(struct wq_pod_type *pt, 8059 bool (*cpus_share_pod)(int, int)) 8060 { 8061 int cur, pre, cpu, pod; 8062 8063 pt->nr_pods = 0; 8064 8065 /* init @pt->cpu_pod[] according to @cpus_share_pod() */ 8066 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); 8067 BUG_ON(!pt->cpu_pod); 8068 8069 for_each_possible_cpu(cur) { 8070 for_each_possible_cpu(pre) { 8071 if (pre >= cur) { 8072 pt->cpu_pod[cur] = pt->nr_pods++; 8073 break; 8074 } 8075 if (cpus_share_pod(cur, pre)) { 8076 pt->cpu_pod[cur] = pt->cpu_pod[pre]; 8077 break; 8078 } 8079 } 8080 } 8081 8082 /* init the rest to match @pt->cpu_pod[] */ 8083 pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); 8084 pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); 8085 BUG_ON(!pt->pod_cpus || !pt->pod_node); 8086 8087 for (pod = 0; pod < pt->nr_pods; pod++) 8088 BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); 8089 8090 for_each_possible_cpu(cpu) { 8091 cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); 8092 pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); 8093 } 8094 } 8095 8096 static bool __init cpus_dont_share(int cpu0, int cpu1) 8097 { 8098 return false; 8099 } 8100 8101 static bool __init cpus_share_smt(int cpu0, int cpu1) 8102 { 8103 #ifdef CONFIG_SCHED_SMT 8104 return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); 8105 #else 8106 return false; 8107 #endif 8108 } 8109 8110 static bool __init cpus_share_numa(int cpu0, int cpu1) 8111 { 8112 return cpu_to_node(cpu0) == cpu_to_node(cpu1); 8113 } 8114 8115 /** 8116 * workqueue_init_topology - initialize CPU pods for unbound workqueues 8117 * 8118 * This is the third step of three-staged workqueue subsystem initialization and 8119 * invoked after SMP and topology information are fully initialized. It 8120 * initializes the unbound CPU pods accordingly. 8121 */ 8122 void __init workqueue_init_topology(void) 8123 { 8124 struct workqueue_struct *wq; 8125 int cpu; 8126 8127 init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); 8128 init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); 8129 init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); 8130 init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); 8131 8132 wq_topo_initialized = true; 8133 8134 mutex_lock(&wq_pool_mutex); 8135 8136 /* 8137 * Workqueues allocated earlier would have all CPUs sharing the default 8138 * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue 8139 * and CPU combinations to apply per-pod sharing. 8140 */ 8141 list_for_each_entry(wq, &workqueues, list) { 8142 for_each_online_cpu(cpu) 8143 unbound_wq_update_pwq(wq, cpu); 8144 if (wq->flags & WQ_UNBOUND) { 8145 mutex_lock(&wq->mutex); 8146 wq_update_node_max_active(wq, -1); 8147 mutex_unlock(&wq->mutex); 8148 } 8149 } 8150 8151 mutex_unlock(&wq_pool_mutex); 8152 } 8153 8154 void __warn_flushing_systemwide_wq(void) 8155 { 8156 pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); 8157 dump_stack(); 8158 } 8159 EXPORT_SYMBOL(__warn_flushing_systemwide_wq); 8160 8161 static int __init workqueue_unbound_cpus_setup(char *str) 8162 { 8163 if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { 8164 cpumask_clear(&wq_cmdline_cpumask); 8165 pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); 8166 } 8167 8168 return 1; 8169 } 8170 __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); 8171
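/*
 * Illustrative usage sketch (not part of the kernel source; all names are
 * hypothetical): a driver creating an unbound workqueue whose attributes can
 * then be tuned from userspace through the sysfs attributes defined above.
 *
 *	static struct workqueue_struct *my_wq;
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_SYSFS, 0);
 *	if (!my_wq)
 *		return -ENOMEM;
 *	queue_work(my_wq, &my_work);
 *	...
 *	destroy_workqueue(my_wq);
 *
 * Once registered, /sys/bus/workqueue/devices/my_wq/ exposes per_cpu,
 * max_active, nice, cpumask, affinity_scope and affinity_strict as described
 * in the CONFIG_SYSFS section above.
 */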