// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 * Derived from the taskqueue/keventd code by:
 *   David Woodhouse <dwmw2@infradead.org>
 *   Andrew Morton
 *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *   Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>

#include "workqueue_internal.h"

enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * wq_pool_attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT	= HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	raw_spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
	bool			cpu_stall;	/* WD: stalled cpu bound pool */

	/*
	 * The counter is incremented in a process context on the associated CPU
	 * w/ preemption disabled, and decremented or reset in the same context
	 * but w/ pool->lock held. The readers grab pool->lock and are
	 * guaranteed to see if the counter reached zero.
	 */
	int			nr_running;

	struct list_head	worklist;	/* L: list of pending works */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle workers */

	struct list_head	idle_list;	/* L: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct work_struct	idle_cull_work;	/* L: worker idle cleanup */

	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	struct worker		*manager;	/* L: purely informational */
	struct list_head	workers;	/* A: attached workers */
	struct list_head	dying_workers;	/* A: workers about to die */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * Destruction of pool is RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct	*wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */

	/*
	 * nr_active management and WORK_STRUCT_INACTIVE:
	 *
	 * When pwq->nr_active >= max_active, new work item is queued to
	 * pwq->inactive_works instead of pool->worklist and marked with
	 * WORK_STRUCT_INACTIVE.
	 *
	 * All work items marked with WORK_STRUCT_INACTIVE do not participate
	 * in pwq->nr_active and all work items in pwq->inactive_works are
	 * marked with WORK_STRUCT_INACTIVE.  But not all WORK_STRUCT_INACTIVE
	 * work items are in pwq->inactive_works.  Some of them are ready to
	 * run in pool->worklist or worker->scheduled.  Those work items are
	 * only struct wq_barrier which is used for flush_work() and should
	 * not participate in pwq->nr_active.  For non-barrier work item, it
	 * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
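	 *
	 * As a rough illustration (not code from this file; wq, w1 and w2 are
	 * illustrative names), assume a pwq with max_active == 1:
	 *
	 *	queue_work(wq, &w1);	// nr_active 0 -> 1, w1 -> pool->worklist
	 *	queue_work(wq, &w2);	// nr_active stays 1, w2 is marked
	 *				// WORK_STRUCT_INACTIVE and parked on
	 *				// pwq->inactive_works
	 *
	 * Once w1 retires, pwq_dec_nr_in_flight() below activates w2 and
	 * moves it to pool->worklist.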
254 */ 255 int nr_active; /* L: nr of active works */ 256 int max_active; /* L: max active works */ 257 struct list_head inactive_works; /* L: inactive works */ 258 struct list_head pwqs_node; /* WR: node on wq->pwqs */ 259 struct list_head mayday_node; /* MD: node on wq->maydays */ 260 261 u64 stats[PWQ_NR_STATS]; 262 263 /* 264 * Release of unbound pwq is punted to system_wq. See put_pwq() 265 * and pwq_unbound_release_workfn() for details. pool_workqueue 266 * itself is also RCU protected so that the first pwq can be 267 * determined without grabbing wq->mutex. 268 */ 269 struct work_struct unbound_release_work; 270 struct rcu_head rcu; 271 } __aligned(1 << WORK_STRUCT_FLAG_BITS); 272 273 /* 274 * Structure used to wait for workqueue flush. 275 */ 276 struct wq_flusher { 277 struct list_head list; /* WQ: list of flushers */ 278 int flush_color; /* WQ: flush color waiting for */ 279 struct completion done; /* flush completion */ 280 }; 281 282 struct wq_device; 283 284 /* 285 * The externally visible workqueue. It relays the issued work items to 286 * the appropriate worker_pool through its pool_workqueues. 287 */ 288 struct workqueue_struct { 289 struct list_head pwqs; /* WR: all pwqs of this wq */ 290 struct list_head list; /* PR: list of all workqueues */ 291 292 struct mutex mutex; /* protects this wq */ 293 int work_color; /* WQ: current work color */ 294 int flush_color; /* WQ: current flush color */ 295 atomic_t nr_pwqs_to_flush; /* flush in progress */ 296 struct wq_flusher *first_flusher; /* WQ: first flusher */ 297 struct list_head flusher_queue; /* WQ: flush waiters */ 298 struct list_head flusher_overflow; /* WQ: flush overflow list */ 299 300 struct list_head maydays; /* MD: pwqs requesting rescue */ 301 struct worker *rescuer; /* MD: rescue worker */ 302 303 int nr_drainers; /* WQ: drain in progress */ 304 int saved_max_active; /* WQ: saved pwq max_active */ 305 306 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ 307 struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ 308 309 #ifdef CONFIG_SYSFS 310 struct wq_device *wq_dev; /* I: for sysfs interface */ 311 #endif 312 #ifdef CONFIG_LOCKDEP 313 char *lock_name; 314 struct lock_class_key key; 315 struct lockdep_map lockdep_map; 316 #endif 317 char name[WQ_NAME_LEN]; /* I: workqueue name */ 318 319 /* 320 * Destruction of workqueue_struct is RCU protected to allow walking 321 * the workqueues list without grabbing wq_pool_mutex. 322 * This is used to dump all workqueues from sysrq. 323 */ 324 struct rcu_head rcu; 325 326 /* hot fields used during command issue, aligned to cacheline */ 327 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ 328 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ 329 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ 330 }; 331 332 static struct kmem_cache *pwq_cache; 333 334 static cpumask_var_t *wq_numa_possible_cpumask; 335 /* possible CPUs of each node */ 336 337 /* 338 * Per-cpu work items which run for longer than the following threshold are 339 * automatically considered CPU intensive and excluded from concurrency 340 * management to prevent them from noticeably delaying other per-cpu work items. 
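 *
 * Usage note (workqueue.c is built into the kernel, so the knob below shows
 * up as a kernel parameter rather than a module option): the threshold can
 * be set at boot, e.g.
 *
 *	workqueue.cpu_intensive_thresh_us=30000
 *
 * or adjusted at runtime through
 * /sys/module/workqueue/parameters/cpu_intensive_thresh_us.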
341 */ 342 static unsigned long wq_cpu_intensive_thresh_us = 10000; 343 module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); 344 345 static bool wq_disable_numa; 346 module_param_named(disable_numa, wq_disable_numa, bool, 0444); 347 348 /* see the comment above the definition of WQ_POWER_EFFICIENT */ 349 static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); 350 module_param_named(power_efficient, wq_power_efficient, bool, 0444); 351 352 static bool wq_online; /* can kworkers be created yet? */ 353 354 static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 355 356 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 357 static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; 358 359 static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ 360 static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ 361 static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ 362 /* wait for manager to go away */ 363 static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); 364 365 static LIST_HEAD(workqueues); /* PR: list of all workqueues */ 366 static bool workqueue_freezing; /* PL: have wqs started freezing? */ 367 368 /* PL&A: allowable cpus for unbound wqs and work items */ 369 static cpumask_var_t wq_unbound_cpumask; 370 371 /* CPU where unbound work was last round robin scheduled from this CPU */ 372 static DEFINE_PER_CPU(int, wq_rr_cpu_last); 373 374 /* 375 * Local execution of unbound work items is no longer guaranteed. The 376 * following always forces round-robin CPU selection on unbound work items 377 * to uncover usages which depend on it. 378 */ 379 #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU 380 static bool wq_debug_force_rr_cpu = true; 381 #else 382 static bool wq_debug_force_rr_cpu = false; 383 #endif 384 module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); 385 386 /* the per-cpu worker pools */ 387 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); 388 389 static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ 390 391 /* PL: hash of all unbound pools keyed by pool->attrs */ 392 static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); 393 394 /* I: attributes used when instantiating standard unbound pools on demand */ 395 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 396 397 /* I: attributes used when instantiating ordered pools on demand */ 398 static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; 399 400 struct workqueue_struct *system_wq __read_mostly; 401 EXPORT_SYMBOL(system_wq); 402 struct workqueue_struct *system_highpri_wq __read_mostly; 403 EXPORT_SYMBOL_GPL(system_highpri_wq); 404 struct workqueue_struct *system_long_wq __read_mostly; 405 EXPORT_SYMBOL_GPL(system_long_wq); 406 struct workqueue_struct *system_unbound_wq __read_mostly; 407 EXPORT_SYMBOL_GPL(system_unbound_wq); 408 struct workqueue_struct *system_freezable_wq __read_mostly; 409 EXPORT_SYMBOL_GPL(system_freezable_wq); 410 struct workqueue_struct *system_power_efficient_wq __read_mostly; 411 EXPORT_SYMBOL_GPL(system_power_efficient_wq); 412 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; 413 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); 414 415 static int worker_thread(void *__worker); 416 static void workqueue_sysfs_unregister(struct workqueue_struct *wq); 417 static void 
show_pwq(struct pool_workqueue *pwq); 418 static void show_one_worker_pool(struct worker_pool *pool); 419 420 #define CREATE_TRACE_POINTS 421 #include <trace/events/workqueue.h> 422 423 #define assert_rcu_or_pool_mutex() \ 424 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ 425 !lockdep_is_held(&wq_pool_mutex), \ 426 "RCU or wq_pool_mutex should be held") 427 428 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ 429 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ 430 !lockdep_is_held(&wq->mutex) && \ 431 !lockdep_is_held(&wq_pool_mutex), \ 432 "RCU, wq->mutex or wq_pool_mutex should be held") 433 434 #define for_each_cpu_worker_pool(pool, cpu) \ 435 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 436 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 437 (pool)++) 438 439 /** 440 * for_each_pool - iterate through all worker_pools in the system 441 * @pool: iteration cursor 442 * @pi: integer used for iteration 443 * 444 * This must be called either with wq_pool_mutex held or RCU read 445 * locked. If the pool needs to be used beyond the locking in effect, the 446 * caller is responsible for guaranteeing that the pool stays online. 447 * 448 * The if/else clause exists only for the lockdep assertion and can be 449 * ignored. 450 */ 451 #define for_each_pool(pool, pi) \ 452 idr_for_each_entry(&worker_pool_idr, pool, pi) \ 453 if (({ assert_rcu_or_pool_mutex(); false; })) { } \ 454 else 455 456 /** 457 * for_each_pool_worker - iterate through all workers of a worker_pool 458 * @worker: iteration cursor 459 * @pool: worker_pool to iterate workers of 460 * 461 * This must be called with wq_pool_attach_mutex. 462 * 463 * The if/else clause exists only for the lockdep assertion and can be 464 * ignored. 465 */ 466 #define for_each_pool_worker(worker, pool) \ 467 list_for_each_entry((worker), &(pool)->workers, node) \ 468 if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ 469 else 470 471 /** 472 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue 473 * @pwq: iteration cursor 474 * @wq: the target workqueue 475 * 476 * This must be called either with wq->mutex held or RCU read locked. 477 * If the pwq needs to be used beyond the locking in effect, the caller is 478 * responsible for guaranteeing that the pwq stays online. 479 * 480 * The if/else clause exists only for the lockdep assertion and can be 481 * ignored. 
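 *
 * For example, a read-side walk checking whether @wq currently has anything
 * queued might look like this (illustrative sketch only):
 *
 *	struct pool_workqueue *pwq;
 *	bool idle = true;
 *
 *	rcu_read_lock();
 *	for_each_pwq(pwq, wq)
 *		if (pwq->nr_active || !list_empty(&pwq->inactive_works))
 *			idle = false;
 *	rcu_read_unlock();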
482 */ 483 #define for_each_pwq(pwq, wq) \ 484 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ 485 lockdep_is_held(&(wq->mutex))) 486 487 #ifdef CONFIG_DEBUG_OBJECTS_WORK 488 489 static const struct debug_obj_descr work_debug_descr; 490 491 static void *work_debug_hint(void *addr) 492 { 493 return ((struct work_struct *) addr)->func; 494 } 495 496 static bool work_is_static_object(void *addr) 497 { 498 struct work_struct *work = addr; 499 500 return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); 501 } 502 503 /* 504 * fixup_init is called when: 505 * - an active object is initialized 506 */ 507 static bool work_fixup_init(void *addr, enum debug_obj_state state) 508 { 509 struct work_struct *work = addr; 510 511 switch (state) { 512 case ODEBUG_STATE_ACTIVE: 513 cancel_work_sync(work); 514 debug_object_init(work, &work_debug_descr); 515 return true; 516 default: 517 return false; 518 } 519 } 520 521 /* 522 * fixup_free is called when: 523 * - an active object is freed 524 */ 525 static bool work_fixup_free(void *addr, enum debug_obj_state state) 526 { 527 struct work_struct *work = addr; 528 529 switch (state) { 530 case ODEBUG_STATE_ACTIVE: 531 cancel_work_sync(work); 532 debug_object_free(work, &work_debug_descr); 533 return true; 534 default: 535 return false; 536 } 537 } 538 539 static const struct debug_obj_descr work_debug_descr = { 540 .name = "work_struct", 541 .debug_hint = work_debug_hint, 542 .is_static_object = work_is_static_object, 543 .fixup_init = work_fixup_init, 544 .fixup_free = work_fixup_free, 545 }; 546 547 static inline void debug_work_activate(struct work_struct *work) 548 { 549 debug_object_activate(work, &work_debug_descr); 550 } 551 552 static inline void debug_work_deactivate(struct work_struct *work) 553 { 554 debug_object_deactivate(work, &work_debug_descr); 555 } 556 557 void __init_work(struct work_struct *work, int onstack) 558 { 559 if (onstack) 560 debug_object_init_on_stack(work, &work_debug_descr); 561 else 562 debug_object_init(work, &work_debug_descr); 563 } 564 EXPORT_SYMBOL_GPL(__init_work); 565 566 void destroy_work_on_stack(struct work_struct *work) 567 { 568 debug_object_free(work, &work_debug_descr); 569 } 570 EXPORT_SYMBOL_GPL(destroy_work_on_stack); 571 572 void destroy_delayed_work_on_stack(struct delayed_work *work) 573 { 574 destroy_timer_on_stack(&work->timer); 575 debug_object_free(&work->work, &work_debug_descr); 576 } 577 EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); 578 579 #else 580 static inline void debug_work_activate(struct work_struct *work) { } 581 static inline void debug_work_deactivate(struct work_struct *work) { } 582 #endif 583 584 /** 585 * worker_pool_assign_id - allocate ID and assign it to @pool 586 * @pool: the pool pointer of interest 587 * 588 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned 589 * successfully, -errno on failure. 590 */ 591 static int worker_pool_assign_id(struct worker_pool *pool) 592 { 593 int ret; 594 595 lockdep_assert_held(&wq_pool_mutex); 596 597 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, 598 GFP_KERNEL); 599 if (ret >= 0) { 600 pool->id = ret; 601 return 0; 602 } 603 return ret; 604 } 605 606 /** 607 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node 608 * @wq: the target workqueue 609 * @node: the node ID 610 * 611 * This must be called with any of wq_pool_mutex, wq->mutex or RCU 612 * read locked. 
613 * If the pwq needs to be used beyond the locking in effect, the caller is 614 * responsible for guaranteeing that the pwq stays online. 615 * 616 * Return: The unbound pool_workqueue for @node. 617 */ 618 static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, 619 int node) 620 { 621 assert_rcu_or_wq_mutex_or_pool_mutex(wq); 622 623 /* 624 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a 625 * delayed item is pending. The plan is to keep CPU -> NODE 626 * mapping valid and stable across CPU on/offlines. Once that 627 * happens, this workaround can be removed. 628 */ 629 if (unlikely(node == NUMA_NO_NODE)) 630 return wq->dfl_pwq; 631 632 return rcu_dereference_raw(wq->numa_pwq_tbl[node]); 633 } 634 635 static unsigned int work_color_to_flags(int color) 636 { 637 return color << WORK_STRUCT_COLOR_SHIFT; 638 } 639 640 static int get_work_color(unsigned long work_data) 641 { 642 return (work_data >> WORK_STRUCT_COLOR_SHIFT) & 643 ((1 << WORK_STRUCT_COLOR_BITS) - 1); 644 } 645 646 static int work_next_color(int color) 647 { 648 return (color + 1) % WORK_NR_COLORS; 649 } 650 651 /* 652 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data 653 * contain the pointer to the queued pwq. Once execution starts, the flag 654 * is cleared and the high bits contain OFFQ flags and pool ID. 655 * 656 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() 657 * and clear_work_data() can be used to set the pwq, pool or clear 658 * work->data. These functions should only be called while the work is 659 * owned - ie. while the PENDING bit is set. 660 * 661 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq 662 * corresponding to a work. Pool is available once the work has been 663 * queued anywhere after initialization until it is sync canceled. pwq is 664 * available only while the work item is queued. 665 * 666 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 667 * canceled. While being canceled, a work item may have its PENDING set 668 * but stay off timer and worklist for arbitrarily long and nobody should 669 * try to steal the PENDING bit. 670 */ 671 static inline void set_work_data(struct work_struct *work, unsigned long data, 672 unsigned long flags) 673 { 674 WARN_ON_ONCE(!work_pending(work)); 675 atomic_long_set(&work->data, data | flags | work_static(work)); 676 } 677 678 static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, 679 unsigned long extra_flags) 680 { 681 set_work_data(work, (unsigned long)pwq, 682 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); 683 } 684 685 static void set_work_pool_and_keep_pending(struct work_struct *work, 686 int pool_id) 687 { 688 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 689 WORK_STRUCT_PENDING); 690 } 691 692 static void set_work_pool_and_clear_pending(struct work_struct *work, 693 int pool_id) 694 { 695 /* 696 * The following wmb is paired with the implied mb in 697 * test_and_set_bit(PENDING) and ensures all updates to @work made 698 * here are visible to and precede any updates by the next PENDING 699 * owner. 700 */ 701 smp_wmb(); 702 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); 703 /* 704 * The following mb guarantees that previous clear of a PENDING bit 705 * will not be reordered with any speculative LOADS or STORES from 706 * work->current_func, which is executed afterwards. 
This possible 707 * reordering can lead to a missed execution on attempt to queue 708 * the same @work. E.g. consider this case: 709 * 710 * CPU#0 CPU#1 711 * ---------------------------- -------------------------------- 712 * 713 * 1 STORE event_indicated 714 * 2 queue_work_on() { 715 * 3 test_and_set_bit(PENDING) 716 * 4 } set_..._and_clear_pending() { 717 * 5 set_work_data() # clear bit 718 * 6 smp_mb() 719 * 7 work->current_func() { 720 * 8 LOAD event_indicated 721 * } 722 * 723 * Without an explicit full barrier speculative LOAD on line 8 can 724 * be executed before CPU#0 does STORE on line 1. If that happens, 725 * CPU#0 observes the PENDING bit is still set and new execution of 726 * a @work is not queued in a hope, that CPU#1 will eventually 727 * finish the queued @work. Meanwhile CPU#1 does not see 728 * event_indicated is set, because speculative LOAD was executed 729 * before actual STORE. 730 */ 731 smp_mb(); 732 } 733 734 static void clear_work_data(struct work_struct *work) 735 { 736 smp_wmb(); /* see set_work_pool_and_clear_pending() */ 737 set_work_data(work, WORK_STRUCT_NO_POOL, 0); 738 } 739 740 static inline struct pool_workqueue *work_struct_pwq(unsigned long data) 741 { 742 return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); 743 } 744 745 static struct pool_workqueue *get_work_pwq(struct work_struct *work) 746 { 747 unsigned long data = atomic_long_read(&work->data); 748 749 if (data & WORK_STRUCT_PWQ) 750 return work_struct_pwq(data); 751 else 752 return NULL; 753 } 754 755 /** 756 * get_work_pool - return the worker_pool a given work was associated with 757 * @work: the work item of interest 758 * 759 * Pools are created and destroyed under wq_pool_mutex, and allows read 760 * access under RCU read lock. As such, this function should be 761 * called under wq_pool_mutex or inside of a rcu_read_lock() region. 762 * 763 * All fields of the returned pool are accessible as long as the above 764 * mentioned locking is in effect. If the returned pool needs to be used 765 * beyond the critical section, the caller is responsible for ensuring the 766 * returned pool is and stays online. 767 * 768 * Return: The worker_pool @work was last associated with. %NULL if none. 769 */ 770 static struct worker_pool *get_work_pool(struct work_struct *work) 771 { 772 unsigned long data = atomic_long_read(&work->data); 773 int pool_id; 774 775 assert_rcu_or_pool_mutex(); 776 777 if (data & WORK_STRUCT_PWQ) 778 return work_struct_pwq(data)->pool; 779 780 pool_id = data >> WORK_OFFQ_POOL_SHIFT; 781 if (pool_id == WORK_OFFQ_POOL_NONE) 782 return NULL; 783 784 return idr_find(&worker_pool_idr, pool_id); 785 } 786 787 /** 788 * get_work_pool_id - return the worker pool ID a given work is associated with 789 * @work: the work item of interest 790 * 791 * Return: The worker_pool ID @work was last associated with. 792 * %WORK_OFFQ_POOL_NONE if none. 
793 */ 794 static int get_work_pool_id(struct work_struct *work) 795 { 796 unsigned long data = atomic_long_read(&work->data); 797 798 if (data & WORK_STRUCT_PWQ) 799 return work_struct_pwq(data)->pool->id; 800 801 return data >> WORK_OFFQ_POOL_SHIFT; 802 } 803 804 static void mark_work_canceling(struct work_struct *work) 805 { 806 unsigned long pool_id = get_work_pool_id(work); 807 808 pool_id <<= WORK_OFFQ_POOL_SHIFT; 809 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); 810 } 811 812 static bool work_is_canceling(struct work_struct *work) 813 { 814 unsigned long data = atomic_long_read(&work->data); 815 816 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); 817 } 818 819 /* 820 * Policy functions. These define the policies on how the global worker 821 * pools are managed. Unless noted otherwise, these functions assume that 822 * they're being called with pool->lock held. 823 */ 824 825 static bool __need_more_worker(struct worker_pool *pool) 826 { 827 return !pool->nr_running; 828 } 829 830 /* 831 * Need to wake up a worker? Called from anything but currently 832 * running workers. 833 * 834 * Note that, because unbound workers never contribute to nr_running, this 835 * function will always return %true for unbound pools as long as the 836 * worklist isn't empty. 837 */ 838 static bool need_more_worker(struct worker_pool *pool) 839 { 840 return !list_empty(&pool->worklist) && __need_more_worker(pool); 841 } 842 843 /* Can I start working? Called from busy but !running workers. */ 844 static bool may_start_working(struct worker_pool *pool) 845 { 846 return pool->nr_idle; 847 } 848 849 /* Do I need to keep working? Called from currently running workers. */ 850 static bool keep_working(struct worker_pool *pool) 851 { 852 return !list_empty(&pool->worklist) && (pool->nr_running <= 1); 853 } 854 855 /* Do we need a new worker? Called from manager. */ 856 static bool need_to_create_worker(struct worker_pool *pool) 857 { 858 return need_more_worker(pool) && !may_start_working(pool); 859 } 860 861 /* Do we have too many workers and should some go away? */ 862 static bool too_many_workers(struct worker_pool *pool) 863 { 864 bool managing = pool->flags & POOL_MANAGER_ACTIVE; 865 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 866 int nr_busy = pool->nr_workers - nr_idle; 867 868 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 869 } 870 871 /* 872 * Wake up functions. 873 */ 874 875 /* Return the first idle worker. Called with pool->lock held. */ 876 static struct worker *first_idle_worker(struct worker_pool *pool) 877 { 878 if (unlikely(list_empty(&pool->idle_list))) 879 return NULL; 880 881 return list_first_entry(&pool->idle_list, struct worker, entry); 882 } 883 884 /** 885 * wake_up_worker - wake up an idle worker 886 * @pool: worker pool to wake worker from 887 * 888 * Wake up the first idle worker of @pool. 889 * 890 * CONTEXT: 891 * raw_spin_lock_irq(pool->lock). 892 */ 893 static void wake_up_worker(struct worker_pool *pool) 894 { 895 struct worker *worker = first_idle_worker(pool); 896 897 if (likely(worker)) 898 wake_up_process(worker->task); 899 } 900 901 /** 902 * worker_set_flags - set worker flags and adjust nr_running accordingly 903 * @worker: self 904 * @flags: flags to set 905 * 906 * Set @flags in @worker->flags and adjust nr_running accordingly. 
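 *
 * For example, wq_worker_tick() below takes a CPU hog out of concurrency
 * management with:
 *
 *	worker_set_flags(worker, WORKER_CPU_INTENSIVE);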
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			pool->nr_running++;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and reports them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
#define WCI_MAX_ENTS 128

struct wci_ent {
	work_func_t		func;
	atomic64_t		cnt;
	struct hlist_node	hash_node;
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];
static int wci_nr_ents;
static DEFINE_RAW_SPINLOCK(wci_lock);
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

static struct wci_ent *wci_find_ent(work_func_t func)
{
	struct wci_ent *ent;

	hash_for_each_possible_rcu(wci_hash, ent, hash_node,
				   (unsigned long)func) {
		if (ent->func == func)
			return ent;
	}
	return NULL;
}

static void wq_cpu_intensive_report(work_func_t func)
{
	struct wci_ent *ent;

restart:
	ent = wci_find_ent(func);
	if (ent) {
		u64 cnt;

		/*
		 * Start reporting from the fourth time and back off
		 * exponentially.
		 */
		cnt = atomic64_inc_return_relaxed(&ent->cnt);
		if (cnt >= 4 && is_power_of_2(cnt))
			printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
					ent->func, wq_cpu_intensive_thresh_us,
					atomic64_read(&ent->cnt));
		return;
	}

	/*
	 * @func is a new violation. Allocate a new entry for it. If wci_ents[]
	 * is exhausted, something went really wrong and we probably made enough
	 * noise already.
1019 */ 1020 if (wci_nr_ents >= WCI_MAX_ENTS) 1021 return; 1022 1023 raw_spin_lock(&wci_lock); 1024 1025 if (wci_nr_ents >= WCI_MAX_ENTS) { 1026 raw_spin_unlock(&wci_lock); 1027 return; 1028 } 1029 1030 if (wci_find_ent(func)) { 1031 raw_spin_unlock(&wci_lock); 1032 goto restart; 1033 } 1034 1035 ent = &wci_ents[wci_nr_ents++]; 1036 ent->func = func; 1037 atomic64_set(&ent->cnt, 1); 1038 hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); 1039 1040 raw_spin_unlock(&wci_lock); 1041 } 1042 1043 #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1044 static void wq_cpu_intensive_report(work_func_t func) {} 1045 #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1046 1047 /** 1048 * wq_worker_running - a worker is running again 1049 * @task: task waking up 1050 * 1051 * This function is called when a worker returns from schedule() 1052 */ 1053 void wq_worker_running(struct task_struct *task) 1054 { 1055 struct worker *worker = kthread_data(task); 1056 1057 if (!READ_ONCE(worker->sleeping)) 1058 return; 1059 1060 /* 1061 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check 1062 * and the nr_running increment below, we may ruin the nr_running reset 1063 * and leave with an unexpected pool->nr_running == 1 on the newly unbound 1064 * pool. Protect against such race. 1065 */ 1066 preempt_disable(); 1067 if (!(worker->flags & WORKER_NOT_RUNNING)) 1068 worker->pool->nr_running++; 1069 preempt_enable(); 1070 1071 /* 1072 * CPU intensive auto-detection cares about how long a work item hogged 1073 * CPU without sleeping. Reset the starting timestamp on wakeup. 1074 */ 1075 worker->current_at = worker->task->se.sum_exec_runtime; 1076 1077 WRITE_ONCE(worker->sleeping, 0); 1078 } 1079 1080 /** 1081 * wq_worker_sleeping - a worker is going to sleep 1082 * @task: task going to sleep 1083 * 1084 * This function is called from schedule() when a busy worker is 1085 * going to sleep. 1086 */ 1087 void wq_worker_sleeping(struct task_struct *task) 1088 { 1089 struct worker *worker = kthread_data(task); 1090 struct worker_pool *pool; 1091 1092 /* 1093 * Rescuers, which may not have all the fields set up like normal 1094 * workers, also reach here, let's not access anything before 1095 * checking NOT_RUNNING. 1096 */ 1097 if (worker->flags & WORKER_NOT_RUNNING) 1098 return; 1099 1100 pool = worker->pool; 1101 1102 /* Return if preempted before wq_worker_running() was reached */ 1103 if (READ_ONCE(worker->sleeping)) 1104 return; 1105 1106 WRITE_ONCE(worker->sleeping, 1); 1107 raw_spin_lock_irq(&pool->lock); 1108 1109 /* 1110 * Recheck in case unbind_workers() preempted us. We don't 1111 * want to decrement nr_running after the worker is unbound 1112 * and nr_running has been reset. 1113 */ 1114 if (worker->flags & WORKER_NOT_RUNNING) { 1115 raw_spin_unlock_irq(&pool->lock); 1116 return; 1117 } 1118 1119 pool->nr_running--; 1120 if (need_more_worker(pool)) { 1121 worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1122 wake_up_worker(pool); 1123 } 1124 raw_spin_unlock_irq(&pool->lock); 1125 } 1126 1127 /** 1128 * wq_worker_tick - a scheduler tick occurred while a kworker is running 1129 * @task: task currently running 1130 * 1131 * Called from scheduler_tick(). We're in the IRQ context and the current 1132 * worker's fields which follow the 'K' locking rule can be accessed safely. 
1133 */ 1134 void wq_worker_tick(struct task_struct *task) 1135 { 1136 struct worker *worker = kthread_data(task); 1137 struct pool_workqueue *pwq = worker->current_pwq; 1138 struct worker_pool *pool = worker->pool; 1139 1140 if (!pwq) 1141 return; 1142 1143 pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; 1144 1145 if (!wq_cpu_intensive_thresh_us) 1146 return; 1147 1148 /* 1149 * If the current worker is concurrency managed and hogged the CPU for 1150 * longer than wq_cpu_intensive_thresh_us, it's automatically marked 1151 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. 1152 * 1153 * Set @worker->sleeping means that @worker is in the process of 1154 * switching out voluntarily and won't be contributing to 1155 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also 1156 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to 1157 * double decrements. The task is releasing the CPU anyway. Let's skip. 1158 * We probably want to make this prettier in the future. 1159 */ 1160 if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || 1161 worker->task->se.sum_exec_runtime - worker->current_at < 1162 wq_cpu_intensive_thresh_us * NSEC_PER_USEC) 1163 return; 1164 1165 raw_spin_lock(&pool->lock); 1166 1167 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 1168 wq_cpu_intensive_report(worker->current_func); 1169 pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; 1170 1171 if (need_more_worker(pool)) { 1172 pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1173 wake_up_worker(pool); 1174 } 1175 1176 raw_spin_unlock(&pool->lock); 1177 } 1178 1179 /** 1180 * wq_worker_last_func - retrieve worker's last work function 1181 * @task: Task to retrieve last work function of. 1182 * 1183 * Determine the last function a worker executed. This is called from 1184 * the scheduler to get a worker's last known identity. 1185 * 1186 * CONTEXT: 1187 * raw_spin_lock_irq(rq->lock) 1188 * 1189 * This function is called during schedule() when a kworker is going 1190 * to sleep. It's used by psi to identify aggregation workers during 1191 * dequeuing, to allow periodic aggregation to shut-off when that 1192 * worker is the last task in the system or cgroup to go to sleep. 1193 * 1194 * As this function doesn't involve any workqueue-related locking, it 1195 * only returns stable values when called from inside the scheduler's 1196 * queuing and dequeuing paths, when @task, which must be a kworker, 1197 * is guaranteed to not be processing any works. 1198 * 1199 * Return: 1200 * The last work function %current executed as a worker, NULL if it 1201 * hasn't executed any work yet. 1202 */ 1203 work_func_t wq_worker_last_func(struct task_struct *task) 1204 { 1205 struct worker *worker = kthread_data(task); 1206 1207 return worker->last_func; 1208 } 1209 1210 /** 1211 * find_worker_executing_work - find worker which is executing a work 1212 * @pool: pool of interest 1213 * @work: work to find worker for 1214 * 1215 * Find a worker which is executing @work on @pool by searching 1216 * @pool->busy_hash which is keyed by the address of @work. For a worker 1217 * to match, its current execution should match the address of @work and 1218 * its work function. This is to avoid unwanted dependency between 1219 * unrelated work executions through a work item being recycled while still 1220 * being executed. 1221 * 1222 * This is a bit tricky. A work item may be freed once its execution 1223 * starts and nothing prevents the freed area from being recycled for 1224 * another work item. 
If the same work item address ends up being reused 1225 * before the original execution finishes, workqueue will identify the 1226 * recycled work item as currently executing and make it wait until the 1227 * current execution finishes, introducing an unwanted dependency. 1228 * 1229 * This function checks the work item address and work function to avoid 1230 * false positives. Note that this isn't complete as one may construct a 1231 * work function which can introduce dependency onto itself through a 1232 * recycled work item. Well, if somebody wants to shoot oneself in the 1233 * foot that badly, there's only so much we can do, and if such deadlock 1234 * actually occurs, it should be easy to locate the culprit work function. 1235 * 1236 * CONTEXT: 1237 * raw_spin_lock_irq(pool->lock). 1238 * 1239 * Return: 1240 * Pointer to worker which is executing @work if found, %NULL 1241 * otherwise. 1242 */ 1243 static struct worker *find_worker_executing_work(struct worker_pool *pool, 1244 struct work_struct *work) 1245 { 1246 struct worker *worker; 1247 1248 hash_for_each_possible(pool->busy_hash, worker, hentry, 1249 (unsigned long)work) 1250 if (worker->current_work == work && 1251 worker->current_func == work->func) 1252 return worker; 1253 1254 return NULL; 1255 } 1256 1257 /** 1258 * move_linked_works - move linked works to a list 1259 * @work: start of series of works to be scheduled 1260 * @head: target list to append @work to 1261 * @nextp: out parameter for nested worklist walking 1262 * 1263 * Schedule linked works starting from @work to @head. Work series to 1264 * be scheduled starts at @work and includes any consecutive work with 1265 * WORK_STRUCT_LINKED set in its predecessor. 1266 * 1267 * If @nextp is not NULL, it's updated to point to the next work of 1268 * the last scheduled work. This allows move_linked_works() to be 1269 * nested inside outer list_for_each_entry_safe(). 1270 * 1271 * CONTEXT: 1272 * raw_spin_lock_irq(pool->lock). 1273 */ 1274 static void move_linked_works(struct work_struct *work, struct list_head *head, 1275 struct work_struct **nextp) 1276 { 1277 struct work_struct *n; 1278 1279 /* 1280 * Linked worklist will always end before the end of the list, 1281 * use NULL for list head. 1282 */ 1283 list_for_each_entry_safe_from(work, n, NULL, entry) { 1284 list_move_tail(&work->entry, head); 1285 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) 1286 break; 1287 } 1288 1289 /* 1290 * If we're already inside safe list traversal and have moved 1291 * multiple works to the scheduled queue, the next position 1292 * needs to be updated. 1293 */ 1294 if (nextp) 1295 *nextp = n; 1296 } 1297 1298 /** 1299 * get_pwq - get an extra reference on the specified pool_workqueue 1300 * @pwq: pool_workqueue to get 1301 * 1302 * Obtain an extra reference on @pwq. The caller should guarantee that 1303 * @pwq has positive refcnt and be holding the matching pool->lock. 1304 */ 1305 static void get_pwq(struct pool_workqueue *pwq) 1306 { 1307 lockdep_assert_held(&pwq->pool->lock); 1308 WARN_ON_ONCE(pwq->refcnt <= 0); 1309 pwq->refcnt++; 1310 } 1311 1312 /** 1313 * put_pwq - put a pool_workqueue reference 1314 * @pwq: pool_workqueue to put 1315 * 1316 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its 1317 * destruction. The caller should be holding the matching pool->lock. 
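 *
 * A minimal usage sketch (illustrative only; both calls need the matching
 * pool->lock):
 *
 *	raw_spin_lock_irq(&pwq->pool->lock);
 *	put_pwq(pwq);
 *	raw_spin_unlock_irq(&pwq->pool->lock);
 *
 * which is essentially what put_pwq_unlocked() just below does.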
1318 */ 1319 static void put_pwq(struct pool_workqueue *pwq) 1320 { 1321 lockdep_assert_held(&pwq->pool->lock); 1322 if (likely(--pwq->refcnt)) 1323 return; 1324 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) 1325 return; 1326 /* 1327 * @pwq can't be released under pool->lock, bounce to 1328 * pwq_unbound_release_workfn(). This never recurses on the same 1329 * pool->lock as this path is taken only for unbound workqueues and 1330 * the release work item is scheduled on a per-cpu workqueue. To 1331 * avoid lockdep warning, unbound pool->locks are given lockdep 1332 * subclass of 1 in get_unbound_pool(). 1333 */ 1334 schedule_work(&pwq->unbound_release_work); 1335 } 1336 1337 /** 1338 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock 1339 * @pwq: pool_workqueue to put (can be %NULL) 1340 * 1341 * put_pwq() with locking. This function also allows %NULL @pwq. 1342 */ 1343 static void put_pwq_unlocked(struct pool_workqueue *pwq) 1344 { 1345 if (pwq) { 1346 /* 1347 * As both pwqs and pools are RCU protected, the 1348 * following lock operations are safe. 1349 */ 1350 raw_spin_lock_irq(&pwq->pool->lock); 1351 put_pwq(pwq); 1352 raw_spin_unlock_irq(&pwq->pool->lock); 1353 } 1354 } 1355 1356 static void pwq_activate_inactive_work(struct work_struct *work) 1357 { 1358 struct pool_workqueue *pwq = get_work_pwq(work); 1359 1360 trace_workqueue_activate_work(work); 1361 if (list_empty(&pwq->pool->worklist)) 1362 pwq->pool->watchdog_ts = jiffies; 1363 move_linked_works(work, &pwq->pool->worklist, NULL); 1364 __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); 1365 pwq->nr_active++; 1366 } 1367 1368 static void pwq_activate_first_inactive(struct pool_workqueue *pwq) 1369 { 1370 struct work_struct *work = list_first_entry(&pwq->inactive_works, 1371 struct work_struct, entry); 1372 1373 pwq_activate_inactive_work(work); 1374 } 1375 1376 /** 1377 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight 1378 * @pwq: pwq of interest 1379 * @work_data: work_data of work which left the queue 1380 * 1381 * A work either has completed or is removed from pending queue, 1382 * decrement nr_in_flight of its pwq and handle workqueue flushing. 1383 * 1384 * CONTEXT: 1385 * raw_spin_lock_irq(pool->lock). 1386 */ 1387 static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) 1388 { 1389 int color = get_work_color(work_data); 1390 1391 if (!(work_data & WORK_STRUCT_INACTIVE)) { 1392 pwq->nr_active--; 1393 if (!list_empty(&pwq->inactive_works)) { 1394 /* one down, submit an inactive one */ 1395 if (pwq->nr_active < pwq->max_active) 1396 pwq_activate_first_inactive(pwq); 1397 } 1398 } 1399 1400 pwq->nr_in_flight[color]--; 1401 1402 /* is flush in progress and are we at the flushing tip? */ 1403 if (likely(pwq->flush_color != color)) 1404 goto out_put; 1405 1406 /* are there still in-flight works? */ 1407 if (pwq->nr_in_flight[color]) 1408 goto out_put; 1409 1410 /* this pwq is done, clear flush_color */ 1411 pwq->flush_color = -1; 1412 1413 /* 1414 * If this was the last pwq, wake up the first flusher. It 1415 * will handle the rest. 1416 */ 1417 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 1418 complete(&pwq->wq->first_flusher->done); 1419 out_put: 1420 put_pwq(pwq); 1421 } 1422 1423 /** 1424 * try_to_grab_pending - steal work item from worklist and disable irq 1425 * @work: work item to steal 1426 * @is_dwork: @work is a delayed_work 1427 * @flags: place to store irq state 1428 * 1429 * Try to grab PENDING bit of @work. 
This function can handle @work in any 1430 * stable state - idle, on timer or on worklist. 1431 * 1432 * Return: 1433 * 1434 * ======== ================================================================ 1435 * 1 if @work was pending and we successfully stole PENDING 1436 * 0 if @work was idle and we claimed PENDING 1437 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry 1438 * -ENOENT if someone else is canceling @work, this state may persist 1439 * for arbitrarily long 1440 * ======== ================================================================ 1441 * 1442 * Note: 1443 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 1444 * interrupted while holding PENDING and @work off queue, irq must be 1445 * disabled on entry. This, combined with delayed_work->timer being 1446 * irqsafe, ensures that we return -EAGAIN for finite short period of time. 1447 * 1448 * On successful return, >= 0, irq is disabled and the caller is 1449 * responsible for releasing it using local_irq_restore(*@flags). 1450 * 1451 * This function is safe to call from any context including IRQ handler. 1452 */ 1453 static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1454 unsigned long *flags) 1455 { 1456 struct worker_pool *pool; 1457 struct pool_workqueue *pwq; 1458 1459 local_irq_save(*flags); 1460 1461 /* try to steal the timer if it exists */ 1462 if (is_dwork) { 1463 struct delayed_work *dwork = to_delayed_work(work); 1464 1465 /* 1466 * dwork->timer is irqsafe. If del_timer() fails, it's 1467 * guaranteed that the timer is not queued anywhere and not 1468 * running on the local CPU. 1469 */ 1470 if (likely(del_timer(&dwork->timer))) 1471 return 1; 1472 } 1473 1474 /* try to claim PENDING the normal way */ 1475 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) 1476 return 0; 1477 1478 rcu_read_lock(); 1479 /* 1480 * The queueing is in progress, or it is already queued. Try to 1481 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1482 */ 1483 pool = get_work_pool(work); 1484 if (!pool) 1485 goto fail; 1486 1487 raw_spin_lock(&pool->lock); 1488 /* 1489 * work->data is guaranteed to point to pwq only while the work 1490 * item is queued on pwq->wq, and both updating work->data to point 1491 * to pwq on queueing and to pool on dequeueing are done under 1492 * pwq->pool->lock. This in turn guarantees that, if work->data 1493 * points to pwq which is associated with a locked pool, the work 1494 * item is currently queued on that pool. 1495 */ 1496 pwq = get_work_pwq(work); 1497 if (pwq && pwq->pool == pool) { 1498 debug_work_deactivate(work); 1499 1500 /* 1501 * A cancelable inactive work item must be in the 1502 * pwq->inactive_works since a queued barrier can't be 1503 * canceled (see the comments in insert_wq_barrier()). 1504 * 1505 * An inactive work item cannot be grabbed directly because 1506 * it might have linked barrier work items which, if left 1507 * on the inactive_works list, will confuse pwq->nr_active 1508 * management later on and cause stall. Make sure the work 1509 * item is activated before grabbing. 
1510 */ 1511 if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) 1512 pwq_activate_inactive_work(work); 1513 1514 list_del_init(&work->entry); 1515 pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); 1516 1517 /* work->data points to pwq iff queued, point to pool */ 1518 set_work_pool_and_keep_pending(work, pool->id); 1519 1520 raw_spin_unlock(&pool->lock); 1521 rcu_read_unlock(); 1522 return 1; 1523 } 1524 raw_spin_unlock(&pool->lock); 1525 fail: 1526 rcu_read_unlock(); 1527 local_irq_restore(*flags); 1528 if (work_is_canceling(work)) 1529 return -ENOENT; 1530 cpu_relax(); 1531 return -EAGAIN; 1532 } 1533 1534 /** 1535 * insert_work - insert a work into a pool 1536 * @pwq: pwq @work belongs to 1537 * @work: work to insert 1538 * @head: insertion point 1539 * @extra_flags: extra WORK_STRUCT_* flags to set 1540 * 1541 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to 1542 * work_struct flags. 1543 * 1544 * CONTEXT: 1545 * raw_spin_lock_irq(pool->lock). 1546 */ 1547 static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, 1548 struct list_head *head, unsigned int extra_flags) 1549 { 1550 struct worker_pool *pool = pwq->pool; 1551 1552 /* record the work call stack in order to print it in KASAN reports */ 1553 kasan_record_aux_stack_noalloc(work); 1554 1555 /* we own @work, set data and link */ 1556 set_work_pwq(work, pwq, extra_flags); 1557 list_add_tail(&work->entry, head); 1558 get_pwq(pwq); 1559 1560 if (__need_more_worker(pool)) 1561 wake_up_worker(pool); 1562 } 1563 1564 /* 1565 * Test whether @work is being queued from another work executing on the 1566 * same workqueue. 1567 */ 1568 static bool is_chained_work(struct workqueue_struct *wq) 1569 { 1570 struct worker *worker; 1571 1572 worker = current_wq_worker(); 1573 /* 1574 * Return %true iff I'm a worker executing a work item on @wq. If 1575 * I'm @worker, it's safe to dereference it without locking. 1576 */ 1577 return worker && worker->current_pwq->wq == wq; 1578 } 1579 1580 /* 1581 * When queueing an unbound work item to a wq, prefer local CPU if allowed 1582 * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to 1583 * avoid perturbing sensitive tasks. 1584 */ 1585 static int wq_select_unbound_cpu(int cpu) 1586 { 1587 int new_cpu; 1588 1589 if (likely(!wq_debug_force_rr_cpu)) { 1590 if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) 1591 return cpu; 1592 } else { 1593 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); 1594 } 1595 1596 if (cpumask_empty(wq_unbound_cpumask)) 1597 return cpu; 1598 1599 new_cpu = __this_cpu_read(wq_rr_cpu_last); 1600 new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); 1601 if (unlikely(new_cpu >= nr_cpu_ids)) { 1602 new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); 1603 if (unlikely(new_cpu >= nr_cpu_ids)) 1604 return cpu; 1605 } 1606 __this_cpu_write(wq_rr_cpu_last, new_cpu); 1607 1608 return new_cpu; 1609 } 1610 1611 static void __queue_work(int cpu, struct workqueue_struct *wq, 1612 struct work_struct *work) 1613 { 1614 struct pool_workqueue *pwq; 1615 struct worker_pool *last_pool; 1616 struct list_head *worklist; 1617 unsigned int work_flags; 1618 unsigned int req_cpu = cpu; 1619 1620 /* 1621 * While a work item is PENDING && off queue, a task trying to 1622 * steal the PENDING will busy-loop waiting for it to either get 1623 * queued or lose PENDING. Grabbing PENDING and queueing should 1624 * happen with IRQ disabled. 
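	 *
	 * A typical caller therefore follows the pattern below (this is what
	 * queue_work_on() further down does; sketch only):
	 *
	 *	local_irq_save(flags);
	 *	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
	 *		__queue_work(cpu, wq, work);
	 *	local_irq_restore(flags);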
1625 */ 1626 lockdep_assert_irqs_disabled(); 1627 1628 1629 /* 1630 * For a draining wq, only works from the same workqueue are 1631 * allowed. The __WQ_DESTROYING helps to spot the issue that 1632 * queues a new work item to a wq after destroy_workqueue(wq). 1633 */ 1634 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && 1635 WARN_ON_ONCE(!is_chained_work(wq)))) 1636 return; 1637 rcu_read_lock(); 1638 retry: 1639 /* pwq which will be used unless @work is executing elsewhere */ 1640 if (wq->flags & WQ_UNBOUND) { 1641 if (req_cpu == WORK_CPU_UNBOUND) 1642 cpu = wq_select_unbound_cpu(raw_smp_processor_id()); 1643 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); 1644 } else { 1645 if (req_cpu == WORK_CPU_UNBOUND) 1646 cpu = raw_smp_processor_id(); 1647 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); 1648 } 1649 1650 /* 1651 * If @work was previously on a different pool, it might still be 1652 * running there, in which case the work needs to be queued on that 1653 * pool to guarantee non-reentrancy. 1654 */ 1655 last_pool = get_work_pool(work); 1656 if (last_pool && last_pool != pwq->pool) { 1657 struct worker *worker; 1658 1659 raw_spin_lock(&last_pool->lock); 1660 1661 worker = find_worker_executing_work(last_pool, work); 1662 1663 if (worker && worker->current_pwq->wq == wq) { 1664 pwq = worker->current_pwq; 1665 } else { 1666 /* meh... not running there, queue here */ 1667 raw_spin_unlock(&last_pool->lock); 1668 raw_spin_lock(&pwq->pool->lock); 1669 } 1670 } else { 1671 raw_spin_lock(&pwq->pool->lock); 1672 } 1673 1674 /* 1675 * pwq is determined and locked. For unbound pools, we could have 1676 * raced with pwq release and it could already be dead. If its 1677 * refcnt is zero, repeat pwq selection. Note that pwqs never die 1678 * without another pwq replacing it in the numa_pwq_tbl or while 1679 * work items are executing on it, so the retrying is guaranteed to 1680 * make forward-progress. 1681 */ 1682 if (unlikely(!pwq->refcnt)) { 1683 if (wq->flags & WQ_UNBOUND) { 1684 raw_spin_unlock(&pwq->pool->lock); 1685 cpu_relax(); 1686 goto retry; 1687 } 1688 /* oops */ 1689 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", 1690 wq->name, cpu); 1691 } 1692 1693 /* pwq determined, queue */ 1694 trace_workqueue_queue_work(req_cpu, pwq, work); 1695 1696 if (WARN_ON(!list_empty(&work->entry))) 1697 goto out; 1698 1699 pwq->nr_in_flight[pwq->work_color]++; 1700 work_flags = work_color_to_flags(pwq->work_color); 1701 1702 if (likely(pwq->nr_active < pwq->max_active)) { 1703 trace_workqueue_activate_work(work); 1704 pwq->nr_active++; 1705 worklist = &pwq->pool->worklist; 1706 if (list_empty(worklist)) 1707 pwq->pool->watchdog_ts = jiffies; 1708 } else { 1709 work_flags |= WORK_STRUCT_INACTIVE; 1710 worklist = &pwq->inactive_works; 1711 } 1712 1713 debug_work_activate(work); 1714 insert_work(pwq, work, worklist, work_flags); 1715 1716 out: 1717 raw_spin_unlock(&pwq->pool->lock); 1718 rcu_read_unlock(); 1719 } 1720 1721 /** 1722 * queue_work_on - queue work on specific cpu 1723 * @cpu: CPU number to execute work on 1724 * @wq: workqueue to use 1725 * @work: work to queue 1726 * 1727 * We queue the work to a specific CPU, the caller must ensure it 1728 * can't go away. Callers that fail to ensure that the specified 1729 * CPU cannot go away will execute on a randomly chosen CPU. 1730 * But note well that callers specifying a CPU that never has been 1731 * online will get a splat. 1732 * 1733 * Return: %false if @work was already on a queue, %true otherwise. 
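 *
 * A minimal usage sketch (my_work and my_handler are illustrative names, not
 * part of this file):
 *
 *	static void my_handler(struct work_struct *work)
 *	{
 *		pr_info("hello from a kworker\n");
 *	}
 *	static DECLARE_WORK(my_work, my_handler);
 *
 *	queue_work_on(cpu, system_wq, &my_work);	// @cpu must stay online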
1734 */
1735 bool queue_work_on(int cpu, struct workqueue_struct *wq,
1736 struct work_struct *work)
1737 {
1738 bool ret = false;
1739 unsigned long flags;
1740
1741 local_irq_save(flags);
1742
1743 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1744 __queue_work(cpu, wq, work);
1745 ret = true;
1746 }
1747
1748 local_irq_restore(flags);
1749 return ret;
1750 }
1751 EXPORT_SYMBOL(queue_work_on);
1752
1753 /**
1754 * workqueue_select_cpu_near - Select a CPU based on NUMA node
1755 * @node: NUMA node ID that we want to select a CPU from
1756 *
1757 * This function will attempt to find a "random" cpu available on a given
1758 * node. If there are no CPUs available on the given node it will return
1759 * WORK_CPU_UNBOUND indicating that we should just schedule to any
1760 * available CPU if we need to schedule this work.
1761 */
1762 static int workqueue_select_cpu_near(int node)
1763 {
1764 int cpu;
1765
1766 /* No point in doing this if NUMA isn't enabled for workqueues */
1767 if (!wq_numa_enabled)
1768 return WORK_CPU_UNBOUND;
1769
1770 /* Delay binding to CPU if node is not valid or online */
1771 if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
1772 return WORK_CPU_UNBOUND;
1773
1774 /* Use local node/cpu if we are already there */
1775 cpu = raw_smp_processor_id();
1776 if (node == cpu_to_node(cpu))
1777 return cpu;
1778
1779 /* Use "random", otherwise known as the "first", online CPU of the node */
1780 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
1781
1782 /* If CPU is valid return that, otherwise just defer */
1783 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
1784 }
1785
1786 /**
1787 * queue_work_node - queue work on a "random" cpu for a given NUMA node
1788 * @node: NUMA node that we are targeting the work for
1789 * @wq: workqueue to use
1790 * @work: work to queue
1791 *
1792 * We queue the work to a "random" CPU within a given NUMA node. The basic
1793 * idea here is to provide a way to somehow associate work with a given
1794 * NUMA node.
1795 *
1796 * This function will only make a best effort attempt at getting this onto
1797 * the right NUMA node. If no node is requested or the requested node is
1798 * offline then we just fall back to standard queue_work behavior.
1799 *
1800 * Currently the "random" CPU ends up being the first available CPU in the
1801 * intersection of cpu_online_mask and the cpumask of the node, unless we
1802 * are running on the node. In that case we just use the current CPU.
1803 *
1804 * Return: %false if @work was already on a queue, %true otherwise.
1805 */
1806 bool queue_work_node(int node, struct workqueue_struct *wq,
1807 struct work_struct *work)
1808 {
1809 unsigned long flags;
1810 bool ret = false;
1811
1812 /*
1813 * This current implementation is specific to unbound workqueues.
1814 * Specifically we only return the first available CPU for a given
1815 * node instead of cycling through individual CPUs within the node.
1816 *
1817 * If this is used with a per-cpu workqueue then the logic in
1818 * workqueue_select_cpu_near would need to be updated to allow for
1819 * some round robin type logic.
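 *
 * A hypothetical NUMA-aware caller might do (sketch only; my_unbound_wq
 * and my_work are made-up names):
 *
 *	queue_work_node(dev_to_node(dev), my_unbound_wq, &my_work);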
1820 */ 1821 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); 1822 1823 local_irq_save(flags); 1824 1825 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1826 int cpu = workqueue_select_cpu_near(node); 1827 1828 __queue_work(cpu, wq, work); 1829 ret = true; 1830 } 1831 1832 local_irq_restore(flags); 1833 return ret; 1834 } 1835 EXPORT_SYMBOL_GPL(queue_work_node); 1836 1837 void delayed_work_timer_fn(struct timer_list *t) 1838 { 1839 struct delayed_work *dwork = from_timer(dwork, t, timer); 1840 1841 /* should have been called from irqsafe timer with irq already off */ 1842 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 1843 } 1844 EXPORT_SYMBOL(delayed_work_timer_fn); 1845 1846 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1847 struct delayed_work *dwork, unsigned long delay) 1848 { 1849 struct timer_list *timer = &dwork->timer; 1850 struct work_struct *work = &dwork->work; 1851 1852 WARN_ON_ONCE(!wq); 1853 WARN_ON_ONCE(timer->function != delayed_work_timer_fn); 1854 WARN_ON_ONCE(timer_pending(timer)); 1855 WARN_ON_ONCE(!list_empty(&work->entry)); 1856 1857 /* 1858 * If @delay is 0, queue @dwork->work immediately. This is for 1859 * both optimization and correctness. The earliest @timer can 1860 * expire is on the closest next tick and delayed_work users depend 1861 * on that there's no such delay when @delay is 0. 1862 */ 1863 if (!delay) { 1864 __queue_work(cpu, wq, &dwork->work); 1865 return; 1866 } 1867 1868 dwork->wq = wq; 1869 dwork->cpu = cpu; 1870 timer->expires = jiffies + delay; 1871 1872 if (unlikely(cpu != WORK_CPU_UNBOUND)) 1873 add_timer_on(timer, cpu); 1874 else 1875 add_timer(timer); 1876 } 1877 1878 /** 1879 * queue_delayed_work_on - queue work on specific CPU after delay 1880 * @cpu: CPU number to execute work on 1881 * @wq: workqueue to use 1882 * @dwork: work to queue 1883 * @delay: number of jiffies to wait before queueing 1884 * 1885 * Return: %false if @work was already on a queue, %true otherwise. If 1886 * @delay is zero and @dwork is idle, it will be scheduled for immediate 1887 * execution. 1888 */ 1889 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 1890 struct delayed_work *dwork, unsigned long delay) 1891 { 1892 struct work_struct *work = &dwork->work; 1893 bool ret = false; 1894 unsigned long flags; 1895 1896 /* read the comment in __queue_work() */ 1897 local_irq_save(flags); 1898 1899 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1900 __queue_delayed_work(cpu, wq, dwork, delay); 1901 ret = true; 1902 } 1903 1904 local_irq_restore(flags); 1905 return ret; 1906 } 1907 EXPORT_SYMBOL(queue_delayed_work_on); 1908 1909 /** 1910 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1911 * @cpu: CPU number to execute work on 1912 * @wq: workqueue to use 1913 * @dwork: work to queue 1914 * @delay: number of jiffies to wait before queueing 1915 * 1916 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, 1917 * modify @dwork's timer so that it expires after @delay. If @delay is 1918 * zero, @work is guaranteed to be scheduled immediately regardless of its 1919 * current state. 1920 * 1921 * Return: %false if @dwork was idle and queued, %true if @dwork was 1922 * pending and its timer was modified. 1923 * 1924 * This function is safe to call from any context including IRQ handler. 1925 * See try_to_grab_pending() for details. 
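 *
 * Illustrative sketch (hypothetical names): a timeout that is pushed back
 * every time activity is observed, so my_timeout_fn() runs only after a
 * full HZ of quiet:
 *
 *	static void my_timeout_fn(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(my_timeout_work, my_timeout_fn);
 *
 *	mod_delayed_work_on(WORK_CPU_UNBOUND, system_wq, &my_timeout_work, HZ);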
1926 */ 1927 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, 1928 struct delayed_work *dwork, unsigned long delay) 1929 { 1930 unsigned long flags; 1931 int ret; 1932 1933 do { 1934 ret = try_to_grab_pending(&dwork->work, true, &flags); 1935 } while (unlikely(ret == -EAGAIN)); 1936 1937 if (likely(ret >= 0)) { 1938 __queue_delayed_work(cpu, wq, dwork, delay); 1939 local_irq_restore(flags); 1940 } 1941 1942 /* -ENOENT from try_to_grab_pending() becomes %true */ 1943 return ret; 1944 } 1945 EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1946 1947 static void rcu_work_rcufn(struct rcu_head *rcu) 1948 { 1949 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); 1950 1951 /* read the comment in __queue_work() */ 1952 local_irq_disable(); 1953 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); 1954 local_irq_enable(); 1955 } 1956 1957 /** 1958 * queue_rcu_work - queue work after a RCU grace period 1959 * @wq: workqueue to use 1960 * @rwork: work to queue 1961 * 1962 * Return: %false if @rwork was already pending, %true otherwise. Note 1963 * that a full RCU grace period is guaranteed only after a %true return. 1964 * While @rwork is guaranteed to be executed after a %false return, the 1965 * execution may happen before a full RCU grace period has passed. 1966 */ 1967 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) 1968 { 1969 struct work_struct *work = &rwork->work; 1970 1971 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1972 rwork->wq = wq; 1973 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); 1974 return true; 1975 } 1976 1977 return false; 1978 } 1979 EXPORT_SYMBOL(queue_rcu_work); 1980 1981 /** 1982 * worker_enter_idle - enter idle state 1983 * @worker: worker which is entering idle state 1984 * 1985 * @worker is entering idle state. Update stats and idle timer if 1986 * necessary. 1987 * 1988 * LOCKING: 1989 * raw_spin_lock_irq(pool->lock). 1990 */ 1991 static void worker_enter_idle(struct worker *worker) 1992 { 1993 struct worker_pool *pool = worker->pool; 1994 1995 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || 1996 WARN_ON_ONCE(!list_empty(&worker->entry) && 1997 (worker->hentry.next || worker->hentry.pprev))) 1998 return; 1999 2000 /* can't use worker_set_flags(), also called from create_worker() */ 2001 worker->flags |= WORKER_IDLE; 2002 pool->nr_idle++; 2003 worker->last_active = jiffies; 2004 2005 /* idle_list is LIFO */ 2006 list_add(&worker->entry, &pool->idle_list); 2007 2008 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) 2009 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 2010 2011 /* Sanity check nr_running. */ 2012 WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); 2013 } 2014 2015 /** 2016 * worker_leave_idle - leave idle state 2017 * @worker: worker which is leaving idle state 2018 * 2019 * @worker is leaving idle state. Update stats. 2020 * 2021 * LOCKING: 2022 * raw_spin_lock_irq(pool->lock). 
2023 */ 2024 static void worker_leave_idle(struct worker *worker) 2025 { 2026 struct worker_pool *pool = worker->pool; 2027 2028 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) 2029 return; 2030 worker_clr_flags(worker, WORKER_IDLE); 2031 pool->nr_idle--; 2032 list_del_init(&worker->entry); 2033 } 2034 2035 static struct worker *alloc_worker(int node) 2036 { 2037 struct worker *worker; 2038 2039 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); 2040 if (worker) { 2041 INIT_LIST_HEAD(&worker->entry); 2042 INIT_LIST_HEAD(&worker->scheduled); 2043 INIT_LIST_HEAD(&worker->node); 2044 /* on creation a worker is in !idle && prep state */ 2045 worker->flags = WORKER_PREP; 2046 } 2047 return worker; 2048 } 2049 2050 /** 2051 * worker_attach_to_pool() - attach a worker to a pool 2052 * @worker: worker to be attached 2053 * @pool: the target pool 2054 * 2055 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and 2056 * cpu-binding of @worker are kept coordinated with the pool across 2057 * cpu-[un]hotplugs. 2058 */ 2059 static void worker_attach_to_pool(struct worker *worker, 2060 struct worker_pool *pool) 2061 { 2062 mutex_lock(&wq_pool_attach_mutex); 2063 2064 /* 2065 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains 2066 * stable across this function. See the comments above the flag 2067 * definition for details. 2068 */ 2069 if (pool->flags & POOL_DISASSOCIATED) 2070 worker->flags |= WORKER_UNBOUND; 2071 else 2072 kthread_set_per_cpu(worker->task, pool->cpu); 2073 2074 if (worker->rescue_wq) 2075 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 2076 2077 list_add_tail(&worker->node, &pool->workers); 2078 worker->pool = pool; 2079 2080 mutex_unlock(&wq_pool_attach_mutex); 2081 } 2082 2083 /** 2084 * worker_detach_from_pool() - detach a worker from its pool 2085 * @worker: worker which is attached to its pool 2086 * 2087 * Undo the attaching which had been done in worker_attach_to_pool(). The 2088 * caller worker shouldn't access to the pool after detached except it has 2089 * other reference to the pool. 2090 */ 2091 static void worker_detach_from_pool(struct worker *worker) 2092 { 2093 struct worker_pool *pool = worker->pool; 2094 struct completion *detach_completion = NULL; 2095 2096 mutex_lock(&wq_pool_attach_mutex); 2097 2098 kthread_set_per_cpu(worker->task, -1); 2099 list_del(&worker->node); 2100 worker->pool = NULL; 2101 2102 if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) 2103 detach_completion = pool->detach_completion; 2104 mutex_unlock(&wq_pool_attach_mutex); 2105 2106 /* clear leftover flags without pool->lock after it is detached */ 2107 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); 2108 2109 if (detach_completion) 2110 complete(detach_completion); 2111 } 2112 2113 /** 2114 * create_worker - create a new workqueue worker 2115 * @pool: pool the new worker will belong to 2116 * 2117 * Create and start a new worker which is attached to @pool. 2118 * 2119 * CONTEXT: 2120 * Might sleep. Does GFP_KERNEL allocations. 2121 * 2122 * Return: 2123 * Pointer to the newly created worker. 
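 * %NULL if worker ID allocation, worker allocation or kthread creation
 * failed.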
2124 */ 2125 static struct worker *create_worker(struct worker_pool *pool) 2126 { 2127 struct worker *worker; 2128 int id; 2129 char id_buf[16]; 2130 2131 /* ID is needed to determine kthread name */ 2132 id = ida_alloc(&pool->worker_ida, GFP_KERNEL); 2133 if (id < 0) { 2134 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", 2135 ERR_PTR(id)); 2136 return NULL; 2137 } 2138 2139 worker = alloc_worker(pool->node); 2140 if (!worker) { 2141 pr_err_once("workqueue: Failed to allocate a worker\n"); 2142 goto fail; 2143 } 2144 2145 worker->id = id; 2146 2147 if (pool->cpu >= 0) 2148 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, 2149 pool->attrs->nice < 0 ? "H" : ""); 2150 else 2151 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); 2152 2153 worker->task = kthread_create_on_node(worker_thread, worker, pool->node, 2154 "kworker/%s", id_buf); 2155 if (IS_ERR(worker->task)) { 2156 if (PTR_ERR(worker->task) == -EINTR) { 2157 pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", 2158 id_buf); 2159 } else { 2160 pr_err_once("workqueue: Failed to create a worker thread: %pe", 2161 worker->task); 2162 } 2163 goto fail; 2164 } 2165 2166 set_user_nice(worker->task, pool->attrs->nice); 2167 kthread_bind_mask(worker->task, pool->attrs->cpumask); 2168 2169 /* successful, attach the worker to the pool */ 2170 worker_attach_to_pool(worker, pool); 2171 2172 /* start the newly created worker */ 2173 raw_spin_lock_irq(&pool->lock); 2174 worker->pool->nr_workers++; 2175 worker_enter_idle(worker); 2176 wake_up_process(worker->task); 2177 raw_spin_unlock_irq(&pool->lock); 2178 2179 return worker; 2180 2181 fail: 2182 ida_free(&pool->worker_ida, id); 2183 kfree(worker); 2184 return NULL; 2185 } 2186 2187 static void unbind_worker(struct worker *worker) 2188 { 2189 lockdep_assert_held(&wq_pool_attach_mutex); 2190 2191 kthread_set_per_cpu(worker->task, -1); 2192 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) 2193 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); 2194 else 2195 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); 2196 } 2197 2198 static void wake_dying_workers(struct list_head *cull_list) 2199 { 2200 struct worker *worker, *tmp; 2201 2202 list_for_each_entry_safe(worker, tmp, cull_list, entry) { 2203 list_del_init(&worker->entry); 2204 unbind_worker(worker); 2205 /* 2206 * If the worker was somehow already running, then it had to be 2207 * in pool->idle_list when set_worker_dying() happened or we 2208 * wouldn't have gotten here. 2209 * 2210 * Thus, the worker must either have observed the WORKER_DIE 2211 * flag, or have set its state to TASK_IDLE. Either way, the 2212 * below will be observed by the worker and is safe to do 2213 * outside of pool->lock. 2214 */ 2215 wake_up_process(worker->task); 2216 } 2217 } 2218 2219 /** 2220 * set_worker_dying - Tag a worker for destruction 2221 * @worker: worker to be destroyed 2222 * @list: transfer worker away from its pool->idle_list and into list 2223 * 2224 * Tag @worker for destruction and adjust @pool stats accordingly. The worker 2225 * should be idle. 2226 * 2227 * CONTEXT: 2228 * raw_spin_lock_irq(pool->lock). 
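 * wq_pool_attach_mutex must be held as well; both locks are asserted in
 * the function body.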
2229 */
2230 static void set_worker_dying(struct worker *worker, struct list_head *list)
2231 {
2232 struct worker_pool *pool = worker->pool;
2233
2234 lockdep_assert_held(&pool->lock);
2235 lockdep_assert_held(&wq_pool_attach_mutex);
2236
2237 /* sanity check frenzy */
2238 if (WARN_ON(worker->current_work) ||
2239 WARN_ON(!list_empty(&worker->scheduled)) ||
2240 WARN_ON(!(worker->flags & WORKER_IDLE)))
2241 return;
2242
2243 pool->nr_workers--;
2244 pool->nr_idle--;
2245
2246 worker->flags |= WORKER_DIE;
2247
2248 list_move(&worker->entry, list);
2249 list_move(&worker->node, &pool->dying_workers);
2250 }
2251
2252 /**
2253 * idle_worker_timeout - check if some idle workers can now be deleted.
2254 * @t: The pool's idle_timer that just expired
2255 *
2256 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
2257 * worker_leave_idle(), as a worker flicking between idle and active while its
2258 * pool is at the too_many_workers() tipping point would cause too much timer
2259 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
2260 * it expire and re-evaluate things from there.
2261 */
2262 static void idle_worker_timeout(struct timer_list *t)
2263 {
2264 struct worker_pool *pool = from_timer(pool, t, idle_timer);
2265 bool do_cull = false;
2266
2267 if (work_pending(&pool->idle_cull_work))
2268 return;
2269
2270 raw_spin_lock_irq(&pool->lock);
2271
2272 if (too_many_workers(pool)) {
2273 struct worker *worker;
2274 unsigned long expires;
2275
2276 /* idle_list is kept in LIFO order, check the last one */
2277 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2278 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2279 do_cull = !time_before(jiffies, expires);
2280
2281 if (!do_cull)
2282 mod_timer(&pool->idle_timer, expires);
2283 }
2284 raw_spin_unlock_irq(&pool->lock);
2285
2286 if (do_cull)
2287 queue_work(system_unbound_wq, &pool->idle_cull_work);
2288 }
2289
2290 /**
2291 * idle_cull_fn - cull workers that have been idle for too long.
2292 * @work: the pool's work for handling these idle workers
2293 *
2294 * This goes through a pool's idle workers and gets rid of those that have been
2295 * idle for at least IDLE_WORKER_TIMEOUT (five minutes).
2296 *
2297 * We don't want to disturb isolated CPUs because of a pcpu kworker being
2298 * culled, so this also resets worker affinity. This requires a sleepable
2299 * context, hence the split between timer callback and work item.
2300 */
2301 static void idle_cull_fn(struct work_struct *work)
2302 {
2303 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
2304 struct list_head cull_list;
2305
2306 INIT_LIST_HEAD(&cull_list);
2307 /*
2308 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
2309 * cannot proceed beyond worker_detach_from_pool() in its self-destruct
2310 * path. This is required as a previously-preempted worker could run after
2311 * set_worker_dying() has happened but before wake_dying_workers() did.
2312 */ 2313 mutex_lock(&wq_pool_attach_mutex); 2314 raw_spin_lock_irq(&pool->lock); 2315 2316 while (too_many_workers(pool)) { 2317 struct worker *worker; 2318 unsigned long expires; 2319 2320 worker = list_entry(pool->idle_list.prev, struct worker, entry); 2321 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 2322 2323 if (time_before(jiffies, expires)) { 2324 mod_timer(&pool->idle_timer, expires); 2325 break; 2326 } 2327 2328 set_worker_dying(worker, &cull_list); 2329 } 2330 2331 raw_spin_unlock_irq(&pool->lock); 2332 wake_dying_workers(&cull_list); 2333 mutex_unlock(&wq_pool_attach_mutex); 2334 } 2335 2336 static void send_mayday(struct work_struct *work) 2337 { 2338 struct pool_workqueue *pwq = get_work_pwq(work); 2339 struct workqueue_struct *wq = pwq->wq; 2340 2341 lockdep_assert_held(&wq_mayday_lock); 2342 2343 if (!wq->rescuer) 2344 return; 2345 2346 /* mayday mayday mayday */ 2347 if (list_empty(&pwq->mayday_node)) { 2348 /* 2349 * If @pwq is for an unbound wq, its base ref may be put at 2350 * any time due to an attribute change. Pin @pwq until the 2351 * rescuer is done with it. 2352 */ 2353 get_pwq(pwq); 2354 list_add_tail(&pwq->mayday_node, &wq->maydays); 2355 wake_up_process(wq->rescuer->task); 2356 pwq->stats[PWQ_STAT_MAYDAY]++; 2357 } 2358 } 2359 2360 static void pool_mayday_timeout(struct timer_list *t) 2361 { 2362 struct worker_pool *pool = from_timer(pool, t, mayday_timer); 2363 struct work_struct *work; 2364 2365 raw_spin_lock_irq(&pool->lock); 2366 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ 2367 2368 if (need_to_create_worker(pool)) { 2369 /* 2370 * We've been trying to create a new worker but 2371 * haven't been successful. We might be hitting an 2372 * allocation deadlock. Send distress signals to 2373 * rescuers. 2374 */ 2375 list_for_each_entry(work, &pool->worklist, entry) 2376 send_mayday(work); 2377 } 2378 2379 raw_spin_unlock(&wq_mayday_lock); 2380 raw_spin_unlock_irq(&pool->lock); 2381 2382 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 2383 } 2384 2385 /** 2386 * maybe_create_worker - create a new worker if necessary 2387 * @pool: pool to create a new worker for 2388 * 2389 * Create a new worker for @pool if necessary. @pool is guaranteed to 2390 * have at least one idle worker on return from this function. If 2391 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 2392 * sent to all rescuers with works scheduled on @pool to resolve 2393 * possible allocation deadlock. 2394 * 2395 * On return, need_to_create_worker() is guaranteed to be %false and 2396 * may_start_working() %true. 2397 * 2398 * LOCKING: 2399 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2400 * multiple times. Does GFP_KERNEL allocations. Called only from 2401 * manager. 
2402 */ 2403 static void maybe_create_worker(struct worker_pool *pool) 2404 __releases(&pool->lock) 2405 __acquires(&pool->lock) 2406 { 2407 restart: 2408 raw_spin_unlock_irq(&pool->lock); 2409 2410 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 2411 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 2412 2413 while (true) { 2414 if (create_worker(pool) || !need_to_create_worker(pool)) 2415 break; 2416 2417 schedule_timeout_interruptible(CREATE_COOLDOWN); 2418 2419 if (!need_to_create_worker(pool)) 2420 break; 2421 } 2422 2423 del_timer_sync(&pool->mayday_timer); 2424 raw_spin_lock_irq(&pool->lock); 2425 /* 2426 * This is necessary even after a new worker was just successfully 2427 * created as @pool->lock was dropped and the new worker might have 2428 * already become busy. 2429 */ 2430 if (need_to_create_worker(pool)) 2431 goto restart; 2432 } 2433 2434 /** 2435 * manage_workers - manage worker pool 2436 * @worker: self 2437 * 2438 * Assume the manager role and manage the worker pool @worker belongs 2439 * to. At any given time, there can be only zero or one manager per 2440 * pool. The exclusion is handled automatically by this function. 2441 * 2442 * The caller can safely start processing works on false return. On 2443 * true return, it's guaranteed that need_to_create_worker() is false 2444 * and may_start_working() is true. 2445 * 2446 * CONTEXT: 2447 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2448 * multiple times. Does GFP_KERNEL allocations. 2449 * 2450 * Return: 2451 * %false if the pool doesn't need management and the caller can safely 2452 * start processing works, %true if management function was performed and 2453 * the conditions that the caller verified before calling the function may 2454 * no longer be true. 2455 */ 2456 static bool manage_workers(struct worker *worker) 2457 { 2458 struct worker_pool *pool = worker->pool; 2459 2460 if (pool->flags & POOL_MANAGER_ACTIVE) 2461 return false; 2462 2463 pool->flags |= POOL_MANAGER_ACTIVE; 2464 pool->manager = worker; 2465 2466 maybe_create_worker(pool); 2467 2468 pool->manager = NULL; 2469 pool->flags &= ~POOL_MANAGER_ACTIVE; 2470 rcuwait_wake_up(&manager_wait); 2471 return true; 2472 } 2473 2474 /** 2475 * process_one_work - process single work 2476 * @worker: self 2477 * @work: work to process 2478 * 2479 * Process @work. This function contains all the logics necessary to 2480 * process a single work including synchronization against and 2481 * interaction with other workers on the same cpu, queueing and 2482 * flushing. As long as context requirement is met, any worker can 2483 * call this function to process a work. 2484 * 2485 * CONTEXT: 2486 * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 2487 */ 2488 static void process_one_work(struct worker *worker, struct work_struct *work) 2489 __releases(&pool->lock) 2490 __acquires(&pool->lock) 2491 { 2492 struct pool_workqueue *pwq = get_work_pwq(work); 2493 struct worker_pool *pool = worker->pool; 2494 unsigned long work_data; 2495 struct worker *collision; 2496 #ifdef CONFIG_LOCKDEP 2497 /* 2498 * It is permissible to free the struct work_struct from 2499 * inside the function that is called from it, this we need to 2500 * take into account for lockdep too. To avoid bogus "held 2501 * lock freed" warnings as well as problems when looking into 2502 * work->lockdep_map, make a copy and use that here. 
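 *
 * A self-freeing handler looks roughly like this (hypothetical names;
 * kfree(obj) releases the embedded work_struct too):
 *
 *	static void obj_release_workfn(struct work_struct *work)
 *	{
 *		struct my_obj *obj = container_of(work, struct my_obj, release_work);
 *
 *		kfree(obj);
 *	}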
2503 */ 2504 struct lockdep_map lockdep_map; 2505 2506 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 2507 #endif 2508 /* ensure we're on the correct CPU */ 2509 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && 2510 raw_smp_processor_id() != pool->cpu); 2511 2512 /* 2513 * A single work shouldn't be executed concurrently by 2514 * multiple workers on a single cpu. Check whether anyone is 2515 * already processing the work. If so, defer the work to the 2516 * currently executing one. 2517 */ 2518 collision = find_worker_executing_work(pool, work); 2519 if (unlikely(collision)) { 2520 move_linked_works(work, &collision->scheduled, NULL); 2521 return; 2522 } 2523 2524 /* claim and dequeue */ 2525 debug_work_deactivate(work); 2526 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); 2527 worker->current_work = work; 2528 worker->current_func = work->func; 2529 worker->current_pwq = pwq; 2530 worker->current_at = worker->task->se.sum_exec_runtime; 2531 work_data = *work_data_bits(work); 2532 worker->current_color = get_work_color(work_data); 2533 2534 /* 2535 * Record wq name for cmdline and debug reporting, may get 2536 * overridden through set_worker_desc(). 2537 */ 2538 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); 2539 2540 list_del_init(&work->entry); 2541 2542 /* 2543 * CPU intensive works don't participate in concurrency management. 2544 * They're the scheduler's responsibility. This takes @worker out 2545 * of concurrency management and the next code block will chain 2546 * execution of the pending work items. 2547 */ 2548 if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) 2549 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 2550 2551 /* 2552 * Wake up another worker if necessary. The condition is always 2553 * false for normal per-cpu workers since nr_running would always 2554 * be >= 1 at this point. This is used to chain execution of the 2555 * pending work items for WORKER_NOT_RUNNING workers such as the 2556 * UNBOUND and CPU_INTENSIVE ones. 2557 */ 2558 if (need_more_worker(pool)) 2559 wake_up_worker(pool); 2560 2561 /* 2562 * Record the last pool and clear PENDING which should be the last 2563 * update to @work. Also, do this inside @pool->lock so that 2564 * PENDING and queued state changes happen together while IRQ is 2565 * disabled. 2566 */ 2567 set_work_pool_and_clear_pending(work, pool->id); 2568 2569 raw_spin_unlock_irq(&pool->lock); 2570 2571 lock_map_acquire(&pwq->wq->lockdep_map); 2572 lock_map_acquire(&lockdep_map); 2573 /* 2574 * Strictly speaking we should mark the invariant state without holding 2575 * any locks, that is, before these two lock_map_acquire()'s. 2576 * 2577 * However, that would result in: 2578 * 2579 * A(W1) 2580 * WFC(C) 2581 * A(W1) 2582 * C(C) 2583 * 2584 * Which would create W1->C->W1 dependencies, even though there is no 2585 * actual deadlock possible. There are two solutions, using a 2586 * read-recursive acquire on the work(queue) 'locks', but this will then 2587 * hit the lockdep limitation on recursive locks, or simply discard 2588 * these locks. 2589 * 2590 * AFAICT there is no possible deadlock scenario between the 2591 * flush_work() and complete() primitives (except for single-threaded 2592 * workqueues), so hiding them isn't a problem. 2593 */ 2594 lockdep_invariant_state(true); 2595 pwq->stats[PWQ_STAT_STARTED]++; 2596 trace_workqueue_execute_start(work); 2597 worker->current_func(work); 2598 /* 2599 * While we must be careful to not use "work" after this, the trace 2600 * point will only record its address. 
2601 */ 2602 trace_workqueue_execute_end(work, worker->current_func); 2603 pwq->stats[PWQ_STAT_COMPLETED]++; 2604 lock_map_release(&lockdep_map); 2605 lock_map_release(&pwq->wq->lockdep_map); 2606 2607 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2608 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2609 " last function: %ps\n", 2610 current->comm, preempt_count(), task_pid_nr(current), 2611 worker->current_func); 2612 debug_show_held_locks(current); 2613 dump_stack(); 2614 } 2615 2616 /* 2617 * The following prevents a kworker from hogging CPU on !PREEMPTION 2618 * kernels, where a requeueing work item waiting for something to 2619 * happen could deadlock with stop_machine as such work item could 2620 * indefinitely requeue itself while all other CPUs are trapped in 2621 * stop_machine. At the same time, report a quiescent RCU state so 2622 * the same condition doesn't freeze RCU. 2623 */ 2624 cond_resched(); 2625 2626 raw_spin_lock_irq(&pool->lock); 2627 2628 /* 2629 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked 2630 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than 2631 * wq_cpu_intensive_thresh_us. Clear it. 2632 */ 2633 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2634 2635 /* tag the worker for identification in schedule() */ 2636 worker->last_func = worker->current_func; 2637 2638 /* we're done with it, release */ 2639 hash_del(&worker->hentry); 2640 worker->current_work = NULL; 2641 worker->current_func = NULL; 2642 worker->current_pwq = NULL; 2643 worker->current_color = INT_MAX; 2644 pwq_dec_nr_in_flight(pwq, work_data); 2645 } 2646 2647 /** 2648 * process_scheduled_works - process scheduled works 2649 * @worker: self 2650 * 2651 * Process all scheduled works. Please note that the scheduled list 2652 * may change while processing a work, so this function repeatedly 2653 * fetches a work from the top and executes it. 2654 * 2655 * CONTEXT: 2656 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2657 * multiple times. 2658 */ 2659 static void process_scheduled_works(struct worker *worker) 2660 { 2661 while (!list_empty(&worker->scheduled)) { 2662 struct work_struct *work = list_first_entry(&worker->scheduled, 2663 struct work_struct, entry); 2664 process_one_work(worker, work); 2665 } 2666 } 2667 2668 static void set_pf_worker(bool val) 2669 { 2670 mutex_lock(&wq_pool_attach_mutex); 2671 if (val) 2672 current->flags |= PF_WQ_WORKER; 2673 else 2674 current->flags &= ~PF_WQ_WORKER; 2675 mutex_unlock(&wq_pool_attach_mutex); 2676 } 2677 2678 /** 2679 * worker_thread - the worker thread function 2680 * @__worker: self 2681 * 2682 * The worker thread function. All workers belong to a worker_pool - 2683 * either a per-cpu one or dynamic unbound one. These workers process all 2684 * work items regardless of their specific target workqueue. The only 2685 * exception is work items which belong to workqueues with a rescuer which 2686 * will be explained in rescuer_thread(). 2687 * 2688 * Return: 0 2689 */ 2690 static int worker_thread(void *__worker) 2691 { 2692 struct worker *worker = __worker; 2693 struct worker_pool *pool = worker->pool; 2694 2695 /* tell the scheduler that this is a workqueue worker */ 2696 set_pf_worker(true); 2697 woke_up: 2698 raw_spin_lock_irq(&pool->lock); 2699 2700 /* am I supposed to die? 
*/ 2701 if (unlikely(worker->flags & WORKER_DIE)) { 2702 raw_spin_unlock_irq(&pool->lock); 2703 set_pf_worker(false); 2704 2705 set_task_comm(worker->task, "kworker/dying"); 2706 ida_free(&pool->worker_ida, worker->id); 2707 worker_detach_from_pool(worker); 2708 WARN_ON_ONCE(!list_empty(&worker->entry)); 2709 kfree(worker); 2710 return 0; 2711 } 2712 2713 worker_leave_idle(worker); 2714 recheck: 2715 /* no more worker necessary? */ 2716 if (!need_more_worker(pool)) 2717 goto sleep; 2718 2719 /* do we need to manage? */ 2720 if (unlikely(!may_start_working(pool)) && manage_workers(worker)) 2721 goto recheck; 2722 2723 /* 2724 * ->scheduled list can only be filled while a worker is 2725 * preparing to process a work or actually processing it. 2726 * Make sure nobody diddled with it while I was sleeping. 2727 */ 2728 WARN_ON_ONCE(!list_empty(&worker->scheduled)); 2729 2730 /* 2731 * Finish PREP stage. We're guaranteed to have at least one idle 2732 * worker or that someone else has already assumed the manager 2733 * role. This is where @worker starts participating in concurrency 2734 * management if applicable and concurrency management is restored 2735 * after being rebound. See rebind_workers() for details. 2736 */ 2737 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); 2738 2739 do { 2740 struct work_struct *work = 2741 list_first_entry(&pool->worklist, 2742 struct work_struct, entry); 2743 2744 pool->watchdog_ts = jiffies; 2745 2746 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2747 /* optimization path, not strictly necessary */ 2748 process_one_work(worker, work); 2749 if (unlikely(!list_empty(&worker->scheduled))) 2750 process_scheduled_works(worker); 2751 } else { 2752 move_linked_works(work, &worker->scheduled, NULL); 2753 process_scheduled_works(worker); 2754 } 2755 } while (keep_working(pool)); 2756 2757 worker_set_flags(worker, WORKER_PREP); 2758 sleep: 2759 /* 2760 * pool->lock is held and there's no work to process and no need to 2761 * manage, sleep. Workers are woken up only while holding 2762 * pool->lock or from local cpu, so setting the current state 2763 * before releasing pool->lock is enough to prevent losing any 2764 * event. 2765 */ 2766 worker_enter_idle(worker); 2767 __set_current_state(TASK_IDLE); 2768 raw_spin_unlock_irq(&pool->lock); 2769 schedule(); 2770 goto woke_up; 2771 } 2772 2773 /** 2774 * rescuer_thread - the rescuer thread function 2775 * @__rescuer: self 2776 * 2777 * Workqueue rescuer thread function. There's one rescuer for each 2778 * workqueue which has WQ_MEM_RECLAIM set. 2779 * 2780 * Regular work processing on a pool may block trying to create a new 2781 * worker which uses GFP_KERNEL allocation which has slight chance of 2782 * developing into deadlock if some works currently on the same queue 2783 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2784 * the problem rescuer solves. 2785 * 2786 * When such condition is possible, the pool summons rescuers of all 2787 * workqueues which have works queued on the pool and let them process 2788 * those works so that forward progress can be guaranteed. 2789 * 2790 * This should happen rarely. 2791 * 2792 * Return: 0 2793 */ 2794 static int rescuer_thread(void *__rescuer) 2795 { 2796 struct worker *rescuer = __rescuer; 2797 struct workqueue_struct *wq = rescuer->rescue_wq; 2798 struct list_head *scheduled = &rescuer->scheduled; 2799 bool should_stop; 2800 2801 set_user_nice(current, RESCUER_NICE_LEVEL); 2802 2803 /* 2804 * Mark rescuer as worker too. 
As WORKER_PREP is never cleared, it 2805 * doesn't participate in concurrency management. 2806 */ 2807 set_pf_worker(true); 2808 repeat: 2809 set_current_state(TASK_IDLE); 2810 2811 /* 2812 * By the time the rescuer is requested to stop, the workqueue 2813 * shouldn't have any work pending, but @wq->maydays may still have 2814 * pwq(s) queued. This can happen by non-rescuer workers consuming 2815 * all the work items before the rescuer got to them. Go through 2816 * @wq->maydays processing before acting on should_stop so that the 2817 * list is always empty on exit. 2818 */ 2819 should_stop = kthread_should_stop(); 2820 2821 /* see whether any pwq is asking for help */ 2822 raw_spin_lock_irq(&wq_mayday_lock); 2823 2824 while (!list_empty(&wq->maydays)) { 2825 struct pool_workqueue *pwq = list_first_entry(&wq->maydays, 2826 struct pool_workqueue, mayday_node); 2827 struct worker_pool *pool = pwq->pool; 2828 struct work_struct *work, *n; 2829 bool first = true; 2830 2831 __set_current_state(TASK_RUNNING); 2832 list_del_init(&pwq->mayday_node); 2833 2834 raw_spin_unlock_irq(&wq_mayday_lock); 2835 2836 worker_attach_to_pool(rescuer, pool); 2837 2838 raw_spin_lock_irq(&pool->lock); 2839 2840 /* 2841 * Slurp in all works issued via this workqueue and 2842 * process'em. 2843 */ 2844 WARN_ON_ONCE(!list_empty(scheduled)); 2845 list_for_each_entry_safe(work, n, &pool->worklist, entry) { 2846 if (get_work_pwq(work) == pwq) { 2847 if (first) 2848 pool->watchdog_ts = jiffies; 2849 move_linked_works(work, scheduled, &n); 2850 pwq->stats[PWQ_STAT_RESCUED]++; 2851 } 2852 first = false; 2853 } 2854 2855 if (!list_empty(scheduled)) { 2856 process_scheduled_works(rescuer); 2857 2858 /* 2859 * The above execution of rescued work items could 2860 * have created more to rescue through 2861 * pwq_activate_first_inactive() or chained 2862 * queueing. Let's put @pwq back on mayday list so 2863 * that such back-to-back work items, which may be 2864 * being used to relieve memory pressure, don't 2865 * incur MAYDAY_INTERVAL delay inbetween. 2866 */ 2867 if (pwq->nr_active && need_to_create_worker(pool)) { 2868 raw_spin_lock(&wq_mayday_lock); 2869 /* 2870 * Queue iff we aren't racing destruction 2871 * and somebody else hasn't queued it already. 2872 */ 2873 if (wq->rescuer && list_empty(&pwq->mayday_node)) { 2874 get_pwq(pwq); 2875 list_add_tail(&pwq->mayday_node, &wq->maydays); 2876 } 2877 raw_spin_unlock(&wq_mayday_lock); 2878 } 2879 } 2880 2881 /* 2882 * Put the reference grabbed by send_mayday(). @pool won't 2883 * go away while we're still attached to it. 2884 */ 2885 put_pwq(pwq); 2886 2887 /* 2888 * Leave this pool. If need_more_worker() is %true, notify a 2889 * regular worker; otherwise, we end up with 0 concurrency 2890 * and stalling the execution. 
2891 */ 2892 if (need_more_worker(pool)) 2893 wake_up_worker(pool); 2894 2895 raw_spin_unlock_irq(&pool->lock); 2896 2897 worker_detach_from_pool(rescuer); 2898 2899 raw_spin_lock_irq(&wq_mayday_lock); 2900 } 2901 2902 raw_spin_unlock_irq(&wq_mayday_lock); 2903 2904 if (should_stop) { 2905 __set_current_state(TASK_RUNNING); 2906 set_pf_worker(false); 2907 return 0; 2908 } 2909 2910 /* rescuers should never participate in concurrency management */ 2911 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2912 schedule(); 2913 goto repeat; 2914 } 2915 2916 /** 2917 * check_flush_dependency - check for flush dependency sanity 2918 * @target_wq: workqueue being flushed 2919 * @target_work: work item being flushed (NULL for workqueue flushes) 2920 * 2921 * %current is trying to flush the whole @target_wq or @target_work on it. 2922 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not 2923 * reclaiming memory or running on a workqueue which doesn't have 2924 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to 2925 * a deadlock. 2926 */ 2927 static void check_flush_dependency(struct workqueue_struct *target_wq, 2928 struct work_struct *target_work) 2929 { 2930 work_func_t target_func = target_work ? target_work->func : NULL; 2931 struct worker *worker; 2932 2933 if (target_wq->flags & WQ_MEM_RECLAIM) 2934 return; 2935 2936 worker = current_wq_worker(); 2937 2938 WARN_ONCE(current->flags & PF_MEMALLOC, 2939 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", 2940 current->pid, current->comm, target_wq->name, target_func); 2941 WARN_ONCE(worker && ((worker->current_pwq->wq->flags & 2942 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), 2943 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", 2944 worker->current_pwq->wq->name, worker->current_func, 2945 target_wq->name, target_func); 2946 } 2947 2948 struct wq_barrier { 2949 struct work_struct work; 2950 struct completion done; 2951 struct task_struct *task; /* purely informational */ 2952 }; 2953 2954 static void wq_barrier_func(struct work_struct *work) 2955 { 2956 struct wq_barrier *barr = container_of(work, struct wq_barrier, work); 2957 complete(&barr->done); 2958 } 2959 2960 /** 2961 * insert_wq_barrier - insert a barrier work 2962 * @pwq: pwq to insert barrier into 2963 * @barr: wq_barrier to insert 2964 * @target: target work to attach @barr to 2965 * @worker: worker currently executing @target, NULL if @target is not executing 2966 * 2967 * @barr is linked to @target such that @barr is completed only after 2968 * @target finishes execution. Please note that the ordering 2969 * guarantee is observed only with respect to @target and on the local 2970 * cpu. 2971 * 2972 * Currently, a queued barrier can't be canceled. This is because 2973 * try_to_grab_pending() can't determine whether the work to be 2974 * grabbed is at the head of the queue and thus can't clear LINKED 2975 * flag of the previous work while there must be a valid next work 2976 * after a work with LINKED flag set. 2977 * 2978 * Note that when @worker is non-NULL, @target may be modified 2979 * underneath us, so we can't reliably determine pwq from @target. 2980 * 2981 * CONTEXT: 2982 * raw_spin_lock_irq(pool->lock). 
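 *
 * For illustration, when @target is queued but not yet executing, the pwq
 * list ends up roughly as
 *
 *	... -> @target (LINKED) -> @barr -> later works -> ...
 *
 * so @barr runs right after @target on the same pool. When @target is
 * already executing, @barr is instead placed at the head of @worker's
 * ->scheduled list.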
2983 */ 2984 static void insert_wq_barrier(struct pool_workqueue *pwq, 2985 struct wq_barrier *barr, 2986 struct work_struct *target, struct worker *worker) 2987 { 2988 unsigned int work_flags = 0; 2989 unsigned int work_color; 2990 struct list_head *head; 2991 2992 /* 2993 * debugobject calls are safe here even with pool->lock locked 2994 * as we know for sure that this will not trigger any of the 2995 * checks and call back into the fixup functions where we 2996 * might deadlock. 2997 */ 2998 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2999 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 3000 3001 init_completion_map(&barr->done, &target->lockdep_map); 3002 3003 barr->task = current; 3004 3005 /* The barrier work item does not participate in pwq->nr_active. */ 3006 work_flags |= WORK_STRUCT_INACTIVE; 3007 3008 /* 3009 * If @target is currently being executed, schedule the 3010 * barrier to the worker; otherwise, put it after @target. 3011 */ 3012 if (worker) { 3013 head = worker->scheduled.next; 3014 work_color = worker->current_color; 3015 } else { 3016 unsigned long *bits = work_data_bits(target); 3017 3018 head = target->entry.next; 3019 /* there can already be other linked works, inherit and set */ 3020 work_flags |= *bits & WORK_STRUCT_LINKED; 3021 work_color = get_work_color(*bits); 3022 __set_bit(WORK_STRUCT_LINKED_BIT, bits); 3023 } 3024 3025 pwq->nr_in_flight[work_color]++; 3026 work_flags |= work_color_to_flags(work_color); 3027 3028 debug_work_activate(&barr->work); 3029 insert_work(pwq, &barr->work, head, work_flags); 3030 } 3031 3032 /** 3033 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing 3034 * @wq: workqueue being flushed 3035 * @flush_color: new flush color, < 0 for no-op 3036 * @work_color: new work color, < 0 for no-op 3037 * 3038 * Prepare pwqs for workqueue flushing. 3039 * 3040 * If @flush_color is non-negative, flush_color on all pwqs should be 3041 * -1. If no pwq has in-flight commands at the specified color, all 3042 * pwq->flush_color's stay at -1 and %false is returned. If any pwq 3043 * has in flight commands, its pwq->flush_color is set to 3044 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq 3045 * wakeup logic is armed and %true is returned. 3046 * 3047 * The caller should have initialized @wq->first_flusher prior to 3048 * calling this function with non-negative @flush_color. If 3049 * @flush_color is negative, no flush color update is done and %false 3050 * is returned. 3051 * 3052 * If @work_color is non-negative, all pwqs should have the same 3053 * work_color which is previous to @work_color and all will be 3054 * advanced to @work_color. 3055 * 3056 * CONTEXT: 3057 * mutex_lock(wq->mutex). 3058 * 3059 * Return: 3060 * %true if @flush_color >= 0 and there's something to flush. %false 3061 * otherwise. 
3062 */ 3063 static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, 3064 int flush_color, int work_color) 3065 { 3066 bool wait = false; 3067 struct pool_workqueue *pwq; 3068 3069 if (flush_color >= 0) { 3070 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); 3071 atomic_set(&wq->nr_pwqs_to_flush, 1); 3072 } 3073 3074 for_each_pwq(pwq, wq) { 3075 struct worker_pool *pool = pwq->pool; 3076 3077 raw_spin_lock_irq(&pool->lock); 3078 3079 if (flush_color >= 0) { 3080 WARN_ON_ONCE(pwq->flush_color != -1); 3081 3082 if (pwq->nr_in_flight[flush_color]) { 3083 pwq->flush_color = flush_color; 3084 atomic_inc(&wq->nr_pwqs_to_flush); 3085 wait = true; 3086 } 3087 } 3088 3089 if (work_color >= 0) { 3090 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); 3091 pwq->work_color = work_color; 3092 } 3093 3094 raw_spin_unlock_irq(&pool->lock); 3095 } 3096 3097 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) 3098 complete(&wq->first_flusher->done); 3099 3100 return wait; 3101 } 3102 3103 /** 3104 * __flush_workqueue - ensure that any scheduled work has run to completion. 3105 * @wq: workqueue to flush 3106 * 3107 * This function sleeps until all work items which were queued on entry 3108 * have finished execution, but it is not livelocked by new incoming ones. 3109 */ 3110 void __flush_workqueue(struct workqueue_struct *wq) 3111 { 3112 struct wq_flusher this_flusher = { 3113 .list = LIST_HEAD_INIT(this_flusher.list), 3114 .flush_color = -1, 3115 .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), 3116 }; 3117 int next_color; 3118 3119 if (WARN_ON(!wq_online)) 3120 return; 3121 3122 lock_map_acquire(&wq->lockdep_map); 3123 lock_map_release(&wq->lockdep_map); 3124 3125 mutex_lock(&wq->mutex); 3126 3127 /* 3128 * Start-to-wait phase 3129 */ 3130 next_color = work_next_color(wq->work_color); 3131 3132 if (next_color != wq->flush_color) { 3133 /* 3134 * Color space is not full. The current work_color 3135 * becomes our flush_color and work_color is advanced 3136 * by one. 3137 */ 3138 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); 3139 this_flusher.flush_color = wq->work_color; 3140 wq->work_color = next_color; 3141 3142 if (!wq->first_flusher) { 3143 /* no flush in progress, become the first flusher */ 3144 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 3145 3146 wq->first_flusher = &this_flusher; 3147 3148 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, 3149 wq->work_color)) { 3150 /* nothing to flush, done */ 3151 wq->flush_color = next_color; 3152 wq->first_flusher = NULL; 3153 goto out_unlock; 3154 } 3155 } else { 3156 /* wait in queue */ 3157 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); 3158 list_add_tail(&this_flusher.list, &wq->flusher_queue); 3159 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 3160 } 3161 } else { 3162 /* 3163 * Oops, color space is full, wait on overflow queue. 3164 * The next flush completion will assign us 3165 * flush_color and transfer to flusher_queue. 3166 */ 3167 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 3168 } 3169 3170 check_flush_dependency(wq, NULL); 3171 3172 mutex_unlock(&wq->mutex); 3173 3174 wait_for_completion(&this_flusher.done); 3175 3176 /* 3177 * Wake-up-and-cascade phase 3178 * 3179 * First flushers are responsible for cascading flushes and 3180 * handling overflow. Non-first flushers can simply return. 
3181 */ 3182 if (READ_ONCE(wq->first_flusher) != &this_flusher) 3183 return; 3184 3185 mutex_lock(&wq->mutex); 3186 3187 /* we might have raced, check again with mutex held */ 3188 if (wq->first_flusher != &this_flusher) 3189 goto out_unlock; 3190 3191 WRITE_ONCE(wq->first_flusher, NULL); 3192 3193 WARN_ON_ONCE(!list_empty(&this_flusher.list)); 3194 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 3195 3196 while (true) { 3197 struct wq_flusher *next, *tmp; 3198 3199 /* complete all the flushers sharing the current flush color */ 3200 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { 3201 if (next->flush_color != wq->flush_color) 3202 break; 3203 list_del_init(&next->list); 3204 complete(&next->done); 3205 } 3206 3207 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && 3208 wq->flush_color != work_next_color(wq->work_color)); 3209 3210 /* this flush_color is finished, advance by one */ 3211 wq->flush_color = work_next_color(wq->flush_color); 3212 3213 /* one color has been freed, handle overflow queue */ 3214 if (!list_empty(&wq->flusher_overflow)) { 3215 /* 3216 * Assign the same color to all overflowed 3217 * flushers, advance work_color and append to 3218 * flusher_queue. This is the start-to-wait 3219 * phase for these overflowed flushers. 3220 */ 3221 list_for_each_entry(tmp, &wq->flusher_overflow, list) 3222 tmp->flush_color = wq->work_color; 3223 3224 wq->work_color = work_next_color(wq->work_color); 3225 3226 list_splice_tail_init(&wq->flusher_overflow, 3227 &wq->flusher_queue); 3228 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 3229 } 3230 3231 if (list_empty(&wq->flusher_queue)) { 3232 WARN_ON_ONCE(wq->flush_color != wq->work_color); 3233 break; 3234 } 3235 3236 /* 3237 * Need to flush more colors. Make the next flusher 3238 * the new first flusher and arm pwqs. 3239 */ 3240 WARN_ON_ONCE(wq->flush_color == wq->work_color); 3241 WARN_ON_ONCE(wq->flush_color != next->flush_color); 3242 3243 list_del_init(&next->list); 3244 wq->first_flusher = next; 3245 3246 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) 3247 break; 3248 3249 /* 3250 * Meh... this color is already done, clear first 3251 * flusher and repeat cascading. 3252 */ 3253 wq->first_flusher = NULL; 3254 } 3255 3256 out_unlock: 3257 mutex_unlock(&wq->mutex); 3258 } 3259 EXPORT_SYMBOL(__flush_workqueue); 3260 3261 /** 3262 * drain_workqueue - drain a workqueue 3263 * @wq: workqueue to drain 3264 * 3265 * Wait until the workqueue becomes empty. While draining is in progress, 3266 * only chain queueing is allowed. IOW, only currently pending or running 3267 * work items on @wq can queue further work items on it. @wq is flushed 3268 * repeatedly until it becomes empty. The number of flushing is determined 3269 * by the depth of chaining and should be relatively short. Whine if it 3270 * takes too long. 3271 */ 3272 void drain_workqueue(struct workqueue_struct *wq) 3273 { 3274 unsigned int flush_cnt = 0; 3275 struct pool_workqueue *pwq; 3276 3277 /* 3278 * __queue_work() needs to test whether there are drainers, is much 3279 * hotter than drain_workqueue() and already looks at @wq->flags. 3280 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. 
3281 */ 3282 mutex_lock(&wq->mutex); 3283 if (!wq->nr_drainers++) 3284 wq->flags |= __WQ_DRAINING; 3285 mutex_unlock(&wq->mutex); 3286 reflush: 3287 __flush_workqueue(wq); 3288 3289 mutex_lock(&wq->mutex); 3290 3291 for_each_pwq(pwq, wq) { 3292 bool drained; 3293 3294 raw_spin_lock_irq(&pwq->pool->lock); 3295 drained = !pwq->nr_active && list_empty(&pwq->inactive_works); 3296 raw_spin_unlock_irq(&pwq->pool->lock); 3297 3298 if (drained) 3299 continue; 3300 3301 if (++flush_cnt == 10 || 3302 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 3303 pr_warn("workqueue %s: %s() isn't complete after %u tries\n", 3304 wq->name, __func__, flush_cnt); 3305 3306 mutex_unlock(&wq->mutex); 3307 goto reflush; 3308 } 3309 3310 if (!--wq->nr_drainers) 3311 wq->flags &= ~__WQ_DRAINING; 3312 mutex_unlock(&wq->mutex); 3313 } 3314 EXPORT_SYMBOL_GPL(drain_workqueue); 3315 3316 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 3317 bool from_cancel) 3318 { 3319 struct worker *worker = NULL; 3320 struct worker_pool *pool; 3321 struct pool_workqueue *pwq; 3322 3323 might_sleep(); 3324 3325 rcu_read_lock(); 3326 pool = get_work_pool(work); 3327 if (!pool) { 3328 rcu_read_unlock(); 3329 return false; 3330 } 3331 3332 raw_spin_lock_irq(&pool->lock); 3333 /* see the comment in try_to_grab_pending() with the same code */ 3334 pwq = get_work_pwq(work); 3335 if (pwq) { 3336 if (unlikely(pwq->pool != pool)) 3337 goto already_gone; 3338 } else { 3339 worker = find_worker_executing_work(pool, work); 3340 if (!worker) 3341 goto already_gone; 3342 pwq = worker->current_pwq; 3343 } 3344 3345 check_flush_dependency(pwq->wq, work); 3346 3347 insert_wq_barrier(pwq, barr, work, worker); 3348 raw_spin_unlock_irq(&pool->lock); 3349 3350 /* 3351 * Force a lock recursion deadlock when using flush_work() inside a 3352 * single-threaded or rescuer equipped workqueue. 3353 * 3354 * For single threaded workqueues the deadlock happens when the work 3355 * is after the work issuing the flush_work(). For rescuer equipped 3356 * workqueues the deadlock happens when the rescuer stalls, blocking 3357 * forward progress. 3358 */ 3359 if (!from_cancel && 3360 (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { 3361 lock_map_acquire(&pwq->wq->lockdep_map); 3362 lock_map_release(&pwq->wq->lockdep_map); 3363 } 3364 rcu_read_unlock(); 3365 return true; 3366 already_gone: 3367 raw_spin_unlock_irq(&pool->lock); 3368 rcu_read_unlock(); 3369 return false; 3370 } 3371 3372 static bool __flush_work(struct work_struct *work, bool from_cancel) 3373 { 3374 struct wq_barrier barr; 3375 3376 if (WARN_ON(!wq_online)) 3377 return false; 3378 3379 if (WARN_ON(!work->func)) 3380 return false; 3381 3382 lock_map_acquire(&work->lockdep_map); 3383 lock_map_release(&work->lockdep_map); 3384 3385 if (start_flush_work(work, &barr, from_cancel)) { 3386 wait_for_completion(&barr.done); 3387 destroy_work_on_stack(&barr.work); 3388 return true; 3389 } else { 3390 return false; 3391 } 3392 } 3393 3394 /** 3395 * flush_work - wait for a work to finish executing the last queueing instance 3396 * @work: the work to flush 3397 * 3398 * Wait until @work has finished execution. @work is guaranteed to be idle 3399 * on return if it hasn't been requeued since flush started. 3400 * 3401 * Return: 3402 * %true if flush_work() waited for the work to finish execution, 3403 * %false if it was already idle. 
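 *
 * Illustrative sketch (hypothetical names): wait for a previously queued
 * item before freeing the data it operates on:
 *
 *	flush_work(&dev->refresh_work);
 *	kfree(dev->refresh_buf);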
3404 */ 3405 bool flush_work(struct work_struct *work) 3406 { 3407 return __flush_work(work, false); 3408 } 3409 EXPORT_SYMBOL_GPL(flush_work); 3410 3411 struct cwt_wait { 3412 wait_queue_entry_t wait; 3413 struct work_struct *work; 3414 }; 3415 3416 static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 3417 { 3418 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); 3419 3420 if (cwait->work != key) 3421 return 0; 3422 return autoremove_wake_function(wait, mode, sync, key); 3423 } 3424 3425 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) 3426 { 3427 static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); 3428 unsigned long flags; 3429 int ret; 3430 3431 do { 3432 ret = try_to_grab_pending(work, is_dwork, &flags); 3433 /* 3434 * If someone else is already canceling, wait for it to 3435 * finish. flush_work() doesn't work for PREEMPT_NONE 3436 * because we may get scheduled between @work's completion 3437 * and the other canceling task resuming and clearing 3438 * CANCELING - flush_work() will return false immediately 3439 * as @work is no longer busy, try_to_grab_pending() will 3440 * return -ENOENT as @work is still being canceled and the 3441 * other canceling task won't be able to clear CANCELING as 3442 * we're hogging the CPU. 3443 * 3444 * Let's wait for completion using a waitqueue. As this 3445 * may lead to the thundering herd problem, use a custom 3446 * wake function which matches @work along with exclusive 3447 * wait and wakeup. 3448 */ 3449 if (unlikely(ret == -ENOENT)) { 3450 struct cwt_wait cwait; 3451 3452 init_wait(&cwait.wait); 3453 cwait.wait.func = cwt_wakefn; 3454 cwait.work = work; 3455 3456 prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, 3457 TASK_UNINTERRUPTIBLE); 3458 if (work_is_canceling(work)) 3459 schedule(); 3460 finish_wait(&cancel_waitq, &cwait.wait); 3461 } 3462 } while (unlikely(ret < 0)); 3463 3464 /* tell other tasks trying to grab @work to back off */ 3465 mark_work_canceling(work); 3466 local_irq_restore(flags); 3467 3468 /* 3469 * This allows canceling during early boot. We know that @work 3470 * isn't executing. 3471 */ 3472 if (wq_online) 3473 __flush_work(work, true); 3474 3475 clear_work_data(work); 3476 3477 /* 3478 * Paired with prepare_to_wait() above so that either 3479 * waitqueue_active() is visible here or !work_is_canceling() is 3480 * visible there. 3481 */ 3482 smp_mb(); 3483 if (waitqueue_active(&cancel_waitq)) 3484 __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); 3485 3486 return ret; 3487 } 3488 3489 /** 3490 * cancel_work_sync - cancel a work and wait for it to finish 3491 * @work: the work to cancel 3492 * 3493 * Cancel @work and wait for its execution to finish. This function 3494 * can be used even if the work re-queues itself or migrates to 3495 * another workqueue. On return from this function, @work is 3496 * guaranteed to be not pending or executing on any CPU. 3497 * 3498 * cancel_work_sync(&delayed_work->work) must not be used for 3499 * delayed_work's. Use cancel_delayed_work_sync() instead. 3500 * 3501 * The caller must ensure that the workqueue on which @work was last 3502 * queued can't be destroyed before this function returns. 3503 * 3504 * Return: 3505 * %true if @work was pending, %false otherwise. 
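 *
 * Illustrative sketch (hypothetical names), e.g. in a driver's remove path:
 *
 *	cancel_work_sync(&priv->irq_work);
 *	free_irq(priv->irq, priv);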
3506 */ 3507 bool cancel_work_sync(struct work_struct *work) 3508 { 3509 return __cancel_work_timer(work, false); 3510 } 3511 EXPORT_SYMBOL_GPL(cancel_work_sync); 3512 3513 /** 3514 * flush_delayed_work - wait for a dwork to finish executing the last queueing 3515 * @dwork: the delayed work to flush 3516 * 3517 * Delayed timer is cancelled and the pending work is queued for 3518 * immediate execution. Like flush_work(), this function only 3519 * considers the last queueing instance of @dwork. 3520 * 3521 * Return: 3522 * %true if flush_work() waited for the work to finish execution, 3523 * %false if it was already idle. 3524 */ 3525 bool flush_delayed_work(struct delayed_work *dwork) 3526 { 3527 local_irq_disable(); 3528 if (del_timer_sync(&dwork->timer)) 3529 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 3530 local_irq_enable(); 3531 return flush_work(&dwork->work); 3532 } 3533 EXPORT_SYMBOL(flush_delayed_work); 3534 3535 /** 3536 * flush_rcu_work - wait for a rwork to finish executing the last queueing 3537 * @rwork: the rcu work to flush 3538 * 3539 * Return: 3540 * %true if flush_rcu_work() waited for the work to finish execution, 3541 * %false if it was already idle. 3542 */ 3543 bool flush_rcu_work(struct rcu_work *rwork) 3544 { 3545 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { 3546 rcu_barrier(); 3547 flush_work(&rwork->work); 3548 return true; 3549 } else { 3550 return flush_work(&rwork->work); 3551 } 3552 } 3553 EXPORT_SYMBOL(flush_rcu_work); 3554 3555 static bool __cancel_work(struct work_struct *work, bool is_dwork) 3556 { 3557 unsigned long flags; 3558 int ret; 3559 3560 do { 3561 ret = try_to_grab_pending(work, is_dwork, &flags); 3562 } while (unlikely(ret == -EAGAIN)); 3563 3564 if (unlikely(ret < 0)) 3565 return false; 3566 3567 set_work_pool_and_clear_pending(work, get_work_pool_id(work)); 3568 local_irq_restore(flags); 3569 return ret; 3570 } 3571 3572 /* 3573 * See cancel_delayed_work() 3574 */ 3575 bool cancel_work(struct work_struct *work) 3576 { 3577 return __cancel_work(work, false); 3578 } 3579 EXPORT_SYMBOL(cancel_work); 3580 3581 /** 3582 * cancel_delayed_work - cancel a delayed work 3583 * @dwork: delayed_work to cancel 3584 * 3585 * Kill off a pending delayed_work. 3586 * 3587 * Return: %true if @dwork was pending and canceled; %false if it wasn't 3588 * pending. 3589 * 3590 * Note: 3591 * The work callback function may still be running on return, unless 3592 * it returns %true and the work doesn't re-arm itself. Explicitly flush or 3593 * use cancel_delayed_work_sync() to wait on it. 3594 * 3595 * This function is safe to call from any context including IRQ handler. 3596 */ 3597 bool cancel_delayed_work(struct delayed_work *dwork) 3598 { 3599 return __cancel_work(&dwork->work, true); 3600 } 3601 EXPORT_SYMBOL(cancel_delayed_work); 3602 3603 /** 3604 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 3605 * @dwork: the delayed work cancel 3606 * 3607 * This is cancel_work_sync() for delayed works. 3608 * 3609 * Return: 3610 * %true if @dwork was pending, %false otherwise. 
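 *
 * Illustrative sketch (hypothetical names, not part of the original
 * comment):
 *
 *	INIT_DELAYED_WORK(&dev->poll_work, my_poll_fn);
 *	schedule_delayed_work(&dev->poll_work, msecs_to_jiffies(100));
 *	...
 *	cancel_delayed_work_sync(&dev->poll_work);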
3611 */ 3612 bool cancel_delayed_work_sync(struct delayed_work *dwork) 3613 { 3614 return __cancel_work_timer(&dwork->work, true); 3615 } 3616 EXPORT_SYMBOL(cancel_delayed_work_sync); 3617 3618 /** 3619 * schedule_on_each_cpu - execute a function synchronously on each online CPU 3620 * @func: the function to call 3621 * 3622 * schedule_on_each_cpu() executes @func on each online CPU using the 3623 * system workqueue and blocks until all CPUs have completed. 3624 * schedule_on_each_cpu() is very slow. 3625 * 3626 * Return: 3627 * 0 on success, -errno on failure. 3628 */ 3629 int schedule_on_each_cpu(work_func_t func) 3630 { 3631 int cpu; 3632 struct work_struct __percpu *works; 3633 3634 works = alloc_percpu(struct work_struct); 3635 if (!works) 3636 return -ENOMEM; 3637 3638 cpus_read_lock(); 3639 3640 for_each_online_cpu(cpu) { 3641 struct work_struct *work = per_cpu_ptr(works, cpu); 3642 3643 INIT_WORK(work, func); 3644 schedule_work_on(cpu, work); 3645 } 3646 3647 for_each_online_cpu(cpu) 3648 flush_work(per_cpu_ptr(works, cpu)); 3649 3650 cpus_read_unlock(); 3651 free_percpu(works); 3652 return 0; 3653 } 3654 3655 /** 3656 * execute_in_process_context - reliably execute the routine with user context 3657 * @fn: the function to execute 3658 * @ew: guaranteed storage for the execute work structure (must 3659 * be available when the work executes) 3660 * 3661 * Executes the function immediately if process context is available, 3662 * otherwise schedules the function for delayed execution. 3663 * 3664 * Return: 0 - function was executed 3665 * 1 - function was scheduled for execution 3666 */ 3667 int execute_in_process_context(work_func_t fn, struct execute_work *ew) 3668 { 3669 if (!in_interrupt()) { 3670 fn(&ew->work); 3671 return 0; 3672 } 3673 3674 INIT_WORK(&ew->work, fn); 3675 schedule_work(&ew->work); 3676 3677 return 1; 3678 } 3679 EXPORT_SYMBOL_GPL(execute_in_process_context); 3680 3681 /** 3682 * free_workqueue_attrs - free a workqueue_attrs 3683 * @attrs: workqueue_attrs to free 3684 * 3685 * Undo alloc_workqueue_attrs(). 3686 */ 3687 void free_workqueue_attrs(struct workqueue_attrs *attrs) 3688 { 3689 if (attrs) { 3690 free_cpumask_var(attrs->cpumask); 3691 kfree(attrs); 3692 } 3693 } 3694 3695 /** 3696 * alloc_workqueue_attrs - allocate a workqueue_attrs 3697 * 3698 * Allocate a new workqueue_attrs, initialize with default settings and 3699 * return it. 3700 * 3701 * Return: The allocated new workqueue_attr on success. %NULL on failure. 3702 */ 3703 struct workqueue_attrs *alloc_workqueue_attrs(void) 3704 { 3705 struct workqueue_attrs *attrs; 3706 3707 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 3708 if (!attrs) 3709 goto fail; 3710 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) 3711 goto fail; 3712 3713 cpumask_copy(attrs->cpumask, cpu_possible_mask); 3714 return attrs; 3715 fail: 3716 free_workqueue_attrs(attrs); 3717 return NULL; 3718 } 3719 3720 static void copy_workqueue_attrs(struct workqueue_attrs *to, 3721 const struct workqueue_attrs *from) 3722 { 3723 to->nice = from->nice; 3724 cpumask_copy(to->cpumask, from->cpumask); 3725 /* 3726 * Unlike hash and equality test, this function doesn't ignore 3727 * ->no_numa as it is used for both pool and wq attrs. Instead, 3728 * get_unbound_pool() explicitly clears ->no_numa after copying. 
3729 */ 3730 to->no_numa = from->no_numa; 3731 } 3732 3733 /* hash value of the content of @attr */ 3734 static u32 wqattrs_hash(const struct workqueue_attrs *attrs) 3735 { 3736 u32 hash = 0; 3737 3738 hash = jhash_1word(attrs->nice, hash); 3739 hash = jhash(cpumask_bits(attrs->cpumask), 3740 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); 3741 return hash; 3742 } 3743 3744 /* content equality test */ 3745 static bool wqattrs_equal(const struct workqueue_attrs *a, 3746 const struct workqueue_attrs *b) 3747 { 3748 if (a->nice != b->nice) 3749 return false; 3750 if (!cpumask_equal(a->cpumask, b->cpumask)) 3751 return false; 3752 return true; 3753 } 3754 3755 /** 3756 * init_worker_pool - initialize a newly zalloc'd worker_pool 3757 * @pool: worker_pool to initialize 3758 * 3759 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. 3760 * 3761 * Return: 0 on success, -errno on failure. Even on failure, all fields 3762 * inside @pool proper are initialized and put_unbound_pool() can be called 3763 * on @pool safely to release it. 3764 */ 3765 static int init_worker_pool(struct worker_pool *pool) 3766 { 3767 raw_spin_lock_init(&pool->lock); 3768 pool->id = -1; 3769 pool->cpu = -1; 3770 pool->node = NUMA_NO_NODE; 3771 pool->flags |= POOL_DISASSOCIATED; 3772 pool->watchdog_ts = jiffies; 3773 INIT_LIST_HEAD(&pool->worklist); 3774 INIT_LIST_HEAD(&pool->idle_list); 3775 hash_init(pool->busy_hash); 3776 3777 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); 3778 INIT_WORK(&pool->idle_cull_work, idle_cull_fn); 3779 3780 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); 3781 3782 INIT_LIST_HEAD(&pool->workers); 3783 INIT_LIST_HEAD(&pool->dying_workers); 3784 3785 ida_init(&pool->worker_ida); 3786 INIT_HLIST_NODE(&pool->hash_node); 3787 pool->refcnt = 1; 3788 3789 /* shouldn't fail above this point */ 3790 pool->attrs = alloc_workqueue_attrs(); 3791 if (!pool->attrs) 3792 return -ENOMEM; 3793 return 0; 3794 } 3795 3796 #ifdef CONFIG_LOCKDEP 3797 static void wq_init_lockdep(struct workqueue_struct *wq) 3798 { 3799 char *lock_name; 3800 3801 lockdep_register_key(&wq->key); 3802 lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); 3803 if (!lock_name) 3804 lock_name = wq->name; 3805 3806 wq->lock_name = lock_name; 3807 lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); 3808 } 3809 3810 static void wq_unregister_lockdep(struct workqueue_struct *wq) 3811 { 3812 lockdep_unregister_key(&wq->key); 3813 } 3814 3815 static void wq_free_lockdep(struct workqueue_struct *wq) 3816 { 3817 if (wq->lock_name != wq->name) 3818 kfree(wq->lock_name); 3819 } 3820 #else 3821 static void wq_init_lockdep(struct workqueue_struct *wq) 3822 { 3823 } 3824 3825 static void wq_unregister_lockdep(struct workqueue_struct *wq) 3826 { 3827 } 3828 3829 static void wq_free_lockdep(struct workqueue_struct *wq) 3830 { 3831 } 3832 #endif 3833 3834 static void rcu_free_wq(struct rcu_head *rcu) 3835 { 3836 struct workqueue_struct *wq = 3837 container_of(rcu, struct workqueue_struct, rcu); 3838 3839 wq_free_lockdep(wq); 3840 3841 if (!(wq->flags & WQ_UNBOUND)) 3842 free_percpu(wq->cpu_pwqs); 3843 else 3844 free_workqueue_attrs(wq->unbound_attrs); 3845 3846 kfree(wq); 3847 } 3848 3849 static void rcu_free_pool(struct rcu_head *rcu) 3850 { 3851 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3852 3853 ida_destroy(&pool->worker_ida); 3854 free_workqueue_attrs(pool->attrs); 3855 kfree(pool); 3856 } 3857 3858 /** 3859 * put_unbound_pool - put a 
worker_pool 3860 * @pool: worker_pool to put 3861 * 3862 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU 3863 * safe manner. get_unbound_pool() calls this function on its failure path 3864 * and this function should be able to release pools which went through, 3865 * successfully or not, init_worker_pool(). 3866 * 3867 * Should be called with wq_pool_mutex held. 3868 */ 3869 static void put_unbound_pool(struct worker_pool *pool) 3870 { 3871 DECLARE_COMPLETION_ONSTACK(detach_completion); 3872 struct list_head cull_list; 3873 struct worker *worker; 3874 3875 INIT_LIST_HEAD(&cull_list); 3876 3877 lockdep_assert_held(&wq_pool_mutex); 3878 3879 if (--pool->refcnt) 3880 return; 3881 3882 /* sanity checks */ 3883 if (WARN_ON(!(pool->cpu < 0)) || 3884 WARN_ON(!list_empty(&pool->worklist))) 3885 return; 3886 3887 /* release id and unhash */ 3888 if (pool->id >= 0) 3889 idr_remove(&worker_pool_idr, pool->id); 3890 hash_del(&pool->hash_node); 3891 3892 /* 3893 * Become the manager and destroy all workers. This prevents 3894 * @pool's workers from blocking on attach_mutex. We're the last 3895 * manager and @pool gets freed with the flag set. 3896 * 3897 * Having a concurrent manager is quite unlikely to happen as we can 3898 * only get here with 3899 * pwq->refcnt == pool->refcnt == 0 3900 * which implies no work queued to the pool, which implies no worker can 3901 * become the manager. However a worker could have taken the role of 3902 * manager before the refcnts dropped to 0, since maybe_create_worker() 3903 * drops pool->lock 3904 */ 3905 while (true) { 3906 rcuwait_wait_event(&manager_wait, 3907 !(pool->flags & POOL_MANAGER_ACTIVE), 3908 TASK_UNINTERRUPTIBLE); 3909 3910 mutex_lock(&wq_pool_attach_mutex); 3911 raw_spin_lock_irq(&pool->lock); 3912 if (!(pool->flags & POOL_MANAGER_ACTIVE)) { 3913 pool->flags |= POOL_MANAGER_ACTIVE; 3914 break; 3915 } 3916 raw_spin_unlock_irq(&pool->lock); 3917 mutex_unlock(&wq_pool_attach_mutex); 3918 } 3919 3920 while ((worker = first_idle_worker(pool))) 3921 set_worker_dying(worker, &cull_list); 3922 WARN_ON(pool->nr_workers || pool->nr_idle); 3923 raw_spin_unlock_irq(&pool->lock); 3924 3925 wake_dying_workers(&cull_list); 3926 3927 if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) 3928 pool->detach_completion = &detach_completion; 3929 mutex_unlock(&wq_pool_attach_mutex); 3930 3931 if (pool->detach_completion) 3932 wait_for_completion(pool->detach_completion); 3933 3934 /* shut down the timers */ 3935 del_timer_sync(&pool->idle_timer); 3936 cancel_work_sync(&pool->idle_cull_work); 3937 del_timer_sync(&pool->mayday_timer); 3938 3939 /* RCU protected to allow dereferences from get_work_pool() */ 3940 call_rcu(&pool->rcu, rcu_free_pool); 3941 } 3942 3943 /** 3944 * get_unbound_pool - get a worker_pool with the specified attributes 3945 * @attrs: the attributes of the worker_pool to get 3946 * 3947 * Obtain a worker_pool which has the same attributes as @attrs, bump the 3948 * reference count and return it. If there already is a matching 3949 * worker_pool, it will be used; otherwise, this function attempts to 3950 * create a new one. 3951 * 3952 * Should be called with wq_pool_mutex held. 3953 * 3954 * Return: On success, a worker_pool with the same attributes as @attrs. 3955 * On failure, %NULL. 
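 *
 * A pool obtained here must eventually be released with
 * put_unbound_pool(); see alloc_unbound_pwq() below for the typical
 * get/put pairing.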
3956 */ 3957 static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 3958 { 3959 u32 hash = wqattrs_hash(attrs); 3960 struct worker_pool *pool; 3961 int node; 3962 int target_node = NUMA_NO_NODE; 3963 3964 lockdep_assert_held(&wq_pool_mutex); 3965 3966 /* do we already have a matching pool? */ 3967 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 3968 if (wqattrs_equal(pool->attrs, attrs)) { 3969 pool->refcnt++; 3970 return pool; 3971 } 3972 } 3973 3974 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3975 if (wq_numa_enabled) { 3976 for_each_node(node) { 3977 if (cpumask_subset(attrs->cpumask, 3978 wq_numa_possible_cpumask[node])) { 3979 target_node = node; 3980 break; 3981 } 3982 } 3983 } 3984 3985 /* nope, create a new one */ 3986 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); 3987 if (!pool || init_worker_pool(pool) < 0) 3988 goto fail; 3989 3990 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3991 copy_workqueue_attrs(pool->attrs, attrs); 3992 pool->node = target_node; 3993 3994 /* 3995 * no_numa isn't a worker_pool attribute, always clear it. See 3996 * 'struct workqueue_attrs' comments for detail. 3997 */ 3998 pool->attrs->no_numa = false; 3999 4000 if (worker_pool_assign_id(pool) < 0) 4001 goto fail; 4002 4003 /* create and start the initial worker */ 4004 if (wq_online && !create_worker(pool)) 4005 goto fail; 4006 4007 /* install */ 4008 hash_add(unbound_pool_hash, &pool->hash_node, hash); 4009 4010 return pool; 4011 fail: 4012 if (pool) 4013 put_unbound_pool(pool); 4014 return NULL; 4015 } 4016 4017 static void rcu_free_pwq(struct rcu_head *rcu) 4018 { 4019 kmem_cache_free(pwq_cache, 4020 container_of(rcu, struct pool_workqueue, rcu)); 4021 } 4022 4023 /* 4024 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt 4025 * and needs to be destroyed. 4026 */ 4027 static void pwq_unbound_release_workfn(struct work_struct *work) 4028 { 4029 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, 4030 unbound_release_work); 4031 struct workqueue_struct *wq = pwq->wq; 4032 struct worker_pool *pool = pwq->pool; 4033 bool is_last = false; 4034 4035 /* 4036 * when @pwq is not linked, it doesn't hold any reference to the 4037 * @wq, and @wq is invalid to access. 4038 */ 4039 if (!list_empty(&pwq->pwqs_node)) { 4040 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) 4041 return; 4042 4043 mutex_lock(&wq->mutex); 4044 list_del_rcu(&pwq->pwqs_node); 4045 is_last = list_empty(&wq->pwqs); 4046 mutex_unlock(&wq->mutex); 4047 } 4048 4049 mutex_lock(&wq_pool_mutex); 4050 put_unbound_pool(pool); 4051 mutex_unlock(&wq_pool_mutex); 4052 4053 call_rcu(&pwq->rcu, rcu_free_pwq); 4054 4055 /* 4056 * If we're the last pwq going away, @wq is already dead and no one 4057 * is gonna access it anymore. Schedule RCU free. 4058 */ 4059 if (is_last) { 4060 wq_unregister_lockdep(wq); 4061 call_rcu(&wq->rcu, rcu_free_wq); 4062 } 4063 } 4064 4065 /** 4066 * pwq_adjust_max_active - update a pwq's max_active to the current setting 4067 * @pwq: target pool_workqueue 4068 * 4069 * If @pwq isn't freezing, set @pwq->max_active to the associated 4070 * workqueue's saved_max_active and activate inactive work items 4071 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. 
4072 */ 4073 static void pwq_adjust_max_active(struct pool_workqueue *pwq) 4074 { 4075 struct workqueue_struct *wq = pwq->wq; 4076 bool freezable = wq->flags & WQ_FREEZABLE; 4077 unsigned long flags; 4078 4079 /* for @wq->saved_max_active */ 4080 lockdep_assert_held(&wq->mutex); 4081 4082 /* fast exit for non-freezable wqs */ 4083 if (!freezable && pwq->max_active == wq->saved_max_active) 4084 return; 4085 4086 /* this function can be called during early boot w/ irq disabled */ 4087 raw_spin_lock_irqsave(&pwq->pool->lock, flags); 4088 4089 /* 4090 * During [un]freezing, the caller is responsible for ensuring that 4091 * this function is called at least once after @workqueue_freezing 4092 * is updated and visible. 4093 */ 4094 if (!freezable || !workqueue_freezing) { 4095 bool kick = false; 4096 4097 pwq->max_active = wq->saved_max_active; 4098 4099 while (!list_empty(&pwq->inactive_works) && 4100 pwq->nr_active < pwq->max_active) { 4101 pwq_activate_first_inactive(pwq); 4102 kick = true; 4103 } 4104 4105 /* 4106 * Need to kick a worker after thawed or an unbound wq's 4107 * max_active is bumped. In realtime scenarios, always kicking a 4108 * worker will cause interference on the isolated cpu cores, so 4109 * let's kick iff work items were activated. 4110 */ 4111 if (kick) 4112 wake_up_worker(pwq->pool); 4113 } else { 4114 pwq->max_active = 0; 4115 } 4116 4117 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); 4118 } 4119 4120 /* initialize newly allocated @pwq which is associated with @wq and @pool */ 4121 static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, 4122 struct worker_pool *pool) 4123 { 4124 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); 4125 4126 memset(pwq, 0, sizeof(*pwq)); 4127 4128 pwq->pool = pool; 4129 pwq->wq = wq; 4130 pwq->flush_color = -1; 4131 pwq->refcnt = 1; 4132 INIT_LIST_HEAD(&pwq->inactive_works); 4133 INIT_LIST_HEAD(&pwq->pwqs_node); 4134 INIT_LIST_HEAD(&pwq->mayday_node); 4135 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); 4136 } 4137 4138 /* sync @pwq with the current state of its associated wq and link it */ 4139 static void link_pwq(struct pool_workqueue *pwq) 4140 { 4141 struct workqueue_struct *wq = pwq->wq; 4142 4143 lockdep_assert_held(&wq->mutex); 4144 4145 /* may be called multiple times, ignore if already linked */ 4146 if (!list_empty(&pwq->pwqs_node)) 4147 return; 4148 4149 /* set the matching work_color */ 4150 pwq->work_color = wq->work_color; 4151 4152 /* sync max_active to the current setting */ 4153 pwq_adjust_max_active(pwq); 4154 4155 /* link in @pwq */ 4156 list_add_rcu(&pwq->pwqs_node, &wq->pwqs); 4157 } 4158 4159 /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ 4160 static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, 4161 const struct workqueue_attrs *attrs) 4162 { 4163 struct worker_pool *pool; 4164 struct pool_workqueue *pwq; 4165 4166 lockdep_assert_held(&wq_pool_mutex); 4167 4168 pool = get_unbound_pool(attrs); 4169 if (!pool) 4170 return NULL; 4171 4172 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); 4173 if (!pwq) { 4174 put_unbound_pool(pool); 4175 return NULL; 4176 } 4177 4178 init_pwq(pwq, wq, pool); 4179 return pwq; 4180 } 4181 4182 /** 4183 * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node 4184 * @attrs: the wq_attrs of the default pwq of the target workqueue 4185 * @node: the target NUMA node 4186 * @cpu_going_down: if >= 0, the CPU to consider as offline 4187 * @cpumask: 
outarg, the resulting cpumask 4188 * 4189 * Calculate the cpumask a workqueue with @attrs should use on @node. If 4190 * @cpu_going_down is >= 0, that cpu is considered offline during 4191 * calculation. The result is stored in @cpumask. 4192 * 4193 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If 4194 * enabled and @node has online CPUs requested by @attrs, the returned 4195 * cpumask is the intersection of the possible CPUs of @node and 4196 * @attrs->cpumask. 4197 * 4198 * The caller is responsible for ensuring that the cpumask of @node stays 4199 * stable. 4200 * 4201 * Return: %true if the resulting @cpumask is different from @attrs->cpumask, 4202 * %false if equal. 4203 */ 4204 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, 4205 int cpu_going_down, cpumask_t *cpumask) 4206 { 4207 if (!wq_numa_enabled || attrs->no_numa) 4208 goto use_dfl; 4209 4210 /* does @node have any online CPUs @attrs wants? */ 4211 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); 4212 if (cpu_going_down >= 0) 4213 cpumask_clear_cpu(cpu_going_down, cpumask); 4214 4215 if (cpumask_empty(cpumask)) 4216 goto use_dfl; 4217 4218 /* yeap, return possible CPUs in @node that @attrs wants */ 4219 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); 4220 4221 if (cpumask_empty(cpumask)) { 4222 pr_warn_once("WARNING: workqueue cpumask: online intersect > " 4223 "possible intersect\n"); 4224 return false; 4225 } 4226 4227 return !cpumask_equal(cpumask, attrs->cpumask); 4228 4229 use_dfl: 4230 cpumask_copy(cpumask, attrs->cpumask); 4231 return false; 4232 } 4233 4234 /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ 4235 static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, 4236 int node, 4237 struct pool_workqueue *pwq) 4238 { 4239 struct pool_workqueue *old_pwq; 4240 4241 lockdep_assert_held(&wq_pool_mutex); 4242 lockdep_assert_held(&wq->mutex); 4243 4244 /* link_pwq() can handle duplicate calls */ 4245 link_pwq(pwq); 4246 4247 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); 4248 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); 4249 return old_pwq; 4250 } 4251 4252 /* context to store the prepared attrs & pwqs before applying */ 4253 struct apply_wqattrs_ctx { 4254 struct workqueue_struct *wq; /* target workqueue */ 4255 struct workqueue_attrs *attrs; /* attrs to apply */ 4256 struct list_head list; /* queued for batching commit */ 4257 struct pool_workqueue *dfl_pwq; 4258 struct pool_workqueue *pwq_tbl[]; 4259 }; 4260 4261 /* free the resources after success or abort */ 4262 static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) 4263 { 4264 if (ctx) { 4265 int node; 4266 4267 for_each_node(node) 4268 put_pwq_unlocked(ctx->pwq_tbl[node]); 4269 put_pwq_unlocked(ctx->dfl_pwq); 4270 4271 free_workqueue_attrs(ctx->attrs); 4272 4273 kfree(ctx); 4274 } 4275 } 4276 4277 /* allocate the attrs and pwqs for later installation */ 4278 static struct apply_wqattrs_ctx * 4279 apply_wqattrs_prepare(struct workqueue_struct *wq, 4280 const struct workqueue_attrs *attrs, 4281 const cpumask_var_t unbound_cpumask) 4282 { 4283 struct apply_wqattrs_ctx *ctx; 4284 struct workqueue_attrs *new_attrs, *tmp_attrs; 4285 int node; 4286 4287 lockdep_assert_held(&wq_pool_mutex); 4288 4289 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); 4290 4291 new_attrs = alloc_workqueue_attrs(); 4292 tmp_attrs = alloc_workqueue_attrs(); 4293 if (!ctx || !new_attrs || !tmp_attrs) 4294 goto out_free; 
4295 4296 /* 4297 * Calculate the attrs of the default pwq with unbound_cpumask 4298 * which is wq_unbound_cpumask or to set to wq_unbound_cpumask. 4299 * If the user configured cpumask doesn't overlap with the 4300 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. 4301 */ 4302 copy_workqueue_attrs(new_attrs, attrs); 4303 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask); 4304 if (unlikely(cpumask_empty(new_attrs->cpumask))) 4305 cpumask_copy(new_attrs->cpumask, unbound_cpumask); 4306 4307 /* 4308 * We may create multiple pwqs with differing cpumasks. Make a 4309 * copy of @new_attrs which will be modified and used to obtain 4310 * pools. 4311 */ 4312 copy_workqueue_attrs(tmp_attrs, new_attrs); 4313 4314 /* 4315 * If something goes wrong during CPU up/down, we'll fall back to 4316 * the default pwq covering whole @attrs->cpumask. Always create 4317 * it even if we don't use it immediately. 4318 */ 4319 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); 4320 if (!ctx->dfl_pwq) 4321 goto out_free; 4322 4323 for_each_node(node) { 4324 if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { 4325 ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); 4326 if (!ctx->pwq_tbl[node]) 4327 goto out_free; 4328 } else { 4329 ctx->dfl_pwq->refcnt++; 4330 ctx->pwq_tbl[node] = ctx->dfl_pwq; 4331 } 4332 } 4333 4334 /* save the user configured attrs and sanitize it. */ 4335 copy_workqueue_attrs(new_attrs, attrs); 4336 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); 4337 ctx->attrs = new_attrs; 4338 4339 ctx->wq = wq; 4340 free_workqueue_attrs(tmp_attrs); 4341 return ctx; 4342 4343 out_free: 4344 free_workqueue_attrs(tmp_attrs); 4345 free_workqueue_attrs(new_attrs); 4346 apply_wqattrs_cleanup(ctx); 4347 return NULL; 4348 } 4349 4350 /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ 4351 static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) 4352 { 4353 int node; 4354 4355 /* all pwqs have been created successfully, let's install'em */ 4356 mutex_lock(&ctx->wq->mutex); 4357 4358 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); 4359 4360 /* save the previous pwq and install the new one */ 4361 for_each_node(node) 4362 ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, 4363 ctx->pwq_tbl[node]); 4364 4365 /* @dfl_pwq might not have been used, ensure it's linked */ 4366 link_pwq(ctx->dfl_pwq); 4367 swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); 4368 4369 mutex_unlock(&ctx->wq->mutex); 4370 } 4371 4372 static void apply_wqattrs_lock(void) 4373 { 4374 /* CPUs should stay stable across pwq creations and installations */ 4375 cpus_read_lock(); 4376 mutex_lock(&wq_pool_mutex); 4377 } 4378 4379 static void apply_wqattrs_unlock(void) 4380 { 4381 mutex_unlock(&wq_pool_mutex); 4382 cpus_read_unlock(); 4383 } 4384 4385 static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, 4386 const struct workqueue_attrs *attrs) 4387 { 4388 struct apply_wqattrs_ctx *ctx; 4389 4390 /* only unbound workqueues can change attributes */ 4391 if (WARN_ON(!(wq->flags & WQ_UNBOUND))) 4392 return -EINVAL; 4393 4394 /* creating multiple pwqs breaks ordering guarantee */ 4395 if (!list_empty(&wq->pwqs)) { 4396 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 4397 return -EINVAL; 4398 4399 wq->flags &= ~__WQ_ORDERED; 4400 } 4401 4402 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); 4403 if (!ctx) 4404 return -ENOMEM; 4405 4406 /* the ctx has been prepared successfully, let's commit it */ 4407 apply_wqattrs_commit(ctx); 
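	/*
	 * After the commit, ctx->pwq_tbl[] and ctx->dfl_pwq point at the
	 * pwqs that were replaced, so the cleanup below drops the old
	 * references rather than the freshly installed ones.
	 */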
4408 apply_wqattrs_cleanup(ctx); 4409 4410 return 0; 4411 } 4412 4413 /** 4414 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue 4415 * @wq: the target workqueue 4416 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() 4417 * 4418 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA 4419 * machines, this function maps a separate pwq to each NUMA node with 4420 * possibles CPUs in @attrs->cpumask so that work items are affine to the 4421 * NUMA node it was issued on. Older pwqs are released as in-flight work 4422 * items finish. Note that a work item which repeatedly requeues itself 4423 * back-to-back will stay on its current pwq. 4424 * 4425 * Performs GFP_KERNEL allocations. 4426 * 4427 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). 4428 * 4429 * Return: 0 on success and -errno on failure. 4430 */ 4431 int apply_workqueue_attrs(struct workqueue_struct *wq, 4432 const struct workqueue_attrs *attrs) 4433 { 4434 int ret; 4435 4436 lockdep_assert_cpus_held(); 4437 4438 mutex_lock(&wq_pool_mutex); 4439 ret = apply_workqueue_attrs_locked(wq, attrs); 4440 mutex_unlock(&wq_pool_mutex); 4441 4442 return ret; 4443 } 4444 4445 /** 4446 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug 4447 * @wq: the target workqueue 4448 * @cpu: the CPU coming up or going down 4449 * @online: whether @cpu is coming up or going down 4450 * 4451 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and 4452 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of 4453 * @wq accordingly. 4454 * 4455 * If NUMA affinity can't be adjusted due to memory allocation failure, it 4456 * falls back to @wq->dfl_pwq which may not be optimal but is always 4457 * correct. 4458 * 4459 * Note that when the last allowed CPU of a NUMA node goes offline for a 4460 * workqueue with a cpumask spanning multiple nodes, the workers which were 4461 * already executing the work items for the workqueue will lose their CPU 4462 * affinity and may execute on any CPU. This is similar to how per-cpu 4463 * workqueues behave on CPU_DOWN. If a workqueue user wants strict 4464 * affinity, it's the user's responsibility to flush the work item from 4465 * CPU_DOWN_PREPARE. 4466 */ 4467 static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, 4468 bool online) 4469 { 4470 int node = cpu_to_node(cpu); 4471 int cpu_off = online ? -1 : cpu; 4472 struct pool_workqueue *old_pwq = NULL, *pwq; 4473 struct workqueue_attrs *target_attrs; 4474 cpumask_t *cpumask; 4475 4476 lockdep_assert_held(&wq_pool_mutex); 4477 4478 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || 4479 wq->unbound_attrs->no_numa) 4480 return; 4481 4482 /* 4483 * We don't wanna alloc/free wq_attrs for each wq for each CPU. 4484 * Let's use a preallocated one. The following buf is protected by 4485 * CPU hotplug exclusion. 4486 */ 4487 target_attrs = wq_update_unbound_numa_attrs_buf; 4488 cpumask = target_attrs->cpumask; 4489 4490 copy_workqueue_attrs(target_attrs, wq->unbound_attrs); 4491 pwq = unbound_pwq_by_node(wq, node); 4492 4493 /* 4494 * Let's determine what needs to be done. If the target cpumask is 4495 * different from the default pwq's, we need to compare it to @pwq's 4496 * and create a new one if they don't match. If the target cpumask 4497 * equals the default pwq's, the default pwq should be used. 
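	 *
	 * In short: node cpumask == default pwq's -> fall back to dfl_pwq;
	 * node cpumask == current pwq's -> nothing to do; otherwise ->
	 * allocate a fresh pwq for this node.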
4498 */ 4499 if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { 4500 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) 4501 return; 4502 } else { 4503 goto use_dfl_pwq; 4504 } 4505 4506 /* create a new pwq */ 4507 pwq = alloc_unbound_pwq(wq, target_attrs); 4508 if (!pwq) { 4509 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", 4510 wq->name); 4511 goto use_dfl_pwq; 4512 } 4513 4514 /* Install the new pwq. */ 4515 mutex_lock(&wq->mutex); 4516 old_pwq = numa_pwq_tbl_install(wq, node, pwq); 4517 goto out_unlock; 4518 4519 use_dfl_pwq: 4520 mutex_lock(&wq->mutex); 4521 raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); 4522 get_pwq(wq->dfl_pwq); 4523 raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); 4524 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); 4525 out_unlock: 4526 mutex_unlock(&wq->mutex); 4527 put_pwq_unlocked(old_pwq); 4528 } 4529 4530 static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4531 { 4532 bool highpri = wq->flags & WQ_HIGHPRI; 4533 int cpu, ret; 4534 4535 if (!(wq->flags & WQ_UNBOUND)) { 4536 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4537 if (!wq->cpu_pwqs) 4538 return -ENOMEM; 4539 4540 for_each_possible_cpu(cpu) { 4541 struct pool_workqueue *pwq = 4542 per_cpu_ptr(wq->cpu_pwqs, cpu); 4543 struct worker_pool *cpu_pools = 4544 per_cpu(cpu_worker_pools, cpu); 4545 4546 init_pwq(pwq, wq, &cpu_pools[highpri]); 4547 4548 mutex_lock(&wq->mutex); 4549 link_pwq(pwq); 4550 mutex_unlock(&wq->mutex); 4551 } 4552 return 0; 4553 } 4554 4555 cpus_read_lock(); 4556 if (wq->flags & __WQ_ORDERED) { 4557 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); 4558 /* there should only be single pwq for ordering guarantee */ 4559 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || 4560 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), 4561 "ordering guarantee broken for workqueue %s\n", wq->name); 4562 } else { 4563 ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4564 } 4565 cpus_read_unlock(); 4566 4567 return ret; 4568 } 4569 4570 static int wq_clamp_max_active(int max_active, unsigned int flags, 4571 const char *name) 4572 { 4573 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; 4574 4575 if (max_active < 1 || max_active > lim) 4576 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", 4577 max_active, name, 1, lim); 4578 4579 return clamp_val(max_active, 1, lim); 4580 } 4581 4582 /* 4583 * Workqueues which may be used during memory reclaim should have a rescuer 4584 * to guarantee forward progress. 
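 *
 * Illustrative sketch (hypothetical name, not part of the original
 * comment): such a workqueue is created with WQ_MEM_RECLAIM, which makes
 * init_rescuer() below attach a rescuer thread to it, e.g.
 *
 *	wq = alloc_workqueue("my_reclaim_wq", WQ_MEM_RECLAIM, 0);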
4585 */ 4586 static int init_rescuer(struct workqueue_struct *wq) 4587 { 4588 struct worker *rescuer; 4589 int ret; 4590 4591 if (!(wq->flags & WQ_MEM_RECLAIM)) 4592 return 0; 4593 4594 rescuer = alloc_worker(NUMA_NO_NODE); 4595 if (!rescuer) { 4596 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", 4597 wq->name); 4598 return -ENOMEM; 4599 } 4600 4601 rescuer->rescue_wq = wq; 4602 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); 4603 if (IS_ERR(rescuer->task)) { 4604 ret = PTR_ERR(rescuer->task); 4605 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", 4606 wq->name, ERR_PTR(ret)); 4607 kfree(rescuer); 4608 return ret; 4609 } 4610 4611 wq->rescuer = rescuer; 4612 kthread_bind_mask(rescuer->task, cpu_possible_mask); 4613 wake_up_process(rescuer->task); 4614 4615 return 0; 4616 } 4617 4618 __printf(1, 4) 4619 struct workqueue_struct *alloc_workqueue(const char *fmt, 4620 unsigned int flags, 4621 int max_active, ...) 4622 { 4623 size_t tbl_size = 0; 4624 va_list args; 4625 struct workqueue_struct *wq; 4626 struct pool_workqueue *pwq; 4627 4628 /* 4629 * Unbound && max_active == 1 used to imply ordered, which is no 4630 * longer the case on NUMA machines due to per-node pools. While 4631 * alloc_ordered_workqueue() is the right way to create an ordered 4632 * workqueue, keep the previous behavior to avoid subtle breakages 4633 * on NUMA. 4634 */ 4635 if ((flags & WQ_UNBOUND) && max_active == 1) 4636 flags |= __WQ_ORDERED; 4637 4638 /* see the comment above the definition of WQ_POWER_EFFICIENT */ 4639 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) 4640 flags |= WQ_UNBOUND; 4641 4642 /* allocate wq and format name */ 4643 if (flags & WQ_UNBOUND) 4644 tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); 4645 4646 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); 4647 if (!wq) 4648 return NULL; 4649 4650 if (flags & WQ_UNBOUND) { 4651 wq->unbound_attrs = alloc_workqueue_attrs(); 4652 if (!wq->unbound_attrs) 4653 goto err_free_wq; 4654 } 4655 4656 va_start(args, max_active); 4657 vsnprintf(wq->name, sizeof(wq->name), fmt, args); 4658 va_end(args); 4659 4660 max_active = max_active ?: WQ_DFL_ACTIVE; 4661 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4662 4663 /* init wq */ 4664 wq->flags = flags; 4665 wq->saved_max_active = max_active; 4666 mutex_init(&wq->mutex); 4667 atomic_set(&wq->nr_pwqs_to_flush, 0); 4668 INIT_LIST_HEAD(&wq->pwqs); 4669 INIT_LIST_HEAD(&wq->flusher_queue); 4670 INIT_LIST_HEAD(&wq->flusher_overflow); 4671 INIT_LIST_HEAD(&wq->maydays); 4672 4673 wq_init_lockdep(wq); 4674 INIT_LIST_HEAD(&wq->list); 4675 4676 if (alloc_and_link_pwqs(wq) < 0) 4677 goto err_unreg_lockdep; 4678 4679 if (wq_online && init_rescuer(wq) < 0) 4680 goto err_destroy; 4681 4682 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) 4683 goto err_destroy; 4684 4685 /* 4686 * wq_pool_mutex protects global freeze state and workqueues list. 4687 * Grab it, adjust max_active and add the new @wq to workqueues 4688 * list. 
4689 */ 4690 mutex_lock(&wq_pool_mutex); 4691 4692 mutex_lock(&wq->mutex); 4693 for_each_pwq(pwq, wq) 4694 pwq_adjust_max_active(pwq); 4695 mutex_unlock(&wq->mutex); 4696 4697 list_add_tail_rcu(&wq->list, &workqueues); 4698 4699 mutex_unlock(&wq_pool_mutex); 4700 4701 return wq; 4702 4703 err_unreg_lockdep: 4704 wq_unregister_lockdep(wq); 4705 wq_free_lockdep(wq); 4706 err_free_wq: 4707 free_workqueue_attrs(wq->unbound_attrs); 4708 kfree(wq); 4709 return NULL; 4710 err_destroy: 4711 destroy_workqueue(wq); 4712 return NULL; 4713 } 4714 EXPORT_SYMBOL_GPL(alloc_workqueue); 4715 4716 static bool pwq_busy(struct pool_workqueue *pwq) 4717 { 4718 int i; 4719 4720 for (i = 0; i < WORK_NR_COLORS; i++) 4721 if (pwq->nr_in_flight[i]) 4722 return true; 4723 4724 if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) 4725 return true; 4726 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) 4727 return true; 4728 4729 return false; 4730 } 4731 4732 /** 4733 * destroy_workqueue - safely terminate a workqueue 4734 * @wq: target workqueue 4735 * 4736 * Safely destroy a workqueue. All work currently pending will be done first. 4737 */ 4738 void destroy_workqueue(struct workqueue_struct *wq) 4739 { 4740 struct pool_workqueue *pwq; 4741 int node; 4742 4743 /* 4744 * Remove it from sysfs first so that sanity check failure doesn't 4745 * lead to sysfs name conflicts. 4746 */ 4747 workqueue_sysfs_unregister(wq); 4748 4749 /* mark the workqueue destruction is in progress */ 4750 mutex_lock(&wq->mutex); 4751 wq->flags |= __WQ_DESTROYING; 4752 mutex_unlock(&wq->mutex); 4753 4754 /* drain it before proceeding with destruction */ 4755 drain_workqueue(wq); 4756 4757 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ 4758 if (wq->rescuer) { 4759 struct worker *rescuer = wq->rescuer; 4760 4761 /* this prevents new queueing */ 4762 raw_spin_lock_irq(&wq_mayday_lock); 4763 wq->rescuer = NULL; 4764 raw_spin_unlock_irq(&wq_mayday_lock); 4765 4766 /* rescuer will empty maydays list before exiting */ 4767 kthread_stop(rescuer->task); 4768 kfree(rescuer); 4769 } 4770 4771 /* 4772 * Sanity checks - grab all the locks so that we wait for all 4773 * in-flight operations which may do put_pwq(). 4774 */ 4775 mutex_lock(&wq_pool_mutex); 4776 mutex_lock(&wq->mutex); 4777 for_each_pwq(pwq, wq) { 4778 raw_spin_lock_irq(&pwq->pool->lock); 4779 if (WARN_ON(pwq_busy(pwq))) { 4780 pr_warn("%s: %s has the following busy pwq\n", 4781 __func__, wq->name); 4782 show_pwq(pwq); 4783 raw_spin_unlock_irq(&pwq->pool->lock); 4784 mutex_unlock(&wq->mutex); 4785 mutex_unlock(&wq_pool_mutex); 4786 show_one_workqueue(wq); 4787 return; 4788 } 4789 raw_spin_unlock_irq(&pwq->pool->lock); 4790 } 4791 mutex_unlock(&wq->mutex); 4792 4793 /* 4794 * wq list is used to freeze wq, remove from list after 4795 * flushing is complete in case freeze races us. 4796 */ 4797 list_del_rcu(&wq->list); 4798 mutex_unlock(&wq_pool_mutex); 4799 4800 if (!(wq->flags & WQ_UNBOUND)) { 4801 wq_unregister_lockdep(wq); 4802 /* 4803 * The base ref is never dropped on per-cpu pwqs. Directly 4804 * schedule RCU free. 4805 */ 4806 call_rcu(&wq->rcu, rcu_free_wq); 4807 } else { 4808 /* 4809 * We're the sole accessor of @wq at this point. Directly 4810 * access numa_pwq_tbl[] and dfl_pwq to put the base refs. 4811 * @wq will be freed when the last pwq is released. 4812 */ 4813 for_each_node(node) { 4814 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); 4815 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); 4816 put_pwq_unlocked(pwq); 4817 } 4818 4819 /* 4820 * Put dfl_pwq. 
@wq may be freed any time after dfl_pwq is 4821 * put. Don't access it afterwards. 4822 */ 4823 pwq = wq->dfl_pwq; 4824 wq->dfl_pwq = NULL; 4825 put_pwq_unlocked(pwq); 4826 } 4827 } 4828 EXPORT_SYMBOL_GPL(destroy_workqueue); 4829 4830 /** 4831 * workqueue_set_max_active - adjust max_active of a workqueue 4832 * @wq: target workqueue 4833 * @max_active: new max_active value. 4834 * 4835 * Set max_active of @wq to @max_active. 4836 * 4837 * CONTEXT: 4838 * Don't call from IRQ context. 4839 */ 4840 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4841 { 4842 struct pool_workqueue *pwq; 4843 4844 /* disallow meddling with max_active for ordered workqueues */ 4845 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 4846 return; 4847 4848 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4849 4850 mutex_lock(&wq->mutex); 4851 4852 wq->flags &= ~__WQ_ORDERED; 4853 wq->saved_max_active = max_active; 4854 4855 for_each_pwq(pwq, wq) 4856 pwq_adjust_max_active(pwq); 4857 4858 mutex_unlock(&wq->mutex); 4859 } 4860 EXPORT_SYMBOL_GPL(workqueue_set_max_active); 4861 4862 /** 4863 * current_work - retrieve %current task's work struct 4864 * 4865 * Determine if %current task is a workqueue worker and what it's working on. 4866 * Useful to find out the context that the %current task is running in. 4867 * 4868 * Return: work struct if %current task is a workqueue worker, %NULL otherwise. 4869 */ 4870 struct work_struct *current_work(void) 4871 { 4872 struct worker *worker = current_wq_worker(); 4873 4874 return worker ? worker->current_work : NULL; 4875 } 4876 EXPORT_SYMBOL(current_work); 4877 4878 /** 4879 * current_is_workqueue_rescuer - is %current workqueue rescuer? 4880 * 4881 * Determine whether %current is a workqueue rescuer. Can be used from 4882 * work functions to determine whether it's being run off the rescuer task. 4883 * 4884 * Return: %true if %current is a workqueue rescuer. %false otherwise. 4885 */ 4886 bool current_is_workqueue_rescuer(void) 4887 { 4888 struct worker *worker = current_wq_worker(); 4889 4890 return worker && worker->rescue_wq; 4891 } 4892 4893 /** 4894 * workqueue_congested - test whether a workqueue is congested 4895 * @cpu: CPU in question 4896 * @wq: target workqueue 4897 * 4898 * Test whether @wq's cpu workqueue for @cpu is congested. There is 4899 * no synchronization around this function and the test result is 4900 * unreliable and only useful as advisory hints or for debugging. 4901 * 4902 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. 4903 * Note that both per-cpu and unbound workqueues may be associated with 4904 * multiple pool_workqueues which have separate congested states. A 4905 * workqueue being congested on one CPU doesn't mean the workqueue is also 4906 * contested on other CPUs / NUMA nodes. 4907 * 4908 * Return: 4909 * %true if congested, %false otherwise. 
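 *
 * Illustrative sketch (hypothetical names, not part of the original
 * comment): a producer may use the result as a soft back-off hint, e.g.
 *
 *	if (!workqueue_congested(WORK_CPU_UNBOUND, my_wq))
 *		queue_work(my_wq, &my_work);
 *	else
 *		my_defer_or_drop();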
4910 */ 4911 bool workqueue_congested(int cpu, struct workqueue_struct *wq) 4912 { 4913 struct pool_workqueue *pwq; 4914 bool ret; 4915 4916 rcu_read_lock(); 4917 preempt_disable(); 4918 4919 if (cpu == WORK_CPU_UNBOUND) 4920 cpu = smp_processor_id(); 4921 4922 if (!(wq->flags & WQ_UNBOUND)) 4923 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); 4924 else 4925 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); 4926 4927 ret = !list_empty(&pwq->inactive_works); 4928 preempt_enable(); 4929 rcu_read_unlock(); 4930 4931 return ret; 4932 } 4933 EXPORT_SYMBOL_GPL(workqueue_congested); 4934 4935 /** 4936 * work_busy - test whether a work is currently pending or running 4937 * @work: the work to be tested 4938 * 4939 * Test whether @work is currently pending or running. There is no 4940 * synchronization around this function and the test result is 4941 * unreliable and only useful as advisory hints or for debugging. 4942 * 4943 * Return: 4944 * OR'd bitmask of WORK_BUSY_* bits. 4945 */ 4946 unsigned int work_busy(struct work_struct *work) 4947 { 4948 struct worker_pool *pool; 4949 unsigned long flags; 4950 unsigned int ret = 0; 4951 4952 if (work_pending(work)) 4953 ret |= WORK_BUSY_PENDING; 4954 4955 rcu_read_lock(); 4956 pool = get_work_pool(work); 4957 if (pool) { 4958 raw_spin_lock_irqsave(&pool->lock, flags); 4959 if (find_worker_executing_work(pool, work)) 4960 ret |= WORK_BUSY_RUNNING; 4961 raw_spin_unlock_irqrestore(&pool->lock, flags); 4962 } 4963 rcu_read_unlock(); 4964 4965 return ret; 4966 } 4967 EXPORT_SYMBOL_GPL(work_busy); 4968 4969 /** 4970 * set_worker_desc - set description for the current work item 4971 * @fmt: printf-style format string 4972 * @...: arguments for the format string 4973 * 4974 * This function can be called by a running work function to describe what 4975 * the work item is about. If the worker task gets dumped, this 4976 * information will be printed out together to help debugging. The 4977 * description can be at most WORKER_DESC_LEN including the trailing '\0'. 4978 */ 4979 void set_worker_desc(const char *fmt, ...) 4980 { 4981 struct worker *worker = current_wq_worker(); 4982 va_list args; 4983 4984 if (worker) { 4985 va_start(args, fmt); 4986 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); 4987 va_end(args); 4988 } 4989 } 4990 EXPORT_SYMBOL_GPL(set_worker_desc); 4991 4992 /** 4993 * print_worker_info - print out worker information and description 4994 * @log_lvl: the log level to use when printing 4995 * @task: target task 4996 * 4997 * If @task is a worker and currently executing a work item, print out the 4998 * name of the workqueue being serviced and worker description set with 4999 * set_worker_desc() by the currently executing work item. 5000 * 5001 * This function can be safely called on any task as long as the 5002 * task_struct itself is accessible. While safe, this function isn't 5003 * synchronized and may print out mixups or garbages of limited length. 5004 */ 5005 void print_worker_info(const char *log_lvl, struct task_struct *task) 5006 { 5007 work_func_t *fn = NULL; 5008 char name[WQ_NAME_LEN] = { }; 5009 char desc[WORKER_DESC_LEN] = { }; 5010 struct pool_workqueue *pwq = NULL; 5011 struct workqueue_struct *wq = NULL; 5012 struct worker *worker; 5013 5014 if (!(task->flags & PF_WQ_WORKER)) 5015 return; 5016 5017 /* 5018 * This function is called without any synchronization and @task 5019 * could be in any state. Be careful with dereferences. 
5020 */ 5021 worker = kthread_probe_data(task); 5022 5023 /* 5024 * Carefully copy the associated workqueue's workfn, name and desc. 5025 * Keep the original last '\0' in case the original is garbage. 5026 */ 5027 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); 5028 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); 5029 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); 5030 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); 5031 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); 5032 5033 if (fn || name[0] || desc[0]) { 5034 printk("%sWorkqueue: %s %ps", log_lvl, name, fn); 5035 if (strcmp(name, desc)) 5036 pr_cont(" (%s)", desc); 5037 pr_cont("\n"); 5038 } 5039 } 5040 5041 static void pr_cont_pool_info(struct worker_pool *pool) 5042 { 5043 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); 5044 if (pool->node != NUMA_NO_NODE) 5045 pr_cont(" node=%d", pool->node); 5046 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); 5047 } 5048 5049 struct pr_cont_work_struct { 5050 bool comma; 5051 work_func_t func; 5052 long ctr; 5053 }; 5054 5055 static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) 5056 { 5057 if (!pcwsp->ctr) 5058 goto out_record; 5059 if (func == pcwsp->func) { 5060 pcwsp->ctr++; 5061 return; 5062 } 5063 if (pcwsp->ctr == 1) 5064 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); 5065 else 5066 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); 5067 pcwsp->ctr = 0; 5068 out_record: 5069 if ((long)func == -1L) 5070 return; 5071 pcwsp->comma = comma; 5072 pcwsp->func = func; 5073 pcwsp->ctr = 1; 5074 } 5075 5076 static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) 5077 { 5078 if (work->func == wq_barrier_func) { 5079 struct wq_barrier *barr; 5080 5081 barr = container_of(work, struct wq_barrier, work); 5082 5083 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 5084 pr_cont("%s BAR(%d)", comma ? "," : "", 5085 task_pid_nr(barr->task)); 5086 } else { 5087 if (!comma) 5088 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 5089 pr_cont_work_flush(comma, work->func, pcwsp); 5090 } 5091 } 5092 5093 static void show_pwq(struct pool_workqueue *pwq) 5094 { 5095 struct pr_cont_work_struct pcws = { .ctr = 0, }; 5096 struct worker_pool *pool = pwq->pool; 5097 struct work_struct *work; 5098 struct worker *worker; 5099 bool has_in_flight = false, has_pending = false; 5100 int bkt; 5101 5102 pr_info(" pwq %d:", pool->id); 5103 pr_cont_pool_info(pool); 5104 5105 pr_cont(" active=%d/%d refcnt=%d%s\n", 5106 pwq->nr_active, pwq->max_active, pwq->refcnt, 5107 !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); 5108 5109 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 5110 if (worker->current_pwq == pwq) { 5111 has_in_flight = true; 5112 break; 5113 } 5114 } 5115 if (has_in_flight) { 5116 bool comma = false; 5117 5118 pr_info(" in-flight:"); 5119 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 5120 if (worker->current_pwq != pwq) 5121 continue; 5122 5123 pr_cont("%s %d%s:%ps", comma ? "," : "", 5124 task_pid_nr(worker->task), 5125 worker->rescue_wq ? 
"(RESCUER)" : "", 5126 worker->current_func); 5127 list_for_each_entry(work, &worker->scheduled, entry) 5128 pr_cont_work(false, work, &pcws); 5129 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5130 comma = true; 5131 } 5132 pr_cont("\n"); 5133 } 5134 5135 list_for_each_entry(work, &pool->worklist, entry) { 5136 if (get_work_pwq(work) == pwq) { 5137 has_pending = true; 5138 break; 5139 } 5140 } 5141 if (has_pending) { 5142 bool comma = false; 5143 5144 pr_info(" pending:"); 5145 list_for_each_entry(work, &pool->worklist, entry) { 5146 if (get_work_pwq(work) != pwq) 5147 continue; 5148 5149 pr_cont_work(comma, work, &pcws); 5150 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 5151 } 5152 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5153 pr_cont("\n"); 5154 } 5155 5156 if (!list_empty(&pwq->inactive_works)) { 5157 bool comma = false; 5158 5159 pr_info(" inactive:"); 5160 list_for_each_entry(work, &pwq->inactive_works, entry) { 5161 pr_cont_work(comma, work, &pcws); 5162 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 5163 } 5164 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5165 pr_cont("\n"); 5166 } 5167 } 5168 5169 /** 5170 * show_one_workqueue - dump state of specified workqueue 5171 * @wq: workqueue whose state will be printed 5172 */ 5173 void show_one_workqueue(struct workqueue_struct *wq) 5174 { 5175 struct pool_workqueue *pwq; 5176 bool idle = true; 5177 unsigned long flags; 5178 5179 for_each_pwq(pwq, wq) { 5180 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { 5181 idle = false; 5182 break; 5183 } 5184 } 5185 if (idle) /* Nothing to print for idle workqueue */ 5186 return; 5187 5188 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); 5189 5190 for_each_pwq(pwq, wq) { 5191 raw_spin_lock_irqsave(&pwq->pool->lock, flags); 5192 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { 5193 /* 5194 * Defer printing to avoid deadlocks in console 5195 * drivers that queue work while holding locks 5196 * also taken in their write paths. 5197 */ 5198 printk_deferred_enter(); 5199 show_pwq(pwq); 5200 printk_deferred_exit(); 5201 } 5202 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); 5203 /* 5204 * We could be printing a lot from atomic context, e.g. 5205 * sysrq-t -> show_all_workqueues(). Avoid triggering 5206 * hard lockup. 5207 */ 5208 touch_nmi_watchdog(); 5209 } 5210 5211 } 5212 5213 /** 5214 * show_one_worker_pool - dump state of specified worker pool 5215 * @pool: worker pool whose state will be printed 5216 */ 5217 static void show_one_worker_pool(struct worker_pool *pool) 5218 { 5219 struct worker *worker; 5220 bool first = true; 5221 unsigned long flags; 5222 unsigned long hung = 0; 5223 5224 raw_spin_lock_irqsave(&pool->lock, flags); 5225 if (pool->nr_workers == pool->nr_idle) 5226 goto next_pool; 5227 5228 /* How long the first pending work is waiting for a worker. */ 5229 if (!list_empty(&pool->worklist)) 5230 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; 5231 5232 /* 5233 * Defer printing to avoid deadlocks in console drivers that 5234 * queue work while holding locks also taken in their write 5235 * paths. 5236 */ 5237 printk_deferred_enter(); 5238 pr_info("pool %d:", pool->id); 5239 pr_cont_pool_info(pool); 5240 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); 5241 if (pool->manager) 5242 pr_cont(" manager: %d", 5243 task_pid_nr(pool->manager->task)); 5244 list_for_each_entry(worker, &pool->idle_list, entry) { 5245 pr_cont(" %s%d", first ? 
"idle: " : "", 5246 task_pid_nr(worker->task)); 5247 first = false; 5248 } 5249 pr_cont("\n"); 5250 printk_deferred_exit(); 5251 next_pool: 5252 raw_spin_unlock_irqrestore(&pool->lock, flags); 5253 /* 5254 * We could be printing a lot from atomic context, e.g. 5255 * sysrq-t -> show_all_workqueues(). Avoid triggering 5256 * hard lockup. 5257 */ 5258 touch_nmi_watchdog(); 5259 5260 } 5261 5262 /** 5263 * show_all_workqueues - dump workqueue state 5264 * 5265 * Called from a sysrq handler and prints out all busy workqueues and pools. 5266 */ 5267 void show_all_workqueues(void) 5268 { 5269 struct workqueue_struct *wq; 5270 struct worker_pool *pool; 5271 int pi; 5272 5273 rcu_read_lock(); 5274 5275 pr_info("Showing busy workqueues and worker pools:\n"); 5276 5277 list_for_each_entry_rcu(wq, &workqueues, list) 5278 show_one_workqueue(wq); 5279 5280 for_each_pool(pool, pi) 5281 show_one_worker_pool(pool); 5282 5283 rcu_read_unlock(); 5284 } 5285 5286 /** 5287 * show_freezable_workqueues - dump freezable workqueue state 5288 * 5289 * Called from try_to_freeze_tasks() and prints out all freezable workqueues 5290 * still busy. 5291 */ 5292 void show_freezable_workqueues(void) 5293 { 5294 struct workqueue_struct *wq; 5295 5296 rcu_read_lock(); 5297 5298 pr_info("Showing freezable workqueues that are still busy:\n"); 5299 5300 list_for_each_entry_rcu(wq, &workqueues, list) { 5301 if (!(wq->flags & WQ_FREEZABLE)) 5302 continue; 5303 show_one_workqueue(wq); 5304 } 5305 5306 rcu_read_unlock(); 5307 } 5308 5309 /* used to show worker information through /proc/PID/{comm,stat,status} */ 5310 void wq_worker_comm(char *buf, size_t size, struct task_struct *task) 5311 { 5312 int off; 5313 5314 /* always show the actual comm */ 5315 off = strscpy(buf, task->comm, size); 5316 if (off < 0) 5317 return; 5318 5319 /* stabilize PF_WQ_WORKER and worker pool association */ 5320 mutex_lock(&wq_pool_attach_mutex); 5321 5322 if (task->flags & PF_WQ_WORKER) { 5323 struct worker *worker = kthread_data(task); 5324 struct worker_pool *pool = worker->pool; 5325 5326 if (pool) { 5327 raw_spin_lock_irq(&pool->lock); 5328 /* 5329 * ->desc tracks information (wq name or 5330 * set_worker_desc()) for the latest execution. If 5331 * current, prepend '+', otherwise '-'. 5332 */ 5333 if (worker->desc[0] != '\0') { 5334 if (worker->current_work) 5335 scnprintf(buf + off, size - off, "+%s", 5336 worker->desc); 5337 else 5338 scnprintf(buf + off, size - off, "-%s", 5339 worker->desc); 5340 } 5341 raw_spin_unlock_irq(&pool->lock); 5342 } 5343 } 5344 5345 mutex_unlock(&wq_pool_attach_mutex); 5346 } 5347 5348 #ifdef CONFIG_SMP 5349 5350 /* 5351 * CPU hotplug. 5352 * 5353 * There are two challenges in supporting CPU hotplug. Firstly, there 5354 * are a lot of assumptions on strong associations among work, pwq and 5355 * pool which make migrating pending and scheduled works very 5356 * difficult to implement without impacting hot paths. Secondly, 5357 * worker pools serve mix of short, long and very long running works making 5358 * blocked draining impractical. 5359 * 5360 * This is solved by allowing the pools to be disassociated from the CPU 5361 * running as an unbound one and allowing it to be reattached later if the 5362 * cpu comes back online. 
5363 */ 5364 5365 static void unbind_workers(int cpu) 5366 { 5367 struct worker_pool *pool; 5368 struct worker *worker; 5369 5370 for_each_cpu_worker_pool(pool, cpu) { 5371 mutex_lock(&wq_pool_attach_mutex); 5372 raw_spin_lock_irq(&pool->lock); 5373 5374 /* 5375 * We've blocked all attach/detach operations. Make all workers 5376 * unbound and set DISASSOCIATED. Before this, all workers 5377 * must be on the cpu. After this, they may become diasporas. 5378 * And the preemption disabled section in their sched callbacks 5379 * are guaranteed to see WORKER_UNBOUND since the code here 5380 * is on the same cpu. 5381 */ 5382 for_each_pool_worker(worker, pool) 5383 worker->flags |= WORKER_UNBOUND; 5384 5385 pool->flags |= POOL_DISASSOCIATED; 5386 5387 /* 5388 * The handling of nr_running in sched callbacks are disabled 5389 * now. Zap nr_running. After this, nr_running stays zero and 5390 * need_more_worker() and keep_working() are always true as 5391 * long as the worklist is not empty. This pool now behaves as 5392 * an unbound (in terms of concurrency management) pool which 5393 * are served by workers tied to the pool. 5394 */ 5395 pool->nr_running = 0; 5396 5397 /* 5398 * With concurrency management just turned off, a busy 5399 * worker blocking could lead to lengthy stalls. Kick off 5400 * unbound chain execution of currently pending work items. 5401 */ 5402 wake_up_worker(pool); 5403 5404 raw_spin_unlock_irq(&pool->lock); 5405 5406 for_each_pool_worker(worker, pool) 5407 unbind_worker(worker); 5408 5409 mutex_unlock(&wq_pool_attach_mutex); 5410 } 5411 } 5412 5413 /** 5414 * rebind_workers - rebind all workers of a pool to the associated CPU 5415 * @pool: pool of interest 5416 * 5417 * @pool->cpu is coming online. Rebind all workers to the CPU. 5418 */ 5419 static void rebind_workers(struct worker_pool *pool) 5420 { 5421 struct worker *worker; 5422 5423 lockdep_assert_held(&wq_pool_attach_mutex); 5424 5425 /* 5426 * Restore CPU affinity of all workers. As all idle workers should 5427 * be on the run-queue of the associated CPU before any local 5428 * wake-ups for concurrency management happen, restore CPU affinity 5429 * of all workers first and then clear UNBOUND. As we're called 5430 * from CPU_ONLINE, the following shouldn't fail. 5431 */ 5432 for_each_pool_worker(worker, pool) { 5433 kthread_set_per_cpu(worker->task, pool->cpu); 5434 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 5435 pool->attrs->cpumask) < 0); 5436 } 5437 5438 raw_spin_lock_irq(&pool->lock); 5439 5440 pool->flags &= ~POOL_DISASSOCIATED; 5441 5442 for_each_pool_worker(worker, pool) { 5443 unsigned int worker_flags = worker->flags; 5444 5445 /* 5446 * We want to clear UNBOUND but can't directly call 5447 * worker_clr_flags() or adjust nr_running. Atomically 5448 * replace UNBOUND with another NOT_RUNNING flag REBOUND. 5449 * @worker will clear REBOUND using worker_clr_flags() when 5450 * it initiates the next execution cycle thus restoring 5451 * concurrency management. Note that when or whether 5452 * @worker clears REBOUND doesn't affect correctness. 5453 * 5454 * WRITE_ONCE() is necessary because @worker->flags may be 5455 * tested without holding any lock in 5456 * wq_worker_running(). Without it, NOT_RUNNING test may 5457 * fail incorrectly leading to premature concurrency 5458 * management operations. 
5459 */ 5460 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); 5461 worker_flags |= WORKER_REBOUND; 5462 worker_flags &= ~WORKER_UNBOUND; 5463 WRITE_ONCE(worker->flags, worker_flags); 5464 } 5465 5466 raw_spin_unlock_irq(&pool->lock); 5467 } 5468 5469 /** 5470 * restore_unbound_workers_cpumask - restore cpumask of unbound workers 5471 * @pool: unbound pool of interest 5472 * @cpu: the CPU which is coming up 5473 * 5474 * An unbound pool may end up with a cpumask which doesn't have any online 5475 * CPUs. When a worker of such pool get scheduled, the scheduler resets 5476 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any 5477 * online CPU before, cpus_allowed of all its workers should be restored. 5478 */ 5479 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) 5480 { 5481 static cpumask_t cpumask; 5482 struct worker *worker; 5483 5484 lockdep_assert_held(&wq_pool_attach_mutex); 5485 5486 /* is @cpu allowed for @pool? */ 5487 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 5488 return; 5489 5490 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); 5491 5492 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 5493 for_each_pool_worker(worker, pool) 5494 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); 5495 } 5496 5497 int workqueue_prepare_cpu(unsigned int cpu) 5498 { 5499 struct worker_pool *pool; 5500 5501 for_each_cpu_worker_pool(pool, cpu) { 5502 if (pool->nr_workers) 5503 continue; 5504 if (!create_worker(pool)) 5505 return -ENOMEM; 5506 } 5507 return 0; 5508 } 5509 5510 int workqueue_online_cpu(unsigned int cpu) 5511 { 5512 struct worker_pool *pool; 5513 struct workqueue_struct *wq; 5514 int pi; 5515 5516 mutex_lock(&wq_pool_mutex); 5517 5518 for_each_pool(pool, pi) { 5519 mutex_lock(&wq_pool_attach_mutex); 5520 5521 if (pool->cpu == cpu) 5522 rebind_workers(pool); 5523 else if (pool->cpu < 0) 5524 restore_unbound_workers_cpumask(pool, cpu); 5525 5526 mutex_unlock(&wq_pool_attach_mutex); 5527 } 5528 5529 /* update NUMA affinity of unbound workqueues */ 5530 list_for_each_entry(wq, &workqueues, list) 5531 wq_update_unbound_numa(wq, cpu, true); 5532 5533 mutex_unlock(&wq_pool_mutex); 5534 return 0; 5535 } 5536 5537 int workqueue_offline_cpu(unsigned int cpu) 5538 { 5539 struct workqueue_struct *wq; 5540 5541 /* unbinding per-cpu workers should happen on the local CPU */ 5542 if (WARN_ON(cpu != smp_processor_id())) 5543 return -1; 5544 5545 unbind_workers(cpu); 5546 5547 /* update NUMA affinity of unbound workqueues */ 5548 mutex_lock(&wq_pool_mutex); 5549 list_for_each_entry(wq, &workqueues, list) 5550 wq_update_unbound_numa(wq, cpu, false); 5551 mutex_unlock(&wq_pool_mutex); 5552 5553 return 0; 5554 } 5555 5556 struct work_for_cpu { 5557 struct work_struct work; 5558 long (*fn)(void *); 5559 void *arg; 5560 long ret; 5561 }; 5562 5563 static void work_for_cpu_fn(struct work_struct *work) 5564 { 5565 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); 5566 5567 wfc->ret = wfc->fn(wfc->arg); 5568 } 5569 5570 /** 5571 * work_on_cpu - run a function in thread context on a particular cpu 5572 * @cpu: the cpu to run on 5573 * @fn: the function to run 5574 * @arg: the function arg 5575 * 5576 * It is up to the caller to ensure that the cpu doesn't go offline. 5577 * The caller must not hold any locks which would prevent @fn from completing. 5578 * 5579 * Return: The value @fn returns. 
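 *
 * Illustrative sketch of a caller (hypothetical function and values, not
 * part of this file), relying on the caller keeping the target CPU online:
 *
 *	static long who_am_i(void *unused)
 *	{
 *		return raw_smp_processor_id();	// runs in a kworker bound to @cpu
 *	}
 *
 *	long cpu_id = work_on_cpu(2, who_am_i, NULL);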
5580 */ 5581 long work_on_cpu(int cpu, long (*fn)(void *), void *arg) 5582 { 5583 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 5584 5585 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 5586 schedule_work_on(cpu, &wfc.work); 5587 flush_work(&wfc.work); 5588 destroy_work_on_stack(&wfc.work); 5589 return wfc.ret; 5590 } 5591 EXPORT_SYMBOL_GPL(work_on_cpu); 5592 5593 /** 5594 * work_on_cpu_safe - run a function in thread context on a particular cpu 5595 * @cpu: the cpu to run on 5596 * @fn: the function to run 5597 * @arg: the function argument 5598 * 5599 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold 5600 * any locks which would prevent @fn from completing. 5601 * 5602 * Return: The value @fn returns. 5603 */ 5604 long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) 5605 { 5606 long ret = -ENODEV; 5607 5608 cpus_read_lock(); 5609 if (cpu_online(cpu)) 5610 ret = work_on_cpu(cpu, fn, arg); 5611 cpus_read_unlock(); 5612 return ret; 5613 } 5614 EXPORT_SYMBOL_GPL(work_on_cpu_safe); 5615 #endif /* CONFIG_SMP */ 5616 5617 #ifdef CONFIG_FREEZER 5618 5619 /** 5620 * freeze_workqueues_begin - begin freezing workqueues 5621 * 5622 * Start freezing workqueues. After this function returns, all freezable 5623 * workqueues will queue new works to their inactive_works list instead of 5624 * pool->worklist. 5625 * 5626 * CONTEXT: 5627 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 5628 */ 5629 void freeze_workqueues_begin(void) 5630 { 5631 struct workqueue_struct *wq; 5632 struct pool_workqueue *pwq; 5633 5634 mutex_lock(&wq_pool_mutex); 5635 5636 WARN_ON_ONCE(workqueue_freezing); 5637 workqueue_freezing = true; 5638 5639 list_for_each_entry(wq, &workqueues, list) { 5640 mutex_lock(&wq->mutex); 5641 for_each_pwq(pwq, wq) 5642 pwq_adjust_max_active(pwq); 5643 mutex_unlock(&wq->mutex); 5644 } 5645 5646 mutex_unlock(&wq_pool_mutex); 5647 } 5648 5649 /** 5650 * freeze_workqueues_busy - are freezable workqueues still busy? 5651 * 5652 * Check whether freezing is complete. This function must be called 5653 * between freeze_workqueues_begin() and thaw_workqueues(). 5654 * 5655 * CONTEXT: 5656 * Grabs and releases wq_pool_mutex. 5657 * 5658 * Return: 5659 * %true if some freezable workqueues are still busy. %false if freezing 5660 * is complete. 5661 */ 5662 bool freeze_workqueues_busy(void) 5663 { 5664 bool busy = false; 5665 struct workqueue_struct *wq; 5666 struct pool_workqueue *pwq; 5667 5668 mutex_lock(&wq_pool_mutex); 5669 5670 WARN_ON_ONCE(!workqueue_freezing); 5671 5672 list_for_each_entry(wq, &workqueues, list) { 5673 if (!(wq->flags & WQ_FREEZABLE)) 5674 continue; 5675 /* 5676 * nr_active is monotonically decreasing. It's safe 5677 * to peek without lock. 5678 */ 5679 rcu_read_lock(); 5680 for_each_pwq(pwq, wq) { 5681 WARN_ON_ONCE(pwq->nr_active < 0); 5682 if (pwq->nr_active) { 5683 busy = true; 5684 rcu_read_unlock(); 5685 goto out_unlock; 5686 } 5687 } 5688 rcu_read_unlock(); 5689 } 5690 out_unlock: 5691 mutex_unlock(&wq_pool_mutex); 5692 return busy; 5693 } 5694 5695 /** 5696 * thaw_workqueues - thaw workqueues 5697 * 5698 * Thaw workqueues. Normal queueing is restored and all collected 5699 * frozen works are transferred to their respective pool worklists. 5700 * 5701 * CONTEXT: 5702 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
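 *
 * Simplified freezer-side sequence (an illustrative sketch only, not the
 * actual try_to_freeze_tasks() logic):
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *	// ... system is frozen; do the suspend/hibernation work ...
 *	thaw_workqueues();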
5703  */
5704 void thaw_workqueues(void)
5705 {
5706 	struct workqueue_struct *wq;
5707 	struct pool_workqueue *pwq;
5708
5709 	mutex_lock(&wq_pool_mutex);
5710
5711 	if (!workqueue_freezing)
5712 		goto out_unlock;
5713
5714 	workqueue_freezing = false;
5715
5716 	/* restore max_active and repopulate worklist */
5717 	list_for_each_entry(wq, &workqueues, list) {
5718 		mutex_lock(&wq->mutex);
5719 		for_each_pwq(pwq, wq)
5720 			pwq_adjust_max_active(pwq);
5721 		mutex_unlock(&wq->mutex);
5722 	}
5723
5724 out_unlock:
5725 	mutex_unlock(&wq_pool_mutex);
5726 }
5727 #endif /* CONFIG_FREEZER */
5728
5729 static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
5730 {
5731 	LIST_HEAD(ctxs);
5732 	int ret = 0;
5733 	struct workqueue_struct *wq;
5734 	struct apply_wqattrs_ctx *ctx, *n;
5735
5736 	lockdep_assert_held(&wq_pool_mutex);
5737
5738 	list_for_each_entry(wq, &workqueues, list) {
5739 		if (!(wq->flags & WQ_UNBOUND))
5740 			continue;
5741 		/* creating multiple pwqs breaks ordering guarantee */
5742 		if (wq->flags & __WQ_ORDERED)
5743 			continue;
5744
5745 		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
5746 		if (!ctx) {
5747 			ret = -ENOMEM;
5748 			break;
5749 		}
5750
5751 		list_add_tail(&ctx->list, &ctxs);
5752 	}
5753
5754 	list_for_each_entry_safe(ctx, n, &ctxs, list) {
5755 		if (!ret)
5756 			apply_wqattrs_commit(ctx);
5757 		apply_wqattrs_cleanup(ctx);
5758 	}
5759
5760 	if (!ret) {
5761 		mutex_lock(&wq_pool_attach_mutex);
5762 		cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
5763 		mutex_unlock(&wq_pool_attach_mutex);
5764 	}
5765 	return ret;
5766 }
5767
5768 /**
5769  * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
5770  * @cpumask: the cpumask to set
5771  *
5772  * The low-level workqueues cpumask is a global cpumask that limits
5773  * the affinity of all unbound workqueues. This function checks @cpumask,
5774  * applies it to all unbound workqueues and updates all of their pwqs.
5775  *
5776  * Return: 0	   - Success
5777  *	   -EINVAL - Invalid @cpumask
5778  *	   -ENOMEM - Failed to allocate memory for attrs or pwqs.
5779  */
5780 int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
5781 {
5782 	int ret = -EINVAL;
5783
5784 	/*
5785 	 * Isolated cpus are not excluded on purpose.
5786 	 * If the user wishes to include them, we allow that.
5787 	 */
5788 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
5789 	if (!cpumask_empty(cpumask)) {
5790 		apply_wqattrs_lock();
5791 		if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
5792 			ret = 0;
5793 			goto out_unlock;
5794 		}
5795
5796 		ret = workqueue_apply_unbound_cpumask(cpumask);
5797
5798 out_unlock:
5799 		apply_wqattrs_unlock();
5800 	}
5801
5802 	return ret;
5803 }
5804
5805 #ifdef CONFIG_SYSFS
5806 /*
5807  * Workqueues with the WQ_SYSFS flag set are visible to userland via
5808  * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
5809  * following attributes.
5810  *
5811  * per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
5812  * max_active	RW int	: maximum number of in-flight work items
5813  *
5814  * Unbound workqueues have the following extra attributes.
5815 * 5816 * pool_ids RO int : the associated pool IDs for each node 5817 * nice RW int : nice value of the workers 5818 * cpumask RW mask : bitmask of allowed CPUs for the workers 5819 * numa RW bool : whether enable NUMA affinity 5820 */ 5821 struct wq_device { 5822 struct workqueue_struct *wq; 5823 struct device dev; 5824 }; 5825 5826 static struct workqueue_struct *dev_to_wq(struct device *dev) 5827 { 5828 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 5829 5830 return wq_dev->wq; 5831 } 5832 5833 static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, 5834 char *buf) 5835 { 5836 struct workqueue_struct *wq = dev_to_wq(dev); 5837 5838 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 5839 } 5840 static DEVICE_ATTR_RO(per_cpu); 5841 5842 static ssize_t max_active_show(struct device *dev, 5843 struct device_attribute *attr, char *buf) 5844 { 5845 struct workqueue_struct *wq = dev_to_wq(dev); 5846 5847 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 5848 } 5849 5850 static ssize_t max_active_store(struct device *dev, 5851 struct device_attribute *attr, const char *buf, 5852 size_t count) 5853 { 5854 struct workqueue_struct *wq = dev_to_wq(dev); 5855 int val; 5856 5857 if (sscanf(buf, "%d", &val) != 1 || val <= 0) 5858 return -EINVAL; 5859 5860 workqueue_set_max_active(wq, val); 5861 return count; 5862 } 5863 static DEVICE_ATTR_RW(max_active); 5864 5865 static struct attribute *wq_sysfs_attrs[] = { 5866 &dev_attr_per_cpu.attr, 5867 &dev_attr_max_active.attr, 5868 NULL, 5869 }; 5870 ATTRIBUTE_GROUPS(wq_sysfs); 5871 5872 static ssize_t wq_pool_ids_show(struct device *dev, 5873 struct device_attribute *attr, char *buf) 5874 { 5875 struct workqueue_struct *wq = dev_to_wq(dev); 5876 const char *delim = ""; 5877 int node, written = 0; 5878 5879 cpus_read_lock(); 5880 rcu_read_lock(); 5881 for_each_node(node) { 5882 written += scnprintf(buf + written, PAGE_SIZE - written, 5883 "%s%d:%d", delim, node, 5884 unbound_pwq_by_node(wq, node)->pool->id); 5885 delim = " "; 5886 } 5887 written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); 5888 rcu_read_unlock(); 5889 cpus_read_unlock(); 5890 5891 return written; 5892 } 5893 5894 static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, 5895 char *buf) 5896 { 5897 struct workqueue_struct *wq = dev_to_wq(dev); 5898 int written; 5899 5900 mutex_lock(&wq->mutex); 5901 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); 5902 mutex_unlock(&wq->mutex); 5903 5904 return written; 5905 } 5906 5907 /* prepare workqueue_attrs for sysfs store operations */ 5908 static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) 5909 { 5910 struct workqueue_attrs *attrs; 5911 5912 lockdep_assert_held(&wq_pool_mutex); 5913 5914 attrs = alloc_workqueue_attrs(); 5915 if (!attrs) 5916 return NULL; 5917 5918 copy_workqueue_attrs(attrs, wq->unbound_attrs); 5919 return attrs; 5920 } 5921 5922 static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, 5923 const char *buf, size_t count) 5924 { 5925 struct workqueue_struct *wq = dev_to_wq(dev); 5926 struct workqueue_attrs *attrs; 5927 int ret = -ENOMEM; 5928 5929 apply_wqattrs_lock(); 5930 5931 attrs = wq_sysfs_prep_attrs(wq); 5932 if (!attrs) 5933 goto out_unlock; 5934 5935 if (sscanf(buf, "%d", &attrs->nice) == 1 && 5936 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) 5937 ret = apply_workqueue_attrs_locked(wq, attrs); 5938 else 5939 ret = -EINVAL; 5940 
5941 out_unlock: 5942 apply_wqattrs_unlock(); 5943 free_workqueue_attrs(attrs); 5944 return ret ?: count; 5945 } 5946 5947 static ssize_t wq_cpumask_show(struct device *dev, 5948 struct device_attribute *attr, char *buf) 5949 { 5950 struct workqueue_struct *wq = dev_to_wq(dev); 5951 int written; 5952 5953 mutex_lock(&wq->mutex); 5954 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", 5955 cpumask_pr_args(wq->unbound_attrs->cpumask)); 5956 mutex_unlock(&wq->mutex); 5957 return written; 5958 } 5959 5960 static ssize_t wq_cpumask_store(struct device *dev, 5961 struct device_attribute *attr, 5962 const char *buf, size_t count) 5963 { 5964 struct workqueue_struct *wq = dev_to_wq(dev); 5965 struct workqueue_attrs *attrs; 5966 int ret = -ENOMEM; 5967 5968 apply_wqattrs_lock(); 5969 5970 attrs = wq_sysfs_prep_attrs(wq); 5971 if (!attrs) 5972 goto out_unlock; 5973 5974 ret = cpumask_parse(buf, attrs->cpumask); 5975 if (!ret) 5976 ret = apply_workqueue_attrs_locked(wq, attrs); 5977 5978 out_unlock: 5979 apply_wqattrs_unlock(); 5980 free_workqueue_attrs(attrs); 5981 return ret ?: count; 5982 } 5983 5984 static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, 5985 char *buf) 5986 { 5987 struct workqueue_struct *wq = dev_to_wq(dev); 5988 int written; 5989 5990 mutex_lock(&wq->mutex); 5991 written = scnprintf(buf, PAGE_SIZE, "%d\n", 5992 !wq->unbound_attrs->no_numa); 5993 mutex_unlock(&wq->mutex); 5994 5995 return written; 5996 } 5997 5998 static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, 5999 const char *buf, size_t count) 6000 { 6001 struct workqueue_struct *wq = dev_to_wq(dev); 6002 struct workqueue_attrs *attrs; 6003 int v, ret = -ENOMEM; 6004 6005 apply_wqattrs_lock(); 6006 6007 attrs = wq_sysfs_prep_attrs(wq); 6008 if (!attrs) 6009 goto out_unlock; 6010 6011 ret = -EINVAL; 6012 if (sscanf(buf, "%d", &v) == 1) { 6013 attrs->no_numa = !v; 6014 ret = apply_workqueue_attrs_locked(wq, attrs); 6015 } 6016 6017 out_unlock: 6018 apply_wqattrs_unlock(); 6019 free_workqueue_attrs(attrs); 6020 return ret ?: count; 6021 } 6022 6023 static struct device_attribute wq_sysfs_unbound_attrs[] = { 6024 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), 6025 __ATTR(nice, 0644, wq_nice_show, wq_nice_store), 6026 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), 6027 __ATTR(numa, 0644, wq_numa_show, wq_numa_store), 6028 __ATTR_NULL, 6029 }; 6030 6031 static struct bus_type wq_subsys = { 6032 .name = "workqueue", 6033 .dev_groups = wq_sysfs_groups, 6034 }; 6035 6036 static ssize_t wq_unbound_cpumask_show(struct device *dev, 6037 struct device_attribute *attr, char *buf) 6038 { 6039 int written; 6040 6041 mutex_lock(&wq_pool_mutex); 6042 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", 6043 cpumask_pr_args(wq_unbound_cpumask)); 6044 mutex_unlock(&wq_pool_mutex); 6045 6046 return written; 6047 } 6048 6049 static ssize_t wq_unbound_cpumask_store(struct device *dev, 6050 struct device_attribute *attr, const char *buf, size_t count) 6051 { 6052 cpumask_var_t cpumask; 6053 int ret; 6054 6055 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 6056 return -ENOMEM; 6057 6058 ret = cpumask_parse(buf, cpumask); 6059 if (!ret) 6060 ret = workqueue_set_unbound_cpumask(cpumask); 6061 6062 free_cpumask_var(cpumask); 6063 return ret ? 
ret : count; 6064 } 6065 6066 static struct device_attribute wq_sysfs_cpumask_attr = 6067 __ATTR(cpumask, 0644, wq_unbound_cpumask_show, 6068 wq_unbound_cpumask_store); 6069 6070 static int __init wq_sysfs_init(void) 6071 { 6072 struct device *dev_root; 6073 int err; 6074 6075 err = subsys_virtual_register(&wq_subsys, NULL); 6076 if (err) 6077 return err; 6078 6079 dev_root = bus_get_dev_root(&wq_subsys); 6080 if (dev_root) { 6081 err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); 6082 put_device(dev_root); 6083 } 6084 return err; 6085 } 6086 core_initcall(wq_sysfs_init); 6087 6088 static void wq_device_release(struct device *dev) 6089 { 6090 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 6091 6092 kfree(wq_dev); 6093 } 6094 6095 /** 6096 * workqueue_sysfs_register - make a workqueue visible in sysfs 6097 * @wq: the workqueue to register 6098 * 6099 * Expose @wq in sysfs under /sys/bus/workqueue/devices. 6100 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set 6101 * which is the preferred method. 6102 * 6103 * Workqueue user should use this function directly iff it wants to apply 6104 * workqueue_attrs before making the workqueue visible in sysfs; otherwise, 6105 * apply_workqueue_attrs() may race against userland updating the 6106 * attributes. 6107 * 6108 * Return: 0 on success, -errno on failure. 6109 */ 6110 int workqueue_sysfs_register(struct workqueue_struct *wq) 6111 { 6112 struct wq_device *wq_dev; 6113 int ret; 6114 6115 /* 6116 * Adjusting max_active or creating new pwqs by applying 6117 * attributes breaks ordering guarantee. Disallow exposing ordered 6118 * workqueues. 6119 */ 6120 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 6121 return -EINVAL; 6122 6123 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); 6124 if (!wq_dev) 6125 return -ENOMEM; 6126 6127 wq_dev->wq = wq; 6128 wq_dev->dev.bus = &wq_subsys; 6129 wq_dev->dev.release = wq_device_release; 6130 dev_set_name(&wq_dev->dev, "%s", wq->name); 6131 6132 /* 6133 * unbound_attrs are created separately. Suppress uevent until 6134 * everything is ready. 6135 */ 6136 dev_set_uevent_suppress(&wq_dev->dev, true); 6137 6138 ret = device_register(&wq_dev->dev); 6139 if (ret) { 6140 put_device(&wq_dev->dev); 6141 wq->wq_dev = NULL; 6142 return ret; 6143 } 6144 6145 if (wq->flags & WQ_UNBOUND) { 6146 struct device_attribute *attr; 6147 6148 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { 6149 ret = device_create_file(&wq_dev->dev, attr); 6150 if (ret) { 6151 device_unregister(&wq_dev->dev); 6152 wq->wq_dev = NULL; 6153 return ret; 6154 } 6155 } 6156 } 6157 6158 dev_set_uevent_suppress(&wq_dev->dev, false); 6159 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 6160 return 0; 6161 } 6162 6163 /** 6164 * workqueue_sysfs_unregister - undo workqueue_sysfs_register() 6165 * @wq: the workqueue to unregister 6166 * 6167 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 6168 */ 6169 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) 6170 { 6171 struct wq_device *wq_dev = wq->wq_dev; 6172 6173 if (!wq->wq_dev) 6174 return; 6175 6176 wq->wq_dev = NULL; 6177 device_unregister(&wq_dev->dev); 6178 } 6179 #else /* CONFIG_SYSFS */ 6180 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } 6181 #endif /* CONFIG_SYSFS */ 6182 6183 /* 6184 * Workqueue watchdog. 
6185 * 6186 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal 6187 * flush dependency, a concurrency managed work item which stays RUNNING 6188 * indefinitely. Workqueue stalls can be very difficult to debug as the 6189 * usual warning mechanisms don't trigger and internal workqueue state is 6190 * largely opaque. 6191 * 6192 * Workqueue watchdog monitors all worker pools periodically and dumps 6193 * state if some pools failed to make forward progress for a while where 6194 * forward progress is defined as the first item on ->worklist changing. 6195 * 6196 * This mechanism is controlled through the kernel parameter 6197 * "workqueue.watchdog_thresh" which can be updated at runtime through the 6198 * corresponding sysfs parameter file. 6199 */ 6200 #ifdef CONFIG_WQ_WATCHDOG 6201 6202 static unsigned long wq_watchdog_thresh = 30; 6203 static struct timer_list wq_watchdog_timer; 6204 6205 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; 6206 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; 6207 6208 /* 6209 * Show workers that might prevent the processing of pending work items. 6210 * The only candidates are CPU-bound workers in the running state. 6211 * Pending work items should be handled by another idle worker 6212 * in all other situations. 6213 */ 6214 static void show_cpu_pool_hog(struct worker_pool *pool) 6215 { 6216 struct worker *worker; 6217 unsigned long flags; 6218 int bkt; 6219 6220 raw_spin_lock_irqsave(&pool->lock, flags); 6221 6222 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 6223 if (task_is_running(worker->task)) { 6224 /* 6225 * Defer printing to avoid deadlocks in console 6226 * drivers that queue work while holding locks 6227 * also taken in their write paths. 6228 */ 6229 printk_deferred_enter(); 6230 6231 pr_info("pool %d:\n", pool->id); 6232 sched_show_task(worker->task); 6233 6234 printk_deferred_exit(); 6235 } 6236 } 6237 6238 raw_spin_unlock_irqrestore(&pool->lock, flags); 6239 } 6240 6241 static void show_cpu_pools_hogs(void) 6242 { 6243 struct worker_pool *pool; 6244 int pi; 6245 6246 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); 6247 6248 rcu_read_lock(); 6249 6250 for_each_pool(pool, pi) { 6251 if (pool->cpu_stall) 6252 show_cpu_pool_hog(pool); 6253 6254 } 6255 6256 rcu_read_unlock(); 6257 } 6258 6259 static void wq_watchdog_reset_touched(void) 6260 { 6261 int cpu; 6262 6263 wq_watchdog_touched = jiffies; 6264 for_each_possible_cpu(cpu) 6265 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; 6266 } 6267 6268 static void wq_watchdog_timer_fn(struct timer_list *unused) 6269 { 6270 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; 6271 bool lockup_detected = false; 6272 bool cpu_pool_stall = false; 6273 unsigned long now = jiffies; 6274 struct worker_pool *pool; 6275 int pi; 6276 6277 if (!thresh) 6278 return; 6279 6280 rcu_read_lock(); 6281 6282 for_each_pool(pool, pi) { 6283 unsigned long pool_ts, touched, ts; 6284 6285 pool->cpu_stall = false; 6286 if (list_empty(&pool->worklist)) 6287 continue; 6288 6289 /* 6290 * If a virtual machine is stopped by the host it can look to 6291 * the watchdog like a stall. 
6292 */ 6293 kvm_check_and_clear_guest_paused(); 6294 6295 /* get the latest of pool and touched timestamps */ 6296 if (pool->cpu >= 0) 6297 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); 6298 else 6299 touched = READ_ONCE(wq_watchdog_touched); 6300 pool_ts = READ_ONCE(pool->watchdog_ts); 6301 6302 if (time_after(pool_ts, touched)) 6303 ts = pool_ts; 6304 else 6305 ts = touched; 6306 6307 /* did we stall? */ 6308 if (time_after(now, ts + thresh)) { 6309 lockup_detected = true; 6310 if (pool->cpu >= 0) { 6311 pool->cpu_stall = true; 6312 cpu_pool_stall = true; 6313 } 6314 pr_emerg("BUG: workqueue lockup - pool"); 6315 pr_cont_pool_info(pool); 6316 pr_cont(" stuck for %us!\n", 6317 jiffies_to_msecs(now - pool_ts) / 1000); 6318 } 6319 6320 6321 } 6322 6323 rcu_read_unlock(); 6324 6325 if (lockup_detected) 6326 show_all_workqueues(); 6327 6328 if (cpu_pool_stall) 6329 show_cpu_pools_hogs(); 6330 6331 wq_watchdog_reset_touched(); 6332 mod_timer(&wq_watchdog_timer, jiffies + thresh); 6333 } 6334 6335 notrace void wq_watchdog_touch(int cpu) 6336 { 6337 if (cpu >= 0) 6338 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; 6339 6340 wq_watchdog_touched = jiffies; 6341 } 6342 6343 static void wq_watchdog_set_thresh(unsigned long thresh) 6344 { 6345 wq_watchdog_thresh = 0; 6346 del_timer_sync(&wq_watchdog_timer); 6347 6348 if (thresh) { 6349 wq_watchdog_thresh = thresh; 6350 wq_watchdog_reset_touched(); 6351 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); 6352 } 6353 } 6354 6355 static int wq_watchdog_param_set_thresh(const char *val, 6356 const struct kernel_param *kp) 6357 { 6358 unsigned long thresh; 6359 int ret; 6360 6361 ret = kstrtoul(val, 0, &thresh); 6362 if (ret) 6363 return ret; 6364 6365 if (system_wq) 6366 wq_watchdog_set_thresh(thresh); 6367 else 6368 wq_watchdog_thresh = thresh; 6369 6370 return 0; 6371 } 6372 6373 static const struct kernel_param_ops wq_watchdog_thresh_ops = { 6374 .set = wq_watchdog_param_set_thresh, 6375 .get = param_get_ulong, 6376 }; 6377 6378 module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 6379 0644); 6380 6381 static void wq_watchdog_init(void) 6382 { 6383 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); 6384 wq_watchdog_set_thresh(wq_watchdog_thresh); 6385 } 6386 6387 #else /* CONFIG_WQ_WATCHDOG */ 6388 6389 static inline void wq_watchdog_init(void) { } 6390 6391 #endif /* CONFIG_WQ_WATCHDOG */ 6392 6393 static void __init wq_numa_init(void) 6394 { 6395 cpumask_var_t *tbl; 6396 int node, cpu; 6397 6398 if (num_possible_nodes() <= 1) 6399 return; 6400 6401 if (wq_disable_numa) { 6402 pr_info("workqueue: NUMA affinity support disabled\n"); 6403 return; 6404 } 6405 6406 for_each_possible_cpu(cpu) { 6407 if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { 6408 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); 6409 return; 6410 } 6411 } 6412 6413 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); 6414 BUG_ON(!wq_update_unbound_numa_attrs_buf); 6415 6416 /* 6417 * We want masks of possible CPUs of each node which isn't readily 6418 * available. Build one from cpu_to_node() which should have been 6419 * fully initialized by now. 6420 */ 6421 tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); 6422 BUG_ON(!tbl); 6423 6424 for_each_node(node) 6425 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, 6426 node_online(node) ? 
node : NUMA_NO_NODE)); 6427 6428 for_each_possible_cpu(cpu) { 6429 node = cpu_to_node(cpu); 6430 cpumask_set_cpu(cpu, tbl[node]); 6431 } 6432 6433 wq_numa_possible_cpumask = tbl; 6434 wq_numa_enabled = true; 6435 } 6436 6437 /** 6438 * workqueue_init_early - early init for workqueue subsystem 6439 * 6440 * This is the first half of two-staged workqueue subsystem initialization 6441 * and invoked as soon as the bare basics - memory allocation, cpumasks and 6442 * idr are up. It sets up all the data structures and system workqueues 6443 * and allows early boot code to create workqueues and queue/cancel work 6444 * items. Actual work item execution starts only after kthreads can be 6445 * created and scheduled right before early initcalls. 6446 */ 6447 void __init workqueue_init_early(void) 6448 { 6449 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 6450 int i, cpu; 6451 6452 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 6453 6454 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 6455 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); 6456 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); 6457 6458 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 6459 6460 /* initialize CPU pools */ 6461 for_each_possible_cpu(cpu) { 6462 struct worker_pool *pool; 6463 6464 i = 0; 6465 for_each_cpu_worker_pool(pool, cpu) { 6466 BUG_ON(init_worker_pool(pool)); 6467 pool->cpu = cpu; 6468 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); 6469 pool->attrs->nice = std_nice[i++]; 6470 pool->node = cpu_to_node(cpu); 6471 6472 /* alloc pool ID */ 6473 mutex_lock(&wq_pool_mutex); 6474 BUG_ON(worker_pool_assign_id(pool)); 6475 mutex_unlock(&wq_pool_mutex); 6476 } 6477 } 6478 6479 /* create default unbound and ordered wq attrs */ 6480 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 6481 struct workqueue_attrs *attrs; 6482 6483 BUG_ON(!(attrs = alloc_workqueue_attrs())); 6484 attrs->nice = std_nice[i]; 6485 unbound_std_wq_attrs[i] = attrs; 6486 6487 /* 6488 * An ordered wq should have only one pwq as ordering is 6489 * guaranteed by max_active which is enforced by pwqs. 6490 * Turn off NUMA so that dfl_pwq is used for all nodes. 6491 */ 6492 BUG_ON(!(attrs = alloc_workqueue_attrs())); 6493 attrs->nice = std_nice[i]; 6494 attrs->no_numa = true; 6495 ordered_wq_attrs[i] = attrs; 6496 } 6497 6498 system_wq = alloc_workqueue("events", 0, 0); 6499 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); 6500 system_long_wq = alloc_workqueue("events_long", 0, 0); 6501 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 6502 WQ_UNBOUND_MAX_ACTIVE); 6503 system_freezable_wq = alloc_workqueue("events_freezable", 6504 WQ_FREEZABLE, 0); 6505 system_power_efficient_wq = alloc_workqueue("events_power_efficient", 6506 WQ_POWER_EFFICIENT, 0); 6507 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", 6508 WQ_FREEZABLE | WQ_POWER_EFFICIENT, 6509 0); 6510 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 6511 !system_unbound_wq || !system_freezable_wq || 6512 !system_power_efficient_wq || 6513 !system_freezable_power_efficient_wq); 6514 } 6515 6516 /** 6517 * workqueue_init - bring workqueue subsystem fully online 6518 * 6519 * This is the latter half of two-staged workqueue subsystem initialization 6520 * and invoked as soon as kthreads can be created and scheduled. 
6521  * Workqueues have been created and work items queued on them, but there
6522  * are no kworkers executing the work items yet. Populate the worker pools
6523  * with the initial workers and enable future kworker creations.
6524  */
6525 void __init workqueue_init(void)
6526 {
6527 	struct workqueue_struct *wq;
6528 	struct worker_pool *pool;
6529 	int cpu, bkt;
6530
6531 	/*
6532 	 * It'd be simpler to initialize NUMA in workqueue_init_early(), but
6533 	 * the CPU to node mapping may not be available that early on some
6534 	 * archs such as power and arm64. As the per-cpu pools created
6535 	 * earlier could be missing their node hints and unbound pools their
6536 	 * NUMA affinity, fix them up here.
6537 	 *
6538 	 * Also, while iterating workqueues, create rescuers if requested.
6539 	 */
6540 	wq_numa_init();
6541
6542 	mutex_lock(&wq_pool_mutex);
6543
6544 	for_each_possible_cpu(cpu) {
6545 		for_each_cpu_worker_pool(pool, cpu) {
6546 			pool->node = cpu_to_node(cpu);
6547 		}
6548 	}
6549
6550 	list_for_each_entry(wq, &workqueues, list) {
6551 		wq_update_unbound_numa(wq, smp_processor_id(), true);
6552 		WARN(init_rescuer(wq),
6553 		     "workqueue: failed to create early rescuer for %s",
6554 		     wq->name);
6555 	}
6556
6557 	mutex_unlock(&wq_pool_mutex);
6558
6559 	/* create the initial workers */
6560 	for_each_online_cpu(cpu) {
6561 		for_each_cpu_worker_pool(pool, cpu) {
6562 			pool->flags &= ~POOL_DISASSOCIATED;
6563 			BUG_ON(!create_worker(pool));
6564 		}
6565 	}
6566
6567 	hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
6568 		BUG_ON(!create_worker(pool));
6569
6570 	wq_online = true;
6571 	wq_watchdog_init();
6572 }
6573
6574 /*
6575  * Despite the naming, this is a no-op function which exists only to avoid a
6576  * link error. Since the compile-time warning may fail to catch all cases, we
6577  * also emit a run-time warning from __flush_workqueue().
6578  */
6579 void __warn_flushing_systemwide_wq(void) { }
6580 EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
6581
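
/*
 * Illustrative driver-side usage of the API implemented in this file (a
 * minimal sketch with hypothetical names, not part of the original source):
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		// runs in process context on a kworker
 *	}
 *
 *	static DECLARE_WORK(my_work, my_work_fn);
 *	static struct workqueue_struct *my_wq;
 *
 *	static int __init my_driver_init(void)
 *	{
 *		my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_FREEZABLE, 0);
 *		if (!my_wq)
 *			return -ENOMEM;
 *		queue_work(my_wq, &my_work);
 *		flush_workqueue(my_wq);		// optional; destroy_workqueue() also drains
 *		destroy_workqueue(my_wq);
 *		return 0;
 *	}
 */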