1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 4 * Copyright (C) 2007 The Regents of the University of California. 5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 6 * Written by Brian Behlendorf <behlendorf1@llnl.gov>. 7 * UCRL-CODE-235197 8 * 9 * This file is part of the SPL, Solaris Porting Layer. 10 * 11 * The SPL is free software; you can redistribute it and/or modify it 12 * under the terms of the GNU General Public License as published by the 13 * Free Software Foundation; either version 2 of the License, or (at your 14 * option) any later version. 15 * 16 * The SPL is distributed in the hope that it will be useful, but WITHOUT 17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 19 * for more details. 20 * 21 * You should have received a copy of the GNU General Public License along 22 * with the SPL. If not, see <http://www.gnu.org/licenses/>. 23 * 24 * Solaris Porting Layer (SPL) Task Queue Implementation. 25 */ 26 /* 27 * Copyright (c) 2024, Klara Inc. 28 * Copyright (c) 2024, Syneto 29 */ 30 31 #include <sys/timer.h> 32 #include <sys/taskq.h> 33 #include <sys/kmem.h> 34 #include <sys/tsd.h> 35 #include <sys/time.h> 36 #include <sys/atomic.h> 37 #include <sys/kstat.h> 38 #include <linux/cpuhotplug.h> 39 #include <linux/mod_compat.h> 40 41 /* Linux 6.2 renamed timer_delete_sync(); point it at its old name for those. */ 42 #ifndef HAVE_TIMER_DELETE_SYNC 43 #define timer_delete_sync(t) del_timer_sync(t) 44 #endif 45 46 typedef struct taskq_kstats { 47 /* static values, for completeness */ 48 kstat_named_t tqks_threads_max; 49 kstat_named_t tqks_entry_pool_min; 50 kstat_named_t tqks_entry_pool_max; 51 52 /* gauges (inc/dec counters, current value) */ 53 kstat_named_t tqks_threads_active; 54 kstat_named_t tqks_threads_idle; 55 kstat_named_t tqks_threads_total; 56 kstat_named_t tqks_tasks_pending; 57 kstat_named_t tqks_tasks_priority; 58 kstat_named_t tqks_tasks_total; 59 kstat_named_t tqks_tasks_delayed; 60 kstat_named_t tqks_entries_free; 61 62 /* counters (inc only, since taskq creation) */ 63 kstat_named_t tqks_threads_created; 64 kstat_named_t tqks_threads_destroyed; 65 kstat_named_t tqks_tasks_dispatched; 66 kstat_named_t tqks_tasks_dispatched_delayed; 67 kstat_named_t tqks_tasks_executed_normal; 68 kstat_named_t tqks_tasks_executed_priority; 69 kstat_named_t tqks_tasks_executed; 70 kstat_named_t tqks_tasks_delayed_requeued; 71 kstat_named_t tqks_tasks_cancelled; 72 kstat_named_t tqks_thread_wakeups; 73 kstat_named_t tqks_thread_wakeups_nowork; 74 kstat_named_t tqks_thread_sleeps; 75 } taskq_kstats_t; 76 77 static taskq_kstats_t taskq_kstats_template = { 78 { "threads_max", KSTAT_DATA_UINT64 }, 79 { "entry_pool_min", KSTAT_DATA_UINT64 }, 80 { "entry_pool_max", KSTAT_DATA_UINT64 }, 81 { "threads_active", KSTAT_DATA_UINT64 }, 82 { "threads_idle", KSTAT_DATA_UINT64 }, 83 { "threads_total", KSTAT_DATA_UINT64 }, 84 { "tasks_pending", KSTAT_DATA_UINT64 }, 85 { "tasks_priority", KSTAT_DATA_UINT64 }, 86 { "tasks_total", KSTAT_DATA_UINT64 }, 87 { "tasks_delayed", KSTAT_DATA_UINT64 }, 88 { "entries_free", KSTAT_DATA_UINT64 }, 89 90 { "threads_created", KSTAT_DATA_UINT64 }, 91 { "threads_destroyed", KSTAT_DATA_UINT64 }, 92 { "tasks_dispatched", KSTAT_DATA_UINT64 }, 93 { "tasks_dispatched_delayed", KSTAT_DATA_UINT64 }, 94 { "tasks_executed_normal", KSTAT_DATA_UINT64 }, 95 { "tasks_executed_priority", KSTAT_DATA_UINT64 }, 96 { "tasks_executed", KSTAT_DATA_UINT64 }, 97 { "tasks_delayed_requeued", KSTAT_DATA_UINT64 }, 98 { "tasks_cancelled", KSTAT_DATA_UINT64 }, 99 { "thread_wakeups", KSTAT_DATA_UINT64 }, 100 { "thread_wakeups_nowork", KSTAT_DATA_UINT64 }, 101 { "thread_sleeps", KSTAT_DATA_UINT64 }, 102 }; 103 104 #define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1) 105 #define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1) 106 107 #define _TQSTAT_MOD_LIST(mod, tq, t) do { \ 108 switch (t->tqent_flags & TQENT_LIST_MASK) { \ 109 case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\ 110 case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \ 111 case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \ 112 case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \ 113 } \ 114 } while (0) 115 #define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t) 116 #define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t) 117 118 #define TQENT_SET_LIST(t, l) \ 119 t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l; 120 121 static int spl_taskq_thread_bind = 0; 122 module_param(spl_taskq_thread_bind, int, 0644); 123 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); 124 125 static uint_t spl_taskq_thread_timeout_ms = 5000; 126 module_param(spl_taskq_thread_timeout_ms, uint, 0644); 127 MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, 128 "Minimum idle threads exit interval for dynamic taskqs"); 129 130 static int spl_taskq_thread_dynamic = 1; 131 module_param(spl_taskq_thread_dynamic, int, 0444); 132 MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); 133 134 static int spl_taskq_thread_priority = 1; 135 module_param(spl_taskq_thread_priority, int, 0644); 136 MODULE_PARM_DESC(spl_taskq_thread_priority, 137 "Allow non-default priority for taskq threads"); 138 139 static uint_t spl_taskq_thread_sequential = 4; 140 module_param(spl_taskq_thread_sequential, uint, 0644); 141 MODULE_PARM_DESC(spl_taskq_thread_sequential, 142 "Create new taskq threads after N sequential tasks"); 143 144 /* 145 * Global system-wide dynamic task queue available for all consumers. This 146 * taskq is not intended for long-running tasks; instead, a dedicated taskq 147 * should be created. 148 */ 149 taskq_t *system_taskq; 150 EXPORT_SYMBOL(system_taskq); 151 /* Global dynamic task queue for long delay */ 152 taskq_t *system_delay_taskq; 153 EXPORT_SYMBOL(system_delay_taskq); 154 155 /* Private dedicated taskq for creating new taskq threads on demand. */ 156 static taskq_t *dynamic_taskq; 157 static taskq_thread_t *taskq_thread_create(taskq_t *); 158 159 /* Multi-callback id for cpu hotplugging. */ 160 static int spl_taskq_cpuhp_state; 161 162 /* List of all taskqs */ 163 LIST_HEAD(tq_list); 164 struct rw_semaphore tq_list_sem; 165 static uint_t taskq_tsd; 166 167 static int 168 task_km_flags(uint_t flags) 169 { 170 if (flags & TQ_NOSLEEP) 171 return (KM_NOSLEEP); 172 173 if (flags & TQ_PUSHPAGE) 174 return (KM_PUSHPAGE); 175 176 return (KM_SLEEP); 177 } 178 179 /* 180 * taskq_find_by_name - Find the largest instance number of a named taskq. 181 */ 182 static int 183 taskq_find_by_name(const char *name) 184 { 185 struct list_head *tql = NULL; 186 taskq_t *tq; 187 188 list_for_each_prev(tql, &tq_list) { 189 tq = list_entry(tql, taskq_t, tq_taskqs); 190 if (strcmp(name, tq->tq_name) == 0) 191 return (tq->tq_instance); 192 } 193 return (-1); 194 } 195 196 /* 197 * NOTE: Must be called with tq->tq_lock held, returns a list_t which 198 * is not attached to the free, work, or pending taskq lists. 199 */ 200 static taskq_ent_t * 201 task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) 202 { 203 taskq_ent_t *t; 204 int count = 0; 205 206 ASSERT(tq); 207 retry: 208 /* Acquire taskq_ent_t's from free list if available */ 209 if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { 210 t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); 211 212 ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); 213 ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); 214 ASSERT(!timer_pending(&t->tqent_timer)); 215 216 list_del_init(&t->tqent_list); 217 TQSTAT_DEC(tq, entries_free); 218 return (t); 219 } 220 221 /* Free list is empty and memory allocations are prohibited */ 222 if (flags & TQ_NOALLOC) 223 return (NULL); 224 225 /* Hit maximum taskq_ent_t pool size */ 226 if (tq->tq_nalloc >= tq->tq_maxalloc) { 227 if (flags & TQ_NOSLEEP) 228 return (NULL); 229 230 /* 231 * Sleep periodically polling the free list for an available 232 * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed 233 * but we cannot block forever waiting for an taskq_ent_t to 234 * show up in the free list, otherwise a deadlock can happen. 235 * 236 * Therefore, we need to allocate a new task even if the number 237 * of allocated tasks is above tq->tq_maxalloc, but we still 238 * end up delaying the task allocation by one second, thereby 239 * throttling the task dispatch rate. 240 */ 241 spin_unlock_irqrestore(&tq->tq_lock, *irqflags); 242 schedule_timeout_interruptible(HZ / 100); 243 spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, 244 tq->tq_lock_class); 245 if (count < 100) { 246 count++; 247 goto retry; 248 } 249 } 250 251 spin_unlock_irqrestore(&tq->tq_lock, *irqflags); 252 t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); 253 spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); 254 255 if (t) { 256 taskq_init_ent(t); 257 tq->tq_nalloc++; 258 } 259 260 return (t); 261 } 262 263 /* 264 * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t 265 * to already be removed from the free, work, or pending taskq lists. 266 */ 267 static void 268 task_free(taskq_t *tq, taskq_ent_t *t) 269 { 270 ASSERT(tq); 271 ASSERT(t); 272 ASSERT(list_empty(&t->tqent_list)); 273 ASSERT(!timer_pending(&t->tqent_timer)); 274 275 kmem_free(t, sizeof (taskq_ent_t)); 276 tq->tq_nalloc--; 277 } 278 279 /* 280 * NOTE: Must be called with tq->tq_lock held, either destroys the 281 * taskq_ent_t if too many exist or moves it to the free list for later use. 282 */ 283 static void 284 task_done(taskq_t *tq, taskq_ent_t *t) 285 { 286 ASSERT(tq); 287 ASSERT(t); 288 ASSERT(list_empty(&t->tqent_list)); 289 290 /* Wake tasks blocked in taskq_wait_id() */ 291 wake_up_all(&t->tqent_waitq); 292 293 if (tq->tq_nalloc <= tq->tq_minalloc) { 294 t->tqent_id = TASKQID_INVALID; 295 t->tqent_func = NULL; 296 t->tqent_arg = NULL; 297 t->tqent_flags = 0; 298 299 list_add_tail(&t->tqent_list, &tq->tq_free_list); 300 TQSTAT_INC(tq, entries_free); 301 } else { 302 task_free(tq, t); 303 } 304 } 305 306 /* 307 * When a delayed task timer expires remove it from the delay list and 308 * add it to the priority list in order for immediate processing. 309 */ 310 static void 311 task_expire_impl(taskq_ent_t *t) 312 { 313 taskq_ent_t *w; 314 taskq_t *tq = t->tqent_taskq; 315 struct list_head *l = NULL; 316 unsigned long flags; 317 318 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 319 320 if (t->tqent_flags & TQENT_FLAG_CANCEL) { 321 ASSERT(list_empty(&t->tqent_list)); 322 spin_unlock_irqrestore(&tq->tq_lock, flags); 323 return; 324 } 325 326 t->tqent_birth = jiffies; 327 328 /* 329 * The priority list must be maintained in strict task id order 330 * from lowest to highest for lowest_id to be easily calculable. 331 */ 332 list_del(&t->tqent_list); 333 list_for_each_prev(l, &tq->tq_prio_list) { 334 w = list_entry(l, taskq_ent_t, tqent_list); 335 if (w->tqent_id < t->tqent_id) { 336 list_add(&t->tqent_list, l); 337 break; 338 } 339 } 340 if (l == &tq->tq_prio_list) 341 list_add(&t->tqent_list, &tq->tq_prio_list); 342 343 spin_unlock_irqrestore(&tq->tq_lock, flags); 344 345 wake_up(&tq->tq_work_waitq); 346 347 TQSTAT_INC(tq, tasks_delayed_requeued); 348 } 349 350 static void 351 task_expire(struct timer_list *tl) 352 { 353 struct timer_list *tmr = (struct timer_list *)tl; 354 taskq_ent_t *t = from_timer(t, tmr, tqent_timer); 355 task_expire_impl(t); 356 } 357 358 /* 359 * Returns the lowest incomplete taskqid_t. The taskqid_t may 360 * be queued on the pending list, on the priority list, on the 361 * delay list, or on the work list currently being handled, but 362 * it is not 100% complete yet. 363 */ 364 static taskqid_t 365 taskq_lowest_id(taskq_t *tq) 366 { 367 taskqid_t lowest_id = tq->tq_next_id; 368 taskq_ent_t *t; 369 taskq_thread_t *tqt; 370 371 if (!list_empty(&tq->tq_pend_list)) { 372 t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); 373 lowest_id = MIN(lowest_id, t->tqent_id); 374 } 375 376 if (!list_empty(&tq->tq_prio_list)) { 377 t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); 378 lowest_id = MIN(lowest_id, t->tqent_id); 379 } 380 381 if (!list_empty(&tq->tq_delay_list)) { 382 t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); 383 lowest_id = MIN(lowest_id, t->tqent_id); 384 } 385 386 if (!list_empty(&tq->tq_active_list)) { 387 tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, 388 tqt_active_list); 389 ASSERT(tqt->tqt_id != TASKQID_INVALID); 390 lowest_id = MIN(lowest_id, tqt->tqt_id); 391 } 392 393 return (lowest_id); 394 } 395 396 /* 397 * Insert a task into a list keeping the list sorted by increasing taskqid. 398 */ 399 static void 400 taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) 401 { 402 taskq_thread_t *w; 403 struct list_head *l = NULL; 404 405 ASSERT(tq); 406 ASSERT(tqt); 407 408 list_for_each_prev(l, &tq->tq_active_list) { 409 w = list_entry(l, taskq_thread_t, tqt_active_list); 410 if (w->tqt_id < tqt->tqt_id) { 411 list_add(&tqt->tqt_active_list, l); 412 break; 413 } 414 } 415 if (l == &tq->tq_active_list) 416 list_add(&tqt->tqt_active_list, &tq->tq_active_list); 417 } 418 419 /* 420 * Find and return a task from the given list if it exists. The list 421 * must be in lowest to highest task id order. 422 */ 423 static taskq_ent_t * 424 taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) 425 { 426 struct list_head *l = NULL; 427 taskq_ent_t *t; 428 429 list_for_each(l, lh) { 430 t = list_entry(l, taskq_ent_t, tqent_list); 431 432 if (t->tqent_id == id) 433 return (t); 434 435 if (t->tqent_id > id) 436 break; 437 } 438 439 return (NULL); 440 } 441 442 /* 443 * Find an already dispatched task given the task id regardless of what 444 * state it is in. If a task is still pending it will be returned. 445 * If a task is executing, then -EBUSY will be returned instead. 446 * If the task has already been run then NULL is returned. 447 */ 448 static taskq_ent_t * 449 taskq_find(taskq_t *tq, taskqid_t id) 450 { 451 taskq_thread_t *tqt; 452 struct list_head *l = NULL; 453 taskq_ent_t *t; 454 455 t = taskq_find_list(tq, &tq->tq_delay_list, id); 456 if (t) 457 return (t); 458 459 t = taskq_find_list(tq, &tq->tq_prio_list, id); 460 if (t) 461 return (t); 462 463 t = taskq_find_list(tq, &tq->tq_pend_list, id); 464 if (t) 465 return (t); 466 467 list_for_each(l, &tq->tq_active_list) { 468 tqt = list_entry(l, taskq_thread_t, tqt_active_list); 469 if (tqt->tqt_id == id) { 470 /* 471 * Instead of returning tqt_task, we just return a non 472 * NULL value to prevent misuse, since tqt_task only 473 * has two valid fields. 474 */ 475 return (ERR_PTR(-EBUSY)); 476 } 477 } 478 479 return (NULL); 480 } 481 482 /* 483 * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and 484 * taskq_wait() functions below. 485 * 486 * Taskq waiting is accomplished by tracking the lowest outstanding task 487 * id and the next available task id. As tasks are dispatched they are 488 * added to the tail of the pending, priority, or delay lists. As worker 489 * threads become available the tasks are removed from the heads of these 490 * lists and linked to the worker threads. This ensures the lists are 491 * kept sorted by lowest to highest task id. 492 * 493 * Therefore the lowest outstanding task id can be quickly determined by 494 * checking the head item from all of these lists. This value is stored 495 * with the taskq as the lowest id. It only needs to be recalculated when 496 * either the task with the current lowest id completes or is canceled. 497 * 498 * By blocking until the lowest task id exceeds the passed task id the 499 * taskq_wait_outstanding() function can be easily implemented. Similarly, 500 * by blocking until the lowest task id matches the next task id taskq_wait() 501 * can be implemented. 502 * 503 * Callers should be aware that when there are multiple worked threads it 504 * is possible for larger task ids to complete before smaller ones. Also 505 * when the taskq contains delay tasks with small task ids callers may 506 * block for a considerable length of time waiting for them to expire and 507 * execute. 508 */ 509 static int 510 taskq_wait_id_check(taskq_t *tq, taskqid_t id) 511 { 512 int rc; 513 unsigned long flags; 514 515 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 516 rc = (taskq_find(tq, id) == NULL); 517 spin_unlock_irqrestore(&tq->tq_lock, flags); 518 519 return (rc); 520 } 521 522 /* 523 * The taskq_wait_id() function blocks until the passed task id completes. 524 * This does not guarantee that all lower task ids have completed. 525 */ 526 void 527 taskq_wait_id(taskq_t *tq, taskqid_t id) 528 { 529 wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); 530 } 531 EXPORT_SYMBOL(taskq_wait_id); 532 533 static int 534 taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) 535 { 536 int rc; 537 unsigned long flags; 538 539 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 540 rc = (id < tq->tq_lowest_id); 541 spin_unlock_irqrestore(&tq->tq_lock, flags); 542 543 return (rc); 544 } 545 546 /* 547 * The taskq_wait_outstanding() function will block until all tasks with a 548 * lower taskqid than the passed 'id' have been completed. Note that all 549 * task id's are assigned monotonically at dispatch time. Zero may be 550 * passed for the id to indicate all tasks dispatch up to this point, 551 * but not after, should be waited for. 552 */ 553 void 554 taskq_wait_outstanding(taskq_t *tq, taskqid_t id) 555 { 556 id = id ? id : tq->tq_next_id - 1; 557 wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); 558 } 559 EXPORT_SYMBOL(taskq_wait_outstanding); 560 561 static int 562 taskq_wait_check(taskq_t *tq) 563 { 564 int rc; 565 unsigned long flags; 566 567 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 568 rc = (tq->tq_lowest_id == tq->tq_next_id); 569 spin_unlock_irqrestore(&tq->tq_lock, flags); 570 571 return (rc); 572 } 573 574 /* 575 * The taskq_wait() function will block until the taskq is empty. 576 * This means that if a taskq re-dispatches work to itself taskq_wait() 577 * callers will block indefinitely. 578 */ 579 void 580 taskq_wait(taskq_t *tq) 581 { 582 wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); 583 } 584 EXPORT_SYMBOL(taskq_wait); 585 586 int 587 taskq_member(taskq_t *tq, kthread_t *t) 588 { 589 return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); 590 } 591 EXPORT_SYMBOL(taskq_member); 592 593 taskq_t * 594 taskq_of_curthread(void) 595 { 596 return (tsd_get(taskq_tsd)); 597 } 598 EXPORT_SYMBOL(taskq_of_curthread); 599 600 /* 601 * Cancel an already dispatched task given the task id. Still pending tasks 602 * will be immediately canceled, and if the task is active the function will 603 * block until it completes. Preallocated tasks which are canceled must be 604 * freed by the caller. 605 */ 606 int 607 taskq_cancel_id(taskq_t *tq, taskqid_t id) 608 { 609 taskq_ent_t *t; 610 int rc = ENOENT; 611 unsigned long flags; 612 613 ASSERT(tq); 614 615 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 616 t = taskq_find(tq, id); 617 if (t && t != ERR_PTR(-EBUSY)) { 618 list_del_init(&t->tqent_list); 619 TQSTAT_DEC_LIST(tq, t); 620 TQSTAT_DEC(tq, tasks_total); 621 622 t->tqent_flags |= TQENT_FLAG_CANCEL; 623 TQSTAT_INC(tq, tasks_cancelled); 624 625 /* 626 * When canceling the lowest outstanding task id we 627 * must recalculate the new lowest outstanding id. 628 */ 629 if (tq->tq_lowest_id == t->tqent_id) { 630 tq->tq_lowest_id = taskq_lowest_id(tq); 631 ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); 632 } 633 634 /* 635 * The task_expire() function takes the tq->tq_lock so drop 636 * drop the lock before synchronously cancelling the timer. 637 */ 638 if (timer_pending(&t->tqent_timer)) { 639 spin_unlock_irqrestore(&tq->tq_lock, flags); 640 timer_delete_sync(&t->tqent_timer); 641 spin_lock_irqsave_nested(&tq->tq_lock, flags, 642 tq->tq_lock_class); 643 } 644 645 if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) 646 task_done(tq, t); 647 648 rc = 0; 649 } 650 spin_unlock_irqrestore(&tq->tq_lock, flags); 651 652 if (t == ERR_PTR(-EBUSY)) { 653 taskq_wait_id(tq, id); 654 rc = EBUSY; 655 } 656 657 return (rc); 658 } 659 EXPORT_SYMBOL(taskq_cancel_id); 660 661 static int taskq_thread_spawn(taskq_t *tq); 662 663 taskqid_t 664 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) 665 { 666 taskq_ent_t *t; 667 taskqid_t rc = TASKQID_INVALID; 668 unsigned long irqflags; 669 670 ASSERT(tq); 671 ASSERT(func); 672 673 spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); 674 675 /* Taskq being destroyed and all tasks drained */ 676 if (!(tq->tq_flags & TASKQ_ACTIVE)) 677 goto out; 678 679 /* Do not queue the task unless there is idle thread for it */ 680 ASSERT(tq->tq_nactive <= tq->tq_nthreads); 681 if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { 682 /* Dynamic taskq may be able to spawn another thread */ 683 if (taskq_thread_spawn(tq) == 0) 684 goto out; 685 } 686 687 if ((t = task_alloc(tq, flags, &irqflags)) == NULL) 688 goto out; 689 690 spin_lock(&t->tqent_lock); 691 692 /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ 693 if (flags & TQ_NOQUEUE) { 694 TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); 695 list_add(&t->tqent_list, &tq->tq_prio_list); 696 /* Queue to the priority list instead of the pending list */ 697 } else if (flags & TQ_FRONT) { 698 TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); 699 list_add_tail(&t->tqent_list, &tq->tq_prio_list); 700 } else { 701 TQENT_SET_LIST(t, TQENT_LIST_PENDING); 702 list_add_tail(&t->tqent_list, &tq->tq_pend_list); 703 } 704 TQSTAT_INC_LIST(tq, t); 705 TQSTAT_INC(tq, tasks_total); 706 707 t->tqent_id = rc = tq->tq_next_id; 708 tq->tq_next_id++; 709 t->tqent_func = func; 710 t->tqent_arg = arg; 711 t->tqent_taskq = tq; 712 t->tqent_timer.function = NULL; 713 t->tqent_timer.expires = 0; 714 t->tqent_birth = jiffies; 715 716 ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); 717 718 spin_unlock(&t->tqent_lock); 719 720 wake_up(&tq->tq_work_waitq); 721 722 TQSTAT_INC(tq, tasks_dispatched); 723 724 /* Spawn additional taskq threads if required. */ 725 if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) 726 (void) taskq_thread_spawn(tq); 727 out: 728 spin_unlock_irqrestore(&tq->tq_lock, irqflags); 729 return (rc); 730 } 731 EXPORT_SYMBOL(taskq_dispatch); 732 733 taskqid_t 734 taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, 735 uint_t flags, clock_t expire_time) 736 { 737 taskqid_t rc = TASKQID_INVALID; 738 taskq_ent_t *t; 739 unsigned long irqflags; 740 741 ASSERT(tq); 742 ASSERT(func); 743 744 spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); 745 746 /* Taskq being destroyed and all tasks drained */ 747 if (!(tq->tq_flags & TASKQ_ACTIVE)) 748 goto out; 749 750 if ((t = task_alloc(tq, flags, &irqflags)) == NULL) 751 goto out; 752 753 spin_lock(&t->tqent_lock); 754 755 /* Queue to the delay list for subsequent execution */ 756 list_add_tail(&t->tqent_list, &tq->tq_delay_list); 757 TQENT_SET_LIST(t, TQENT_LIST_DELAY); 758 TQSTAT_INC_LIST(tq, t); 759 TQSTAT_INC(tq, tasks_total); 760 761 t->tqent_id = rc = tq->tq_next_id; 762 tq->tq_next_id++; 763 t->tqent_func = func; 764 t->tqent_arg = arg; 765 t->tqent_taskq = tq; 766 t->tqent_timer.function = task_expire; 767 t->tqent_timer.expires = (unsigned long)expire_time; 768 add_timer(&t->tqent_timer); 769 770 ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); 771 772 spin_unlock(&t->tqent_lock); 773 774 TQSTAT_INC(tq, tasks_dispatched_delayed); 775 776 /* Spawn additional taskq threads if required. */ 777 if (tq->tq_nactive == tq->tq_nthreads) 778 (void) taskq_thread_spawn(tq); 779 out: 780 spin_unlock_irqrestore(&tq->tq_lock, irqflags); 781 return (rc); 782 } 783 EXPORT_SYMBOL(taskq_dispatch_delay); 784 785 void 786 taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, 787 taskq_ent_t *t) 788 { 789 unsigned long irqflags; 790 ASSERT(tq); 791 ASSERT(func); 792 793 spin_lock_irqsave_nested(&tq->tq_lock, irqflags, 794 tq->tq_lock_class); 795 796 /* Taskq being destroyed and all tasks drained */ 797 if (!(tq->tq_flags & TASKQ_ACTIVE)) { 798 t->tqent_id = TASKQID_INVALID; 799 goto out; 800 } 801 802 if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { 803 /* Dynamic taskq may be able to spawn another thread */ 804 if (taskq_thread_spawn(tq) == 0) 805 goto out; 806 flags |= TQ_FRONT; 807 } 808 809 spin_lock(&t->tqent_lock); 810 811 /* 812 * Make sure the entry is not on some other taskq; it is important to 813 * ASSERT() under lock 814 */ 815 ASSERT(taskq_empty_ent(t)); 816 817 /* 818 * Mark it as a prealloc'd task. This is important 819 * to ensure that we don't free it later. 820 */ 821 t->tqent_flags |= TQENT_FLAG_PREALLOC; 822 823 /* Queue to the priority list instead of the pending list */ 824 if (flags & TQ_FRONT) { 825 TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); 826 list_add_tail(&t->tqent_list, &tq->tq_prio_list); 827 } else { 828 TQENT_SET_LIST(t, TQENT_LIST_PENDING); 829 list_add_tail(&t->tqent_list, &tq->tq_pend_list); 830 } 831 TQSTAT_INC_LIST(tq, t); 832 TQSTAT_INC(tq, tasks_total); 833 834 t->tqent_id = tq->tq_next_id; 835 tq->tq_next_id++; 836 t->tqent_func = func; 837 t->tqent_arg = arg; 838 t->tqent_taskq = tq; 839 t->tqent_birth = jiffies; 840 841 spin_unlock(&t->tqent_lock); 842 843 wake_up(&tq->tq_work_waitq); 844 845 TQSTAT_INC(tq, tasks_dispatched); 846 847 /* Spawn additional taskq threads if required. */ 848 if (tq->tq_nactive == tq->tq_nthreads) 849 (void) taskq_thread_spawn(tq); 850 out: 851 spin_unlock_irqrestore(&tq->tq_lock, irqflags); 852 } 853 EXPORT_SYMBOL(taskq_dispatch_ent); 854 855 int 856 taskq_empty_ent(taskq_ent_t *t) 857 { 858 return (list_empty(&t->tqent_list)); 859 } 860 EXPORT_SYMBOL(taskq_empty_ent); 861 862 void 863 taskq_init_ent(taskq_ent_t *t) 864 { 865 spin_lock_init(&t->tqent_lock); 866 init_waitqueue_head(&t->tqent_waitq); 867 timer_setup(&t->tqent_timer, NULL, 0); 868 INIT_LIST_HEAD(&t->tqent_list); 869 t->tqent_id = 0; 870 t->tqent_func = NULL; 871 t->tqent_arg = NULL; 872 t->tqent_flags = 0; 873 t->tqent_taskq = NULL; 874 } 875 EXPORT_SYMBOL(taskq_init_ent); 876 877 /* 878 * Return the next pending task, preference is given to tasks on the 879 * priority list which were dispatched with TQ_FRONT. 880 */ 881 static taskq_ent_t * 882 taskq_next_ent(taskq_t *tq) 883 { 884 struct list_head *list; 885 886 if (!list_empty(&tq->tq_prio_list)) 887 list = &tq->tq_prio_list; 888 else if (!list_empty(&tq->tq_pend_list)) 889 list = &tq->tq_pend_list; 890 else 891 return (NULL); 892 893 return (list_entry(list->next, taskq_ent_t, tqent_list)); 894 } 895 896 /* 897 * Spawns a new thread for the specified taskq. 898 */ 899 static void 900 taskq_thread_spawn_task(void *arg) 901 { 902 taskq_t *tq = (taskq_t *)arg; 903 unsigned long flags; 904 905 if (taskq_thread_create(tq) == NULL) { 906 /* restore spawning count if failed */ 907 spin_lock_irqsave_nested(&tq->tq_lock, flags, 908 tq->tq_lock_class); 909 tq->tq_nspawn--; 910 spin_unlock_irqrestore(&tq->tq_lock, flags); 911 } 912 } 913 914 /* 915 * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current 916 * number of threads is insufficient to handle the pending tasks. These 917 * new threads must be created by the dedicated dynamic_taskq to avoid 918 * deadlocks between thread creation and memory reclaim. The system_taskq 919 * which is also a dynamic taskq cannot be safely used for this. 920 */ 921 static int 922 taskq_thread_spawn(taskq_t *tq) 923 { 924 int spawning = 0; 925 926 if (!(tq->tq_flags & TASKQ_DYNAMIC)) 927 return (0); 928 929 tq->lastspawnstop = jiffies; 930 if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && 931 (tq->tq_flags & TASKQ_ACTIVE)) { 932 spawning = (++tq->tq_nspawn); 933 taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, 934 tq, TQ_NOSLEEP); 935 } 936 937 return (spawning); 938 } 939 940 /* 941 * Threads in a dynamic taskq may exit once there is no more work to do. 942 * To prevent threads from being created and destroyed too often limit 943 * the exit rate to one per spl_taskq_thread_timeout_ms. 944 * 945 * The first thread is the thread list is treated as the primary thread. 946 * There is nothing special about the primary thread but in order to avoid 947 * all the taskq pids from changing we opt to make it long running. 948 */ 949 static int 950 taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) 951 { 952 ASSERT(!taskq_next_ent(tq)); 953 if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) 954 return (0); 955 if (!(tq->tq_flags & TASKQ_ACTIVE)) 956 return (1); 957 if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, 958 tqt_thread_list) == tqt) 959 return (0); 960 ASSERT3U(tq->tq_nthreads, >, 1); 961 if (tq->tq_nspawn != 0) 962 return (0); 963 if (time_before(jiffies, tq->lastspawnstop + 964 msecs_to_jiffies(spl_taskq_thread_timeout_ms))) 965 return (0); 966 tq->lastspawnstop = jiffies; 967 return (1); 968 } 969 970 static int 971 taskq_thread(void *args) 972 { 973 DECLARE_WAITQUEUE(wait, current); 974 sigset_t blocked; 975 taskq_thread_t *tqt = args; 976 taskq_t *tq; 977 taskq_ent_t *t; 978 int seq_tasks = 0; 979 unsigned long flags; 980 taskq_ent_t dup_task = {}; 981 982 ASSERT(tqt); 983 ASSERT(tqt->tqt_tq); 984 tq = tqt->tqt_tq; 985 current->flags |= PF_NOFREEZE; 986 987 (void) spl_fstrans_mark(); 988 989 sigfillset(&blocked); 990 sigprocmask(SIG_BLOCK, &blocked, NULL); 991 flush_signals(current); 992 993 tsd_set(taskq_tsd, tq); 994 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 995 /* 996 * If we are dynamically spawned, decrease spawning count. Note that 997 * we could be created during taskq_create, in which case we shouldn't 998 * do the decrement. But it's fine because taskq_create will reset 999 * tq_nspawn later. 1000 */ 1001 if (tq->tq_flags & TASKQ_DYNAMIC) 1002 tq->tq_nspawn--; 1003 1004 /* Immediately exit if more threads than allowed were created. */ 1005 if (tq->tq_nthreads >= tq->tq_maxthreads) 1006 goto error; 1007 1008 tq->tq_nthreads++; 1009 list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); 1010 wake_up(&tq->tq_wait_waitq); 1011 set_current_state(TASK_INTERRUPTIBLE); 1012 1013 TQSTAT_INC(tq, threads_total); 1014 1015 while (!kthread_should_stop()) { 1016 1017 if (list_empty(&tq->tq_pend_list) && 1018 list_empty(&tq->tq_prio_list)) { 1019 1020 if (taskq_thread_should_stop(tq, tqt)) 1021 break; 1022 1023 add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); 1024 spin_unlock_irqrestore(&tq->tq_lock, flags); 1025 1026 TQSTAT_INC(tq, thread_sleeps); 1027 TQSTAT_INC(tq, threads_idle); 1028 1029 schedule(); 1030 seq_tasks = 0; 1031 1032 TQSTAT_DEC(tq, threads_idle); 1033 TQSTAT_INC(tq, thread_wakeups); 1034 1035 spin_lock_irqsave_nested(&tq->tq_lock, flags, 1036 tq->tq_lock_class); 1037 remove_wait_queue(&tq->tq_work_waitq, &wait); 1038 } else { 1039 __set_current_state(TASK_RUNNING); 1040 } 1041 1042 if ((t = taskq_next_ent(tq)) != NULL) { 1043 list_del_init(&t->tqent_list); 1044 TQSTAT_DEC_LIST(tq, t); 1045 TQSTAT_DEC(tq, tasks_total); 1046 1047 /* 1048 * A TQENT_FLAG_PREALLOC task may be reused or freed 1049 * during the task function call. Store tqent_id and 1050 * tqent_flags here. 1051 */ 1052 tqt->tqt_id = t->tqent_id; 1053 tqt->tqt_flags = t->tqent_flags; 1054 1055 if (t->tqent_flags & TQENT_FLAG_PREALLOC) { 1056 dup_task = *t; 1057 t = &dup_task; 1058 } 1059 tqt->tqt_task = t; 1060 1061 taskq_insert_in_order(tq, tqt); 1062 tq->tq_nactive++; 1063 spin_unlock_irqrestore(&tq->tq_lock, flags); 1064 1065 TQSTAT_INC(tq, threads_active); 1066 1067 /* Perform the requested task */ 1068 t->tqent_func(t->tqent_arg); 1069 1070 TQSTAT_DEC(tq, threads_active); 1071 if ((t->tqent_flags & TQENT_LIST_MASK) == 1072 TQENT_LIST_PENDING) 1073 TQSTAT_INC(tq, tasks_executed_normal); 1074 else 1075 TQSTAT_INC(tq, tasks_executed_priority); 1076 TQSTAT_INC(tq, tasks_executed); 1077 1078 spin_lock_irqsave_nested(&tq->tq_lock, flags, 1079 tq->tq_lock_class); 1080 1081 tq->tq_nactive--; 1082 list_del_init(&tqt->tqt_active_list); 1083 tqt->tqt_task = NULL; 1084 1085 /* For prealloc'd tasks, we don't free anything. */ 1086 if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) 1087 task_done(tq, t); 1088 1089 /* 1090 * When the current lowest outstanding taskqid is 1091 * done calculate the new lowest outstanding id 1092 */ 1093 if (tq->tq_lowest_id == tqt->tqt_id) { 1094 tq->tq_lowest_id = taskq_lowest_id(tq); 1095 ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); 1096 } 1097 1098 /* Spawn additional taskq threads if required. */ 1099 if ((++seq_tasks) > spl_taskq_thread_sequential && 1100 taskq_thread_spawn(tq)) 1101 seq_tasks = 0; 1102 1103 tqt->tqt_id = TASKQID_INVALID; 1104 tqt->tqt_flags = 0; 1105 wake_up_all(&tq->tq_wait_waitq); 1106 } else 1107 TQSTAT_INC(tq, thread_wakeups_nowork); 1108 1109 set_current_state(TASK_INTERRUPTIBLE); 1110 1111 } 1112 1113 __set_current_state(TASK_RUNNING); 1114 tq->tq_nthreads--; 1115 list_del_init(&tqt->tqt_thread_list); 1116 1117 TQSTAT_DEC(tq, threads_total); 1118 TQSTAT_INC(tq, threads_destroyed); 1119 1120 error: 1121 kmem_free(tqt, sizeof (taskq_thread_t)); 1122 spin_unlock_irqrestore(&tq->tq_lock, flags); 1123 1124 tsd_set(taskq_tsd, NULL); 1125 thread_exit(); 1126 1127 return (0); 1128 } 1129 1130 static taskq_thread_t * 1131 taskq_thread_create(taskq_t *tq) 1132 { 1133 static int last_used_cpu = 0; 1134 taskq_thread_t *tqt; 1135 1136 tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); 1137 INIT_LIST_HEAD(&tqt->tqt_thread_list); 1138 INIT_LIST_HEAD(&tqt->tqt_active_list); 1139 tqt->tqt_tq = tq; 1140 tqt->tqt_id = TASKQID_INVALID; 1141 1142 tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, 1143 "%s", tq->tq_name); 1144 if (tqt->tqt_thread == NULL) { 1145 kmem_free(tqt, sizeof (taskq_thread_t)); 1146 return (NULL); 1147 } 1148 1149 if (spl_taskq_thread_bind) { 1150 last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); 1151 kthread_bind(tqt->tqt_thread, last_used_cpu); 1152 } 1153 1154 if (spl_taskq_thread_priority) 1155 set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); 1156 1157 wake_up_process(tqt->tqt_thread); 1158 1159 TQSTAT_INC(tq, threads_created); 1160 1161 return (tqt); 1162 } 1163 1164 static void 1165 taskq_stats_init(taskq_t *tq) 1166 { 1167 taskq_sums_t *tqs = &tq->tq_sums; 1168 wmsum_init(&tqs->tqs_threads_active, 0); 1169 wmsum_init(&tqs->tqs_threads_idle, 0); 1170 wmsum_init(&tqs->tqs_threads_total, 0); 1171 wmsum_init(&tqs->tqs_tasks_pending, 0); 1172 wmsum_init(&tqs->tqs_tasks_priority, 0); 1173 wmsum_init(&tqs->tqs_tasks_total, 0); 1174 wmsum_init(&tqs->tqs_tasks_delayed, 0); 1175 wmsum_init(&tqs->tqs_entries_free, 0); 1176 wmsum_init(&tqs->tqs_threads_created, 0); 1177 wmsum_init(&tqs->tqs_threads_destroyed, 0); 1178 wmsum_init(&tqs->tqs_tasks_dispatched, 0); 1179 wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0); 1180 wmsum_init(&tqs->tqs_tasks_executed_normal, 0); 1181 wmsum_init(&tqs->tqs_tasks_executed_priority, 0); 1182 wmsum_init(&tqs->tqs_tasks_executed, 0); 1183 wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0); 1184 wmsum_init(&tqs->tqs_tasks_cancelled, 0); 1185 wmsum_init(&tqs->tqs_thread_wakeups, 0); 1186 wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0); 1187 wmsum_init(&tqs->tqs_thread_sleeps, 0); 1188 } 1189 1190 static void 1191 taskq_stats_fini(taskq_t *tq) 1192 { 1193 taskq_sums_t *tqs = &tq->tq_sums; 1194 wmsum_fini(&tqs->tqs_threads_active); 1195 wmsum_fini(&tqs->tqs_threads_idle); 1196 wmsum_fini(&tqs->tqs_threads_total); 1197 wmsum_fini(&tqs->tqs_tasks_pending); 1198 wmsum_fini(&tqs->tqs_tasks_priority); 1199 wmsum_fini(&tqs->tqs_tasks_total); 1200 wmsum_fini(&tqs->tqs_tasks_delayed); 1201 wmsum_fini(&tqs->tqs_entries_free); 1202 wmsum_fini(&tqs->tqs_threads_created); 1203 wmsum_fini(&tqs->tqs_threads_destroyed); 1204 wmsum_fini(&tqs->tqs_tasks_dispatched); 1205 wmsum_fini(&tqs->tqs_tasks_dispatched_delayed); 1206 wmsum_fini(&tqs->tqs_tasks_executed_normal); 1207 wmsum_fini(&tqs->tqs_tasks_executed_priority); 1208 wmsum_fini(&tqs->tqs_tasks_executed); 1209 wmsum_fini(&tqs->tqs_tasks_delayed_requeued); 1210 wmsum_fini(&tqs->tqs_tasks_cancelled); 1211 wmsum_fini(&tqs->tqs_thread_wakeups); 1212 wmsum_fini(&tqs->tqs_thread_wakeups_nowork); 1213 wmsum_fini(&tqs->tqs_thread_sleeps); 1214 } 1215 1216 static int 1217 taskq_kstats_update(kstat_t *ksp, int rw) 1218 { 1219 if (rw == KSTAT_WRITE) 1220 return (EACCES); 1221 1222 taskq_t *tq = ksp->ks_private; 1223 taskq_kstats_t *tqks = ksp->ks_data; 1224 1225 tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads; 1226 tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc; 1227 tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc; 1228 1229 taskq_sums_t *tqs = &tq->tq_sums; 1230 1231 tqks->tqks_threads_active.value.ui64 = 1232 wmsum_value(&tqs->tqs_threads_active); 1233 tqks->tqks_threads_idle.value.ui64 = 1234 wmsum_value(&tqs->tqs_threads_idle); 1235 tqks->tqks_threads_total.value.ui64 = 1236 wmsum_value(&tqs->tqs_threads_total); 1237 tqks->tqks_tasks_pending.value.ui64 = 1238 wmsum_value(&tqs->tqs_tasks_pending); 1239 tqks->tqks_tasks_priority.value.ui64 = 1240 wmsum_value(&tqs->tqs_tasks_priority); 1241 tqks->tqks_tasks_total.value.ui64 = 1242 wmsum_value(&tqs->tqs_tasks_total); 1243 tqks->tqks_tasks_delayed.value.ui64 = 1244 wmsum_value(&tqs->tqs_tasks_delayed); 1245 tqks->tqks_entries_free.value.ui64 = 1246 wmsum_value(&tqs->tqs_entries_free); 1247 tqks->tqks_threads_created.value.ui64 = 1248 wmsum_value(&tqs->tqs_threads_created); 1249 tqks->tqks_threads_destroyed.value.ui64 = 1250 wmsum_value(&tqs->tqs_threads_destroyed); 1251 tqks->tqks_tasks_dispatched.value.ui64 = 1252 wmsum_value(&tqs->tqs_tasks_dispatched); 1253 tqks->tqks_tasks_dispatched_delayed.value.ui64 = 1254 wmsum_value(&tqs->tqs_tasks_dispatched_delayed); 1255 tqks->tqks_tasks_executed_normal.value.ui64 = 1256 wmsum_value(&tqs->tqs_tasks_executed_normal); 1257 tqks->tqks_tasks_executed_priority.value.ui64 = 1258 wmsum_value(&tqs->tqs_tasks_executed_priority); 1259 tqks->tqks_tasks_executed.value.ui64 = 1260 wmsum_value(&tqs->tqs_tasks_executed); 1261 tqks->tqks_tasks_delayed_requeued.value.ui64 = 1262 wmsum_value(&tqs->tqs_tasks_delayed_requeued); 1263 tqks->tqks_tasks_cancelled.value.ui64 = 1264 wmsum_value(&tqs->tqs_tasks_cancelled); 1265 tqks->tqks_thread_wakeups.value.ui64 = 1266 wmsum_value(&tqs->tqs_thread_wakeups); 1267 tqks->tqks_thread_wakeups_nowork.value.ui64 = 1268 wmsum_value(&tqs->tqs_thread_wakeups_nowork); 1269 tqks->tqks_thread_sleeps.value.ui64 = 1270 wmsum_value(&tqs->tqs_thread_sleeps); 1271 1272 return (0); 1273 } 1274 1275 static void 1276 taskq_kstats_init(taskq_t *tq) 1277 { 1278 char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ 1279 snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); 1280 1281 kstat_t *ksp = kstat_create("taskq", 0, name, "misc", 1282 KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t), 1283 KSTAT_FLAG_VIRTUAL); 1284 1285 if (ksp == NULL) 1286 return; 1287 1288 ksp->ks_private = tq; 1289 ksp->ks_update = taskq_kstats_update; 1290 ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP); 1291 memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t)); 1292 kstat_install(ksp); 1293 1294 tq->tq_ksp = ksp; 1295 } 1296 1297 static void 1298 taskq_kstats_fini(taskq_t *tq) 1299 { 1300 if (tq->tq_ksp == NULL) 1301 return; 1302 1303 kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t)); 1304 kstat_delete(tq->tq_ksp); 1305 1306 tq->tq_ksp = NULL; 1307 } 1308 1309 taskq_t * 1310 taskq_create(const char *name, int threads_arg, pri_t pri, 1311 int minalloc, int maxalloc, uint_t flags) 1312 { 1313 taskq_t *tq; 1314 taskq_thread_t *tqt; 1315 int count = 0, rc = 0, i; 1316 unsigned long irqflags; 1317 int nthreads = threads_arg; 1318 1319 ASSERT(name != NULL); 1320 ASSERT(minalloc >= 0); 1321 ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ 1322 1323 /* Scale the number of threads using nthreads as a percentage */ 1324 if (flags & TASKQ_THREADS_CPU_PCT) { 1325 ASSERT(nthreads <= 100); 1326 ASSERT(nthreads >= 0); 1327 nthreads = MIN(threads_arg, 100); 1328 nthreads = MAX(nthreads, 0); 1329 nthreads = MAX((num_online_cpus() * nthreads) /100, 1); 1330 } 1331 1332 tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); 1333 if (tq == NULL) 1334 return (NULL); 1335 1336 tq->tq_hp_support = B_FALSE; 1337 1338 if (flags & TASKQ_THREADS_CPU_PCT) { 1339 tq->tq_hp_support = B_TRUE; 1340 if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, 1341 &tq->tq_hp_cb_node) != 0) { 1342 kmem_free(tq, sizeof (*tq)); 1343 return (NULL); 1344 } 1345 } 1346 1347 spin_lock_init(&tq->tq_lock); 1348 INIT_LIST_HEAD(&tq->tq_thread_list); 1349 INIT_LIST_HEAD(&tq->tq_active_list); 1350 tq->tq_name = kmem_strdup(name); 1351 tq->tq_nactive = 0; 1352 tq->tq_nthreads = 0; 1353 tq->tq_nspawn = 0; 1354 tq->tq_maxthreads = nthreads; 1355 tq->tq_cpu_pct = threads_arg; 1356 tq->tq_pri = pri; 1357 tq->tq_minalloc = minalloc; 1358 tq->tq_maxalloc = maxalloc; 1359 tq->tq_nalloc = 0; 1360 tq->tq_flags = (flags | TASKQ_ACTIVE); 1361 tq->tq_next_id = TASKQID_INITIAL; 1362 tq->tq_lowest_id = TASKQID_INITIAL; 1363 tq->lastspawnstop = jiffies; 1364 INIT_LIST_HEAD(&tq->tq_free_list); 1365 INIT_LIST_HEAD(&tq->tq_pend_list); 1366 INIT_LIST_HEAD(&tq->tq_prio_list); 1367 INIT_LIST_HEAD(&tq->tq_delay_list); 1368 init_waitqueue_head(&tq->tq_work_waitq); 1369 init_waitqueue_head(&tq->tq_wait_waitq); 1370 tq->tq_lock_class = TQ_LOCK_GENERAL; 1371 INIT_LIST_HEAD(&tq->tq_taskqs); 1372 taskq_stats_init(tq); 1373 1374 if (flags & TASKQ_PREPOPULATE) { 1375 spin_lock_irqsave_nested(&tq->tq_lock, irqflags, 1376 tq->tq_lock_class); 1377 1378 for (i = 0; i < minalloc; i++) 1379 task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, 1380 &irqflags)); 1381 1382 spin_unlock_irqrestore(&tq->tq_lock, irqflags); 1383 } 1384 1385 if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) 1386 nthreads = 1; 1387 1388 for (i = 0; i < nthreads; i++) { 1389 tqt = taskq_thread_create(tq); 1390 if (tqt == NULL) 1391 rc = 1; 1392 else 1393 count++; 1394 } 1395 1396 /* Wait for all threads to be started before potential destroy */ 1397 wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); 1398 /* 1399 * taskq_thread might have touched nspawn, but we don't want them to 1400 * because they're not dynamically spawned. So we reset it to 0 1401 */ 1402 tq->tq_nspawn = 0; 1403 1404 if (rc) { 1405 taskq_destroy(tq); 1406 return (NULL); 1407 } 1408 1409 down_write(&tq_list_sem); 1410 tq->tq_instance = taskq_find_by_name(name) + 1; 1411 list_add_tail(&tq->tq_taskqs, &tq_list); 1412 up_write(&tq_list_sem); 1413 1414 /* Install kstats late, because the name includes tq_instance */ 1415 taskq_kstats_init(tq); 1416 1417 return (tq); 1418 } 1419 EXPORT_SYMBOL(taskq_create); 1420 1421 void 1422 taskq_destroy(taskq_t *tq) 1423 { 1424 struct task_struct *thread; 1425 taskq_thread_t *tqt; 1426 taskq_ent_t *t; 1427 unsigned long flags; 1428 1429 ASSERT(tq); 1430 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 1431 tq->tq_flags &= ~TASKQ_ACTIVE; 1432 spin_unlock_irqrestore(&tq->tq_lock, flags); 1433 1434 if (tq->tq_hp_support) { 1435 VERIFY0(cpuhp_state_remove_instance_nocalls( 1436 spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); 1437 } 1438 1439 /* 1440 * When TASKQ_ACTIVE is clear new tasks may not be added nor may 1441 * new worker threads be spawned for dynamic taskq. 1442 */ 1443 if (dynamic_taskq != NULL) 1444 taskq_wait_outstanding(dynamic_taskq, 0); 1445 1446 taskq_wait(tq); 1447 1448 taskq_kstats_fini(tq); 1449 1450 /* remove taskq from global list used by the kstats */ 1451 down_write(&tq_list_sem); 1452 list_del(&tq->tq_taskqs); 1453 up_write(&tq_list_sem); 1454 1455 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 1456 /* wait for spawning threads to insert themselves to the list */ 1457 while (tq->tq_nspawn) { 1458 spin_unlock_irqrestore(&tq->tq_lock, flags); 1459 schedule_timeout_interruptible(1); 1460 spin_lock_irqsave_nested(&tq->tq_lock, flags, 1461 tq->tq_lock_class); 1462 } 1463 1464 /* 1465 * Signal each thread to exit and block until it does. Each thread 1466 * is responsible for removing itself from the list and freeing its 1467 * taskq_thread_t. This allows for idle threads to opt to remove 1468 * themselves from the taskq. They can be recreated as needed. 1469 */ 1470 while (!list_empty(&tq->tq_thread_list)) { 1471 tqt = list_entry(tq->tq_thread_list.next, 1472 taskq_thread_t, tqt_thread_list); 1473 thread = tqt->tqt_thread; 1474 spin_unlock_irqrestore(&tq->tq_lock, flags); 1475 1476 kthread_stop(thread); 1477 1478 spin_lock_irqsave_nested(&tq->tq_lock, flags, 1479 tq->tq_lock_class); 1480 } 1481 1482 while (!list_empty(&tq->tq_free_list)) { 1483 t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); 1484 1485 ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); 1486 1487 list_del_init(&t->tqent_list); 1488 task_free(tq, t); 1489 } 1490 1491 ASSERT0(tq->tq_nthreads); 1492 ASSERT0(tq->tq_nalloc); 1493 ASSERT0(tq->tq_nspawn); 1494 ASSERT(list_empty(&tq->tq_thread_list)); 1495 ASSERT(list_empty(&tq->tq_active_list)); 1496 ASSERT(list_empty(&tq->tq_free_list)); 1497 ASSERT(list_empty(&tq->tq_pend_list)); 1498 ASSERT(list_empty(&tq->tq_prio_list)); 1499 ASSERT(list_empty(&tq->tq_delay_list)); 1500 1501 spin_unlock_irqrestore(&tq->tq_lock, flags); 1502 1503 taskq_stats_fini(tq); 1504 kmem_strfree(tq->tq_name); 1505 kmem_free(tq, sizeof (taskq_t)); 1506 } 1507 EXPORT_SYMBOL(taskq_destroy); 1508 1509 /* 1510 * Create a taskq with a specified number of pool threads. Allocate 1511 * and return an array of nthreads kthread_t pointers, one for each 1512 * thread in the pool. The array is not ordered and must be freed 1513 * by the caller. 1514 */ 1515 taskq_t * 1516 taskq_create_synced(const char *name, int nthreads, pri_t pri, 1517 int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) 1518 { 1519 taskq_t *tq; 1520 taskq_thread_t *tqt; 1521 int i = 0; 1522 kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, 1523 KM_SLEEP); 1524 1525 flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); 1526 1527 /* taskq_create spawns all the threads before returning */ 1528 tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, 1529 flags | TASKQ_PREPOPULATE); 1530 VERIFY(tq != NULL); 1531 VERIFY(tq->tq_nthreads == nthreads); 1532 1533 list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { 1534 kthreads[i] = tqt->tqt_thread; 1535 i++; 1536 } 1537 1538 ASSERT3S(i, ==, nthreads); 1539 *ktpp = kthreads; 1540 1541 return (tq); 1542 } 1543 EXPORT_SYMBOL(taskq_create_synced); 1544 1545 static kstat_t *taskq_summary_ksp = NULL; 1546 1547 static int 1548 spl_taskq_kstat_headers(char *buf, size_t size) 1549 { 1550 size_t n = snprintf(buf, size, 1551 "%-20s | %-17s | %-23s\n" 1552 "%-20s | %-17s | %-23s\n" 1553 "%-20s | %-17s | %-23s\n", 1554 "", "threads", "tasks on queue", 1555 "taskq name", "tot [act idl] max", " pend [ norm high] dly", 1556 "--------------------", "-----------------", 1557 "-----------------------"); 1558 return (n >= size ? ENOMEM : 0); 1559 } 1560 1561 static int 1562 spl_taskq_kstat_data(char *buf, size_t size, void *data) 1563 { 1564 struct list_head *tql = NULL; 1565 taskq_t *tq; 1566 char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ 1567 char threads[25]; 1568 char tasks[30]; 1569 size_t n; 1570 int err = 0; 1571 1572 down_read(&tq_list_sem); 1573 list_for_each_prev(tql, &tq_list) { 1574 tq = list_entry(tql, taskq_t, tq_taskqs); 1575 1576 mutex_enter(tq->tq_ksp->ks_lock); 1577 taskq_kstats_update(tq->tq_ksp, KSTAT_READ); 1578 taskq_kstats_t *tqks = tq->tq_ksp->ks_data; 1579 1580 snprintf(name, sizeof (name), "%s.%d", tq->tq_name, 1581 tq->tq_instance); 1582 snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu", 1583 tqks->tqks_threads_total.value.ui64, 1584 tqks->tqks_threads_active.value.ui64, 1585 tqks->tqks_threads_idle.value.ui64, 1586 tqks->tqks_threads_max.value.ui64); 1587 snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu", 1588 tqks->tqks_tasks_total.value.ui64, 1589 tqks->tqks_tasks_pending.value.ui64, 1590 tqks->tqks_tasks_priority.value.ui64, 1591 tqks->tqks_tasks_delayed.value.ui64); 1592 1593 mutex_exit(tq->tq_ksp->ks_lock); 1594 1595 n = snprintf(buf, size, "%-20s | %-17s | %-23s\n", 1596 name, threads, tasks); 1597 if (n >= size) { 1598 err = ENOMEM; 1599 break; 1600 } 1601 1602 buf = &buf[n]; 1603 size -= n; 1604 } 1605 1606 up_read(&tq_list_sem); 1607 1608 return (err); 1609 } 1610 1611 static void 1612 spl_taskq_kstat_init(void) 1613 { 1614 kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc", 1615 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 1616 1617 if (ksp == NULL) 1618 return; 1619 1620 ksp->ks_data = (void *)(uintptr_t)1; 1621 ksp->ks_ndata = 1; 1622 kstat_set_raw_ops(ksp, spl_taskq_kstat_headers, 1623 spl_taskq_kstat_data, NULL); 1624 kstat_install(ksp); 1625 1626 taskq_summary_ksp = ksp; 1627 } 1628 1629 static void 1630 spl_taskq_kstat_fini(void) 1631 { 1632 if (taskq_summary_ksp == NULL) 1633 return; 1634 1635 kstat_delete(taskq_summary_ksp); 1636 taskq_summary_ksp = NULL; 1637 } 1638 1639 static unsigned int spl_taskq_kick = 0; 1640 1641 static int 1642 param_set_taskq_kick(const char *val, zfs_kernel_param_t *kp) 1643 { 1644 int ret; 1645 taskq_t *tq = NULL; 1646 taskq_ent_t *t; 1647 unsigned long flags; 1648 1649 ret = param_set_uint(val, kp); 1650 if (ret < 0 || !spl_taskq_kick) 1651 return (ret); 1652 /* reset value */ 1653 spl_taskq_kick = 0; 1654 1655 down_read(&tq_list_sem); 1656 list_for_each_entry(tq, &tq_list, tq_taskqs) { 1657 spin_lock_irqsave_nested(&tq->tq_lock, flags, 1658 tq->tq_lock_class); 1659 /* Check if the first pending is older than 5 seconds */ 1660 t = taskq_next_ent(tq); 1661 if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { 1662 (void) taskq_thread_spawn(tq); 1663 printk(KERN_INFO "spl: Kicked taskq %s/%d\n", 1664 tq->tq_name, tq->tq_instance); 1665 } 1666 spin_unlock_irqrestore(&tq->tq_lock, flags); 1667 } 1668 up_read(&tq_list_sem); 1669 return (ret); 1670 } 1671 1672 module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, 1673 &spl_taskq_kick, 0644); 1674 MODULE_PARM_DESC(spl_taskq_kick, 1675 "Write nonzero to kick stuck taskqs to spawn more threads"); 1676 1677 /* 1678 * This callback will be called exactly once for each core that comes online, 1679 * for each dynamic taskq. We attempt to expand taskqs that have 1680 * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every 1681 * time, to correctly determine whether or not to add a thread. 1682 */ 1683 static int 1684 spl_taskq_expand(unsigned int cpu, struct hlist_node *node) 1685 { 1686 taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); 1687 unsigned long flags; 1688 int err = 0; 1689 1690 ASSERT(tq); 1691 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 1692 1693 if (!(tq->tq_flags & TASKQ_ACTIVE)) { 1694 spin_unlock_irqrestore(&tq->tq_lock, flags); 1695 return (err); 1696 } 1697 1698 ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); 1699 int nthreads = MIN(tq->tq_cpu_pct, 100); 1700 nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); 1701 tq->tq_maxthreads = nthreads; 1702 1703 if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && 1704 tq->tq_maxthreads > tq->tq_nthreads) { 1705 spin_unlock_irqrestore(&tq->tq_lock, flags); 1706 taskq_thread_t *tqt = taskq_thread_create(tq); 1707 if (tqt == NULL) 1708 err = -1; 1709 return (err); 1710 } 1711 spin_unlock_irqrestore(&tq->tq_lock, flags); 1712 return (err); 1713 } 1714 1715 /* 1716 * While we don't support offlining CPUs, it is possible that CPUs will fail 1717 * to online successfully. We do need to be able to handle this case 1718 * gracefully. 1719 */ 1720 static int 1721 spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) 1722 { 1723 taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); 1724 unsigned long flags; 1725 1726 ASSERT(tq); 1727 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); 1728 1729 if (!(tq->tq_flags & TASKQ_ACTIVE)) 1730 goto out; 1731 1732 ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); 1733 int nthreads = MIN(tq->tq_cpu_pct, 100); 1734 nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); 1735 tq->tq_maxthreads = nthreads; 1736 1737 if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && 1738 tq->tq_maxthreads < tq->tq_nthreads) { 1739 ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); 1740 taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, 1741 taskq_thread_t, tqt_thread_list); 1742 struct task_struct *thread = tqt->tqt_thread; 1743 spin_unlock_irqrestore(&tq->tq_lock, flags); 1744 1745 kthread_stop(thread); 1746 1747 return (0); 1748 } 1749 1750 out: 1751 spin_unlock_irqrestore(&tq->tq_lock, flags); 1752 return (0); 1753 } 1754 1755 int 1756 spl_taskq_init(void) 1757 { 1758 init_rwsem(&tq_list_sem); 1759 tsd_create(&taskq_tsd, NULL); 1760 1761 spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, 1762 "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); 1763 1764 system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), 1765 maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1766 if (system_taskq == NULL) 1767 return (-ENOMEM); 1768 1769 system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), 1770 maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1771 if (system_delay_taskq == NULL) { 1772 cpuhp_remove_multi_state(spl_taskq_cpuhp_state); 1773 taskq_destroy(system_taskq); 1774 return (-ENOMEM); 1775 } 1776 1777 dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, 1778 maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); 1779 if (dynamic_taskq == NULL) { 1780 cpuhp_remove_multi_state(spl_taskq_cpuhp_state); 1781 taskq_destroy(system_taskq); 1782 taskq_destroy(system_delay_taskq); 1783 return (-ENOMEM); 1784 } 1785 1786 /* 1787 * This is used to annotate tq_lock, so 1788 * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch 1789 * does not trigger a lockdep warning re: possible recursive locking 1790 */ 1791 dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; 1792 1793 spl_taskq_kstat_init(); 1794 1795 return (0); 1796 } 1797 1798 void 1799 spl_taskq_fini(void) 1800 { 1801 spl_taskq_kstat_fini(); 1802 1803 taskq_destroy(dynamic_taskq); 1804 dynamic_taskq = NULL; 1805 1806 taskq_destroy(system_delay_taskq); 1807 system_delay_taskq = NULL; 1808 1809 taskq_destroy(system_taskq); 1810 system_taskq = NULL; 1811 1812 tsd_destroy(&taskq_tsd); 1813 1814 cpuhp_remove_multi_state(spl_taskq_cpuhp_state); 1815 spl_taskq_cpuhp_state = 0; 1816 } 1817