xref: /titanic_51/usr/src/uts/common/os/ddi_intr_irm.c (revision dd313879ab7d5d89df4625e2b3763fc24c76a3e8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/note.h>
27 #include <sys/sysmacros.h>
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kmem.h>
32 #include <sys/cmn_err.h>
33 #include <sys/debug.h>
34 #include <sys/ddi.h>
35 #include <sys/sunndi.h>
36 #include <sys/ndi_impldefs.h>	/* include prototypes */
37 
38 /*
39  * Interrupt Resource Management (IRM).
40  */
41 
42 #define	DDI_IRM_BALANCE_DELAY	(60)	/* In seconds */
43 
44 #define	DDI_IRM_HAS_CB(c)	((c) && (c->cb_flags & DDI_CB_FLAG_INTR))
45 
46 #define	DDI_IRM_IS_REDUCIBLE(r)	(((r->ireq_flags & DDI_IRM_FLAG_CALLBACK) && \
47 				(r->ireq_type == DDI_INTR_TYPE_MSIX)) || \
48 				(r->ireq_flags & DDI_IRM_FLAG_NEW))
49 
50 extern pri_t	minclsyspri;
51 
52 /* Global policies */
53 int		irm_enable = 1;
54 boolean_t	irm_active = B_FALSE;
55 int		irm_default_policy = DDI_IRM_POLICY_LARGE;
56 uint_t		irm_balance_delay = DDI_IRM_BALANCE_DELAY;
57 
58 /* Global list of interrupt pools */
59 kmutex_t	irm_pools_lock;
60 list_t		irm_pools_list;
61 
62 /* Global debug tunables */
63 #ifdef	DEBUG
64 int		irm_debug_policy = 0;
65 uint_t		irm_debug_size = 0;
66 #endif	/* DEBUG */
67 
68 static void	irm_balance_thread(ddi_irm_pool_t *);
69 static void	i_ddi_irm_balance(ddi_irm_pool_t *);
70 static void	i_ddi_irm_enqueue(ddi_irm_pool_t *, boolean_t);
71 static void	i_ddi_irm_reduce(ddi_irm_pool_t *pool);
72 static int	i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *, int, int);
73 static void	i_ddi_irm_reduce_new(ddi_irm_pool_t *, int);
74 static void	i_ddi_irm_insertion_sort(list_t *, ddi_irm_req_t *);
75 static int	i_ddi_irm_notify(ddi_irm_pool_t *, ddi_irm_req_t *);
76 
77 /*
78  * OS Initialization Routines
79  */
80 
81 /*
82  * irm_init()
83  *
84  *	Initialize IRM subsystem before any drivers are attached.
85  */
86 void
87 irm_init(void)
88 {
89 	/* Do nothing if IRM is disabled */
90 	if (!irm_enable)
91 		return;
92 
93 	/* Verify that the default balancing policy is valid */
94 	if (!DDI_IRM_POLICY_VALID(irm_default_policy))
95 		irm_default_policy = DDI_IRM_POLICY_LARGE;
96 
97 	/* Initialize the global list of interrupt pools */
98 	mutex_init(&irm_pools_lock, NULL, MUTEX_DRIVER, NULL);
99 	list_create(&irm_pools_list, sizeof (ddi_irm_pool_t),
100 	    offsetof(ddi_irm_pool_t, ipool_link));
101 }
102 
103 /*
104  * i_ddi_irm_poststartup()
105  *
106  *	IRM is not activated until after the IO subsystem is initialized.
107  *	When activated, per-pool balancing threads are spawned and a flag
108  *	is set so that all future pools will be activated when created.
109  *
110  *	NOTE: the global variable 'irm_enable' disables IRM if zero.
111  */
112 void
113 i_ddi_irm_poststartup(void)
114 {
115 	ddi_irm_pool_t	*pool_p;
116 
117 	/* Do nothing if IRM is disabled */
118 	if (!irm_enable)
119 		return;
120 
121 	/* Lock the global list */
122 	mutex_enter(&irm_pools_lock);
123 
124 	/* Activate all defined pools */
125 	for (pool_p = list_head(&irm_pools_list); pool_p;
126 	    pool_p = list_next(&irm_pools_list, pool_p))
127 		pool_p->ipool_thread = thread_create(NULL, 0,
128 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
129 
130 	/* Set future pools to be active */
131 	irm_active = B_TRUE;
132 
133 	/* Unlock the global list */
134 	mutex_exit(&irm_pools_lock);
135 }
136 
137 /*
138  * NDI interfaces for creating/destroying IRM pools.
139  */
140 
141 /*
142  * ndi_irm_create()
143  *
144  *	Nexus interface to create an IRM pool.  Create the new
145  *	pool and add it to the global list of interrupt pools.
146  */
147 int
148 ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
149     ddi_irm_pool_t **pool_retp)
150 {
151 	ddi_irm_pool_t	*pool_p;
152 
153 	ASSERT(dip != NULL);
154 	ASSERT(paramsp != NULL);
155 	ASSERT(pool_retp != NULL);
156 	ASSERT(paramsp->iparams_total >= 1);
157 	ASSERT(paramsp->iparams_types != 0);
158 
159 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_create: dip %p\n", (void *)dip));
160 
161 	/* Check if IRM is enabled */
162 	if (!irm_enable)
163 		return (NDI_FAILURE);
164 
165 	/* Validate parameters */
166 	if ((dip == NULL) || (paramsp == NULL) || (pool_retp == NULL) ||
167 	    (paramsp->iparams_total < 1) || (paramsp->iparams_types == 0))
168 		return (NDI_FAILURE);
169 
170 	/* Allocate and initialize the pool */
171 	pool_p = kmem_zalloc(sizeof (ddi_irm_pool_t), KM_SLEEP);
172 	pool_p->ipool_owner = dip;
173 	pool_p->ipool_policy = irm_default_policy;
174 	pool_p->ipool_types = paramsp->iparams_types;
175 	pool_p->ipool_totsz = paramsp->iparams_total;
176 	pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, MAX(DDI_MIN_MSIX_ALLOC,
177 	    paramsp->iparams_total / DDI_MSIX_ALLOC_DIVIDER));
178 	list_create(&pool_p->ipool_req_list, sizeof (ddi_irm_req_t),
179 	    offsetof(ddi_irm_req_t, ireq_link));
180 	list_create(&pool_p->ipool_scratch_list, sizeof (ddi_irm_req_t),
181 	    offsetof(ddi_irm_req_t, ireq_scratch_link));
182 	cv_init(&pool_p->ipool_cv, NULL, CV_DRIVER, NULL);
183 	mutex_init(&pool_p->ipool_lock, NULL, MUTEX_DRIVER, NULL);
184 	mutex_init(&pool_p->ipool_navail_lock, NULL, MUTEX_DRIVER, NULL);
185 
186 	/* Add to global list of pools */
187 	mutex_enter(&irm_pools_lock);
188 	list_insert_tail(&irm_pools_list, pool_p);
189 	mutex_exit(&irm_pools_lock);
190 
191 	/* If IRM is active, then activate the pool */
192 	if (irm_active)
193 		pool_p->ipool_thread = thread_create(NULL, 0,
194 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
195 
196 	*pool_retp = pool_p;
197 	return (NDI_SUCCESS);
198 }
199 
200 /*
201  * ndi_irm_destroy()
202  *
203  *	Nexus interface to destroy an IRM pool.  Destroy the pool
204  *	and remove it from the global list of interrupt pools.
205  */
206 int
207 ndi_irm_destroy(ddi_irm_pool_t *pool_p)
208 {
209 	ASSERT(pool_p != NULL);
210 	ASSERT(pool_p->ipool_resno == 0);
211 
212 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_destroy: pool_p %p\n",
213 	    (void *)pool_p));
214 
215 	/* Validate parameters */
216 	if (pool_p == NULL)
217 		return (NDI_FAILURE);
218 
219 	/* Validate that pool is empty */
220 	if (pool_p->ipool_resno != 0)
221 		return (NDI_BUSY);
222 
223 	/* Remove the pool from the global list */
224 	mutex_enter(&irm_pools_lock);
225 	list_remove(&irm_pools_list, pool_p);
226 	mutex_exit(&irm_pools_lock);
227 
228 	/* Terminate the balancing thread */
229 	mutex_enter(&pool_p->ipool_lock);
230 	if (pool_p->ipool_thread &&
231 	    (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) {
232 		pool_p->ipool_flags |= DDI_IRM_FLAG_EXIT;
233 		cv_signal(&pool_p->ipool_cv);
234 		mutex_exit(&pool_p->ipool_lock);
235 		thread_join(pool_p->ipool_thread->t_did);
236 	} else
237 		mutex_exit(&pool_p->ipool_lock);
238 
239 	/* Destroy the pool */
240 	cv_destroy(&pool_p->ipool_cv);
241 	mutex_destroy(&pool_p->ipool_lock);
242 	mutex_destroy(&pool_p->ipool_navail_lock);
243 	list_destroy(&pool_p->ipool_req_list);
244 	list_destroy(&pool_p->ipool_scratch_list);
245 	kmem_free(pool_p, sizeof (ddi_irm_pool_t));
246 
247 	return (NDI_SUCCESS);
248 }
249 
250 /*
251  * Insert/Modify/Remove Interrupt Requests
252  */
253 
254 /*
255  * i_ddi_irm_insert()
256  *
257  *	Insert a new request into an interrupt pool, and balance the pool.
258  */
259 int
260 i_ddi_irm_insert(dev_info_t *dip, int type, int count)
261 {
262 	ddi_cb_t	*cb_p;
263 	ddi_irm_req_t	*req_p;
264 	devinfo_intr_t	*intr_p;
265 	ddi_irm_pool_t	*pool_p;
266 	uint_t		nreq, nmin, npartial;
267 	boolean_t	irm_flag = B_FALSE;
268 
269 	ASSERT(dip != NULL);
270 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
271 	ASSERT(count > 0);
272 
273 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: dip %p type %d count %d\n",
274 	    (void *)dip, type, count));
275 
276 	/* Validate parameters */
277 	if ((dip == NULL) || (count < 1) || !DDI_INTR_TYPE_FLAG_VALID(type)) {
278 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: invalid args\n"));
279 		return (DDI_EINVAL);
280 	}
281 
282 	/* Check for an existing request */
283 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
284 	    (intr_p->devi_irm_req_p != NULL))
285 		return (DDI_SUCCESS);
286 
287 	/* Check for IRM support from the system */
288 	if ((pool_p = i_ddi_intr_get_pool(dip, type)) == NULL) {
289 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: not supported\n"));
290 		return (DDI_ENOTSUP);
291 	}
292 
293 	/* Check for IRM support from the driver */
294 	if (((cb_p = DEVI(dip)->devi_cb_p) != NULL) && DDI_IRM_HAS_CB(cb_p) &&
295 	    (type == DDI_INTR_TYPE_MSIX))
296 		irm_flag = B_TRUE;
297 
298 	/* Determine request size */
299 	nreq = (irm_flag) ? count :
300 	    MIN(count, i_ddi_intr_get_current_navail(dip, type));
301 	nmin = (irm_flag) ? 1 : nreq;
302 	npartial = MIN(nreq, pool_p->ipool_defsz);
303 
304 	/* Allocate and initialize the request */
305 	req_p = kmem_zalloc(sizeof (ddi_irm_req_t), KM_SLEEP);
306 	req_p->ireq_type = type;
307 	req_p->ireq_dip = dip;
308 	req_p->ireq_pool_p = pool_p;
309 	req_p->ireq_nreq = nreq;
310 	req_p->ireq_flags = DDI_IRM_FLAG_NEW;
311 	if (DDI_IRM_HAS_CB(cb_p))
312 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
313 
314 	/* Lock the pool */
315 	mutex_enter(&pool_p->ipool_lock);
316 
317 	/* Check for minimal fit before inserting */
318 	if ((pool_p->ipool_minno + nmin) > pool_p->ipool_totsz) {
319 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
320 		    ddi_driver_name(dip), ddi_get_instance(dip));
321 		mutex_exit(&pool_p->ipool_lock);
322 		kmem_free(req_p, sizeof (ddi_irm_req_t));
323 		return (DDI_EAGAIN);
324 	}
325 
326 	/* Insert the request into the pool */
327 	pool_p->ipool_reqno += nreq;
328 	pool_p->ipool_minno += nmin;
329 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
330 
331 	/*
332 	 * Try to fulfill the request.
333 	 *
334 	 * If all the interrupts are available, and either the request
335 	 * is static or the pool is active, then just take them directly.
336 	 *
337 	 * If only some of the interrupts are available, and the request
338 	 * can receive future callbacks, then take some now but queue the
339 	 * pool to be rebalanced later.
340 	 *
341 	 * Otherwise, immediately rebalance the pool and wait.
342 	 */
343 	if ((!irm_flag || (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) &&
344 	    ((pool_p->ipool_resno + nreq) <= pool_p->ipool_totsz)) {
345 
346 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
347 		    "request completely fulfilled.\n"));
348 		pool_p->ipool_resno += nreq;
349 		req_p->ireq_navail = nreq;
350 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
351 
352 	} else if (irm_flag &&
353 	    ((pool_p->ipool_resno + npartial) <= pool_p->ipool_totsz)) {
354 
355 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
356 		    "request partially fulfilled.\n"));
357 		pool_p->ipool_resno += npartial;
358 		req_p->ireq_navail = npartial;
359 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
360 		i_ddi_irm_enqueue(pool_p, B_FALSE);
361 
362 	} else {
363 
364 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
365 		    "request needs immediate rebalance.\n"));
366 		i_ddi_irm_enqueue(pool_p, B_TRUE);
367 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
368 	}
369 
370 	/* Fail if the request cannot be fulfilled at all */
371 	if (req_p->ireq_navail == 0) {
372 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
373 		    ddi_driver_name(dip), ddi_get_instance(dip));
374 		pool_p->ipool_reqno -= nreq;
375 		pool_p->ipool_minno -= nmin;
376 		list_remove(&pool_p->ipool_req_list, req_p);
377 		mutex_exit(&pool_p->ipool_lock);
378 		kmem_free(req_p, sizeof (ddi_irm_req_t));
379 		return (DDI_EAGAIN);
380 	}
381 
382 	/* Unlock the pool */
383 	mutex_exit(&pool_p->ipool_lock);
384 
385 	intr_p->devi_irm_req_p = req_p;
386 	return (DDI_SUCCESS);
387 }
388 
389 /*
390  * i_ddi_irm_modify()
391  *
392  *	Modify an existing request in an interrupt pool, and balance the pool.
393  */
394 int
395 i_ddi_irm_modify(dev_info_t *dip, int nreq)
396 {
397 	devinfo_intr_t	*intr_p;
398 	ddi_irm_req_t	*req_p;
399 	ddi_irm_pool_t	*pool_p;
400 
401 	ASSERT(dip != NULL);
402 
403 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: dip %p nreq %d\n",
404 	    (void *)dip, nreq));
405 
406 	/* Validate parameters */
407 	if ((dip == NULL) || (nreq < 1)) {
408 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
409 		return (DDI_EINVAL);
410 	}
411 
412 	/* Check that the operation is supported */
413 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
414 	    !(req_p = intr_p->devi_irm_req_p) ||
415 	    !DDI_IRM_IS_REDUCIBLE(req_p)) {
416 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not supported\n"));
417 		return (DDI_ENOTSUP);
418 	}
419 
420 	/* Validate request size is not too large */
421 	if (nreq > intr_p->devi_intr_sup_nintrs) {
422 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
423 		return (DDI_EINVAL);
424 	}
425 
426 	/*
427 	 * Modify request, but only if new size is different.
428 	 */
429 	if (nreq != req_p->ireq_nreq) {
430 
431 		/* Lock the pool */
432 		pool_p = req_p->ireq_pool_p;
433 		mutex_enter(&pool_p->ipool_lock);
434 
435 		/* Update pool and request */
436 		pool_p->ipool_reqno -= req_p->ireq_nreq;
437 		pool_p->ipool_reqno += nreq;
438 		req_p->ireq_nreq = nreq;
439 
440 		/* Re-sort request in the pool */
441 		list_remove(&pool_p->ipool_req_list, req_p);
442 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
443 
444 		/* Queue pool to be rebalanced */
445 		i_ddi_irm_enqueue(pool_p, B_FALSE);
446 
447 		/* Unlock the pool */
448 		mutex_exit(&pool_p->ipool_lock);
449 	}
450 
451 	return (DDI_SUCCESS);
452 }
453 
454 /*
455  * i_ddi_irm_remove()
456  *
457  *	Remove a request from an interrupt pool, and balance the pool.
458  */
459 int
460 i_ddi_irm_remove(dev_info_t *dip)
461 {
462 	devinfo_intr_t	*intr_p;
463 	ddi_irm_pool_t	*pool_p;
464 	ddi_irm_req_t	*req_p;
465 	uint_t		nmin;
466 
467 	ASSERT(dip != NULL);
468 
469 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: dip %p\n", (void *)dip));
470 
471 	/* Validate parameters */
472 	if (dip == NULL) {
473 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: invalid args\n"));
474 		return (DDI_EINVAL);
475 	}
476 
477 	/* Check if the device has a request */
478 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
479 	    !(req_p = intr_p->devi_irm_req_p)) {
480 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not found\n"));
481 		return (DDI_EINVAL);
482 	}
483 
484 	/* Lock the pool */
485 	pool_p = req_p->ireq_pool_p;
486 	mutex_enter(&pool_p->ipool_lock);
487 
488 	/* Remove request */
489 	nmin = DDI_IRM_IS_REDUCIBLE(req_p) ? 1 : req_p->ireq_nreq;
490 	pool_p->ipool_minno -= nmin;
491 	pool_p->ipool_reqno -= req_p->ireq_nreq;
492 	pool_p->ipool_resno -= req_p->ireq_navail;
493 	list_remove(&pool_p->ipool_req_list, req_p);
494 
495 	/* Queue pool to be rebalanced */
496 	i_ddi_irm_enqueue(pool_p, B_FALSE);
497 
498 	/* Unlock the pool */
499 	mutex_exit(&pool_p->ipool_lock);
500 
501 	/* Destroy the request */
502 	intr_p->devi_irm_req_p = NULL;
503 	kmem_free(req_p, sizeof (ddi_irm_req_t));
504 
505 	return (DDI_SUCCESS);
506 }
507 
508 /*
509  * i_ddi_irm_set_cb()
510  *
511  *	Change the callback flag for a request, in response to
512  *	a change in its callback registration.  Then rebalance
513  *	the interrupt pool.
514  *
515  *	NOTE: the request is not locked because the navail value
516  *	      is not directly affected.  The balancing thread may
517  *	      modify the navail value in the background after it
518  *	      locks the request itself.
519  */
520 void
521 i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
522 {
523 	devinfo_intr_t	*intr_p;
524 	ddi_irm_pool_t	*pool_p;
525 	ddi_irm_req_t	*req_p;
526 	uint_t		nreq;
527 
528 	ASSERT(dip != NULL);
529 
530 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: dip %p has_cb_flag %d\n",
531 	    (void *)dip, (int)has_cb_flag));
532 
533 	/* Validate parameters */
534 	if (dip == NULL)
535 		return;
536 
537 	/* Check for association with interrupt pool */
538 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
539 	    !(req_p = intr_p->devi_irm_req_p)) {
540 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: not in pool\n"));
541 		return;
542 	}
543 
544 	/* Lock the pool */
545 	pool_p = req_p->ireq_pool_p;
546 	mutex_enter(&pool_p->ipool_lock);
547 
548 	/*
549 	 * Update the request and the pool
550 	 */
551 	if (has_cb_flag) {
552 
553 		/* Update pool statistics */
554 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
555 			pool_p->ipool_minno -= (req_p->ireq_nreq - 1);
556 
557 		/* Update request */
558 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
559 
560 		/* Rebalance in background */
561 		i_ddi_irm_enqueue(pool_p, B_FALSE);
562 
563 	} else {
564 
565 		/* Determine new request size */
566 		nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
567 
568 		/* Update pool statistics */
569 		pool_p->ipool_reqno -= req_p->ireq_nreq;
570 		pool_p->ipool_reqno += nreq;
571 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) {
572 			pool_p->ipool_minno -= 1;
573 			pool_p->ipool_minno += nreq;
574 		} else {
575 			pool_p->ipool_minno -= req_p->ireq_nreq;
576 			pool_p->ipool_minno += nreq;
577 		}
578 
579 		/* Update request size, and re-sort in pool */
580 		req_p->ireq_nreq = nreq;
581 		list_remove(&pool_p->ipool_req_list, req_p);
582 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
583 
584 		/* Rebalance synchronously, before losing callback */
585 		i_ddi_irm_enqueue(pool_p, B_TRUE);
586 
587 		/* Remove callback flag */
588 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_CALLBACK);
589 	}
590 
591 	/* Unlock the pool */
592 	mutex_exit(&pool_p->ipool_lock);
593 }
594 
595 /*
596  * Interrupt Pool Balancing
597  */
598 
599 /*
600  * irm_balance_thread()
601  *
602  *	One instance of this thread operates per each defined IRM pool.
603  *	It does the initial activation of the pool, as well as balancing
604  *	any requests that were queued up before the pool was active.
605  *	Once active, it waits forever to service balance operations.
606  */
607 static void
608 irm_balance_thread(ddi_irm_pool_t *pool_p)
609 {
610 	clock_t		interval, wakeup;
611 
612 	DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: pool_p %p\n",
613 	    (void *)pool_p));
614 
615 	/* Lock the pool */
616 	mutex_enter(&pool_p->ipool_lock);
617 
618 	/* Perform initial balance if required */
619 	if (pool_p->ipool_reqno > pool_p->ipool_resno)
620 		i_ddi_irm_balance(pool_p);
621 
622 	/* Activate the pool */
623 	pool_p->ipool_flags |= DDI_IRM_FLAG_ACTIVE;
624 
625 	/* Main loop */
626 	for (;;) {
627 
628 		/* Compute the delay interval */
629 		interval = drv_usectohz(irm_balance_delay * 1000000);
630 
631 		/* Sleep until queued */
632 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
633 
634 		DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: signaled.\n"));
635 
636 		/* Wait one interval, or until there are waiters */
637 		if ((interval > 0) &&
638 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) &&
639 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_EXIT)) {
640 			wakeup = ddi_get_lbolt() + interval;
641 			(void) cv_timedwait(&pool_p->ipool_cv,
642 			    &pool_p->ipool_lock, wakeup);
643 		}
644 
645 		/* Check if awakened to exit */
646 		if (pool_p->ipool_flags & DDI_IRM_FLAG_EXIT) {
647 			DDI_INTR_IRMDBG((CE_CONT,
648 			    "irm_balance_thread: exiting...\n"));
649 			mutex_exit(&pool_p->ipool_lock);
650 			thread_exit();
651 		}
652 
653 		/* Balance the pool */
654 		i_ddi_irm_balance(pool_p);
655 
656 		/* Notify waiters */
657 		if (pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) {
658 			cv_broadcast(&pool_p->ipool_cv);
659 			pool_p->ipool_flags &= ~(DDI_IRM_FLAG_WAITERS);
660 		}
661 
662 		/* Clear QUEUED condition */
663 		pool_p->ipool_flags &= ~(DDI_IRM_FLAG_QUEUED);
664 	}
665 }
666 
667 /*
668  * i_ddi_irm_balance()
669  *
670  *	Balance a pool.  The general algorithm is to first reset all
671  *	requests to their maximum size, use reduction algorithms to
672  *	solve any imbalance, and then notify affected drivers.
673  */
674 static void
675 i_ddi_irm_balance(ddi_irm_pool_t *pool_p)
676 {
677 	ddi_irm_req_t	*req_p;
678 
679 #ifdef	DEBUG
680 	uint_t		debug_totsz = 0;
681 	int		debug_policy = 0;
682 #endif	/* DEBUG */
683 
684 	ASSERT(pool_p != NULL);
685 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
686 
687 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: pool_p %p\n",
688 	    (void *)pool_p));
689 
690 #ifdef	DEBUG	/* Adjust size and policy settings */
691 	if (irm_debug_size > pool_p->ipool_minno) {
692 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: debug size %d\n",
693 		    irm_debug_size));
694 		debug_totsz = pool_p->ipool_totsz;
695 		pool_p->ipool_totsz = irm_debug_size;
696 	}
697 	if (DDI_IRM_POLICY_VALID(irm_debug_policy)) {
698 		DDI_INTR_IRMDBG((CE_CONT,
699 		    "i_ddi_irm_balance: debug policy %d\n", irm_debug_policy));
700 		debug_policy = pool_p->ipool_policy;
701 		pool_p->ipool_policy = irm_debug_policy;
702 	}
703 #endif	/* DEBUG */
704 
705 	/* Lock the availability lock */
706 	mutex_enter(&pool_p->ipool_navail_lock);
707 
708 	/*
709 	 * Put all of the reducible requests into a scratch list.
710 	 * Reset each one of them to their maximum availability.
711 	 */
712 	for (req_p = list_head(&pool_p->ipool_req_list); req_p;
713 	    req_p = list_next(&pool_p->ipool_req_list, req_p)) {
714 		if (DDI_IRM_IS_REDUCIBLE(req_p)) {
715 			pool_p->ipool_resno -= req_p->ireq_navail;
716 			req_p->ireq_scratch = req_p->ireq_navail;
717 			req_p->ireq_navail = req_p->ireq_nreq;
718 			pool_p->ipool_resno += req_p->ireq_navail;
719 			list_insert_tail(&pool_p->ipool_scratch_list, req_p);
720 		}
721 	}
722 
723 	/* Balance the requests */
724 	i_ddi_irm_reduce(pool_p);
725 
726 	/* Unlock the availability lock */
727 	mutex_exit(&pool_p->ipool_navail_lock);
728 
729 	/*
730 	 * Process REMOVE notifications.
731 	 *
732 	 * If a driver fails to release interrupts: exclude it from
733 	 * further processing, correct the resulting imbalance, and
734 	 * start over again at the head of the scratch list.
735 	 */
736 	req_p = list_head(&pool_p->ipool_scratch_list);
737 	while (req_p) {
738 		if ((req_p->ireq_navail < req_p->ireq_scratch) &&
739 		    (i_ddi_irm_notify(pool_p, req_p) != DDI_SUCCESS)) {
740 			list_remove(&pool_p->ipool_scratch_list, req_p);
741 			mutex_enter(&pool_p->ipool_navail_lock);
742 			i_ddi_irm_reduce(pool_p);
743 			mutex_exit(&pool_p->ipool_navail_lock);
744 			req_p = list_head(&pool_p->ipool_scratch_list);
745 		} else {
746 			req_p = list_next(&pool_p->ipool_scratch_list, req_p);
747 		}
748 	}
749 
750 	/*
751 	 * Process ADD notifications.
752 	 *
753 	 * This is the last use of the scratch list, so empty it.
754 	 */
755 	while (req_p = list_remove_head(&pool_p->ipool_scratch_list)) {
756 		if (req_p->ireq_navail > req_p->ireq_scratch) {
757 			(void) i_ddi_irm_notify(pool_p, req_p);
758 		}
759 	}
760 
761 #ifdef	DEBUG	/* Restore size and policy settings */
762 	if (debug_totsz != 0)
763 		pool_p->ipool_totsz = debug_totsz;
764 	if (debug_policy != 0)
765 		pool_p->ipool_policy = debug_policy;
766 #endif	/* DEBUG */
767 }
768 
769 /*
770  * i_ddi_irm_reduce()
771  *
772  *	Use reduction algorithms to correct an imbalance in a pool.
773  */
774 static void
775 i_ddi_irm_reduce(ddi_irm_pool_t *pool_p)
776 {
777 	int	imbalance;
778 
779 	ASSERT(pool_p != NULL);
780 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
781 	ASSERT(DDI_IRM_POLICY_VALID(pool_p->ipool_policy));
782 
783 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_reduce: pool_p %p\n",
784 	    (void *)pool_p));
785 
786 	/* Compute the imbalance.  Do nothing if already balanced. */
787 	if ((imbalance = pool_p->ipool_resno - pool_p->ipool_totsz) <= 0)
788 		return;
789 
790 	/*
791 	 * Try policy based reduction first. If it failed, then
792 	 * possibly reduce new requests as a last resort.
793 	 */
794 	if (i_ddi_irm_reduce_by_policy(pool_p, imbalance, pool_p->ipool_policy)
795 	    != DDI_SUCCESS) {
796 
797 		DDI_INTR_IRMDBG((CE_CONT,
798 		    "i_ddi_irm_reduce: policy reductions failed.\n"));
799 
800 		/* Compute remaining imbalance */
801 		imbalance = pool_p->ipool_resno - pool_p->ipool_totsz;
802 
803 		ASSERT(imbalance > 0);
804 
805 		i_ddi_irm_reduce_new(pool_p, imbalance);
806 	}
807 }
808 
809 /*
810  * i_ddi_irm_enqueue()
811  *
812  *	Queue a pool to be balanced.  Signals the balancing thread to wake
813  *	up and process the pool.  If 'wait_flag' is true, then the current
814  *	thread becomes a waiter and blocks until the balance is completed.
815  */
816 static void
817 i_ddi_irm_enqueue(ddi_irm_pool_t *pool_p, boolean_t wait_flag)
818 {
819 	ASSERT(pool_p != NULL);
820 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
821 
822 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool_p %p wait_flag %d\n",
823 	    (void *)pool_p, (int)wait_flag));
824 
825 	/* Do nothing if pool is already balanced */
826 #ifndef	DEBUG
827 	if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
828 #else
829 	if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
830 #endif	/* DEBUG */
831 		DDI_INTR_IRMDBG((CE_CONT,
832 		    "i_ddi_irm_enqueue: pool already balanced\n"));
833 		return;
834 	}
835 
836 	/* Avoid deadlocks when IRM is not active */
837 	if (!irm_active && wait_flag) {
838 		DDI_INTR_IRMDBG((CE_CONT,
839 		    "i_ddi_irm_enqueue: pool not active.\n"));
840 		return;
841 	}
842 
843 	if (wait_flag)
844 		pool_p->ipool_flags |= DDI_IRM_FLAG_WAITERS;
845 
846 	if (wait_flag || !(pool_p->ipool_flags & DDI_IRM_FLAG_QUEUED)) {
847 		pool_p->ipool_flags |= DDI_IRM_FLAG_QUEUED;
848 		cv_signal(&pool_p->ipool_cv);
849 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool queued.\n"));
850 	}
851 
852 	if (wait_flag) {
853 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: waiting...\n"));
854 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
855 	}
856 }
857 
858 /*
859  * i_ddi_irm_reduce_by_policy()
860  *
861  *	Reduces requests based on reduction policies.
862  *
863  *	For the DDI_IRM_POLICY_LARGE reduction policy, the algorithm
864  *	generally reduces larger requests first, before advancing
865  *	to smaller requests.
866  *	For the DDI_IRM_POLICY_EVEN reduction policy, the algorithm
867  *	reduces requests evenly, without giving a specific preference
868  *	to smaller or larger requests. Each iteration reduces all
869  *	reducible requests by the same amount until the imbalance is
870  *	corrected.
871  *
872  *	The scratch list is initially sorted in descending order by current
873  *	navail values, which are maximized prior to reduction. This sorted
874  *	order is preserved.  It avoids reducing requests below the threshold
875  *	of the interrupt pool's default allocation size.
876  *
877  *	Optimizations in this algorithm include trying to reduce multiple
878  *	requests together.  And the algorithm attempts to reduce in larger
879  *	increments when possible to minimize the total number of iterations.
880  */
881 static int
882 i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *pool_p, int imbalance, int policy)
883 {
884 	ASSERT(pool_p != NULL);
885 	ASSERT(imbalance > 0);
886 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
887 
888 	while (imbalance > 0) {
889 		list_t		*slist_p = &pool_p->ipool_scratch_list;
890 		ddi_irm_req_t	*req_p = list_head(slist_p), *last_p;
891 		uint_t		nreduce = 0, nremain = 0, stop_navail;
892 		uint_t		pool_defsz = pool_p->ipool_defsz;
893 		uint_t		reduction, max_redu;
894 
895 		/* Fail if none are reducible */
896 		if (!req_p || req_p->ireq_navail <= pool_defsz) {
897 			DDI_INTR_IRMDBG((CE_CONT,
898 			    "i_ddi_irm_reduce_by_policy: Failure. "
899 			    "All requests have downsized to low limit.\n"));
900 			return (DDI_FAILURE);
901 		}
902 
903 		/* Count reducible requests */
904 		stop_navail = (policy == DDI_IRM_POLICY_LARGE) ?
905 		    req_p->ireq_navail - 1 : pool_defsz;
906 		for (; req_p; req_p = list_next(slist_p, req_p)) {
907 			if (req_p->ireq_navail <= stop_navail)
908 				break;
909 			nreduce++;
910 		}
911 
912 		/* Compute reduction */
913 		last_p = req_p ? list_prev(slist_p, req_p) : list_tail(slist_p);
914 		if ((policy == DDI_IRM_POLICY_LARGE) && req_p &&
915 		    req_p->ireq_navail > pool_defsz)
916 			reduction = last_p->ireq_navail - req_p->ireq_navail;
917 		else
918 			reduction = last_p->ireq_navail - pool_defsz;
919 
920 		if ((max_redu = reduction * nreduce) > imbalance) {
921 			reduction = imbalance / nreduce;
922 			nremain = imbalance % nreduce;
923 			pool_p->ipool_resno -= imbalance;
924 			imbalance = 0;
925 		} else {
926 			pool_p->ipool_resno -= max_redu;
927 			imbalance -= max_redu;
928 		}
929 
930 		/* Reduce */
931 		for (req_p = list_head(slist_p); (reduction != 0) && nreduce--;
932 		    req_p = list_next(slist_p, req_p)) {
933 			req_p->ireq_navail -= reduction;
934 		}
935 
936 		for (req_p = last_p; nremain--;
937 		    req_p = list_prev(slist_p, req_p)) {
938 			req_p->ireq_navail--;
939 		}
940 	}
941 
942 	return (DDI_SUCCESS);
943 }
944 
945 /*
946  * i_ddi_irm_reduce_new()
947  *
948  *	Reduces new requests.  This is only used as a last resort
949  *	after another reduction algorithm failed.
950  *
951  *	NOTE: The pool locking in i_ddi_irm_insert() ensures
952  *	there can be only one new request at a time in a pool.
953  */
954 static void
955 i_ddi_irm_reduce_new(ddi_irm_pool_t *pool_p, int imbalance)
956 {
957 	ddi_irm_req_t	*req_p;
958 
959 	ASSERT(pool_p != NULL);
960 	ASSERT(imbalance > 0);
961 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
962 
963 	DDI_INTR_IRMDBG((CE_CONT,
964 	    "i_ddi_irm_reduce_new: pool_p %p imbalance %d\n",
965 	    (void *)pool_p, imbalance));
966 
967 	for (req_p = list_head(&pool_p->ipool_scratch_list); req_p;
968 	    req_p = list_next(&pool_p->ipool_scratch_list, req_p)) {
969 		if (req_p->ireq_flags & DDI_IRM_FLAG_NEW) {
970 			ASSERT(req_p->ireq_navail >= imbalance);
971 			req_p->ireq_navail -= imbalance;
972 			pool_p->ipool_resno -= imbalance;
973 			return;
974 		}
975 	}
976 
977 	/* should never go here */
978 	ASSERT(B_FALSE);
979 }
980 
981 /*
982  * Miscellaneous Helper Functions
983  */
984 
985 /*
986  * i_ddi_intr_get_pool()
987  *
988  *	Get an IRM pool that supplies interrupts of a specified type.
989  *	Invokes a DDI_INTROP_GETPOOL to the bus nexus driver.  Fails
990  *	if no pool exists.
991  */
992 ddi_irm_pool_t *
993 i_ddi_intr_get_pool(dev_info_t *dip, int type)
994 {
995 	devinfo_intr_t		*intr_p;
996 	ddi_irm_pool_t		*pool_p;
997 	ddi_irm_req_t		*req_p;
998 	ddi_intr_handle_impl_t	hdl;
999 
1000 	ASSERT(dip != NULL);
1001 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
1002 
1003 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
1004 	    ((req_p = intr_p->devi_irm_req_p) != NULL) &&
1005 	    ((pool_p = req_p->ireq_pool_p) != NULL) &&
1006 	    (pool_p->ipool_types & type)) {
1007 		return (pool_p);
1008 	}
1009 
1010 	bzero(&hdl, sizeof (ddi_intr_handle_impl_t));
1011 	hdl.ih_dip = dip;
1012 	hdl.ih_type = type;
1013 
1014 	if (i_ddi_intr_ops(dip, dip, DDI_INTROP_GETPOOL,
1015 	    &hdl, (void *)&pool_p) == DDI_SUCCESS)
1016 		return (pool_p);
1017 
1018 	return (NULL);
1019 }
1020 
1021 /*
1022  * i_ddi_irm_insertion_sort()
1023  *
1024  *	Use the insertion sort method to insert a request into a list.
1025  *	The list is sorted in descending order by request size.
1026  */
1027 static void
1028 i_ddi_irm_insertion_sort(list_t *req_list, ddi_irm_req_t *req_p)
1029 {
1030 	ddi_irm_req_t	*next_p;
1031 
1032 	next_p = list_head(req_list);
1033 
1034 	while (next_p && (next_p->ireq_nreq > req_p->ireq_nreq))
1035 		next_p = list_next(req_list, next_p);
1036 
1037 	list_insert_before(req_list, next_p, req_p);
1038 }
1039 
1040 /*
1041  * i_ddi_irm_notify()
1042  *
1043  *	Notify a driver of changes to its interrupt request using the
1044  *	generic callback mechanism.  Checks for errors in processing.
1045  */
1046 static int
1047 i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
1048 {
1049 	ddi_cb_action_t	action;
1050 	ddi_cb_t	*cb_p;
1051 	uint_t		nintrs;
1052 	int		ret, count;
1053 
1054 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: pool_p %p req_p %p\n",
1055 	    (void *)pool_p, (void *)req_p));
1056 
1057 	/* Do not notify new or unchanged requests */
1058 	if ((req_p->ireq_navail == req_p->ireq_scratch) ||
1059 	    (req_p->ireq_flags & DDI_IRM_FLAG_NEW))
1060 		return (DDI_SUCCESS);
1061 
1062 	/* Determine action and count */
1063 	if (req_p->ireq_navail > req_p->ireq_scratch) {
1064 		action = DDI_CB_INTR_ADD;
1065 		count = req_p->ireq_navail - req_p->ireq_scratch;
1066 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: adding %d\n",
1067 		    count));
1068 	} else {
1069 		action = DDI_CB_INTR_REMOVE;
1070 		count = req_p->ireq_scratch - req_p->ireq_navail;
1071 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: removing %d\n",
1072 		    count));
1073 	}
1074 
1075 	/* Lookup driver callback */
1076 	if ((cb_p = DEVI(req_p->ireq_dip)->devi_cb_p) == NULL) {
1077 		DDI_INTR_IRMDBG((CE_WARN, "i_ddi_irm_notify: no callback!\n"));
1078 		return (DDI_FAILURE);
1079 	}
1080 
1081 	/* Do callback */
1082 	ret = cb_p->cb_func(req_p->ireq_dip, action, (void *)(uintptr_t)count,
1083 	    cb_p->cb_arg1, cb_p->cb_arg2);
1084 
1085 	/* Log callback errors */
1086 	if (ret != DDI_SUCCESS) {
1087 		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
1088 		    ddi_driver_name(req_p->ireq_dip),
1089 		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);
1090 	}
1091 
1092 	/* Check if the driver exceeds its availability */
1093 	nintrs = i_ddi_intr_get_current_nintrs(req_p->ireq_dip);
1094 	if (nintrs > req_p->ireq_navail) {
1095 		cmn_err(CE_WARN, "%s%d: failed to release interrupts "
1096 		    "(nintrs=%d, navail=%d).\n",
1097 		    ddi_driver_name(req_p->ireq_dip),
1098 		    ddi_get_instance(req_p->ireq_dip), nintrs,
1099 		    req_p->ireq_navail);
1100 		pool_p->ipool_resno += (nintrs - req_p->ireq_navail);
1101 		req_p->ireq_navail = nintrs;
1102 		return (DDI_FAILURE);
1103 	}
1104 
1105 	/* Update request */
1106 	req_p->ireq_scratch = req_p->ireq_navail;
1107 
1108 	return (DDI_SUCCESS);
1109 }
1110 
/*
 * i_ddi_irm_debug_balance()
 *
 *	Debug/test only (DEBUG kernels): queue the pool serving a device
 *	for rebalancing, optionally waiting for it via 'wait_flag'.
 *	Does nothing if the device has no current interrupt type or no
 *	pool serves it.
 */
#ifdef	DEBUG
void
i_ddi_irm_debug_balance(dev_info_t *dip, boolean_t wait_flag)
{
	ddi_irm_pool_t	*pool_p;
	int		type;

	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_debug_balance: dip %p wait %d\n",
	    (void *)dip, (int)wait_flag));

	/* The device must currently be using some interrupt type */
	type = i_ddi_intr_get_current_type(dip);
	if (type == 0)
		return;

	/* And a pool must exist for that type */
	pool_p = i_ddi_intr_get_pool(dip, type);
	if (pool_p == NULL)
		return;

	mutex_enter(&pool_p->ipool_lock);
	i_ddi_irm_enqueue(pool_p, wait_flag);
	mutex_exit(&pool_p->ipool_lock);
}
#endif
1135