xref: /titanic_51/usr/src/uts/common/os/ddi_intr_irm.c (revision 427f591e1fcd1cf040406a539ad2725469a680fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/note.h>
26 #include <sys/sysmacros.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kmem.h>
31 #include <sys/cmn_err.h>
32 #include <sys/debug.h>
33 #include <sys/ddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/ndi_impldefs.h>	/* include prototypes */
36 
37 #if defined(__i386) || defined(__amd64)
38 /*
39  * MSI-X allocation limit.
40  */
41 extern uint_t		ddi_msix_alloc_limit;
42 #endif
43 
44 /*
45  * Interrupt Resource Management (IRM).
46  */
47 
/* Default value of the 'irm_balance_delay' tunable, in seconds */
#define	DDI_IRM_BALANCE_DELAY	(60)

/*
 * True when callback registration 'c' is non-NULL and has the
 * DDI_CB_FLAG_INTR flag set.  The argument is fully parenthesized so
 * that arbitrary pointer expressions may be passed.
 */
#define	DDI_IRM_HAS_CB(c)	((c) && ((c)->cb_flags & DDI_CB_FLAG_INTR))

/*
 * True when request 'r' may be resized by the balancer: either it is an
 * MSI-X request flagged DDI_IRM_FLAG_CALLBACK, or it is still flagged
 * DDI_IRM_FLAG_NEW.  The argument is fully parenthesized so that
 * arbitrary pointer expressions may be passed.
 */
#define	DDI_IRM_IS_REDUCIBLE(r)	\
	((((r)->ireq_flags & DDI_IRM_FLAG_CALLBACK) && \
	((r)->ireq_type == DDI_INTR_TYPE_MSIX)) || \
	((r)->ireq_flags & DDI_IRM_FLAG_NEW))
55 
/* Priority handed to thread_create() for the per-pool balancing threads */
extern pri_t	minclsyspri;

/* Global policies */
/* Master switch: IRM is disabled entirely when this is zero */
int		irm_enable = 1;
/* Set by i_ddi_irm_poststartup(); pools created afterwards get a thread */
boolean_t	irm_active = B_FALSE;
/* Policy assigned to each new pool; irm_init() resets it if invalid */
int		irm_default_policy = DDI_IRM_POLICY_LARGE;
/* Balancing thread wait interval, in seconds */
uint_t		irm_balance_delay = DDI_IRM_BALANCE_DELAY;

/* Global list of interrupt pools */
/* irm_pools_lock protects irm_pools_list */
kmutex_t	irm_pools_lock;
list_t		irm_pools_list;

/* Global debug tunables */
#ifdef	DEBUG
/* When a valid policy, temporarily overrides a pool's policy in a balance */
int		irm_debug_policy = 0;
/* When above a pool's minimum, temporarily overrides its total size */
uint_t		irm_debug_size = 0;
#endif	/* DEBUG */

/* Forward declarations of the file-local balancing helpers */
static void	irm_balance_thread(ddi_irm_pool_t *);
static void	i_ddi_irm_balance(ddi_irm_pool_t *);
static void	i_ddi_irm_enqueue(ddi_irm_pool_t *, boolean_t);
static void	i_ddi_irm_reduce(ddi_irm_pool_t *pool);
static int	i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *, int, int);
static void	i_ddi_irm_reduce_new(ddi_irm_pool_t *, int);
static void	i_ddi_irm_insertion_sort(list_t *, ddi_irm_req_t *);
static int	i_ddi_irm_notify(ddi_irm_pool_t *, ddi_irm_req_t *);
static int	i_ddi_irm_modify_increase(ddi_irm_req_t *, int);
82 static int	i_ddi_irm_modify_increase(ddi_irm_req_t *, int);
83 
84 /*
85  * OS Initialization Routines
86  */
87 
88 /*
89  * irm_init()
90  *
91  *	Initialize IRM subsystem before any drivers are attached.
92  */
93 void
94 irm_init(void)
95 {
96 	/* Do nothing if IRM is disabled */
97 	if (!irm_enable)
98 		return;
99 
100 	/* Verify that the default balancing policy is valid */
101 	if (!DDI_IRM_POLICY_VALID(irm_default_policy))
102 		irm_default_policy = DDI_IRM_POLICY_LARGE;
103 
104 	/* Initialize the global list of interrupt pools */
105 	mutex_init(&irm_pools_lock, NULL, MUTEX_DRIVER, NULL);
106 	list_create(&irm_pools_list, sizeof (ddi_irm_pool_t),
107 	    offsetof(ddi_irm_pool_t, ipool_link));
108 }
109 
110 /*
111  * i_ddi_irm_poststartup()
112  *
113  *	IRM is not activated until after the IO subsystem is initialized.
114  *	When activated, per-pool balancing threads are spawned and a flag
115  *	is set so that all future pools will be activated when created.
116  *
117  *	NOTE: the global variable 'irm_enable' disables IRM if zero.
118  */
119 void
120 i_ddi_irm_poststartup(void)
121 {
122 	ddi_irm_pool_t	*pool_p;
123 
124 	/* Do nothing if IRM is disabled */
125 	if (!irm_enable)
126 		return;
127 
128 	/* Lock the global list */
129 	mutex_enter(&irm_pools_lock);
130 
131 	/* Activate all defined pools */
132 	for (pool_p = list_head(&irm_pools_list); pool_p;
133 	    pool_p = list_next(&irm_pools_list, pool_p))
134 		pool_p->ipool_thread = thread_create(NULL, 0,
135 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
136 
137 	/* Set future pools to be active */
138 	irm_active = B_TRUE;
139 
140 	/* Unlock the global list */
141 	mutex_exit(&irm_pools_lock);
142 }
143 
144 /*
145  * NDI interfaces for creating/destroying IRM pools.
146  */
147 
148 /*
149  * ndi_irm_create()
150  *
151  *	Nexus interface to create an IRM pool.  Create the new
152  *	pool and add it to the global list of interrupt pools.
153  */
154 int
155 ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
156     ddi_irm_pool_t **pool_retp)
157 {
158 	ddi_irm_pool_t	*pool_p;
159 
160 	ASSERT(dip != NULL);
161 	ASSERT(paramsp != NULL);
162 	ASSERT(pool_retp != NULL);
163 	ASSERT(paramsp->iparams_total >= 1);
164 	ASSERT(paramsp->iparams_types != 0);
165 
166 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_create: dip %p\n", (void *)dip));
167 
168 	/* Check if IRM is enabled */
169 	if (!irm_enable)
170 		return (NDI_FAILURE);
171 
172 	/* Validate parameters */
173 	if ((dip == NULL) || (paramsp == NULL) || (pool_retp == NULL) ||
174 	    (paramsp->iparams_total < 1) || (paramsp->iparams_types == 0))
175 		return (NDI_FAILURE);
176 
177 	/* Allocate and initialize the pool */
178 	pool_p = kmem_zalloc(sizeof (ddi_irm_pool_t), KM_SLEEP);
179 	pool_p->ipool_owner = dip;
180 	pool_p->ipool_policy = irm_default_policy;
181 	pool_p->ipool_types = paramsp->iparams_types;
182 	pool_p->ipool_totsz = paramsp->iparams_total;
183 	pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, MAX(DDI_MIN_MSIX_ALLOC,
184 	    paramsp->iparams_total / DDI_MSIX_ALLOC_DIVIDER));
185 	list_create(&pool_p->ipool_req_list, sizeof (ddi_irm_req_t),
186 	    offsetof(ddi_irm_req_t, ireq_link));
187 	list_create(&pool_p->ipool_scratch_list, sizeof (ddi_irm_req_t),
188 	    offsetof(ddi_irm_req_t, ireq_scratch_link));
189 	cv_init(&pool_p->ipool_cv, NULL, CV_DRIVER, NULL);
190 	mutex_init(&pool_p->ipool_lock, NULL, MUTEX_DRIVER, NULL);
191 	mutex_init(&pool_p->ipool_navail_lock, NULL, MUTEX_DRIVER, NULL);
192 
193 	/* Add to global list of pools */
194 	mutex_enter(&irm_pools_lock);
195 	list_insert_tail(&irm_pools_list, pool_p);
196 	mutex_exit(&irm_pools_lock);
197 
198 	/* If IRM is active, then activate the pool */
199 	if (irm_active)
200 		pool_p->ipool_thread = thread_create(NULL, 0,
201 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
202 
203 	*pool_retp = pool_p;
204 	return (NDI_SUCCESS);
205 }
206 
207 /*
208  * ndi_irm_resize_pool()
209  *
210  *	Nexus interface to resize IRM pool. If the pool size drops
211  *	below  the allocated number of vectors then initiate rebalance
212  *	operation before resizing the pool. If rebalance operation fails
213  *	then return NDI_FAILURE.
214  */
215 int
216 ndi_irm_resize_pool(ddi_irm_pool_t *pool_p, uint_t new_size)
217 {
218 	uint_t prev_size;
219 
220 	ASSERT(pool_p != NULL);
221 
222 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
223 	    " current-size 0x%x new-size 0x%x\n",
224 	    (void *)pool_p, pool_p->ipool_totsz, new_size));
225 
226 	if (pool_p == NULL)
227 		return (NDI_EINVAL);
228 
229 	/* Check if IRM is enabled */
230 	if (!irm_enable)
231 		return (NDI_FAILURE);
232 
233 	mutex_enter(&pool_p->ipool_lock);
234 
235 	/*
236 	 * If we are increasing the pool size or if the reserved
237 	 * number of vectors is <= the new pool size then simply
238 	 * update the pool size and enqueue a reblance operation
239 	 * if necessary to use the new vectors.
240 	 */
241 	if ((pool_p->ipool_totsz < new_size) ||
242 	    (pool_p->ipool_resno <= new_size)) {
243 		/* set new pool size */
244 		pool_p->ipool_totsz = new_size;
245 		/* adjust the default allocation limit */
246 		pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
247 		    MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
248 		/* queue a rebalance operation to use the new vectors */
249 		if (pool_p->ipool_reqno > pool_p->ipool_resno)
250 			i_ddi_irm_enqueue(pool_p, B_FALSE);
251 		mutex_exit(&pool_p->ipool_lock);
252 		return (NDI_SUCCESS);
253 	}
254 
255 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
256 	    " needs a rebalance operation\n", (void *)pool_p));
257 
258 	/*
259 	 * requires a rebalance operation
260 	 */
261 	/* save the current pool size */
262 	prev_size = pool_p->ipool_totsz;
263 	/* set the pool size to the desired new value */
264 	pool_p->ipool_totsz = new_size;
265 	/* perform the rebalance operation */
266 	i_ddi_irm_enqueue(pool_p, B_TRUE);
267 
268 	/*
269 	 * If rebalance operation couldn't free up enough
270 	 * vectors then fail the resize operation.
271 	 */
272 	if (pool_p->ipool_resno > new_size) { /* rebalance failed */
273 		/* restore the pool size to the previous value */
274 		pool_p->ipool_totsz = prev_size;
275 		/* enqueue a rebalance operation for the original pool size */
276 		i_ddi_irm_enqueue(pool_p, B_FALSE);
277 		mutex_exit(&pool_p->ipool_lock);
278 		return (NDI_FAILURE);
279 	} else { /* rebalance worked */
280 		/* adjust the default allocation limit */
281 		pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
282 		    MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
283 		mutex_exit(&pool_p->ipool_lock);
284 		DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
285 		    " resized from %x to %x\n",
286 		    (void *)pool_p, prev_size, pool_p->ipool_totsz));
287 		return (NDI_SUCCESS);
288 	}
289 }
290 
291 /*
292  * ndi_irm_destroy()
293  *
294  *	Nexus interface to destroy an IRM pool.  Destroy the pool
295  *	and remove it from the global list of interrupt pools.
296  */
297 int
298 ndi_irm_destroy(ddi_irm_pool_t *pool_p)
299 {
300 	ASSERT(pool_p != NULL);
301 	ASSERT(pool_p->ipool_resno == 0);
302 
303 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_destroy: pool_p %p\n",
304 	    (void *)pool_p));
305 
306 	/* Validate parameters */
307 	if (pool_p == NULL)
308 		return (NDI_FAILURE);
309 
310 	/* Validate that pool is empty */
311 	if (pool_p->ipool_resno != 0)
312 		return (NDI_BUSY);
313 
314 	/* Remove the pool from the global list */
315 	mutex_enter(&irm_pools_lock);
316 	list_remove(&irm_pools_list, pool_p);
317 	mutex_exit(&irm_pools_lock);
318 
319 	/* Terminate the balancing thread */
320 	mutex_enter(&pool_p->ipool_lock);
321 	if (pool_p->ipool_thread &&
322 	    (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) {
323 		pool_p->ipool_flags |= DDI_IRM_FLAG_EXIT;
324 		cv_signal(&pool_p->ipool_cv);
325 		mutex_exit(&pool_p->ipool_lock);
326 		thread_join(pool_p->ipool_thread->t_did);
327 	} else
328 		mutex_exit(&pool_p->ipool_lock);
329 
330 	/* Destroy the pool */
331 	cv_destroy(&pool_p->ipool_cv);
332 	mutex_destroy(&pool_p->ipool_lock);
333 	mutex_destroy(&pool_p->ipool_navail_lock);
334 	list_destroy(&pool_p->ipool_req_list);
335 	list_destroy(&pool_p->ipool_scratch_list);
336 	kmem_free(pool_p, sizeof (ddi_irm_pool_t));
337 
338 	return (NDI_SUCCESS);
339 }
340 
341 /*
342  * Insert/Modify/Remove Interrupt Requests
343  */
344 
345 /*
346  * i_ddi_irm_insert()
347  *
348  *	Insert a new request into an interrupt pool, and balance the pool.
349  */
350 int
351 i_ddi_irm_insert(dev_info_t *dip, int type, int count)
352 {
353 	ddi_irm_req_t	*req_p;
354 	devinfo_intr_t	*intr_p;
355 	ddi_irm_pool_t	*pool_p;
356 	uint_t		nreq, nmin, npartial;
357 	boolean_t	irm_flag = B_FALSE;
358 
359 	ASSERT(dip != NULL);
360 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
361 	ASSERT(count > 0);
362 
363 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: dip %p type %d count %d\n",
364 	    (void *)dip, type, count));
365 
366 	/* Validate parameters */
367 	if ((dip == NULL) || (count < 1) || !DDI_INTR_TYPE_FLAG_VALID(type)) {
368 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: invalid args\n"));
369 		return (DDI_EINVAL);
370 	}
371 
372 	/* Check for an existing request */
373 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
374 	    (intr_p->devi_irm_req_p != NULL))
375 		return (DDI_SUCCESS);
376 
377 	/* Check for IRM support from the system */
378 	if ((pool_p = i_ddi_intr_get_pool(dip, type)) == NULL) {
379 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: not supported\n"));
380 		return (DDI_ENOTSUP);
381 	}
382 
383 	/* Check for IRM support from the driver */
384 	if (i_ddi_irm_supported(dip, type) == DDI_SUCCESS)
385 		irm_flag = B_TRUE;
386 
387 	/* Determine request size */
388 	nreq = (irm_flag) ? count :
389 	    MIN(count, i_ddi_intr_get_limit(dip, type, pool_p));
390 	nmin = (irm_flag) ? 1 : nreq;
391 	npartial = MIN(nreq, pool_p->ipool_defsz);
392 
393 	/* Allocate and initialize the request */
394 	req_p = kmem_zalloc(sizeof (ddi_irm_req_t), KM_SLEEP);
395 	req_p->ireq_type = type;
396 	req_p->ireq_dip = dip;
397 	req_p->ireq_pool_p = pool_p;
398 	req_p->ireq_nreq = nreq;
399 	req_p->ireq_flags = DDI_IRM_FLAG_NEW;
400 	if (irm_flag)
401 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
402 
403 	/* Lock the pool */
404 	mutex_enter(&pool_p->ipool_lock);
405 
406 	/* Check for minimal fit before inserting */
407 	if ((pool_p->ipool_minno + nmin) > pool_p->ipool_totsz) {
408 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
409 		    ddi_driver_name(dip), ddi_get_instance(dip));
410 		mutex_exit(&pool_p->ipool_lock);
411 		kmem_free(req_p, sizeof (ddi_irm_req_t));
412 		return (DDI_EAGAIN);
413 	}
414 
415 	/* Insert the request into the pool */
416 	pool_p->ipool_reqno += nreq;
417 	pool_p->ipool_minno += nmin;
418 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
419 
420 	/*
421 	 * Try to fulfill the request.
422 	 *
423 	 * If all the interrupts are available, and either the request
424 	 * is static or the pool is active, then just take them directly.
425 	 *
426 	 * If only some of the interrupts are available, and the request
427 	 * can receive future callbacks, then take some now but queue the
428 	 * pool to be rebalanced later.
429 	 *
430 	 * Otherwise, immediately rebalance the pool and wait.
431 	 */
432 	if ((!irm_flag || (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) &&
433 	    ((pool_p->ipool_resno + nreq) <= pool_p->ipool_totsz)) {
434 
435 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
436 		    "request completely fulfilled.\n"));
437 		pool_p->ipool_resno += nreq;
438 		req_p->ireq_navail = nreq;
439 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
440 
441 	} else if (irm_flag &&
442 	    ((pool_p->ipool_resno + npartial) <= pool_p->ipool_totsz)) {
443 
444 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
445 		    "request partially fulfilled.\n"));
446 		pool_p->ipool_resno += npartial;
447 		req_p->ireq_navail = npartial;
448 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
449 		i_ddi_irm_enqueue(pool_p, B_FALSE);
450 
451 	} else {
452 
453 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
454 		    "request needs immediate rebalance.\n"));
455 		i_ddi_irm_enqueue(pool_p, B_TRUE);
456 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
457 	}
458 
459 	/* Fail if the request cannot be fulfilled at all */
460 	if (req_p->ireq_navail == 0) {
461 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
462 		    ddi_driver_name(dip), ddi_get_instance(dip));
463 		pool_p->ipool_reqno -= nreq;
464 		pool_p->ipool_minno -= nmin;
465 		list_remove(&pool_p->ipool_req_list, req_p);
466 		mutex_exit(&pool_p->ipool_lock);
467 		kmem_free(req_p, sizeof (ddi_irm_req_t));
468 		return (DDI_EAGAIN);
469 	}
470 
471 	/* Unlock the pool */
472 	mutex_exit(&pool_p->ipool_lock);
473 
474 	intr_p->devi_irm_req_p = req_p;
475 	return (DDI_SUCCESS);
476 }
477 
478 /*
479  * i_ddi_irm_modify()
480  *
481  *	Modify an existing request in an interrupt pool, and balance the pool.
482  */
483 int
484 i_ddi_irm_modify(dev_info_t *dip, int nreq)
485 {
486 	devinfo_intr_t	*intr_p;
487 	ddi_irm_req_t	*req_p;
488 	ddi_irm_pool_t	*pool_p;
489 	int		type;
490 	int		retval = DDI_SUCCESS;
491 
492 	ASSERT(dip != NULL);
493 	ASSERT(nreq > 0);
494 
495 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: dip %p nreq %d\n",
496 	    (void *)dip, nreq));
497 
498 	/* Validate parameters */
499 	if ((dip == NULL) || (nreq < 1)) {
500 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
501 		return (DDI_EINVAL);
502 	}
503 
504 	/* Do nothing if not mapped to an IRM pool */
505 	if (((intr_p = DEVI(dip)->devi_intr_p) == NULL) ||
506 	    ((req_p = intr_p->devi_irm_req_p) == NULL))
507 		return (DDI_SUCCESS);
508 
509 	/* Do nothing if new size is the same */
510 	if (nreq == req_p->ireq_nreq)
511 		return (DDI_SUCCESS);
512 
513 	/* Do not allow MSI requests to be resized */
514 	if ((type = req_p->ireq_type) == DDI_INTR_TYPE_MSI) {
515 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid type\n"));
516 		return (DDI_ENOTSUP);
517 	}
518 
519 	/* Select the pool */
520 	if ((pool_p = req_p->ireq_pool_p) == NULL) {
521 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: missing pool\n"));
522 		return (DDI_FAILURE);
523 	}
524 
525 	/* Validate request size is not too large */
526 	if (nreq > i_ddi_intr_get_limit(dip, type, pool_p)) {
527 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
528 		return (DDI_EINVAL);
529 	}
530 
531 	/* Lock the pool */
532 	mutex_enter(&pool_p->ipool_lock);
533 
534 	/*
535 	 * Process the modification.
536 	 *
537 	 *	- To increase a non-IRM request, call the implementation in
538 	 *	  i_ddi_irm_modify_increase().
539 	 *
540 	 *	- To decrease a non-IRM request, directly update the pool and
541 	 *	  request, then queue the pool for later rebalancing.
542 	 *
543 	 *	- To modify an IRM request, always queue the pool for later
544 	 *	  rebalancing.  IRM consumers rely upon callbacks for changes.
545 	 */
546 	if ((nreq > req_p->ireq_nreq) &&
547 	    (i_ddi_irm_supported(dip, type) != DDI_SUCCESS)) {
548 
549 		retval = i_ddi_irm_modify_increase(req_p, nreq);
550 
551 	} else {
552 
553 		/* Update pool and request */
554 		pool_p->ipool_reqno -= req_p->ireq_nreq;
555 		pool_p->ipool_reqno += nreq;
556 		if (i_ddi_irm_supported(dip, type) != DDI_SUCCESS) {
557 			pool_p->ipool_minno -= req_p->ireq_navail;
558 			pool_p->ipool_resno -= req_p->ireq_navail;
559 			pool_p->ipool_minno += nreq;
560 			pool_p->ipool_resno += nreq;
561 			req_p->ireq_navail = nreq;
562 		}
563 		req_p->ireq_nreq = nreq;
564 
565 		/* Re-sort request into the pool */
566 		list_remove(&pool_p->ipool_req_list, req_p);
567 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
568 
569 		/* Queue pool for asynchronous rebalance */
570 		i_ddi_irm_enqueue(pool_p, B_FALSE);
571 	}
572 
573 	/* Unlock the pool */
574 	mutex_exit(&pool_p->ipool_lock);
575 
576 	return (retval);
577 }
578 
579 /*
580  * i_ddi_irm_modify_increase()
581  *
582  *	Increase a non-IRM request.  The additional interrupts are
583  *	directly taken from the pool when possible.  Otherwise, an
584  *	immediate, synchronous rebalance is performed.  A temporary
585  *	proxy request is used for any rebalance operation to ensure
586  *	the request is not reduced below its current allocation.
587  *
588  *	NOTE: pool must already be locked.
589  */
590 static int
591 i_ddi_irm_modify_increase(ddi_irm_req_t *req_p, int nreq)
592 {
593 	dev_info_t	*dip = req_p->ireq_dip;
594 	ddi_irm_pool_t	*pool_p = req_p->ireq_pool_p;
595 	ddi_irm_req_t	new_req;
596 	int		count, delta;
597 
598 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
599 
600 	/* Compute number of additional vectors */
601 	count = nreq - req_p->ireq_nreq;
602 
603 	/* Check for minimal fit */
604 	if ((pool_p->ipool_minno + count) > pool_p->ipool_totsz) {
605 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
606 		    ddi_driver_name(dip), ddi_get_instance(dip));
607 		return (DDI_EAGAIN);
608 	}
609 
610 	/* Update the pool */
611 	pool_p->ipool_reqno += count;
612 	pool_p->ipool_minno += count;
613 
614 	/* Attempt direct implementation */
615 	if ((pool_p->ipool_resno + count) <= pool_p->ipool_totsz) {
616 		req_p->ireq_nreq += count;
617 		req_p->ireq_navail += count;
618 		pool_p->ipool_resno += count;
619 		return (DDI_SUCCESS);
620 	}
621 
622 	/* Rebalance required: fail if pool is not active */
623 	if ((pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE) == 0) {
624 		pool_p->ipool_reqno -= count;
625 		pool_p->ipool_minno -= count;
626 		return (DDI_EAGAIN);
627 	}
628 
629 	/* Insert temporary proxy request */
630 	bzero(&new_req, sizeof (ddi_irm_req_t));
631 	new_req.ireq_dip = dip;
632 	new_req.ireq_nreq = count;
633 	new_req.ireq_pool_p = pool_p;
634 	new_req.ireq_type = req_p->ireq_type;
635 	new_req.ireq_flags = DDI_IRM_FLAG_NEW;
636 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, &new_req);
637 
638 	/* Synchronously rebalance */
639 	i_ddi_irm_enqueue(pool_p, B_TRUE);
640 
641 	/* Remove proxy request, and merge into original request */
642 	req_p->ireq_nreq += count;
643 	if ((delta = (count - new_req.ireq_navail)) > 0) {
644 		req_p->ireq_nreq -= delta;
645 		pool_p->ipool_reqno -= delta;
646 		pool_p->ipool_minno -= delta;
647 	}
648 	req_p->ireq_navail += new_req.ireq_navail;
649 	list_remove(&pool_p->ipool_req_list, req_p);
650 	list_remove(&pool_p->ipool_req_list, &new_req);
651 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
652 
653 	return (DDI_SUCCESS);
654 }
655 
656 /*
657  * i_ddi_irm_remove()
658  *
659  *	Remove a request from an interrupt pool, and balance the pool.
660  */
661 int
662 i_ddi_irm_remove(dev_info_t *dip)
663 {
664 	devinfo_intr_t	*intr_p;
665 	ddi_irm_pool_t	*pool_p;
666 	ddi_irm_req_t	*req_p;
667 	uint_t		nmin;
668 
669 	ASSERT(dip != NULL);
670 
671 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: dip %p\n", (void *)dip));
672 
673 	/* Validate parameters */
674 	if (dip == NULL) {
675 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: invalid args\n"));
676 		return (DDI_EINVAL);
677 	}
678 
679 	/* Check if the device has a request */
680 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
681 	    !(req_p = intr_p->devi_irm_req_p)) {
682 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not found\n"));
683 		return (DDI_EINVAL);
684 	}
685 
686 	/* Lock the pool */
687 	pool_p = req_p->ireq_pool_p;
688 	mutex_enter(&pool_p->ipool_lock);
689 
690 	/* Remove request */
691 	nmin = DDI_IRM_IS_REDUCIBLE(req_p) ? 1 : req_p->ireq_nreq;
692 	pool_p->ipool_minno -= nmin;
693 	pool_p->ipool_reqno -= req_p->ireq_nreq;
694 	pool_p->ipool_resno -= req_p->ireq_navail;
695 	list_remove(&pool_p->ipool_req_list, req_p);
696 
697 	/* Queue pool to be rebalanced */
698 	i_ddi_irm_enqueue(pool_p, B_FALSE);
699 
700 	/* Unlock the pool */
701 	mutex_exit(&pool_p->ipool_lock);
702 
703 	/* Destroy the request */
704 	intr_p->devi_irm_req_p = NULL;
705 	kmem_free(req_p, sizeof (ddi_irm_req_t));
706 
707 	return (DDI_SUCCESS);
708 }
709 
710 /*
711  * i_ddi_irm_set_cb()
712  *
713  *	Change the callback flag for a request, in response to
714  *	a change in its callback registration.  Then rebalance
715  *	the interrupt pool.
716  *
717  *	NOTE: the request is not locked because the navail value
718  *	      is not directly affected.  The balancing thread may
719  *	      modify the navail value in the background after it
720  *	      locks the request itself.
721  */
722 void
723 i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
724 {
725 	devinfo_intr_t	*intr_p;
726 	ddi_irm_pool_t	*pool_p;
727 	ddi_irm_req_t	*req_p;
728 	uint_t		nreq;
729 
730 	ASSERT(dip != NULL);
731 
732 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: dip %p has_cb_flag %d\n",
733 	    (void *)dip, (int)has_cb_flag));
734 
735 	/* Validate parameters */
736 	if (dip == NULL)
737 		return;
738 
739 	/* Check for association with interrupt pool */
740 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
741 	    !(req_p = intr_p->devi_irm_req_p)) {
742 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: not in pool\n"));
743 		return;
744 	}
745 
746 	/* Lock the pool */
747 	pool_p = req_p->ireq_pool_p;
748 	mutex_enter(&pool_p->ipool_lock);
749 
750 	/*
751 	 * Update the request and the pool
752 	 */
753 	if (has_cb_flag) {
754 
755 		/* Update pool statistics */
756 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
757 			pool_p->ipool_minno -= (req_p->ireq_nreq - 1);
758 
759 		/* Update request */
760 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
761 
762 		/* Rebalance in background */
763 		i_ddi_irm_enqueue(pool_p, B_FALSE);
764 
765 	} else {
766 
767 		/* Determine new request size */
768 		nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
769 
770 #if defined(__i386) || defined(__amd64)
771 		/* Use the default static limit for non-IRM drivers */
772 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
773 			nreq = MIN(nreq, ddi_msix_alloc_limit);
774 #endif
775 
776 		/* Update pool statistics */
777 		pool_p->ipool_reqno -= req_p->ireq_nreq;
778 		pool_p->ipool_reqno += nreq;
779 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) {
780 			pool_p->ipool_minno -= 1;
781 			pool_p->ipool_minno += nreq;
782 		} else {
783 			pool_p->ipool_minno -= req_p->ireq_nreq;
784 			pool_p->ipool_minno += nreq;
785 		}
786 
787 		/* Update request size, and re-sort in pool */
788 		req_p->ireq_nreq = nreq;
789 		list_remove(&pool_p->ipool_req_list, req_p);
790 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
791 
792 		/* Rebalance synchronously, before losing callback */
793 		i_ddi_irm_enqueue(pool_p, B_TRUE);
794 
795 		/* Remove callback flag */
796 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_CALLBACK);
797 	}
798 
799 	/* Unlock the pool */
800 	mutex_exit(&pool_p->ipool_lock);
801 }
802 
803 /*
804  * i_ddi_irm_supported()
805  *
806  *	Query if IRM is supported by a driver using a specific interrupt type.
807  *	Notice that IRM is limited to MSI-X users with registered callbacks.
808  */
809 int
810 i_ddi_irm_supported(dev_info_t *dip, int type)
811 {
812 	ddi_cb_t	*cb_p = DEVI(dip)->devi_cb_p;
813 
814 	return ((DDI_IRM_HAS_CB(cb_p) && (type == DDI_INTR_TYPE_MSIX)) ?
815 	    DDI_SUCCESS : DDI_ENOTSUP);
816 }
817 
818 /*
819  * Interrupt Pool Balancing
820  */
821 
822 /*
823  * irm_balance_thread()
824  *
825  *	One instance of this thread operates per each defined IRM pool.
826  *	It does the initial activation of the pool, as well as balancing
827  *	any requests that were queued up before the pool was active.
828  *	Once active, it waits forever to service balance operations.
829  */
830 static void
831 irm_balance_thread(ddi_irm_pool_t *pool_p)
832 {
833 	clock_t		interval;
834 
835 	DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: pool_p %p\n",
836 	    (void *)pool_p));
837 
838 	/* Lock the pool */
839 	mutex_enter(&pool_p->ipool_lock);
840 
841 	/* Perform initial balance if required */
842 	if (pool_p->ipool_reqno > pool_p->ipool_resno)
843 		i_ddi_irm_balance(pool_p);
844 
845 	/* Activate the pool */
846 	pool_p->ipool_flags |= DDI_IRM_FLAG_ACTIVE;
847 
848 	/*
849 	 * Main loop.
850 	 * Iterate once first before wait on signal, in case there is signal
851 	 * sent before this thread being created
852 	 */
853 	for (;;) {
854 
855 		/* Compute the delay interval */
856 		interval = drv_usectohz(irm_balance_delay * 1000000);
857 
858 		/* Wait one interval, or until there are waiters */
859 		if ((interval > 0) &&
860 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) &&
861 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_EXIT)) {
862 			(void) cv_reltimedwait(&pool_p->ipool_cv,
863 			    &pool_p->ipool_lock, interval, TR_CLOCK_TICK);
864 		}
865 
866 		/* Check if awakened to exit */
867 		if (pool_p->ipool_flags & DDI_IRM_FLAG_EXIT) {
868 			DDI_INTR_IRMDBG((CE_CONT,
869 			    "irm_balance_thread: exiting...\n"));
870 			mutex_exit(&pool_p->ipool_lock);
871 			thread_exit();
872 		}
873 
874 		/* Balance the pool */
875 		i_ddi_irm_balance(pool_p);
876 
877 		/* Notify waiters */
878 		if (pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) {
879 			cv_broadcast(&pool_p->ipool_cv);
880 			pool_p->ipool_flags &= ~(DDI_IRM_FLAG_WAITERS);
881 		}
882 
883 		/* Clear QUEUED condition */
884 		pool_p->ipool_flags &= ~(DDI_IRM_FLAG_QUEUED);
885 
886 		/* Sleep until queued */
887 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
888 
889 		DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: signaled.\n"));
890 	}
891 }
892 
893 /*
894  * i_ddi_irm_balance()
895  *
896  *	Balance a pool.  The general algorithm is to first reset all
897  *	requests to their maximum size, use reduction algorithms to
898  *	solve any imbalance, and then notify affected drivers.
899  */
900 static void
901 i_ddi_irm_balance(ddi_irm_pool_t *pool_p)
902 {
903 	ddi_irm_req_t	*req_p;
904 
905 #ifdef	DEBUG
906 	uint_t		debug_totsz = 0;
907 	int		debug_policy = 0;
908 #endif	/* DEBUG */
909 
910 	ASSERT(pool_p != NULL);
911 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
912 
913 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: pool_p %p\n",
914 	    (void *)pool_p));
915 
916 #ifndef DEBUG
917 	if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
918 #else
919 	if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
920 #endif  /* DEBUG */
921 		DDI_INTR_IRMDBG((CE_CONT,
922 		    "i_ddi_irm_balance: pool already balanced\n"));
923 		return;
924 	}
925 
926 #ifdef	DEBUG	/* Adjust size and policy settings */
927 	if (irm_debug_size > pool_p->ipool_minno) {
928 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: debug size %d\n",
929 		    irm_debug_size));
930 		debug_totsz = pool_p->ipool_totsz;
931 		pool_p->ipool_totsz = irm_debug_size;
932 	}
933 	if (DDI_IRM_POLICY_VALID(irm_debug_policy)) {
934 		DDI_INTR_IRMDBG((CE_CONT,
935 		    "i_ddi_irm_balance: debug policy %d\n", irm_debug_policy));
936 		debug_policy = pool_p->ipool_policy;
937 		pool_p->ipool_policy = irm_debug_policy;
938 	}
939 #endif	/* DEBUG */
940 
941 	/* Lock the availability lock */
942 	mutex_enter(&pool_p->ipool_navail_lock);
943 
944 	/*
945 	 * Put all of the reducible requests into a scratch list.
946 	 * Reset each one of them to their maximum availability.
947 	 */
948 	for (req_p = list_head(&pool_p->ipool_req_list); req_p;
949 	    req_p = list_next(&pool_p->ipool_req_list, req_p)) {
950 		if (DDI_IRM_IS_REDUCIBLE(req_p)) {
951 			pool_p->ipool_resno -= req_p->ireq_navail;
952 			req_p->ireq_scratch = req_p->ireq_navail;
953 			req_p->ireq_navail = req_p->ireq_nreq;
954 			pool_p->ipool_resno += req_p->ireq_navail;
955 			list_insert_tail(&pool_p->ipool_scratch_list, req_p);
956 		}
957 	}
958 
959 	/* Balance the requests */
960 	i_ddi_irm_reduce(pool_p);
961 
962 	/* Unlock the availability lock */
963 	mutex_exit(&pool_p->ipool_navail_lock);
964 
965 	/*
966 	 * Process REMOVE notifications.
967 	 *
968 	 * If a driver fails to release interrupts: exclude it from
969 	 * further processing, correct the resulting imbalance, and
970 	 * start over again at the head of the scratch list.
971 	 */
972 	req_p = list_head(&pool_p->ipool_scratch_list);
973 	while (req_p) {
974 		if ((req_p->ireq_navail < req_p->ireq_scratch) &&
975 		    (i_ddi_irm_notify(pool_p, req_p) != DDI_SUCCESS)) {
976 			list_remove(&pool_p->ipool_scratch_list, req_p);
977 			mutex_enter(&pool_p->ipool_navail_lock);
978 			i_ddi_irm_reduce(pool_p);
979 			mutex_exit(&pool_p->ipool_navail_lock);
980 			req_p = list_head(&pool_p->ipool_scratch_list);
981 		} else {
982 			req_p = list_next(&pool_p->ipool_scratch_list, req_p);
983 		}
984 	}
985 
986 	/*
987 	 * Process ADD notifications.
988 	 *
989 	 * This is the last use of the scratch list, so empty it.
990 	 */
991 	while (req_p = list_remove_head(&pool_p->ipool_scratch_list)) {
992 		if (req_p->ireq_navail > req_p->ireq_scratch) {
993 			(void) i_ddi_irm_notify(pool_p, req_p);
994 		}
995 	}
996 
997 #ifdef	DEBUG	/* Restore size and policy settings */
998 	if (debug_totsz != 0)
999 		pool_p->ipool_totsz = debug_totsz;
1000 	if (debug_policy != 0)
1001 		pool_p->ipool_policy = debug_policy;
1002 #endif	/* DEBUG */
1003 }
1004 
1005 /*
1006  * i_ddi_irm_reduce()
1007  *
1008  *	Use reduction algorithms to correct an imbalance in a pool.
1009  */
1010 static void
1011 i_ddi_irm_reduce(ddi_irm_pool_t *pool_p)
1012 {
1013 	int	imbalance;
1014 
1015 	ASSERT(pool_p != NULL);
1016 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1017 	ASSERT(DDI_IRM_POLICY_VALID(pool_p->ipool_policy));
1018 
1019 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_reduce: pool_p %p\n",
1020 	    (void *)pool_p));
1021 
1022 	/* Compute the imbalance.  Do nothing if already balanced. */
1023 	if ((imbalance = pool_p->ipool_resno - pool_p->ipool_totsz) <= 0)
1024 		return;
1025 
1026 	/*
1027 	 * Try policy based reduction first. If it failed, then
1028 	 * possibly reduce new requests as a last resort.
1029 	 */
1030 	if (i_ddi_irm_reduce_by_policy(pool_p, imbalance, pool_p->ipool_policy)
1031 	    != DDI_SUCCESS) {
1032 
1033 		DDI_INTR_IRMDBG((CE_CONT,
1034 		    "i_ddi_irm_reduce: policy reductions failed.\n"));
1035 
1036 		/* Compute remaining imbalance */
1037 		imbalance = pool_p->ipool_resno - pool_p->ipool_totsz;
1038 
1039 		ASSERT(imbalance > 0);
1040 
1041 		i_ddi_irm_reduce_new(pool_p, imbalance);
1042 	}
1043 }
1044 
1045 /*
1046  * i_ddi_irm_enqueue()
1047  *
1048  *	Queue a pool to be balanced.  Signals the balancing thread to wake
1049  *	up and process the pool.  If 'wait_flag' is true, then the current
1050  *	thread becomes a waiter and blocks until the balance is completed.
1051  */
1052 static void
1053 i_ddi_irm_enqueue(ddi_irm_pool_t *pool_p, boolean_t wait_flag)
1054 {
1055 	ASSERT(pool_p != NULL);
1056 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1057 
1058 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool_p %p wait_flag %d\n",
1059 	    (void *)pool_p, (int)wait_flag));
1060 
1061 	/* Do nothing if pool is already balanced */
1062 #ifndef	DEBUG
1063 	if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
1064 #else
1065 	if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
1066 #endif	/* DEBUG */
1067 		DDI_INTR_IRMDBG((CE_CONT,
1068 		    "i_ddi_irm_enqueue: pool already balanced\n"));
1069 		return;
1070 	}
1071 
1072 	/* Avoid deadlocks when IRM is not active */
1073 	if (!irm_active && wait_flag) {
1074 		DDI_INTR_IRMDBG((CE_CONT,
1075 		    "i_ddi_irm_enqueue: pool not active.\n"));
1076 		return;
1077 	}
1078 
1079 	if (wait_flag)
1080 		pool_p->ipool_flags |= DDI_IRM_FLAG_WAITERS;
1081 
1082 	if (wait_flag || !(pool_p->ipool_flags & DDI_IRM_FLAG_QUEUED)) {
1083 		pool_p->ipool_flags |= DDI_IRM_FLAG_QUEUED;
1084 		cv_signal(&pool_p->ipool_cv);
1085 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool queued.\n"));
1086 	}
1087 
1088 	if (wait_flag) {
1089 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: waiting...\n"));
1090 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
1091 	}
1092 }
1093 
1094 /*
1095  * i_ddi_irm_reduce_by_policy()
1096  *
1097  *	Reduces requests based on reduction policies.
1098  *
1099  *	For the DDI_IRM_POLICY_LARGE reduction policy, the algorithm
1100  *	generally reduces larger requests first, before advancing
1101  *	to smaller requests.
1102  *	For the DDI_IRM_POLICY_EVEN reduction policy, the algorithm
1103  *	reduces requests evenly, without giving a specific preference
1104  *	to smaller or larger requests. Each iteration reduces all
1105  *	reducible requests by the same amount until the imbalance is
1106  *	corrected.
1107  *
1108  *	The scratch list is initially sorted in descending order by current
1109  *	navail values, which are maximized prior to reduction. This sorted
1110  *	order is preserved.  It avoids reducing requests below the threshold
1111  *	of the interrupt pool's default allocation size.
1112  *
1113  *	Optimizations in this algorithm include trying to reduce multiple
1114  *	requests together.  And the algorithm attempts to reduce in larger
1115  *	increments when possible to minimize the total number of iterations.
1116  */
1117 static int
1118 i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *pool_p, int imbalance, int policy)
1119 {
1120 	ASSERT(pool_p != NULL);
1121 	ASSERT(imbalance > 0);
1122 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1123 
1124 	while (imbalance > 0) {
1125 		list_t		*slist_p = &pool_p->ipool_scratch_list;
1126 		ddi_irm_req_t	*req_p = list_head(slist_p), *last_p;
1127 		uint_t		nreduce = 0, nremain = 0, stop_navail;
1128 		uint_t		pool_defsz = pool_p->ipool_defsz;
1129 		uint_t		reduction, max_redu;
1130 
1131 		/* Fail if none are reducible */
1132 		if (!req_p || req_p->ireq_navail <= pool_defsz) {
1133 			DDI_INTR_IRMDBG((CE_CONT,
1134 			    "i_ddi_irm_reduce_by_policy: Failure. "
1135 			    "All requests have downsized to low limit.\n"));
1136 			return (DDI_FAILURE);
1137 		}
1138 
1139 		/* Count reducible requests */
1140 		stop_navail = (policy == DDI_IRM_POLICY_LARGE) ?
1141 		    req_p->ireq_navail - 1 : pool_defsz;
1142 		for (; req_p; req_p = list_next(slist_p, req_p)) {
1143 			if (req_p->ireq_navail <= stop_navail)
1144 				break;
1145 			nreduce++;
1146 		}
1147 
1148 		/* Compute reduction */
1149 		last_p = req_p ? list_prev(slist_p, req_p) : list_tail(slist_p);
1150 		if ((policy == DDI_IRM_POLICY_LARGE) && req_p &&
1151 		    req_p->ireq_navail > pool_defsz)
1152 			reduction = last_p->ireq_navail - req_p->ireq_navail;
1153 		else
1154 			reduction = last_p->ireq_navail - pool_defsz;
1155 
1156 		if ((max_redu = reduction * nreduce) > imbalance) {
1157 			reduction = imbalance / nreduce;
1158 			nremain = imbalance % nreduce;
1159 			pool_p->ipool_resno -= imbalance;
1160 			imbalance = 0;
1161 		} else {
1162 			pool_p->ipool_resno -= max_redu;
1163 			imbalance -= max_redu;
1164 		}
1165 
1166 		/* Reduce */
1167 		for (req_p = list_head(slist_p); (reduction != 0) && nreduce--;
1168 		    req_p = list_next(slist_p, req_p)) {
1169 			req_p->ireq_navail -= reduction;
1170 		}
1171 
1172 		for (req_p = last_p; nremain--;
1173 		    req_p = list_prev(slist_p, req_p)) {
1174 			req_p->ireq_navail--;
1175 		}
1176 	}
1177 
1178 	return (DDI_SUCCESS);
1179 }
1180 
1181 /*
1182  * i_ddi_irm_reduce_new()
1183  *
1184  *	Reduces new requests.  This is only used as a last resort
1185  *	after another reduction algorithm failed.
1186  *
1187  *	NOTE: The pool locking in i_ddi_irm_insert() ensures
1188  *	there can be only one new request at a time in a pool.
1189  */
1190 static void
1191 i_ddi_irm_reduce_new(ddi_irm_pool_t *pool_p, int imbalance)
1192 {
1193 	ddi_irm_req_t	*req_p;
1194 
1195 	ASSERT(pool_p != NULL);
1196 	ASSERT(imbalance > 0);
1197 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1198 
1199 	DDI_INTR_IRMDBG((CE_CONT,
1200 	    "i_ddi_irm_reduce_new: pool_p %p imbalance %d\n",
1201 	    (void *)pool_p, imbalance));
1202 
1203 	for (req_p = list_head(&pool_p->ipool_scratch_list); req_p;
1204 	    req_p = list_next(&pool_p->ipool_scratch_list, req_p)) {
1205 		if (req_p->ireq_flags & DDI_IRM_FLAG_NEW) {
1206 			ASSERT(req_p->ireq_navail >= imbalance);
1207 			req_p->ireq_navail -= imbalance;
1208 			pool_p->ipool_resno -= imbalance;
1209 			return;
1210 		}
1211 	}
1212 
1213 	/* should never go here */
1214 	ASSERT(B_FALSE);
1215 }
1216 
1217 /*
1218  * Miscellaneous Helper Functions
1219  */
1220 
1221 /*
1222  * i_ddi_intr_get_pool()
1223  *
1224  *	Get an IRM pool that supplies interrupts of a specified type.
1225  *	Invokes a DDI_INTROP_GETPOOL to the bus nexus driver.  Fails
1226  *	if no pool exists.
1227  */
1228 ddi_irm_pool_t *
1229 i_ddi_intr_get_pool(dev_info_t *dip, int type)
1230 {
1231 	devinfo_intr_t		*intr_p;
1232 	ddi_irm_pool_t		*pool_p;
1233 	ddi_irm_req_t		*req_p;
1234 	ddi_intr_handle_impl_t	hdl;
1235 
1236 	ASSERT(dip != NULL);
1237 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
1238 
1239 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
1240 	    ((req_p = intr_p->devi_irm_req_p) != NULL) &&
1241 	    ((pool_p = req_p->ireq_pool_p) != NULL) &&
1242 	    (pool_p->ipool_types & type)) {
1243 		return (pool_p);
1244 	}
1245 
1246 	bzero(&hdl, sizeof (ddi_intr_handle_impl_t));
1247 	hdl.ih_dip = dip;
1248 	hdl.ih_type = type;
1249 
1250 	if (i_ddi_intr_ops(dip, dip, DDI_INTROP_GETPOOL,
1251 	    &hdl, (void *)&pool_p) == DDI_SUCCESS)
1252 		return (pool_p);
1253 
1254 	return (NULL);
1255 }
1256 
1257 /*
1258  * i_ddi_irm_insertion_sort()
1259  *
1260  *	Use the insertion sort method to insert a request into a list.
1261  *	The list is sorted in descending order by request size.
1262  */
1263 static void
1264 i_ddi_irm_insertion_sort(list_t *req_list, ddi_irm_req_t *req_p)
1265 {
1266 	ddi_irm_req_t	*next_p;
1267 
1268 	next_p = list_head(req_list);
1269 
1270 	while (next_p && (next_p->ireq_nreq > req_p->ireq_nreq))
1271 		next_p = list_next(req_list, next_p);
1272 
1273 	list_insert_before(req_list, next_p, req_p);
1274 }
1275 
1276 /*
1277  * i_ddi_irm_notify()
1278  *
1279  *	Notify a driver of changes to its interrupt request using the
1280  *	generic callback mechanism.  Checks for errors in processing.
1281  */
1282 static int
1283 i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
1284 {
1285 	ddi_cb_action_t	action;
1286 	ddi_cb_t	*cb_p;
1287 	uint_t		nintrs;
1288 	int		ret, count;
1289 
1290 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: pool_p %p req_p %p\n",
1291 	    (void *)pool_p, (void *)req_p));
1292 
1293 	/* Do not notify new or unchanged requests */
1294 	if ((req_p->ireq_navail == req_p->ireq_scratch) ||
1295 	    (req_p->ireq_flags & DDI_IRM_FLAG_NEW))
1296 		return (DDI_SUCCESS);
1297 
1298 	/* Determine action and count */
1299 	if (req_p->ireq_navail > req_p->ireq_scratch) {
1300 		action = DDI_CB_INTR_ADD;
1301 		count = req_p->ireq_navail - req_p->ireq_scratch;
1302 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: adding %d\n",
1303 		    count));
1304 	} else {
1305 		action = DDI_CB_INTR_REMOVE;
1306 		count = req_p->ireq_scratch - req_p->ireq_navail;
1307 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: removing %d\n",
1308 		    count));
1309 	}
1310 
1311 	/* Lookup driver callback */
1312 	if ((cb_p = DEVI(req_p->ireq_dip)->devi_cb_p) == NULL) {
1313 		DDI_INTR_IRMDBG((CE_WARN, "i_ddi_irm_notify: no callback!\n"));
1314 		return (DDI_FAILURE);
1315 	}
1316 
1317 	/* Do callback */
1318 	ret = cb_p->cb_func(req_p->ireq_dip, action, (void *)(uintptr_t)count,
1319 	    cb_p->cb_arg1, cb_p->cb_arg2);
1320 
1321 	/* Log callback errors */
1322 	if (ret != DDI_SUCCESS) {
1323 		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
1324 		    ddi_driver_name(req_p->ireq_dip),
1325 		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);
1326 	}
1327 
1328 	/* Check if the driver exceeds its availability */
1329 	nintrs = i_ddi_intr_get_current_nintrs(req_p->ireq_dip);
1330 	if (nintrs > req_p->ireq_navail) {
1331 		cmn_err(CE_WARN, "%s%d: failed to release interrupts "
1332 		    "(nintrs=%d, navail=%d).\n",
1333 		    ddi_driver_name(req_p->ireq_dip),
1334 		    ddi_get_instance(req_p->ireq_dip), nintrs,
1335 		    req_p->ireq_navail);
1336 		pool_p->ipool_resno += (nintrs - req_p->ireq_navail);
1337 		req_p->ireq_navail = nintrs;
1338 		return (DDI_FAILURE);
1339 	}
1340 
1341 	/* Update request */
1342 	req_p->ireq_scratch = req_p->ireq_navail;
1343 
1344 	return (DDI_SUCCESS);
1345 }
1346 
/*
 * i_ddi_irm_debug_balance()
 *
 *	Debug/test only hook that queues a balance of the interrupt pool
 *	serving a device's current interrupt type.  With a true
 *	'wait_flag' the request is made as a waiter through
 *	i_ddi_irm_enqueue().  Does nothing if the device has no current
 *	interrupt type or no pool can be found for it.
 */
#ifdef	DEBUG
void
i_ddi_irm_debug_balance(dev_info_t *dip, boolean_t wait_flag)
{
	ddi_irm_pool_t	*pool_p;
	int		type;

	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_debug_balance: dip %p wait %d\n",
	    (void *)dip, (int)wait_flag));

	/* A device without a current interrupt type has nothing to do. */
	type = i_ddi_intr_get_current_type(dip);
	if (type == 0)
		return;

	pool_p = i_ddi_intr_get_pool(dip, type);
	if (pool_p == NULL)
		return;

	/* i_ddi_irm_enqueue() requires the pool lock to be held. */
	mutex_enter(&pool_p->ipool_lock);
	i_ddi_irm_enqueue(pool_p, wait_flag);
	mutex_exit(&pool_p->ipool_lock);
}
#endif	/* DEBUG */
1371