xref: /illumos-gate/usr/src/uts/common/os/ddi_intr_irm.c (revision 848f70c9866a9757d1c6dcb2b9db5e7c49997ba5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/note.h>
26 #include <sys/sysmacros.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kmem.h>
31 #include <sys/cmn_err.h>
32 #include <sys/debug.h>
33 #include <sys/ddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/ndi_impldefs.h>	/* include prototypes */
36 
37 #if defined(__i386) || defined(__amd64)
38 /*
39  * MSI-X allocation limit.
40  */
41 extern uint_t		ddi_msix_alloc_limit;
42 #endif
43 
44 /*
45  * Interrupt Resource Management (IRM).
46  */
47 
/* Default value of the 'irm_balance_delay' tunable, in seconds */
#define	DDI_IRM_BALANCE_DELAY	(60)

/*
 * Non-zero when the callback structure pointer 'c' is non-NULL and has
 * DDI_CB_FLAG_INTR set in its cb_flags.  The argument is parenthesized
 * at every use so that any pointer expression may be passed; note that
 * it is still evaluated twice.
 */
#define	DDI_IRM_HAS_CB(c)	((c) && ((c)->cb_flags & DDI_CB_FLAG_INTR))

/*
 * Non-zero when the request 'r' may have its allocation reduced: it is
 * either an MSI-X request flagged DDI_IRM_FLAG_CALLBACK, or a request
 * that still carries DDI_IRM_FLAG_NEW.  The argument is parenthesized
 * at every use; it is evaluated more than once.
 */
#define	DDI_IRM_IS_REDUCIBLE(r)	\
	(((((r)->ireq_flags) & DDI_IRM_FLAG_CALLBACK) && \
	(((r)->ireq_type) == DDI_INTR_TYPE_MSIX)) || \
	(((r)->ireq_flags) & DDI_IRM_FLAG_NEW))
55 
56 extern pri_t	minclsyspri;
57 
58 /* Global policies */
59 int		irm_enable = 1;
60 boolean_t	irm_active = B_FALSE;
61 int		irm_default_policy = DDI_IRM_POLICY_LARGE;
62 uint_t		irm_balance_delay = DDI_IRM_BALANCE_DELAY;
63 
64 /* Global list of interrupt pools */
65 kmutex_t	irm_pools_lock;
66 list_t		irm_pools_list;
67 
68 /* Global debug tunables */
69 #ifdef	DEBUG
70 int		irm_debug_policy = 0;
71 uint_t		irm_debug_size = 0;
72 #endif	/* DEBUG */
73 
74 static void	irm_balance_thread(ddi_irm_pool_t *);
75 static void	i_ddi_irm_balance(ddi_irm_pool_t *);
76 static void	i_ddi_irm_enqueue(ddi_irm_pool_t *, boolean_t);
77 static void	i_ddi_irm_reduce(ddi_irm_pool_t *pool);
78 static int	i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *, int, int);
79 static void	i_ddi_irm_reduce_new(ddi_irm_pool_t *, int);
80 static void	i_ddi_irm_insertion_sort(list_t *, ddi_irm_req_t *);
81 static int	i_ddi_irm_notify(ddi_irm_pool_t *, ddi_irm_req_t *);
82 static int	i_ddi_irm_modify_increase(ddi_irm_req_t *, int);
83 
84 /*
85  * OS Initialization Routines
86  */
87 
88 /*
89  * irm_init()
90  *
91  *	Initialize IRM subsystem before any drivers are attached.
92  */
93 void
94 irm_init(void)
95 {
96 	/* Do nothing if IRM is disabled */
97 	if (!irm_enable)
98 		return;
99 
100 	/* Verify that the default balancing policy is valid */
101 	if (!DDI_IRM_POLICY_VALID(irm_default_policy))
102 		irm_default_policy = DDI_IRM_POLICY_LARGE;
103 
104 	/* Initialize the global list of interrupt pools */
105 	mutex_init(&irm_pools_lock, NULL, MUTEX_DRIVER, NULL);
106 	list_create(&irm_pools_list, sizeof (ddi_irm_pool_t),
107 	    offsetof(ddi_irm_pool_t, ipool_link));
108 }
109 
110 /*
111  * i_ddi_irm_poststartup()
112  *
113  *	IRM is not activated until after the IO subsystem is initialized.
114  *	When activated, per-pool balancing threads are spawned and a flag
115  *	is set so that all future pools will be activated when created.
116  *
117  *	NOTE: the global variable 'irm_enable' disables IRM if zero.
118  */
119 void
120 i_ddi_irm_poststartup(void)
121 {
122 	ddi_irm_pool_t	*pool_p;
123 
124 	/* Do nothing if IRM is disabled */
125 	if (!irm_enable)
126 		return;
127 
128 	/* Lock the global list */
129 	mutex_enter(&irm_pools_lock);
130 
131 	/* Activate all defined pools */
132 	for (pool_p = list_head(&irm_pools_list); pool_p;
133 	    pool_p = list_next(&irm_pools_list, pool_p))
134 		pool_p->ipool_thread = thread_create(NULL, 0,
135 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
136 
137 	/* Set future pools to be active */
138 	irm_active = B_TRUE;
139 
140 	/* Unlock the global list */
141 	mutex_exit(&irm_pools_lock);
142 }
143 
144 /*
145  * NDI interfaces for creating/destroying IRM pools.
146  */
147 
148 /*
149  * ndi_irm_create()
150  *
151  *	Nexus interface to create an IRM pool.  Create the new
152  *	pool and add it to the global list of interrupt pools.
153  */
154 int
155 ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
156     ddi_irm_pool_t **pool_retp)
157 {
158 	ddi_irm_pool_t	*pool_p;
159 
160 	ASSERT(dip != NULL);
161 	ASSERT(paramsp != NULL);
162 	ASSERT(pool_retp != NULL);
163 	ASSERT(paramsp->iparams_total >= 1);
164 	ASSERT(paramsp->iparams_types != 0);
165 
166 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_create: dip %p\n", (void *)dip));
167 
168 	/* Check if IRM is enabled */
169 	if (!irm_enable)
170 		return (NDI_FAILURE);
171 
172 	/* Validate parameters */
173 	if ((dip == NULL) || (paramsp == NULL) || (pool_retp == NULL) ||
174 	    (paramsp->iparams_total < 1) || (paramsp->iparams_types == 0))
175 		return (NDI_FAILURE);
176 
177 	/* Allocate and initialize the pool */
178 	pool_p = kmem_zalloc(sizeof (ddi_irm_pool_t), KM_SLEEP);
179 	pool_p->ipool_owner = dip;
180 	pool_p->ipool_policy = irm_default_policy;
181 	pool_p->ipool_types = paramsp->iparams_types;
182 	pool_p->ipool_totsz = paramsp->iparams_total;
183 	pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, MAX(DDI_MIN_MSIX_ALLOC,
184 	    paramsp->iparams_total / DDI_MSIX_ALLOC_DIVIDER));
185 	list_create(&pool_p->ipool_req_list, sizeof (ddi_irm_req_t),
186 	    offsetof(ddi_irm_req_t, ireq_link));
187 	list_create(&pool_p->ipool_scratch_list, sizeof (ddi_irm_req_t),
188 	    offsetof(ddi_irm_req_t, ireq_scratch_link));
189 	cv_init(&pool_p->ipool_cv, NULL, CV_DRIVER, NULL);
190 	mutex_init(&pool_p->ipool_lock, NULL, MUTEX_DRIVER, NULL);
191 	mutex_init(&pool_p->ipool_navail_lock, NULL, MUTEX_DRIVER, NULL);
192 
193 	/* Add to global list of pools */
194 	mutex_enter(&irm_pools_lock);
195 	list_insert_tail(&irm_pools_list, pool_p);
196 	mutex_exit(&irm_pools_lock);
197 
198 	/* If IRM is active, then activate the pool */
199 	if (irm_active)
200 		pool_p->ipool_thread = thread_create(NULL, 0,
201 		    irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
202 
203 	*pool_retp = pool_p;
204 	return (NDI_SUCCESS);
205 }
206 
207 /*
208  * ndi_irm_resize_pool()
209  *
210  *	Nexus interface to resize IRM pool. If the pool size drops
211  *	below  the allocated number of vectors then initiate rebalance
212  *	operation before resizing the pool. If rebalance operation fails
213  *	then return NDI_FAILURE.
214  */
215 int
216 ndi_irm_resize_pool(ddi_irm_pool_t *pool_p, uint_t new_size)
217 {
218 	uint_t prev_size;
219 
220 	ASSERT(pool_p != NULL);
221 
222 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
223 	    " current-size 0x%x new-size 0x%x\n",
224 	    (void *)pool_p, pool_p->ipool_totsz, new_size));
225 
226 	if (pool_p == NULL)
227 		return (NDI_EINVAL);
228 
229 	/* Check if IRM is enabled */
230 	if (!irm_enable)
231 		return (NDI_FAILURE);
232 
233 	mutex_enter(&pool_p->ipool_lock);
234 
235 	/*
236 	 * If we are increasing the pool size or if the reserved
237 	 * number of vectors is <= the new pool size then simply
238 	 * update the pool size and enqueue a reblance operation
239 	 * if necessary to use the new vectors.
240 	 */
241 	if ((pool_p->ipool_totsz < new_size) ||
242 	    (pool_p->ipool_resno <= new_size)) {
243 		/* set new pool size */
244 		pool_p->ipool_totsz = new_size;
245 		/* adjust the default allocation limit */
246 		pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
247 		    MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
248 		/* queue a rebalance operation to use the new vectors */
249 		if (pool_p->ipool_reqno > pool_p->ipool_resno)
250 			i_ddi_irm_enqueue(pool_p, B_FALSE);
251 		mutex_exit(&pool_p->ipool_lock);
252 		return (NDI_SUCCESS);
253 	}
254 
255 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
256 	    " needs a rebalance operation\n", (void *)pool_p));
257 
258 	/*
259 	 * requires a rebalance operation
260 	 */
261 	/* save the current pool size */
262 	prev_size = pool_p->ipool_totsz;
263 	/* set the pool size to the desired new value */
264 	pool_p->ipool_totsz = new_size;
265 	/* perform the rebalance operation */
266 	i_ddi_irm_enqueue(pool_p, B_TRUE);
267 
268 	/*
269 	 * If rebalance operation couldn't free up enough
270 	 * vectors then fail the resize operation.
271 	 */
272 	if (pool_p->ipool_resno > new_size) { /* rebalance failed */
273 		/* restore the pool size to the previous value */
274 		pool_p->ipool_totsz = prev_size;
275 		/* enqueue a rebalance operation for the original pool size */
276 		i_ddi_irm_enqueue(pool_p, B_FALSE);
277 		mutex_exit(&pool_p->ipool_lock);
278 		return (NDI_FAILURE);
279 	} else { /* rebalance worked */
280 		/* adjust the default allocation limit */
281 		pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
282 		    MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
283 		mutex_exit(&pool_p->ipool_lock);
284 		DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
285 		    " resized from %x to %x\n",
286 		    (void *)pool_p, prev_size, pool_p->ipool_totsz));
287 		return (NDI_SUCCESS);
288 	}
289 }
290 
291 /*
292  * ndi_irm_destroy()
293  *
294  *	Nexus interface to destroy an IRM pool.  Destroy the pool
295  *	and remove it from the global list of interrupt pools.
296  */
297 int
298 ndi_irm_destroy(ddi_irm_pool_t *pool_p)
299 {
300 	ASSERT(pool_p != NULL);
301 	ASSERT(pool_p->ipool_resno == 0);
302 
303 	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_destroy: pool_p %p\n",
304 	    (void *)pool_p));
305 
306 	/* Validate parameters */
307 	if (pool_p == NULL)
308 		return (NDI_FAILURE);
309 
310 	/* Validate that pool is empty */
311 	if (pool_p->ipool_resno != 0)
312 		return (NDI_BUSY);
313 
314 	/* Remove the pool from the global list */
315 	mutex_enter(&irm_pools_lock);
316 	list_remove(&irm_pools_list, pool_p);
317 	mutex_exit(&irm_pools_lock);
318 
319 	/* Terminate the balancing thread */
320 	mutex_enter(&pool_p->ipool_lock);
321 	if (pool_p->ipool_thread &&
322 	    (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) {
323 		pool_p->ipool_flags |= DDI_IRM_FLAG_EXIT;
324 		cv_signal(&pool_p->ipool_cv);
325 		mutex_exit(&pool_p->ipool_lock);
326 		thread_join(pool_p->ipool_thread->t_did);
327 	} else
328 		mutex_exit(&pool_p->ipool_lock);
329 
330 	/* Destroy the pool */
331 	cv_destroy(&pool_p->ipool_cv);
332 	mutex_destroy(&pool_p->ipool_lock);
333 	mutex_destroy(&pool_p->ipool_navail_lock);
334 	list_destroy(&pool_p->ipool_req_list);
335 	list_destroy(&pool_p->ipool_scratch_list);
336 	kmem_free(pool_p, sizeof (ddi_irm_pool_t));
337 
338 	return (NDI_SUCCESS);
339 }
340 
341 /*
342  * Insert/Modify/Remove Interrupt Requests
343  */
344 
345 /*
346  * i_ddi_irm_insert()
347  *
348  *	Insert a new request into an interrupt pool, and balance the pool.
349  */
350 int
351 i_ddi_irm_insert(dev_info_t *dip, int type, int count)
352 {
353 	ddi_irm_req_t	*req_p;
354 	devinfo_intr_t	*intr_p;
355 	ddi_irm_pool_t	*pool_p;
356 	uint_t		nreq, nmin, npartial;
357 	boolean_t	irm_flag = B_FALSE;
358 
359 	ASSERT(dip != NULL);
360 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
361 	ASSERT(count > 0);
362 
363 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: dip %p type %d count %d\n",
364 	    (void *)dip, type, count));
365 
366 	/* Validate parameters */
367 	if ((dip == NULL) || (count < 1) || !DDI_INTR_TYPE_FLAG_VALID(type)) {
368 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: invalid args\n"));
369 		return (DDI_EINVAL);
370 	}
371 
372 	/* Check for an existing request */
373 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
374 	    (intr_p->devi_irm_req_p != NULL))
375 		return (DDI_SUCCESS);
376 
377 	/* Check for IRM support from the system */
378 	if ((pool_p = i_ddi_intr_get_pool(dip, type)) == NULL) {
379 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: not supported\n"));
380 		return (DDI_ENOTSUP);
381 	}
382 
383 	/* Check for IRM support from the driver */
384 	if (i_ddi_irm_supported(dip, type) == DDI_SUCCESS)
385 		irm_flag = B_TRUE;
386 
387 	/* Determine request size */
388 	nreq = (irm_flag) ? count :
389 	    MIN(count, i_ddi_intr_get_limit(dip, type, pool_p));
390 	nmin = (irm_flag) ? 1 : nreq;
391 	npartial = MIN(nreq, pool_p->ipool_defsz);
392 
393 	/* Allocate and initialize the request */
394 	req_p = kmem_zalloc(sizeof (ddi_irm_req_t), KM_SLEEP);
395 	req_p->ireq_type = type;
396 	req_p->ireq_dip = dip;
397 	req_p->ireq_pool_p = pool_p;
398 	req_p->ireq_nreq = nreq;
399 	req_p->ireq_flags = DDI_IRM_FLAG_NEW;
400 	if (irm_flag)
401 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
402 
403 	/* Lock the pool */
404 	mutex_enter(&pool_p->ipool_lock);
405 
406 	/* Check for minimal fit before inserting */
407 	if ((pool_p->ipool_minno + nmin) > pool_p->ipool_totsz) {
408 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
409 		    ddi_driver_name(dip), ddi_get_instance(dip));
410 		mutex_exit(&pool_p->ipool_lock);
411 		kmem_free(req_p, sizeof (ddi_irm_req_t));
412 		return (DDI_EAGAIN);
413 	}
414 
415 	/* Insert the request into the pool */
416 	pool_p->ipool_reqno += nreq;
417 	pool_p->ipool_minno += nmin;
418 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
419 
420 	/*
421 	 * Try to fulfill the request.
422 	 *
423 	 * If all the interrupts are available, and either the request
424 	 * is static or the pool is active, then just take them directly.
425 	 *
426 	 * If only some of the interrupts are available, and the request
427 	 * can receive future callbacks, then take some now but queue the
428 	 * pool to be rebalanced later.
429 	 *
430 	 * Otherwise, immediately rebalance the pool and wait.
431 	 */
432 	if ((!irm_flag || (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) &&
433 	    ((pool_p->ipool_resno + nreq) <= pool_p->ipool_totsz)) {
434 
435 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
436 		    "request completely fulfilled.\n"));
437 		pool_p->ipool_resno += nreq;
438 		req_p->ireq_navail = nreq;
439 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
440 
441 	} else if (irm_flag &&
442 	    ((pool_p->ipool_resno + npartial) <= pool_p->ipool_totsz)) {
443 
444 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
445 		    "request partially fulfilled.\n"));
446 		pool_p->ipool_resno += npartial;
447 		req_p->ireq_navail = npartial;
448 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
449 		i_ddi_irm_enqueue(pool_p, B_FALSE);
450 
451 	} else {
452 
453 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
454 		    "request needs immediate rebalance.\n"));
455 		i_ddi_irm_enqueue(pool_p, B_TRUE);
456 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
457 	}
458 
459 	/* Fail if the request cannot be fulfilled at all */
460 	if (req_p->ireq_navail == 0) {
461 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
462 		    ddi_driver_name(dip), ddi_get_instance(dip));
463 		pool_p->ipool_reqno -= nreq;
464 		pool_p->ipool_minno -= nmin;
465 		list_remove(&pool_p->ipool_req_list, req_p);
466 		mutex_exit(&pool_p->ipool_lock);
467 		kmem_free(req_p, sizeof (ddi_irm_req_t));
468 		return (DDI_EAGAIN);
469 	}
470 
471 	/* Unlock the pool */
472 	mutex_exit(&pool_p->ipool_lock);
473 
474 	intr_p->devi_irm_req_p = req_p;
475 	return (DDI_SUCCESS);
476 }
477 
478 /*
479  * i_ddi_irm_modify()
480  *
481  *	Modify an existing request in an interrupt pool, and balance the pool.
482  */
483 int
484 i_ddi_irm_modify(dev_info_t *dip, int nreq)
485 {
486 	devinfo_intr_t	*intr_p;
487 	ddi_irm_req_t	*req_p;
488 	ddi_irm_pool_t	*pool_p;
489 	int		type;
490 	int		retval = DDI_SUCCESS;
491 
492 	ASSERT(dip != NULL);
493 	ASSERT(nreq > 0);
494 
495 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: dip %p nreq %d\n",
496 	    (void *)dip, nreq));
497 
498 	/* Validate parameters */
499 	if ((dip == NULL) || (nreq < 1)) {
500 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
501 		return (DDI_EINVAL);
502 	}
503 
504 	/* Do nothing if not mapped to an IRM pool */
505 	if (((intr_p = DEVI(dip)->devi_intr_p) == NULL) ||
506 	    ((req_p = intr_p->devi_irm_req_p) == NULL))
507 		return (DDI_SUCCESS);
508 
509 	/* Do nothing if new size is the same */
510 	if (nreq == req_p->ireq_nreq)
511 		return (DDI_SUCCESS);
512 
513 	/* Do not allow MSI requests to be resized */
514 	if ((type = req_p->ireq_type) == DDI_INTR_TYPE_MSI) {
515 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid type\n"));
516 		return (DDI_ENOTSUP);
517 	}
518 
519 	/* Select the pool */
520 	if ((pool_p = req_p->ireq_pool_p) == NULL) {
521 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: missing pool\n"));
522 		return (DDI_FAILURE);
523 	}
524 
525 	/* Validate request size is not too large */
526 	if (nreq > i_ddi_intr_get_limit(dip, type, pool_p)) {
527 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
528 		return (DDI_EINVAL);
529 	}
530 
531 	/* Lock the pool */
532 	mutex_enter(&pool_p->ipool_lock);
533 
534 	/*
535 	 * Process the modification.
536 	 *
537 	 *	- To increase a non-IRM request, call the implementation in
538 	 *	  i_ddi_irm_modify_increase().
539 	 *
540 	 *	- To decrease a non-IRM request, directly update the pool and
541 	 *	  request, then queue the pool for later rebalancing.
542 	 *
543 	 *	- To modify an IRM request, always queue the pool for later
544 	 *	  rebalancing.  IRM consumers rely upon callbacks for changes.
545 	 */
546 	if ((nreq > req_p->ireq_nreq) &&
547 	    (i_ddi_irm_supported(dip, type) != DDI_SUCCESS)) {
548 
549 		retval = i_ddi_irm_modify_increase(req_p, nreq);
550 
551 	} else {
552 
553 		/* Update pool and request */
554 		pool_p->ipool_reqno -= req_p->ireq_nreq;
555 		pool_p->ipool_reqno += nreq;
556 		if (i_ddi_irm_supported(dip, type) != DDI_SUCCESS) {
557 			pool_p->ipool_minno -= req_p->ireq_navail;
558 			pool_p->ipool_resno -= req_p->ireq_navail;
559 			pool_p->ipool_minno += nreq;
560 			pool_p->ipool_resno += nreq;
561 			req_p->ireq_navail = nreq;
562 		}
563 		req_p->ireq_nreq = nreq;
564 
565 		/* Re-sort request into the pool */
566 		list_remove(&pool_p->ipool_req_list, req_p);
567 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
568 
569 		/* Queue pool for asynchronous rebalance */
570 		i_ddi_irm_enqueue(pool_p, B_FALSE);
571 	}
572 
573 	/* Unlock the pool */
574 	mutex_exit(&pool_p->ipool_lock);
575 
576 	return (retval);
577 }
578 
579 /*
580  * i_ddi_irm_modify_increase()
581  *
582  *	Increase a non-IRM request.  The additional interrupts are
583  *	directly taken from the pool when possible.  Otherwise, an
584  *	immediate, synchronous rebalance is performed.  A temporary
585  *	proxy request is used for any rebalance operation to ensure
586  *	the request is not reduced below its current allocation.
587  *
588  *	NOTE: pool must already be locked.
589  */
590 static int
591 i_ddi_irm_modify_increase(ddi_irm_req_t *req_p, int nreq)
592 {
593 	dev_info_t	*dip = req_p->ireq_dip;
594 	ddi_irm_pool_t	*pool_p = req_p->ireq_pool_p;
595 	ddi_irm_req_t	new_req;
596 	int		count, delta;
597 
598 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
599 
600 	/* Compute number of additional vectors */
601 	count = nreq - req_p->ireq_nreq;
602 
603 	/* Check for minimal fit */
604 	if ((pool_p->ipool_minno + count) > pool_p->ipool_totsz) {
605 		cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
606 		    ddi_driver_name(dip), ddi_get_instance(dip));
607 		return (DDI_EAGAIN);
608 	}
609 
610 	/* Update the pool */
611 	pool_p->ipool_reqno += count;
612 	pool_p->ipool_minno += count;
613 
614 	/* Attempt direct implementation */
615 	if ((pool_p->ipool_resno + count) <= pool_p->ipool_totsz) {
616 		req_p->ireq_nreq += count;
617 		req_p->ireq_navail += count;
618 		pool_p->ipool_resno += count;
619 		return (DDI_SUCCESS);
620 	}
621 
622 	/* Rebalance required: fail if pool is not active */
623 	if ((pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE) == 0) {
624 		pool_p->ipool_reqno -= count;
625 		pool_p->ipool_minno -= count;
626 		return (DDI_EAGAIN);
627 	}
628 
629 	/* Insert temporary proxy request */
630 	bzero(&new_req, sizeof (ddi_irm_req_t));
631 	new_req.ireq_dip = dip;
632 	new_req.ireq_nreq = count;
633 	new_req.ireq_pool_p = pool_p;
634 	new_req.ireq_type = req_p->ireq_type;
635 	new_req.ireq_flags = DDI_IRM_FLAG_NEW;
636 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, &new_req);
637 
638 	/* Synchronously rebalance */
639 	i_ddi_irm_enqueue(pool_p, B_TRUE);
640 
641 	/* Remove proxy request, and merge into original request */
642 	req_p->ireq_nreq += count;
643 	if ((delta = (count - new_req.ireq_navail)) > 0) {
644 		req_p->ireq_nreq -= delta;
645 		pool_p->ipool_reqno -= delta;
646 		pool_p->ipool_minno -= delta;
647 	}
648 	req_p->ireq_navail += new_req.ireq_navail;
649 	list_remove(&pool_p->ipool_req_list, req_p);
650 	list_remove(&pool_p->ipool_req_list, &new_req);
651 	i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
652 
653 	return (DDI_SUCCESS);
654 }
655 
656 /*
657  * i_ddi_irm_remove()
658  *
659  *	Remove a request from an interrupt pool, and balance the pool.
660  */
661 int
662 i_ddi_irm_remove(dev_info_t *dip)
663 {
664 	devinfo_intr_t	*intr_p;
665 	ddi_irm_pool_t	*pool_p;
666 	ddi_irm_req_t	*req_p;
667 	uint_t		nmin;
668 
669 	ASSERT(dip != NULL);
670 
671 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: dip %p\n", (void *)dip));
672 
673 	/* Validate parameters */
674 	if (dip == NULL) {
675 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: invalid args\n"));
676 		return (DDI_EINVAL);
677 	}
678 
679 	/* Check if the device has a request */
680 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
681 	    !(req_p = intr_p->devi_irm_req_p)) {
682 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not found\n"));
683 		return (DDI_EINVAL);
684 	}
685 
686 	/* Lock the pool */
687 	pool_p = req_p->ireq_pool_p;
688 	mutex_enter(&pool_p->ipool_lock);
689 
690 	/* Remove request */
691 	nmin = DDI_IRM_IS_REDUCIBLE(req_p) ? 1 : req_p->ireq_nreq;
692 	pool_p->ipool_minno -= nmin;
693 	pool_p->ipool_reqno -= req_p->ireq_nreq;
694 	pool_p->ipool_resno -= req_p->ireq_navail;
695 	list_remove(&pool_p->ipool_req_list, req_p);
696 
697 	/* Queue pool to be rebalanced */
698 	i_ddi_irm_enqueue(pool_p, B_FALSE);
699 
700 	/* Unlock the pool */
701 	mutex_exit(&pool_p->ipool_lock);
702 
703 	/* Destroy the request */
704 	intr_p->devi_irm_req_p = NULL;
705 	kmem_free(req_p, sizeof (ddi_irm_req_t));
706 
707 	return (DDI_SUCCESS);
708 }
709 
710 /*
711  * i_ddi_irm_set_cb()
712  *
713  *	Change the callback flag for a request, in response to
714  *	a change in its callback registration.  Then rebalance
715  *	the interrupt pool.
716  *
717  *	NOTE: the request is not locked because the navail value
718  *	      is not directly affected.  The balancing thread may
719  *	      modify the navail value in the background after it
720  *	      locks the request itself.
721  */
722 void
723 i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
724 {
725 	devinfo_intr_t	*intr_p;
726 	ddi_irm_pool_t	*pool_p;
727 	ddi_irm_req_t	*req_p;
728 	uint_t		nreq;
729 
730 	ASSERT(dip != NULL);
731 
732 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: dip %p has_cb_flag %d\n",
733 	    (void *)dip, (int)has_cb_flag));
734 
735 	/* Validate parameters */
736 	if (dip == NULL)
737 		return;
738 
739 	/* Check for association with interrupt pool */
740 	if (!(intr_p = DEVI(dip)->devi_intr_p) ||
741 	    !(req_p = intr_p->devi_irm_req_p)) {
742 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: not in pool\n"));
743 		return;
744 	}
745 
746 	/* Lock the pool */
747 	pool_p = req_p->ireq_pool_p;
748 	mutex_enter(&pool_p->ipool_lock);
749 
750 	/*
751 	 * Update the request and the pool
752 	 */
753 	if (has_cb_flag) {
754 
755 		/* Update pool statistics */
756 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
757 			pool_p->ipool_minno -= (req_p->ireq_nreq - 1);
758 
759 		/* Update request */
760 		req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
761 
762 		/* Rebalance in background */
763 		i_ddi_irm_enqueue(pool_p, B_FALSE);
764 
765 	} else {
766 
767 		/* Determine new request size */
768 		nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
769 
770 #if defined(__i386) || defined(__amd64)
771 		/* Use the default static limit for non-IRM drivers */
772 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
773 			nreq = MIN(nreq, ddi_msix_alloc_limit);
774 #endif
775 
776 		/* Update pool statistics */
777 		pool_p->ipool_reqno -= req_p->ireq_nreq;
778 		pool_p->ipool_reqno += nreq;
779 		if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) {
780 			pool_p->ipool_minno -= 1;
781 			pool_p->ipool_minno += nreq;
782 		} else {
783 			pool_p->ipool_minno -= req_p->ireq_nreq;
784 			pool_p->ipool_minno += nreq;
785 		}
786 
787 		/* Update request size, and re-sort in pool */
788 		req_p->ireq_nreq = nreq;
789 		list_remove(&pool_p->ipool_req_list, req_p);
790 		i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
791 
792 		/* Rebalance synchronously, before losing callback */
793 		i_ddi_irm_enqueue(pool_p, B_TRUE);
794 
795 		/* Remove callback flag */
796 		req_p->ireq_flags &= ~(DDI_IRM_FLAG_CALLBACK);
797 	}
798 
799 	/* Unlock the pool */
800 	mutex_exit(&pool_p->ipool_lock);
801 }
802 
803 /*
804  * i_ddi_irm_supported()
805  *
806  *	Query if IRM is supported by a driver using a specific interrupt type.
807  *	Notice that IRM is limited to MSI-X users with registered callbacks.
808  */
809 int
810 i_ddi_irm_supported(dev_info_t *dip, int type)
811 {
812 	ddi_cb_t	*cb_p = DEVI(dip)->devi_cb_p;
813 
814 	return ((DDI_IRM_HAS_CB(cb_p) && (type == DDI_INTR_TYPE_MSIX)) ?
815 	    DDI_SUCCESS : DDI_ENOTSUP);
816 }
817 
818 /*
819  * Interrupt Pool Balancing
820  */
821 
822 /*
823  * irm_balance_thread()
824  *
825  *	One instance of this thread operates per each defined IRM pool.
826  *	It does the initial activation of the pool, as well as balancing
827  *	any requests that were queued up before the pool was active.
828  *	Once active, it waits forever to service balance operations.
829  */
830 static void
831 irm_balance_thread(ddi_irm_pool_t *pool_p)
832 {
833 	clock_t		interval;
834 
835 	DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: pool_p %p\n",
836 	    (void *)pool_p));
837 
838 	/* Lock the pool */
839 	mutex_enter(&pool_p->ipool_lock);
840 
841 	/* Perform initial balance if required */
842 	if (pool_p->ipool_reqno > pool_p->ipool_resno)
843 		i_ddi_irm_balance(pool_p);
844 
845 	/* Activate the pool */
846 	pool_p->ipool_flags |= DDI_IRM_FLAG_ACTIVE;
847 
848 	/* Main loop */
849 	for (;;) {
850 
851 		/* Compute the delay interval */
852 		interval = drv_usectohz(irm_balance_delay * 1000000);
853 
854 		/* Sleep until queued */
855 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
856 
857 		DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: signaled.\n"));
858 
859 		/* Wait one interval, or until there are waiters */
860 		if ((interval > 0) &&
861 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) &&
862 		    !(pool_p->ipool_flags & DDI_IRM_FLAG_EXIT)) {
863 			(void) cv_reltimedwait(&pool_p->ipool_cv,
864 			    &pool_p->ipool_lock, interval, TR_CLOCK_TICK);
865 		}
866 
867 		/* Check if awakened to exit */
868 		if (pool_p->ipool_flags & DDI_IRM_FLAG_EXIT) {
869 			DDI_INTR_IRMDBG((CE_CONT,
870 			    "irm_balance_thread: exiting...\n"));
871 			mutex_exit(&pool_p->ipool_lock);
872 			thread_exit();
873 		}
874 
875 		/* Balance the pool */
876 		i_ddi_irm_balance(pool_p);
877 
878 		/* Notify waiters */
879 		if (pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) {
880 			cv_broadcast(&pool_p->ipool_cv);
881 			pool_p->ipool_flags &= ~(DDI_IRM_FLAG_WAITERS);
882 		}
883 
884 		/* Clear QUEUED condition */
885 		pool_p->ipool_flags &= ~(DDI_IRM_FLAG_QUEUED);
886 	}
887 }
888 
889 /*
890  * i_ddi_irm_balance()
891  *
892  *	Balance a pool.  The general algorithm is to first reset all
893  *	requests to their maximum size, use reduction algorithms to
894  *	solve any imbalance, and then notify affected drivers.
895  */
896 static void
897 i_ddi_irm_balance(ddi_irm_pool_t *pool_p)
898 {
899 	ddi_irm_req_t	*req_p;
900 
901 #ifdef	DEBUG
902 	uint_t		debug_totsz = 0;
903 	int		debug_policy = 0;
904 #endif	/* DEBUG */
905 
906 	ASSERT(pool_p != NULL);
907 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
908 
909 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: pool_p %p\n",
910 	    (void *)pool_p));
911 
912 #ifdef	DEBUG	/* Adjust size and policy settings */
913 	if (irm_debug_size > pool_p->ipool_minno) {
914 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: debug size %d\n",
915 		    irm_debug_size));
916 		debug_totsz = pool_p->ipool_totsz;
917 		pool_p->ipool_totsz = irm_debug_size;
918 	}
919 	if (DDI_IRM_POLICY_VALID(irm_debug_policy)) {
920 		DDI_INTR_IRMDBG((CE_CONT,
921 		    "i_ddi_irm_balance: debug policy %d\n", irm_debug_policy));
922 		debug_policy = pool_p->ipool_policy;
923 		pool_p->ipool_policy = irm_debug_policy;
924 	}
925 #endif	/* DEBUG */
926 
927 	/* Lock the availability lock */
928 	mutex_enter(&pool_p->ipool_navail_lock);
929 
930 	/*
931 	 * Put all of the reducible requests into a scratch list.
932 	 * Reset each one of them to their maximum availability.
933 	 */
934 	for (req_p = list_head(&pool_p->ipool_req_list); req_p;
935 	    req_p = list_next(&pool_p->ipool_req_list, req_p)) {
936 		if (DDI_IRM_IS_REDUCIBLE(req_p)) {
937 			pool_p->ipool_resno -= req_p->ireq_navail;
938 			req_p->ireq_scratch = req_p->ireq_navail;
939 			req_p->ireq_navail = req_p->ireq_nreq;
940 			pool_p->ipool_resno += req_p->ireq_navail;
941 			list_insert_tail(&pool_p->ipool_scratch_list, req_p);
942 		}
943 	}
944 
945 	/* Balance the requests */
946 	i_ddi_irm_reduce(pool_p);
947 
948 	/* Unlock the availability lock */
949 	mutex_exit(&pool_p->ipool_navail_lock);
950 
951 	/*
952 	 * Process REMOVE notifications.
953 	 *
954 	 * If a driver fails to release interrupts: exclude it from
955 	 * further processing, correct the resulting imbalance, and
956 	 * start over again at the head of the scratch list.
957 	 */
958 	req_p = list_head(&pool_p->ipool_scratch_list);
959 	while (req_p) {
960 		if ((req_p->ireq_navail < req_p->ireq_scratch) &&
961 		    (i_ddi_irm_notify(pool_p, req_p) != DDI_SUCCESS)) {
962 			list_remove(&pool_p->ipool_scratch_list, req_p);
963 			mutex_enter(&pool_p->ipool_navail_lock);
964 			i_ddi_irm_reduce(pool_p);
965 			mutex_exit(&pool_p->ipool_navail_lock);
966 			req_p = list_head(&pool_p->ipool_scratch_list);
967 		} else {
968 			req_p = list_next(&pool_p->ipool_scratch_list, req_p);
969 		}
970 	}
971 
972 	/*
973 	 * Process ADD notifications.
974 	 *
975 	 * This is the last use of the scratch list, so empty it.
976 	 */
977 	while (req_p = list_remove_head(&pool_p->ipool_scratch_list)) {
978 		if (req_p->ireq_navail > req_p->ireq_scratch) {
979 			(void) i_ddi_irm_notify(pool_p, req_p);
980 		}
981 	}
982 
983 #ifdef	DEBUG	/* Restore size and policy settings */
984 	if (debug_totsz != 0)
985 		pool_p->ipool_totsz = debug_totsz;
986 	if (debug_policy != 0)
987 		pool_p->ipool_policy = debug_policy;
988 #endif	/* DEBUG */
989 }
990 
991 /*
992  * i_ddi_irm_reduce()
993  *
994  *	Use reduction algorithms to correct an imbalance in a pool.
995  */
996 static void
997 i_ddi_irm_reduce(ddi_irm_pool_t *pool_p)
998 {
999 	int	imbalance;
1000 
1001 	ASSERT(pool_p != NULL);
1002 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1003 	ASSERT(DDI_IRM_POLICY_VALID(pool_p->ipool_policy));
1004 
1005 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_reduce: pool_p %p\n",
1006 	    (void *)pool_p));
1007 
1008 	/* Compute the imbalance.  Do nothing if already balanced. */
1009 	if ((imbalance = pool_p->ipool_resno - pool_p->ipool_totsz) <= 0)
1010 		return;
1011 
1012 	/*
1013 	 * Try policy based reduction first. If it failed, then
1014 	 * possibly reduce new requests as a last resort.
1015 	 */
1016 	if (i_ddi_irm_reduce_by_policy(pool_p, imbalance, pool_p->ipool_policy)
1017 	    != DDI_SUCCESS) {
1018 
1019 		DDI_INTR_IRMDBG((CE_CONT,
1020 		    "i_ddi_irm_reduce: policy reductions failed.\n"));
1021 
1022 		/* Compute remaining imbalance */
1023 		imbalance = pool_p->ipool_resno - pool_p->ipool_totsz;
1024 
1025 		ASSERT(imbalance > 0);
1026 
1027 		i_ddi_irm_reduce_new(pool_p, imbalance);
1028 	}
1029 }
1030 
1031 /*
1032  * i_ddi_irm_enqueue()
1033  *
1034  *	Queue a pool to be balanced.  Signals the balancing thread to wake
1035  *	up and process the pool.  If 'wait_flag' is true, then the current
1036  *	thread becomes a waiter and blocks until the balance is completed.
1037  */
1038 static void
1039 i_ddi_irm_enqueue(ddi_irm_pool_t *pool_p, boolean_t wait_flag)
1040 {
1041 	ASSERT(pool_p != NULL);
1042 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1043 
1044 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool_p %p wait_flag %d\n",
1045 	    (void *)pool_p, (int)wait_flag));
1046 
1047 	/* Do nothing if pool is already balanced */
1048 #ifndef	DEBUG
1049 	if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
1050 #else
1051 	if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
1052 #endif	/* DEBUG */
1053 		DDI_INTR_IRMDBG((CE_CONT,
1054 		    "i_ddi_irm_enqueue: pool already balanced\n"));
1055 		return;
1056 	}
1057 
1058 	/* Avoid deadlocks when IRM is not active */
1059 	if (!irm_active && wait_flag) {
1060 		DDI_INTR_IRMDBG((CE_CONT,
1061 		    "i_ddi_irm_enqueue: pool not active.\n"));
1062 		return;
1063 	}
1064 
1065 	if (wait_flag)
1066 		pool_p->ipool_flags |= DDI_IRM_FLAG_WAITERS;
1067 
1068 	if (wait_flag || !(pool_p->ipool_flags & DDI_IRM_FLAG_QUEUED)) {
1069 		pool_p->ipool_flags |= DDI_IRM_FLAG_QUEUED;
1070 		cv_signal(&pool_p->ipool_cv);
1071 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool queued.\n"));
1072 	}
1073 
1074 	if (wait_flag) {
1075 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: waiting...\n"));
1076 		cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
1077 	}
1078 }
1079 
1080 /*
1081  * i_ddi_irm_reduce_by_policy()
1082  *
1083  *	Reduces requests based on reduction policies.
1084  *
1085  *	For the DDI_IRM_POLICY_LARGE reduction policy, the algorithm
1086  *	generally reduces larger requests first, before advancing
1087  *	to smaller requests.
1088  *	For the DDI_IRM_POLICY_EVEN reduction policy, the algorithm
1089  *	reduces requests evenly, without giving a specific preference
1090  *	to smaller or larger requests. Each iteration reduces all
1091  *	reducible requests by the same amount until the imbalance is
1092  *	corrected.
1093  *
1094  *	The scratch list is initially sorted in descending order by current
1095  *	navail values, which are maximized prior to reduction. This sorted
1096  *	order is preserved.  It avoids reducing requests below the threshold
1097  *	of the interrupt pool's default allocation size.
1098  *
1099  *	Optimizations in this algorithm include trying to reduce multiple
1100  *	requests together.  And the algorithm attempts to reduce in larger
1101  *	increments when possible to minimize the total number of iterations.
1102  */
1103 static int
1104 i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *pool_p, int imbalance, int policy)
1105 {
1106 	ASSERT(pool_p != NULL);
1107 	ASSERT(imbalance > 0);
1108 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1109 
1110 	while (imbalance > 0) {
1111 		list_t		*slist_p = &pool_p->ipool_scratch_list;
1112 		ddi_irm_req_t	*req_p = list_head(slist_p), *last_p;
1113 		uint_t		nreduce = 0, nremain = 0, stop_navail;
1114 		uint_t		pool_defsz = pool_p->ipool_defsz;
1115 		uint_t		reduction, max_redu;
1116 
1117 		/* Fail if none are reducible */
1118 		if (!req_p || req_p->ireq_navail <= pool_defsz) {
1119 			DDI_INTR_IRMDBG((CE_CONT,
1120 			    "i_ddi_irm_reduce_by_policy: Failure. "
1121 			    "All requests have downsized to low limit.\n"));
1122 			return (DDI_FAILURE);
1123 		}
1124 
1125 		/* Count reducible requests */
1126 		stop_navail = (policy == DDI_IRM_POLICY_LARGE) ?
1127 		    req_p->ireq_navail - 1 : pool_defsz;
1128 		for (; req_p; req_p = list_next(slist_p, req_p)) {
1129 			if (req_p->ireq_navail <= stop_navail)
1130 				break;
1131 			nreduce++;
1132 		}
1133 
1134 		/* Compute reduction */
1135 		last_p = req_p ? list_prev(slist_p, req_p) : list_tail(slist_p);
1136 		if ((policy == DDI_IRM_POLICY_LARGE) && req_p &&
1137 		    req_p->ireq_navail > pool_defsz)
1138 			reduction = last_p->ireq_navail - req_p->ireq_navail;
1139 		else
1140 			reduction = last_p->ireq_navail - pool_defsz;
1141 
1142 		if ((max_redu = reduction * nreduce) > imbalance) {
1143 			reduction = imbalance / nreduce;
1144 			nremain = imbalance % nreduce;
1145 			pool_p->ipool_resno -= imbalance;
1146 			imbalance = 0;
1147 		} else {
1148 			pool_p->ipool_resno -= max_redu;
1149 			imbalance -= max_redu;
1150 		}
1151 
1152 		/* Reduce */
1153 		for (req_p = list_head(slist_p); (reduction != 0) && nreduce--;
1154 		    req_p = list_next(slist_p, req_p)) {
1155 			req_p->ireq_navail -= reduction;
1156 		}
1157 
1158 		for (req_p = last_p; nremain--;
1159 		    req_p = list_prev(slist_p, req_p)) {
1160 			req_p->ireq_navail--;
1161 		}
1162 	}
1163 
1164 	return (DDI_SUCCESS);
1165 }
1166 
1167 /*
1168  * i_ddi_irm_reduce_new()
1169  *
1170  *	Reduces new requests.  This is only used as a last resort
1171  *	after another reduction algorithm failed.
1172  *
1173  *	NOTE: The pool locking in i_ddi_irm_insert() ensures
1174  *	there can be only one new request at a time in a pool.
1175  */
1176 static void
1177 i_ddi_irm_reduce_new(ddi_irm_pool_t *pool_p, int imbalance)
1178 {
1179 	ddi_irm_req_t	*req_p;
1180 
1181 	ASSERT(pool_p != NULL);
1182 	ASSERT(imbalance > 0);
1183 	ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1184 
1185 	DDI_INTR_IRMDBG((CE_CONT,
1186 	    "i_ddi_irm_reduce_new: pool_p %p imbalance %d\n",
1187 	    (void *)pool_p, imbalance));
1188 
1189 	for (req_p = list_head(&pool_p->ipool_scratch_list); req_p;
1190 	    req_p = list_next(&pool_p->ipool_scratch_list, req_p)) {
1191 		if (req_p->ireq_flags & DDI_IRM_FLAG_NEW) {
1192 			ASSERT(req_p->ireq_navail >= imbalance);
1193 			req_p->ireq_navail -= imbalance;
1194 			pool_p->ipool_resno -= imbalance;
1195 			return;
1196 		}
1197 	}
1198 
1199 	/* should never go here */
1200 	ASSERT(B_FALSE);
1201 }
1202 
1203 /*
1204  * Miscellaneous Helper Functions
1205  */
1206 
1207 /*
1208  * i_ddi_intr_get_pool()
1209  *
1210  *	Get an IRM pool that supplies interrupts of a specified type.
1211  *	Invokes a DDI_INTROP_GETPOOL to the bus nexus driver.  Fails
1212  *	if no pool exists.
1213  */
1214 ddi_irm_pool_t *
1215 i_ddi_intr_get_pool(dev_info_t *dip, int type)
1216 {
1217 	devinfo_intr_t		*intr_p;
1218 	ddi_irm_pool_t		*pool_p;
1219 	ddi_irm_req_t		*req_p;
1220 	ddi_intr_handle_impl_t	hdl;
1221 
1222 	ASSERT(dip != NULL);
1223 	ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
1224 
1225 	if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
1226 	    ((req_p = intr_p->devi_irm_req_p) != NULL) &&
1227 	    ((pool_p = req_p->ireq_pool_p) != NULL) &&
1228 	    (pool_p->ipool_types & type)) {
1229 		return (pool_p);
1230 	}
1231 
1232 	bzero(&hdl, sizeof (ddi_intr_handle_impl_t));
1233 	hdl.ih_dip = dip;
1234 	hdl.ih_type = type;
1235 
1236 	if (i_ddi_intr_ops(dip, dip, DDI_INTROP_GETPOOL,
1237 	    &hdl, (void *)&pool_p) == DDI_SUCCESS)
1238 		return (pool_p);
1239 
1240 	return (NULL);
1241 }
1242 
1243 /*
1244  * i_ddi_irm_insertion_sort()
1245  *
1246  *	Use the insertion sort method to insert a request into a list.
1247  *	The list is sorted in descending order by request size.
1248  */
1249 static void
1250 i_ddi_irm_insertion_sort(list_t *req_list, ddi_irm_req_t *req_p)
1251 {
1252 	ddi_irm_req_t	*next_p;
1253 
1254 	next_p = list_head(req_list);
1255 
1256 	while (next_p && (next_p->ireq_nreq > req_p->ireq_nreq))
1257 		next_p = list_next(req_list, next_p);
1258 
1259 	list_insert_before(req_list, next_p, req_p);
1260 }
1261 
1262 /*
1263  * i_ddi_irm_notify()
1264  *
1265  *	Notify a driver of changes to its interrupt request using the
1266  *	generic callback mechanism.  Checks for errors in processing.
1267  */
1268 static int
1269 i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
1270 {
1271 	ddi_cb_action_t	action;
1272 	ddi_cb_t	*cb_p;
1273 	uint_t		nintrs;
1274 	int		ret, count;
1275 
1276 	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: pool_p %p req_p %p\n",
1277 	    (void *)pool_p, (void *)req_p));
1278 
1279 	/* Do not notify new or unchanged requests */
1280 	if ((req_p->ireq_navail == req_p->ireq_scratch) ||
1281 	    (req_p->ireq_flags & DDI_IRM_FLAG_NEW))
1282 		return (DDI_SUCCESS);
1283 
1284 	/* Determine action and count */
1285 	if (req_p->ireq_navail > req_p->ireq_scratch) {
1286 		action = DDI_CB_INTR_ADD;
1287 		count = req_p->ireq_navail - req_p->ireq_scratch;
1288 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: adding %d\n",
1289 		    count));
1290 	} else {
1291 		action = DDI_CB_INTR_REMOVE;
1292 		count = req_p->ireq_scratch - req_p->ireq_navail;
1293 		DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: removing %d\n",
1294 		    count));
1295 	}
1296 
1297 	/* Lookup driver callback */
1298 	if ((cb_p = DEVI(req_p->ireq_dip)->devi_cb_p) == NULL) {
1299 		DDI_INTR_IRMDBG((CE_WARN, "i_ddi_irm_notify: no callback!\n"));
1300 		return (DDI_FAILURE);
1301 	}
1302 
1303 	/* Do callback */
1304 	ret = cb_p->cb_func(req_p->ireq_dip, action, (void *)(uintptr_t)count,
1305 	    cb_p->cb_arg1, cb_p->cb_arg2);
1306 
1307 	/* Log callback errors */
1308 	if (ret != DDI_SUCCESS) {
1309 		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
1310 		    ddi_driver_name(req_p->ireq_dip),
1311 		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);
1312 	}
1313 
1314 	/* Check if the driver exceeds its availability */
1315 	nintrs = i_ddi_intr_get_current_nintrs(req_p->ireq_dip);
1316 	if (nintrs > req_p->ireq_navail) {
1317 		cmn_err(CE_WARN, "%s%d: failed to release interrupts "
1318 		    "(nintrs=%d, navail=%d).\n",
1319 		    ddi_driver_name(req_p->ireq_dip),
1320 		    ddi_get_instance(req_p->ireq_dip), nintrs,
1321 		    req_p->ireq_navail);
1322 		pool_p->ipool_resno += (nintrs - req_p->ireq_navail);
1323 		req_p->ireq_navail = nintrs;
1324 		return (DDI_FAILURE);
1325 	}
1326 
1327 	/* Update request */
1328 	req_p->ireq_scratch = req_p->ireq_navail;
1329 
1330 	return (DDI_SUCCESS);
1331 }
1332 
/*
 * i_ddi_irm_debug_balance()
 *
 *	A debug/test only routine (DEBUG kernels) that queues the
 *	interrupt pool serving a device for rebalancing.  With
 *	'wait_flag' set the request is made as a waiter, i.e. the
 *	rebalance is synchronous from the caller's point of view.
 *
 *	Does nothing if the device has no current interrupt type or
 *	no pool supplies that type.
 */
#ifdef	DEBUG
void
i_ddi_irm_debug_balance(dev_info_t *dip, boolean_t wait_flag)
{
	ddi_irm_pool_t	*pool_p;
	int		type;

	DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_debug_balance: dip %p wait %d\n",
	    (void *)dip, (int)wait_flag));

	/* A device without a current interrupt type has nothing to balance. */
	type = i_ddi_intr_get_current_type(dip);
	if (type == 0)
		return;

	pool_p = i_ddi_intr_get_pool(dip, type);
	if (pool_p == NULL)
		return;

	/* i_ddi_irm_enqueue() requires the pool lock to be held. */
	mutex_enter(&pool_p->ipool_lock);
	i_ddi_irm_enqueue(pool_p, wait_flag);
	mutex_exit(&pool_p->ipool_lock);
}
#endif
1357