xref: /titanic_50/usr/src/uts/common/io/idm/idm.c (revision 09ce0d4acf1a79c720d7e54b60e87cbfa0f1b2d6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpuvar.h>
27 #include <sys/conf.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 
33 #include <sys/socket.h>
34 #include <sys/strsubr.h>
35 #include <sys/sysmacros.h>
36 
37 #include <sys/socketvar.h>
38 #include <netinet/in.h>
39 
40 #include <sys/idm/idm.h>
41 #include <sys/idm/idm_so.h>
42 
43 #define	IDM_NAME_VERSION	"iSCSI Data Mover"
44 
45 extern struct mod_ops mod_miscops;
47 
48 static struct modlmisc modlmisc = {
49 	&mod_miscops,	/* Type of module */
50 	IDM_NAME_VERSION
51 };
52 
53 static struct modlinkage modlinkage = {
54 	MODREV_1, (void *)&modlmisc, NULL
55 };
56 
57 extern int idm_task_compare(const void *t1, const void *t2);
58 extern void idm_wd_thread(void *arg);
59 
60 static int _idm_init(void);
61 static int _idm_fini(void);
62 static void idm_buf_bind_in_locked(idm_task_t *idt, idm_buf_t *buf);
63 static void idm_buf_bind_out_locked(idm_task_t *idt, idm_buf_t *buf);
64 static void idm_buf_unbind_in_locked(idm_task_t *idt, idm_buf_t *buf);
65 static void idm_buf_unbind_out_locked(idm_task_t *idt, idm_buf_t *buf);
66 static void idm_task_abort_one(idm_conn_t *ic, idm_task_t *idt,
67     idm_abort_type_t abort_type);
68 static void idm_task_aborted(idm_task_t *idt, idm_status_t status);
69 static idm_pdu_t *idm_pdu_alloc_common(uint_t hdrlen, uint_t datalen,
70     int sleepflag);
71 
72 boolean_t idm_conn_logging = 0;
73 boolean_t idm_svc_logging = 0;
74 #ifdef DEBUG
75 boolean_t idm_pattern_checking = 1;
76 #else
77 boolean_t idm_pattern_checking = 0;
78 #endif
79 
80 /*
81  * Potential tuneable for the maximum number of tasks.  Default to
82  * IDM_TASKIDS_MAX
83  */
84 
85 uint32_t	idm_max_taskids = IDM_TASKIDS_MAX;
86 
87 /*
88  * Global list of transport handles
89  *   These are listed in preferential order, so we can simply take the
90  *   first "it_conn_is_capable" hit. Note also that the order maps to
91  *   the order of the idm_transport_type_t list.
92  */
93 idm_transport_t idm_transport_list[] = {
94 
95 	/* iSER on InfiniBand transport handle */
96 	{IDM_TRANSPORT_TYPE_ISER,	/* type */
97 	"/devices/ib/iser@0:iser",	/* device path */
98 	NULL,				/* LDI handle */
99 	NULL,				/* transport ops */
100 	NULL},				/* transport caps */
101 
102 	/* IDM native sockets transport handle */
103 	{IDM_TRANSPORT_TYPE_SOCKETS,	/* type */
104 	NULL,				/* device path */
105 	NULL,				/* LDI handle */
106 	NULL,				/* transport ops */
107 	NULL}				/* transport caps */
108 
109 };
110 
111 int
112 _init(void)
113 {
114 	int rc;
115 
116 	if ((rc = _idm_init()) != 0) {
117 		return (rc);
118 	}
119 
120 	return (mod_install(&modlinkage));
121 }
122 
123 int
124 _fini(void)
125 {
126 	int rc;
127 
128 	if ((rc = _idm_fini()) != 0) {
129 		return (rc);
130 	}
131 
132 	if ((rc = mod_remove(&modlinkage)) != 0) {
133 		return (rc);
134 	}
135 
136 	return (rc);
137 }
138 
139 int
140 _info(struct modinfo *modinfop)
141 {
142 	return (mod_info(&modlinkage, modinfop));
143 }
144 
145 /*
146  * idm_transport_register()
147  *
148  * Provides a mechanism for an IDM transport driver to register its
149  * transport ops and caps with the IDM kernel module. Invoked during
150  * a transport driver's attach routine.
151  */
152 idm_status_t
153 idm_transport_register(idm_transport_attr_t *attr)
154 {
155 	ASSERT(attr->it_ops != NULL);
156 	ASSERT(attr->it_caps != NULL);
157 
158 	switch (attr->type) {
159 	/* All known non-native transports here; for now, iSER */
160 	case IDM_TRANSPORT_TYPE_ISER:
161 		idm_transport_list[attr->type].it_ops	= attr->it_ops;
162 		idm_transport_list[attr->type].it_caps	= attr->it_caps;
163 		return (IDM_STATUS_SUCCESS);
164 
165 	default:
166 		cmn_err(CE_NOTE, "idm: unknown transport type (0x%x) in "
167 		    "idm_transport_register", attr->type);
168 		return (IDM_STATUS_SUCCESS);
169 	}
170 }
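
/*
 * Example (editorial sketch, not part of the original source): a transport
 * driver such as iSER might register itself from its attach(9E) entry
 * point roughly as follows; iser_transport_ops and iser_transport_caps
 * are hypothetical names for the driver's own tables.
 *
 *	idm_transport_attr_t	attr;
 *
 *	attr.type = IDM_TRANSPORT_TYPE_ISER;
 *	attr.it_ops = &iser_transport_ops;
 *	attr.it_caps = &iser_transport_caps;
 *	if (idm_transport_register(&attr) != IDM_STATUS_SUCCESS)
 *		return (DDI_FAILURE);
 */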
171 
172 /*
173  * idm_ini_conn_create
174  *
175  * This function is invoked by the iSCSI layer to create a connection context.
176  * This does not actually establish the socket connection.
177  *
178  * cr - Connection request parameters
179  * new_con - Output parameter that receives the new connection if successful
180  *
181  */
182 idm_status_t
183 idm_ini_conn_create(idm_conn_req_t *cr, idm_conn_t **new_con)
184 {
185 	idm_transport_t		*it;
186 	idm_conn_t		*ic;
187 	int			rc;
188 
189 	it = idm_transport_lookup(cr);
190 
191 retry:
192 	ic = idm_conn_create_common(CONN_TYPE_INI, it->it_type,
193 	    &cr->icr_conn_ops);
194 
195 	bcopy(&cr->cr_ini_dst_addr, &ic->ic_ini_dst_addr,
196 	    sizeof (cr->cr_ini_dst_addr));
197 
198 	/* create the transport-specific connection components */
199 	rc = it->it_ops->it_ini_conn_create(cr, ic);
200 	if (rc != IDM_STATUS_SUCCESS) {
201 		/* cleanup the failed connection */
202 		idm_conn_destroy_common(ic);
203 		kmem_free(ic, sizeof (idm_conn_t));
204 
205 		/*
206 		 * It is possible for an IB client to connect to
207 		 * an ethernet-only client via an IB-eth gateway.
208 		 * Therefore, if we are attempting to use iSER and
209 		 * fail, retry with sockets before ultimately
210 		 * failing the connection.
211 		 */
212 		if (it->it_type == IDM_TRANSPORT_TYPE_ISER) {
213 			it = &idm_transport_list[IDM_TRANSPORT_TYPE_SOCKETS];
214 			goto retry;
215 		}
216 
217 		return (IDM_STATUS_FAIL);
218 	}
219 
220 	*new_con = ic;
221 
222 	mutex_enter(&idm.idm_global_mutex);
223 	list_insert_tail(&idm.idm_ini_conn_list, ic);
224 	mutex_exit(&idm.idm_global_mutex);
225 
226 	return (IDM_STATUS_SUCCESS);
227 }
228 
229 /*
230  * idm_ini_conn_destroy
231  *
232  * Releases any resources associated with the connection.  This is the
233  * complement to idm_ini_conn_create.
234  * ic - idm_conn_t structure representing the relevant connection
235  *
236  */
237 void
238 idm_ini_conn_destroy_task(void *ic_void)
239 {
240 	idm_conn_t *ic = ic_void;
241 
242 	ic->ic_transport_ops->it_ini_conn_destroy(ic);
243 	idm_conn_destroy_common(ic);
244 }
245 
246 void
247 idm_ini_conn_destroy(idm_conn_t *ic)
248 {
249 	/*
250 	 * It's reasonable for the initiator to call idm_ini_conn_destroy
251 	 * from within the context of the CN_CONNECT_DESTROY notification.
252 	 * That's a problem since we want to destroy the taskq for the
253 	 * state machine associated with the connection.  Remove the
254 	 * connection from the list right away then handle the remaining
255 	 * work via the idm_global_taskq.
256 	 */
257 	mutex_enter(&idm.idm_global_mutex);
258 	list_remove(&idm.idm_ini_conn_list, ic);
259 	mutex_exit(&idm.idm_global_mutex);
260 
261 	if (taskq_dispatch(idm.idm_global_taskq,
262 	    &idm_ini_conn_destroy_task, ic, TQ_SLEEP) == NULL) {
263 		cmn_err(CE_WARN,
264 		    "idm_ini_conn_destroy: Couldn't dispatch task");
265 	}
266 }
267 
268 /*
269  * idm_ini_conn_connect
270  *
271  * Establish connection to the remote system identified in idm_conn_t.
272  * The connection parameters including the remote IP address were established
273  * in the call to idm_ini_conn_create.  The IDM state machine will
274  * perform client notifications as necessary to prompt the initiator through
275  * the login process.  IDM also keeps a timer running so that if the login
276  * process doesn't complete in a timely manner it will fail.
277  *
278  * ic - idm_conn_t structure representing the relevant connection
279  *
280  * Returns success if the connection was established, otherwise some kind
281  * of meaningful error code.
282  *
283  * Upon return the login has either failed or is logging in (toward FFP)
284  */
285 idm_status_t
286 idm_ini_conn_connect(idm_conn_t *ic)
287 {
288 	idm_status_t	rc = IDM_STATUS_SUCCESS;
289 
290 	rc = idm_conn_sm_init(ic);
291 	if (rc != IDM_STATUS_SUCCESS) {
292 		return (ic->ic_conn_sm_status);
293 	}
294 
295 	/* Hold connection until we return */
296 	idm_conn_hold(ic);
297 
298 	/* Kick state machine */
299 	idm_conn_event(ic, CE_CONNECT_REQ, NULL);
300 
301 	/* Wait for login flag */
302 	mutex_enter(&ic->ic_state_mutex);
303 	while (!(ic->ic_state_flags & CF_LOGIN_READY) &&
304 	    !(ic->ic_state_flags & CF_ERROR)) {
305 		cv_wait(&ic->ic_state_cv, &ic->ic_state_mutex);
306 	}
307 	mutex_exit(&ic->ic_state_mutex);
308 
309 	if (ic->ic_state_flags & CF_ERROR) {
310 		/* ic->ic_conn_sm_status will contain the failure status */
311 		idm_conn_rele(ic);
312 		return (ic->ic_conn_sm_status);
313 	}
314 
315 	/* Ready to login */
316 	ASSERT(ic->ic_state_flags & CF_LOGIN_READY);
317 	(void) idm_notify_client(ic, CN_READY_FOR_LOGIN, NULL);
318 
319 	idm_conn_rele(ic);
320 
321 	return (rc);
322 }
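
/*
 * Example (editorial sketch): a typical initiator-side sequence built on
 * the calls above.  Only icr_conn_ops and cr_ini_dst_addr are fields
 * referenced by this file; my_conn_ops and target_addr are hypothetical.
 *
 *	idm_conn_req_t	cr;
 *	idm_conn_t	*ic;
 *
 *	bzero(&cr, sizeof (cr));
 *	cr.icr_conn_ops = my_conn_ops;
 *	cr.cr_ini_dst_addr = target_addr;
 *	if (idm_ini_conn_create(&cr, &ic) != IDM_STATUS_SUCCESS)
 *		return (IDM_STATUS_FAIL);
 *	if (idm_ini_conn_connect(ic) != IDM_STATUS_SUCCESS) {
 *		idm_ini_conn_destroy(ic);
 *		return (IDM_STATUS_FAIL);
 *	}
 *	(login and full-feature I/O proceed, then:)
 *	idm_ini_conn_disconnect(ic);
 *	idm_ini_conn_destroy(ic);
 */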
323 
324 /*
325  * idm_ini_conn_disconnect
326  *
327  * Forces a connection (previously established using idm_ini_conn_connect)
328  * to perform a controlled shutdown, cleaning up any outstanding requests.
329  *
330  * ic - idm_conn_t structure representing the relevant connection
331  *
332  * This is asynchronous and will return before the connection is properly
333  * shutdown
334  */
335 /* ARGSUSED */
336 void
337 idm_ini_conn_disconnect(idm_conn_t *ic)
338 {
339 	idm_conn_event(ic, CE_TRANSPORT_FAIL, NULL);
340 }
341 
342 /*
343  * idm_ini_conn_disconnect_sync
344  *
345  * Forces a connection (previously established using idm_ini_conn_connect)
346  * to perform a controlled shutdown.  Blocks until the connection is
347  * disconnected.
348  *
349  * ic - idm_conn_t structure representing the relevant connection
350  */
351 /* ARGSUSED */
352 void
353 idm_ini_conn_disconnect_sync(idm_conn_t *ic)
354 {
355 	mutex_enter(&ic->ic_state_mutex);
356 	if ((ic->ic_state != CS_S9_INIT_ERROR) &&
357 	    (ic->ic_state != CS_S11_COMPLETE)) {
358 		idm_conn_event_locked(ic, CE_TRANSPORT_FAIL, NULL, CT_NONE);
359 		while ((ic->ic_state != CS_S9_INIT_ERROR) &&
360 		    (ic->ic_state != CS_S11_COMPLETE))
361 			cv_wait(&ic->ic_state_cv, &ic->ic_state_mutex);
362 	}
363 	mutex_exit(&ic->ic_state_mutex);
364 }
365 
366 /*
367  * idm_tgt_svc_create
368  *
369  * The target calls this service to obtain a service context for each available
370  * transport, starting a service of each type related to the IP address and port
371  * passed. The idm_svc_req_t contains the service parameters.
372  */
373 idm_status_t
374 idm_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t **new_svc)
375 {
376 	idm_transport_type_t	type;
377 	idm_transport_t		*it;
378 	idm_svc_t		*is;
379 	int			rc;
380 
381 	*new_svc = NULL;
382 	is = kmem_zalloc(sizeof (idm_svc_t), KM_SLEEP);
383 
384 	/* Initialize transport-agnostic components of the service handle */
385 	is->is_svc_req = *sr;
386 	mutex_init(&is->is_mutex, NULL, MUTEX_DEFAULT, NULL);
387 	cv_init(&is->is_cv, NULL, CV_DEFAULT, NULL);
388 	mutex_init(&is->is_count_mutex, NULL, MUTEX_DEFAULT, NULL);
389 	cv_init(&is->is_count_cv, NULL, CV_DEFAULT, NULL);
390 	idm_refcnt_init(&is->is_refcnt, is);
391 
392 	/*
393 	 * Make sure all available transports are setup.  We call this now
394 	 * instead of at initialization time in case IB has become available
395 	 * since we started (hotplug, etc).
396 	 */
397 	idm_transport_setup(sr->sr_li);
398 
399 	/*
400 	 * Loop through the transports, configuring the transport-specific
401 	 * components of each one.
402 	 */
403 	for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
404 
405 		it = &idm_transport_list[type];
406 		/*
407 		 * If it_ops is NULL then the transport is unconfigured
408 		 * and we shouldn't try to start the service.
409 		 */
410 		if (it->it_ops == NULL) {
411 			continue;
412 		}
413 
414 		rc = it->it_ops->it_tgt_svc_create(sr, is);
415 		if (rc != IDM_STATUS_SUCCESS) {
416 			/* Teardown any configured services */
417 			while (type--) {
418 				it = &idm_transport_list[type];
419 				if (it->it_ops == NULL) {
420 					continue;
421 				}
422 				it->it_ops->it_tgt_svc_destroy(is);
423 			}
424 			/* Free the svc context and return */
425 			kmem_free(is, sizeof (idm_svc_t));
426 			return (rc);
427 		}
428 	}
429 
430 	*new_svc = is;
431 
432 	mutex_enter(&idm.idm_global_mutex);
433 	list_insert_tail(&idm.idm_tgt_svc_list, is);
434 	mutex_exit(&idm.idm_global_mutex);
435 
436 	return (IDM_STATUS_SUCCESS);
437 }
438 
439 /*
440  * idm_tgt_svc_destroy
441  *
442  * is - idm_svc_t returned by the call to idm_tgt_svc_create
443  *
444  * Cleanup any resources associated with the idm_svc_t.
445  */
446 void
447 idm_tgt_svc_destroy(idm_svc_t *is)
448 {
449 	idm_transport_type_t	type;
450 	idm_transport_t		*it;
451 
452 	mutex_enter(&idm.idm_global_mutex);
453 	/* remove this service from the global list */
454 	list_remove(&idm.idm_tgt_svc_list, is);
455 	/* wakeup any waiters for service change */
456 	cv_broadcast(&idm.idm_tgt_svc_cv);
457 	mutex_exit(&idm.idm_global_mutex);
458 
459 	/* teardown each transport-specific service */
460 	for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
461 		it = &idm_transport_list[type];
462 		if (it->it_ops == NULL) {
463 			continue;
464 		}
465 
466 		it->it_ops->it_tgt_svc_destroy(is);
467 	}
468 
469 	/* tear down the svc resources */
470 	idm_refcnt_destroy(&is->is_refcnt);
471 	cv_destroy(&is->is_count_cv);
472 	mutex_destroy(&is->is_count_mutex);
473 	cv_destroy(&is->is_cv);
474 	mutex_destroy(&is->is_mutex);
475 
476 	/* free the svc handle */
477 	kmem_free(is, sizeof (idm_svc_t));
478 }
479 
480 void
481 idm_tgt_svc_hold(idm_svc_t *is)
482 {
483 	idm_refcnt_hold(&is->is_refcnt);
484 }
485 
486 void
487 idm_tgt_svc_rele_and_destroy(idm_svc_t *is)
488 {
489 	idm_refcnt_rele_and_destroy(&is->is_refcnt,
490 	    (idm_refcnt_cb_t *)&idm_tgt_svc_destroy);
491 }
492 
493 /*
494  * idm_tgt_svc_online
495  *
496  * is - idm_svc_t returned by the call to idm_tgt_svc_create
497  *
498  * Online each transport service, as we want this target to be accessible
499  * via any configured transport.
500  *
501  * When the initiator establishes a new connection to the target, IDM will
502  * call the "new connect" callback defined in the idm_svc_req_t structure
503  * and it will pass an idm_conn_t structure representing that new connection.
504  */
505 idm_status_t
506 idm_tgt_svc_online(idm_svc_t *is)
507 {
508 
509 	idm_transport_type_t	type, last_type;
510 	idm_transport_t		*it;
511 	int			rc = IDM_STATUS_SUCCESS;
512 
513 	mutex_enter(&is->is_mutex);
514 	if (is->is_online == 0) {
515 		/* Walk through each of the transports and online them */
516 		for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
517 			it = &idm_transport_list[type];
518 			if (it->it_ops == NULL) {
519 				/* transport is not registered */
520 				continue;
521 			}
522 
523 			mutex_exit(&is->is_mutex);
524 			rc = it->it_ops->it_tgt_svc_online(is);
525 			mutex_enter(&is->is_mutex);
526 			if (rc != IDM_STATUS_SUCCESS) {
527 				last_type = type;
528 				break;
529 			}
530 		}
531 		if (rc != IDM_STATUS_SUCCESS) {
532 			/*
533 			 * The last transport failed to online.
534 			 * Offline any transport onlined above and
535 			 * do not online the target.
536 			 */
537 			for (type = 0; type < last_type; type++) {
538 				it = &idm_transport_list[type];
539 				if (it->it_ops == NULL) {
540 					/* transport is not registered */
541 					continue;
542 				}
543 
544 				mutex_exit(&is->is_mutex);
545 				it->it_ops->it_tgt_svc_offline(is);
546 				mutex_enter(&is->is_mutex);
547 			}
548 		} else {
549 			/* Target service now online */
550 			is->is_online = 1;
551 		}
552 	} else {
553 		/* Target service already online, just bump the count */
554 		is->is_online++;
555 	}
556 	mutex_exit(&is->is_mutex);
557 
558 	return (rc);
559 }
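
/*
 * Example (editorial sketch): target-side service bring-up using the two
 * calls above.  Only sr_port and sr_li are idm_svc_req_t fields this file
 * itself references; filling in the "new connect" callback is elided.
 *
 *	idm_svc_req_t	sr;
 *	idm_svc_t	*svc;
 *
 *	bzero(&sr, sizeof (sr));
 *	sr.sr_port = 3260;
 *	(fill in sr_li and the connection callbacks here)
 *	if (idm_tgt_svc_create(&sr, &svc) != IDM_STATUS_SUCCESS)
 *		return (IDM_STATUS_FAIL);
 *	if (idm_tgt_svc_online(svc) != IDM_STATUS_SUCCESS) {
 *		idm_tgt_svc_destroy(svc);
 *		return (IDM_STATUS_FAIL);
 *	}
 */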
560 
561 /*
562  * idm_tgt_svc_offline
563  *
564  * is - idm_svc_t returned by the call to idm_tgt_svc_create
565  *
566  * Shutdown any online target services.
567  */
568 void
569 idm_tgt_svc_offline(idm_svc_t *is)
570 {
571 	idm_transport_type_t	type;
572 	idm_transport_t		*it;
573 
574 	mutex_enter(&is->is_mutex);
575 	is->is_online--;
576 	if (is->is_online == 0) {
577 		/* Walk through each of the transports and offline them */
578 		for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
579 			it = &idm_transport_list[type];
580 			if (it->it_ops == NULL) {
581 				/* transport is not registered */
582 				continue;
583 			}
584 
585 			mutex_exit(&is->is_mutex);
586 			it->it_ops->it_tgt_svc_offline(is);
587 			mutex_enter(&is->is_mutex);
588 		}
589 	}
590 	mutex_exit(&is->is_mutex);
591 }
592 
593 /*
594  * idm_tgt_svc_lookup
595  *
596  * Lookup a service instance listening on the specified port
597  */
598 
599 idm_svc_t *
600 idm_tgt_svc_lookup(uint16_t port)
601 {
602 	idm_svc_t *result;
603 
604 retry:
605 	mutex_enter(&idm.idm_global_mutex);
606 	for (result = list_head(&idm.idm_tgt_svc_list);
607 	    result != NULL;
608 	    result = list_next(&idm.idm_tgt_svc_list, result)) {
609 		if (result->is_svc_req.sr_port == port) {
610 			if (result->is_online == 0) {
611 				/*
612 				 * A service exists on this port, but it
613 				 * is going away; wait for it to clean up.
614 				 */
615 				cv_wait(&idm.idm_tgt_svc_cv,
616 				    &idm.idm_global_mutex);
617 				mutex_exit(&idm.idm_global_mutex);
618 				goto retry;
619 			}
620 			idm_tgt_svc_hold(result);
621 			mutex_exit(&idm.idm_global_mutex);
622 			return (result);
623 		}
624 	}
625 	mutex_exit(&idm.idm_global_mutex);
626 
627 	return (NULL);
628 }
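
/*
 * Example (editorial sketch): idm_tgt_svc_lookup() returns the service
 * with a hold already taken, so a caller reusing a service on the same
 * port pairs the lookup with a release, e.g.:
 *
 *	svc = idm_tgt_svc_lookup(port);
 *	if (svc != NULL) {
 *		(use the already-online service)
 *		idm_tgt_svc_rele_and_destroy(svc);
 *	}
 */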
629 
630 /*
631  * idm_negotiate_key_values()
632  * Give IDM level a chance to negotiate any login parameters it should own.
633  *  -- leave unhandled parameters alone on request_nvl
634  *  -- move all handled parameters to response_nvl with an appropriate response
635  *  -- also add an entry to negotiated_nvl for any accepted parameters
636  */
637 kv_status_t
638 idm_negotiate_key_values(idm_conn_t *ic, nvlist_t *request_nvl,
639     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
640 {
641 	ASSERT(ic->ic_transport_ops != NULL);
642 	return (ic->ic_transport_ops->it_negotiate_key_values(ic,
643 	    request_nvl, response_nvl, negotiated_nvl));
644 }
645 
646 /*
647  * idm_notice_key_values()
648  * Activate at the IDM level any parameters that have been negotiated.
649  * Passes the set of key value pairs to the transport for activation.
650  * This will be invoked as the connection is entering full-feature mode.
651  */
652 void
653 idm_notice_key_values(idm_conn_t *ic, nvlist_t *negotiated_nvl)
654 {
655 	ASSERT(ic->ic_transport_ops != NULL);
656 	ic->ic_transport_ops->it_notice_key_values(ic, negotiated_nvl);
657 }
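
/*
 * Example (editorial sketch): a login negotiation path might drive the
 * two calls above roughly as follows; building request_nvl from the login
 * PDU, checking the returned kv_status_t, and error handling are elided.
 *
 *	nvlist_t	*response_nvl, *negotiated_nvl;
 *	kv_status_t	kvrc;
 *
 *	(void) nvlist_alloc(&response_nvl, NV_UNIQUE_NAME, KM_SLEEP);
 *	(void) nvlist_alloc(&negotiated_nvl, NV_UNIQUE_NAME, KM_SLEEP);
 *	kvrc = idm_negotiate_key_values(ic, request_nvl, response_nvl,
 *	    negotiated_nvl);
 *	(later, as the connection enters full-feature mode:)
 *	idm_notice_key_values(ic, negotiated_nvl);
 */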
658 
659 /*
660  * idm_buf_tx_to_ini
661  *
662  * This is IDM's implementation of the 'Put_Data' operational primitive.
663  *
664  * This function is invoked by a target iSCSI layer to request its local
665  * Datamover layer to transmit the Data-In PDU to the peer iSCSI layer
666  * on the remote iSCSI node. The I/O buffer represented by 'idb' is
667  * transferred to the initiator associated with task 'idt'. The connection
668  * info, contents of the Data-In PDU header, the DataDescriptorIn, BHS,
669  * and the callback (idb->idb_buf_cb) at transfer completion are
670  * provided as input.
671  *
672  * This data transfer takes place transparently to the remote iSCSI layer,
673  * i.e. without its participation.
674  *
675  * Using sockets, IDM implements the data transfer by segmenting the data
676  * buffer into appropriately sized iSCSI PDUs and transmitting them to the
677  * initiator. iSER performs the transfer using RDMA write.
678  *
679  */
680 idm_status_t
681 idm_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb,
682     uint32_t offset, uint32_t xfer_len,
683     idm_buf_cb_t idb_buf_cb, void *cb_arg)
684 {
685 	idm_status_t rc;
686 
687 	idb->idb_bufoffset = offset;
688 	idb->idb_xfer_len = xfer_len;
689 	idb->idb_buf_cb = idb_buf_cb;
690 	idb->idb_cb_arg = cb_arg;
691 	gethrestime(&idb->idb_xfer_start);
692 
693 	/*
694 	 * Buffer should not contain the pattern.  If the pattern is
695 	 * present then we've been asked to transmit uninitialized data
696 	 */
697 	IDM_BUFPAT_CHECK(idb, xfer_len, BP_CHECK_ASSERT);
698 
699 	mutex_enter(&idt->idt_mutex);
700 	switch (idt->idt_state) {
701 	case TASK_ACTIVE:
702 		idt->idt_tx_to_ini_start++;
703 		idm_task_hold(idt);
704 		idm_buf_bind_in_locked(idt, idb);
705 		idb->idb_in_transport = B_TRUE;
706 		rc = (*idt->idt_ic->ic_transport_ops->it_buf_tx_to_ini)
707 		    (idt, idb);
708 		return (rc);
709 
710 	case TASK_SUSPENDING:
711 	case TASK_SUSPENDED:
712 		/*
713 		 * Bind buffer but don't start a transfer since the task
714 		 * is suspended
715 		 */
716 		idm_buf_bind_in_locked(idt, idb);
717 		mutex_exit(&idt->idt_mutex);
718 		return (IDM_STATUS_SUCCESS);
719 
720 	case TASK_ABORTING:
721 	case TASK_ABORTED:
722 		/*
723 		 * Once the task is aborted, any buffers added to the
724 		 * idt_inbufv will never get cleaned up, so just return
725 		 * SUCCESS.  The buffer should get cleaned up by the
726 		 * client or framework once task_aborted has completed.
727 		 */
728 		mutex_exit(&idt->idt_mutex);
729 		return (IDM_STATUS_SUCCESS);
730 
731 	default:
732 		ASSERT(0);
733 		break;
734 	}
735 	mutex_exit(&idt->idt_mutex);
736 
737 	return (IDM_STATUS_FAIL);
738 }
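
/*
 * Example (editorial sketch): a target sending SCSI READ data might start
 * the transfer like this; my_tx_done is a hypothetical callback matching
 * the idm_buf_cb_t signature implied by the (*idb->idb_buf_cb)(idb, status)
 * invocations below.
 *
 *	static void
 *	my_tx_done(idm_buf_t *idb, idm_status_t status)
 *	{
 *		(send SCSI status, or recover, based on status)
 *	}
 *
 *	rc = idm_buf_tx_to_ini(idt, idb, 0, idb->idb_buflen,
 *	    my_tx_done, NULL);
 */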
739 
740 /*
741  * idm_buf_rx_from_ini
742  *
743  * This is IDM's implementation of the 'Get_Data' operational primitive.
744  *
745  * This function is invoked by a target iSCSI layer to request its local
746  * Datamover layer to retrieve certain data identified by the R2T PDU from the
747  * peer iSCSI layer on the remote node. The retrieved Data-Out PDU will be
748  * mapped to the respective buffer by the task tags (ITT & TTT).
749  * The connection information, contents of an R2T PDU, DataDescriptor, BHS, and
750  * the callback (idb->idb_buf_cb) notification for data transfer completion
751  * are provided as input.
752  *
753  * When an iSCSI node sends an R2T PDU to its local Datamover layer, the local
754  * and remote Datamover layers transparently bring
755  * about the data transfer requested by the R2T PDU, without the participation
756  * of the iSCSI layers.
757  *
758  * Using sockets, IDM transmits an R2T PDU for each buffer and the rx_data_out()
759  * assembles the Data-Out PDUs into the buffer. iSER uses RDMA read.
760  *
761  */
762 idm_status_t
763 idm_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb,
764     uint32_t offset, uint32_t xfer_len,
765     idm_buf_cb_t idb_buf_cb, void *cb_arg)
766 {
767 	idm_status_t rc;
768 
769 	idb->idb_bufoffset = offset;
770 	idb->idb_xfer_len = xfer_len;
771 	idb->idb_buf_cb = idb_buf_cb;
772 	idb->idb_cb_arg = cb_arg;
773 	gethrestime(&idb->idb_xfer_start);
774 
775 	/*
776 	 * "In" buf list is for "Data In" PDU's, "Out" buf list is for
777 	 * "Data Out" PDU's
778 	 */
779 	mutex_enter(&idt->idt_mutex);
780 	switch (idt->idt_state) {
781 	case TASK_ACTIVE:
782 		idt->idt_rx_from_ini_start++;
783 		idm_task_hold(idt);
784 		idm_buf_bind_out_locked(idt, idb);
785 		idb->idb_in_transport = B_TRUE;
786 		rc = (*idt->idt_ic->ic_transport_ops->it_buf_rx_from_ini)
787 		    (idt, idb);
788 		return (rc);
789 	case TASK_SUSPENDING:
790 	case TASK_SUSPENDED:
791 	case TASK_ABORTING:
792 	case TASK_ABORTED:
793 		/*
794 		 * Bind buffer but don't start a transfer since the task
795 		 * is suspended or aborting
796 		 */
797 		idm_buf_bind_out_locked(idt, idb);
798 		mutex_exit(&idt->idt_mutex);
799 		return (IDM_STATUS_SUCCESS);
800 	default:
801 		ASSERT(0);
802 		break;
803 	}
804 	mutex_exit(&idt->idt_mutex);
805 
806 	return (IDM_STATUS_FAIL);
807 }
808 
809 /*
810  * idm_buf_tx_to_ini_done
811  *
812  * The transport calls this after it has completed a transfer requested by
813  * a call to transport_buf_tx_to_ini
814  *
815  * Caller holds idt->idt_mutex, idt->idt_mutex is released before returning.
816  * idt may be freed after the call to idb->idb_buf_cb.
817  */
818 void
819 idm_buf_tx_to_ini_done(idm_task_t *idt, idm_buf_t *idb, idm_status_t status)
820 {
821 	ASSERT(mutex_owned(&idt->idt_mutex));
822 	idb->idb_in_transport = B_FALSE;
823 	idb->idb_tx_thread = B_FALSE;
824 	idt->idt_tx_to_ini_done++;
825 	gethrestime(&idb->idb_xfer_done);
826 
827 	/*
828 	 * idm_refcnt_rele may cause TASK_SUSPENDING --> TASK_SUSPENDED or
829 	 * TASK_ABORTING --> TASK_ABORTED transition if the refcount goes
830 	 * to 0.
831 	 */
832 	idm_task_rele(idt);
833 	idb->idb_status = status;
834 
835 	switch (idt->idt_state) {
836 	case TASK_ACTIVE:
837 		idm_buf_unbind_in_locked(idt, idb);
838 		mutex_exit(&idt->idt_mutex);
839 		(*idb->idb_buf_cb)(idb, status);
840 		return;
841 	case TASK_SUSPENDING:
842 	case TASK_SUSPENDED:
843 	case TASK_ABORTING:
844 	case TASK_ABORTED:
845 		/*
846 		 * To keep things simple we will ignore the case where the
847 		 * transfer was successful and leave all buffers bound to the
848 		 * task.  This allows us to also ignore the case where we've
849 		 * been asked to abort a task but the last transfer of the
850 		 * task has completed.  IDM has no idea whether this was, in
851 		 * fact, the last transfer of the task so it would be difficult
852 		 * to handle this case.  Everything should get sorted out again
853 		 * after task reassignment is complete.
854 		 *
855 		 * In the case of TASK_ABORTING we could conceivably call the
856 		 * buffer callback here but the timing of when the client's
857 		 * client_task_aborted callback is invoked vs. when the client's
858 		 * buffer callback gets invoked gets sticky.  We don't want
859 		 * the client to hear from us again after the call to
860 		 * client_task_aborted() but we don't want to give it a bunch
861 		 * of failed buffer transfers until we've called
862 		 * client_task_aborted().  Instead we'll just leave all the
863 		 * buffers bound and allow the client to cleanup.
864 		 */
865 		break;
866 	default:
867 		ASSERT(0);
868 	}
869 	mutex_exit(&idt->idt_mutex);
870 }
871 
872 /*
873  * idm_buf_rx_from_ini_done
874  *
875  * The transport calls this after it has completed a transfer requested by
876  * a call to transport_buf_rx_from_ini
877  *
878  * Caller holds idt->idt_mutex, idt->idt_mutex is released before returning.
879  * idt may be freed after the call to idb->idb_buf_cb.
880  */
881 void
882 idm_buf_rx_from_ini_done(idm_task_t *idt, idm_buf_t *idb, idm_status_t status)
883 {
884 	ASSERT(mutex_owned(&idt->idt_mutex));
885 	idb->idb_in_transport = B_FALSE;
886 	idt->idt_rx_from_ini_done++;
887 	gethrestime(&idb->idb_xfer_done);
888 
889 	/*
890 	 * idm_refcnt_rele may cause TASK_SUSPENDING --> TASK_SUSPENDED or
891 	 * TASK_ABORTING --> TASK_ABORTED transition if the refcount goes
892 	 * to 0.
893 	 */
894 	idm_task_rele(idt);
895 	idb->idb_status = status;
896 
897 	if (status == IDM_STATUS_SUCCESS) {
898 		/*
899 		 * Buffer should not contain the pattern.  If it does then
900 		 * we did not get the data from the remote host.
901 		 */
902 		IDM_BUFPAT_CHECK(idb, idb->idb_xfer_len, BP_CHECK_ASSERT);
903 	}
904 
905 	switch (idt->idt_state) {
906 	case TASK_ACTIVE:
907 		idm_buf_unbind_out_locked(idt, idb);
908 		mutex_exit(&idt->idt_mutex);
909 		(*idb->idb_buf_cb)(idb, status);
910 		return;
911 	case TASK_SUSPENDING:
912 	case TASK_SUSPENDED:
913 	case TASK_ABORTING:
914 	case TASK_ABORTED:
915 		/*
916 		 * To keep things simple we will ignore the case where the
917 		 * transfer was successful and leave all buffers bound to the
918 		 * task.  This allows us to also ignore the case where we've
919 		 * been asked to abort a task but the last transfer of the
920 		 * task has completed.  IDM has no idea whether this was, in
921 		 * fact, the last transfer of the task so it would be difficult
922 		 * to handle this case.  Everything should get sorted out again
923 		 * after task reassignment is complete.
924 		 *
925 		 * In the case of TASK_ABORTING we could conceivably call the
926 		 * buffer callback here but the timing of when the client's
927 		 * client_task_aborted callback is invoked vs. when the client's
928 		 * buffer callback gets invoked gets sticky.  We don't want
929 		 * the client to hear from us again after the call to
930 		 * client_task_aborted() but we don't want to give it a bunch
931 		 * of failed buffer transfers until we've called
932 		 * client_task_aborted().  Instead we'll just leave all the
933 		 * buffers bound and allow the client to cleanup.
934 		 */
935 		break;
936 	default:
937 		ASSERT(0);
938 	}
939 	mutex_exit(&idt->idt_mutex);
940 }
941 
942 /*
943  * idm_buf_alloc
944  *
945  * Allocates a buffer handle and registers it for use with the transport
946  * layer. If a buffer is not passed in bufptr, the buffer will be allocated
947  * as well as the handle.
948  *
949  * ic		- connection on which the buffer will be transferred
950  * bufptr	- allocate memory for buffer if NULL, else assign to buffer
951  * buflen	- length of buffer
952  *
953  * Returns idm_buf_t handle if successful, otherwise NULL
954  */
955 idm_buf_t *
956 idm_buf_alloc(idm_conn_t *ic, void *bufptr, uint64_t buflen)
957 {
958 	idm_buf_t	*buf = NULL;
959 	int		rc;
960 
961 	ASSERT(ic != NULL);
962 	ASSERT(idm.idm_buf_cache != NULL);
963 	ASSERT(buflen > 0);
964 
965 	/* Don't allocate new buffers if we are not in FFP */
966 	mutex_enter(&ic->ic_state_mutex);
967 	if (!ic->ic_ffp) {
968 		mutex_exit(&ic->ic_state_mutex);
969 		return (NULL);
970 	}
971 
972 
973 	idm_conn_hold(ic);
974 	mutex_exit(&ic->ic_state_mutex);
975 
976 	buf = kmem_cache_alloc(idm.idm_buf_cache, KM_NOSLEEP);
977 	if (buf == NULL) {
978 		idm_conn_rele(ic);
979 		return (NULL);
980 	}
981 
982 	buf->idb_ic		= ic;
983 	buf->idb_buflen		= buflen;
984 	buf->idb_exp_offset	= 0;
985 	buf->idb_bufoffset	= 0;
986 	buf->idb_xfer_len 	= 0;
987 	buf->idb_magic		= IDM_BUF_MAGIC;
988 	buf->idb_in_transport	= B_FALSE;
989 	buf->idb_bufbcopy	= B_FALSE;
990 
991 	/*
992 	 * If bufptr is NULL, we have an implicit request to allocate
993 	 * memory for this IDM buffer handle and register it for use
994 	 * with the transport. To simplify this, and to give more freedom
995 	 * to the transport layer for its own buffer management, both of
996 	 * these actions will take place in the transport layer.
997 	 * If bufptr is set, then the caller has allocated memory (or more
998 	 * likely it's been passed from an upper layer), and we need only
999 	 * register the buffer for use with the transport layer.
1000 	 */
1001 	if (bufptr == NULL) {
1002 		/*
1003 		 * Allocate a buffer from the transport layer (which
1004 		 * will also register the buffer for use).
1005 		 */
1006 		rc = ic->ic_transport_ops->it_buf_alloc(buf, buflen);
1007 		if (rc != 0) {
1008 			idm_conn_rele(ic);
1009 			kmem_cache_free(idm.idm_buf_cache, buf);
1010 			return (NULL);
1011 		}
1012 		/* Set the bufalloc'd flag */
1013 		buf->idb_bufalloc = B_TRUE;
1014 	} else {
1015 		/*
1016 		 * For large transfers, set the passed bufptr into
1017 		 * the buf handle, and register the handle with the
1018 		 * transport layer. As memory registration with the
1019 		 * transport layer is a time/cpu intensive operation,
1020 		 * for small transfers (up to a pre-defined bcopy
1021 		 * threshold), use pre-registered memory buffers
1022 		 * and bcopy data at the appropriate time.
1023 		 */
1024 		buf->idb_buf = bufptr;
1025 
1026 		rc = ic->ic_transport_ops->it_buf_setup(buf);
1027 		if (rc != 0) {
1028 			idm_conn_rele(ic);
1029 			kmem_cache_free(idm.idm_buf_cache, buf);
1030 			return (NULL);
1031 		}
1032 		/*
1033 		 * The transport layer is now expected to set the idb_bufalloc
1034 		 * correctly to indicate if resources have been allocated.
1035 		 */
1036 	}
1037 
1038 	IDM_BUFPAT_SET(buf);
1039 
1040 	return (buf);
1041 }
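
/*
 * Example (editorial sketch): the two allocation modes described above.
 * caller_buf/caller_len stand in for memory handed down from an upper
 * layer.
 *
 *	(mode 1: the transport allocates and registers the memory)
 *	idm_buf_t *idb = idm_buf_alloc(ic, NULL, 8192);
 *
 *	(mode 2: caller supplies memory, only registration is needed)
 *	idm_buf_t *idb2 = idm_buf_alloc(ic, caller_buf, caller_len);
 *
 * Both calls return NULL when the connection is not in full-feature
 * phase or when resources are unavailable, so callers must check.
 */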
1042 
1043 /*
1044  * idm_buf_free
1045  *
1046  * Release a buffer handle along with the associated buffer that was allocated
1047  * or assigned with idm_buf_alloc
1048  */
1049 void
1050 idm_buf_free(idm_buf_t *buf)
1051 {
1052 	idm_conn_t *ic = buf->idb_ic;
1053 
1054 
1055 	buf->idb_task_binding	= NULL;
1056 
1057 	if (buf->idb_bufalloc) {
1058 		ic->ic_transport_ops->it_buf_free(buf);
1059 	} else {
1060 		ic->ic_transport_ops->it_buf_teardown(buf);
1061 	}
1062 	kmem_cache_free(idm.idm_buf_cache, buf);
1063 	idm_conn_rele(ic);
1064 }
1065 
1066 /*
1067  * idm_buf_bind_in
1068  *
1069  * This function associates a buffer with a task. This is only for use by the
1070  * iSCSI initiator, which has only one buffer per transfer direction.
1071  *
1072  */
1073 void
1074 idm_buf_bind_in(idm_task_t *idt, idm_buf_t *buf)
1075 {
1076 	mutex_enter(&idt->idt_mutex);
1077 	idm_buf_bind_in_locked(idt, buf);
1078 	mutex_exit(&idt->idt_mutex);
1079 }
1080 
1081 static void
1082 idm_buf_bind_in_locked(idm_task_t *idt, idm_buf_t *buf)
1083 {
1084 	buf->idb_task_binding = idt;
1085 	buf->idb_ic = idt->idt_ic;
1086 	idm_listbuf_insert(&idt->idt_inbufv, buf);
1087 }
1088 
1089 void
1090 idm_buf_bind_out(idm_task_t *idt, idm_buf_t *buf)
1091 {
1092 	/*
1093 	 * For small transfers, the iSER transport delegates to the IDM
1094 	 * layer the bcopy of the SCSI Write data for faster IOPS.
1095 	 */
1096 	if (buf->idb_bufbcopy == B_TRUE) {
1097 
1098 		bcopy(buf->idb_bufptr, buf->idb_buf, buf->idb_buflen);
1099 	}
1100 	mutex_enter(&idt->idt_mutex);
1101 	idm_buf_bind_out_locked(idt, buf);
1102 	mutex_exit(&idt->idt_mutex);
1103 }
1104 
1105 static void
1106 idm_buf_bind_out_locked(idm_task_t *idt, idm_buf_t *buf)
1107 {
1108 	buf->idb_task_binding = idt;
1109 	buf->idb_ic = idt->idt_ic;
1110 	idm_listbuf_insert(&idt->idt_outbufv, buf);
1111 }
1112 
1113 void
1114 idm_buf_unbind_in(idm_task_t *idt, idm_buf_t *buf)
1115 {
1116 	/*
1117 	 * For small transfers, the iSER transport delegates to the IDM
1118 	 * layer the bcopy of the SCSI Read data into the read buffer
1119 	 * for faster IOPS.
1120 	 */
1121 	if (buf->idb_bufbcopy == B_TRUE) {
1122 		bcopy(buf->idb_buf, buf->idb_bufptr, buf->idb_buflen);
1123 	}
1124 	mutex_enter(&idt->idt_mutex);
1125 	idm_buf_unbind_in_locked(idt, buf);
1126 	mutex_exit(&idt->idt_mutex);
1127 }
1128 
1129 static void
1130 idm_buf_unbind_in_locked(idm_task_t *idt, idm_buf_t *buf)
1131 {
1132 	list_remove(&idt->idt_inbufv, buf);
1133 }
1134 
1135 void
1136 idm_buf_unbind_out(idm_task_t *idt, idm_buf_t *buf)
1137 {
1138 	mutex_enter(&idt->idt_mutex);
1139 	idm_buf_unbind_out_locked(idt, buf);
1140 	mutex_exit(&idt->idt_mutex);
1141 }
1142 
1143 static void
1144 idm_buf_unbind_out_locked(idm_task_t *idt, idm_buf_t *buf)
1145 {
1146 	list_remove(&idt->idt_outbufv, buf);
1147 }
1148 
1149 /*
1150  * idm_buf_find() will lookup the idm_buf_t based on the relative offset in the
1151  * iSCSI PDU
1152  */
1153 idm_buf_t *
1154 idm_buf_find(void *lbuf, size_t data_offset)
1155 {
1156 	idm_buf_t	*idb;
1157 	list_t		*lst = (list_t *)lbuf;
1158 
1159 	/* iterate through the list to find the buffer */
1160 	for (idb = list_head(lst); idb != NULL; idb = list_next(lst, idb)) {
1161 
1162 		ASSERT((idb->idb_ic->ic_conn_type == CONN_TYPE_TGT) ||
1163 		    (idb->idb_bufoffset == 0));
1164 
1165 		if ((data_offset >= idb->idb_bufoffset) &&
1166 		    (data_offset < (idb->idb_bufoffset + idb->idb_buflen))) {
1167 
1168 			return (idb);
1169 		}
1170 	}
1171 
1172 	return (NULL);
1173 }
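
/*
 * Example (editorial sketch): a transport receiving a Data-Out PDU can
 * map the PDU's buffer offset back to the bound buffer; "offset" here is
 * the value recovered from the PDU header.
 *
 *	mutex_enter(&idt->idt_mutex);
 *	idb = idm_buf_find(&idt->idt_outbufv, offset);
 *	mutex_exit(&idt->idt_mutex);
 *	if (idb == NULL)
 *		(no bound buffer covers this offset: protocol error)
 */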
1174 
1175 void
1176 idm_bufpat_set(idm_buf_t *idb)
1177 {
1178 	idm_bufpat_t	*bufpat;
1179 	int		len, i;
1180 
1181 	len = idb->idb_buflen;
1182 	len = (len / sizeof (idm_bufpat_t)) * sizeof (idm_bufpat_t);
1183 
1184 	bufpat = idb->idb_buf;
1185 	for (i = 0; i < len; i += sizeof (idm_bufpat_t)) {
1186 		bufpat->bufpat_idb = idb;
1187 		bufpat->bufpat_bufmagic = IDM_BUF_MAGIC;
1188 		bufpat->bufpat_offset = i;
1189 		bufpat++;
1190 	}
1191 }
1192 
1193 boolean_t
1194 idm_bufpat_check(idm_buf_t *idb, int check_len, idm_bufpat_check_type_t type)
1195 {
1196 	idm_bufpat_t	*bufpat;
1197 	int		len, i;
1198 
1199 	len = (type == BP_CHECK_QUICK) ? sizeof (idm_bufpat_t) : check_len;
1200 	len = (len / sizeof (idm_bufpat_t)) * sizeof (idm_bufpat_t);
1201 	ASSERT(len <= idb->idb_buflen);
1202 	bufpat = idb->idb_buf;
1203 
1204 	/*
1205 	 * Don't check the pattern in buffers that came from outside IDM
1206 	 * (these will be buffers from the initiator that we opted not
1207 	 * to double-buffer)
1208 	 */
1209 	if (!idb->idb_bufalloc)
1210 		return (B_FALSE);
1211 
1212 	/*
1213 	 * Return true if we find the pattern anywhere in the buffer
1214 	 */
1215 	for (i = 0; i < len; i += sizeof (idm_bufpat_t)) {
1216 		if (BUFPAT_MATCH(bufpat, idb)) {
1217 			IDM_CONN_LOG(CE_WARN, "idm_bufpat_check found: "
1218 			    "idb %p bufpat %p "
1219 			    "bufpat_idb=%p bufmagic=%08x offset=%08x",
1220 			    (void *)idb, (void *)bufpat, bufpat->bufpat_idb,
1221 			    bufpat->bufpat_bufmagic, bufpat->bufpat_offset);
1222 			DTRACE_PROBE2(bufpat__pattern__found,
1223 			    idm_buf_t *, idb, idm_bufpat_t *, bufpat);
1224 			if (type == BP_CHECK_ASSERT) {
1225 				ASSERT(0);
1226 			}
1227 			return (B_TRUE);
1228 		}
1229 		bufpat++;
1230 	}
1231 
1232 	return (B_FALSE);
1233 }
1234 
1235 /*
1236  * idm_task_alloc
1237  *
1238  * This function will allocate an idm_task_t structure. A task tag is also
1239  * generated and saved in idt_tt. The task is not active.
1240  */
1241 idm_task_t *
1242 idm_task_alloc(idm_conn_t *ic)
1243 {
1244 	idm_task_t	*idt;
1245 
1246 	ASSERT(ic != NULL);
1247 
1248 	/* Don't allocate new tasks if we are not in FFP */
1249 	mutex_enter(&ic->ic_state_mutex);
1250 	if (!ic->ic_ffp) {
1251 		mutex_exit(&ic->ic_state_mutex);
1252 		return (NULL);
1253 	}
1254 	idt = kmem_cache_alloc(idm.idm_task_cache, KM_NOSLEEP);
1255 	if (idt == NULL) {
1256 		mutex_exit(&ic->ic_state_mutex);
1257 		return (NULL);
1258 	}
1259 
1260 	ASSERT(list_is_empty(&idt->idt_inbufv));
1261 	ASSERT(list_is_empty(&idt->idt_outbufv));
1262 
1263 	idm_conn_hold(ic);
1264 	mutex_exit(&ic->ic_state_mutex);
1265 
1266 	idt->idt_state		= TASK_IDLE;
1267 	idt->idt_ic		= ic;
1268 	idt->idt_private 	= NULL;
1269 	idt->idt_exp_datasn	= 0;
1270 	idt->idt_exp_rttsn	= 0;
1271 
1272 	return (idt);
1273 }
1274 
1275 /*
1276  * idm_task_start
1277  *
1278  * Mark the task active and initialize some stats. The caller
1279  * sets up the idm_task_t structure with a prior call to idm_task_alloc().
1280  * The task service does not function as a task/work engine; it is the
1281  * responsibility of the initiator to start the data transfer and free the
1282  * resources.
1283  */
1284 void
1285 idm_task_start(idm_task_t *idt, uintptr_t handle)
1286 {
1287 	ASSERT(idt != NULL);
1288 
1289 	/* mark the task as ACTIVE */
1290 	idt->idt_state = TASK_ACTIVE;
1291 	idt->idt_client_handle = handle;
1292 	idt->idt_tx_to_ini_start = idt->idt_tx_to_ini_done =
1293 	    idt->idt_rx_from_ini_start = idt->idt_rx_from_ini_done =
1294 	    idt->idt_tx_bytes = idt->idt_rx_bytes = 0;
1295 }
1296 
1297 /*
1298  * idm_task_done
1299  *
1300  * This function sets the state to indicate that the task is no longer active.
1301  */
1302 void
1303 idm_task_done(idm_task_t *idt)
1304 {
1305 	ASSERT(idt != NULL);
1306 
1307 	mutex_enter(&idt->idt_mutex);
1308 	idt->idt_state = TASK_IDLE;
1309 	mutex_exit(&idt->idt_mutex);
1310 
1311 	/*
1312 	 * Although unlikely it is possible for a reference to come in after
1313 	 * the client has decided the task is over but before we've marked
1314 	 * the task idle.  One specific unavoidable scenario is the case where
1315 	 * a received PDU with the matching ITT/TTT results in a successful
1316 	 * lookup of this task.  We are at the mercy of the remote node in
1317 	 * that case so we need to handle it.  Now that the task state
1318 	 * has changed no more references will occur so a simple call to
1319 	 * idm_refcnt_wait_ref should deal with the situation.
1320 	 */
1321 	idm_refcnt_wait_ref(&idt->idt_refcnt);
1322 	idm_refcnt_reset(&idt->idt_refcnt);
1323 }
1324 
1325 /*
1326  * idm_task_free
1327  *
1328  * This function will free the Task Tag and the memory allocated for the task.
1329  * idm_task_done should be called prior to this call.
1330  */
1331 void
1332 idm_task_free(idm_task_t *idt)
1333 {
1334 	idm_conn_t *ic;
1335 
1336 	ASSERT(idt != NULL);
1337 	ASSERT(idt->idt_refcnt.ir_refcnt == 0);
1338 	ASSERT(idt->idt_state == TASK_IDLE);
1339 
1340 	ic = idt->idt_ic;
1341 
1342 	/*
1343 	 * It's possible for items to still be in the idt_inbufv list if
1344 	 * they were added after idm_task_cleanup was called.  We rely on
1345 	 * STMF to free all buffers associated with the task; however, STMF
1346 	 * doesn't know that we have this reference to the buffers.
1347 	 * Use list_create so that we don't end up with stale references
1348 	 * to these buffers.
1349 	 */
1350 	list_create(&idt->idt_inbufv, sizeof (idm_buf_t),
1351 	    offsetof(idm_buf_t, idb_buflink));
1352 	list_create(&idt->idt_outbufv, sizeof (idm_buf_t),
1353 	    offsetof(idm_buf_t, idb_buflink));
1354 
1355 	kmem_cache_free(idm.idm_task_cache, idt);
1356 
1357 	idm_conn_rele(ic);
1358 }
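
/*
 * Example (editorial sketch): the normal task lifecycle around the calls
 * above; my_task_ctx is a hypothetical client structure, and the handle
 * passed to idm_task_start becomes idt_client_handle.
 *
 *	idm_task_t *idt = idm_task_alloc(ic);
 *	if (idt == NULL)
 *		return (IDM_STATUS_FAIL);
 *	idt->idt_private = my_task_ctx;
 *	idm_task_start(idt, (uintptr_t)my_task_ctx);
 *	(data transfers run via idm_buf_tx_to_ini / idm_buf_rx_from_ini)
 *	idm_task_done(idt);
 *	idm_task_free(idt);
 */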
1359 
1360 /*
1361  * idm_task_find_common
1362  *	common code for idm_task_find() and idm_task_find_and_complete()
1363  */
1364 /*ARGSUSED*/
1365 static idm_task_t *
1366 idm_task_find_common(idm_conn_t *ic, uint32_t itt, uint32_t ttt,
1367     boolean_t complete)
1368 {
1369 	uint32_t	tt, client_handle;
1370 	idm_task_t	*idt;
1371 
1372 	/*
1373 	 * Must match both itt and ttt.  The table is indexed by itt
1374 	 * for initiator connections and ttt for target connections.
1375 	 */
1376 	if (IDM_CONN_ISTGT(ic)) {
1377 		tt = ttt;
1378 		client_handle = itt;
1379 	} else {
1380 		tt = itt;
1381 		client_handle = ttt;
1382 	}
1383 
1384 	rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1385 	if (tt >= idm.idm_taskid_max) {
1386 		rw_exit(&idm.idm_taskid_table_lock);
1387 		return (NULL);
1388 	}
1389 
1390 	idt = idm.idm_taskid_table[tt];
1391 
1392 	if (idt != NULL) {
1393 		mutex_enter(&idt->idt_mutex);
1394 		if ((idt->idt_state != TASK_ACTIVE) ||
1395 		    (idt->idt_ic != ic) ||
1396 		    (IDM_CONN_ISTGT(ic) &&
1397 		    (idt->idt_client_handle != client_handle))) {
1398 			/*
1399 			 * Task doesn't match or task is aborting and
1400 			 * we don't want any more references.
1401 			 */
1402 			if ((idt->idt_ic != ic) &&
1403 			    (idt->idt_state == TASK_ACTIVE) &&
1404 			    (IDM_CONN_ISINI(ic) || idt->idt_client_handle ==
1405 			    client_handle)) {
1406 				IDM_CONN_LOG(CE_WARN,
1407 				"idm_task_find: wrong connection %p != %p",
1408 				    (void *)ic, (void *)idt->idt_ic);
1409 			}
1410 			mutex_exit(&idt->idt_mutex);
1411 			rw_exit(&idm.idm_taskid_table_lock);
1412 			return (NULL);
1413 		}
1414 		idm_task_hold(idt);
1415 		/*
1416 		 * Set the task state to TASK_COMPLETE so it can no longer
1417 		 * be found or aborted.
1418 		 */
1419 		if (B_TRUE == complete)
1420 			idt->idt_state = TASK_COMPLETE;
1421 		mutex_exit(&idt->idt_mutex);
1422 	}
1423 	rw_exit(&idm.idm_taskid_table_lock);
1424 
1425 	return (idt);
1426 }
1427 
1428 /*
1429  * This function looks up a task by task tag.
1430  */
1431 idm_task_t *
1432 idm_task_find(idm_conn_t *ic, uint32_t itt, uint32_t ttt)
1433 {
1434 	return (idm_task_find_common(ic, itt, ttt, B_FALSE));
1435 }
1436 
1437 /*
1438  * This function looks up a task by task tag. If found, the task state
1439  * is atomically set to TASK_COMPLETE so it can no longer be found or aborted.
1440  */
1441 idm_task_t *
1442 idm_task_find_and_complete(idm_conn_t *ic, uint32_t itt, uint32_t ttt)
1443 {
1444 	return (idm_task_find_common(ic, itt, ttt, B_TRUE));
1445 }
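
/*
 * Example (editorial sketch): a transport or client resolving a task from
 * the tags carried in a received PDU; itt/ttt are the header fields in
 * network byte order.
 *
 *	idt = idm_task_find(ic, ntohl(itt), ntohl(ttt));
 *	if (idt == NULL)
 *		return;
 *	(deliver the PDU or completion to the task, then drop the hold:)
 *	idm_task_rele(idt);
 */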
1446 
1447 /*
1448  * idm_task_find_by_handle
1449  *
1450  * This function looks up a task by the client-private idt_client_handle.
1451  *
1452  * This function should NEVER be called in the performance path.  It is
1453  * intended strictly for error recovery/task management.
1454  */
1455 /*ARGSUSED*/
1456 void *
1457 idm_task_find_by_handle(idm_conn_t *ic, uintptr_t handle)
1458 {
1459 	idm_task_t	*idt = NULL;
1460 	int		idx = 0;
1461 
1462 	rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1463 
1464 	for (idx = 0; idx < idm.idm_taskid_max; idx++) {
1465 		idt = idm.idm_taskid_table[idx];
1466 
1467 		if (idt == NULL)
1468 			continue;
1469 
1470 		mutex_enter(&idt->idt_mutex);
1471 
1472 		if (idt->idt_state != TASK_ACTIVE) {
1473 			/*
1474 			 * Task is either in suspend, abort, or already
1475 			 * complete.
1476 			 */
1477 			mutex_exit(&idt->idt_mutex);
1478 			continue;
1479 		}
1480 
1481 		if (idt->idt_client_handle == handle) {
1482 			idm_task_hold(idt);
1483 			mutex_exit(&idt->idt_mutex);
1484 			break;
1485 		}
1486 
1487 		mutex_exit(&idt->idt_mutex);
1488 	}
1489 
1490 	rw_exit(&idm.idm_taskid_table_lock);
1491 
1492 	if ((idt == NULL) || (idx == idm.idm_taskid_max))
1493 		return (NULL);
1494 
1495 	return (idt->idt_private);
1496 }
1497 
1498 void
1499 idm_task_hold(idm_task_t *idt)
1500 {
1501 	idm_refcnt_hold(&idt->idt_refcnt);
1502 }
1503 
1504 void
1505 idm_task_rele(idm_task_t *idt)
1506 {
1507 	idm_refcnt_rele(&idt->idt_refcnt);
1508 }
1509 
1510 void
1511 idm_task_abort(idm_conn_t *ic, idm_task_t *idt, idm_abort_type_t abort_type)
1512 {
1513 	idm_task_t	*task;
1514 	int		idx;
1515 
1516 	/*
1517 	 * Passing NULL as the task indicates that all tasks
1518 	 * for this connection should be aborted.
1519 	 */
1520 	if (idt == NULL) {
1521 		/*
1522 		 * Only the connection state machine should ask for
1523 		 * all tasks to abort and this should never happen in FFP.
1524 		 */
1525 		ASSERT(!ic->ic_ffp);
1526 		rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1527 		for (idx = 0; idx < idm.idm_taskid_max; idx++) {
1528 			task = idm.idm_taskid_table[idx];
1529 			if (task == NULL)
1530 				continue;
1531 			mutex_enter(&task->idt_mutex);
1532 			if ((task->idt_state != TASK_IDLE) &&
1533 			    (task->idt_state != TASK_COMPLETE) &&
1534 			    (task->idt_ic == ic)) {
1535 				rw_exit(&idm.idm_taskid_table_lock);
1536 				idm_task_abort_one(ic, task, abort_type);
1537 				rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1538 			} else
1539 				mutex_exit(&task->idt_mutex);
1540 		}
1541 		rw_exit(&idm.idm_taskid_table_lock);
1542 	} else {
1543 		mutex_enter(&idt->idt_mutex);
1544 		idm_task_abort_one(ic, idt, abort_type);
1545 	}
1546 }
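
/*
 * Example (editorial sketch): the connection state machine tears down
 * every task on a dead connection by passing a NULL task, while task
 * management aborts a single task.  In the single-task case the caller
 * must not hold idt->idt_mutex; it is acquired here and released by
 * idm_task_abort_one().
 *
 *	idm_task_abort(ic, NULL, AT_INTERNAL_ABORT);
 *	idm_task_abort(ic, idt, AT_TASK_MGMT_ABORT);
 */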
1547 
1548 static void
1549 idm_task_abort_unref_cb(void *ref)
1550 {
1551 	idm_task_t *idt = ref;
1552 
1553 	mutex_enter(&idt->idt_mutex);
1554 	switch (idt->idt_state) {
1555 	case TASK_SUSPENDING:
1556 		idt->idt_state = TASK_SUSPENDED;
1557 		mutex_exit(&idt->idt_mutex);
1558 		idm_task_aborted(idt, IDM_STATUS_SUSPENDED);
1559 		return;
1560 	case TASK_ABORTING:
1561 		idt->idt_state = TASK_ABORTED;
1562 		mutex_exit(&idt->idt_mutex);
1563 		idm_task_aborted(idt, IDM_STATUS_ABORTED);
1564 		return;
1565 	default:
1566 		mutex_exit(&idt->idt_mutex);
1567 		ASSERT(0);
1568 		break;
1569 	}
1570 }
1571 
1572 /*
1573  * Abort the idm task.
1574  *    Caller must hold the task mutex, which will be released before return
1575  */
1576 static void
1577 idm_task_abort_one(idm_conn_t *ic, idm_task_t *idt, idm_abort_type_t abort_type)
1578 {
1579 	/* Caller must hold the task mutex */
1580 	ASSERT(mutex_owned(&idt->idt_mutex));
1581 	switch (idt->idt_state) {
1582 	case TASK_ACTIVE:
1583 		switch (abort_type) {
1584 		case AT_INTERNAL_SUSPEND:
1585 			/* Call transport to release any resources */
1586 			idt->idt_state = TASK_SUSPENDING;
1587 			mutex_exit(&idt->idt_mutex);
1588 			ic->ic_transport_ops->it_free_task_rsrc(idt);
1589 
1590 			/*
1591 			 * Wait for outstanding references.  When all
1592 			 * references are released the callback will call
1593 			 * idm_task_aborted().
1594 			 */
1595 			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
1596 			    &idm_task_abort_unref_cb);
1597 			return;
1598 		case AT_INTERNAL_ABORT:
1599 		case AT_TASK_MGMT_ABORT:
1600 			idt->idt_state = TASK_ABORTING;
1601 			mutex_exit(&idt->idt_mutex);
1602 			ic->ic_transport_ops->it_free_task_rsrc(idt);
1603 
1604 			/*
1605 			 * Wait for outstanding references.  When all
1606 			 * references are released the callback will call
1607 			 * idm_task_aborted().
1608 			 */
1609 			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
1610 			    &idm_task_abort_unref_cb);
1611 			return;
1612 		default:
1613 			ASSERT(0);
1614 		}
1615 		break;
1616 	case TASK_SUSPENDING:
1617 		/* Already called transport_free_task_rsrc(); */
1618 		switch (abort_type) {
1619 		case AT_INTERNAL_SUSPEND:
1620 			/* Already doing it */
1621 			break;
1622 		case AT_INTERNAL_ABORT:
1623 		case AT_TASK_MGMT_ABORT:
1624 			idt->idt_state = TASK_ABORTING;
1625 			break;
1626 		default:
1627 			ASSERT(0);
1628 		}
1629 		break;
1630 	case TASK_SUSPENDED:
1631 		/* Already called transport_free_task_rsrc(); */
1632 		switch (abort_type) {
1633 		case AT_INTERNAL_SUSPEND:
1634 			/* Already doing it */
1635 			break;
1636 		case AT_INTERNAL_ABORT:
1637 		case AT_TASK_MGMT_ABORT:
1638 			idt->idt_state = TASK_ABORTING;
1639 			mutex_exit(&idt->idt_mutex);
1640 
1641 			/*
1642 			 * We could probably call idm_task_aborted directly
1643 			 * here but we may be holding the conn lock. It's
1644 			 * easier to just switch contexts.  Even though
1645 			 * we shouldn't really have any references we'll
1646 			 * set the state to TASK_ABORTING instead of
1647 			 * TASK_ABORTED so we can use the same code path.
1648 			 */
1649 			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
1650 			    &idm_task_abort_unref_cb);
1651 			return;
1652 		default:
1653 			ASSERT(0);
1654 		}
1655 		break;
1656 	case TASK_ABORTING:
1657 	case TASK_ABORTED:
1658 		switch (abort_type) {
1659 		case AT_INTERNAL_SUSPEND:
1660 			/* We're already past this point... */
1661 		case AT_INTERNAL_ABORT:
1662 		case AT_TASK_MGMT_ABORT:
1663 			/* Already doing it */
1664 			break;
1665 		default:
1666 			ASSERT(0);
1667 		}
1668 		break;
1669 	case TASK_COMPLETE:
1670 		/*
1671 		 * In this case, let it go.  The status has already been
1672 		 * sent (which may or may not get successfully transmitted)
1673 		 * and we don't want to end up in a race between completing
1674 		 * the status PDU and marking the task suspended.
1675 		 */
1676 		break;
1677 	default:
1678 		ASSERT(0);
1679 	}
1680 	mutex_exit(&idt->idt_mutex);
1681 }
1682 
1683 static void
1684 idm_task_aborted(idm_task_t *idt, idm_status_t status)
1685 {
1686 	(*idt->idt_ic->ic_conn_ops.icb_task_aborted)(idt, status);
1687 }
1688 
1689 void
1690 idm_task_cleanup(idm_task_t *idt)
1691 {
1692 	idm_buf_t *idb, *next_idb;
1693 	list_t		tmp_buflist;
1694 	ASSERT((idt->idt_state == TASK_SUSPENDED) ||
1695 	    (idt->idt_state == TASK_ABORTED));
1696 
1697 	list_create(&tmp_buflist, sizeof (idm_buf_t),
1698 	    offsetof(idm_buf_t, idb_buflink));
1699 
1700 	/*
1701 	 * Remove all the buffers from the task and add them to a
1702 	 * temporary local list -- we do this so that we can hold
1703 	 * the task lock and prevent the task from going away if
1704 	 * the client decides to call idm_task_done/idm_task_free.
1705 	 * This could happen during abort in iscsit.
1706 	 */
1707 	mutex_enter(&idt->idt_mutex);
1708 	for (idb = list_head(&idt->idt_inbufv);
1709 	    idb != NULL;
1710 	    idb = next_idb) {
1711 		next_idb = list_next(&idt->idt_inbufv, idb);
1712 		idm_buf_unbind_in_locked(idt, idb);
1713 		list_insert_tail(&tmp_buflist, idb);
1714 	}
1715 
1716 	for (idb = list_head(&idt->idt_outbufv);
1717 	    idb != NULL;
1718 	    idb = next_idb) {
1719 		next_idb = list_next(&idt->idt_outbufv, idb);
1720 		idm_buf_unbind_out_locked(idt, idb);
1721 		list_insert_tail(&tmp_buflist, idb);
1722 	}
1723 	mutex_exit(&idt->idt_mutex);
1724 
1725 	for (idb = list_head(&tmp_buflist); idb != NULL; idb = next_idb) {
1726 		next_idb = list_next(&tmp_buflist, idb);
1727 		list_remove(&tmp_buflist, idb);
1728 		(*idb->idb_buf_cb)(idb, IDM_STATUS_ABORTED);
1729 	}
1730 	list_destroy(&tmp_buflist);
1731 }
1732 
1733 
1734 /*
1735  * idm_pdu_tx
1736  *
1737  * This is IDM's implementation of the 'Send_Control' operational primitive.
1738  * This function is invoked by an initiator iSCSI layer requesting the transfer
1739  * of an iSCSI command PDU or a target iSCSI layer requesting the transfer of
1740  * an iSCSI response PDU. The PDU will be transmitted as-is by the local Datamover
1741  * layer to the peer iSCSI layer in the remote iSCSI node. The connection info
1742  * and iSCSI PDU-specific qualifiers namely BHS, AHS, DataDescriptor and Size
1743  * are provided as input.
1744  *
1745  */
1746 void
1747 idm_pdu_tx(idm_pdu_t *pdu)
1748 {
1749 	idm_conn_t		*ic = pdu->isp_ic;
1750 	iscsi_async_evt_hdr_t	*async_evt;
1751 
1752 	/*
1753 	 * If we are in full-featured mode then route SCSI-related
1754 	 * commands to the appropriate function vector without checking
1755 	 * the connection state.  We will only be in full-feature mode
1756 	 * when we are in an acceptable state for SCSI PDU's.
1757 	 *
1758 	 * We also need to ensure that there are no PDU events outstanding
1759 	 * on the state machine.  Any non-SCSI PDU's received in full-feature
1760 	 * mode will result in PDU events and until these have been handled
1761 	 * we need to route all PDU's through the state machine as PDU
1762 	 * events to maintain ordering.
1763 	 *
1764 	 * Note that IDM cannot enter FFP mode until it processes in
1765 	 * its state machine the last xmit of the login process.
1766 	 * Hence, checking the IDM_PDU_LOGIN_TX flag here would be
1767 	 * superfluous.
1768 	 */
1769 	mutex_enter(&ic->ic_state_mutex);
1770 	if (ic->ic_ffp && (ic->ic_pdu_events == 0)) {
1771 		mutex_exit(&ic->ic_state_mutex);
1772 		switch (IDM_PDU_OPCODE(pdu)) {
1773 		case ISCSI_OP_SCSI_RSP:
1774 			/* Target only */
1775 			idm_pdu_tx_forward(ic, pdu);
1776 			return;
1777 		case ISCSI_OP_SCSI_TASK_MGT_RSP:
1778 			/* Target only */
1779 			idm_pdu_tx_forward(ic, pdu);
1780 			return;
1781 		case ISCSI_OP_SCSI_DATA_RSP:
1782 			/* Target only */
1783 			idm_pdu_tx_forward(ic, pdu);
1784 			return;
1785 		case ISCSI_OP_RTT_RSP:
1786 			/* Target only */
1787 			idm_pdu_tx_forward(ic, pdu);
1788 			return;
1789 		case ISCSI_OP_NOOP_IN:
1790 			/* Target only */
1791 			idm_pdu_tx_forward(ic, pdu);
1792 			return;
1793 		case ISCSI_OP_TEXT_RSP:
1794 			/* Target only */
1795 			idm_pdu_tx_forward(ic, pdu);
1796 			return;
1797 		case ISCSI_OP_TEXT_CMD:
1798 		case ISCSI_OP_NOOP_OUT:
1799 		case ISCSI_OP_SCSI_CMD:
1800 		case ISCSI_OP_SCSI_DATA:
1801 		case ISCSI_OP_SCSI_TASK_MGT_MSG:
1802 			/* Initiator only */
1803 			idm_pdu_tx_forward(ic, pdu);
1804 			return;
1805 		default:
1806 			break;
1807 		}
1808 
1809 		mutex_enter(&ic->ic_state_mutex);
1810 	}
1811 
1812 	/*
1813 	 * Any PDU's processed outside of full-feature mode and non-SCSI
1814 	 * PDU's in full-feature mode are handled by generating an
1815 	 * event to the connection state machine.  The state machine
1816 	 * will validate the PDU against the current state and either
1817 	 * transmit the PDU if the opcode is allowed or handle an
1818 	 * error if the PDU is not allowed.
1819 	 *
1820 	 * This code-path will also generate any events that are implied
1821 	 * by the PDU opcode.  For example a "logout response" with success
1822 	 * status generates a CE_LOGOUT_SUCCESS_SND event.
1823 	 */
1824 	switch (IDM_PDU_OPCODE(pdu)) {
1825 	case ISCSI_OP_LOGIN_CMD:
1826 		idm_conn_tx_pdu_event(ic, CE_LOGIN_SND, (uintptr_t)pdu);
1827 		break;
1828 	case ISCSI_OP_LOGIN_RSP:
1829 		idm_parse_login_rsp(ic, pdu, /* Is RX */ B_FALSE);
1830 		break;
1831 	case ISCSI_OP_LOGOUT_CMD:
1832 		idm_parse_logout_req(ic, pdu, /* Is RX */ B_FALSE);
1833 		break;
1834 	case ISCSI_OP_LOGOUT_RSP:
1835 		idm_parse_logout_rsp(ic, pdu, /* Is RX */ B_FALSE);
1836 		break;
1837 	case ISCSI_OP_ASYNC_EVENT:
1838 		async_evt = (iscsi_async_evt_hdr_t *)pdu->isp_hdr;
1839 		switch (async_evt->async_event) {
1840 		case ISCSI_ASYNC_EVENT_REQUEST_LOGOUT:
1841 			idm_conn_tx_pdu_event(ic, CE_ASYNC_LOGOUT_SND,
1842 			    (uintptr_t)pdu);
1843 			break;
1844 		case ISCSI_ASYNC_EVENT_DROPPING_CONNECTION:
1845 			idm_conn_tx_pdu_event(ic, CE_ASYNC_DROP_CONN_SND,
1846 			    (uintptr_t)pdu);
1847 			break;
1848 		case ISCSI_ASYNC_EVENT_DROPPING_ALL_CONNECTIONS:
1849 			idm_conn_tx_pdu_event(ic, CE_ASYNC_DROP_ALL_CONN_SND,
1850 			    (uintptr_t)pdu);
1851 			break;
1852 		case ISCSI_ASYNC_EVENT_SCSI_EVENT:
1853 		case ISCSI_ASYNC_EVENT_PARAM_NEGOTIATION:
1854 		default:
1855 			idm_conn_tx_pdu_event(ic, CE_MISC_TX,
1856 			    (uintptr_t)pdu);
1857 			break;
1858 		}
1859 		break;
1860 	case ISCSI_OP_SCSI_RSP:
1861 		/* Target only */
1862 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1863 		break;
1864 	case ISCSI_OP_SCSI_TASK_MGT_RSP:
1865 		/* Target only */
1866 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1867 		break;
1868 	case ISCSI_OP_SCSI_DATA_RSP:
1869 		/* Target only */
1870 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1871 		break;
1872 	case ISCSI_OP_RTT_RSP:
1873 		/* Target only */
1874 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1875 		break;
1876 	case ISCSI_OP_NOOP_IN:
1877 		/* Target only */
1878 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1879 		break;
1880 	case ISCSI_OP_TEXT_RSP:
1881 		/* Target only */
1882 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1883 		break;
1884 	/* Initiator only */
1885 	case ISCSI_OP_SCSI_CMD:
1886 	case ISCSI_OP_SCSI_TASK_MGT_MSG:
1887 	case ISCSI_OP_SCSI_DATA:
1888 	case ISCSI_OP_NOOP_OUT:
1889 	case ISCSI_OP_TEXT_CMD:
1890 	case ISCSI_OP_SNACK_CMD:
1891 	case ISCSI_OP_REJECT_MSG:
1892 	default:
1893 		/*
1894 		 * Connection state machine will validate these PDU's against
1895 		 * the current state.  A PDU not allowed in the current
1896 		 * state will cause a protocol error.
1897 		 */
1898 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1899 		break;
1900 	}
1901 	mutex_exit(&ic->ic_state_mutex);
1902 }
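
/*
 * Illustrative sketch, not part of the original driver: a typical
 * target-side caller allocates a PDU, initializes its connection and
 * completion callback, fills in the BHS, and hands it to idm_pdu_tx(),
 * which forwards it directly in full-feature mode or routes it through
 * the connection state machine otherwise.  The example_* names below
 * are hypothetical; only the idm_* calls are interfaces from this file.
 */
#ifdef IDM_EXAMPLE
static void
example_tx_done(idm_pdu_t *pdu, idm_status_t status)
{
	/* Transmit complete (status also saved in isp_status) */
	idm_pdu_free(pdu);
}

static void
example_send_nop_in(idm_conn_t *ic)
{
	idm_pdu_t *pdu;

	/* Blocking allocation: BHS-sized header, no data segment */
	pdu = idm_pdu_alloc(sizeof (iscsi_hdr_t), 0);
	idm_pdu_init(pdu, ic, NULL, example_tx_done);
	pdu->isp_hdr->opcode = ISCSI_OP_NOOP_IN;
	idm_pdu_tx(pdu);
}
#endif	/* IDM_EXAMPLE */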
1903 
1904 /*
1905  * Common allocation of a PDU along with memory for header and data.
1906  */
1907 static idm_pdu_t *
1908 idm_pdu_alloc_common(uint_t hdrlen, uint_t datalen, int sleepflag)
1909 {
1910 	idm_pdu_t *result;
1911 
1912 	/*
1913 	 * IDM clients should cache these structures for performance
1914 	 * critical paths.  We can't cache effectively in IDM because we
1915 	 * don't know the correct header and data size.
1916 	 *
1917 	 * Valid header length is assumed to be hdrlen and valid data
1918 	 * length is assumed to be datalen.  isp_hdrlen and isp_datalen
1919 	 * can be adjusted after the PDU is returned if necessary.
1920 	 */
1921 	result = kmem_zalloc(sizeof (idm_pdu_t) + hdrlen + datalen, sleepflag);
1922 	if (result != NULL) {
1923 		/* For idm_pdu_free sanity check */
1924 		result->isp_flags |= IDM_PDU_ALLOC;
1925 		/* Header region starts immediately after the idm_pdu_t */
1926 		result->isp_hdr = (iscsi_hdr_t *)(result + 1);
1927 		result->isp_hdrlen = hdrlen;
1928 		result->isp_hdrbuflen = hdrlen;
1929 		result->isp_transport_hdrlen = 0;
1930 		result->isp_data = (uint8_t *)result->isp_hdr + hdrlen;
1931 		result->isp_datalen = datalen;
1932 		result->isp_databuflen = datalen;
1933 		result->isp_magic = IDM_PDU_MAGIC;
1934 	}
1935 
1936 	return (result);
1937 }
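
/*
 * For reference, the single allocation above is laid out as follows;
 * the header and data regions immediately follow the idm_pdu_t itself:
 *
 *	+-----------+------------------+------------------+
 *	| idm_pdu_t | header  (hdrlen) | data   (datalen) |
 *	+-----------+------------------+------------------+
 *	^ result    ^ isp_hdr          ^ isp_data
 */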
1938 
1939 /*
1940  * Typical idm_pdu_alloc invocation, will block for resources.
1941  */
1942 idm_pdu_t *
1943 idm_pdu_alloc(uint_t hdrlen, uint_t datalen)
1944 {
1945 	return (idm_pdu_alloc_common(hdrlen, datalen, KM_SLEEP));
1946 }
1947 
1948 /*
1949  * Non-blocking idm_pdu_alloc implementation, returns NULL if resources
1950  * are not available.  Needed for transport-layer allocations which may
1951  * be invoking in interrupt context.
1952  */
1953 idm_pdu_t *
1954 idm_pdu_alloc_nosleep(uint_t hdrlen, uint_t datalen)
1955 {
1956 	return (idm_pdu_alloc_common(hdrlen, datalen, KM_NOSLEEP));
1957 }
1958 
1959 /*
1960  * Free a PDU previously allocated with idm_pdu_alloc() including any
1961  * header and data space allocated as part of the original request.
1962  * Additional memory regions referenced by subsequent modification of
1963  * the isp_hdr and/or isp_data fields will not be freed.
1964  */
1965 void
1966 idm_pdu_free(idm_pdu_t *pdu)
1967 {
1968 	/* Make sure the structure was allocated using idm_pdu_alloc() */
1969 	ASSERT(pdu->isp_flags & IDM_PDU_ALLOC);
1970 	kmem_free(pdu,
1971 	    sizeof (idm_pdu_t) + pdu->isp_hdrbuflen + pdu->isp_databuflen);
1972 }
1973 
1974 /*
1975  * Initialize the connection, private and callback fields in a PDU.
1976  */
1977 void
1978 idm_pdu_init(idm_pdu_t *pdu, idm_conn_t *ic, void *private, idm_pdu_cb_t *cb)
1979 {
1980 	/*
1981 	 * idm_pdu_complete() will call idm_pdu_free if the callback is
1982 	 * NULL.  This will only work if the PDU was originally allocated
1983 	 * with idm_pdu_alloc().
1984 	 */
1985 	ASSERT((pdu->isp_flags & IDM_PDU_ALLOC) ||
1986 	    (cb != NULL));
1987 	pdu->isp_magic = IDM_PDU_MAGIC;
1988 	pdu->isp_ic = ic;
1989 	pdu->isp_private = private;
1990 	pdu->isp_callback = cb;
1991 }
1992 
1993 /*
1994  * Initialize the header pointer and header length fields.  This function
1995  * should not be used to adjust the header length of a buffer allocated
1996  * via idm_pdu_alloc since it overwrites the existing header pointer.
1997  */
1998 void
1999 idm_pdu_init_hdr(idm_pdu_t *pdu, uint8_t *hdr, uint_t hdrlen)
2000 {
2001 	pdu->isp_hdr = (iscsi_hdr_t *)((void *)hdr);
2002 	pdu->isp_hdrlen = hdrlen;
2003 }
2004 
2005 /*
2006  * Initialize the data and data length fields.  This function should
2007  * not be used to adjust the data length of a buffer allocated via
2008  * idm_pdu_alloc since it overwrites the existing data pointer.
2009  */
2010 void
2011 idm_pdu_init_data(idm_pdu_t *pdu, uint8_t *data, uint_t datalen)
2012 {
2013 	pdu->isp_data = data;
2014 	pdu->isp_datalen = datalen;
2015 }
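
/*
 * Illustrative sketch: for a PDU that came from idm_pdu_alloc(), the
 * header and data pointers must not be replaced, so a caller that needs
 * to send fewer bytes than it allocated shrinks isp_datalen in place,
 * as the idm_pdu_alloc_common() comment permits.  example_shrink_data
 * is a hypothetical helper.
 */
#ifdef IDM_EXAMPLE
static void
example_shrink_data(idm_pdu_t *pdu, uint_t valid_len)
{
	ASSERT(pdu->isp_flags & IDM_PDU_ALLOC);
	ASSERT(valid_len <= pdu->isp_databuflen);
	pdu->isp_datalen = valid_len;	/* buffer itself is untouched */
}
#endif	/* IDM_EXAMPLE */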
2016 
2017 void
2018 idm_pdu_complete(idm_pdu_t *pdu, idm_status_t status)
2019 {
2020 	if (pdu->isp_callback) {
2021 		pdu->isp_status = status;
2022 		(*pdu->isp_callback)(pdu, status);
2023 	} else {
2024 		idm_pdu_free(pdu);
2025 	}
2026 }
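
/*
 * Illustrative sketch: because idm_pdu_complete() falls back to
 * idm_pdu_free() when no callback was registered, a fire-and-forget
 * PDU needs no completion handler at all; it just must have been
 * allocated with idm_pdu_alloc() so the fallback free is legal.
 * example_fire_and_forget is a hypothetical caller.
 */
#ifdef IDM_EXAMPLE
static void
example_fire_and_forget(idm_conn_t *ic)
{
	idm_pdu_t *pdu = idm_pdu_alloc(sizeof (iscsi_hdr_t), 0);

	idm_pdu_init(pdu, ic, NULL, NULL);	/* NULL cb => auto-free */
	pdu->isp_hdr->opcode = ISCSI_OP_NOOP_IN;
	idm_pdu_tx(pdu);	/* freed when the transport completes it */
}
#endif	/* IDM_EXAMPLE */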
2027 
2028 /*
2029  * State machine auditing
2030  */
2031 
2032 void
2033 idm_sm_audit_init(sm_audit_buf_t *audit_buf)
2034 {
2035 	bzero(audit_buf, sizeof (sm_audit_buf_t));
2036 	audit_buf->sab_max_index = SM_AUDIT_BUF_MAX_REC - 1;
2037 }
2038 
2039 static
2040 sm_audit_record_t *
2041 idm_sm_audit_common(sm_audit_buf_t *audit_buf, sm_audit_record_type_t r_type,
2042     sm_audit_sm_type_t sm_type,
2043     int current_state)
2044 {
2045 	sm_audit_record_t *sar;
2046 
2047 	sar = audit_buf->sab_records;
2048 	sar += audit_buf->sab_index;
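	/*
	 * Ring-buffer wrap via mask; this relies on SM_AUDIT_BUF_MAX_REC
	 * being a power of two so that sab_max_index is an all-ones mask.
	 */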
2049 	audit_buf->sab_index++;
2050 	audit_buf->sab_index &= audit_buf->sab_max_index;
2051 
2052 	sar->sar_type = r_type;
2053 	gethrestime(&sar->sar_timestamp);
2054 	sar->sar_sm_type = sm_type;
2055 	sar->sar_state = current_state;
2056 
2057 	return (sar);
2058 }
2059 
2060 void
2061 idm_sm_audit_event(sm_audit_buf_t *audit_buf,
2062     sm_audit_sm_type_t sm_type, int current_state,
2063     int event, uintptr_t event_info)
2064 {
2065 	sm_audit_record_t *sar;
2066 
2067 	sar = idm_sm_audit_common(audit_buf, SAR_STATE_EVENT,
2068 	    sm_type, current_state);
2069 	sar->sar_event = event;
2070 	sar->sar_event_info = event_info;
2071 }
2072 
2073 void
2074 idm_sm_audit_state_change(sm_audit_buf_t *audit_buf,
2075     sm_audit_sm_type_t sm_type, int current_state, int new_state)
2076 {
2077 	sm_audit_record_t *sar;
2078 
2079 	sar = idm_sm_audit_common(audit_buf, SAR_STATE_CHANGE,
2080 	    sm_type, current_state);
2081 	sar->sar_new_state = new_state;
2082 }
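
/*
 * Illustrative sketch with hypothetical states and events: a client
 * state machine initializes an audit buffer once, then records each
 * event and each transition, leaving a history that can be inspected
 * from a crash dump.  SAS_IDM_CONN is used purely as a representative
 * sm_audit_sm_type_t value.
 */
#ifdef IDM_EXAMPLE
static sm_audit_buf_t example_audit_buf;

static void
example_sm_setup(void)
{
	idm_sm_audit_init(&example_audit_buf);
}

static void
example_sm_record(int cur_state, int event, int new_state)
{
	idm_sm_audit_event(&example_audit_buf, SAS_IDM_CONN,
	    cur_state, event, 0);
	idm_sm_audit_state_change(&example_audit_buf, SAS_IDM_CONN,
	    cur_state, new_state);
}
#endif	/* IDM_EXAMPLE */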
2083 
2084 
2085 /*
2086  * Object reference tracking
2087  */
2088 
2089 void
2090 idm_refcnt_init(idm_refcnt_t *refcnt, void *referenced_obj)
2091 {
2092 	bzero(refcnt, sizeof (*refcnt));
2093 	idm_refcnt_reset(refcnt);
2094 	refcnt->ir_referenced_obj = referenced_obj;
2095 	bzero(&refcnt->ir_audit_buf, sizeof (refcnt_audit_buf_t));
2096 	refcnt->ir_audit_buf.anb_max_index = REFCNT_AUDIT_BUF_MAX_REC - 1;
2097 	mutex_init(&refcnt->ir_mutex, NULL, MUTEX_DEFAULT, NULL);
2098 	cv_init(&refcnt->ir_cv, NULL, CV_DEFAULT, NULL);
2099 }
2100 
2101 void
2102 idm_refcnt_destroy(idm_refcnt_t *refcnt)
2103 {
2104 	ASSERT(refcnt->ir_refcnt == 0);
2105 	cv_destroy(&refcnt->ir_cv);
2106 	mutex_destroy(&refcnt->ir_mutex);
2107 }
2108 
2109 void
2110 idm_refcnt_reset(idm_refcnt_t *refcnt)
2111 {
2112 	refcnt->ir_waiting = REF_NOWAIT;
2113 	refcnt->ir_refcnt = 0;
2114 }
2115 
2116 void
2117 idm_refcnt_hold(idm_refcnt_t *refcnt)
2118 {
2119 	/*
2120 	 * Nothing should take a hold on an object after a call to
2121 	 * idm_refcnt_wait_ref or idm_refcnt_async_wait_ref.
2122 	 */
2123 	ASSERT(refcnt->ir_waiting == REF_NOWAIT);
2124 
2125 	mutex_enter(&refcnt->ir_mutex);
2126 	refcnt->ir_refcnt++;
2127 	REFCNT_AUDIT(refcnt);
2128 	mutex_exit(&refcnt->ir_mutex);
2129 }
2130 
2131 static void
2132 idm_refcnt_unref_task(void *refcnt_void)
2133 {
2134 	idm_refcnt_t *refcnt = refcnt_void;
2135 
2136 	REFCNT_AUDIT(refcnt);
2137 	(*refcnt->ir_cb)(refcnt->ir_referenced_obj);
2138 }
2139 
2140 void
2141 idm_refcnt_rele(idm_refcnt_t *refcnt)
2142 {
2143 	mutex_enter(&refcnt->ir_mutex);
2144 	ASSERT(refcnt->ir_refcnt > 0);
2145 	refcnt->ir_refcnt--;
2146 	REFCNT_AUDIT(refcnt);
2147 	if (refcnt->ir_waiting == REF_NOWAIT) {
2148 		/* No one is waiting on this object */
2149 		mutex_exit(&refcnt->ir_mutex);
2150 		return;
2151 	}
2152 
2153 	/*
2154 	 * Someone is waiting for this object to go idle so check if
2155 	 * refcnt is 0.  Waiting on an object then later grabbing another
2156 	 * reference is not allowed so we don't need to handle that case.
2157 	 */
2158 	if (refcnt->ir_refcnt == 0) {
2159 		if (refcnt->ir_waiting == REF_WAIT_ASYNC) {
2160 			if (taskq_dispatch(idm.idm_global_taskq,
2161 			    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2162 				cmn_err(CE_WARN,
2163 				    "idm_refcnt_rele: Couldn't dispatch task");
2164 			}
2165 		} else if (refcnt->ir_waiting == REF_WAIT_SYNC) {
2166 			cv_signal(&refcnt->ir_cv);
2167 		}
2168 	}
2169 	mutex_exit(&refcnt->ir_mutex);
2170 }
2171 
2172 void
2173 idm_refcnt_rele_and_destroy(idm_refcnt_t *refcnt, idm_refcnt_cb_t *cb_func)
2174 {
2175 	mutex_enter(&refcnt->ir_mutex);
2176 	ASSERT(refcnt->ir_refcnt > 0);
2177 	refcnt->ir_refcnt--;
2178 	REFCNT_AUDIT(refcnt);
2179 
2180 	/*
2181 	 * If this release drops the refcnt to 0, arrange for the destroy
2182 	 * callback to run from the global taskq.  Taking a new hold after
2183 	 * this point is not allowed, so we don't need to handle that case.
2184 	 */
2185 	if (refcnt->ir_refcnt == 0) {
2186 		refcnt->ir_cb = cb_func;
2187 		refcnt->ir_waiting = REF_WAIT_ASYNC;
2188 		if (taskq_dispatch(idm.idm_global_taskq,
2189 		    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2190 			cmn_err(CE_WARN, "idm_refcnt_rele_and_destroy: "
2191 			    "Couldn't dispatch task");
2192 		}
2193 	}
2194 	mutex_exit(&refcnt->ir_mutex);
2195 }
2196 
2197 void
2198 idm_refcnt_wait_ref(idm_refcnt_t *refcnt)
2199 {
2200 	mutex_enter(&refcnt->ir_mutex);
2201 	refcnt->ir_waiting = REF_WAIT_SYNC;
2202 	REFCNT_AUDIT(refcnt);
2203 	while (refcnt->ir_refcnt != 0)
2204 		cv_wait(&refcnt->ir_cv, &refcnt->ir_mutex);
2205 	mutex_exit(&refcnt->ir_mutex);
2206 }
2207 
2208 void
2209 idm_refcnt_async_wait_ref(idm_refcnt_t *refcnt, idm_refcnt_cb_t *cb_func)
2210 {
2211 	mutex_enter(&refcnt->ir_mutex);
2212 	refcnt->ir_waiting = REF_WAIT_ASYNC;
2213 	refcnt->ir_cb = cb_func;
2214 	REFCNT_AUDIT(refcnt);
2215 	/*
2216 	 * It's possible there are no outstanding references.  To make things
2217 	 * easier on the caller, use a taskq to invoke the callback instead of
2218 	 * calling it synchronously.
2219 	 */
2220 	if (refcnt->ir_refcnt == 0) {
2221 		if (taskq_dispatch(idm.idm_global_taskq,
2222 		    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2223 			cmn_err(CE_WARN,
2224 			    "idm_refcnt_async_wait_ref: "
2225 			    "Couldn't dispatch task");
2226 		}
2227 	}
2228 	mutex_exit(&refcnt->ir_mutex);
2229 }
2230 
2231 void
2232 idm_refcnt_destroy_unref_obj(idm_refcnt_t *refcnt,
2233     idm_refcnt_cb_t *cb_func)
2234 {
2235 	mutex_enter(&refcnt->ir_mutex);
2236 	if (refcnt->ir_refcnt == 0) {
2237 		mutex_exit(&refcnt->ir_mutex);
2238 		(*cb_func)(refcnt->ir_referenced_obj);
2239 		return;
2240 	}
2241 	mutex_exit(&refcnt->ir_mutex);
2242 }
2243 
2244 void
2245 idm_conn_hold(idm_conn_t *ic)
2246 {
2247 	idm_refcnt_hold(&ic->ic_refcnt);
2248 }
2249 
2250 void
2251 idm_conn_rele(idm_conn_t *ic)
2252 {
2253 	idm_refcnt_rele(&ic->ic_refcnt);
2254 }
2255 
2256 
2257 static int
2258 _idm_init(void)
2259 {
2260 	/* Initialize the rwlock for the taskid table */
2261 	rw_init(&idm.idm_taskid_table_lock, NULL, RW_DRIVER, NULL);
2262 
2263 	/* Initialize the global mutex and taskq */
2264 	mutex_init(&idm.idm_global_mutex, NULL, MUTEX_DEFAULT, NULL);
2265 
2266 	cv_init(&idm.idm_tgt_svc_cv, NULL, CV_DEFAULT, NULL);
2267 	cv_init(&idm.idm_wd_cv, NULL, CV_DEFAULT, NULL);
2268 
2269 	/*
2270 	 * The taskq maxalloc (the upper bound on queued entries) needs to be
2271 	 * high here since there can be many concurrent tasks using the
2272 	 * global taskq.
2272 	 */
2273 	idm.idm_global_taskq = taskq_create("idm_global_taskq", 1, minclsyspri,
2274 	    128, 16384, TASKQ_PREPOPULATE);
2275 	if (idm.idm_global_taskq == NULL) {
2276 		cv_destroy(&idm.idm_wd_cv);
2277 		cv_destroy(&idm.idm_tgt_svc_cv);
2278 		mutex_destroy(&idm.idm_global_mutex);
2279 		rw_destroy(&idm.idm_taskid_table_lock);
2280 		return (ENOMEM);
2281 	}
2282 
2283 	/* Start watchdog thread */
2284 	idm.idm_wd_thread = thread_create(NULL, 0,
2285 	    idm_wd_thread, NULL, 0, &p0, TS_RUN, minclsyspri);
2286 	if (idm.idm_wd_thread == NULL) {
2287 		/* Couldn't create the watchdog thread */
2288 		taskq_destroy(idm.idm_global_taskq);
2289 		cv_destroy(&idm.idm_wd_cv);
2290 		cv_destroy(&idm.idm_tgt_svc_cv);
2291 		mutex_destroy(&idm.idm_global_mutex);
2292 		rw_destroy(&idm.idm_taskid_table_lock);
2293 		return (ENOMEM);
2294 	}
2295 
2296 	/* Pause until the watchdog thread is running */
2297 	mutex_enter(&idm.idm_global_mutex);
2298 	while (!idm.idm_wd_thread_running)
2299 		cv_wait(&idm.idm_wd_cv, &idm.idm_global_mutex);
2300 	mutex_exit(&idm.idm_global_mutex);
2301 
2302 	/*
2303 	 * Allocate the task ID table and set "next" to 0.
2304 	 */
2305 
2306 	idm.idm_taskid_max = idm_max_taskids;
2307 	idm.idm_taskid_table = (idm_task_t **)
2308 	    kmem_zalloc(idm.idm_taskid_max * sizeof (idm_task_t *), KM_SLEEP);
2309 	idm.idm_taskid_next = 0;
2310 
2311 	/* Create the global buffer and task kmem caches */
2312 	idm.idm_buf_cache = kmem_cache_create("idm_buf_cache",
2313 	    sizeof (idm_buf_t), 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
2314 
2315 	/*
2316 	 * Note, we're explicitly allocating an additional iSER header-
2317 	 * sized chunk for each of these elements. See idm_task_constructor().
2318 	 */
2319 	idm.idm_task_cache = kmem_cache_create("idm_task_cache",
2320 	    sizeof (idm_task_t) + IDM_TRANSPORT_HEADER_LENGTH, 8,
2321 	    &idm_task_constructor, &idm_task_destructor,
2322 	    NULL, NULL, NULL, KM_SLEEP);
2323 
2324 	/* Create the service and connection context lists */
2325 	list_create(&idm.idm_tgt_svc_list, sizeof (idm_svc_t),
2326 	    offsetof(idm_svc_t, is_list_node));
2327 	list_create(&idm.idm_tgt_conn_list, sizeof (idm_conn_t),
2328 	    offsetof(idm_conn_t, ic_list_node));
2329 	list_create(&idm.idm_ini_conn_list, sizeof (idm_conn_t),
2330 	    offsetof(idm_conn_t, ic_list_node));
2331 
2332 	/* Initialize the native sockets transport */
2333 	idm_so_init(&idm_transport_list[IDM_TRANSPORT_TYPE_SOCKETS]);
2334 
2335 	/* Create connection ID pool */
2336 	(void) idm_idpool_create(&idm.idm_conn_id_pool);
2337 
2338 	return (DDI_SUCCESS);
2339 }
2340 
2341 static int
2342 _idm_fini(void)
2343 {
2344 	if (!list_is_empty(&idm.idm_ini_conn_list) ||
2345 	    !list_is_empty(&idm.idm_tgt_conn_list) ||
2346 	    !list_is_empty(&idm.idm_tgt_svc_list)) {
2347 		return (EBUSY);
2348 	}
2349 
2350 	mutex_enter(&idm.idm_global_mutex);
2351 	idm.idm_wd_thread_running = B_FALSE;
2352 	cv_signal(&idm.idm_wd_cv);
2353 	mutex_exit(&idm.idm_global_mutex);
2354 
2355 	thread_join(idm.idm_wd_thread_did);
2356 
2357 	idm_idpool_destroy(&idm.idm_conn_id_pool);
2358 
2359 	/* Close any LDI handles we have open on transport drivers */
2360 	mutex_enter(&idm.idm_global_mutex);
2361 	idm_transport_teardown();
2362 	mutex_exit(&idm.idm_global_mutex);
2363 
2364 	/* Teardown the native sockets transport */
2365 	idm_so_fini();
2366 
2367 	list_destroy(&idm.idm_ini_conn_list);
2368 	list_destroy(&idm.idm_tgt_conn_list);
2369 	list_destroy(&idm.idm_tgt_svc_list);
2370 	kmem_cache_destroy(idm.idm_task_cache);
2371 	kmem_cache_destroy(idm.idm_buf_cache);
2372 	kmem_free(idm.idm_taskid_table,
2373 	    idm.idm_taskid_max * sizeof (idm_task_t *));
2374 	mutex_destroy(&idm.idm_global_mutex);
2375 	cv_destroy(&idm.idm_wd_cv);
2376 	cv_destroy(&idm.idm_tgt_svc_cv);
2377 	rw_destroy(&idm.idm_taskid_table_lock);
2378 
2379 	return (0);
2380 }
2381