xref: /titanic_52/usr/src/uts/common/io/idm/idm.c (revision 88447a05f537aabe9a1bc3d5313f22581ec992a7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpuvar.h>
27 #include <sys/conf.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 
33 #include <sys/socket.h>
34 #include <sys/strsubr.h>
35 #include <sys/sysmacros.h>
36 
37 #include <sys/socketvar.h>
38 #include <netinet/in.h>
39 
40 #include <sys/idm/idm.h>
41 #include <sys/idm/idm_so.h>
42 
43 #define	IDM_NAME_VERSION	"iSCSI Data Mover"
44 
45 extern struct mod_ops mod_miscops;
46 extern struct mod_ops mod_miscops;
47 
48 static struct modlmisc modlmisc = {
49 	&mod_miscops,	/* Type of module */
50 	IDM_NAME_VERSION
51 };
52 
53 static struct modlinkage modlinkage = {
54 	MODREV_1, (void *)&modlmisc, NULL
55 };
56 
57 extern int idm_task_compare(const void *t1, const void *t2);
58 extern void idm_wd_thread(void *arg);
59 
60 static int _idm_init(void);
61 static int _idm_fini(void);
62 static void idm_buf_bind_in_locked(idm_task_t *idt, idm_buf_t *buf);
63 static void idm_buf_bind_out_locked(idm_task_t *idt, idm_buf_t *buf);
64 static void idm_buf_unbind_in_locked(idm_task_t *idt, idm_buf_t *buf);
65 static void idm_buf_unbind_out_locked(idm_task_t *idt, idm_buf_t *buf);
66 static void idm_task_abort_one(idm_conn_t *ic, idm_task_t *idt,
67     idm_abort_type_t abort_type);
68 static void idm_task_aborted(idm_task_t *idt, idm_status_t status);
69 static idm_pdu_t *idm_pdu_alloc_common(uint_t hdrlen, uint_t datalen,
70     int sleepflag);
71 
72 boolean_t idm_conn_logging = 0;
73 boolean_t idm_svc_logging = 0;
74 #ifdef DEBUG
75 boolean_t idm_pattern_checking = 1;
76 #else
77 boolean_t idm_pattern_checking = 0;
78 #endif
79 
80 /*
81  * Potential tuneable for the maximum number of tasks.  Default to
82  * IDM_TASKIDS_MAX
83  */
84 
85 uint32_t	idm_max_taskids = IDM_TASKIDS_MAX;
86 
87 /*
88  * Global list of transport handles
89  *   These are listed in preferential order, so we can simply take the
90  *   first "it_conn_is_capable" hit. Note also that the order maps to
91  *   the order of the idm_transport_type_t list.
92  */
93 idm_transport_t idm_transport_list[] = {
94 
95 	/* iSER on InfiniBand transport handle */
96 	{IDM_TRANSPORT_TYPE_ISER,	/* type */
97 	"/devices/ib/iser@0:iser",	/* device path */
98 	NULL,				/* LDI handle */
99 	NULL,				/* transport ops */
100 	NULL},				/* transport caps */
101 
102 	/* IDM native sockets transport handle */
103 	{IDM_TRANSPORT_TYPE_SOCKETS,	/* type */
104 	NULL,				/* device path */
105 	NULL,				/* LDI handle */
106 	NULL,				/* transport ops */
107 	NULL}				/* transport caps */
108 
109 };
110 
111 int
112 _init(void)
113 {
114 	int rc;
115 
116 	if ((rc = _idm_init()) != 0) {
117 		return (rc);
118 	}
119 
120 	return (mod_install(&modlinkage));
121 }
122 
123 int
124 _fini(void)
125 {
126 	int rc;
127 
128 	if ((rc = _idm_fini()) != 0) {
129 		return (rc);
130 	}
131 
132 	if ((rc = mod_remove(&modlinkage)) != 0) {
133 		return (rc);
134 	}
135 
136 	return (rc);
137 }
138 
139 int
140 _info(struct modinfo *modinfop)
141 {
142 	return (mod_info(&modlinkage, modinfop));
143 }
144 
145 /*
146  * idm_transport_register()
147  *
148  * Provides a mechanism for an IDM transport driver to register its
149  * transport ops and caps with the IDM kernel module. Invoked during
150  * a transport driver's attach routine.
151  */
152 idm_status_t
153 idm_transport_register(idm_transport_attr_t *attr)
154 {
155 	ASSERT(attr->it_ops != NULL);
156 	ASSERT(attr->it_caps != NULL);
157 
158 	switch (attr->type) {
159 	/* All known non-native transports here; for now, iSER */
160 	case IDM_TRANSPORT_TYPE_ISER:
161 		idm_transport_list[attr->type].it_ops	= attr->it_ops;
162 		idm_transport_list[attr->type].it_caps	= attr->it_caps;
163 		return (IDM_STATUS_SUCCESS);
164 
165 	default:
166 		cmn_err(CE_NOTE, "idm: unknown transport type (0x%x) in "
167 		    "idm_transport_register", attr->type);
168 		return (IDM_STATUS_SUCCESS);
169 	}
170 }
171 
172 /*
173  * idm_ini_conn_create
174  *
175  * This function is invoked by the iSCSI layer to create a connection context.
176  * This does not actually establish the socket connection.
177  *
178  * cr - Connection request parameters
179  * new_con - Output parameter that contains the new request if successful
180  *
181  */
182 idm_status_t
183 idm_ini_conn_create(idm_conn_req_t *cr, idm_conn_t **new_con)
184 {
185 	idm_transport_t		*it;
186 	idm_conn_t		*ic;
187 	int			rc;
188 
189 	it = idm_transport_lookup(cr);
190 
191 retry:
192 	ic = idm_conn_create_common(CONN_TYPE_INI, it->it_type,
193 	    &cr->icr_conn_ops);
194 
195 	bcopy(&cr->cr_ini_dst_addr, &ic->ic_ini_dst_addr,
196 	    sizeof (cr->cr_ini_dst_addr));
197 
198 	/* create the transport-specific connection components */
199 	rc = it->it_ops->it_ini_conn_create(cr, ic);
200 	if (rc != IDM_STATUS_SUCCESS) {
201 		/* cleanup the failed connection */
202 		idm_conn_destroy_common(ic);
203 
204 		/*
205 		 * It is possible for an IB client to connect to
206 		 * an ethernet-only client via an IB-eth gateway.
207 		 * Therefore, if we are attempting to use iSER and
208 		 * fail, retry with sockets before ultimately
209 		 * failing the connection.
210 		 */
211 		if (it->it_type == IDM_TRANSPORT_TYPE_ISER) {
212 			it = &idm_transport_list[IDM_TRANSPORT_TYPE_SOCKETS];
213 			goto retry;
214 		}
215 
216 		return (IDM_STATUS_FAIL);
217 	}
218 
219 	*new_con = ic;
220 
221 	mutex_enter(&idm.idm_global_mutex);
222 	list_insert_tail(&idm.idm_ini_conn_list, ic);
223 	mutex_exit(&idm.idm_global_mutex);
224 
225 	return (IDM_STATUS_SUCCESS);
226 }
227 
228 /*
229  * idm_ini_conn_destroy
230  *
231  * Releases any resources associated with the connection.  This is the
232  * complement to idm_ini_conn_create.
233  * ic - idm_conn_t structure representing the relevant connection
234  *
235  */
236 void
237 idm_ini_conn_destroy_task(void *ic_void)
238 {
239 	idm_conn_t *ic = ic_void;
240 
241 	ic->ic_transport_ops->it_ini_conn_destroy(ic);
242 	idm_conn_destroy_common(ic);
243 }
244 
245 void
246 idm_ini_conn_destroy(idm_conn_t *ic)
247 {
248 	/*
249 	 * It's reasonable for the initiator to call idm_ini_conn_destroy
250 	 * from within the context of the CN_CONNECT_DESTROY notification.
251 	 * That's a problem since we want to destroy the taskq for the
252 	 * state machine associated with the connection.  Remove the
253 	 * connection from the list right away then handle the remaining
254 	 * work via the idm_global_taskq.
255 	 */
256 	mutex_enter(&idm.idm_global_mutex);
257 	list_remove(&idm.idm_ini_conn_list, ic);
258 	mutex_exit(&idm.idm_global_mutex);
259 
260 	if (taskq_dispatch(idm.idm_global_taskq,
261 	    &idm_ini_conn_destroy_task, ic, TQ_SLEEP) == NULL) {
262 		cmn_err(CE_WARN,
263 		    "idm_ini_conn_destroy: Couldn't dispatch task");
264 	}
265 }
266 
267 /*
268  * idm_ini_conn_connect
269  *
270  * Establish connection to the remote system identified in idm_conn_t.
271  * The connection parameters including the remote IP address were established
272  * in the call to idm_ini_conn_create.  The IDM state machine will
273  * perform client notifications as necessary to prompt the initiator through
274  * the login process.  IDM also keeps a timer running so that if the login
275  * process doesn't complete in a timely manner it will fail.
276  *
277  * ic - idm_conn_t structure representing the relevant connection
278  *
279  * Returns success if the connection was established, otherwise some kind
280  * of meaningful error code.
281  *
282  * Upon return the login has either failed or is loggin in (ffp)
283  */
284 idm_status_t
285 idm_ini_conn_connect(idm_conn_t *ic)
286 {
287 	idm_status_t	rc = IDM_STATUS_SUCCESS;
288 
289 	rc = idm_conn_sm_init(ic);
290 	if (rc != IDM_STATUS_SUCCESS) {
291 		return (ic->ic_conn_sm_status);
292 	}
293 
294 	/* Hold connection until we return */
295 	idm_conn_hold(ic);
296 
297 	/* Kick state machine */
298 	idm_conn_event(ic, CE_CONNECT_REQ, NULL);
299 
300 	/* Wait for login flag */
301 	mutex_enter(&ic->ic_state_mutex);
302 	while (!(ic->ic_state_flags & CF_LOGIN_READY) &&
303 	    !(ic->ic_state_flags & CF_ERROR)) {
304 		cv_wait(&ic->ic_state_cv, &ic->ic_state_mutex);
305 	}
306 	mutex_exit(&ic->ic_state_mutex);
307 
308 	if (ic->ic_state_flags & CF_ERROR) {
309 		/* ic->ic_conn_sm_status will contains failure status */
310 		idm_conn_rele(ic);
311 		return (ic->ic_conn_sm_status);
312 	}
313 
314 	/* Ready to login */
315 	ASSERT(ic->ic_state_flags & CF_LOGIN_READY);
316 	(void) idm_notify_client(ic, CN_READY_FOR_LOGIN, NULL);
317 
318 	idm_conn_rele(ic);
319 
320 	return (rc);
321 }
322 
323 /*
324  * idm_ini_conn_disconnect
325  *
326  * Forces a connection (previously established using idm_ini_conn_connect)
327  * to perform a controlled shutdown, cleaning up any outstanding requests.
328  *
329  * ic - idm_conn_t structure representing the relevant connection
330  *
331  * This is asynchronous and will return before the connection is properly
332  * shutdown
333  */
334 /* ARGSUSED */
335 void
336 idm_ini_conn_disconnect(idm_conn_t *ic)
337 {
338 	idm_conn_event(ic, CE_TRANSPORT_FAIL, NULL);
339 }
340 
341 /*
342  * idm_ini_conn_disconnect_wait
343  *
344  * Forces a connection (previously established using idm_ini_conn_connect)
345  * to perform a controlled shutdown.  Blocks until the connection is
346  * disconnected.
347  *
348  * ic - idm_conn_t structure representing the relevant connection
349  */
350 /* ARGSUSED */
351 void
352 idm_ini_conn_disconnect_sync(idm_conn_t *ic)
353 {
354 	mutex_enter(&ic->ic_state_mutex);
355 	if ((ic->ic_state != CS_S9_INIT_ERROR) &&
356 	    (ic->ic_state != CS_S11_COMPLETE)) {
357 		idm_conn_event_locked(ic, CE_TRANSPORT_FAIL, NULL, CT_NONE);
358 		while ((ic->ic_state != CS_S9_INIT_ERROR) &&
359 		    (ic->ic_state != CS_S11_COMPLETE))
360 			cv_wait(&ic->ic_state_cv, &ic->ic_state_mutex);
361 	}
362 	mutex_exit(&ic->ic_state_mutex);
363 }
364 
365 /*
366  * idm_tgt_svc_create
367  *
368  * The target calls this service to obtain a service context for each available
369  * transport, starting a service of each type related to the IP address and port
370  * passed. The idm_svc_req_t contains the service parameters.
371  */
372 idm_status_t
373 idm_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t **new_svc)
374 {
375 	idm_transport_type_t	type;
376 	idm_transport_t		*it;
377 	idm_svc_t		*is;
378 	int			rc;
379 
380 	*new_svc = NULL;
381 	is = kmem_zalloc(sizeof (idm_svc_t), KM_SLEEP);
382 
383 	/* Initialize transport-agnostic components of the service handle */
384 	is->is_svc_req = *sr;
385 	mutex_init(&is->is_mutex, NULL, MUTEX_DEFAULT, NULL);
386 	cv_init(&is->is_cv, NULL, CV_DEFAULT, NULL);
387 	mutex_init(&is->is_count_mutex, NULL, MUTEX_DEFAULT, NULL);
388 	cv_init(&is->is_count_cv, NULL, CV_DEFAULT, NULL);
389 	idm_refcnt_init(&is->is_refcnt, is);
390 
391 	/*
392 	 * Make sure all available transports are setup.  We call this now
393 	 * instead of at initialization time in case IB has become available
394 	 * since we started (hotplug, etc).
395 	 */
396 	idm_transport_setup(sr->sr_li);
397 
398 	/*
399 	 * Loop through the transports, configuring the transport-specific
400 	 * components of each one.
401 	 */
402 	for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
403 
404 		it = &idm_transport_list[type];
405 		/*
406 		 * If it_ops is NULL then the transport is unconfigured
407 		 * and we shouldn't try to start the service.
408 		 */
409 		if (it->it_ops == NULL) {
410 			continue;
411 		}
412 
413 		rc = it->it_ops->it_tgt_svc_create(sr, is);
414 		if (rc != IDM_STATUS_SUCCESS) {
415 			/* Teardown any configured services */
416 			while (type--) {
417 				it = &idm_transport_list[type];
418 				if (it->it_ops == NULL) {
419 					continue;
420 				}
421 				it->it_ops->it_tgt_svc_destroy(is);
422 			}
423 			/* Free the svc context and return */
424 			kmem_free(is, sizeof (idm_svc_t));
425 			return (rc);
426 		}
427 	}
428 
429 	*new_svc = is;
430 
431 	mutex_enter(&idm.idm_global_mutex);
432 	list_insert_tail(&idm.idm_tgt_svc_list, is);
433 	mutex_exit(&idm.idm_global_mutex);
434 
435 	return (IDM_STATUS_SUCCESS);
436 }
437 
438 /*
439  * idm_tgt_svc_destroy
440  *
441  * is - idm_svc_t returned by the call to idm_tgt_svc_create
442  *
443  * Cleanup any resources associated with the idm_svc_t.
444  */
445 void
446 idm_tgt_svc_destroy(idm_svc_t *is)
447 {
448 	idm_transport_type_t	type;
449 	idm_transport_t		*it;
450 
451 	mutex_enter(&idm.idm_global_mutex);
452 	/* remove this service from the global list */
453 	list_remove(&idm.idm_tgt_svc_list, is);
454 	/* wakeup any waiters for service change */
455 	cv_broadcast(&idm.idm_tgt_svc_cv);
456 	mutex_exit(&idm.idm_global_mutex);
457 
458 	/* teardown each transport-specific service */
459 	for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
460 		it = &idm_transport_list[type];
461 		if (it->it_ops == NULL) {
462 			continue;
463 		}
464 
465 		it->it_ops->it_tgt_svc_destroy(is);
466 	}
467 
468 	/* tear down the svc resources */
469 	idm_refcnt_destroy(&is->is_refcnt);
470 	cv_destroy(&is->is_count_cv);
471 	mutex_destroy(&is->is_count_mutex);
472 	cv_destroy(&is->is_cv);
473 	mutex_destroy(&is->is_mutex);
474 
475 	/* free the svc handle */
476 	kmem_free(is, sizeof (idm_svc_t));
477 }
478 
479 void
480 idm_tgt_svc_hold(idm_svc_t *is)
481 {
482 	idm_refcnt_hold(&is->is_refcnt);
483 }
484 
485 void
486 idm_tgt_svc_rele_and_destroy(idm_svc_t *is)
487 {
488 	idm_refcnt_rele_and_destroy(&is->is_refcnt,
489 	    (idm_refcnt_cb_t *)&idm_tgt_svc_destroy);
490 }
491 
492 /*
493  * idm_tgt_svc_online
494  *
495  * is - idm_svc_t returned by the call to idm_tgt_svc_create
496  *
497  * Online each transport service, as we want this target to be accessible
498  * via any configured transport.
499  *
500  * When the initiator establishes a new connection to the target, IDM will
501  * call the "new connect" callback defined in the idm_svc_req_t structure
502  * and it will pass an idm_conn_t structure representing that new connection.
503  */
504 idm_status_t
505 idm_tgt_svc_online(idm_svc_t *is)
506 {
507 
508 	idm_transport_type_t	type, last_type;
509 	idm_transport_t		*it;
510 	int			rc = IDM_STATUS_SUCCESS;
511 
512 	mutex_enter(&is->is_mutex);
513 	if (is->is_online == 0) {
514 		/* Walk through each of the transports and online them */
515 		for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
516 			it = &idm_transport_list[type];
517 			if (it->it_ops == NULL) {
518 				/* transport is not registered */
519 				continue;
520 			}
521 
522 			mutex_exit(&is->is_mutex);
523 			rc = it->it_ops->it_tgt_svc_online(is);
524 			mutex_enter(&is->is_mutex);
525 			if (rc != IDM_STATUS_SUCCESS) {
526 				last_type = type;
527 				break;
528 			}
529 		}
530 		if (rc != IDM_STATUS_SUCCESS) {
531 			/*
532 			 * The last transport failed to online.
533 			 * Offline any transport onlined above and
534 			 * do not online the target.
535 			 */
536 			for (type = 0; type < last_type; type++) {
537 				it = &idm_transport_list[type];
538 				if (it->it_ops == NULL) {
539 					/* transport is not registered */
540 					continue;
541 				}
542 
543 				mutex_exit(&is->is_mutex);
544 				it->it_ops->it_tgt_svc_offline(is);
545 				mutex_enter(&is->is_mutex);
546 			}
547 		} else {
548 			/* Target service now online */
549 			is->is_online = 1;
550 		}
551 	} else {
552 		/* Target service already online, just bump the count */
553 		is->is_online++;
554 	}
555 	mutex_exit(&is->is_mutex);
556 
557 	return (rc);
558 }
559 
560 /*
561  * idm_tgt_svc_offline
562  *
563  * is - idm_svc_t returned by the call to idm_tgt_svc_create
564  *
565  * Shutdown any online target services.
566  */
567 void
568 idm_tgt_svc_offline(idm_svc_t *is)
569 {
570 	idm_transport_type_t	type;
571 	idm_transport_t		*it;
572 
573 	mutex_enter(&is->is_mutex);
574 	is->is_online--;
575 	if (is->is_online == 0) {
576 		/* Walk through each of the transports and offline them */
577 		for (type = 0; type < IDM_TRANSPORT_NUM_TYPES; type++) {
578 			it = &idm_transport_list[type];
579 			if (it->it_ops == NULL) {
580 				/* transport is not registered */
581 				continue;
582 			}
583 
584 			mutex_exit(&is->is_mutex);
585 			it->it_ops->it_tgt_svc_offline(is);
586 			mutex_enter(&is->is_mutex);
587 		}
588 	}
589 	mutex_exit(&is->is_mutex);
590 }
591 
592 /*
593  * idm_tgt_svc_lookup
594  *
595  * Lookup a service instance listening on the specified port
596  */
597 
598 idm_svc_t *
599 idm_tgt_svc_lookup(uint16_t port)
600 {
601 	idm_svc_t *result;
602 
603 retry:
604 	mutex_enter(&idm.idm_global_mutex);
605 	for (result = list_head(&idm.idm_tgt_svc_list);
606 	    result != NULL;
607 	    result = list_next(&idm.idm_tgt_svc_list, result)) {
608 		if (result->is_svc_req.sr_port == port) {
609 			if (result->is_online == 0) {
610 				/*
611 				 * A service exists on this port, but it
612 				 * is going away, wait for it to cleanup.
613 				 */
614 				cv_wait(&idm.idm_tgt_svc_cv,
615 				    &idm.idm_global_mutex);
616 				mutex_exit(&idm.idm_global_mutex);
617 				goto retry;
618 			}
619 			idm_tgt_svc_hold(result);
620 			mutex_exit(&idm.idm_global_mutex);
621 			return (result);
622 		}
623 	}
624 	mutex_exit(&idm.idm_global_mutex);
625 
626 	return (NULL);
627 }
628 
629 /*
630  * idm_negotiate_key_values()
631  * Give IDM level a chance to negotiate any login parameters it should own.
632  *  -- leave unhandled parameters alone on request_nvl
633  *  -- move all handled parameters to response_nvl with an appropriate response
634  *  -- also add an entry to negotiated_nvl for any accepted parameters
635  */
636 kv_status_t
637 idm_negotiate_key_values(idm_conn_t *ic, nvlist_t *request_nvl,
638     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
639 {
640 	ASSERT(ic->ic_transport_ops != NULL);
641 	return (ic->ic_transport_ops->it_negotiate_key_values(ic,
642 	    request_nvl, response_nvl, negotiated_nvl));
643 }
644 
645 /*
646  * idm_notice_key_values()
647  * Activate at the IDM level any parameters that have been negotiated.
648  * Passes the set of key value pairs to the transport for activation.
649  * This will be invoked as the connection is entering full-feature mode.
650  */
651 void
652 idm_notice_key_values(idm_conn_t *ic, nvlist_t *negotiated_nvl)
653 {
654 	ASSERT(ic->ic_transport_ops != NULL);
655 	ic->ic_transport_ops->it_notice_key_values(ic, negotiated_nvl);
656 }
657 
658 /*
659  * idm_buf_tx_to_ini
660  *
661  * This is IDM's implementation of the 'Put_Data' operational primitive.
662  *
663  * This function is invoked by a target iSCSI layer to request its local
664  * Datamover layer to transmit the Data-In PDU to the peer iSCSI layer
665  * on the remote iSCSI node. The I/O buffer represented by 'idb' is
666  * transferred to the initiator associated with task 'idt'. The connection
667  * info, contents of the Data-In PDU header, the DataDescriptorIn, BHS,
668  * and the callback (idb->idb_buf_cb) at transfer completion are
669  * provided as input.
670  *
671  * This data transfer takes place transparently to the remote iSCSI layer,
672  * i.e. without its participation.
673  *
674  * Using sockets, IDM implements the data transfer by segmenting the data
675  * buffer into appropriately sized iSCSI PDUs and transmitting them to the
676  * initiator. iSER performs the transfer using RDMA write.
677  *
678  */
679 idm_status_t
680 idm_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb,
681     uint32_t offset, uint32_t xfer_len,
682     idm_buf_cb_t idb_buf_cb, void *cb_arg)
683 {
684 	idm_status_t rc;
685 
686 	idb->idb_bufoffset = offset;
687 	idb->idb_xfer_len = xfer_len;
688 	idb->idb_buf_cb = idb_buf_cb;
689 	idb->idb_cb_arg = cb_arg;
690 	gethrestime(&idb->idb_xfer_start);
691 
692 	/*
693 	 * Buffer should not contain the pattern.  If the pattern is
694 	 * present then we've been asked to transmit initialized data
695 	 */
696 	IDM_BUFPAT_CHECK(idb, xfer_len, BP_CHECK_ASSERT);
697 
698 	mutex_enter(&idt->idt_mutex);
699 	switch (idt->idt_state) {
700 	case TASK_ACTIVE:
701 		idt->idt_tx_to_ini_start++;
702 		idm_task_hold(idt);
703 		idm_buf_bind_in_locked(idt, idb);
704 		idb->idb_in_transport = B_TRUE;
705 		rc = (*idt->idt_ic->ic_transport_ops->it_buf_tx_to_ini)
706 		    (idt, idb);
707 		return (rc);
708 
709 	case TASK_SUSPENDING:
710 	case TASK_SUSPENDED:
711 		/*
712 		 * Bind buffer but don't start a transfer since the task
713 		 * is suspended
714 		 */
715 		idm_buf_bind_in_locked(idt, idb);
716 		mutex_exit(&idt->idt_mutex);
717 		return (IDM_STATUS_SUCCESS);
718 
719 	case TASK_ABORTING:
720 	case TASK_ABORTED:
721 		/*
722 		 * Once the task is aborted, any buffers added to the
723 		 * idt_inbufv will never get cleaned up, so just return
724 		 * SUCCESS.  The buffer should get cleaned up by the
725 		 * client or framework once task_aborted has completed.
726 		 */
727 		mutex_exit(&idt->idt_mutex);
728 		return (IDM_STATUS_SUCCESS);
729 
730 	default:
731 		ASSERT(0);
732 		break;
733 	}
734 	mutex_exit(&idt->idt_mutex);
735 
736 	return (IDM_STATUS_FAIL);
737 }
738 
739 /*
740  * idm_buf_rx_from_ini
741  *
742  * This is IDM's implementation of the 'Get_Data' operational primitive.
743  *
744  * This function is invoked by a target iSCSI layer to request its local
745  * Datamover layer to retrieve certain data identified by the R2T PDU from the
746  * peer iSCSI layer on the remote node. The retrieved Data-Out PDU will be
747  * mapped to the respective buffer by the task tags (ITT & TTT).
748  * The connection information, contents of an R2T PDU, DataDescriptor, BHS, and
749  * the callback (idb->idb_buf_cb) notification for data transfer completion are
750  * are provided as input.
751  *
752  * When an iSCSI node sends an R2T PDU to its local Datamover layer, the local
753  * Datamover layer, the local and remote Datamover layers transparently bring
754  * about the data transfer requested by the R2T PDU, without the participation
755  * of the iSCSI layers.
756  *
757  * Using sockets, IDM transmits an R2T PDU for each buffer and the rx_data_out()
758  * assembles the Data-Out PDUs into the buffer. iSER uses RDMA read.
759  *
760  */
761 idm_status_t
762 idm_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb,
763     uint32_t offset, uint32_t xfer_len,
764     idm_buf_cb_t idb_buf_cb, void *cb_arg)
765 {
766 	idm_status_t rc;
767 
768 	idb->idb_bufoffset = offset;
769 	idb->idb_xfer_len = xfer_len;
770 	idb->idb_buf_cb = idb_buf_cb;
771 	idb->idb_cb_arg = cb_arg;
772 	gethrestime(&idb->idb_xfer_start);
773 
774 	/*
775 	 * "In" buf list is for "Data In" PDU's, "Out" buf list is for
776 	 * "Data Out" PDU's
777 	 */
778 	mutex_enter(&idt->idt_mutex);
779 	switch (idt->idt_state) {
780 	case TASK_ACTIVE:
781 		idt->idt_rx_from_ini_start++;
782 		idm_task_hold(idt);
783 		idm_buf_bind_out_locked(idt, idb);
784 		idb->idb_in_transport = B_TRUE;
785 		rc = (*idt->idt_ic->ic_transport_ops->it_buf_rx_from_ini)
786 		    (idt, idb);
787 		return (rc);
788 	case TASK_SUSPENDING:
789 	case TASK_SUSPENDED:
790 	case TASK_ABORTING:
791 	case TASK_ABORTED:
792 		/*
793 		 * Bind buffer but don't start a transfer since the task
794 		 * is suspended
795 		 */
796 		idm_buf_bind_out_locked(idt, idb);
797 		mutex_exit(&idt->idt_mutex);
798 		return (IDM_STATUS_SUCCESS);
799 	default:
800 		ASSERT(0);
801 		break;
802 	}
803 	mutex_exit(&idt->idt_mutex);
804 
805 	return (IDM_STATUS_FAIL);
806 }
807 
808 /*
809  * idm_buf_tx_to_ini_done
810  *
811  * The transport calls this after it has completed a transfer requested by
812  * a call to transport_buf_tx_to_ini
813  *
814  * Caller holds idt->idt_mutex, idt->idt_mutex is released before returning.
815  * idt may be freed after the call to idb->idb_buf_cb.
816  */
817 void
818 idm_buf_tx_to_ini_done(idm_task_t *idt, idm_buf_t *idb, idm_status_t status)
819 {
820 	ASSERT(mutex_owned(&idt->idt_mutex));
821 	idb->idb_in_transport = B_FALSE;
822 	idb->idb_tx_thread = B_FALSE;
823 	idt->idt_tx_to_ini_done++;
824 	gethrestime(&idb->idb_xfer_done);
825 
826 	/*
827 	 * idm_refcnt_rele may cause TASK_SUSPENDING --> TASK_SUSPENDED or
828 	 * TASK_ABORTING --> TASK_ABORTED transistion if the refcount goes
829 	 * to 0.
830 	 */
831 	idm_task_rele(idt);
832 	idb->idb_status = status;
833 
834 	switch (idt->idt_state) {
835 	case TASK_ACTIVE:
836 		idm_buf_unbind_in_locked(idt, idb);
837 		mutex_exit(&idt->idt_mutex);
838 		(*idb->idb_buf_cb)(idb, status);
839 		return;
840 	case TASK_SUSPENDING:
841 	case TASK_SUSPENDED:
842 	case TASK_ABORTING:
843 	case TASK_ABORTED:
844 		/*
845 		 * To keep things simple we will ignore the case where the
846 		 * transfer was successful and leave all buffers bound to the
847 		 * task.  This allows us to also ignore the case where we've
848 		 * been asked to abort a task but the last transfer of the
849 		 * task has completed.  IDM has no idea whether this was, in
850 		 * fact, the last transfer of the task so it would be difficult
851 		 * to handle this case.  Everything should get sorted out again
852 		 * after task reassignment is complete.
853 		 *
854 		 * In the case of TASK_ABORTING we could conceivably call the
855 		 * buffer callback here but the timing of when the client's
856 		 * client_task_aborted callback is invoked vs. when the client's
857 		 * buffer callback gets invoked gets sticky.  We don't want
858 		 * the client to here from us again after the call to
859 		 * client_task_aborted() but we don't want to give it a bunch
860 		 * of failed buffer transfers until we've called
861 		 * client_task_aborted().  Instead we'll just leave all the
862 		 * buffers bound and allow the client to cleanup.
863 		 */
864 		break;
865 	default:
866 		ASSERT(0);
867 	}
868 	mutex_exit(&idt->idt_mutex);
869 }
870 
871 /*
872  * idm_buf_rx_from_ini_done
873  *
874  * The transport calls this after it has completed a transfer requested by
875  * a call totransport_buf_tx_to_ini
876  *
877  * Caller holds idt->idt_mutex, idt->idt_mutex is released before returning.
878  * idt may be freed after the call to idb->idb_buf_cb.
879  */
880 void
881 idm_buf_rx_from_ini_done(idm_task_t *idt, idm_buf_t *idb, idm_status_t status)
882 {
883 	ASSERT(mutex_owned(&idt->idt_mutex));
884 	idb->idb_in_transport = B_FALSE;
885 	idt->idt_rx_from_ini_done++;
886 	gethrestime(&idb->idb_xfer_done);
887 
888 	/*
889 	 * idm_refcnt_rele may cause TASK_SUSPENDING --> TASK_SUSPENDED or
890 	 * TASK_ABORTING --> TASK_ABORTED transistion if the refcount goes
891 	 * to 0.
892 	 */
893 	idm_task_rele(idt);
894 	idb->idb_status = status;
895 
896 	if (status == IDM_STATUS_SUCCESS) {
897 		/*
898 		 * Buffer should not contain the pattern.  If it does then
899 		 * we did not get the data from the remote host.
900 		 */
901 		IDM_BUFPAT_CHECK(idb, idb->idb_xfer_len, BP_CHECK_ASSERT);
902 	}
903 
904 	switch (idt->idt_state) {
905 	case TASK_ACTIVE:
906 		idm_buf_unbind_out_locked(idt, idb);
907 		mutex_exit(&idt->idt_mutex);
908 		(*idb->idb_buf_cb)(idb, status);
909 		return;
910 	case TASK_SUSPENDING:
911 	case TASK_SUSPENDED:
912 	case TASK_ABORTING:
913 	case TASK_ABORTED:
914 		/*
915 		 * To keep things simple we will ignore the case where the
916 		 * transfer was successful and leave all buffers bound to the
917 		 * task.  This allows us to also ignore the case where we've
918 		 * been asked to abort a task but the last transfer of the
919 		 * task has completed.  IDM has no idea whether this was, in
920 		 * fact, the last transfer of the task so it would be difficult
921 		 * to handle this case.  Everything should get sorted out again
922 		 * after task reassignment is complete.
923 		 *
924 		 * In the case of TASK_ABORTING we could conceivably call the
925 		 * buffer callback here but the timing of when the client's
926 		 * client_task_aborted callback is invoked vs. when the client's
927 		 * buffer callback gets invoked gets sticky.  We don't want
928 		 * the client to here from us again after the call to
929 		 * client_task_aborted() but we don't want to give it a bunch
930 		 * of failed buffer transfers until we've called
931 		 * client_task_aborted().  Instead we'll just leave all the
932 		 * buffers bound and allow the client to cleanup.
933 		 */
934 		break;
935 	default:
936 		ASSERT(0);
937 	}
938 	mutex_exit(&idt->idt_mutex);
939 }
940 
941 /*
942  * idm_buf_alloc
943  *
944  * Allocates a buffer handle and registers it for use with the transport
945  * layer. If a buffer is not passed on bufptr, the buffer will be allocated
946  * as well as the handle.
947  *
948  * ic		- connection on which the buffer will be transferred
949  * bufptr	- allocate memory for buffer if NULL, else assign to buffer
950  * buflen	- length of buffer
951  *
952  * Returns idm_buf_t handle if successful, otherwise NULL
953  */
954 idm_buf_t *
955 idm_buf_alloc(idm_conn_t *ic, void *bufptr, uint64_t buflen)
956 {
957 	idm_buf_t	*buf = NULL;
958 	int		rc;
959 
960 	ASSERT(ic != NULL);
961 	ASSERT(idm.idm_buf_cache != NULL);
962 	ASSERT(buflen > 0);
963 
964 	/* Don't allocate new buffers if we are not in FFP */
965 	mutex_enter(&ic->ic_state_mutex);
966 	if (!ic->ic_ffp) {
967 		mutex_exit(&ic->ic_state_mutex);
968 		return (NULL);
969 	}
970 
971 
972 	idm_conn_hold(ic);
973 	mutex_exit(&ic->ic_state_mutex);
974 
975 	buf = kmem_cache_alloc(idm.idm_buf_cache, KM_NOSLEEP);
976 	if (buf == NULL) {
977 		idm_conn_rele(ic);
978 		return (NULL);
979 	}
980 
981 	buf->idb_ic		= ic;
982 	buf->idb_buflen		= buflen;
983 	buf->idb_exp_offset	= 0;
984 	buf->idb_bufoffset	= 0;
985 	buf->idb_xfer_len 	= 0;
986 	buf->idb_magic		= IDM_BUF_MAGIC;
987 	buf->idb_in_transport	= B_FALSE;
988 	buf->idb_bufbcopy	= B_FALSE;
989 
990 	/*
991 	 * If bufptr is NULL, we have an implicit request to allocate
992 	 * memory for this IDM buffer handle and register it for use
993 	 * with the transport. To simplify this, and to give more freedom
994 	 * to the transport layer for it's own buffer management, both of
995 	 * these actions will take place in the transport layer.
996 	 * If bufptr is set, then the caller has allocated memory (or more
997 	 * likely it's been passed from an upper layer), and we need only
998 	 * register the buffer for use with the transport layer.
999 	 */
1000 	if (bufptr == NULL) {
1001 		/*
1002 		 * Allocate a buffer from the transport layer (which
1003 		 * will also register the buffer for use).
1004 		 */
1005 		rc = ic->ic_transport_ops->it_buf_alloc(buf, buflen);
1006 		if (rc != 0) {
1007 			idm_conn_rele(ic);
1008 			kmem_cache_free(idm.idm_buf_cache, buf);
1009 			return (NULL);
1010 		}
1011 		/* Set the bufalloc'd flag */
1012 		buf->idb_bufalloc = B_TRUE;
1013 	} else {
1014 		/*
1015 		 * For large transfers, Set the passed bufptr into
1016 		 * the buf handle, and register the handle with the
1017 		 * transport layer. As memory registration with the
1018 		 * transport layer is a time/cpu intensive operation,
1019 		 * for small transfers (up to a pre-defined bcopy
1020 		 * threshold), use pre-registered memory buffers
1021 		 * and bcopy data at the appropriate time.
1022 		 */
1023 		buf->idb_buf = bufptr;
1024 
1025 		rc = ic->ic_transport_ops->it_buf_setup(buf);
1026 		if (rc != 0) {
1027 			idm_conn_rele(ic);
1028 			kmem_cache_free(idm.idm_buf_cache, buf);
1029 			return (NULL);
1030 		}
1031 		/*
1032 		 * The transport layer is now expected to set the idb_bufalloc
1033 		 * correctly to indicate if resources have been allocated.
1034 		 */
1035 	}
1036 
1037 	IDM_BUFPAT_SET(buf);
1038 
1039 	return (buf);
1040 }
1041 
1042 /*
1043  * idm_buf_free
1044  *
1045  * Release a buffer handle along with the associated buffer that was allocated
1046  * or assigned with idm_buf_alloc
1047  */
1048 void
1049 idm_buf_free(idm_buf_t *buf)
1050 {
1051 	idm_conn_t *ic = buf->idb_ic;
1052 
1053 
1054 	buf->idb_task_binding	= NULL;
1055 
1056 	if (buf->idb_bufalloc) {
1057 		ic->ic_transport_ops->it_buf_free(buf);
1058 	} else {
1059 		ic->ic_transport_ops->it_buf_teardown(buf);
1060 	}
1061 	kmem_cache_free(idm.idm_buf_cache, buf);
1062 	idm_conn_rele(ic);
1063 }
1064 
1065 /*
1066  * idm_buf_bind_in
1067  *
1068  * This function associates a buffer with a task. This is only for use by the
1069  * iSCSI initiator that will have only one buffer per transfer direction
1070  *
1071  */
1072 void
1073 idm_buf_bind_in(idm_task_t *idt, idm_buf_t *buf)
1074 {
1075 	mutex_enter(&idt->idt_mutex);
1076 	idm_buf_bind_in_locked(idt, buf);
1077 	mutex_exit(&idt->idt_mutex);
1078 }
1079 
1080 static void
1081 idm_buf_bind_in_locked(idm_task_t *idt, idm_buf_t *buf)
1082 {
1083 	buf->idb_task_binding = idt;
1084 	buf->idb_ic = idt->idt_ic;
1085 	idm_listbuf_insert(&idt->idt_inbufv, buf);
1086 }
1087 
1088 void
1089 idm_buf_bind_out(idm_task_t *idt, idm_buf_t *buf)
1090 {
1091 	/*
1092 	 * For small transfers, the iSER transport delegates the IDM
1093 	 * layer to bcopy the SCSI Write data for faster IOPS.
1094 	 */
1095 	if (buf->idb_bufbcopy == B_TRUE) {
1096 
1097 		bcopy(buf->idb_bufptr, buf->idb_buf, buf->idb_buflen);
1098 	}
1099 	mutex_enter(&idt->idt_mutex);
1100 	idm_buf_bind_out_locked(idt, buf);
1101 	mutex_exit(&idt->idt_mutex);
1102 }
1103 
1104 static void
1105 idm_buf_bind_out_locked(idm_task_t *idt, idm_buf_t *buf)
1106 {
1107 	buf->idb_task_binding = idt;
1108 	buf->idb_ic = idt->idt_ic;
1109 	idm_listbuf_insert(&idt->idt_outbufv, buf);
1110 }
1111 
1112 void
1113 idm_buf_unbind_in(idm_task_t *idt, idm_buf_t *buf)
1114 {
1115 	/*
1116 	 * For small transfers, the iSER transport delegates the IDM
1117 	 * layer to bcopy the SCSI Read data into the read buufer
1118 	 * for faster IOPS.
1119 	 */
1120 	if (buf->idb_bufbcopy == B_TRUE) {
1121 		bcopy(buf->idb_buf, buf->idb_bufptr, buf->idb_buflen);
1122 	}
1123 	mutex_enter(&idt->idt_mutex);
1124 	idm_buf_unbind_in_locked(idt, buf);
1125 	mutex_exit(&idt->idt_mutex);
1126 }
1127 
1128 static void
1129 idm_buf_unbind_in_locked(idm_task_t *idt, idm_buf_t *buf)
1130 {
1131 	list_remove(&idt->idt_inbufv, buf);
1132 }
1133 
1134 void
1135 idm_buf_unbind_out(idm_task_t *idt, idm_buf_t *buf)
1136 {
1137 	mutex_enter(&idt->idt_mutex);
1138 	idm_buf_unbind_out_locked(idt, buf);
1139 	mutex_exit(&idt->idt_mutex);
1140 }
1141 
1142 static void
1143 idm_buf_unbind_out_locked(idm_task_t *idt, idm_buf_t *buf)
1144 {
1145 	list_remove(&idt->idt_outbufv, buf);
1146 }
1147 
1148 /*
1149  * idm_buf_find() will lookup the idm_buf_t based on the relative offset in the
1150  * iSCSI PDU
1151  */
1152 idm_buf_t *
1153 idm_buf_find(void *lbuf, size_t data_offset)
1154 {
1155 	idm_buf_t	*idb;
1156 	list_t		*lst = (list_t *)lbuf;
1157 
1158 	/* iterate through the list to find the buffer */
1159 	for (idb = list_head(lst); idb != NULL; idb = list_next(lst, idb)) {
1160 
1161 		ASSERT((idb->idb_ic->ic_conn_type == CONN_TYPE_TGT) ||
1162 		    (idb->idb_bufoffset == 0));
1163 
1164 		if ((data_offset >= idb->idb_bufoffset) &&
1165 		    (data_offset < (idb->idb_bufoffset + idb->idb_buflen))) {
1166 
1167 			return (idb);
1168 		}
1169 	}
1170 
1171 	return (NULL);
1172 }
1173 
1174 void
1175 idm_bufpat_set(idm_buf_t *idb)
1176 {
1177 	idm_bufpat_t	*bufpat;
1178 	int		len, i;
1179 
1180 	len = idb->idb_buflen;
1181 	len = (len / sizeof (idm_bufpat_t)) * sizeof (idm_bufpat_t);
1182 
1183 	bufpat = idb->idb_buf;
1184 	for (i = 0; i < len; i += sizeof (idm_bufpat_t)) {
1185 		bufpat->bufpat_idb = idb;
1186 		bufpat->bufpat_bufmagic = IDM_BUF_MAGIC;
1187 		bufpat->bufpat_offset = i;
1188 		bufpat++;
1189 	}
1190 }
1191 
1192 boolean_t
1193 idm_bufpat_check(idm_buf_t *idb, int check_len, idm_bufpat_check_type_t type)
1194 {
1195 	idm_bufpat_t	*bufpat;
1196 	int		len, i;
1197 
1198 	len = (type == BP_CHECK_QUICK) ? sizeof (idm_bufpat_t) : check_len;
1199 	len = (len / sizeof (idm_bufpat_t)) * sizeof (idm_bufpat_t);
1200 	ASSERT(len <= idb->idb_buflen);
1201 	bufpat = idb->idb_buf;
1202 
1203 	/*
1204 	 * Don't check the pattern in buffers that came from outside IDM
1205 	 * (these will be buffers from the initiator that we opted not
1206 	 * to double-buffer)
1207 	 */
1208 	if (!idb->idb_bufalloc)
1209 		return (B_FALSE);
1210 
1211 	/*
1212 	 * Return true if we find the pattern anywhere in the buffer
1213 	 */
1214 	for (i = 0; i < len; i += sizeof (idm_bufpat_t)) {
1215 		if (BUFPAT_MATCH(bufpat, idb)) {
1216 			IDM_CONN_LOG(CE_WARN, "idm_bufpat_check found: "
1217 			    "idb %p bufpat %p "
1218 			    "bufpat_idb=%p bufmagic=%08x offset=%08x",
1219 			    (void *)idb, (void *)bufpat, bufpat->bufpat_idb,
1220 			    bufpat->bufpat_bufmagic, bufpat->bufpat_offset);
1221 			DTRACE_PROBE2(bufpat__pattern__found,
1222 			    idm_buf_t *, idb, idm_bufpat_t *, bufpat);
1223 			if (type == BP_CHECK_ASSERT) {
1224 				ASSERT(0);
1225 			}
1226 			return (B_TRUE);
1227 		}
1228 		bufpat++;
1229 	}
1230 
1231 	return (B_FALSE);
1232 }
1233 
1234 /*
1235  * idm_task_alloc
1236  *
1237  * This function will allocate a idm_task_t structure. A task tag is also
1238  * generated and saved in idt_tt. The task is not active.
1239  */
1240 idm_task_t *
1241 idm_task_alloc(idm_conn_t *ic)
1242 {
1243 	idm_task_t	*idt;
1244 
1245 	ASSERT(ic != NULL);
1246 
1247 	/* Don't allocate new tasks if we are not in FFP */
1248 	mutex_enter(&ic->ic_state_mutex);
1249 	if (!ic->ic_ffp) {
1250 		mutex_exit(&ic->ic_state_mutex);
1251 		return (NULL);
1252 	}
1253 	idt = kmem_cache_alloc(idm.idm_task_cache, KM_NOSLEEP);
1254 	if (idt == NULL) {
1255 		mutex_exit(&ic->ic_state_mutex);
1256 		return (NULL);
1257 	}
1258 
1259 	ASSERT(list_is_empty(&idt->idt_inbufv));
1260 	ASSERT(list_is_empty(&idt->idt_outbufv));
1261 
1262 	idm_conn_hold(ic);
1263 	mutex_exit(&ic->ic_state_mutex);
1264 
1265 	idt->idt_state		= TASK_IDLE;
1266 	idt->idt_ic		= ic;
1267 	idt->idt_private 	= NULL;
1268 	idt->idt_exp_datasn	= 0;
1269 	idt->idt_exp_rttsn	= 0;
1270 
1271 	return (idt);
1272 }
1273 
1274 /*
1275  * idm_task_start
1276  *
1277  * Mark the task active and initialize some stats. The caller
1278  * sets up the idm_task_t structure with a prior call to idm_task_alloc().
1279  * The task service does not function as a task/work engine, it is the
1280  * responsibility of the initiator to start the data transfer and free the
1281  * resources.
1282  */
1283 void
1284 idm_task_start(idm_task_t *idt, uintptr_t handle)
1285 {
1286 	ASSERT(idt != NULL);
1287 
1288 	/* mark the task as ACTIVE */
1289 	idt->idt_state = TASK_ACTIVE;
1290 	idt->idt_client_handle = handle;
1291 	idt->idt_tx_to_ini_start = idt->idt_tx_to_ini_done =
1292 	    idt->idt_rx_from_ini_start = idt->idt_rx_from_ini_done =
1293 	    idt->idt_tx_bytes = idt->idt_rx_bytes = 0;
1294 }
1295 
1296 /*
1297  * idm_task_done
1298  *
1299  * This function sets the state to indicate that the task is no longer active.
1300  */
1301 void
1302 idm_task_done(idm_task_t *idt)
1303 {
1304 	ASSERT(idt != NULL);
1305 
1306 	mutex_enter(&idt->idt_mutex);
1307 	idt->idt_state = TASK_IDLE;
1308 	mutex_exit(&idt->idt_mutex);
1309 
1310 	/*
1311 	 * Although unlikely it is possible for a reference to come in after
1312 	 * the client has decided the task is over but before we've marked
1313 	 * the task idle.  One specific unavoidable scenario is the case where
1314 	 * received PDU with the matching ITT/TTT results in a successful
1315 	 * lookup of this task.  We are at the mercy of the remote node in
1316 	 * that case so we need to handle it.  Now that the task state
1317 	 * has changed no more references will occur so a simple call to
1318 	 * idm_refcnt_wait_ref should deal with the situation.
1319 	 */
1320 	idm_refcnt_wait_ref(&idt->idt_refcnt);
1321 	idm_refcnt_reset(&idt->idt_refcnt);
1322 }
1323 
1324 /*
1325  * idm_task_free
1326  *
1327  * This function will free the Task Tag and the memory allocated for the task
1328  * idm_task_done should be called prior to this call
1329  */
1330 void
1331 idm_task_free(idm_task_t *idt)
1332 {
1333 	idm_conn_t *ic;
1334 
1335 	ASSERT(idt != NULL);
1336 	ASSERT(idt->idt_refcnt.ir_refcnt == 0);
1337 	ASSERT(idt->idt_state == TASK_IDLE);
1338 
1339 	ic = idt->idt_ic;
1340 
1341 	/*
1342 	 * It's possible for items to still be in the idt_inbufv list if
1343 	 * they were added after idm_task_cleanup was called.  We rely on
1344 	 * STMF to free all buffers associated with the task however STMF
1345 	 * doesn't know that we have this reference to the buffers.
1346 	 * Use list_create so that we don't end up with stale references
1347 	 * to these buffers.
1348 	 */
1349 	list_create(&idt->idt_inbufv, sizeof (idm_buf_t),
1350 	    offsetof(idm_buf_t, idb_buflink));
1351 	list_create(&idt->idt_outbufv, sizeof (idm_buf_t),
1352 	    offsetof(idm_buf_t, idb_buflink));
1353 
1354 	kmem_cache_free(idm.idm_task_cache, idt);
1355 
1356 	idm_conn_rele(ic);
1357 }
1358 
1359 /*
1360  * idm_task_find_common
1361  *	common code for idm_task_find() and idm_task_find_and_complete()
1362  */
1363 /*ARGSUSED*/
1364 static idm_task_t *
1365 idm_task_find_common(idm_conn_t *ic, uint32_t itt, uint32_t ttt,
1366     boolean_t complete)
1367 {
1368 	uint32_t	tt, client_handle;
1369 	idm_task_t	*idt;
1370 
1371 	/*
1372 	 * Must match both itt and ttt.  The table is indexed by itt
1373 	 * for initiator connections and ttt for target connections.
1374 	 */
1375 	if (IDM_CONN_ISTGT(ic)) {
1376 		tt = ttt;
1377 		client_handle = itt;
1378 	} else {
1379 		tt = itt;
1380 		client_handle = ttt;
1381 	}
1382 
1383 	rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1384 	if (tt >= idm.idm_taskid_max) {
1385 		rw_exit(&idm.idm_taskid_table_lock);
1386 		return (NULL);
1387 	}
1388 
1389 	idt = idm.idm_taskid_table[tt];
1390 
1391 	if (idt != NULL) {
1392 		mutex_enter(&idt->idt_mutex);
1393 		if ((idt->idt_state != TASK_ACTIVE) ||
1394 		    (idt->idt_ic != ic) ||
1395 		    (IDM_CONN_ISTGT(ic) &&
1396 		    (idt->idt_client_handle != client_handle))) {
1397 			/*
1398 			 * Task doesn't match or task is aborting and
1399 			 * we don't want any more references.
1400 			 */
1401 			if ((idt->idt_ic != ic) &&
1402 			    (idt->idt_state == TASK_ACTIVE) &&
1403 			    (IDM_CONN_ISINI(ic) || idt->idt_client_handle ==
1404 			    client_handle)) {
1405 				IDM_CONN_LOG(CE_WARN,
1406 				"idm_task_find: wrong connection %p != %p",
1407 				    (void *)ic, (void *)idt->idt_ic);
1408 			}
1409 			mutex_exit(&idt->idt_mutex);
1410 			rw_exit(&idm.idm_taskid_table_lock);
1411 			return (NULL);
1412 		}
1413 		idm_task_hold(idt);
1414 		/*
1415 		 * Set the task state to TASK_COMPLETE so it can no longer
1416 		 * be found or aborted.
1417 		 */
1418 		if (B_TRUE == complete)
1419 			idt->idt_state = TASK_COMPLETE;
1420 		mutex_exit(&idt->idt_mutex);
1421 	}
1422 	rw_exit(&idm.idm_taskid_table_lock);
1423 
1424 	return (idt);
1425 }
1426 
1427 /*
1428  * This function looks up a task by task tag.
1429  */
1430 idm_task_t *
1431 idm_task_find(idm_conn_t *ic, uint32_t itt, uint32_t ttt)
1432 {
1433 	return (idm_task_find_common(ic, itt, ttt, B_FALSE));
1434 }
1435 
1436 /*
1437  * This function looks up a task by task tag. If found, the task state
1438  * is atomically set to TASK_COMPLETE so it can longer be found or aborted.
1439  */
1440 idm_task_t *
1441 idm_task_find_and_complete(idm_conn_t *ic, uint32_t itt, uint32_t ttt)
1442 {
1443 	return (idm_task_find_common(ic, itt, ttt, B_TRUE));
1444 }
1445 
1446 /*
1447  * idm_task_find_by_handle
1448  *
1449  * This function looks up a task by the client-private idt_client_handle.
1450  *
1451  * This function should NEVER be called in the performance path.  It is
1452  * intended strictly for error recovery/task management.
1453  */
1454 /*ARGSUSED*/
1455 void *
1456 idm_task_find_by_handle(idm_conn_t *ic, uintptr_t handle)
1457 {
1458 	idm_task_t	*idt = NULL;
1459 	int		idx = 0;
1460 
1461 	rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1462 
1463 	for (idx = 0; idx < idm.idm_taskid_max; idx++) {
1464 		idt = idm.idm_taskid_table[idx];
1465 
1466 		if (idt == NULL)
1467 			continue;
1468 
1469 		mutex_enter(&idt->idt_mutex);
1470 
1471 		if (idt->idt_state != TASK_ACTIVE) {
1472 			/*
1473 			 * Task is either in suspend, abort, or already
1474 			 * complete.
1475 			 */
1476 			mutex_exit(&idt->idt_mutex);
1477 			continue;
1478 		}
1479 
1480 		if (idt->idt_client_handle == handle) {
1481 			idm_task_hold(idt);
1482 			mutex_exit(&idt->idt_mutex);
1483 			break;
1484 		}
1485 
1486 		mutex_exit(&idt->idt_mutex);
1487 	}
1488 
1489 	rw_exit(&idm.idm_taskid_table_lock);
1490 
1491 	if ((idt == NULL) || (idx == idm.idm_taskid_max))
1492 		return (NULL);
1493 
1494 	return (idt->idt_private);
1495 }
1496 
1497 void
1498 idm_task_hold(idm_task_t *idt)
1499 {
1500 	idm_refcnt_hold(&idt->idt_refcnt);
1501 }
1502 
1503 void
1504 idm_task_rele(idm_task_t *idt)
1505 {
1506 	idm_refcnt_rele(&idt->idt_refcnt);
1507 }
1508 
1509 void
1510 idm_task_abort(idm_conn_t *ic, idm_task_t *idt, idm_abort_type_t abort_type)
1511 {
1512 	idm_task_t	*task;
1513 	int		idx;
1514 
1515 	/*
1516 	 * Passing NULL as the task indicates that all tasks
1517 	 * for this connection should be aborted.
1518 	 */
1519 	if (idt == NULL) {
1520 		/*
1521 		 * Only the connection state machine should ask for
1522 		 * all tasks to abort and this should never happen in FFP.
1523 		 */
1524 		ASSERT(!ic->ic_ffp);
1525 		rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1526 		for (idx = 0; idx < idm.idm_taskid_max; idx++) {
1527 			task = idm.idm_taskid_table[idx];
1528 			if (task == NULL)
1529 				continue;
1530 			mutex_enter(&task->idt_mutex);
1531 			if ((task->idt_state != TASK_IDLE) &&
1532 			    (task->idt_state != TASK_COMPLETE) &&
1533 			    (task->idt_ic == ic)) {
1534 				rw_exit(&idm.idm_taskid_table_lock);
1535 				idm_task_abort_one(ic, task, abort_type);
1536 				rw_enter(&idm.idm_taskid_table_lock, RW_READER);
1537 			} else
1538 				mutex_exit(&task->idt_mutex);
1539 		}
1540 		rw_exit(&idm.idm_taskid_table_lock);
1541 	} else {
1542 		mutex_enter(&idt->idt_mutex);
1543 		idm_task_abort_one(ic, idt, abort_type);
1544 	}
1545 }
1546 
1547 static void
1548 idm_task_abort_unref_cb(void *ref)
1549 {
1550 	idm_task_t *idt = ref;
1551 
1552 	mutex_enter(&idt->idt_mutex);
1553 	switch (idt->idt_state) {
1554 	case TASK_SUSPENDING:
1555 		idt->idt_state = TASK_SUSPENDED;
1556 		mutex_exit(&idt->idt_mutex);
1557 		idm_task_aborted(idt, IDM_STATUS_SUSPENDED);
1558 		return;
1559 	case TASK_ABORTING:
1560 		idt->idt_state = TASK_ABORTED;
1561 		mutex_exit(&idt->idt_mutex);
1562 		idm_task_aborted(idt, IDM_STATUS_ABORTED);
1563 		return;
1564 	default:
1565 		mutex_exit(&idt->idt_mutex);
1566 		ASSERT(0);
1567 		break;
1568 	}
1569 }
1570 
/*
 * idm_task_abort_one
 *
 * Abort or suspend a single idm task according to its current state and
 * the requested abort_type.
 *
 * The caller must hold the task mutex (idt->idt_mutex); it is always
 * released before this function returns.
 *
 * Summary of the transitions made here:
 *   TASK_ACTIVE     + suspend -> TASK_SUSPENDING, transport resources
 *                                freed, async wait for references
 *   TASK_ACTIVE     + abort   -> TASK_ABORTING, transport resources
 *                                freed, async wait for references
 *   TASK_SUSPENDING + abort   -> TASK_ABORTING (a wait is already pending)
 *   TASK_SUSPENDED  + abort   -> TASK_ABORTING, async wait for references
 *   all other combinations    -> no change
 * When the async wait completes, idm_task_abort_unref_cb() finishes the
 * transition and notifies the client.
 */
static void
idm_task_abort_one(idm_conn_t *ic, idm_task_t *idt, idm_abort_type_t abort_type)
{
	/* Caller must hold the task mutex */
	ASSERT(mutex_owned(&idt->idt_mutex));
	switch (idt->idt_state) {
	case TASK_ACTIVE:
		switch (abort_type) {
		case AT_INTERNAL_SUSPEND:
			/* Call transport to release any resources */
			idt->idt_state = TASK_SUSPENDING;
			mutex_exit(&idt->idt_mutex);
			ic->ic_transport_ops->it_free_task_rsrc(idt);

			/*
			 * Wait for outstanding references.  When all
			 * references are released the callback will call
			 * idm_task_aborted().
			 */
			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
			    &idm_task_abort_unref_cb);
			return;
		case AT_INTERNAL_ABORT:
		case AT_TASK_MGMT_ABORT:
			/* Same sequence as suspend, but heading for aborted */
			idt->idt_state = TASK_ABORTING;
			mutex_exit(&idt->idt_mutex);
			ic->ic_transport_ops->it_free_task_rsrc(idt);

			/*
			 * Wait for outstanding references.  When all
			 * references are released the callback will call
			 * idm_task_aborted().
			 */
			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
			    &idm_task_abort_unref_cb);
			return;
		default:
			ASSERT(0);
		}
		break;
	case TASK_SUSPENDING:
		/* Already called transport_free_task_rsrc(); */
		switch (abort_type) {
		case AT_INTERNAL_SUSPEND:
			/* Already doing it */
			break;
		case AT_INTERNAL_ABORT:
		case AT_TASK_MGMT_ABORT:
			/*
			 * Only the state is upgraded; no new wait is
			 * started in this state.
			 */
			idt->idt_state = TASK_ABORTING;
			break;
		default:
			ASSERT(0);
		}
		break;
	case TASK_SUSPENDED:
		/* Already called transport_free_task_rsrc(); */
		switch (abort_type) {
		case AT_INTERNAL_SUSPEND:
			/* Already doing it */
			break;
		case AT_INTERNAL_ABORT:
		case AT_TASK_MGMT_ABORT:
			idt->idt_state = TASK_ABORTING;
			mutex_exit(&idt->idt_mutex);

			/*
			 * We could probably call idm_task_aborted directly
			 * here but we may be holding the conn lock. It's
			 * easier to just switch contexts.  Even though
			 * we shouldn't really have any references we'll
			 * set the state to TASK_ABORTING instead of
			 * TASK_ABORTED so we can use the same code path.
			 */
			idm_refcnt_async_wait_ref(&idt->idt_refcnt,
			    &idm_task_abort_unref_cb);
			return;
		default:
			ASSERT(0);
		}
		break;
	case TASK_ABORTING:
	case TASK_ABORTED:
		switch (abort_type) {
		case AT_INTERNAL_SUSPEND:
			/* We're already past this point... */
			/* FALLTHROUGH */
		case AT_INTERNAL_ABORT:
		case AT_TASK_MGMT_ABORT:
			/* Already doing it */
			break;
		default:
			ASSERT(0);
		}
		break;
	case TASK_COMPLETE:
		/*
		 * In this case, let it go.  The status has already been
		 * sent (which may or may not get successfully transmitted)
		 * and we don't want to end up in a race between completing
		 * the status PDU and marking the task suspended.
		 */
		break;
	default:
		ASSERT(0);
	}
	/* Every path that did not return above still holds the mutex */
	mutex_exit(&idt->idt_mutex);
}
1681 
1682 static void
1683 idm_task_aborted(idm_task_t *idt, idm_status_t status)
1684 {
1685 	(*idt->idt_ic->ic_conn_ops.icb_task_aborted)(idt, status);
1686 }
1687 
1688 void
1689 idm_task_cleanup(idm_task_t *idt)
1690 {
1691 	idm_buf_t *idb, *next_idb;
1692 	list_t		tmp_buflist;
1693 	ASSERT((idt->idt_state == TASK_SUSPENDED) ||
1694 	    (idt->idt_state == TASK_ABORTED));
1695 
1696 	list_create(&tmp_buflist, sizeof (idm_buf_t),
1697 	    offsetof(idm_buf_t, idb_buflink));
1698 
1699 	/*
1700 	 * Remove all the buffers from the task and add them to a
1701 	 * temporary local list -- we do this so that we can hold
1702 	 * the task lock and prevent the task from going away if
1703 	 * the client decides to call idm_task_done/idm_task_free.
1704 	 * This could happen during abort in iscsit.
1705 	 */
1706 	mutex_enter(&idt->idt_mutex);
1707 	for (idb = list_head(&idt->idt_inbufv);
1708 	    idb != NULL;
1709 	    idb = next_idb) {
1710 		next_idb = list_next(&idt->idt_inbufv, idb);
1711 		idm_buf_unbind_in_locked(idt, idb);
1712 		list_insert_tail(&tmp_buflist, idb);
1713 	}
1714 
1715 	for (idb = list_head(&idt->idt_outbufv);
1716 	    idb != NULL;
1717 	    idb = next_idb) {
1718 		next_idb = list_next(&idt->idt_outbufv, idb);
1719 		idm_buf_unbind_out_locked(idt, idb);
1720 		list_insert_tail(&tmp_buflist, idb);
1721 	}
1722 	mutex_exit(&idt->idt_mutex);
1723 
1724 	for (idb = list_head(&tmp_buflist); idb != NULL; idb = next_idb) {
1725 		next_idb = list_next(&tmp_buflist, idb);
1726 		list_remove(&tmp_buflist, idb);
1727 		(*idb->idb_buf_cb)(idb, IDM_STATUS_ABORTED);
1728 	}
1729 	list_destroy(&tmp_buflist);
1730 }
1731 
1732 
1733 /*
1734  * idm_pdu_tx
1735  *
1736  * This is IDM's implementation of the 'Send_Control' operational primitive.
1737  * This function is invoked by an initiator iSCSI layer requesting the transfer
1738  * of a iSCSI command PDU or a target iSCSI layer requesting the transfer of a
1739  * iSCSI response PDU. The PDU will be transmitted as-is by the local Datamover
1740  * layer to the peer iSCSI layer in the remote iSCSI node. The connection info
1741  * and iSCSI PDU-specific qualifiers namely BHS, AHS, DataDescriptor and Size
1742  * are provided as input.
1743  *
1744  */
1745 void
1746 idm_pdu_tx(idm_pdu_t *pdu)
1747 {
1748 	idm_conn_t		*ic = pdu->isp_ic;
1749 	iscsi_async_evt_hdr_t	*async_evt;
1750 
1751 	/*
1752 	 * If we are in full-featured mode then route SCSI-related
1753 	 * commands to the appropriate function vector without checking
1754 	 * the connection state.  We will only be in full-feature mode
1755 	 * when we are in an acceptable state for SCSI PDU's.
1756 	 *
1757 	 * We also need to ensure that there are no PDU events outstanding
1758 	 * on the state machine.  Any non-SCSI PDU's received in full-feature
1759 	 * mode will result in PDU events and until these have been handled
1760 	 * we need to route all PDU's through the state machine as PDU
1761 	 * events to maintain ordering.
1762 	 *
1763 	 * Note that IDM cannot enter FFP mode until it processes in
1764 	 * its state machine the last xmit of the login process.
1765 	 * Hence, checking the IDM_PDU_LOGIN_TX flag here would be
1766 	 * superfluous.
1767 	 */
1768 	mutex_enter(&ic->ic_state_mutex);
1769 	if (ic->ic_ffp && (ic->ic_pdu_events == 0)) {
1770 		mutex_exit(&ic->ic_state_mutex);
1771 		switch (IDM_PDU_OPCODE(pdu)) {
1772 		case ISCSI_OP_SCSI_RSP:
1773 			/* Target only */
1774 			idm_pdu_tx_forward(ic, pdu);
1775 			return;
1776 		case ISCSI_OP_SCSI_TASK_MGT_RSP:
1777 			/* Target only */
1778 			idm_pdu_tx_forward(ic, pdu);
1779 			return;
1780 		case ISCSI_OP_SCSI_DATA_RSP:
1781 			/* Target only */
1782 			idm_pdu_tx_forward(ic, pdu);
1783 			return;
1784 		case ISCSI_OP_RTT_RSP:
1785 			/* Target only */
1786 			idm_pdu_tx_forward(ic, pdu);
1787 			return;
1788 		case ISCSI_OP_NOOP_IN:
1789 			/* Target only */
1790 			idm_pdu_tx_forward(ic, pdu);
1791 			return;
1792 		case ISCSI_OP_TEXT_RSP:
1793 			/* Target only */
1794 			idm_pdu_tx_forward(ic, pdu);
1795 			return;
1796 		case ISCSI_OP_TEXT_CMD:
1797 		case ISCSI_OP_NOOP_OUT:
1798 		case ISCSI_OP_SCSI_CMD:
1799 		case ISCSI_OP_SCSI_DATA:
1800 		case ISCSI_OP_SCSI_TASK_MGT_MSG:
1801 			/* Initiator only */
1802 			idm_pdu_tx_forward(ic, pdu);
1803 			return;
1804 		default:
1805 			break;
1806 		}
1807 
1808 		mutex_enter(&ic->ic_state_mutex);
1809 	}
1810 
1811 	/*
1812 	 * Any PDU's processed outside of full-feature mode and non-SCSI
1813 	 * PDU's in full-feature mode are handled by generating an
1814 	 * event to the connection state machine.  The state machine
1815 	 * will validate the PDU against the current state and either
1816 	 * transmit the PDU if the opcode is allowed or handle an
1817 	 * error if the PDU is not allowed.
1818 	 *
1819 	 * This code-path will also generate any events that are implied
1820 	 * by the PDU opcode.  For example a "login response" with success
1821 	 * status generates a CE_LOGOUT_SUCCESS_SND event.
1822 	 */
1823 	switch (IDM_PDU_OPCODE(pdu)) {
1824 	case ISCSI_OP_LOGIN_CMD:
1825 		idm_conn_tx_pdu_event(ic, CE_LOGIN_SND, (uintptr_t)pdu);
1826 		break;
1827 	case ISCSI_OP_LOGIN_RSP:
1828 		idm_parse_login_rsp(ic, pdu, /* Is RX */ B_FALSE);
1829 		break;
1830 	case ISCSI_OP_LOGOUT_CMD:
1831 		idm_parse_logout_req(ic, pdu, /* Is RX */ B_FALSE);
1832 		break;
1833 	case ISCSI_OP_LOGOUT_RSP:
1834 		idm_parse_logout_rsp(ic, pdu, /* Is RX */ B_FALSE);
1835 		break;
1836 	case ISCSI_OP_ASYNC_EVENT:
1837 		async_evt = (iscsi_async_evt_hdr_t *)pdu->isp_hdr;
1838 		switch (async_evt->async_event) {
1839 		case ISCSI_ASYNC_EVENT_REQUEST_LOGOUT:
1840 			idm_conn_tx_pdu_event(ic, CE_ASYNC_LOGOUT_SND,
1841 			    (uintptr_t)pdu);
1842 			break;
1843 		case ISCSI_ASYNC_EVENT_DROPPING_CONNECTION:
1844 			idm_conn_tx_pdu_event(ic, CE_ASYNC_DROP_CONN_SND,
1845 			    (uintptr_t)pdu);
1846 			break;
1847 		case ISCSI_ASYNC_EVENT_DROPPING_ALL_CONNECTIONS:
1848 			idm_conn_tx_pdu_event(ic, CE_ASYNC_DROP_ALL_CONN_SND,
1849 			    (uintptr_t)pdu);
1850 			break;
1851 		case ISCSI_ASYNC_EVENT_SCSI_EVENT:
1852 		case ISCSI_ASYNC_EVENT_PARAM_NEGOTIATION:
1853 		default:
1854 			idm_conn_tx_pdu_event(ic, CE_MISC_TX,
1855 			    (uintptr_t)pdu);
1856 			break;
1857 		}
1858 		break;
1859 	case ISCSI_OP_SCSI_RSP:
1860 		/* Target only */
1861 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1862 		break;
1863 	case ISCSI_OP_SCSI_TASK_MGT_RSP:
1864 		/* Target only */
1865 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1866 		break;
1867 	case ISCSI_OP_SCSI_DATA_RSP:
1868 		/* Target only */
1869 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1870 		break;
1871 	case ISCSI_OP_RTT_RSP:
1872 		/* Target only */
1873 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1874 		break;
1875 	case ISCSI_OP_NOOP_IN:
1876 		/* Target only */
1877 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1878 		break;
1879 	case ISCSI_OP_TEXT_RSP:
1880 		/* Target only */
1881 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1882 		break;
1883 		/* Initiator only */
1884 	case ISCSI_OP_SCSI_CMD:
1885 	case ISCSI_OP_SCSI_TASK_MGT_MSG:
1886 	case ISCSI_OP_SCSI_DATA:
1887 	case ISCSI_OP_NOOP_OUT:
1888 	case ISCSI_OP_TEXT_CMD:
1889 	case ISCSI_OP_SNACK_CMD:
1890 	case ISCSI_OP_REJECT_MSG:
1891 	default:
1892 		/*
1893 		 * Connection state machine will validate these PDU's against
1894 		 * the current state.  A PDU not allowed in the current
1895 		 * state will cause a protocol error.
1896 		 */
1897 		idm_conn_tx_pdu_event(ic, CE_MISC_TX, (uintptr_t)pdu);
1898 		break;
1899 	}
1900 	mutex_exit(&ic->ic_state_mutex);
1901 }
1902 
1903 /*
1904  * Common allocation of a PDU along with memory for header and data.
1905  */
1906 static idm_pdu_t *
1907 idm_pdu_alloc_common(uint_t hdrlen, uint_t datalen, int sleepflag)
1908 {
1909 	idm_pdu_t *result;
1910 
1911 	/*
1912 	 * IDM clients should cache these structures for performance
1913 	 * critical paths.  We can't cache effectively in IDM because we
1914 	 * don't know the correct header and data size.
1915 	 *
1916 	 * Valid header length is assumed to be hdrlen and valid data
1917 	 * length is assumed to be datalen.  isp_hdrlen and isp_datalen
1918 	 * can be adjusted after the PDU is returned if necessary.
1919 	 */
1920 	result = kmem_zalloc(sizeof (idm_pdu_t) + hdrlen + datalen, sleepflag);
1921 	if (result != NULL) {
1922 		/* For idm_pdu_free sanity check */
1923 		result->isp_flags |= IDM_PDU_ALLOC;
1924 		/* pointer arithmetic */
1925 		result->isp_hdr = (iscsi_hdr_t *)(result + 1);
1926 		result->isp_hdrlen = hdrlen;
1927 		result->isp_hdrbuflen = hdrlen;
1928 		result->isp_transport_hdrlen = 0;
1929 		result->isp_data = (uint8_t *)result->isp_hdr + hdrlen;
1930 		result->isp_datalen = datalen;
1931 		result->isp_databuflen = datalen;
1932 		result->isp_magic = IDM_PDU_MAGIC;
1933 	}
1934 
1935 	return (result);
1936 }
1937 
1938 /*
1939  * Typical idm_pdu_alloc invocation, will block for resources.
1940  */
1941 idm_pdu_t *
1942 idm_pdu_alloc(uint_t hdrlen, uint_t datalen)
1943 {
1944 	return (idm_pdu_alloc_common(hdrlen, datalen, KM_SLEEP));
1945 }
1946 
1947 /*
1948  * Non-blocking idm_pdu_alloc implementation, returns NULL if resources
1949  * are not available.  Needed for transport-layer allocations which may
1950  * be invoking in interrupt context.
1951  */
1952 idm_pdu_t *
1953 idm_pdu_alloc_nosleep(uint_t hdrlen, uint_t datalen)
1954 {
1955 	return (idm_pdu_alloc_common(hdrlen, datalen, KM_NOSLEEP));
1956 }
1957 
1958 /*
1959  * Free a PDU previously allocated with idm_pdu_alloc() including any
1960  * header and data space allocated as part of the original request.
1961  * Additional memory regions referenced by subsequent modification of
1962  * the isp_hdr and/or isp_data fields will not be freed.
1963  */
1964 void
1965 idm_pdu_free(idm_pdu_t *pdu)
1966 {
1967 	/* Make sure the structure was allocated using idm_pdu_alloc() */
1968 	ASSERT(pdu->isp_flags & IDM_PDU_ALLOC);
1969 	kmem_free(pdu,
1970 	    sizeof (idm_pdu_t) + pdu->isp_hdrbuflen + pdu->isp_databuflen);
1971 }
1972 
1973 /*
1974  * Initialize the connection, private and callback fields in a PDU.
1975  */
1976 void
1977 idm_pdu_init(idm_pdu_t *pdu, idm_conn_t *ic, void *private, idm_pdu_cb_t *cb)
1978 {
1979 	/*
1980 	 * idm_pdu_complete() will call idm_pdu_free if the callback is
1981 	 * NULL.  This will only work if the PDU was originally allocated
1982 	 * with idm_pdu_alloc().
1983 	 */
1984 	ASSERT((pdu->isp_flags & IDM_PDU_ALLOC) ||
1985 	    (cb != NULL));
1986 	pdu->isp_magic = IDM_PDU_MAGIC;
1987 	pdu->isp_ic = ic;
1988 	pdu->isp_private = private;
1989 	pdu->isp_callback = cb;
1990 }
1991 
1992 /*
1993  * Initialize the header and header length field.  This function should
1994  * not be used to adjust the header length in a buffer allocated via
1995  * pdu_pdu_alloc since it overwrites the existing header pointer.
1996  */
1997 void
1998 idm_pdu_init_hdr(idm_pdu_t *pdu, uint8_t *hdr, uint_t hdrlen)
1999 {
2000 	pdu->isp_hdr = (iscsi_hdr_t *)((void *)hdr);
2001 	pdu->isp_hdrlen = hdrlen;
2002 }
2003 
2004 /*
2005  * Initialize the data and data length fields.  This function should
2006  * not be used to adjust the data length of a buffer allocated via
2007  * idm_pdu_alloc since it overwrites the existing data pointer.
2008  */
2009 void
2010 idm_pdu_init_data(idm_pdu_t *pdu, uint8_t *data, uint_t datalen)
2011 {
2012 	pdu->isp_data = data;
2013 	pdu->isp_datalen = datalen;
2014 }
2015 
2016 void
2017 idm_pdu_complete(idm_pdu_t *pdu, idm_status_t status)
2018 {
2019 	if (pdu->isp_callback) {
2020 		pdu->isp_status = status;
2021 		(*pdu->isp_callback)(pdu, status);
2022 	} else {
2023 		idm_pdu_free(pdu);
2024 	}
2025 }
2026 
2027 /*
2028  * State machine auditing
2029  */
2030 
2031 void
2032 idm_sm_audit_init(sm_audit_buf_t *audit_buf)
2033 {
2034 	bzero(audit_buf, sizeof (sm_audit_buf_t));
2035 	audit_buf->sab_max_index = SM_AUDIT_BUF_MAX_REC - 1;
2036 }
2037 
2038 static
2039 sm_audit_record_t *
2040 idm_sm_audit_common(sm_audit_buf_t *audit_buf, sm_audit_record_type_t r_type,
2041     sm_audit_sm_type_t sm_type,
2042     int current_state)
2043 {
2044 	sm_audit_record_t *sar;
2045 
2046 	sar = audit_buf->sab_records;
2047 	sar += audit_buf->sab_index;
2048 	audit_buf->sab_index++;
2049 	audit_buf->sab_index &= audit_buf->sab_max_index;
2050 
2051 	sar->sar_type = r_type;
2052 	gethrestime(&sar->sar_timestamp);
2053 	sar->sar_sm_type = sm_type;
2054 	sar->sar_state = current_state;
2055 
2056 	return (sar);
2057 }
2058 
2059 void
2060 idm_sm_audit_event(sm_audit_buf_t *audit_buf,
2061     sm_audit_sm_type_t sm_type, int current_state,
2062     int event, uintptr_t event_info)
2063 {
2064 	sm_audit_record_t *sar;
2065 
2066 	sar = idm_sm_audit_common(audit_buf, SAR_STATE_EVENT,
2067 	    sm_type, current_state);
2068 	sar->sar_event = event;
2069 	sar->sar_event_info = event_info;
2070 }
2071 
2072 void
2073 idm_sm_audit_state_change(sm_audit_buf_t *audit_buf,
2074     sm_audit_sm_type_t sm_type, int current_state, int new_state)
2075 {
2076 	sm_audit_record_t *sar;
2077 
2078 	sar = idm_sm_audit_common(audit_buf, SAR_STATE_CHANGE,
2079 	    sm_type, current_state);
2080 	sar->sar_new_state = new_state;
2081 }
2082 
2083 
2084 /*
2085  * Object reference tracking
2086  */
2087 
2088 void
2089 idm_refcnt_init(idm_refcnt_t *refcnt, void *referenced_obj)
2090 {
2091 	bzero(refcnt, sizeof (*refcnt));
2092 	idm_refcnt_reset(refcnt);
2093 	refcnt->ir_referenced_obj = referenced_obj;
2094 	bzero(&refcnt->ir_audit_buf, sizeof (refcnt_audit_buf_t));
2095 	refcnt->ir_audit_buf.anb_max_index = REFCNT_AUDIT_BUF_MAX_REC - 1;
2096 	mutex_init(&refcnt->ir_mutex, NULL, MUTEX_DEFAULT, NULL);
2097 	cv_init(&refcnt->ir_cv, NULL, CV_DEFAULT, NULL);
2098 }
2099 
2100 void
2101 idm_refcnt_destroy(idm_refcnt_t *refcnt)
2102 {
2103 	ASSERT(refcnt->ir_refcnt == 0);
2104 	cv_destroy(&refcnt->ir_cv);
2105 	mutex_destroy(&refcnt->ir_mutex);
2106 }
2107 
2108 void
2109 idm_refcnt_reset(idm_refcnt_t *refcnt)
2110 {
2111 	refcnt->ir_waiting = REF_NOWAIT;
2112 	refcnt->ir_refcnt = 0;
2113 }
2114 
2115 void
2116 idm_refcnt_hold(idm_refcnt_t *refcnt)
2117 {
2118 	/*
2119 	 * Nothing should take a hold on an object after a call to
2120 	 * idm_refcnt_wait_ref or idm_refcnd_async_wait_ref
2121 	 */
2122 	ASSERT(refcnt->ir_waiting == REF_NOWAIT);
2123 
2124 	mutex_enter(&refcnt->ir_mutex);
2125 	refcnt->ir_refcnt++;
2126 	REFCNT_AUDIT(refcnt);
2127 	mutex_exit(&refcnt->ir_mutex);
2128 }
2129 
2130 static void
2131 idm_refcnt_unref_task(void *refcnt_void)
2132 {
2133 	idm_refcnt_t *refcnt = refcnt_void;
2134 
2135 	REFCNT_AUDIT(refcnt);
2136 	(*refcnt->ir_cb)(refcnt->ir_referenced_obj);
2137 }
2138 
2139 void
2140 idm_refcnt_rele(idm_refcnt_t *refcnt)
2141 {
2142 	mutex_enter(&refcnt->ir_mutex);
2143 	ASSERT(refcnt->ir_refcnt > 0);
2144 	refcnt->ir_refcnt--;
2145 	REFCNT_AUDIT(refcnt);
2146 	if (refcnt->ir_waiting == REF_NOWAIT) {
2147 		/* No one is waiting on this object */
2148 		mutex_exit(&refcnt->ir_mutex);
2149 		return;
2150 	}
2151 
2152 	/*
2153 	 * Someone is waiting for this object to go idle so check if
2154 	 * refcnt is 0.  Waiting on an object then later grabbing another
2155 	 * reference is not allowed so we don't need to handle that case.
2156 	 */
2157 	if (refcnt->ir_refcnt == 0) {
2158 		if (refcnt->ir_waiting == REF_WAIT_ASYNC) {
2159 			if (taskq_dispatch(idm.idm_global_taskq,
2160 			    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2161 				cmn_err(CE_WARN,
2162 				    "idm_refcnt_rele: Couldn't dispatch task");
2163 			}
2164 		} else if (refcnt->ir_waiting == REF_WAIT_SYNC) {
2165 			cv_signal(&refcnt->ir_cv);
2166 		}
2167 	}
2168 	mutex_exit(&refcnt->ir_mutex);
2169 }
2170 
2171 void
2172 idm_refcnt_rele_and_destroy(idm_refcnt_t *refcnt, idm_refcnt_cb_t *cb_func)
2173 {
2174 	mutex_enter(&refcnt->ir_mutex);
2175 	ASSERT(refcnt->ir_refcnt > 0);
2176 	refcnt->ir_refcnt--;
2177 	REFCNT_AUDIT(refcnt);
2178 
2179 	/*
2180 	 * Someone is waiting for this object to go idle so check if
2181 	 * refcnt is 0.  Waiting on an object then later grabbing another
2182 	 * reference is not allowed so we don't need to handle that case.
2183 	 */
2184 	if (refcnt->ir_refcnt == 0) {
2185 		refcnt->ir_cb = cb_func;
2186 		refcnt->ir_waiting = REF_WAIT_ASYNC;
2187 		if (taskq_dispatch(idm.idm_global_taskq,
2188 		    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2189 			cmn_err(CE_WARN,
2190 			    "idm_refcnt_rele: Couldn't dispatch task");
2191 		}
2192 	}
2193 	mutex_exit(&refcnt->ir_mutex);
2194 }
2195 
2196 void
2197 idm_refcnt_wait_ref(idm_refcnt_t *refcnt)
2198 {
2199 	mutex_enter(&refcnt->ir_mutex);
2200 	refcnt->ir_waiting = REF_WAIT_SYNC;
2201 	REFCNT_AUDIT(refcnt);
2202 	while (refcnt->ir_refcnt != 0)
2203 		cv_wait(&refcnt->ir_cv, &refcnt->ir_mutex);
2204 	mutex_exit(&refcnt->ir_mutex);
2205 }
2206 
2207 void
2208 idm_refcnt_async_wait_ref(idm_refcnt_t *refcnt, idm_refcnt_cb_t *cb_func)
2209 {
2210 	mutex_enter(&refcnt->ir_mutex);
2211 	refcnt->ir_waiting = REF_WAIT_ASYNC;
2212 	refcnt->ir_cb = cb_func;
2213 	REFCNT_AUDIT(refcnt);
2214 	/*
2215 	 * It's possible we don't have any references.  To make things easier
2216 	 * on the caller use a taskq to call the callback instead of
2217 	 * calling it synchronously
2218 	 */
2219 	if (refcnt->ir_refcnt == 0) {
2220 		if (taskq_dispatch(idm.idm_global_taskq,
2221 		    &idm_refcnt_unref_task, refcnt, TQ_SLEEP) == NULL) {
2222 			cmn_err(CE_WARN,
2223 			    "idm_refcnt_async_wait_ref: "
2224 			    "Couldn't dispatch task");
2225 		}
2226 	}
2227 	mutex_exit(&refcnt->ir_mutex);
2228 }
2229 
2230 void
2231 idm_refcnt_destroy_unref_obj(idm_refcnt_t *refcnt,
2232     idm_refcnt_cb_t *cb_func)
2233 {
2234 	mutex_enter(&refcnt->ir_mutex);
2235 	if (refcnt->ir_refcnt == 0) {
2236 		mutex_exit(&refcnt->ir_mutex);
2237 		(*cb_func)(refcnt->ir_referenced_obj);
2238 		return;
2239 	}
2240 	mutex_exit(&refcnt->ir_mutex);
2241 }
2242 
2243 void
2244 idm_conn_hold(idm_conn_t *ic)
2245 {
2246 	idm_refcnt_hold(&ic->ic_refcnt);
2247 }
2248 
2249 void
2250 idm_conn_rele(idm_conn_t *ic)
2251 {
2252 	idm_refcnt_rele(&ic->ic_refcnt);
2253 }
2254 
2255 
2256 static int
2257 _idm_init(void)
2258 {
2259 	/* Initialize the rwlock for the taskid table */
2260 	rw_init(&idm.idm_taskid_table_lock, NULL, RW_DRIVER, NULL);
2261 
2262 	/* Initialize the global mutex and taskq */
2263 	mutex_init(&idm.idm_global_mutex, NULL, MUTEX_DEFAULT, NULL);
2264 
2265 	cv_init(&idm.idm_tgt_svc_cv, NULL, CV_DEFAULT, NULL);
2266 	cv_init(&idm.idm_wd_cv, NULL, CV_DEFAULT, NULL);
2267 
2268 	/*
2269 	 * The maximum allocation needs to be high here since there can be
2270 	 * many concurrent tasks using the global taskq.
2271 	 */
2272 	idm.idm_global_taskq = taskq_create("idm_global_taskq", 1, minclsyspri,
2273 	    128, 16384, TASKQ_PREPOPULATE);
2274 	if (idm.idm_global_taskq == NULL) {
2275 		cv_destroy(&idm.idm_wd_cv);
2276 		cv_destroy(&idm.idm_tgt_svc_cv);
2277 		mutex_destroy(&idm.idm_global_mutex);
2278 		rw_destroy(&idm.idm_taskid_table_lock);
2279 		return (ENOMEM);
2280 	}
2281 
2282 	/* Start watchdog thread */
2283 	idm.idm_wd_thread = thread_create(NULL, 0,
2284 	    idm_wd_thread, NULL, 0, &p0, TS_RUN, minclsyspri);
2285 	if (idm.idm_wd_thread == NULL) {
2286 		/* Couldn't create the watchdog thread */
2287 		taskq_destroy(idm.idm_global_taskq);
2288 		cv_destroy(&idm.idm_wd_cv);
2289 		cv_destroy(&idm.idm_tgt_svc_cv);
2290 		mutex_destroy(&idm.idm_global_mutex);
2291 		rw_destroy(&idm.idm_taskid_table_lock);
2292 		return (ENOMEM);
2293 	}
2294 
2295 	/* Pause until the watchdog thread is running */
2296 	mutex_enter(&idm.idm_global_mutex);
2297 	while (!idm.idm_wd_thread_running)
2298 		cv_wait(&idm.idm_wd_cv, &idm.idm_global_mutex);
2299 	mutex_exit(&idm.idm_global_mutex);
2300 
2301 	/*
2302 	 * Allocate the task ID table and set "next" to 0.
2303 	 */
2304 
2305 	idm.idm_taskid_max = idm_max_taskids;
2306 	idm.idm_taskid_table = (idm_task_t **)
2307 	    kmem_zalloc(idm.idm_taskid_max * sizeof (idm_task_t *), KM_SLEEP);
2308 	idm.idm_taskid_next = 0;
2309 
2310 	/* Create the global buffer and task kmem caches */
2311 	idm.idm_buf_cache = kmem_cache_create("idm_buf_cache",
2312 	    sizeof (idm_buf_t), 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
2313 
2314 	/*
2315 	 * Note, we're explicitly allocating an additional iSER header-
2316 	 * sized chunk for each of these elements. See idm_task_constructor().
2317 	 */
2318 	idm.idm_task_cache = kmem_cache_create("idm_task_cache",
2319 	    sizeof (idm_task_t) + IDM_TRANSPORT_HEADER_LENGTH, 8,
2320 	    &idm_task_constructor, &idm_task_destructor,
2321 	    NULL, NULL, NULL, KM_SLEEP);
2322 
2323 	/* Create the service and connection context lists */
2324 	list_create(&idm.idm_tgt_svc_list, sizeof (idm_svc_t),
2325 	    offsetof(idm_svc_t, is_list_node));
2326 	list_create(&idm.idm_tgt_conn_list, sizeof (idm_conn_t),
2327 	    offsetof(idm_conn_t, ic_list_node));
2328 	list_create(&idm.idm_ini_conn_list, sizeof (idm_conn_t),
2329 	    offsetof(idm_conn_t, ic_list_node));
2330 
2331 	/* Initialize the native sockets transport */
2332 	idm_so_init(&idm_transport_list[IDM_TRANSPORT_TYPE_SOCKETS]);
2333 
2334 	/* Create connection ID pool */
2335 	(void) idm_idpool_create(&idm.idm_conn_id_pool);
2336 
2337 	return (DDI_SUCCESS);
2338 }
2339 
2340 static int
2341 _idm_fini(void)
2342 {
2343 	if (!list_is_empty(&idm.idm_ini_conn_list) ||
2344 	    !list_is_empty(&idm.idm_tgt_conn_list) ||
2345 	    !list_is_empty(&idm.idm_tgt_svc_list)) {
2346 		return (EBUSY);
2347 	}
2348 
2349 	mutex_enter(&idm.idm_global_mutex);
2350 	idm.idm_wd_thread_running = B_FALSE;
2351 	cv_signal(&idm.idm_wd_cv);
2352 	mutex_exit(&idm.idm_global_mutex);
2353 
2354 	thread_join(idm.idm_wd_thread_did);
2355 
2356 	idm_idpool_destroy(&idm.idm_conn_id_pool);
2357 
2358 	/* Close any LDI handles we have open on transport drivers */
2359 	mutex_enter(&idm.idm_global_mutex);
2360 	idm_transport_teardown();
2361 	mutex_exit(&idm.idm_global_mutex);
2362 
2363 	/* Teardown the native sockets transport */
2364 	idm_so_fini();
2365 
2366 	list_destroy(&idm.idm_ini_conn_list);
2367 	list_destroy(&idm.idm_tgt_conn_list);
2368 	list_destroy(&idm.idm_tgt_svc_list);
2369 	kmem_cache_destroy(idm.idm_task_cache);
2370 	kmem_cache_destroy(idm.idm_buf_cache);
2371 	kmem_free(idm.idm_taskid_table,
2372 	    idm.idm_taskid_max * sizeof (idm_task_t *));
2373 	mutex_destroy(&idm.idm_global_mutex);
2374 	cv_destroy(&idm.idm_wd_cv);
2375 	cv_destroy(&idm.idm_tgt_svc_cv);
2376 	rw_destroy(&idm.idm_taskid_table_lock);
2377 
2378 	return (0);
2379 }
2380