xref: /illumos-gate/usr/src/uts/sun4v/io/vnet_rxdring.c (revision 5084e753b79a753c8b532c06eb3ad1d025e8e472)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/sysmacros.h>
29 #include <sys/param.h>
30 #include <sys/machsystm.h>
31 #include <sys/stream.h>
32 #include <sys/strsubr.h>
33 #include <sys/kmem.h>
34 #include <sys/strsun.h>
35 #include <sys/callb.h>
36 #include <sys/sdt.h>
37 #include <sys/ethernet.h>
38 #include <sys/mach_descrip.h>
39 #include <sys/mdeg.h>
40 #include <sys/vnet.h>
41 #include <sys/vio_mailbox.h>
42 #include <sys/vio_common.h>
43 #include <sys/vnet_common.h>
44 #include <sys/vnet_mailbox.h>
45 #include <sys/vio_util.h>
46 #include <sys/vnet_gen.h>
47 
48 /*
49  * This file contains the implementation of RxDringData transfer mode of VIO
50  * Protocol in vnet. The functions in this file are invoked from vnet_gen.c
51  * after RxDringData mode is negotiated with the peer during attribute phase of
52  * handshake. This file contains functions that setup the transmit and receive
53  * descriptor rings, and associated resources in RxDringData mode. It also
54  * contains the transmit and receive data processing functions that are invoked
55  * in RxDringData mode. The data processing routines in this file have the
56  * suffix '_shm' to indicate the shared memory mechanism used in RxDringData
57  * mode.
58  */
59 
60 /* Functions exported to vnet_gen.c */
61 int vgen_create_rx_dring(vgen_ldc_t *ldcp);
62 void vgen_destroy_rx_dring(vgen_ldc_t *ldcp);
63 int vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt);
64 void vgen_unmap_tx_dring(vgen_ldc_t *ldcp);
65 int vgen_map_data(vgen_ldc_t *ldcp, void *pkt);
66 int vgen_dringsend_shm(void *arg, mblk_t *mp);
67 int vgen_handle_dringdata_shm(void *arg1, void *arg2);
68 mblk_t *vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup);
69 int vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
70     uint32_t start, int32_t end, uint8_t pstate);
71 
72 /* Internal functions */
73 static int vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
74 static int vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
75 static int vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
76 static int vgen_intr_rcv_shm(vgen_ldc_t *ldcp);
77 static int vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size);
78 static int vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start,
79     int32_t end);
80 static int vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen);
81 
82 /* Functions imported from vnet_gen.c */
83 extern int vgen_handle_evt_read(vgen_ldc_t *ldcp, vgen_caller_t caller);
84 extern int vgen_handle_evt_reset(vgen_ldc_t *ldcp, vgen_caller_t caller);
85 extern void vgen_handle_pkt_data(void *arg1, void *arg2, uint32_t msglen);
86 extern void vgen_destroy_rxpools(void *arg);
87 
88 /* Tunables */
89 extern uint32_t vnet_num_descriptors;
90 extern uint32_t vgen_chain_len;
91 extern uint32_t vgen_ldcwr_retries;
92 extern uint32_t vgen_recv_delay;
93 extern uint32_t vgen_recv_retries;
94 extern uint32_t vgen_nrbufs_factor;
95 
96 #ifdef DEBUG
97 
98 #define	DEBUG_PRINTF	vgen_debug_printf
99 
100 extern int vnet_dbglevel;
101 extern int vgen_inject_err_flag;
102 
103 extern void vgen_debug_printf(const char *fname, vgen_t *vgenp,
104 	vgen_ldc_t *ldcp, const char *fmt, ...);
105 extern boolean_t vgen_inject_error(vgen_ldc_t *ldcp, int error);
106 
107 #endif
108 
109 /*
110  * Allocate receive resources for the channel. The resources consist of a
111  * receive descriptor ring and an associated receive buffer area.
112  */
113 int
114 vgen_create_rx_dring(vgen_ldc_t *ldcp)
115 {
116 	int 				i, j;
117 	int 				rv;
118 	uint32_t			ncookies;
119 	ldc_mem_info_t			minfo;
120 	vnet_rx_dringdata_desc_t	*rxdp;
121 	size_t				data_sz;
122 	vio_mblk_t			*vmp;
123 	vio_mblk_t			**rxdp_to_vmp;
124 	uint32_t			rxdsize;
125 	caddr_t				datap = NULL;
126 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
127 
128 	rxdsize = sizeof (vnet_rx_dringdata_desc_t);
129 	ldcp->num_rxds = vnet_num_descriptors;
130 	ldcp->num_rbufs = vnet_num_descriptors * vgen_nrbufs_factor;
131 
132 	/* Create the receive descriptor ring */
133 	rv = ldc_mem_dring_create(ldcp->num_rxds, rxdsize,
134 	    &ldcp->rx_dring_handle);
135 	if (rv != 0) {
136 		DWARN(vgenp, ldcp, "ldc_mem_dring_create() failed\n");
137 		goto fail;
138 	}
139 
140 	/* Get the addr of descriptor ring */
141 	rv = ldc_mem_dring_info(ldcp->rx_dring_handle, &minfo);
142 	if (rv != 0) {
143 		DWARN(vgenp, ldcp, "ldc_mem_dring_info() failed\n");
144 		goto fail;
145 	}
146 	ldcp->rxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
147 	bzero(ldcp->rxdp, sizeof (*rxdp) * (ldcp->num_rxds));
148 
149 	/*
150 	 * Allocate a table that maps descriptor to its associated buffer;
151 	 * used while receiving to validate that the peer has not changed the
152 	 * buffer offset provided in the descriptor.
153 	 */
154 	rxdp_to_vmp = kmem_zalloc(ldcp->num_rxds * sizeof (uintptr_t),
155 	    KM_SLEEP);
156 	ldcp->rxdp_to_vmp = rxdp_to_vmp;
157 
158 	/*
159 	 * Allocate a single large buffer that serves as the rx buffer area.
160 	 * We allocate a ldc memory handle and export the buffer area as shared
161 	 * memory. We send the ldc memcookie for this buffer space to the peer,
162 	 * as part of dring registration phase during handshake. We manage this
163 	 * buffer area as individual buffers of max_frame_size and provide
164 	 * specific buffer offsets in each descriptor to the peer. Note that
165 	 * the factor used to compute the # of buffers (above) must be > 1 to
166 	 * ensure that there are more buffers than the # of descriptors. This
167 	 * is needed because, while the shared memory buffers are sent up our
168 	 * stack during receive, the sender needs additional buffers that can
169 	 * be used for further transmits. This also means there is no one to
170 	 * one correspondence between the descriptor index and buffer offset.
171 	 * The sender has to read the buffer offset in the descriptor and use
172 	 * the specified offset to copy the tx data into the shared buffer. We
173 	 * (receiver) manage the individual buffers and their state (see
174 	 * VIO_MBLK_STATEs in vio_util.h).
175 	 */
176 	data_sz = vgenp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
177 	data_sz = VNET_ROUNDUP_2K(data_sz);
178 
179 	ldcp->rx_data_sz = data_sz * ldcp->num_rbufs;
180 	ldcp->rx_dblk_sz = data_sz;
181 	datap = kmem_zalloc(ldcp->rx_data_sz, KM_SLEEP);
182 	ldcp->rx_datap = datap;
183 
184 	/* Allocate a ldc memhandle for the entire rx data area */
185 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->rx_data_handle);
186 	if (rv) {
187 		ldcp->rx_data_handle = 0;
188 		goto fail;
189 	}
190 
191 	/* Allocate memory for the data cookies */
192 	ldcp->rx_data_cookie = kmem_zalloc(VNET_DATA_AREA_COOKIES *
193 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
194 
195 	/*
196 	 * Bind ldc memhandle to the corresponding rx data area.
197 	 */
198 	ncookies = 0;
199 	rv = ldc_mem_bind_handle(ldcp->rx_data_handle, (caddr_t)datap,
200 	    ldcp->rx_data_sz, LDC_DIRECT_MAP, LDC_MEM_W,
201 	    ldcp->rx_data_cookie, &ncookies);
202 	if (rv != 0) {
203 		goto fail;
204 	}
205 	if ((ncookies == 0) || (ncookies > VNET_DATA_AREA_COOKIES)) {
206 		goto fail;
207 	}
208 	ldcp->rx_data_ncookies = ncookies;
209 
210 	for (j = 1; j < ncookies; j++) {
211 		rv = ldc_mem_nextcookie(ldcp->rx_data_handle,
212 		    &(ldcp->rx_data_cookie[j]));
213 		if (rv != 0) {
214 			DERR(vgenp, ldcp, "ldc_mem_nextcookie "
215 			    "failed rv (%d)", rv);
216 			goto fail;
217 		}
218 	}
219 
220 	/*
221 	 * Successful in binding the handle to rx data area. Now setup mblks
222 	 * around each data buffer and setup the descriptors to point to these
223 	 * rx data buffers. We associate each descriptor with a buffer
224 	 * by specifying the buffer offset in the descriptor. When the peer
225 	 * needs to transmit data, this offset is read by the peer to determine
226 	 * the buffer in the mapped buffer area where the data to be
227 	 * transmitted should be copied, for a specific descriptor.
228 	 */
229 	rv = vio_create_mblks(ldcp->num_rbufs, data_sz, (uint8_t *)datap,
230 	    &ldcp->rx_vmp);
231 	if (rv != 0) {
232 		goto fail;
233 	}
234 
235 	for (i = 0; i < ldcp->num_rxds; i++) {
236 		rxdp = &(ldcp->rxdp[i]);
237 		/* allocate an mblk around this data buffer */
238 		vmp = vio_allocb(ldcp->rx_vmp);
239 		ASSERT(vmp != NULL);
240 		rxdp->data_buf_offset = VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN;
241 		rxdp->dstate = VIO_DESC_FREE;
242 		rxdp_to_vmp[i] = vmp;
243 	}
244 
245 	/*
246 	 * The descriptors and the associated buffers are all ready;
247 	 * now bind descriptor ring to the channel.
248 	 */
249 	rv = ldc_mem_dring_bind(ldcp->ldc_handle, ldcp->rx_dring_handle,
250 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
251 	    &ldcp->rx_dring_cookie, &ncookies);
252 	if (rv != 0) {
253 		DWARN(vgenp, ldcp, "ldc_mem_dring_bind failed "
254 		    "rv(%x)\n", rv);
255 		goto fail;
256 	}
257 	ASSERT(ncookies == 1);
258 	ldcp->rx_dring_ncookies = ncookies;
259 
260 	/* initialize rx seqnum and index */
261 	ldcp->next_rxseq = VNET_ISS;
262 	ldcp->next_rxi = 0;
263 
264 	return (VGEN_SUCCESS);
265 
266 fail:
267 	vgen_destroy_rx_dring(ldcp);
268 	return (VGEN_FAILURE);
269 }
270 
271 /*
272  * Free receive resources for the channel.
273  */
274 void
275 vgen_destroy_rx_dring(vgen_ldc_t *ldcp)
276 {
277 	vgen_t	*vgenp = LDC_TO_VGEN(ldcp);
278 
279 	/* We first unbind the descriptor ring */
280 	if (ldcp->rx_dring_ncookies != 0) {
281 		(void) ldc_mem_dring_unbind(ldcp->rx_dring_handle);
282 		ldcp->rx_dring_ncookies = 0;
283 	}
284 
285 	/* Destroy the mblks that are wrapped around the rx data buffers */
286 	if (ldcp->rx_vmp != NULL) {
287 		vio_clobber_pool(ldcp->rx_vmp);
288 		if (vio_destroy_mblks(ldcp->rx_vmp) != 0) {
289 			/*
290 			 * If we can't destroy the rx pool for this channel,
291 			 * dispatch a task to retry and clean up. Note that we
292 			 * don't need to wait for the task to complete. If the
293 			 * vnet device itself gets detached, it will wait for
294 			 * the task to complete implicitly in
295 			 * ddi_taskq_destroy().
296 			 */
297 			(void) ddi_taskq_dispatch(vgenp->rxp_taskq,
298 			    vgen_destroy_rxpools, ldcp->rx_vmp, DDI_SLEEP);
299 		}
300 		ldcp->rx_vmp = NULL;
301 	}
302 
303 	/* Free rx data area cookies */
304 	if (ldcp->rx_data_cookie != NULL) {
305 		kmem_free(ldcp->rx_data_cookie, VNET_DATA_AREA_COOKIES *
306 		    sizeof (ldc_mem_cookie_t));
307 		ldcp->rx_data_cookie = NULL;
308 	}
309 
310 	/* Unbind rx data area memhandle */
311 	if (ldcp->rx_data_ncookies != 0) {
312 		(void) ldc_mem_unbind_handle(ldcp->rx_data_handle);
313 		ldcp->rx_data_ncookies = 0;
314 	}
315 
316 	/* Free rx data area memhandle */
317 	if (ldcp->rx_data_handle != 0) {
318 		(void) ldc_mem_free_handle(ldcp->rx_data_handle);
319 		ldcp->rx_data_handle = 0;
320 	}
321 
322 	/* Now free the rx data area itself */
323 	if (ldcp->rx_datap != NULL) {
324 		/* prealloc'd rx data buffer */
325 		kmem_free(ldcp->rx_datap, ldcp->rx_data_sz);
326 		ldcp->rx_datap = NULL;
327 		ldcp->rx_data_sz = 0;
328 	}
329 
330 	/* Finally, free the receive descriptor ring */
331 	if (ldcp->rx_dring_handle != 0) {
332 		(void) ldc_mem_dring_destroy(ldcp->rx_dring_handle);
333 		ldcp->rx_dring_handle = 0;
334 		ldcp->rxdp = NULL;
335 	}
336 
337 	if (ldcp->rxdp_to_vmp != NULL) {
338 		kmem_free(ldcp->rxdp_to_vmp,
339 		    ldcp->num_rxds * sizeof (uintptr_t));
340 		ldcp->rxdp_to_vmp = NULL;
341 	}
342 
343 	/* Reset rx index and seqnum */
344 	ldcp->next_rxi = 0;
345 	ldcp->next_rxseq = VNET_ISS;
346 }
347 
348 /*
349  * Map the receive descriptor ring exported
350  * by the peer, as our transmit descriptor ring.
351  */
352 int
353 vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt)
354 {
355 	int				i;
356 	int				rv;
357 	ldc_mem_info_t			minfo;
358 	ldc_mem_cookie_t		dcookie;
359 	uint32_t			ncookies;
360 	uint32_t 			num_desc;
361 	uint32_t			desc_size;
362 	vnet_rx_dringdata_desc_t	*txdp;
363 	on_trap_data_t			otd;
364 	vio_dring_reg_msg_t 		*msg = pkt;
365 
366 	ncookies = msg->ncookies;
367 	num_desc = msg->num_descriptors;
368 	desc_size = msg->descriptor_size;
369 
370 	/*
371 	 * Sanity check.
372 	 */
373 	if (num_desc < VGEN_NUM_DESCRIPTORS_MIN ||
374 	    desc_size < sizeof (vnet_rx_dringdata_desc_t) ||
375 	    ncookies > 1) {
376 		goto fail;
377 	}
378 
379 	bcopy(&msg->cookie[0], &dcookie, sizeof (ldc_mem_cookie_t));
380 
381 	/* Map the remote dring */
382 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dcookie, ncookies, num_desc,
383 	    desc_size, LDC_DIRECT_MAP, &(ldcp->tx_dring_handle));
384 	if (rv != 0) {
385 		goto fail;
386 	}
387 
388 	/*
389 	 * Sucessfully mapped; now try to get info about the mapped dring
390 	 */
391 	rv = ldc_mem_dring_info(ldcp->tx_dring_handle, &minfo);
392 	if (rv != 0) {
393 		goto fail;
394 	}
395 
396 	/*
397 	 * Save ring address, number of descriptors.
398 	 */
399 	ldcp->mtxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
400 	bcopy(&dcookie, &(ldcp->tx_dring_cookie), sizeof (dcookie));
401 	ldcp->tx_dring_ncookies = ncookies;
402 	ldcp->num_txds = num_desc;
403 
404 	/* Initialize tx dring indexes and seqnum */
405 	ldcp->next_txi = ldcp->cur_txi = ldcp->resched_peer_txi = 0;
406 	ldcp->next_txseq = VNET_ISS - 1;
407 	ldcp->resched_peer = B_TRUE;
408 	ldcp->dring_mtype = minfo.mtype;
409 	ldcp->dringdata_msgid = 0;
410 
411 	/* Save peer's dring_info values */
412 	bcopy(&dcookie, &(ldcp->peer_hparams.dring_cookie),
413 	    sizeof (ldc_mem_cookie_t));
414 	ldcp->peer_hparams.num_desc = num_desc;
415 	ldcp->peer_hparams.desc_size = desc_size;
416 	ldcp->peer_hparams.dring_ncookies = ncookies;
417 
418 	/* Set dring_ident for the peer */
419 	ldcp->peer_hparams.dring_ident = (uint64_t)ldcp->mtxdp;
420 
421 	/* Return the dring_ident in ack msg */
422 	msg->dring_ident = (uint64_t)ldcp->mtxdp;
423 
424 	/*
425 	 * Mark the descriptor state as 'done'. This is implementation specific
426 	 * and not required by the protocol. In our implementation, we only
427 	 * need the descripor to be in 'done' state to be used by the transmit
428 	 * function and the peer is not aware of it. As the protocol requires
429 	 * that during initial registration the exporting end point mark the
430 	 * dstate as 'free', we change it 'done' here. After this, the dstate
431 	 * in our implementation will keep moving between 'ready', set by our
432 	 * transmit function; and and 'done', set by the peer (per protocol)
433 	 * after receiving data.
434 	 * Setup on_trap() protection before accessing dring shared memory area.
435 	 */
436 	rv = LDC_ON_TRAP(&otd);
437 	if (rv != 0) {
438 		/*
439 		 * Data access fault occured down the code path below while
440 		 * accessing the descriptors. Return failure.
441 		 */
442 		goto fail;
443 	}
444 
445 	for (i = 0; i < num_desc; i++) {
446 		txdp = &ldcp->mtxdp[i];
447 		txdp->dstate = VIO_DESC_DONE;
448 	}
449 
450 	(void) LDC_NO_TRAP();
451 	return (VGEN_SUCCESS);
452 
453 fail:
454 	if (ldcp->tx_dring_handle != 0) {
455 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
456 		ldcp->tx_dring_handle = 0;
457 	}
458 	return (VGEN_FAILURE);
459 }
460 
461 /*
462  * Unmap the transmit descriptor ring.
463  */
464 void
465 vgen_unmap_tx_dring(vgen_ldc_t *ldcp)
466 {
467 	/* Unmap mapped tx data area */
468 	if (ldcp->tx_datap != NULL) {
469 		(void) ldc_mem_unmap(ldcp->tx_data_handle);
470 		ldcp->tx_datap = NULL;
471 	}
472 
473 	/* Free tx data area handle */
474 	if (ldcp->tx_data_handle != 0) {
475 		(void) ldc_mem_free_handle(ldcp->tx_data_handle);
476 		ldcp->tx_data_handle = 0;
477 	}
478 
479 	/* Free tx data area cookies */
480 	if (ldcp->tx_data_cookie != NULL) {
481 		kmem_free(ldcp->tx_data_cookie, ldcp->tx_data_ncookies *
482 		    sizeof (ldc_mem_cookie_t));
483 		ldcp->tx_data_cookie = NULL;
484 		ldcp->tx_data_ncookies = 0;
485 	}
486 
487 	/* Unmap peer's dring */
488 	if (ldcp->tx_dring_handle != 0) {
489 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
490 		ldcp->tx_dring_handle = 0;
491 	}
492 
493 	/* clobber tx ring members */
494 	bzero(&ldcp->tx_dring_cookie, sizeof (ldcp->tx_dring_cookie));
495 	ldcp->mtxdp = NULL;
496 	ldcp->next_txi = ldcp->cur_txi = ldcp->resched_peer_txi = 0;
497 	ldcp->num_txds = 0;
498 	ldcp->next_txseq = VNET_ISS - 1;
499 	ldcp->resched_peer = B_TRUE;
500 }
501 
502 /*
503  * Map the shared memory data buffer area exported by the peer.
504  */
505 int
506 vgen_map_data(vgen_ldc_t *ldcp, void *pkt)
507 {
508 	int			rv;
509 	vio_dring_reg_ext_msg_t	*emsg;
510 	vio_dring_reg_msg_t	*msg = (vio_dring_reg_msg_t *)pkt;
511 	uint8_t			*buf = (uint8_t *)msg->cookie;
512 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
513 
514 	/* skip over dring cookies */
515 	ASSERT(msg->ncookies == 1);
516 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
517 
518 	emsg = (vio_dring_reg_ext_msg_t *)buf;
519 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
520 		return (VGEN_FAILURE);
521 	}
522 
523 	/* save # of data area cookies */
524 	ldcp->tx_data_ncookies = emsg->data_ncookies;
525 
526 	/* save data area size */
527 	ldcp->tx_data_sz = emsg->data_area_size;
528 
529 	/* allocate ldc mem handle for data area */
530 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->tx_data_handle);
531 	if (rv != 0) {
532 		DWARN(vgenp, ldcp, "ldc_mem_alloc_handle() failed: %d\n", rv);
533 		return (VGEN_FAILURE);
534 	}
535 
536 	/* map the data area */
537 	rv = ldc_mem_map(ldcp->tx_data_handle, emsg->data_cookie,
538 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_W,
539 	    (caddr_t *)&ldcp->tx_datap, NULL);
540 	if (rv != 0) {
541 		DWARN(vgenp, ldcp, "ldc_mem_map() failed: %d\n", rv);
542 		(void) ldc_mem_free_handle(ldcp->tx_data_handle);
543 		ldcp->tx_data_handle = 0;
544 		return (VGEN_FAILURE);
545 	}
546 
547 	/* allocate memory for data area cookies */
548 	ldcp->tx_data_cookie = kmem_zalloc(emsg->data_ncookies *
549 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
550 
551 	/* save data area cookies */
552 	bcopy(emsg->data_cookie, ldcp->tx_data_cookie,
553 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
554 
555 	return (VGEN_SUCCESS);
556 }
557 
558 /*
559  * This function transmits normal data frames (non-priority) over the channel.
560  * It queues the frame into the transmit descriptor ring and sends a
561  * VIO_DRING_DATA message if needed, to wake up the peer to (re)start
562  * processing.
563  */
564 int
565 vgen_dringsend_shm(void *arg, mblk_t *mp)
566 {
567 	uint32_t			next_txi;
568 	uint32_t			txi;
569 	vnet_rx_dringdata_desc_t	*txdp;
570 	struct ether_header		*ehp;
571 	size_t				mblksz;
572 	caddr_t				dst;
573 	mblk_t				*bp;
574 	size_t				size;
575 	uint32_t			buf_offset;
576 	on_trap_data_t			otd;
577 	int				rv = 0;
578 	boolean_t			is_bcast = B_FALSE;
579 	boolean_t			is_mcast = B_FALSE;
580 	vgen_ldc_t			*ldcp = (vgen_ldc_t *)arg;
581 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
582 	vgen_stats_t			*statsp = &ldcp->stats;
583 	vgen_hparams_t			*lp = &ldcp->local_hparams;
584 	boolean_t			resched_peer = B_FALSE;
585 	boolean_t			tx_update = B_FALSE;
586 
587 	/* Drop the packet if ldc is not up or handshake is not done */
588 	if (ldcp->ldc_status != LDC_UP) {
589 		DBG2(vgenp, ldcp, "status(%d), dropping packet\n",
590 		    ldcp->ldc_status);
591 		goto dringsend_shm_exit;
592 	}
593 
594 	if (ldcp->hphase != VH_DONE) {
595 		DWARN(vgenp, ldcp, "hphase(%x), dropping packet\n",
596 		    ldcp->hphase);
597 		goto dringsend_shm_exit;
598 	}
599 
600 	size = msgsize(mp);
601 	if (size > (size_t)lp->mtu) {
602 		DWARN(vgenp, ldcp, "invalid size(%d)\n", size);
603 		goto dringsend_shm_exit;
604 	}
605 	if (size < ETHERMIN)
606 		size = ETHERMIN;
607 
608 	ehp = (struct ether_header *)mp->b_rptr;
609 	is_bcast = IS_BROADCAST(ehp);
610 	is_mcast = IS_MULTICAST(ehp);
611 
612 	/*
613 	 * Setup on_trap() protection before accessing shared memory areas
614 	 * (descriptor and data buffer). Note that we enable this protection a
615 	 * little early and turn it off slightly later, than keeping it enabled
616 	 * strictly at the points in code below where the descriptor and data
617 	 * buffer are accessed. This is done for performance reasons:
618 	 * (a) to avoid calling the trap protection code while holding mutex.
619 	 * (b) to avoid multiple on/off steps for descriptor and data accesses.
620 	 */
621 	rv = LDC_ON_TRAP(&otd);
622 	if (rv != 0) {
623 		/*
624 		 * Data access fault occured down the code path below while
625 		 * accessing either the descriptor or the data buffer. Release
626 		 * any locks that we might have acquired in the code below and
627 		 * return failure.
628 		 */
629 		DERR(vgenp, ldcp, "data access fault occured\n");
630 		statsp->oerrors++;
631 		if (mutex_owned(&ldcp->txlock)) {
632 			mutex_exit(&ldcp->txlock);
633 		}
634 		if (mutex_owned(&ldcp->wrlock)) {
635 			mutex_exit(&ldcp->wrlock);
636 		}
637 		goto dringsend_shm_exit;
638 	}
639 
640 	/*
641 	 * Allocate a descriptor
642 	 */
643 	mutex_enter(&ldcp->txlock);
644 	txi = next_txi = ldcp->next_txi;
645 	INCR_TXI(next_txi, ldcp);
646 	txdp = &(ldcp->mtxdp[txi]);
647 	if (txdp->dstate != VIO_DESC_DONE) { /* out of descriptors */
648 		if (ldcp->tx_blocked == B_FALSE) {
649 			ldcp->tx_blocked_lbolt = ddi_get_lbolt();
650 			ldcp->tx_blocked = B_TRUE;
651 		}
652 		statsp->tx_no_desc++;
653 		mutex_exit(&ldcp->txlock);
654 		(void) LDC_NO_TRAP();
655 		return (VGEN_TX_NORESOURCES);
656 	} else {
657 		txdp->dstate = VIO_DESC_INITIALIZING;
658 	}
659 
660 	if (ldcp->tx_blocked == B_TRUE) {
661 		ldcp->tx_blocked = B_FALSE;
662 		tx_update = B_TRUE;
663 	}
664 
665 	/* Update descriptor ring index */
666 	ldcp->next_txi = next_txi;
667 	mutex_exit(&ldcp->txlock);
668 
669 	if (tx_update == B_TRUE) {
670 		vio_net_tx_update_t vtx_update =
671 		    ldcp->portp->vcb.vio_net_tx_update;
672 
673 		vtx_update(ldcp->portp->vhp);
674 	}
675 
676 	/* Ensure load ordering of dstate (above) and data_buf_offset. */
677 	MEMBAR_CONSUMER();
678 
679 	/* Get the offset of the buffer to be used */
680 	buf_offset = txdp->data_buf_offset;
681 
682 	/* Access the buffer using the offset */
683 	dst = (caddr_t)ldcp->tx_datap + buf_offset;
684 
685 	/* Copy data into mapped transmit buffer */
686 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
687 		mblksz = MBLKL(bp);
688 		bcopy(bp->b_rptr, dst, mblksz);
689 		dst += mblksz;
690 	}
691 
692 	/* Set the size of data in the descriptor */
693 	txdp->nbytes = size;
694 
695 	/*
696 	 * Ensure store ordering of nbytes and dstate (below); so that the peer
697 	 * sees the right nbytes value after it checks that the dstate is READY.
698 	 */
699 	MEMBAR_PRODUCER();
700 
701 	mutex_enter(&ldcp->wrlock);
702 
703 	ASSERT(txdp->dstate == VIO_DESC_INITIALIZING);
704 
705 	/* Mark the descriptor ready */
706 	txdp->dstate = VIO_DESC_READY;
707 
708 	/* Check if peer needs wake up (handled below) */
709 	if (ldcp->resched_peer == B_TRUE && ldcp->resched_peer_txi == txi) {
710 		resched_peer = B_TRUE;
711 		ldcp->resched_peer = B_FALSE;
712 	}
713 
714 	/* Update tx stats */
715 	statsp->opackets++;
716 	statsp->obytes += size;
717 	if (is_bcast)
718 		statsp->brdcstxmt++;
719 	else if (is_mcast)
720 		statsp->multixmt++;
721 
722 	mutex_exit(&ldcp->wrlock);
723 
724 	/*
725 	 * We are done accessing shared memory; clear trap protection.
726 	 */
727 	(void) LDC_NO_TRAP();
728 
729 	/*
730 	 * Need to wake up the peer ?
731 	 */
732 	if (resched_peer == B_TRUE) {
733 		rv = vgen_send_dringdata_shm(ldcp, (uint32_t)txi, -1);
734 		if (rv != 0) {
735 			/* error: drop the packet */
736 			DWARN(vgenp, ldcp, "failed sending dringdata msg "
737 			    "rv(%d) len(%d)\n", rv, size);
738 			mutex_enter(&ldcp->wrlock);
739 			statsp->oerrors++;
740 			ldcp->resched_peer = B_TRUE;
741 			mutex_exit(&ldcp->wrlock);
742 		}
743 	}
744 
745 dringsend_shm_exit:
746 	if (rv == ECONNRESET || rv == EACCES) {
747 		(void) vgen_handle_evt_reset(ldcp, VGEN_OTHER);
748 	}
749 	freemsg(mp);
750 	return (VGEN_TX_SUCCESS);
751 }
752 
753 /*
754  * Process dring data messages (info/ack/nack)
755  */
756 int
757 vgen_handle_dringdata_shm(void *arg1, void *arg2)
758 {
759 	vgen_ldc_t	*ldcp = (vgen_ldc_t *)arg1;
760 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)arg2;
761 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
762 	int		rv = 0;
763 
764 	switch (tagp->vio_subtype) {
765 
766 	case VIO_SUBTYPE_INFO:
767 		/*
768 		 * To reduce the locking contention, release the
769 		 * cblock here and re-acquire it once we are done
770 		 * receiving packets.
771 		 */
772 		mutex_exit(&ldcp->cblock);
773 		mutex_enter(&ldcp->rxlock);
774 		rv = vgen_handle_dringdata_info_shm(ldcp, tagp);
775 		mutex_exit(&ldcp->rxlock);
776 		mutex_enter(&ldcp->cblock);
777 		if (rv != 0) {
778 			DWARN(vgenp, ldcp, "handle_data_info failed(%d)\n", rv);
779 		}
780 		break;
781 
782 	case VIO_SUBTYPE_ACK:
783 		rv = vgen_handle_dringdata_ack_shm(ldcp, tagp);
784 		if (rv != 0) {
785 			DWARN(vgenp, ldcp, "handle_data_ack failed(%d)\n", rv);
786 		}
787 		break;
788 
789 	case VIO_SUBTYPE_NACK:
790 		rv = vgen_handle_dringdata_nack_shm(ldcp, tagp);
791 		if (rv != 0) {
792 			DWARN(vgenp, ldcp, "handle_data_nack failed(%d)\n", rv);
793 		}
794 		break;
795 	}
796 
797 	return (rv);
798 }
799 
800 static int
801 vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
802 {
803 	uint32_t	start;
804 	int32_t		end;
805 	int		rv = 0;
806 	vio_dring_msg_t	*dringmsg = (vio_dring_msg_t *)tagp;
807 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
808 	vgen_stats_t	*statsp = &ldcp->stats;
809 
810 	start = dringmsg->start_idx;
811 	end = dringmsg->end_idx;
812 
813 	DBG1(vgenp, ldcp, "INFO: start(%d), end(%d)\n",
814 	    start, end);
815 
816 	if (!(CHECK_RXI(start, ldcp)) ||
817 	    ((end != -1) && !(CHECK_RXI(end, ldcp)))) {
818 		DWARN(vgenp, ldcp, "Invalid Rx start(%d) or end(%d)\n",
819 		    start, end);
820 		/* drop the message if invalid index */
821 		return (0);
822 	}
823 
824 	/* validate dring_ident */
825 	if (dringmsg->dring_ident != ldcp->peer_hparams.dring_ident) {
826 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
827 		    dringmsg->dring_ident);
828 		/* invalid dring_ident, drop the msg */
829 		return (0);
830 	}
831 
832 	statsp->dring_data_msgs_rcvd++;
833 
834 	/*
835 	 * If we are in polling mode, return from here without processing the
836 	 * dring. We will process the dring in the context of polling thread.
837 	 */
838 	if (ldcp->polling_on == B_TRUE) {
839 		return (0);
840 	}
841 
842 	/*
843 	 * Process the dring and receive packets in intr context.
844 	 */
845 	rv = vgen_intr_rcv_shm(ldcp);
846 	if (rv != 0) {
847 		DWARN(vgenp, ldcp, "vgen_intr_rcv_shm() failed\n");
848 	}
849 	return (rv);
850 }
851 
852 /*
853  * Process the rx descriptor ring in the context of interrupt thread
854  * (vgen_ldc_cb() callback) and send the received packets up the stack.
855  */
856 static int
857 vgen_intr_rcv_shm(vgen_ldc_t *ldcp)
858 {
859 	int		rv;
860 	uint32_t	end_ix;
861 	vio_dring_msg_t msg;
862 	uint_t		mblk_sz;
863 	int		count = 0;
864 	int		total_count = 0;
865 	mblk_t		*bp = NULL;
866 	mblk_t		*bpt = NULL;
867 	mblk_t		*mp = NULL;
868 	vio_net_rx_cb_t vrx_cb = ldcp->portp->vcb.vio_net_rx_cb;
869 
870 	ASSERT(MUTEX_HELD(&ldcp->rxlock));
871 
872 	do {
873 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
874 		if (rv != 0) {
875 			if (rv == EINVAL) {
876 				/* Invalid descriptor error; get next */
877 				continue;
878 			}
879 			DTRACE_PROBE1(vgen_intr_nopkts, vgen_ldc_t *, ldcp);
880 			break;
881 		}
882 
883 		/* Build a chain of received packets */
884 		if (bp == NULL) {
885 			/* first pkt */
886 			bp = mp;
887 			bpt = bp;
888 			bpt->b_next = NULL;
889 		} else {
890 			mp->b_next = NULL;
891 			bpt->b_next = mp;
892 			bpt = mp;
893 		}
894 
895 		total_count++;
896 		count++;
897 
898 		/*
899 		 * We are receiving the packets in interrupt context. If we
900 		 * have gathered vgen_chain_len (tunable) # of packets in the
901 		 * chain, send them up. (See vgen_poll_rcv_shm() for receiving
902 		 * in polling thread context).
903 		 */
904 		if (count == vgen_chain_len) {
905 			DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp,
906 			    int, count);
907 			mutex_exit(&ldcp->rxlock);
908 			vrx_cb(ldcp->portp->vhp, bp);
909 			mutex_enter(&ldcp->rxlock);
910 			bp = bpt = NULL;
911 			count = 0;
912 		}
913 
914 		/*
915 		 * Stop further processing if we processed the entire dring
916 		 * once; otherwise continue.
917 		 */
918 	} while (total_count < ldcp->num_rxds);
919 
920 	if (bp != NULL) {
921 		DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp, int, count);
922 		mutex_exit(&ldcp->rxlock);
923 		vrx_cb(ldcp->portp->vhp, bp);
924 		mutex_enter(&ldcp->rxlock);
925 	}
926 
927 	if (ldcp->polling_on == B_FALSE) {
928 		/*
929 		 * We send a stopped message to peer (sender) while we are in
930 		 * intr mode only; allowing the peer to send further data intrs
931 		 * (dring data msgs) to us.
932 		 */
933 		end_ix = ldcp->next_rxi;
934 		DECR_RXI(end_ix, ldcp);
935 		msg.dring_ident = ldcp->peer_hparams.dring_ident;
936 		rv = vgen_send_dringack_shm(ldcp, (vio_msg_tag_t *)&msg,
937 		    VNET_START_IDX_UNSPEC, end_ix, VIO_DP_STOPPED);
938 		return (rv);
939 	}
940 
941 	return (0);
942 }
943 
944 /*
945  * Process the rx descriptor ring in the context of mac polling thread. Receive
946  * packets upto the limit specified by bytes_to_pickup or until there are no
947  * more packets, whichever occurs first. Return the chain of received packets.
948  */
949 mblk_t *
950 vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup)
951 {
952 	uint_t		mblk_sz = 0;
953 	uint_t		sz = 0;
954 	mblk_t		*bp = NULL;
955 	mblk_t		*bpt = NULL;
956 	mblk_t		*mp = NULL;
957 	int		count = 0;
958 	int		rv;
959 
960 	mutex_enter(&ldcp->rxlock);
961 
962 	if (ldcp->hphase != VH_DONE) {
963 		/* Channel is being reset and handshake not complete */
964 		mutex_exit(&ldcp->rxlock);
965 		return (NULL);
966 	}
967 
968 	do {
969 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
970 		if (rv != 0) {
971 			if (rv == EINVAL) {
972 				/* Invalid descriptor error; get next */
973 				continue;
974 			}
975 			DTRACE_PROBE1(vgen_poll_nopkts, vgen_ldc_t *, ldcp);
976 			break;
977 		}
978 
979 		/* Build a chain of received packets */
980 		if (bp == NULL) {
981 			/* first pkt */
982 			bp = mp;
983 			bpt = bp;
984 			bpt->b_next = NULL;
985 		} else {
986 			mp->b_next = NULL;
987 			bpt->b_next = mp;
988 			bpt = mp;
989 		}
990 
991 		/* Compute total size accumulated */
992 		sz += mblk_sz;
993 		count++;
994 
995 		/* Reached the bytes limit; we are done. */
996 		if (sz >= bytes_to_pickup) {
997 			break;
998 		}
999 
1000 	_NOTE(CONSTCOND)
1001 	} while (1);
1002 
1003 	/*
1004 	 * We prepend any high priority packets to the chain of packets; note
1005 	 * that if we are already at the bytes_to_pickup limit, we might
1006 	 * slightly exceed that in such cases. That should be ok, as these pkts
1007 	 * are expected to be small in size and arrive at an interval in the
1008 	 * the order of a few seconds.
1009 	 */
1010 	if (ldcp->rx_pktdata == vgen_handle_pkt_data &&
1011 	    ldcp->rx_pri_head != NULL) {
1012 		ldcp->rx_pri_tail->b_next = bp;
1013 		bp = ldcp->rx_pri_head;
1014 		ldcp->rx_pri_head = ldcp->rx_pri_tail = NULL;
1015 	}
1016 
1017 	mutex_exit(&ldcp->rxlock);
1018 
1019 	DTRACE_PROBE2(vgen_poll_pkts, vgen_ldc_t *, ldcp, int, count);
1020 	DTRACE_PROBE2(vgen_poll_bytes, vgen_ldc_t *, ldcp, uint_t, sz);
1021 	return (bp);
1022 }
1023 
1024 /*
1025  * Process the next index in the rx dring and receive the associated packet.
1026  *
1027  * Returns:
1028  *	bp:	Success: The received packet.
1029  *		Failure: NULL
1030  *      size:	Success: Size of received packet.
1031  *		Failure: 0
1032  *      retval:
1033  *		Success: 0
1034  *		Failure: EAGAIN: Descriptor not ready
1035  *			 EIO:    Descriptor contents invalid.
1036  */
1037 static int
1038 vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size)
1039 {
1040 	uint32_t			rxi;
1041 	vio_mblk_t			*vmp;
1042 	vio_mblk_t			*new_vmp;
1043 	struct ether_header		*ehp;
1044 	vnet_rx_dringdata_desc_t	*rxdp;
1045 	int				err = 0;
1046 	uint32_t			nbytes = 0;
1047 	mblk_t				*mp = NULL;
1048 	mblk_t				*dmp = NULL;
1049 	vgen_stats_t			*statsp = &ldcp->stats;
1050 	vgen_hparams_t			*lp = &ldcp->local_hparams;
1051 
1052 	rxi = ldcp->next_rxi;
1053 	rxdp = &(ldcp->rxdp[rxi]);
1054 	vmp = ldcp->rxdp_to_vmp[rxi];
1055 
1056 	if (rxdp->dstate != VIO_DESC_READY) {
1057 		/*
1058 		 * Descriptor is not ready.
1059 		 */
1060 		DTRACE_PROBE1(vgen_noready_rxds, vgen_ldc_t *, ldcp);
1061 		return (EAGAIN);
1062 	}
1063 
1064 	/*
1065 	 * Ensure load ordering of dstate and nbytes.
1066 	 */
1067 	MEMBAR_CONSUMER();
1068 
1069 	nbytes = rxdp->nbytes;
1070 
1071 	if ((nbytes < ETHERMIN) ||
1072 	    (nbytes > lp->mtu) ||
1073 	    (rxdp->data_buf_offset !=
1074 	    (VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN))) {
1075 		/*
1076 		 * Descriptor contents invalid.
1077 		 */
1078 		statsp->ierrors++;
1079 		rxdp->dstate = VIO_DESC_DONE;
1080 		err = EIO;
1081 		goto done;
1082 	}
1083 
1084 	/*
1085 	 * Now allocate a new buffer for this descriptor before sending up the
1086 	 * buffer being processed. If that fails, stop processing; as we are
1087 	 * out of receive buffers.
1088 	 */
1089 	new_vmp = vio_allocb(ldcp->rx_vmp);
1090 
1091 	/*
1092 	 * Process the current buffer being received.
1093 	 */
1094 	mp = vmp->mp;
1095 
1096 	if (new_vmp == NULL) {
1097 		/*
1098 		 * We failed to get a new mapped buffer that is needed to
1099 		 * refill the descriptor. In that case, leave the current
1100 		 * buffer bound to the descriptor; allocate an mblk dynamically
1101 		 * and copy the contents of the buffer to the mblk. Then send
1102 		 * up this mblk. This way the sender has the same buffer as
1103 		 * before that can be used to send new data.
1104 		 */
1105 		statsp->norcvbuf++;
1106 		dmp = allocb(nbytes + VNET_IPALIGN, BPRI_MED);
1107 		if (dmp == NULL) {
1108 			statsp->ierrors++;
1109 			return (ENOMEM);
1110 		}
1111 		bcopy(mp->b_rptr + VNET_IPALIGN,
1112 		    dmp->b_rptr + VNET_IPALIGN, nbytes);
1113 		mp = dmp;
1114 	} else {
1115 		/* Mark the status of the current rbuf */
1116 		vmp->state = VIO_MBLK_HAS_DATA;
1117 
1118 		/* Set the offset of the new buffer in the descriptor */
1119 		rxdp->data_buf_offset =
1120 		    VIO_MBLK_DATA_OFF(new_vmp) + VNET_IPALIGN;
1121 		ldcp->rxdp_to_vmp[rxi] = new_vmp;
1122 	}
1123 	mp->b_rptr += VNET_IPALIGN;
1124 	mp->b_wptr = mp->b_rptr + nbytes;
1125 
1126 	/*
1127 	 * Ensure store ordering of data_buf_offset and dstate; so that the
1128 	 * peer sees the right data_buf_offset after it checks that the dstate
1129 	 * is DONE.
1130 	 */
1131 	MEMBAR_PRODUCER();
1132 
1133 	/* Now mark the descriptor 'done' */
1134 	rxdp->dstate = VIO_DESC_DONE;
1135 
1136 	/* Update stats */
1137 	statsp->ipackets++;
1138 	statsp->rbytes += rxdp->nbytes;
1139 	ehp = (struct ether_header *)mp->b_rptr;
1140 	if (IS_BROADCAST(ehp))
1141 		statsp->brdcstrcv++;
1142 	else if (IS_MULTICAST(ehp))
1143 		statsp->multircv++;
1144 done:
1145 	/* Update the next index to be processed */
1146 	INCR_RXI(rxi, ldcp);
1147 
1148 	/* Save the new recv index */
1149 	ldcp->next_rxi = rxi;
1150 
1151 	/* Return the packet received */
1152 	*size = nbytes;
1153 	*bp = mp;
1154 	return (err);
1155 }
1156 
1157 static int
1158 vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1159 {
1160 	uint32_t			start;
1161 	int32_t				end;
1162 	uint32_t			txi;
1163 	vgen_stats_t			*statsp;
1164 	vnet_rx_dringdata_desc_t	*txdp;
1165 	on_trap_data_t			otd;
1166 	int				rv = 0;
1167 	boolean_t			ready_txd = B_FALSE;
1168 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1169 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1170 
1171 	start = dringmsg->start_idx;
1172 	end = dringmsg->end_idx;
1173 	statsp = &ldcp->stats;
1174 
1175 	/*
1176 	 * Received an ack for our transmits upto a certain dring index. This
1177 	 * enables us to reclaim descriptors. We also send a new dring data msg
1178 	 * to the peer to restart processing if there are pending transmit pkts.
1179 	 */
1180 	DBG2(vgenp, ldcp, "ACK:  start(%d), end(%d)\n", start, end);
1181 
1182 	/*
1183 	 * In RxDringData mode (v1.6), start index of -1 can be used by the
1184 	 * peer to indicate that it is unspecified. However, the end index
1185 	 * must be set correctly indicating the last descriptor index processed.
1186 	 */
1187 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1188 	    !(CHECK_TXI(end, ldcp))) {
1189 		/* drop the message if invalid index */
1190 		DWARN(vgenp, ldcp, "Invalid Tx ack start(%d) or end(%d)\n",
1191 		    start, end);
1192 		return (rv);
1193 	}
1194 
1195 	/* Validate dring_ident */
1196 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1197 		/* invalid dring_ident, drop the msg */
1198 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1199 		    dringmsg->dring_ident);
1200 		return (rv);
1201 	}
1202 	statsp->dring_data_acks_rcvd++;
1203 
1204 	/*
1205 	 * Clear transmit flow control condition
1206 	 * as some descriptors should be free now.
1207 	 */
1208 	mutex_enter(&ldcp->txlock);
1209 	if (ldcp->tx_blocked == B_TRUE) {
1210 		vio_net_tx_update_t vtx_update =
1211 		    ldcp->portp->vcb.vio_net_tx_update;
1212 
1213 		ldcp->tx_blocked = B_FALSE;
1214 		vtx_update(ldcp->portp->vhp);
1215 	}
1216 	mutex_exit(&ldcp->txlock);
1217 
1218 	if (dringmsg->dring_process_state != VIO_DP_STOPPED) {
1219 		/*
1220 		 * Receiver continued processing
1221 		 * dring after sending us the ack.
1222 		 */
1223 		return (rv);
1224 	}
1225 
1226 	/*
1227 	 * Receiver stopped processing descriptors.
1228 	 */
1229 	statsp->dring_stopped_acks_rcvd++;
1230 
1231 	/*
1232 	 * Setup on_trap() protection before accessing dring shared memory area.
1233 	 */
1234 	rv = LDC_ON_TRAP(&otd);
1235 	if (rv != 0) {
1236 		/*
1237 		 * Data access fault occured down the code path below while
1238 		 * accessing the descriptors. Release any locks that we might
1239 		 * have acquired in the code below and return failure.
1240 		 */
1241 		if (mutex_owned(&ldcp->wrlock)) {
1242 			mutex_exit(&ldcp->wrlock);
1243 		}
1244 		return (ECONNRESET);
1245 	}
1246 
1247 	/*
1248 	 * Determine if there are any pending tx descriptors ready to be
1249 	 * processed by the receiver(peer) and if so, send a message to the
1250 	 * peer to restart receiving.
1251 	 */
1252 	mutex_enter(&ldcp->wrlock);
1253 
1254 	ready_txd = B_FALSE;
1255 	txi = end;
1256 	INCR_TXI(txi, ldcp);
1257 	txdp = &ldcp->mtxdp[txi];
1258 	if (txdp->dstate == VIO_DESC_READY) {
1259 		ready_txd = B_TRUE;
1260 	}
1261 
1262 	/*
1263 	 * We are done accessing shared memory; clear trap protection.
1264 	 */
1265 	(void) LDC_NO_TRAP();
1266 
1267 	if (ready_txd == B_FALSE) {
1268 		/*
1269 		 * No ready tx descriptors. Set the flag to send a message to
1270 		 * the peer when tx descriptors are ready in transmit routine.
1271 		 */
1272 		ldcp->resched_peer = B_TRUE;
1273 		ldcp->resched_peer_txi = txi;
1274 		mutex_exit(&ldcp->wrlock);
1275 		return (rv);
1276 	}
1277 
1278 	/*
1279 	 * We have some tx descriptors ready to be processed by the receiver.
1280 	 * Send a dring data message to the peer to restart processing.
1281 	 */
1282 	ldcp->resched_peer = B_FALSE;
1283 	mutex_exit(&ldcp->wrlock);
1284 	rv = vgen_send_dringdata_shm(ldcp, txi, -1);
1285 	if (rv != VGEN_SUCCESS) {
1286 		mutex_enter(&ldcp->wrlock);
1287 		ldcp->resched_peer = B_TRUE;
1288 		mutex_exit(&ldcp->wrlock);
1289 	}
1290 
1291 	return (rv);
1292 }
1293 
1294 static int
1295 vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1296 {
1297 	uint32_t			start;
1298 	int32_t				end;
1299 	uint32_t			txi;
1300 	vnet_rx_dringdata_desc_t	*txdp;
1301 	on_trap_data_t			otd;
1302 	int				rv = 0;
1303 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1304 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1305 
1306 	DBG1(vgenp, ldcp, "enter\n");
1307 	start = dringmsg->start_idx;
1308 	end = dringmsg->end_idx;
1309 
1310 	/*
1311 	 * Peer sent a NACK msg (to indicate bad descriptors ?). The start and
1312 	 * end correspond to the range of descriptors which are being nack'd.
1313 	 */
1314 	DWARN(vgenp, ldcp, "NACK: start(%d), end(%d)\n", start, end);
1315 
1316 	/*
1317 	 * In RxDringData mode (v1.6), start index of -1 can be used by
1318 	 * the peer to indicate that it is unspecified. However, the end index
1319 	 * must be set correctly indicating the last descriptor index processed.
1320 	 */
1321 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1322 	    !(CHECK_TXI(end, ldcp))) {
1323 		/* drop the message if invalid index */
1324 		DWARN(vgenp, ldcp, "Invalid Tx nack start(%d) or end(%d)\n",
1325 		    start, end);
1326 		return (rv);
1327 	}
1328 
1329 	/* Validate dring_ident */
1330 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1331 		/* invalid dring_ident, drop the msg */
1332 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1333 		    dringmsg->dring_ident);
1334 		return (rv);
1335 	}
1336 
1337 	/*
1338 	 * Setup on_trap() protection before accessing dring shared memory area.
1339 	 */
1340 	rv = LDC_ON_TRAP(&otd);
1341 	if (rv != 0) {
1342 		/*
1343 		 * Data access fault occured down the code path below while
1344 		 * accessing the descriptors. Release any locks that we might
1345 		 * have acquired in the code below and return failure.
1346 		 */
1347 		mutex_exit(&ldcp->txlock);
1348 		return (ECONNRESET);
1349 	}
1350 
1351 	/* We just mark the descrs as free so they can be reused */
1352 	mutex_enter(&ldcp->txlock);
1353 	for (txi = start; txi <= end; ) {
1354 		txdp = &(ldcp->mtxdp[txi]);
1355 		if (txdp->dstate == VIO_DESC_READY)
1356 			txdp->dstate = VIO_DESC_DONE;
1357 		INCR_TXI(txi, ldcp);
1358 	}
1359 
1360 	/*
1361 	 * We are done accessing shared memory; clear trap protection.
1362 	 */
1363 	(void) LDC_NO_TRAP();
1364 
1365 	mutex_exit(&ldcp->txlock);
1366 
1367 	return (rv);
1368 }
1369 
1370 /*
1371  * Send descriptor ring data message to the peer over LDC.
1372  */
1373 static int
1374 vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start, int32_t end)
1375 {
1376 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1377 	vio_dring_msg_t	dringmsg, *msgp = &dringmsg;
1378 	vio_msg_tag_t	*tagp = &msgp->tag;
1379 	vgen_stats_t	*statsp = &ldcp->stats;
1380 	int		rv;
1381 
1382 #ifdef DEBUG
1383 	if (vgen_inject_error(ldcp, VGEN_ERR_TXTIMEOUT)) {
1384 		return (VGEN_SUCCESS);
1385 	}
1386 #endif
1387 	bzero(msgp, sizeof (*msgp));
1388 
1389 	tagp->vio_msgtype = VIO_TYPE_DATA;
1390 	tagp->vio_subtype = VIO_SUBTYPE_INFO;
1391 	tagp->vio_subtype_env = VIO_DRING_DATA;
1392 	tagp->vio_sid = ldcp->local_sid;
1393 
1394 	msgp->dring_ident = ldcp->local_hparams.dring_ident;
1395 	msgp->start_idx = start;
1396 	msgp->end_idx = end;
1397 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1398 
1399 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (dringmsg));
1400 	if (rv != VGEN_SUCCESS) {
1401 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1402 		return (rv);
1403 	}
1404 
1405 	statsp->dring_data_msgs_sent++;
1406 
1407 	DBG2(vgenp, ldcp, "DRING_DATA_SENT \n");
1408 
1409 	return (VGEN_SUCCESS);
1410 }
1411 
1412 /*
1413  * Send dring data ack message.
1414  */
1415 int
1416 vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, uint32_t start,
1417     int32_t end, uint8_t pstate)
1418 {
1419 	int		rv = 0;
1420 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1421 	vio_dring_msg_t	*msgp = (vio_dring_msg_t *)tagp;
1422 	vgen_stats_t	*statsp = &ldcp->stats;
1423 
1424 	tagp->vio_msgtype = VIO_TYPE_DATA;
1425 	tagp->vio_subtype = VIO_SUBTYPE_ACK;
1426 	tagp->vio_subtype_env = VIO_DRING_DATA;
1427 	tagp->vio_sid = ldcp->local_sid;
1428 	msgp->start_idx = start;
1429 	msgp->end_idx = end;
1430 	msgp->dring_process_state = pstate;
1431 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1432 
1433 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (*msgp));
1434 	if (rv != VGEN_SUCCESS) {
1435 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1436 	}
1437 
1438 	statsp->dring_data_acks_sent++;
1439 	if (pstate == VIO_DP_STOPPED) {
1440 		statsp->dring_stopped_acks_sent++;
1441 	}
1442 
1443 	return (rv);
1444 }
1445 
1446 /*
1447  * Send dring data msgs (info/ack/nack) over LDC.
1448  */
1449 static int
1450 vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen)
1451 {
1452 	int			rv;
1453 	size_t			len;
1454 	uint32_t		retries = 0;
1455 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
1456 
1457 	len = msglen;
1458 	if ((len == 0) || (msg == NULL))
1459 		return (VGEN_FAILURE);
1460 
1461 	do {
1462 		len = msglen;
1463 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msg, &len);
1464 		if (retries++ >= vgen_ldcwr_retries)
1465 			break;
1466 	} while (rv == EWOULDBLOCK);
1467 
1468 	if (rv != 0) {
1469 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen(%d)\n",
1470 		    rv, msglen);
1471 		return (rv);
1472 	}
1473 
1474 	if (len != msglen) {
1475 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen (%d)\n",
1476 		    rv, msglen);
1477 		return (VGEN_FAILURE);
1478 	}
1479 
1480 	return (VGEN_SUCCESS);
1481 }
1482