xref: /illumos-gate/usr/src/uts/sun4v/io/vnet_rxdring.c (revision 4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/sysmacros.h>
29 #include <sys/param.h>
30 #include <sys/machsystm.h>
31 #include <sys/stream.h>
32 #include <sys/strsubr.h>
33 #include <sys/kmem.h>
34 #include <sys/strsun.h>
35 #include <sys/callb.h>
36 #include <sys/sdt.h>
37 #include <sys/ethernet.h>
38 #include <sys/mach_descrip.h>
39 #include <sys/mdeg.h>
40 #include <sys/vnet.h>
41 #include <sys/vio_mailbox.h>
42 #include <sys/vio_common.h>
43 #include <sys/vnet_common.h>
44 #include <sys/vnet_mailbox.h>
45 #include <sys/vio_util.h>
46 #include <sys/vnet_gen.h>
47 
48 /*
49  * This file contains the implementation of RxDringData transfer mode of VIO
50  * Protocol in vnet. The functions in this file are invoked from vnet_gen.c
51  * after RxDringData mode is negotiated with the peer during attribute phase of
52  * handshake. This file contains functions that setup the transmit and receive
53  * descriptor rings, and associated resources in RxDringData mode. It also
54  * contains the transmit and receive data processing functions that are invoked
55  * in RxDringData mode. The data processing routines in this file have the
56  * suffix '_shm' to indicate the shared memory mechanism used in RxDringData
57  * mode.
58  */
59 
60 /* Functions exported to vnet_gen.c */
61 int vgen_create_rx_dring(vgen_ldc_t *ldcp);
62 void vgen_destroy_rx_dring(vgen_ldc_t *ldcp);
63 int vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt);
64 void vgen_unmap_tx_dring(vgen_ldc_t *ldcp);
65 int vgen_map_data(vgen_ldc_t *ldcp, void *pkt);
66 int vgen_dringsend_shm(void *arg, mblk_t *mp);
67 int vgen_handle_dringdata_shm(void *arg1, void *arg2);
68 mblk_t *vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup);
69 int vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
70     uint32_t start, int32_t end, uint8_t pstate);
71 
72 /* Internal functions */
73 static int vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
74 static int vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
75 static int vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
76 static int vgen_intr_rcv_shm(vgen_ldc_t *ldcp);
77 static int vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size);
78 static int vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start,
79     int32_t end);
80 static int vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen);
81 
82 /* Functions imported from vnet_gen.c */
83 extern int vgen_handle_evt_read(vgen_ldc_t *ldcp, vgen_caller_t caller);
84 extern int vgen_handle_evt_reset(vgen_ldc_t *ldcp, vgen_caller_t caller);
85 extern void vgen_handle_pkt_data(void *arg1, void *arg2, uint32_t msglen);
86 extern void vgen_destroy_rxpools(void *arg);
87 
88 /* Tunables */
89 extern uint32_t vnet_num_descriptors;
90 extern uint32_t vgen_chain_len;
91 extern uint32_t vgen_ldcwr_retries;
92 extern uint32_t vgen_recv_delay;
93 extern uint32_t vgen_recv_retries;
94 extern uint32_t vgen_nrbufs_factor;
95 
96 #ifdef DEBUG
97 
98 #define	DEBUG_PRINTF	vgen_debug_printf
99 
100 extern int vnet_dbglevel;
101 extern int vgen_inject_err_flag;
102 
103 extern void vgen_debug_printf(const char *fname, vgen_t *vgenp,
104 	vgen_ldc_t *ldcp, const char *fmt, ...);
105 extern boolean_t vgen_inject_error(vgen_ldc_t *ldcp, int error);
106 
107 #endif
108 
109 /*
110  * Allocate receive resources for the channel. The resources consist of a
111  * receive descriptor ring and an associated receive buffer area.
112  */
113 int
114 vgen_create_rx_dring(vgen_ldc_t *ldcp)
115 {
116 	int 				i, j;
117 	int 				rv;
118 	uint32_t			ncookies;
119 	ldc_mem_info_t			minfo;
120 	vnet_rx_dringdata_desc_t	*rxdp;
121 	size_t				data_sz;
122 	vio_mblk_t			*vmp;
123 	vio_mblk_t			**rxdp_to_vmp;
124 	uint32_t			rxdsize;
125 	caddr_t				datap = NULL;
126 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
127 
128 	rxdsize = sizeof (vnet_rx_dringdata_desc_t);
129 	ldcp->num_rxds = vnet_num_descriptors;
130 	ldcp->num_rbufs = VGEN_RXDRING_NRBUFS;
131 
132 	/* Create the receive descriptor ring */
133 	rv = ldc_mem_dring_create(ldcp->num_rxds, rxdsize,
134 	    &ldcp->rx_dring_handle);
135 	if (rv != 0) {
136 		DWARN(vgenp, ldcp, "ldc_mem_dring_create() failed\n");
137 		goto fail;
138 	}
139 
140 	/* Get the addr of descriptor ring */
141 	rv = ldc_mem_dring_info(ldcp->rx_dring_handle, &minfo);
142 	if (rv != 0) {
143 		DWARN(vgenp, ldcp, "ldc_mem_dring_info() failed\n");
144 		goto fail;
145 	}
146 	ldcp->rxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
147 	bzero(ldcp->rxdp, sizeof (*rxdp) * (ldcp->num_rxds));
148 
149 	/*
150 	 * Allocate a table that maps descriptor to its associated buffer;
151 	 * used while receiving to validate that the peer has not changed the
152 	 * buffer offset provided in the descriptor.
153 	 */
154 	rxdp_to_vmp = kmem_zalloc(ldcp->num_rxds * sizeof (uintptr_t),
155 	    KM_SLEEP);
156 	ldcp->rxdp_to_vmp = rxdp_to_vmp;
157 
158 	/*
159 	 * Allocate a single large buffer that serves as the rx buffer area.
160 	 * We allocate a ldc memory handle and export the buffer area as shared
161 	 * memory. We send the ldc memcookie for this buffer space to the peer,
162 	 * as part of dring registration phase during handshake. We manage this
163 	 * buffer area as individual buffers of max_frame_size and provide
164 	 * specific buffer offsets in each descriptor to the peer. Note that
165 	 * the factor used to compute the # of buffers (above) must be > 1 to
166 	 * ensure that there are more buffers than the # of descriptors. This
167 	 * is needed because, while the shared memory buffers are sent up our
168 	 * stack during receive, the sender needs additional buffers that can
169 	 * be used for further transmits. This also means there is no one to
170 	 * one correspondence between the descriptor index and buffer offset.
171 	 * The sender has to read the buffer offset in the descriptor and use
172 	 * the specified offset to copy the tx data into the shared buffer. We
173 	 * (receiver) manage the individual buffers and their state (see
174 	 * VIO_MBLK_STATEs in vio_util.h).
175 	 */
176 	data_sz = RXDRING_DBLK_SZ(vgenp->max_frame_size);
177 
178 	ldcp->rx_data_sz = data_sz * ldcp->num_rbufs;
179 	ldcp->rx_dblk_sz = data_sz;
180 	datap = kmem_zalloc(ldcp->rx_data_sz, KM_SLEEP);
181 	ldcp->rx_datap = datap;
182 
183 	/* Allocate a ldc memhandle for the entire rx data area */
184 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->rx_data_handle);
185 	if (rv) {
186 		ldcp->rx_data_handle = 0;
187 		goto fail;
188 	}
189 
190 	/* Allocate memory for the data cookies */
191 	ldcp->rx_data_cookie = kmem_zalloc(VNET_DATA_AREA_COOKIES *
192 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
193 
194 	/*
195 	 * Bind ldc memhandle to the corresponding rx data area.
196 	 */
197 	ncookies = 0;
198 	rv = ldc_mem_bind_handle(ldcp->rx_data_handle, (caddr_t)datap,
199 	    ldcp->rx_data_sz, LDC_DIRECT_MAP, LDC_MEM_W,
200 	    ldcp->rx_data_cookie, &ncookies);
201 	if (rv != 0) {
202 		goto fail;
203 	}
204 	if ((ncookies == 0) || (ncookies > VNET_DATA_AREA_COOKIES)) {
205 		goto fail;
206 	}
207 	ldcp->rx_data_ncookies = ncookies;
208 
209 	for (j = 1; j < ncookies; j++) {
210 		rv = ldc_mem_nextcookie(ldcp->rx_data_handle,
211 		    &(ldcp->rx_data_cookie[j]));
212 		if (rv != 0) {
213 			DERR(vgenp, ldcp, "ldc_mem_nextcookie "
214 			    "failed rv (%d)", rv);
215 			goto fail;
216 		}
217 	}
218 
219 	/*
220 	 * Successful in binding the handle to rx data area. Now setup mblks
221 	 * around each data buffer and setup the descriptors to point to these
222 	 * rx data buffers. We associate each descriptor with a buffer
223 	 * by specifying the buffer offset in the descriptor. When the peer
224 	 * needs to transmit data, this offset is read by the peer to determine
225 	 * the buffer in the mapped buffer area where the data to be
226 	 * transmitted should be copied, for a specific descriptor.
227 	 */
228 	rv = vio_create_mblks(ldcp->num_rbufs, data_sz, (uint8_t *)datap,
229 	    &ldcp->rx_vmp);
230 	if (rv != 0) {
231 		goto fail;
232 	}
233 
234 	for (i = 0; i < ldcp->num_rxds; i++) {
235 		rxdp = &(ldcp->rxdp[i]);
236 		/* allocate an mblk around this data buffer */
237 		vmp = vio_allocb(ldcp->rx_vmp);
238 		ASSERT(vmp != NULL);
239 		rxdp->data_buf_offset = VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN;
240 		rxdp->dstate = VIO_DESC_FREE;
241 		rxdp_to_vmp[i] = vmp;
242 	}
243 
244 	/*
245 	 * The descriptors and the associated buffers are all ready;
246 	 * now bind descriptor ring to the channel.
247 	 */
248 	rv = ldc_mem_dring_bind(ldcp->ldc_handle, ldcp->rx_dring_handle,
249 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
250 	    &ldcp->rx_dring_cookie, &ncookies);
251 	if (rv != 0) {
252 		DWARN(vgenp, ldcp, "ldc_mem_dring_bind failed "
253 		    "rv(%x)\n", rv);
254 		goto fail;
255 	}
256 	ASSERT(ncookies == 1);
257 	ldcp->rx_dring_ncookies = ncookies;
258 
259 	/* initialize rx seqnum and index */
260 	ldcp->next_rxseq = VNET_ISS;
261 	ldcp->next_rxi = 0;
262 
263 	return (VGEN_SUCCESS);
264 
265 fail:
266 	vgen_destroy_rx_dring(ldcp);
267 	return (VGEN_FAILURE);
268 }
269 
270 /*
271  * Free receive resources for the channel.
272  */
273 void
274 vgen_destroy_rx_dring(vgen_ldc_t *ldcp)
275 {
276 	vgen_t	*vgenp = LDC_TO_VGEN(ldcp);
277 
278 	/* We first unbind the descriptor ring */
279 	if (ldcp->rx_dring_ncookies != 0) {
280 		(void) ldc_mem_dring_unbind(ldcp->rx_dring_handle);
281 		ldcp->rx_dring_ncookies = 0;
282 	}
283 
284 	/* Destroy the mblks that are wrapped around the rx data buffers */
285 	if (ldcp->rx_vmp != NULL) {
286 		vio_clobber_pool(ldcp->rx_vmp);
287 		if (vio_destroy_mblks(ldcp->rx_vmp) != 0) {
288 			/*
289 			 * If we can't destroy the rx pool for this channel,
290 			 * dispatch a task to retry and clean up. Note that we
291 			 * don't need to wait for the task to complete. If the
292 			 * vnet device itself gets detached, it will wait for
293 			 * the task to complete implicitly in
294 			 * ddi_taskq_destroy().
295 			 */
296 			(void) ddi_taskq_dispatch(vgenp->rxp_taskq,
297 			    vgen_destroy_rxpools, ldcp->rx_vmp, DDI_SLEEP);
298 		}
299 		ldcp->rx_vmp = NULL;
300 	}
301 
302 	/* Free rx data area cookies */
303 	if (ldcp->rx_data_cookie != NULL) {
304 		kmem_free(ldcp->rx_data_cookie, VNET_DATA_AREA_COOKIES *
305 		    sizeof (ldc_mem_cookie_t));
306 		ldcp->rx_data_cookie = NULL;
307 	}
308 
309 	/* Unbind rx data area memhandle */
310 	if (ldcp->rx_data_ncookies != 0) {
311 		(void) ldc_mem_unbind_handle(ldcp->rx_data_handle);
312 		ldcp->rx_data_ncookies = 0;
313 	}
314 
315 	/* Free rx data area memhandle */
316 	if (ldcp->rx_data_handle != 0) {
317 		(void) ldc_mem_free_handle(ldcp->rx_data_handle);
318 		ldcp->rx_data_handle = 0;
319 	}
320 
321 	/* Now free the rx data area itself */
322 	if (ldcp->rx_datap != NULL) {
323 		/* prealloc'd rx data buffer */
324 		kmem_free(ldcp->rx_datap, ldcp->rx_data_sz);
325 		ldcp->rx_datap = NULL;
326 		ldcp->rx_data_sz = 0;
327 	}
328 
329 	/* Finally, free the receive descriptor ring */
330 	if (ldcp->rx_dring_handle != 0) {
331 		(void) ldc_mem_dring_destroy(ldcp->rx_dring_handle);
332 		ldcp->rx_dring_handle = 0;
333 		ldcp->rxdp = NULL;
334 	}
335 
336 	if (ldcp->rxdp_to_vmp != NULL) {
337 		kmem_free(ldcp->rxdp_to_vmp,
338 		    ldcp->num_rxds * sizeof (uintptr_t));
339 		ldcp->rxdp_to_vmp = NULL;
340 	}
341 
342 	/* Reset rx index and seqnum */
343 	ldcp->next_rxi = 0;
344 	ldcp->next_rxseq = VNET_ISS;
345 }
346 
347 /*
348  * Map the receive descriptor ring exported
349  * by the peer, as our transmit descriptor ring.
350  */
351 int
352 vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt)
353 {
354 	int				i;
355 	int				rv;
356 	ldc_mem_info_t			minfo;
357 	ldc_mem_cookie_t		dcookie;
358 	uint32_t			ncookies;
359 	uint32_t 			num_desc;
360 	uint32_t			desc_size;
361 	vnet_rx_dringdata_desc_t	*txdp;
362 	on_trap_data_t			otd;
363 	vio_dring_reg_msg_t 		*msg = pkt;
364 
365 	ncookies = msg->ncookies;
366 	num_desc = msg->num_descriptors;
367 	desc_size = msg->descriptor_size;
368 
369 	/*
370 	 * Sanity check.
371 	 */
372 	if (num_desc < VGEN_NUM_DESCRIPTORS_MIN ||
373 	    desc_size < sizeof (vnet_rx_dringdata_desc_t) ||
374 	    ncookies > 1) {
375 		goto fail;
376 	}
377 
378 	bcopy(&msg->cookie[0], &dcookie, sizeof (ldc_mem_cookie_t));
379 
380 	/* Map the remote dring */
381 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dcookie, ncookies, num_desc,
382 	    desc_size, LDC_DIRECT_MAP, &(ldcp->tx_dring_handle));
383 	if (rv != 0) {
384 		goto fail;
385 	}
386 
387 	/*
388 	 * Sucessfully mapped; now try to get info about the mapped dring
389 	 */
390 	rv = ldc_mem_dring_info(ldcp->tx_dring_handle, &minfo);
391 	if (rv != 0) {
392 		goto fail;
393 	}
394 
395 	/*
396 	 * Save ring address, number of descriptors.
397 	 */
398 	ldcp->mtxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
399 	bcopy(&dcookie, &(ldcp->tx_dring_cookie), sizeof (dcookie));
400 	ldcp->tx_dring_ncookies = ncookies;
401 	ldcp->num_txds = num_desc;
402 
403 	/* Initialize tx dring indexes and seqnum */
404 	ldcp->next_txi = ldcp->cur_txi = ldcp->resched_peer_txi = 0;
405 	ldcp->next_txseq = VNET_ISS - 1;
406 	ldcp->resched_peer = B_TRUE;
407 	ldcp->dring_mtype = minfo.mtype;
408 	ldcp->dringdata_msgid = 0;
409 
410 	/* Save peer's dring_info values */
411 	bcopy(&dcookie, &(ldcp->peer_hparams.dring_cookie),
412 	    sizeof (ldc_mem_cookie_t));
413 	ldcp->peer_hparams.num_desc = num_desc;
414 	ldcp->peer_hparams.desc_size = desc_size;
415 	ldcp->peer_hparams.dring_ncookies = ncookies;
416 
417 	/* Set dring_ident for the peer */
418 	ldcp->peer_hparams.dring_ident = (uint64_t)ldcp->mtxdp;
419 
420 	/* Return the dring_ident in ack msg */
421 	msg->dring_ident = (uint64_t)ldcp->mtxdp;
422 
423 	/*
424 	 * Mark the descriptor state as 'done'. This is implementation specific
425 	 * and not required by the protocol. In our implementation, we only
426 	 * need the descripor to be in 'done' state to be used by the transmit
427 	 * function and the peer is not aware of it. As the protocol requires
428 	 * that during initial registration the exporting end point mark the
429 	 * dstate as 'free', we change it 'done' here. After this, the dstate
430 	 * in our implementation will keep moving between 'ready', set by our
431 	 * transmit function; and and 'done', set by the peer (per protocol)
432 	 * after receiving data.
433 	 * Setup on_trap() protection before accessing dring shared memory area.
434 	 */
435 	rv = LDC_ON_TRAP(&otd);
436 	if (rv != 0) {
437 		/*
438 		 * Data access fault occured down the code path below while
439 		 * accessing the descriptors. Return failure.
440 		 */
441 		goto fail;
442 	}
443 
444 	for (i = 0; i < num_desc; i++) {
445 		txdp = &ldcp->mtxdp[i];
446 		txdp->dstate = VIO_DESC_DONE;
447 	}
448 
449 	(void) LDC_NO_TRAP();
450 	return (VGEN_SUCCESS);
451 
452 fail:
453 	if (ldcp->tx_dring_handle != 0) {
454 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
455 		ldcp->tx_dring_handle = 0;
456 	}
457 	return (VGEN_FAILURE);
458 }
459 
460 /*
461  * Unmap the transmit descriptor ring.
462  */
463 void
464 vgen_unmap_tx_dring(vgen_ldc_t *ldcp)
465 {
466 	/* Unmap mapped tx data area */
467 	if (ldcp->tx_datap != NULL) {
468 		(void) ldc_mem_unmap(ldcp->tx_data_handle);
469 		ldcp->tx_datap = NULL;
470 	}
471 
472 	/* Free tx data area handle */
473 	if (ldcp->tx_data_handle != 0) {
474 		(void) ldc_mem_free_handle(ldcp->tx_data_handle);
475 		ldcp->tx_data_handle = 0;
476 	}
477 
478 	/* Free tx data area cookies */
479 	if (ldcp->tx_data_cookie != NULL) {
480 		kmem_free(ldcp->tx_data_cookie, ldcp->tx_data_ncookies *
481 		    sizeof (ldc_mem_cookie_t));
482 		ldcp->tx_data_cookie = NULL;
483 		ldcp->tx_data_ncookies = 0;
484 	}
485 
486 	/* Unmap peer's dring */
487 	if (ldcp->tx_dring_handle != 0) {
488 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
489 		ldcp->tx_dring_handle = 0;
490 	}
491 
492 	/* clobber tx ring members */
493 	bzero(&ldcp->tx_dring_cookie, sizeof (ldcp->tx_dring_cookie));
494 	ldcp->mtxdp = NULL;
495 	ldcp->next_txi = ldcp->cur_txi = ldcp->resched_peer_txi = 0;
496 	ldcp->num_txds = 0;
497 	ldcp->next_txseq = VNET_ISS - 1;
498 	ldcp->resched_peer = B_TRUE;
499 }
500 
501 /*
502  * Map the shared memory data buffer area exported by the peer.
503  */
504 int
505 vgen_map_data(vgen_ldc_t *ldcp, void *pkt)
506 {
507 	int			rv;
508 	vio_dring_reg_ext_msg_t	*emsg;
509 	vio_dring_reg_msg_t	*msg = (vio_dring_reg_msg_t *)pkt;
510 	uint8_t			*buf = (uint8_t *)msg->cookie;
511 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
512 	ldc_mem_info_t		minfo;
513 
514 	/* skip over dring cookies */
515 	ASSERT(msg->ncookies == 1);
516 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
517 
518 	emsg = (vio_dring_reg_ext_msg_t *)buf;
519 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
520 		return (VGEN_FAILURE);
521 	}
522 
523 	/* save # of data area cookies */
524 	ldcp->tx_data_ncookies = emsg->data_ncookies;
525 
526 	/* save data area size */
527 	ldcp->tx_data_sz = emsg->data_area_size;
528 
529 	/* allocate ldc mem handle for data area */
530 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->tx_data_handle);
531 	if (rv != 0) {
532 		DWARN(vgenp, ldcp, "ldc_mem_alloc_handle() failed: %d\n", rv);
533 		return (VGEN_FAILURE);
534 	}
535 
536 	/* map the data area */
537 	rv = ldc_mem_map(ldcp->tx_data_handle, emsg->data_cookie,
538 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_W,
539 	    (caddr_t *)&ldcp->tx_datap, NULL);
540 	if (rv != 0) {
541 		DWARN(vgenp, ldcp, "ldc_mem_map() failed: %d\n", rv);
542 		return (VGEN_FAILURE);
543 	}
544 
545 	/* get the map info */
546 	rv = ldc_mem_info(ldcp->tx_data_handle, &minfo);
547 	if (rv != 0) {
548 		DWARN(vgenp, ldcp, "ldc_mem_info() failed: %d\n", rv);
549 		return (VGEN_FAILURE);
550 	}
551 
552 	if (minfo.mtype != LDC_DIRECT_MAP) {
553 		DWARN(vgenp, ldcp, "mtype(%d) is not direct map\n",
554 		    minfo.mtype);
555 		return (VGEN_FAILURE);
556 	}
557 
558 	/* allocate memory for data area cookies */
559 	ldcp->tx_data_cookie = kmem_zalloc(emsg->data_ncookies *
560 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
561 
562 	/* save data area cookies */
563 	bcopy(emsg->data_cookie, ldcp->tx_data_cookie,
564 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
565 
566 	return (VGEN_SUCCESS);
567 }
568 
569 /*
570  * This function transmits normal data frames (non-priority) over the channel.
571  * It queues the frame into the transmit descriptor ring and sends a
572  * VIO_DRING_DATA message if needed, to wake up the peer to (re)start
573  * processing.
574  */
575 int
576 vgen_dringsend_shm(void *arg, mblk_t *mp)
577 {
578 	uint32_t			next_txi;
579 	uint32_t			txi;
580 	vnet_rx_dringdata_desc_t	*txdp;
581 	struct ether_header		*ehp;
582 	size_t				mblksz;
583 	caddr_t				dst;
584 	mblk_t				*bp;
585 	size_t				size;
586 	uint32_t			buf_offset;
587 	on_trap_data_t			otd;
588 	int				rv = 0;
589 	boolean_t			is_bcast = B_FALSE;
590 	boolean_t			is_mcast = B_FALSE;
591 	vgen_ldc_t			*ldcp = (vgen_ldc_t *)arg;
592 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
593 	vgen_stats_t			*statsp = &ldcp->stats;
594 	vgen_hparams_t			*lp = &ldcp->local_hparams;
595 	boolean_t			resched_peer = B_FALSE;
596 	boolean_t			tx_update = B_FALSE;
597 
598 	/* Drop the packet if ldc is not up or handshake is not done */
599 	if (ldcp->ldc_status != LDC_UP) {
600 		DBG2(vgenp, ldcp, "status(%d), dropping packet\n",
601 		    ldcp->ldc_status);
602 		goto dringsend_shm_exit;
603 	}
604 
605 	if (ldcp->hphase != VH_DONE) {
606 		DWARN(vgenp, ldcp, "hphase(%x), dropping packet\n",
607 		    ldcp->hphase);
608 		goto dringsend_shm_exit;
609 	}
610 
611 	size = msgsize(mp);
612 	if (size > (size_t)lp->mtu) {
613 		DWARN(vgenp, ldcp, "invalid size(%d)\n", size);
614 		goto dringsend_shm_exit;
615 	}
616 	if (size < ETHERMIN)
617 		size = ETHERMIN;
618 
619 	ehp = (struct ether_header *)mp->b_rptr;
620 	is_bcast = IS_BROADCAST(ehp);
621 	is_mcast = IS_MULTICAST(ehp);
622 
623 	/*
624 	 * Setup on_trap() protection before accessing shared memory areas
625 	 * (descriptor and data buffer). Note that we enable this protection a
626 	 * little early and turn it off slightly later, than keeping it enabled
627 	 * strictly at the points in code below where the descriptor and data
628 	 * buffer are accessed. This is done for performance reasons:
629 	 * (a) to avoid calling the trap protection code while holding mutex.
630 	 * (b) to avoid multiple on/off steps for descriptor and data accesses.
631 	 */
632 	rv = LDC_ON_TRAP(&otd);
633 	if (rv != 0) {
634 		/*
635 		 * Data access fault occured down the code path below while
636 		 * accessing either the descriptor or the data buffer. Release
637 		 * any locks that we might have acquired in the code below and
638 		 * return failure.
639 		 */
640 		DERR(vgenp, ldcp, "data access fault occured\n");
641 		statsp->oerrors++;
642 		if (mutex_owned(&ldcp->txlock)) {
643 			mutex_exit(&ldcp->txlock);
644 		}
645 		if (mutex_owned(&ldcp->wrlock)) {
646 			mutex_exit(&ldcp->wrlock);
647 		}
648 		goto dringsend_shm_exit;
649 	}
650 
651 	/*
652 	 * Allocate a descriptor
653 	 */
654 	mutex_enter(&ldcp->txlock);
655 	txi = next_txi = ldcp->next_txi;
656 	INCR_TXI(next_txi, ldcp);
657 	txdp = &(ldcp->mtxdp[txi]);
658 	if (txdp->dstate != VIO_DESC_DONE) { /* out of descriptors */
659 		if (ldcp->tx_blocked == B_FALSE) {
660 			ldcp->tx_blocked_lbolt = ddi_get_lbolt();
661 			ldcp->tx_blocked = B_TRUE;
662 		}
663 		statsp->tx_no_desc++;
664 		mutex_exit(&ldcp->txlock);
665 		(void) LDC_NO_TRAP();
666 		return (VGEN_TX_NORESOURCES);
667 	} else {
668 		txdp->dstate = VIO_DESC_INITIALIZING;
669 	}
670 
671 	if (ldcp->tx_blocked == B_TRUE) {
672 		ldcp->tx_blocked = B_FALSE;
673 		tx_update = B_TRUE;
674 	}
675 
676 	/* Update descriptor ring index */
677 	ldcp->next_txi = next_txi;
678 	mutex_exit(&ldcp->txlock);
679 
680 	if (tx_update == B_TRUE) {
681 		vio_net_tx_update_t vtx_update =
682 		    ldcp->portp->vcb.vio_net_tx_update;
683 
684 		vtx_update(ldcp->portp->vhp);
685 	}
686 
687 	/* Ensure load ordering of dstate (above) and data_buf_offset. */
688 	MEMBAR_CONSUMER();
689 
690 	/* Get the offset of the buffer to be used */
691 	buf_offset = txdp->data_buf_offset;
692 
693 	/* Access the buffer using the offset */
694 	dst = (caddr_t)ldcp->tx_datap + buf_offset;
695 
696 	/* Copy data into mapped transmit buffer */
697 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
698 		mblksz = MBLKL(bp);
699 		bcopy(bp->b_rptr, dst, mblksz);
700 		dst += mblksz;
701 	}
702 
703 	/* Set the size of data in the descriptor */
704 	txdp->nbytes = size;
705 
706 	/*
707 	 * Ensure store ordering of nbytes and dstate (below); so that the peer
708 	 * sees the right nbytes value after it checks that the dstate is READY.
709 	 */
710 	MEMBAR_PRODUCER();
711 
712 	mutex_enter(&ldcp->wrlock);
713 
714 	ASSERT(txdp->dstate == VIO_DESC_INITIALIZING);
715 
716 	/* Mark the descriptor ready */
717 	txdp->dstate = VIO_DESC_READY;
718 
719 	/* Check if peer needs wake up (handled below) */
720 	if (ldcp->resched_peer == B_TRUE && ldcp->resched_peer_txi == txi) {
721 		resched_peer = B_TRUE;
722 		ldcp->resched_peer = B_FALSE;
723 	}
724 
725 	/* Update tx stats */
726 	statsp->opackets++;
727 	statsp->obytes += size;
728 	if (is_bcast)
729 		statsp->brdcstxmt++;
730 	else if (is_mcast)
731 		statsp->multixmt++;
732 
733 	mutex_exit(&ldcp->wrlock);
734 
735 	/*
736 	 * We are done accessing shared memory; clear trap protection.
737 	 */
738 	(void) LDC_NO_TRAP();
739 
740 	/*
741 	 * Need to wake up the peer ?
742 	 */
743 	if (resched_peer == B_TRUE) {
744 		rv = vgen_send_dringdata_shm(ldcp, (uint32_t)txi, -1);
745 		if (rv != 0) {
746 			/* error: drop the packet */
747 			DWARN(vgenp, ldcp, "failed sending dringdata msg "
748 			    "rv(%d) len(%d)\n", rv, size);
749 			mutex_enter(&ldcp->wrlock);
750 			statsp->oerrors++;
751 			ldcp->resched_peer = B_TRUE;
752 			mutex_exit(&ldcp->wrlock);
753 		}
754 	}
755 
756 dringsend_shm_exit:
757 	if (rv == ECONNRESET || rv == EACCES) {
758 		(void) vgen_handle_evt_reset(ldcp, VGEN_OTHER);
759 	}
760 	freemsg(mp);
761 	return (VGEN_TX_SUCCESS);
762 }
763 
764 /*
765  * Process dring data messages (info/ack/nack)
766  */
767 int
768 vgen_handle_dringdata_shm(void *arg1, void *arg2)
769 {
770 	vgen_ldc_t	*ldcp = (vgen_ldc_t *)arg1;
771 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)arg2;
772 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
773 	int		rv = 0;
774 
775 	switch (tagp->vio_subtype) {
776 
777 	case VIO_SUBTYPE_INFO:
778 		/*
779 		 * To reduce the locking contention, release the
780 		 * cblock here and re-acquire it once we are done
781 		 * receiving packets.
782 		 */
783 		mutex_exit(&ldcp->cblock);
784 		mutex_enter(&ldcp->rxlock);
785 		rv = vgen_handle_dringdata_info_shm(ldcp, tagp);
786 		mutex_exit(&ldcp->rxlock);
787 		mutex_enter(&ldcp->cblock);
788 		if (rv != 0) {
789 			DWARN(vgenp, ldcp, "handle_data_info failed(%d)\n", rv);
790 		}
791 		break;
792 
793 	case VIO_SUBTYPE_ACK:
794 		rv = vgen_handle_dringdata_ack_shm(ldcp, tagp);
795 		if (rv != 0) {
796 			DWARN(vgenp, ldcp, "handle_data_ack failed(%d)\n", rv);
797 		}
798 		break;
799 
800 	case VIO_SUBTYPE_NACK:
801 		rv = vgen_handle_dringdata_nack_shm(ldcp, tagp);
802 		if (rv != 0) {
803 			DWARN(vgenp, ldcp, "handle_data_nack failed(%d)\n", rv);
804 		}
805 		break;
806 	}
807 
808 	return (rv);
809 }
810 
811 static int
812 vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
813 {
814 	uint32_t	start;
815 	int32_t		end;
816 	int		rv = 0;
817 	vio_dring_msg_t	*dringmsg = (vio_dring_msg_t *)tagp;
818 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
819 	vgen_stats_t	*statsp = &ldcp->stats;
820 
821 	start = dringmsg->start_idx;
822 	end = dringmsg->end_idx;
823 
824 	DBG1(vgenp, ldcp, "INFO: start(%d), end(%d)\n",
825 	    start, end);
826 
827 	if (!(CHECK_RXI(start, ldcp)) ||
828 	    ((end != -1) && !(CHECK_RXI(end, ldcp)))) {
829 		DWARN(vgenp, ldcp, "Invalid Rx start(%d) or end(%d)\n",
830 		    start, end);
831 		/* drop the message if invalid index */
832 		return (0);
833 	}
834 
835 	/* validate dring_ident */
836 	if (dringmsg->dring_ident != ldcp->peer_hparams.dring_ident) {
837 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
838 		    dringmsg->dring_ident);
839 		/* invalid dring_ident, drop the msg */
840 		return (0);
841 	}
842 
843 	statsp->dring_data_msgs_rcvd++;
844 
845 	/*
846 	 * If we are in polling mode, return from here without processing the
847 	 * dring. We will process the dring in the context of polling thread.
848 	 */
849 	if (ldcp->polling_on == B_TRUE) {
850 		return (0);
851 	}
852 
853 	/*
854 	 * Process the dring and receive packets in intr context.
855 	 */
856 	rv = vgen_intr_rcv_shm(ldcp);
857 	if (rv != 0) {
858 		DWARN(vgenp, ldcp, "vgen_intr_rcv_shm() failed\n");
859 	}
860 	return (rv);
861 }
862 
863 /*
864  * Process the rx descriptor ring in the context of interrupt thread
865  * (vgen_ldc_cb() callback) and send the received packets up the stack.
866  */
867 static int
868 vgen_intr_rcv_shm(vgen_ldc_t *ldcp)
869 {
870 	int		rv;
871 	uint32_t	end_ix;
872 	vio_dring_msg_t msg;
873 	uint_t		mblk_sz;
874 	int		count = 0;
875 	int		total_count = 0;
876 	mblk_t		*bp = NULL;
877 	mblk_t		*bpt = NULL;
878 	mblk_t		*mp = NULL;
879 	vio_net_rx_cb_t vrx_cb = ldcp->portp->vcb.vio_net_rx_cb;
880 
881 	ASSERT(MUTEX_HELD(&ldcp->rxlock));
882 
883 	do {
884 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
885 		if (rv != 0) {
886 			if (rv == EINVAL) {
887 				/* Invalid descriptor error; get next */
888 				continue;
889 			}
890 			DTRACE_PROBE1(vgen_intr_nopkts, vgen_ldc_t *, ldcp);
891 			break;
892 		}
893 
894 		/* Build a chain of received packets */
895 		if (bp == NULL) {
896 			/* first pkt */
897 			bp = mp;
898 			bpt = bp;
899 			bpt->b_next = NULL;
900 		} else {
901 			mp->b_next = NULL;
902 			bpt->b_next = mp;
903 			bpt = mp;
904 		}
905 
906 		total_count++;
907 		count++;
908 
909 		/*
910 		 * We are receiving the packets in interrupt context. If we
911 		 * have gathered vgen_chain_len (tunable) # of packets in the
912 		 * chain, send them up. (See vgen_poll_rcv_shm() for receiving
913 		 * in polling thread context).
914 		 */
915 		if (count == vgen_chain_len) {
916 			DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp,
917 			    int, count);
918 			mutex_exit(&ldcp->rxlock);
919 			vrx_cb(ldcp->portp->vhp, bp);
920 			mutex_enter(&ldcp->rxlock);
921 			bp = bpt = NULL;
922 			count = 0;
923 		}
924 
925 		/*
926 		 * Stop further processing if we processed the entire dring
927 		 * once; otherwise continue.
928 		 */
929 	} while (total_count < ldcp->num_rxds);
930 
931 	if (bp != NULL) {
932 		DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp, int, count);
933 		mutex_exit(&ldcp->rxlock);
934 		vrx_cb(ldcp->portp->vhp, bp);
935 		mutex_enter(&ldcp->rxlock);
936 	}
937 
938 	if (ldcp->polling_on == B_FALSE) {
939 		/*
940 		 * We send a stopped message to peer (sender) while we are in
941 		 * intr mode only; allowing the peer to send further data intrs
942 		 * (dring data msgs) to us.
943 		 */
944 		end_ix = ldcp->next_rxi;
945 		DECR_RXI(end_ix, ldcp);
946 		msg.dring_ident = ldcp->peer_hparams.dring_ident;
947 		rv = vgen_send_dringack_shm(ldcp, (vio_msg_tag_t *)&msg,
948 		    VNET_START_IDX_UNSPEC, end_ix, VIO_DP_STOPPED);
949 		return (rv);
950 	}
951 
952 	return (0);
953 }
954 
955 /*
956  * Process the rx descriptor ring in the context of mac polling thread. Receive
957  * packets upto the limit specified by bytes_to_pickup or until there are no
958  * more packets, whichever occurs first. Return the chain of received packets.
959  */
960 mblk_t *
961 vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup)
962 {
963 	uint_t		mblk_sz = 0;
964 	uint_t		sz = 0;
965 	mblk_t		*bp = NULL;
966 	mblk_t		*bpt = NULL;
967 	mblk_t		*mp = NULL;
968 	int		count = 0;
969 	int		rv;
970 
971 	mutex_enter(&ldcp->rxlock);
972 
973 	if (ldcp->hphase != VH_DONE) {
974 		/* Channel is being reset and handshake not complete */
975 		mutex_exit(&ldcp->rxlock);
976 		return (NULL);
977 	}
978 
979 	do {
980 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
981 		if (rv != 0) {
982 			if (rv == EINVAL) {
983 				/* Invalid descriptor error; get next */
984 				continue;
985 			}
986 			DTRACE_PROBE1(vgen_poll_nopkts, vgen_ldc_t *, ldcp);
987 			break;
988 		}
989 
990 		/* Build a chain of received packets */
991 		if (bp == NULL) {
992 			/* first pkt */
993 			bp = mp;
994 			bpt = bp;
995 			bpt->b_next = NULL;
996 		} else {
997 			mp->b_next = NULL;
998 			bpt->b_next = mp;
999 			bpt = mp;
1000 		}
1001 
1002 		/* Compute total size accumulated */
1003 		sz += mblk_sz;
1004 		count++;
1005 
1006 		/* Reached the bytes limit; we are done. */
1007 		if (sz >= bytes_to_pickup) {
1008 			break;
1009 		}
1010 
1011 	_NOTE(CONSTCOND)
1012 	} while (1);
1013 
1014 	/*
1015 	 * We prepend any high priority packets to the chain of packets; note
1016 	 * that if we are already at the bytes_to_pickup limit, we might
1017 	 * slightly exceed that in such cases. That should be ok, as these pkts
1018 	 * are expected to be small in size and arrive at an interval in the
1019 	 * the order of a few seconds.
1020 	 */
1021 	if (ldcp->rx_pktdata == vgen_handle_pkt_data &&
1022 	    ldcp->rx_pri_head != NULL) {
1023 		ldcp->rx_pri_tail->b_next = bp;
1024 		bp = ldcp->rx_pri_head;
1025 		ldcp->rx_pri_head = ldcp->rx_pri_tail = NULL;
1026 	}
1027 
1028 	mutex_exit(&ldcp->rxlock);
1029 
1030 	DTRACE_PROBE2(vgen_poll_pkts, vgen_ldc_t *, ldcp, int, count);
1031 	DTRACE_PROBE2(vgen_poll_bytes, vgen_ldc_t *, ldcp, uint_t, sz);
1032 	return (bp);
1033 }
1034 
1035 /*
1036  * Process the next index in the rx dring and receive the associated packet.
1037  *
1038  * Returns:
1039  *	bp:	Success: The received packet.
1040  *		Failure: NULL
1041  *      size:	Success: Size of received packet.
1042  *		Failure: 0
1043  *      retval:
1044  *		Success: 0
1045  *		Failure: EAGAIN: Descriptor not ready
1046  *			 EIO:    Descriptor contents invalid.
1047  */
1048 static int
1049 vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size)
1050 {
1051 	uint32_t			rxi;
1052 	vio_mblk_t			*vmp;
1053 	vio_mblk_t			*new_vmp;
1054 	struct ether_header		*ehp;
1055 	vnet_rx_dringdata_desc_t	*rxdp;
1056 	int				err = 0;
1057 	uint32_t			nbytes = 0;
1058 	mblk_t				*mp = NULL;
1059 	mblk_t				*dmp = NULL;
1060 	vgen_stats_t			*statsp = &ldcp->stats;
1061 	vgen_hparams_t			*lp = &ldcp->local_hparams;
1062 
1063 	rxi = ldcp->next_rxi;
1064 	rxdp = &(ldcp->rxdp[rxi]);
1065 	vmp = ldcp->rxdp_to_vmp[rxi];
1066 
1067 	if (rxdp->dstate != VIO_DESC_READY) {
1068 		/*
1069 		 * Descriptor is not ready.
1070 		 */
1071 		DTRACE_PROBE1(vgen_noready_rxds, vgen_ldc_t *, ldcp);
1072 		return (EAGAIN);
1073 	}
1074 
1075 	/*
1076 	 * Ensure load ordering of dstate and nbytes.
1077 	 */
1078 	MEMBAR_CONSUMER();
1079 
1080 	nbytes = rxdp->nbytes;
1081 
1082 	if ((nbytes < ETHERMIN) ||
1083 	    (nbytes > lp->mtu) ||
1084 	    (rxdp->data_buf_offset !=
1085 	    (VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN))) {
1086 		/*
1087 		 * Descriptor contents invalid.
1088 		 */
1089 		statsp->ierrors++;
1090 		rxdp->dstate = VIO_DESC_DONE;
1091 		err = EIO;
1092 		goto done;
1093 	}
1094 
1095 	/*
1096 	 * Now allocate a new buffer for this descriptor before sending up the
1097 	 * buffer being processed. If that fails, stop processing; as we are
1098 	 * out of receive buffers.
1099 	 */
1100 	new_vmp = vio_allocb(ldcp->rx_vmp);
1101 
1102 	/*
1103 	 * Process the current buffer being received.
1104 	 */
1105 	mp = vmp->mp;
1106 
1107 	if (new_vmp == NULL) {
1108 		/*
1109 		 * We failed to get a new mapped buffer that is needed to
1110 		 * refill the descriptor. In that case, leave the current
1111 		 * buffer bound to the descriptor; allocate an mblk dynamically
1112 		 * and copy the contents of the buffer to the mblk. Then send
1113 		 * up this mblk. This way the sender has the same buffer as
1114 		 * before that can be used to send new data.
1115 		 */
1116 		statsp->norcvbuf++;
1117 		dmp = allocb(nbytes + VNET_IPALIGN, BPRI_MED);
1118 		if (dmp == NULL) {
1119 			statsp->ierrors++;
1120 			return (ENOMEM);
1121 		}
1122 		bcopy(mp->b_rptr + VNET_IPALIGN,
1123 		    dmp->b_rptr + VNET_IPALIGN, nbytes);
1124 		mp = dmp;
1125 	} else {
1126 		/* Mark the status of the current rbuf */
1127 		vmp->state = VIO_MBLK_HAS_DATA;
1128 
1129 		/* Set the offset of the new buffer in the descriptor */
1130 		rxdp->data_buf_offset =
1131 		    VIO_MBLK_DATA_OFF(new_vmp) + VNET_IPALIGN;
1132 		ldcp->rxdp_to_vmp[rxi] = new_vmp;
1133 	}
1134 	mp->b_rptr += VNET_IPALIGN;
1135 	mp->b_wptr = mp->b_rptr + nbytes;
1136 
1137 	/*
1138 	 * Ensure store ordering of data_buf_offset and dstate; so that the
1139 	 * peer sees the right data_buf_offset after it checks that the dstate
1140 	 * is DONE.
1141 	 */
1142 	MEMBAR_PRODUCER();
1143 
1144 	/* Now mark the descriptor 'done' */
1145 	rxdp->dstate = VIO_DESC_DONE;
1146 
1147 	/* Update stats */
1148 	statsp->ipackets++;
1149 	statsp->rbytes += rxdp->nbytes;
1150 	ehp = (struct ether_header *)mp->b_rptr;
1151 	if (IS_BROADCAST(ehp))
1152 		statsp->brdcstrcv++;
1153 	else if (IS_MULTICAST(ehp))
1154 		statsp->multircv++;
1155 done:
1156 	/* Update the next index to be processed */
1157 	INCR_RXI(rxi, ldcp);
1158 
1159 	/* Save the new recv index */
1160 	ldcp->next_rxi = rxi;
1161 
1162 	/* Return the packet received */
1163 	*size = nbytes;
1164 	*bp = mp;
1165 	return (err);
1166 }
1167 
1168 static int
1169 vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1170 {
1171 	uint32_t			start;
1172 	int32_t				end;
1173 	uint32_t			txi;
1174 	vgen_stats_t			*statsp;
1175 	vnet_rx_dringdata_desc_t	*txdp;
1176 	on_trap_data_t			otd;
1177 	int				rv = 0;
1178 	boolean_t			ready_txd = B_FALSE;
1179 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1180 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1181 
1182 	start = dringmsg->start_idx;
1183 	end = dringmsg->end_idx;
1184 	statsp = &ldcp->stats;
1185 
1186 	/*
1187 	 * Received an ack for our transmits upto a certain dring index. This
1188 	 * enables us to reclaim descriptors. We also send a new dring data msg
1189 	 * to the peer to restart processing if there are pending transmit pkts.
1190 	 */
1191 	DBG2(vgenp, ldcp, "ACK:  start(%d), end(%d)\n", start, end);
1192 
1193 	/*
1194 	 * In RxDringData mode (v1.6), start index of -1 can be used by the
1195 	 * peer to indicate that it is unspecified. However, the end index
1196 	 * must be set correctly indicating the last descriptor index processed.
1197 	 */
1198 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1199 	    !(CHECK_TXI(end, ldcp))) {
1200 		/* drop the message if invalid index */
1201 		DWARN(vgenp, ldcp, "Invalid Tx ack start(%d) or end(%d)\n",
1202 		    start, end);
1203 		return (rv);
1204 	}
1205 
1206 	/* Validate dring_ident */
1207 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1208 		/* invalid dring_ident, drop the msg */
1209 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1210 		    dringmsg->dring_ident);
1211 		return (rv);
1212 	}
1213 	statsp->dring_data_acks_rcvd++;
1214 
1215 	/*
1216 	 * Clear transmit flow control condition
1217 	 * as some descriptors should be free now.
1218 	 */
1219 	mutex_enter(&ldcp->txlock);
1220 	if (ldcp->tx_blocked == B_TRUE) {
1221 		vio_net_tx_update_t vtx_update =
1222 		    ldcp->portp->vcb.vio_net_tx_update;
1223 
1224 		ldcp->tx_blocked = B_FALSE;
1225 		vtx_update(ldcp->portp->vhp);
1226 	}
1227 	mutex_exit(&ldcp->txlock);
1228 
1229 	if (dringmsg->dring_process_state != VIO_DP_STOPPED) {
1230 		/*
1231 		 * Receiver continued processing
1232 		 * dring after sending us the ack.
1233 		 */
1234 		return (rv);
1235 	}
1236 
1237 	/*
1238 	 * Receiver stopped processing descriptors.
1239 	 */
1240 	statsp->dring_stopped_acks_rcvd++;
1241 
1242 	/*
1243 	 * Setup on_trap() protection before accessing dring shared memory area.
1244 	 */
1245 	rv = LDC_ON_TRAP(&otd);
1246 	if (rv != 0) {
1247 		/*
1248 		 * Data access fault occured down the code path below while
1249 		 * accessing the descriptors. Release any locks that we might
1250 		 * have acquired in the code below and return failure.
1251 		 */
1252 		if (mutex_owned(&ldcp->wrlock)) {
1253 			mutex_exit(&ldcp->wrlock);
1254 		}
1255 		return (ECONNRESET);
1256 	}
1257 
1258 	/*
1259 	 * Determine if there are any pending tx descriptors ready to be
1260 	 * processed by the receiver(peer) and if so, send a message to the
1261 	 * peer to restart receiving.
1262 	 */
1263 	mutex_enter(&ldcp->wrlock);
1264 
1265 	ready_txd = B_FALSE;
1266 	txi = end;
1267 	INCR_TXI(txi, ldcp);
1268 	txdp = &ldcp->mtxdp[txi];
1269 	if (txdp->dstate == VIO_DESC_READY) {
1270 		ready_txd = B_TRUE;
1271 	}
1272 
1273 	/*
1274 	 * We are done accessing shared memory; clear trap protection.
1275 	 */
1276 	(void) LDC_NO_TRAP();
1277 
1278 	if (ready_txd == B_FALSE) {
1279 		/*
1280 		 * No ready tx descriptors. Set the flag to send a message to
1281 		 * the peer when tx descriptors are ready in transmit routine.
1282 		 */
1283 		ldcp->resched_peer = B_TRUE;
1284 		ldcp->resched_peer_txi = txi;
1285 		mutex_exit(&ldcp->wrlock);
1286 		return (rv);
1287 	}
1288 
1289 	/*
1290 	 * We have some tx descriptors ready to be processed by the receiver.
1291 	 * Send a dring data message to the peer to restart processing.
1292 	 */
1293 	ldcp->resched_peer = B_FALSE;
1294 	mutex_exit(&ldcp->wrlock);
1295 	rv = vgen_send_dringdata_shm(ldcp, txi, -1);
1296 	if (rv != VGEN_SUCCESS) {
1297 		mutex_enter(&ldcp->wrlock);
1298 		ldcp->resched_peer = B_TRUE;
1299 		mutex_exit(&ldcp->wrlock);
1300 	}
1301 
1302 	return (rv);
1303 }
1304 
1305 static int
1306 vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1307 {
1308 	uint32_t			start;
1309 	int32_t				end;
1310 	uint32_t			txi;
1311 	vnet_rx_dringdata_desc_t	*txdp;
1312 	on_trap_data_t			otd;
1313 	int				rv = 0;
1314 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1315 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1316 
1317 	DBG1(vgenp, ldcp, "enter\n");
1318 	start = dringmsg->start_idx;
1319 	end = dringmsg->end_idx;
1320 
1321 	/*
1322 	 * Peer sent a NACK msg (to indicate bad descriptors ?). The start and
1323 	 * end correspond to the range of descriptors which are being nack'd.
1324 	 */
1325 	DWARN(vgenp, ldcp, "NACK: start(%d), end(%d)\n", start, end);
1326 
1327 	/*
1328 	 * In RxDringData mode (v1.6), start index of -1 can be used by
1329 	 * the peer to indicate that it is unspecified. However, the end index
1330 	 * must be set correctly indicating the last descriptor index processed.
1331 	 */
1332 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1333 	    !(CHECK_TXI(end, ldcp))) {
1334 		/* drop the message if invalid index */
1335 		DWARN(vgenp, ldcp, "Invalid Tx nack start(%d) or end(%d)\n",
1336 		    start, end);
1337 		return (rv);
1338 	}
1339 
1340 	/* Validate dring_ident */
1341 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1342 		/* invalid dring_ident, drop the msg */
1343 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1344 		    dringmsg->dring_ident);
1345 		return (rv);
1346 	}
1347 
1348 	/*
1349 	 * Setup on_trap() protection before accessing dring shared memory area.
1350 	 */
1351 	rv = LDC_ON_TRAP(&otd);
1352 	if (rv != 0) {
1353 		/*
1354 		 * Data access fault occured down the code path below while
1355 		 * accessing the descriptors. Release any locks that we might
1356 		 * have acquired in the code below and return failure.
1357 		 */
1358 		mutex_exit(&ldcp->txlock);
1359 		return (ECONNRESET);
1360 	}
1361 
1362 	/* We just mark the descrs as free so they can be reused */
1363 	mutex_enter(&ldcp->txlock);
1364 	for (txi = start; txi <= end; ) {
1365 		txdp = &(ldcp->mtxdp[txi]);
1366 		if (txdp->dstate == VIO_DESC_READY)
1367 			txdp->dstate = VIO_DESC_DONE;
1368 		INCR_TXI(txi, ldcp);
1369 	}
1370 
1371 	/*
1372 	 * We are done accessing shared memory; clear trap protection.
1373 	 */
1374 	(void) LDC_NO_TRAP();
1375 
1376 	mutex_exit(&ldcp->txlock);
1377 
1378 	return (rv);
1379 }
1380 
1381 /*
1382  * Send descriptor ring data message to the peer over LDC.
1383  */
1384 static int
1385 vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start, int32_t end)
1386 {
1387 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1388 	vio_dring_msg_t	dringmsg, *msgp = &dringmsg;
1389 	vio_msg_tag_t	*tagp = &msgp->tag;
1390 	vgen_stats_t	*statsp = &ldcp->stats;
1391 	int		rv;
1392 
1393 #ifdef DEBUG
1394 	if (vgen_inject_error(ldcp, VGEN_ERR_TXTIMEOUT)) {
1395 		return (VGEN_SUCCESS);
1396 	}
1397 #endif
1398 	bzero(msgp, sizeof (*msgp));
1399 
1400 	tagp->vio_msgtype = VIO_TYPE_DATA;
1401 	tagp->vio_subtype = VIO_SUBTYPE_INFO;
1402 	tagp->vio_subtype_env = VIO_DRING_DATA;
1403 	tagp->vio_sid = ldcp->local_sid;
1404 
1405 	msgp->dring_ident = ldcp->local_hparams.dring_ident;
1406 	msgp->start_idx = start;
1407 	msgp->end_idx = end;
1408 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1409 
1410 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (dringmsg));
1411 	if (rv != VGEN_SUCCESS) {
1412 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1413 		return (rv);
1414 	}
1415 
1416 	statsp->dring_data_msgs_sent++;
1417 
1418 	DBG2(vgenp, ldcp, "DRING_DATA_SENT \n");
1419 
1420 	return (VGEN_SUCCESS);
1421 }
1422 
1423 /*
1424  * Send dring data ack message.
1425  */
1426 int
1427 vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, uint32_t start,
1428     int32_t end, uint8_t pstate)
1429 {
1430 	int		rv = 0;
1431 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1432 	vio_dring_msg_t	*msgp = (vio_dring_msg_t *)tagp;
1433 	vgen_stats_t	*statsp = &ldcp->stats;
1434 
1435 	tagp->vio_msgtype = VIO_TYPE_DATA;
1436 	tagp->vio_subtype = VIO_SUBTYPE_ACK;
1437 	tagp->vio_subtype_env = VIO_DRING_DATA;
1438 	tagp->vio_sid = ldcp->local_sid;
1439 	msgp->start_idx = start;
1440 	msgp->end_idx = end;
1441 	msgp->dring_process_state = pstate;
1442 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1443 
1444 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (*msgp));
1445 	if (rv != VGEN_SUCCESS) {
1446 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1447 	}
1448 
1449 	statsp->dring_data_acks_sent++;
1450 	if (pstate == VIO_DP_STOPPED) {
1451 		statsp->dring_stopped_acks_sent++;
1452 	}
1453 
1454 	return (rv);
1455 }
1456 
1457 /*
1458  * Send dring data msgs (info/ack/nack) over LDC.
1459  */
1460 static int
1461 vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen)
1462 {
1463 	int			rv;
1464 	size_t			len;
1465 	uint32_t		retries = 0;
1466 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
1467 
1468 	len = msglen;
1469 	if ((len == 0) || (msg == NULL))
1470 		return (VGEN_FAILURE);
1471 
1472 	do {
1473 		len = msglen;
1474 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msg, &len);
1475 		if (retries++ >= vgen_ldcwr_retries)
1476 			break;
1477 	} while (rv == EWOULDBLOCK);
1478 
1479 	if (rv != 0) {
1480 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen(%d)\n",
1481 		    rv, msglen);
1482 		return (rv);
1483 	}
1484 
1485 	if (len != msglen) {
1486 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen (%d)\n",
1487 		    rv, msglen);
1488 		return (VGEN_FAILURE);
1489 	}
1490 
1491 	return (VGEN_SUCCESS);
1492 }
1493