xref: /titanic_51/usr/src/uts/sun4v/io/vnet_rxdring.c (revision c39526b769298791ff5b0b6c5e761f49aabaeb4e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/sysmacros.h>
29 #include <sys/param.h>
30 #include <sys/machsystm.h>
31 #include <sys/stream.h>
32 #include <sys/strsubr.h>
33 #include <sys/kmem.h>
34 #include <sys/strsun.h>
35 #include <sys/callb.h>
36 #include <sys/sdt.h>
37 #include <sys/ethernet.h>
38 #include <sys/mach_descrip.h>
39 #include <sys/mdeg.h>
40 #include <sys/vnet.h>
41 #include <sys/vio_mailbox.h>
42 #include <sys/vio_common.h>
43 #include <sys/vnet_common.h>
44 #include <sys/vnet_mailbox.h>
45 #include <sys/vio_util.h>
46 #include <sys/vnet_gen.h>
47 
48 /*
49  * This file contains the implementation of RxDringData transfer mode of VIO
50  * Protocol in vnet. The functions in this file are invoked from vnet_gen.c
51  * after RxDringData mode is negotiated with the peer during attribute phase of
52  * handshake. This file contains functions that setup the transmit and receive
53  * descriptor rings, and associated resources in RxDringData mode. It also
54  * contains the transmit and receive data processing functions that are invoked
55  * in RxDringData mode. The data processing routines in this file have the
56  * suffix '_shm' to indicate the shared memory mechanism used in RxDringData
57  * mode.
58  */
59 
60 /* Functions exported to vnet_gen.c */
61 int vgen_create_rx_dring(vgen_ldc_t *ldcp);
62 void vgen_destroy_rx_dring(vgen_ldc_t *ldcp);
63 int vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt);
64 void vgen_unmap_tx_dring(vgen_ldc_t *ldcp);
65 int vgen_map_data(vgen_ldc_t *ldcp, void *pkt);
66 int vgen_dringsend_shm(void *arg, mblk_t *mp);
67 int vgen_handle_dringdata_shm(void *arg1, void *arg2);
68 mblk_t *vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup);
69 int vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
70     uint32_t start, int32_t end, uint8_t pstate);
71 
72 /* Internal functions */
73 static int vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
74 static int vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
75 static int vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tp);
76 static int vgen_intr_rcv_shm(vgen_ldc_t *ldcp);
77 static int vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size);
78 static int vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start,
79     int32_t end);
80 static int vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen);
81 
82 /* Functions imported from vnet_gen.c */
83 extern int vgen_handle_evt_read(vgen_ldc_t *ldcp, vgen_caller_t caller);
84 extern int vgen_handle_evt_reset(vgen_ldc_t *ldcp, vgen_caller_t caller);
85 extern void vgen_handle_pkt_data(void *arg1, void *arg2, uint32_t msglen);
86 extern void vgen_destroy_rxpools(void *arg);
87 
88 /* Tunables */
89 extern uint32_t vnet_num_descriptors;
90 extern uint32_t vgen_chain_len;
91 extern uint32_t vgen_ldcwr_retries;
92 extern uint32_t vgen_recv_delay;
93 extern uint32_t vgen_recv_retries;
94 extern uint32_t vgen_nrbufs_factor;
95 
96 #ifdef DEBUG
97 
98 #define	DEBUG_PRINTF	vgen_debug_printf
99 
100 extern int vnet_dbglevel;
101 extern int vgen_inject_err_flag;
102 
103 extern void vgen_debug_printf(const char *fname, vgen_t *vgenp,
104 	vgen_ldc_t *ldcp, const char *fmt, ...);
105 extern boolean_t vgen_inject_error(vgen_ldc_t *ldcp, int error);
106 
107 #endif
108 
109 /*
110  * Allocate receive resources for the channel. The resources consist of a
111  * receive descriptor ring and an associated receive buffer area.
112  */
113 int
114 vgen_create_rx_dring(vgen_ldc_t *ldcp)
115 {
116 	int 				i;
117 	int 				rv;
118 	uint32_t			ncookies;
119 	ldc_mem_info_t			minfo;
120 	vnet_rx_dringdata_desc_t	*rxdp;
121 	size_t				data_sz;
122 	vio_mblk_t			*vmp;
123 	vio_mblk_t			**rxdp_to_vmp;
124 	uint32_t			rxdsize;
125 	caddr_t				datap = NULL;
126 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
127 
128 	rxdsize = sizeof (vnet_rx_dringdata_desc_t);
129 	ldcp->num_rxds = vnet_num_descriptors;
130 	ldcp->num_rbufs = vnet_num_descriptors * vgen_nrbufs_factor;
131 
132 	/* Create the receive descriptor ring */
133 	rv = ldc_mem_dring_create(ldcp->num_rxds, rxdsize,
134 	    &ldcp->rx_dring_handle);
135 	if (rv != 0) {
136 		DWARN(vgenp, ldcp, "ldc_mem_dring_create() failed\n");
137 		goto fail;
138 	}
139 
140 	/* Get the addr of descriptor ring */
141 	rv = ldc_mem_dring_info(ldcp->rx_dring_handle, &minfo);
142 	if (rv != 0) {
143 		DWARN(vgenp, ldcp, "ldc_mem_dring_info() failed\n");
144 		goto fail;
145 	}
146 	ldcp->rxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
147 	bzero(ldcp->rxdp, sizeof (*rxdp) * (ldcp->num_rxds));
148 
149 	/*
150 	 * Allocate a table that maps descriptor to its associated buffer;
151 	 * used while receiving to validate that the peer has not changed the
152 	 * buffer offset provided in the descriptor.
153 	 */
154 	rxdp_to_vmp = kmem_zalloc(ldcp->num_rxds * sizeof (uintptr_t),
155 	    KM_SLEEP);
156 	ldcp->rxdp_to_vmp = rxdp_to_vmp;
157 
158 	/*
159 	 * Allocate a single large buffer that serves as the rx buffer area.
160 	 * We allocate a ldc memory handle and export the buffer area as shared
161 	 * memory. We send the ldc memcookie for this buffer space to the peer,
162 	 * as part of dring registration phase during handshake. We manage this
163 	 * buffer area as individual buffers of max_frame_size and provide
164 	 * specific buffer offsets in each descriptor to the peer. Note that
165 	 * the factor used to compute the # of buffers (above) must be > 1 to
166 	 * ensure that there are more buffers than the # of descriptors. This
167 	 * is needed because, while the shared memory buffers are sent up our
168 	 * stack during receive, the sender needs additional buffers that can
169 	 * be used for further transmits. This also means there is no one to
170 	 * one correspondence between the descriptor index and buffer offset.
171 	 * The sender has to read the buffer offset in the descriptor and use
172 	 * the specified offset to copy the tx data into the shared buffer. We
173 	 * (receiver) manage the individual buffers and their state (see
174 	 * VIO_MBLK_STATEs in vio_util.h).
175 	 */
176 	data_sz = vgenp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
177 	data_sz = VNET_ROUNDUP_2K(data_sz);
178 
179 	ldcp->rx_data_sz = data_sz * ldcp->num_rbufs;
180 	ldcp->rx_dblk_sz = data_sz;
181 	datap = kmem_zalloc(ldcp->rx_data_sz, KM_SLEEP);
182 	ldcp->rx_datap = datap;
183 
184 	/* Allocate a ldc memhandle for the entire rx data area */
185 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->rx_data_handle);
186 	if (rv) {
187 		ldcp->rx_data_handle = 0;
188 		goto fail;
189 	}
190 
191 	/* Allocate memory for the data cookies */
192 	ldcp->rx_data_cookie = kmem_zalloc(VNET_DATA_AREA_COOKIES *
193 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
194 
195 	/*
196 	 * Bind ldc memhandle to the corresponding rx data area.
197 	 */
198 	ncookies = 0;
199 	rv = ldc_mem_bind_handle(ldcp->rx_data_handle, (caddr_t)datap,
200 	    ldcp->rx_data_sz, LDC_DIRECT_MAP, LDC_MEM_W,
201 	    ldcp->rx_data_cookie, &ncookies);
202 	if (rv != 0) {
203 		goto fail;
204 	}
205 	if ((ncookies == 0) || (ncookies > VNET_DATA_AREA_COOKIES)) {
206 		goto fail;
207 	}
208 	ldcp->rx_data_ncookies = ncookies;
209 
210 	/*
211 	 * Successful in binding the handle to rx data area. Now setup mblks
212 	 * around each data buffer and setup the descriptors to point to these
213 	 * rx data buffers. We associate each descriptor with a buffer
214 	 * by specifying the buffer offset in the descriptor. When the peer
215 	 * needs to transmit data, this offset is read by the peer to determine
216 	 * the buffer in the mapped buffer area where the data to be
217 	 * transmitted should be copied, for a specific descriptor.
218 	 */
219 	rv = vio_create_mblks(ldcp->num_rbufs, data_sz, (uint8_t *)datap,
220 	    &ldcp->rx_vmp);
221 	if (rv != 0) {
222 		goto fail;
223 	}
224 
225 	for (i = 0; i < ldcp->num_rxds; i++) {
226 		rxdp = &(ldcp->rxdp[i]);
227 		/* allocate an mblk around this data buffer */
228 		vmp = vio_allocb(ldcp->rx_vmp);
229 		ASSERT(vmp != NULL);
230 		rxdp->data_buf_offset = VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN;
231 		rxdp->dstate = VIO_DESC_FREE;
232 		rxdp_to_vmp[i] = vmp;
233 	}
234 
235 	/*
236 	 * The descriptors and the associated buffers are all ready;
237 	 * now bind descriptor ring to the channel.
238 	 */
239 	rv = ldc_mem_dring_bind(ldcp->ldc_handle, ldcp->rx_dring_handle,
240 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
241 	    &ldcp->rx_dring_cookie, &ncookies);
242 	if (rv != 0) {
243 		DWARN(vgenp, ldcp, "ldc_mem_dring_bind failed "
244 		    "rv(%x)\n", rv);
245 		goto fail;
246 	}
247 	ASSERT(ncookies == 1);
248 	ldcp->rx_dring_ncookies = ncookies;
249 
250 	/* initialize rx seqnum and index */
251 	ldcp->next_rxseq = VNET_ISS;
252 	ldcp->next_rxi = 0;
253 
254 	return (VGEN_SUCCESS);
255 
256 fail:
257 	vgen_destroy_rx_dring(ldcp);
258 	return (VGEN_FAILURE);
259 }
260 
261 /*
262  * Free receive resources for the channel.
263  */
264 void
265 vgen_destroy_rx_dring(vgen_ldc_t *ldcp)
266 {
267 	vgen_t	*vgenp = LDC_TO_VGEN(ldcp);
268 
269 	/* We first unbind the descriptor ring */
270 	if (ldcp->rx_dring_ncookies != 0) {
271 		(void) ldc_mem_dring_unbind(ldcp->rx_dring_handle);
272 		ldcp->rx_dring_ncookies = 0;
273 	}
274 
275 	/* Destroy the mblks that are wrapped around the rx data buffers */
276 	if (ldcp->rx_vmp != NULL) {
277 		vio_clobber_pool(ldcp->rx_vmp);
278 		if (vio_destroy_mblks(ldcp->rx_vmp) != 0) {
279 			/*
280 			 * If we can't destroy the rx pool for this channel,
281 			 * dispatch a task to retry and clean up. Note that we
282 			 * don't need to wait for the task to complete. If the
283 			 * vnet device itself gets detached, it will wait for
284 			 * the task to complete implicitly in
285 			 * ddi_taskq_destroy().
286 			 */
287 			(void) ddi_taskq_dispatch(vgenp->rxp_taskq,
288 			    vgen_destroy_rxpools, ldcp->rx_vmp, DDI_SLEEP);
289 		}
290 		ldcp->rx_vmp = NULL;
291 	}
292 
293 	/* Free rx data area cookies */
294 	if (ldcp->rx_data_cookie != NULL) {
295 		kmem_free(ldcp->rx_data_cookie, VNET_DATA_AREA_COOKIES *
296 		    sizeof (ldc_mem_cookie_t));
297 		ldcp->rx_data_cookie = NULL;
298 	}
299 
300 	/* Unbind rx data area memhandle */
301 	if (ldcp->rx_data_ncookies != 0) {
302 		(void) ldc_mem_unbind_handle(ldcp->rx_data_handle);
303 		ldcp->rx_data_ncookies = 0;
304 	}
305 
306 	/* Free rx data area memhandle */
307 	if (ldcp->rx_data_handle != 0) {
308 		(void) ldc_mem_free_handle(ldcp->rx_data_handle);
309 		ldcp->rx_data_handle = 0;
310 	}
311 
312 	/* Now free the rx data area itself */
313 	if (ldcp->rx_datap != NULL) {
314 		/* prealloc'd rx data buffer */
315 		kmem_free(ldcp->rx_datap, ldcp->rx_data_sz);
316 		ldcp->rx_datap = NULL;
317 		ldcp->rx_data_sz = 0;
318 	}
319 
320 	/* Finally, free the receive descriptor ring */
321 	if (ldcp->rx_dring_handle != 0) {
322 		(void) ldc_mem_dring_destroy(ldcp->rx_dring_handle);
323 		ldcp->rx_dring_handle = 0;
324 		ldcp->rxdp = NULL;
325 	}
326 
327 	if (ldcp->rxdp_to_vmp != NULL) {
328 		kmem_free(ldcp->rxdp_to_vmp,
329 		    ldcp->num_rxds * sizeof (uintptr_t));
330 		ldcp->rxdp_to_vmp = NULL;
331 	}
332 
333 	/* Reset rx index and seqnum */
334 	ldcp->next_rxi = 0;
335 	ldcp->next_rxseq = VNET_ISS;
336 }
337 
338 /*
339  * Map the receive descriptor ring exported
340  * by the peer, as our transmit descriptor ring.
341  */
342 int
343 vgen_map_tx_dring(vgen_ldc_t *ldcp, void *pkt)
344 {
345 	int				i;
346 	int				rv;
347 	ldc_mem_info_t			minfo;
348 	ldc_mem_cookie_t		dcookie;
349 	uint32_t			ncookies;
350 	uint32_t 			num_desc;
351 	uint32_t			desc_size;
352 	vnet_rx_dringdata_desc_t	*txdp;
353 	on_trap_data_t			otd;
354 	vio_dring_reg_msg_t 		*msg = pkt;
355 
356 	ncookies = msg->ncookies;
357 	num_desc = msg->num_descriptors;
358 	desc_size = msg->descriptor_size;
359 
360 	/*
361 	 * Sanity check.
362 	 */
363 	if (num_desc < VGEN_NUM_DESCRIPTORS_MIN ||
364 	    desc_size < sizeof (vnet_rx_dringdata_desc_t) ||
365 	    ncookies > 1) {
366 		goto fail;
367 	}
368 
369 	bcopy(&msg->cookie[0], &dcookie, sizeof (ldc_mem_cookie_t));
370 
371 	/* Map the remote dring */
372 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dcookie, ncookies, num_desc,
373 	    desc_size, LDC_DIRECT_MAP, &(ldcp->tx_dring_handle));
374 	if (rv != 0) {
375 		goto fail;
376 	}
377 
378 	/*
379 	 * Sucessfully mapped; now try to get info about the mapped dring
380 	 */
381 	rv = ldc_mem_dring_info(ldcp->tx_dring_handle, &minfo);
382 	if (rv != 0) {
383 		goto fail;
384 	}
385 
386 	/*
387 	 * Save ring address, number of descriptors.
388 	 */
389 	ldcp->mtxdp = (vnet_rx_dringdata_desc_t *)(minfo.vaddr);
390 	bcopy(&dcookie, &(ldcp->tx_dring_cookie), sizeof (dcookie));
391 	ldcp->tx_dring_ncookies = ncookies;
392 	ldcp->num_txds = num_desc;
393 
394 	/* Initialize tx dring indexes and seqnum */
395 	ldcp->next_txi = ldcp->cur_txi = 0;
396 	ldcp->next_txseq = VNET_ISS - 1;
397 	ldcp->resched_peer = B_TRUE;
398 	ldcp->dring_mtype = minfo.mtype;
399 	ldcp->dringdata_msgid = 0;
400 
401 	/* Save peer's dring_info values */
402 	bcopy(&dcookie, &(ldcp->peer_hparams.dring_cookie),
403 	    sizeof (ldc_mem_cookie_t));
404 	ldcp->peer_hparams.num_desc = num_desc;
405 	ldcp->peer_hparams.desc_size = desc_size;
406 	ldcp->peer_hparams.dring_ncookies = ncookies;
407 
408 	/* Set dring_ident for the peer */
409 	ldcp->peer_hparams.dring_ident = (uint64_t)ldcp->mtxdp;
410 
411 	/* Return the dring_ident in ack msg */
412 	msg->dring_ident = (uint64_t)ldcp->mtxdp;
413 
414 	/*
415 	 * Mark the descriptor state as 'done'. This is implementation specific
416 	 * and not required by the protocol. In our implementation, we only
417 	 * need the descripor to be in 'done' state to be used by the transmit
418 	 * function and the peer is not aware of it. As the protocol requires
419 	 * that during initial registration the exporting end point mark the
420 	 * dstate as 'free', we change it 'done' here. After this, the dstate
421 	 * in our implementation will keep moving between 'ready', set by our
422 	 * transmit function; and and 'done', set by the peer (per protocol)
423 	 * after receiving data.
424 	 * Setup on_trap() protection before accessing dring shared memory area.
425 	 */
426 	rv = LDC_ON_TRAP(&otd);
427 	if (rv != 0) {
428 		/*
429 		 * Data access fault occured down the code path below while
430 		 * accessing the descriptors. Return failure.
431 		 */
432 		goto fail;
433 	}
434 
435 	for (i = 0; i < num_desc; i++) {
436 		txdp = &ldcp->mtxdp[i];
437 		txdp->dstate = VIO_DESC_DONE;
438 	}
439 
440 	(void) LDC_NO_TRAP();
441 	return (VGEN_SUCCESS);
442 
443 fail:
444 	if (ldcp->tx_dring_handle != 0) {
445 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
446 		ldcp->tx_dring_handle = 0;
447 	}
448 	return (VGEN_FAILURE);
449 }
450 
451 /*
452  * Unmap the transmit descriptor ring.
453  */
454 void
455 vgen_unmap_tx_dring(vgen_ldc_t *ldcp)
456 {
457 	/* Unmap mapped tx data area */
458 	if (ldcp->tx_datap != NULL) {
459 		(void) ldc_mem_unmap(ldcp->tx_data_handle);
460 		ldcp->tx_datap = NULL;
461 	}
462 
463 	/* Free tx data area handle */
464 	if (ldcp->tx_data_handle != 0) {
465 		(void) ldc_mem_free_handle(ldcp->tx_data_handle);
466 		ldcp->tx_data_handle = 0;
467 	}
468 
469 	/* Free tx data area cookies */
470 	if (ldcp->tx_data_cookie != NULL) {
471 		kmem_free(ldcp->tx_data_cookie, ldcp->tx_data_ncookies *
472 		    sizeof (ldc_mem_cookie_t));
473 		ldcp->tx_data_cookie = NULL;
474 		ldcp->tx_data_ncookies = 0;
475 	}
476 
477 	/* Unmap peer's dring */
478 	if (ldcp->tx_dring_handle != 0) {
479 		(void) ldc_mem_dring_unmap(ldcp->tx_dring_handle);
480 		ldcp->tx_dring_handle = 0;
481 	}
482 
483 	/* clobber tx ring members */
484 	bzero(&ldcp->tx_dring_cookie, sizeof (ldcp->tx_dring_cookie));
485 	ldcp->mtxdp = NULL;
486 	ldcp->next_txi = ldcp->cur_txi = 0;
487 	ldcp->num_txds = 0;
488 	ldcp->next_txseq = VNET_ISS - 1;
489 	ldcp->resched_peer = B_TRUE;
490 }
491 
492 /*
493  * Map the shared memory data buffer area exported by the peer.
494  */
495 int
496 vgen_map_data(vgen_ldc_t *ldcp, void *pkt)
497 {
498 	int			rv;
499 	vio_dring_reg_ext_msg_t	*emsg;
500 	vio_dring_reg_msg_t	*msg = (vio_dring_reg_msg_t *)pkt;
501 	uint8_t			*buf = (uint8_t *)msg->cookie;
502 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
503 
504 	/* skip over dring cookies */
505 	ASSERT(msg->ncookies == 1);
506 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
507 
508 	emsg = (vio_dring_reg_ext_msg_t *)buf;
509 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
510 		return (VGEN_FAILURE);
511 	}
512 
513 	/* save # of data area cookies */
514 	ldcp->tx_data_ncookies = emsg->data_ncookies;
515 
516 	/* save data area size */
517 	ldcp->tx_data_sz = emsg->data_area_size;
518 
519 	/* allocate ldc mem handle for data area */
520 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &ldcp->tx_data_handle);
521 	if (rv != 0) {
522 		DWARN(vgenp, ldcp, "ldc_mem_alloc_handle() failed: %d\n", rv);
523 		return (VGEN_FAILURE);
524 	}
525 
526 	/* map the data area */
527 	rv = ldc_mem_map(ldcp->tx_data_handle, emsg->data_cookie,
528 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_W,
529 	    (caddr_t *)&ldcp->tx_datap, NULL);
530 	if (rv != 0) {
531 		DWARN(vgenp, ldcp, "ldc_mem_map() failed: %d\n", rv);
532 		(void) ldc_mem_free_handle(ldcp->tx_data_handle);
533 		ldcp->tx_data_handle = 0;
534 		return (VGEN_FAILURE);
535 	}
536 
537 	/* allocate memory for data area cookies */
538 	ldcp->tx_data_cookie = kmem_zalloc(emsg->data_ncookies *
539 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
540 
541 	/* save data area cookies */
542 	bcopy(emsg->data_cookie, ldcp->tx_data_cookie,
543 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
544 
545 	return (VGEN_SUCCESS);
546 }
547 
548 /*
549  * This function transmits normal data frames (non-priority) over the channel.
550  * It queues the frame into the transmit descriptor ring and sends a
551  * VIO_DRING_DATA message if needed, to wake up the peer to (re)start
552  * processing.
553  */
554 int
555 vgen_dringsend_shm(void *arg, mblk_t *mp)
556 {
557 	uint32_t			next_txi;
558 	uint32_t			txi;
559 	vnet_rx_dringdata_desc_t	*txdp;
560 	vnet_rx_dringdata_desc_t	*ntxdp;
561 	struct ether_header		*ehp;
562 	size_t				mblksz;
563 	caddr_t				dst;
564 	mblk_t				*bp;
565 	size_t				size;
566 	uint32_t			buf_offset;
567 	on_trap_data_t			otd;
568 	int				rv = 0;
569 	boolean_t			is_bcast = B_FALSE;
570 	boolean_t			is_mcast = B_FALSE;
571 	vgen_ldc_t			*ldcp = (vgen_ldc_t *)arg;
572 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
573 	vgen_stats_t			*statsp = &ldcp->stats;
574 	vgen_hparams_t			*lp = &ldcp->local_hparams;
575 	boolean_t			resched_peer = B_FALSE;
576 	boolean_t			tx_update = B_FALSE;
577 
578 	/* Drop the packet if ldc is not up or handshake is not done */
579 	if (ldcp->ldc_status != LDC_UP) {
580 		DBG2(vgenp, ldcp, "status(%d), dropping packet\n",
581 		    ldcp->ldc_status);
582 		goto dringsend_shm_exit;
583 	}
584 
585 	if (ldcp->hphase != VH_DONE) {
586 		DWARN(vgenp, ldcp, "hphase(%x), dropping packet\n",
587 		    ldcp->hphase);
588 		goto dringsend_shm_exit;
589 	}
590 
591 	size = msgsize(mp);
592 	if (size > (size_t)lp->mtu) {
593 		DWARN(vgenp, ldcp, "invalid size(%d)\n", size);
594 		goto dringsend_shm_exit;
595 	}
596 	if (size < ETHERMIN)
597 		size = ETHERMIN;
598 
599 	ehp = (struct ether_header *)mp->b_rptr;
600 	is_bcast = IS_BROADCAST(ehp);
601 	is_mcast = IS_MULTICAST(ehp);
602 
603 	/*
604 	 * Setup on_trap() protection before accessing shared memory areas
605 	 * (descriptor and data buffer). Note that we enable this protection a
606 	 * little early and turn it off slightly later, than keeping it enabled
607 	 * strictly at the points in code below where the descriptor and data
608 	 * buffer are accessed. This is done for performance reasons:
609 	 * (a) to avoid calling the trap protection code while holding mutex.
610 	 * (b) to avoid multiple on/off steps for descriptor and data accesses.
611 	 */
612 	rv = LDC_ON_TRAP(&otd);
613 	if (rv != 0) {
614 		/*
615 		 * Data access fault occured down the code path below while
616 		 * accessing either the descriptor or the data buffer. Release
617 		 * any locks that we might have acquired in the code below and
618 		 * return failure.
619 		 */
620 		DERR(vgenp, ldcp, "data access fault occured\n");
621 		statsp->oerrors++;
622 		if (mutex_owned(&ldcp->txlock)) {
623 			mutex_exit(&ldcp->txlock);
624 		}
625 		if (mutex_owned(&ldcp->wrlock)) {
626 			mutex_exit(&ldcp->wrlock);
627 		}
628 		goto dringsend_shm_exit;
629 	}
630 
631 	/*
632 	 * Allocate a descriptor
633 	 */
634 	mutex_enter(&ldcp->txlock);
635 	txi = next_txi = ldcp->next_txi;
636 	INCR_TXI(next_txi, ldcp);
637 	ntxdp = &(ldcp->mtxdp[next_txi]);
638 	if (ntxdp->dstate != VIO_DESC_DONE) { /* out of descriptors */
639 		if (ldcp->tx_blocked == B_FALSE) {
640 			ldcp->tx_blocked_lbolt = ddi_get_lbolt();
641 			ldcp->tx_blocked = B_TRUE;
642 		}
643 		statsp->tx_no_desc++;
644 		mutex_exit(&ldcp->txlock);
645 		(void) LDC_NO_TRAP();
646 		return (VGEN_TX_NORESOURCES);
647 	}
648 
649 	if (ldcp->tx_blocked == B_TRUE) {
650 		ldcp->tx_blocked = B_FALSE;
651 		tx_update = B_TRUE;
652 	}
653 
654 	/* Update descriptor ring index */
655 	ldcp->next_txi = next_txi;
656 	mutex_exit(&ldcp->txlock);
657 
658 	if (tx_update == B_TRUE) {
659 		vio_net_tx_update_t vtx_update =
660 		    ldcp->portp->vcb.vio_net_tx_update;
661 
662 		vtx_update(ldcp->portp->vhp);
663 	}
664 
665 	/* Access the descriptor */
666 	txdp = &(ldcp->mtxdp[txi]);
667 
668 	/* Ensure load ordering of dstate (above) and data_buf_offset. */
669 	MEMBAR_CONSUMER();
670 
671 	/* Get the offset of the buffer to be used */
672 	buf_offset = txdp->data_buf_offset;
673 
674 	/* Access the buffer using the offset */
675 	dst = (caddr_t)ldcp->tx_datap + buf_offset;
676 
677 	/* Copy data into mapped transmit buffer */
678 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
679 		mblksz = MBLKL(bp);
680 		bcopy(bp->b_rptr, dst, mblksz);
681 		dst += mblksz;
682 	}
683 
684 	/* Set the size of data in the descriptor */
685 	txdp->nbytes = size;
686 
687 	/*
688 	 * Ensure store ordering of nbytes and dstate (below); so that the peer
689 	 * sees the right nbytes value after it checks that the dstate is READY.
690 	 */
691 	MEMBAR_PRODUCER();
692 
693 	mutex_enter(&ldcp->wrlock);
694 
695 	/* Mark the descriptor ready */
696 	txdp->dstate = VIO_DESC_READY;
697 
698 	/* Check if peer needs wake up (handled below) */
699 	if (ldcp->resched_peer == B_TRUE) {
700 		ldcp->resched_peer = B_FALSE;
701 		resched_peer = B_TRUE;
702 	}
703 
704 	/* Update tx stats */
705 	statsp->opackets++;
706 	statsp->obytes += size;
707 	if (is_bcast)
708 		statsp->brdcstxmt++;
709 	else if (is_mcast)
710 		statsp->multixmt++;
711 
712 	mutex_exit(&ldcp->wrlock);
713 
714 	/*
715 	 * We are done accessing shared memory; clear trap protection.
716 	 */
717 	(void) LDC_NO_TRAP();
718 
719 	/*
720 	 * Need to wake up the peer ?
721 	 */
722 	if (resched_peer == B_TRUE) {
723 		rv = vgen_send_dringdata_shm(ldcp, (uint32_t)txi, -1);
724 		if (rv != 0) {
725 			/* error: drop the packet */
726 			DWARN(vgenp, ldcp, "failed sending dringdata msg "
727 			    "rv(%d) len(%d)\n", rv, size);
728 			mutex_enter(&ldcp->wrlock);
729 			statsp->oerrors++;
730 			ldcp->resched_peer = B_TRUE;
731 			mutex_exit(&ldcp->wrlock);
732 		}
733 	}
734 
735 dringsend_shm_exit:
736 	if (rv == ECONNRESET || rv == EACCES) {
737 		(void) vgen_handle_evt_reset(ldcp, VGEN_OTHER);
738 	}
739 	freemsg(mp);
740 	return (VGEN_TX_SUCCESS);
741 }
742 
743 /*
744  * Process dring data messages (info/ack/nack)
745  */
746 int
747 vgen_handle_dringdata_shm(void *arg1, void *arg2)
748 {
749 	vgen_ldc_t	*ldcp = (vgen_ldc_t *)arg1;
750 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)arg2;
751 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
752 	int		rv = 0;
753 
754 	switch (tagp->vio_subtype) {
755 
756 	case VIO_SUBTYPE_INFO:
757 		/*
758 		 * To reduce the locking contention, release the
759 		 * cblock here and re-acquire it once we are done
760 		 * receiving packets.
761 		 */
762 		mutex_exit(&ldcp->cblock);
763 		mutex_enter(&ldcp->rxlock);
764 		rv = vgen_handle_dringdata_info_shm(ldcp, tagp);
765 		mutex_exit(&ldcp->rxlock);
766 		mutex_enter(&ldcp->cblock);
767 		if (rv != 0) {
768 			DWARN(vgenp, ldcp, "handle_data_info failed(%d)\n", rv);
769 		}
770 		break;
771 
772 	case VIO_SUBTYPE_ACK:
773 		rv = vgen_handle_dringdata_ack_shm(ldcp, tagp);
774 		if (rv != 0) {
775 			DWARN(vgenp, ldcp, "handle_data_ack failed(%d)\n", rv);
776 		}
777 		break;
778 
779 	case VIO_SUBTYPE_NACK:
780 		rv = vgen_handle_dringdata_nack_shm(ldcp, tagp);
781 		if (rv != 0) {
782 			DWARN(vgenp, ldcp, "handle_data_nack failed(%d)\n", rv);
783 		}
784 		break;
785 	}
786 
787 	return (rv);
788 }
789 
790 static int
791 vgen_handle_dringdata_info_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
792 {
793 	uint32_t	start;
794 	int32_t		end;
795 	int		rv = 0;
796 	vio_dring_msg_t	*dringmsg = (vio_dring_msg_t *)tagp;
797 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
798 	vgen_stats_t	*statsp = &ldcp->stats;
799 
800 	start = dringmsg->start_idx;
801 	end = dringmsg->end_idx;
802 
803 	DBG1(vgenp, ldcp, "INFO: start(%d), end(%d)\n",
804 	    start, end);
805 
806 	if (!(CHECK_RXI(start, ldcp)) ||
807 	    ((end != -1) && !(CHECK_RXI(end, ldcp)))) {
808 		DWARN(vgenp, ldcp, "Invalid Rx start(%d) or end(%d)\n",
809 		    start, end);
810 		/* drop the message if invalid index */
811 		return (0);
812 	}
813 
814 	/* validate dring_ident */
815 	if (dringmsg->dring_ident != ldcp->peer_hparams.dring_ident) {
816 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
817 		    dringmsg->dring_ident);
818 		/* invalid dring_ident, drop the msg */
819 		return (0);
820 	}
821 
822 	statsp->dring_data_msgs_rcvd++;
823 
824 	/*
825 	 * If we are in polling mode, return from here without processing the
826 	 * dring. We will process the dring in the context of polling thread.
827 	 */
828 	if (ldcp->polling_on == B_TRUE) {
829 		return (0);
830 	}
831 
832 	/*
833 	 * Process the dring and receive packets in intr context.
834 	 */
835 	rv = vgen_intr_rcv_shm(ldcp);
836 	if (rv != 0) {
837 		DWARN(vgenp, ldcp, "vgen_intr_rcv_shm() failed\n");
838 	}
839 	return (rv);
840 }
841 
842 /*
843  * Process the rx descriptor ring in the context of interrupt thread
844  * (vgen_ldc_cb() callback) and send the received packets up the stack.
845  */
846 static int
847 vgen_intr_rcv_shm(vgen_ldc_t *ldcp)
848 {
849 	int		rv;
850 	uint32_t	end_ix;
851 	vio_dring_msg_t msg;
852 	uint_t		mblk_sz;
853 	int		count = 0;
854 	int		total_count = 0;
855 	mblk_t		*bp = NULL;
856 	mblk_t		*bpt = NULL;
857 	mblk_t		*mp = NULL;
858 	vio_net_rx_cb_t vrx_cb = ldcp->portp->vcb.vio_net_rx_cb;
859 
860 	ASSERT(MUTEX_HELD(&ldcp->rxlock));
861 
862 	do {
863 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
864 		if (rv != 0) {
865 			if (rv == EINVAL) {
866 				/* Invalid descriptor error; get next */
867 				continue;
868 			}
869 			DTRACE_PROBE1(vgen_intr_nopkts, vgen_ldc_t *, ldcp);
870 			break;
871 		}
872 
873 		/* Build a chain of received packets */
874 		if (bp == NULL) {
875 			/* first pkt */
876 			bp = mp;
877 			bpt = bp;
878 			bpt->b_next = NULL;
879 		} else {
880 			mp->b_next = NULL;
881 			bpt->b_next = mp;
882 			bpt = mp;
883 		}
884 
885 		total_count++;
886 		count++;
887 
888 		/*
889 		 * We are receiving the packets in interrupt context. If we
890 		 * have gathered vgen_chain_len (tunable) # of packets in the
891 		 * chain, send them up. (See vgen_poll_rcv_shm() for receiving
892 		 * in polling thread context).
893 		 */
894 		if (count == vgen_chain_len) {
895 			DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp,
896 			    int, count);
897 			mutex_exit(&ldcp->rxlock);
898 			vrx_cb(ldcp->portp->vhp, bp);
899 			mutex_enter(&ldcp->rxlock);
900 			bp = bpt = NULL;
901 			count = 0;
902 		}
903 
904 		/*
905 		 * Stop further processing if we processed the entire dring
906 		 * once; otherwise continue.
907 		 */
908 	} while (total_count < ldcp->num_rxds);
909 
910 	if (bp != NULL) {
911 		DTRACE_PROBE2(vgen_intr_pkts, vgen_ldc_t *, ldcp, int, count);
912 		mutex_exit(&ldcp->rxlock);
913 		vrx_cb(ldcp->portp->vhp, bp);
914 		mutex_enter(&ldcp->rxlock);
915 	}
916 
917 	if (ldcp->polling_on == B_FALSE) {
918 		/*
919 		 * We send a stopped message to peer (sender) while we are in
920 		 * intr mode only; allowing the peer to send further data intrs
921 		 * (dring data msgs) to us.
922 		 */
923 		end_ix = ldcp->next_rxi;
924 		DECR_RXI(end_ix, ldcp);
925 		msg.dring_ident = ldcp->peer_hparams.dring_ident;
926 		rv = vgen_send_dringack_shm(ldcp, (vio_msg_tag_t *)&msg,
927 		    VNET_START_IDX_UNSPEC, end_ix, VIO_DP_STOPPED);
928 		return (rv);
929 	}
930 
931 	return (0);
932 }
933 
934 /*
935  * Process the rx descriptor ring in the context of mac polling thread. Receive
936  * packets upto the limit specified by bytes_to_pickup or until there are no
937  * more packets, whichever occurs first. Return the chain of received packets.
938  */
939 mblk_t *
940 vgen_poll_rcv_shm(vgen_ldc_t *ldcp, int bytes_to_pickup)
941 {
942 	uint_t		mblk_sz = 0;
943 	uint_t		sz = 0;
944 	mblk_t		*bp = NULL;
945 	mblk_t		*bpt = NULL;
946 	mblk_t		*mp = NULL;
947 	int		count = 0;
948 	int		rv;
949 
950 	mutex_enter(&ldcp->rxlock);
951 
952 	if (ldcp->hphase != VH_DONE) {
953 		/* Channel is being reset and handshake not complete */
954 		mutex_exit(&ldcp->rxlock);
955 		return (NULL);
956 	}
957 
958 	do {
959 		rv = vgen_receive_packet(ldcp, &mp, &mblk_sz);
960 		if (rv != 0) {
961 			if (rv == EINVAL) {
962 				/* Invalid descriptor error; get next */
963 				continue;
964 			}
965 			DTRACE_PROBE1(vgen_poll_nopkts, vgen_ldc_t *, ldcp);
966 			break;
967 		}
968 
969 		/* Build a chain of received packets */
970 		if (bp == NULL) {
971 			/* first pkt */
972 			bp = mp;
973 			bpt = bp;
974 			bpt->b_next = NULL;
975 		} else {
976 			mp->b_next = NULL;
977 			bpt->b_next = mp;
978 			bpt = mp;
979 		}
980 
981 		/* Compute total size accumulated */
982 		sz += mblk_sz;
983 		count++;
984 
985 		/* Reached the bytes limit; we are done. */
986 		if (sz >= bytes_to_pickup) {
987 			break;
988 		}
989 
990 	_NOTE(CONSTCOND)
991 	} while (1);
992 
993 	/*
994 	 * We prepend any high priority packets to the chain of packets; note
995 	 * that if we are already at the bytes_to_pickup limit, we might
996 	 * slightly exceed that in such cases. That should be ok, as these pkts
997 	 * are expected to be small in size and arrive at an interval in the
998 	 * the order of a few seconds.
999 	 */
1000 	if (ldcp->rx_pktdata == vgen_handle_pkt_data &&
1001 	    ldcp->rx_pri_head != NULL) {
1002 		ldcp->rx_pri_tail->b_next = bp;
1003 		bp = ldcp->rx_pri_head;
1004 		ldcp->rx_pri_head = ldcp->rx_pri_tail = NULL;
1005 	}
1006 
1007 	mutex_exit(&ldcp->rxlock);
1008 
1009 	DTRACE_PROBE2(vgen_poll_pkts, vgen_ldc_t *, ldcp, int, count);
1010 	DTRACE_PROBE2(vgen_poll_bytes, vgen_ldc_t *, ldcp, uint_t, sz);
1011 	return (bp);
1012 }
1013 
1014 /*
1015  * Process the next index in the rx dring and receive the associated packet.
1016  *
1017  * Returns:
1018  *	bp:	Success: The received packet.
1019  *		Failure: NULL
1020  *      size:	Success: Size of received packet.
1021  *		Failure: 0
1022  *      retval:
1023  *		Success: 0
1024  *		Failure: EAGAIN: Descriptor not ready
1025  *			 EIO:    Descriptor contents invalid.
1026  */
1027 static int
1028 vgen_receive_packet(vgen_ldc_t *ldcp, mblk_t **bp, uint_t *size)
1029 {
1030 	uint32_t			rxi;
1031 	vio_mblk_t			*vmp;
1032 	vio_mblk_t			*new_vmp;
1033 	struct ether_header		*ehp;
1034 	vnet_rx_dringdata_desc_t	*rxdp;
1035 	int				err = 0;
1036 	uint32_t			nbytes = 0;
1037 	mblk_t				*mp = NULL;
1038 	mblk_t				*dmp = NULL;
1039 	vgen_stats_t			*statsp = &ldcp->stats;
1040 	vgen_hparams_t			*lp = &ldcp->local_hparams;
1041 
1042 	rxi = ldcp->next_rxi;
1043 	rxdp = &(ldcp->rxdp[rxi]);
1044 	vmp = ldcp->rxdp_to_vmp[rxi];
1045 
1046 	if (rxdp->dstate != VIO_DESC_READY) {
1047 		/*
1048 		 * Descriptor is not ready.
1049 		 */
1050 		DTRACE_PROBE1(vgen_noready_rxds, vgen_ldc_t *, ldcp);
1051 		return (EAGAIN);
1052 	}
1053 
1054 	/*
1055 	 * Ensure load ordering of dstate and nbytes.
1056 	 */
1057 	MEMBAR_CONSUMER();
1058 
1059 	nbytes = rxdp->nbytes;
1060 
1061 	if ((nbytes < ETHERMIN) ||
1062 	    (nbytes > lp->mtu) ||
1063 	    (rxdp->data_buf_offset !=
1064 	    (VIO_MBLK_DATA_OFF(vmp) + VNET_IPALIGN))) {
1065 		/*
1066 		 * Descriptor contents invalid.
1067 		 */
1068 		statsp->ierrors++;
1069 		rxdp->dstate = VIO_DESC_DONE;
1070 		err = EIO;
1071 		goto done;
1072 	}
1073 
1074 	/*
1075 	 * Now allocate a new buffer for this descriptor before sending up the
1076 	 * buffer being processed. If that fails, stop processing; as we are
1077 	 * out of receive buffers.
1078 	 */
1079 	new_vmp = vio_allocb(ldcp->rx_vmp);
1080 
1081 	/*
1082 	 * Process the current buffer being received.
1083 	 */
1084 	mp = vmp->mp;
1085 
1086 	if (new_vmp == NULL) {
1087 		/*
1088 		 * We failed to get a new mapped buffer that is needed to
1089 		 * refill the descriptor. In that case, leave the current
1090 		 * buffer bound to the descriptor; allocate an mblk dynamically
1091 		 * and copy the contents of the buffer to the mblk. Then send
1092 		 * up this mblk. This way the sender has the same buffer as
1093 		 * before that can be used to send new data.
1094 		 */
1095 		statsp->norcvbuf++;
1096 		dmp = allocb(nbytes + VNET_IPALIGN, BPRI_MED);
1097 		if (dmp == NULL) {
1098 			statsp->ierrors++;
1099 			return (ENOMEM);
1100 		}
1101 		bcopy(mp->b_rptr + VNET_IPALIGN,
1102 		    dmp->b_rptr + VNET_IPALIGN, nbytes);
1103 		mp = dmp;
1104 	} else {
1105 		/* Mark the status of the current rbuf */
1106 		vmp->state = VIO_MBLK_HAS_DATA;
1107 
1108 		/* Set the offset of the new buffer in the descriptor */
1109 		rxdp->data_buf_offset =
1110 		    VIO_MBLK_DATA_OFF(new_vmp) + VNET_IPALIGN;
1111 		ldcp->rxdp_to_vmp[rxi] = new_vmp;
1112 	}
1113 	mp->b_rptr += VNET_IPALIGN;
1114 	mp->b_wptr = mp->b_rptr + nbytes;
1115 
1116 	/*
1117 	 * Ensure store ordering of data_buf_offset and dstate; so that the
1118 	 * peer sees the right data_buf_offset after it checks that the dstate
1119 	 * is DONE.
1120 	 */
1121 	MEMBAR_PRODUCER();
1122 
1123 	/* Now mark the descriptor 'done' */
1124 	rxdp->dstate = VIO_DESC_DONE;
1125 
1126 	/* Update stats */
1127 	statsp->ipackets++;
1128 	statsp->rbytes += rxdp->nbytes;
1129 	ehp = (struct ether_header *)mp->b_rptr;
1130 	if (IS_BROADCAST(ehp))
1131 		statsp->brdcstrcv++;
1132 	else if (IS_MULTICAST(ehp))
1133 		statsp->multircv++;
1134 done:
1135 	/* Update the next index to be processed */
1136 	INCR_RXI(rxi, ldcp);
1137 
1138 	/* Save the new recv index */
1139 	ldcp->next_rxi = rxi;
1140 
1141 	/* Return the packet received */
1142 	*size = nbytes;
1143 	*bp = mp;
1144 	return (err);
1145 }
1146 
1147 static int
1148 vgen_handle_dringdata_ack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1149 {
1150 	uint32_t			start;
1151 	int32_t				end;
1152 	uint32_t			txi;
1153 	vgen_stats_t			*statsp;
1154 	vnet_rx_dringdata_desc_t	*txdp;
1155 	on_trap_data_t			otd;
1156 	int				rv = 0;
1157 	boolean_t			ready_txd = B_FALSE;
1158 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1159 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1160 
1161 	start = dringmsg->start_idx;
1162 	end = dringmsg->end_idx;
1163 	statsp = &ldcp->stats;
1164 
1165 	/*
1166 	 * Received an ack for our transmits upto a certain dring index. This
1167 	 * enables us to reclaim descriptors. We also send a new dring data msg
1168 	 * to the peer to restart processing if there are pending transmit pkts.
1169 	 */
1170 	DBG2(vgenp, ldcp, "ACK:  start(%d), end(%d)\n", start, end);
1171 
1172 	/*
1173 	 * In RxDringData mode (v1.6), start index of -1 can be used by the
1174 	 * peer to indicate that it is unspecified. However, the end index
1175 	 * must be set correctly indicating the last descriptor index processed.
1176 	 */
1177 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1178 	    !(CHECK_TXI(end, ldcp))) {
1179 		/* drop the message if invalid index */
1180 		DWARN(vgenp, ldcp, "Invalid Tx ack start(%d) or end(%d)\n",
1181 		    start, end);
1182 		return (rv);
1183 	}
1184 
1185 	/* Validate dring_ident */
1186 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1187 		/* invalid dring_ident, drop the msg */
1188 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1189 		    dringmsg->dring_ident);
1190 		return (rv);
1191 	}
1192 	statsp->dring_data_acks_rcvd++;
1193 
1194 	/*
1195 	 * Clear transmit flow control condition
1196 	 * as some descriptors should be free now.
1197 	 */
1198 	mutex_enter(&ldcp->txlock);
1199 	if (ldcp->tx_blocked == B_TRUE) {
1200 		vio_net_tx_update_t vtx_update =
1201 		    ldcp->portp->vcb.vio_net_tx_update;
1202 
1203 		ldcp->tx_blocked = B_FALSE;
1204 		vtx_update(ldcp->portp->vhp);
1205 	}
1206 	mutex_exit(&ldcp->txlock);
1207 
1208 	if (dringmsg->dring_process_state != VIO_DP_STOPPED) {
1209 		/*
1210 		 * Receiver continued processing
1211 		 * dring after sending us the ack.
1212 		 */
1213 		return (rv);
1214 	}
1215 
1216 	/*
1217 	 * Receiver stopped processing descriptors.
1218 	 */
1219 	statsp->dring_stopped_acks_rcvd++;
1220 
1221 	/*
1222 	 * Setup on_trap() protection before accessing dring shared memory area.
1223 	 */
1224 	rv = LDC_ON_TRAP(&otd);
1225 	if (rv != 0) {
1226 		/*
1227 		 * Data access fault occured down the code path below while
1228 		 * accessing the descriptors. Release any locks that we might
1229 		 * have acquired in the code below and return failure.
1230 		 */
1231 		if (mutex_owned(&ldcp->wrlock)) {
1232 			mutex_exit(&ldcp->wrlock);
1233 		}
1234 		return (ECONNRESET);
1235 	}
1236 
1237 	/*
1238 	 * Determine if there are any pending tx descriptors ready to be
1239 	 * processed by the receiver(peer) and if so, send a message to the
1240 	 * peer to restart receiving.
1241 	 */
1242 	mutex_enter(&ldcp->wrlock);
1243 
1244 	ready_txd = B_FALSE;
1245 	txi = end;
1246 	INCR_TXI(txi, ldcp);
1247 	txdp = &ldcp->mtxdp[txi];
1248 	if (txdp->dstate == VIO_DESC_READY) {
1249 		ready_txd = B_TRUE;
1250 	}
1251 
1252 	/*
1253 	 * We are done accessing shared memory; clear trap protection.
1254 	 */
1255 	(void) LDC_NO_TRAP();
1256 
1257 	if (ready_txd == B_FALSE) {
1258 		/*
1259 		 * No ready tx descriptors. Set the flag to send a message to
1260 		 * the peer when tx descriptors are ready in transmit routine.
1261 		 */
1262 		ldcp->resched_peer = B_TRUE;
1263 		mutex_exit(&ldcp->wrlock);
1264 		return (rv);
1265 	}
1266 
1267 	/*
1268 	 * We have some tx descriptors ready to be processed by the receiver.
1269 	 * Send a dring data message to the peer to restart processing.
1270 	 */
1271 	ldcp->resched_peer = B_FALSE;
1272 	mutex_exit(&ldcp->wrlock);
1273 	rv = vgen_send_dringdata_shm(ldcp, txi, -1);
1274 	if (rv != VGEN_SUCCESS) {
1275 		mutex_enter(&ldcp->wrlock);
1276 		ldcp->resched_peer = B_TRUE;
1277 		mutex_exit(&ldcp->wrlock);
1278 	}
1279 
1280 	return (rv);
1281 }
1282 
1283 static int
1284 vgen_handle_dringdata_nack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
1285 {
1286 	uint32_t			start;
1287 	int32_t				end;
1288 	uint32_t			txi;
1289 	vnet_rx_dringdata_desc_t	*txdp;
1290 	on_trap_data_t			otd;
1291 	int				rv = 0;
1292 	vgen_t				*vgenp = LDC_TO_VGEN(ldcp);
1293 	vio_dring_msg_t			*dringmsg = (vio_dring_msg_t *)tagp;
1294 
1295 	DBG1(vgenp, ldcp, "enter\n");
1296 	start = dringmsg->start_idx;
1297 	end = dringmsg->end_idx;
1298 
1299 	/*
1300 	 * Peer sent a NACK msg (to indicate bad descriptors ?). The start and
1301 	 * end correspond to the range of descriptors which are being nack'd.
1302 	 */
1303 	DWARN(vgenp, ldcp, "NACK: start(%d), end(%d)\n", start, end);
1304 
1305 	/*
1306 	 * In RxDringData mode (v1.6), start index of -1 can be used by
1307 	 * the peer to indicate that it is unspecified. However, the end index
1308 	 * must be set correctly indicating the last descriptor index processed.
1309 	 */
1310 	if (((start != VNET_START_IDX_UNSPEC) && !(CHECK_TXI(start, ldcp))) ||
1311 	    !(CHECK_TXI(end, ldcp))) {
1312 		/* drop the message if invalid index */
1313 		DWARN(vgenp, ldcp, "Invalid Tx nack start(%d) or end(%d)\n",
1314 		    start, end);
1315 		return (rv);
1316 	}
1317 
1318 	/* Validate dring_ident */
1319 	if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
1320 		/* invalid dring_ident, drop the msg */
1321 		DWARN(vgenp, ldcp, "Invalid dring ident 0x%x\n",
1322 		    dringmsg->dring_ident);
1323 		return (rv);
1324 	}
1325 
1326 	/*
1327 	 * Setup on_trap() protection before accessing dring shared memory area.
1328 	 */
1329 	rv = LDC_ON_TRAP(&otd);
1330 	if (rv != 0) {
1331 		/*
1332 		 * Data access fault occured down the code path below while
1333 		 * accessing the descriptors. Release any locks that we might
1334 		 * have acquired in the code below and return failure.
1335 		 */
1336 		mutex_exit(&ldcp->txlock);
1337 		return (ECONNRESET);
1338 	}
1339 
1340 	/* We just mark the descrs as free so they can be reused */
1341 	mutex_enter(&ldcp->txlock);
1342 	for (txi = start; txi <= end; ) {
1343 		txdp = &(ldcp->mtxdp[txi]);
1344 		if (txdp->dstate == VIO_DESC_READY)
1345 			txdp->dstate = VIO_DESC_DONE;
1346 		INCR_TXI(txi, ldcp);
1347 	}
1348 
1349 	/*
1350 	 * We are done accessing shared memory; clear trap protection.
1351 	 */
1352 	(void) LDC_NO_TRAP();
1353 
1354 	mutex_exit(&ldcp->txlock);
1355 
1356 	return (rv);
1357 }
1358 
1359 /*
1360  * Send descriptor ring data message to the peer over LDC.
1361  */
1362 static int
1363 vgen_send_dringdata_shm(vgen_ldc_t *ldcp, uint32_t start, int32_t end)
1364 {
1365 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1366 	vio_dring_msg_t	dringmsg, *msgp = &dringmsg;
1367 	vio_msg_tag_t	*tagp = &msgp->tag;
1368 	vgen_stats_t	*statsp = &ldcp->stats;
1369 	int		rv;
1370 
1371 #ifdef DEBUG
1372 	if (vgen_inject_error(ldcp, VGEN_ERR_TXTIMEOUT)) {
1373 		return (VGEN_SUCCESS);
1374 	}
1375 #endif
1376 	bzero(msgp, sizeof (*msgp));
1377 
1378 	tagp->vio_msgtype = VIO_TYPE_DATA;
1379 	tagp->vio_subtype = VIO_SUBTYPE_INFO;
1380 	tagp->vio_subtype_env = VIO_DRING_DATA;
1381 	tagp->vio_sid = ldcp->local_sid;
1382 
1383 	msgp->dring_ident = ldcp->local_hparams.dring_ident;
1384 	msgp->start_idx = start;
1385 	msgp->end_idx = end;
1386 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1387 
1388 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (dringmsg));
1389 	if (rv != VGEN_SUCCESS) {
1390 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1391 		return (rv);
1392 	}
1393 
1394 	statsp->dring_data_msgs_sent++;
1395 
1396 	DBG2(vgenp, ldcp, "DRING_DATA_SENT \n");
1397 
1398 	return (VGEN_SUCCESS);
1399 }
1400 
1401 /*
1402  * Send dring data ack message.
1403  */
1404 int
1405 vgen_send_dringack_shm(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, uint32_t start,
1406     int32_t end, uint8_t pstate)
1407 {
1408 	int		rv = 0;
1409 	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
1410 	vio_dring_msg_t	*msgp = (vio_dring_msg_t *)tagp;
1411 	vgen_stats_t	*statsp = &ldcp->stats;
1412 
1413 	tagp->vio_msgtype = VIO_TYPE_DATA;
1414 	tagp->vio_subtype = VIO_SUBTYPE_ACK;
1415 	tagp->vio_subtype_env = VIO_DRING_DATA;
1416 	tagp->vio_sid = ldcp->local_sid;
1417 	msgp->start_idx = start;
1418 	msgp->end_idx = end;
1419 	msgp->dring_process_state = pstate;
1420 	msgp->seq_num = atomic_inc_32_nv(&ldcp->dringdata_msgid);
1421 
1422 	rv = vgen_sendmsg_shm(ldcp, (caddr_t)tagp, sizeof (*msgp));
1423 	if (rv != VGEN_SUCCESS) {
1424 		DWARN(vgenp, ldcp, "vgen_sendmsg_shm() failed\n");
1425 	}
1426 
1427 	statsp->dring_data_acks_sent++;
1428 	if (pstate == VIO_DP_STOPPED) {
1429 		statsp->dring_stopped_acks_sent++;
1430 	}
1431 
1432 	return (rv);
1433 }
1434 
1435 /*
1436  * Send dring data msgs (info/ack/nack) over LDC.
1437  */
1438 static int
1439 vgen_sendmsg_shm(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen)
1440 {
1441 	int			rv;
1442 	size_t			len;
1443 	uint32_t		retries = 0;
1444 	vgen_t			*vgenp = LDC_TO_VGEN(ldcp);
1445 
1446 	len = msglen;
1447 	if ((len == 0) || (msg == NULL))
1448 		return (VGEN_FAILURE);
1449 
1450 	do {
1451 		len = msglen;
1452 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msg, &len);
1453 		if (retries++ >= vgen_ldcwr_retries)
1454 			break;
1455 	} while (rv == EWOULDBLOCK);
1456 
1457 	if (rv != 0) {
1458 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen(%d)\n",
1459 		    rv, msglen);
1460 		return (rv);
1461 	}
1462 
1463 	if (len != msglen) {
1464 		DWARN(vgenp, ldcp, "ldc_write failed: rv(%d) msglen (%d)\n",
1465 		    rv, msglen);
1466 		return (VGEN_FAILURE);
1467 	}
1468 
1469 	return (VGEN_SUCCESS);
1470 }
1471