xref: /titanic_41/usr/src/uts/common/io/bge/bge_send.c (revision 1d4b38e0077763e7c9b20768eacb841957e787bc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "sys/bge_impl2.h"
30 
31 
32 /*
33  * The transmit-side code uses an allocation process which is similar
34  * to some theme park roller-coaster rides, where riders sit in cars
35  * that can go individually, but work better in a train.
36  *
37  * 1)	RESERVE a place - this doesn't refer to any specific car or
38  *	seat, just that you will get a ride.  The attempt to RESERVE a
39  *	place can fail if all spaces in all cars are already committed.
40  *
41  * 2)	Prepare yourself; this may take an arbitrary (but not unbounded)
42  *	time, and you can back out at this stage, in which case you must
43  *	give up (RENOUNCE) your place.
44  *
45  * 3)	CLAIM your space - a specific car (the next sequentially
46  *	numbered one) is allocated at this stage, and is guaranteed
47  *	to be part of the next train to depart.  Once you've done
48  *	this, you can't back out, nor wait for any external event
49  *	or resource.
50  *
51  * 4)	Occupy your car - when all CLAIMED cars are OCCUPIED, they
52  *	all depart together as a single train!
53  *
54  * 5)	At the end of the ride, you climb out of the car and RENOUNCE
55  *	your right to it, so that it can be recycled for another rider.
56  *
57  * For each rider, these have to occur in this order, but the riders
58  * don't have to stay in the same order at each stage.  In particular,
59  * they may overtake each other between RESERVING a place and CLAIMING
60  * it, or between CLAIMING and OCCUPYING a space.
61  *
62  * Once a car is CLAIMED, the train currently being assembled can't go
63  * without that car (this guarantees that the cars in a single train
64  * make up a consecutively-numbered set).  Therefore, when any train
65  * leaves, we know there can't be any riders in transit between CLAIMING
66  * and OCCUPYING their cars.  There can be some who have RESERVED but
67  * not yet CLAIMED their places.  That's OK, though, because they'll go
68  * into the next train.
69  */
70 
71 #define	BGE_DBG		BGE_DBG_SEND	/* debug flag for this code	*/
72 
73 
74 /*
75  * ========== Send-side recycle routines ==========
76  */
77 
78 /*
79  * Recycle all the completed buffers in the specified send ring up to
80  * (but not including) the consumer index in the status block.
81  *
82  * This function must advance (srp->tc_next) AND adjust (srp->tx_free)
83  * to account for the packets it has recycled.
84  *
85  * This is a trivial version that just does that and nothing more, but
86  * it suffices while there's only one method for sending messages (by
87  * copying) and that method doesn't need any special per-buffer action
88  * for recycling.
89  */
90 static void bge_recycle_ring(bge_t *bgep, send_ring_t *srp);
91 #pragma	inline(bge_recycle_ring)
92 
93 static void
94 bge_recycle_ring(bge_t *bgep, send_ring_t *srp)
95 {
96 	uint64_t slot;
97 	uint64_t n;
98 
99 	_NOTE(ARGUNUSED(bgep))
100 
101 	ASSERT(mutex_owned(srp->tc_lock));
102 
103 	slot = *srp->cons_index_p;			/* volatile	*/
104 	n = slot - srp->tc_next;
105 	if (slot < srp->tc_next)
106 		n += srp->desc.nslots;
107 
108 	/*
109 	 * We're about to release one or more places :-)
110 	 * These ASSERTions check that our invariants still hold:
111 	 *	there must always be at least one free place
112 	 *	at this point, there must be at least one place NOT free
113 	 *	we're not about to free more places than were claimed!
114 	 */
115 	ASSERT(srp->tx_free > 0);
116 
117 	srp->tc_next = slot;
118 	bge_atomic_renounce(&srp->tx_free, n);
119 
120 	/*
121 	 * Reset the watchdog count: to 0 if all buffers are
122 	 * now free, or to 1 if some are still outstanding.
123 	 * Note: non-synchonised access here means we may get
124 	 * the "wrong" answer, but only in a harmless fashion
125 	 * (i.e. we deactivate the watchdog because all buffers
126 	 * are apparently free, even though another thread may
127 	 * have claimed one before we leave here; in this case
128 	 * the watchdog will restart on the next send() call).
129 	 */
130 	bgep->watchdog = srp->tx_free == srp->desc.nslots ? 0 : 1;
131 }
132 
133 /*
134  * Recycle all returned slots in all rings.
135  *
136  * To give priority to low-numbered rings, whenever we have recycled any
137  * slots in any ring except 0, we restart scanning again from ring 0.
138  * Thus, for example, if rings 0, 3, and 10 are carrying traffic, the
139  * pattern of recycles might go 0, 3, 10, 3, 0, 10, 0:
140  *
141  *	0	found some - recycle them
142  *	1..2					none found
143  *	3	found some - recycle them	and restart scan
144  *	0..9					none found
145  *	10	found some - recycle them	and restart scan
146  *	0..2					none found
147  *	3	found some more - recycle them	and restart scan
148  *	0	found some more - recycle them
149  *	0..9					none found
150  *	10	found some more - recycle them	and restart scan
151  *	0	found some more - recycle them
152  *	1..15					none found
153  *
154  * The routine returns only when a complete scan has been performed
155  * without finding any slots to recycle.
156  *
157  * Note: the expression (BGE_SEND_RINGS_USED > 1) yields a compile-time
158  * constant and allows the compiler to optimise away the outer do-loop
159  * if only one send ring is being used.
160  */
161 void bge_recycle(bge_t *bgep, bge_status_t *bsp);
162 #pragma	no_inline(bge_recycle)
163 
164 void
165 bge_recycle(bge_t *bgep, bge_status_t *bsp)
166 {
167 	send_ring_t *srp;
168 	uint64_t ring;
169 	uint64_t tx_rings = bgep->chipid.tx_rings;
170 
171 restart:
172 	ring = 0;
173 	srp = &bgep->send[ring];
174 	do {
175 		/*
176 		 * For each ring, (srp->cons_index_p) points to the
177 		 * proper index within the status block (which has
178 		 * already been sync'd by the caller).
179 		 */
180 		ASSERT(srp->cons_index_p == SEND_INDEX_P(bsp, ring));
181 
182 		if (*srp->cons_index_p == srp->tc_next)
183 			continue;		/* no slots to recycle	*/
184 
185 		mutex_enter(srp->tc_lock);
186 		bge_recycle_ring(bgep, srp);
187 		mutex_exit(srp->tc_lock);
188 
189 		if (bgep->resched_needed && !bgep->resched_running) {
190 			bgep->resched_running = B_TRUE;
191 			ddi_trigger_softintr(bgep->resched_id);
192 		}
193 		/*
194 		 * Restart from ring 0, if we're not on ring 0 already.
195 		 * As H/W selects send BDs totally based on priority and
196 		 * available BDs on the higher priority ring are always
197 		 * selected first, driver should keep consistence with H/W
198 		 * and gives lower-numbered ring with higher priority.
199 		 */
200 		if (tx_rings > 1 && ring > 0)
201 			goto restart;
202 
203 		/*
204 		 * Loop over all rings (if there *are* multiple rings)
205 		 */
206 	} while (++srp, ++ring < tx_rings);
207 }
208 
209 
210 /*
211  * ========== Send-side transmit routines ==========
212  */
213 
214 /*
215  * CLAIM an already-reserved place on the next train
216  *
217  * This is the point of no return!
218  */
219 static uint64_t bge_send_claim(bge_t *bgep, send_ring_t *srp);
220 #pragma	inline(bge_send_claim)
221 
222 static uint64_t
223 bge_send_claim(bge_t *bgep, send_ring_t *srp)
224 {
225 	uint64_t slot;
226 
227 	mutex_enter(srp->tx_lock);
228 	atomic_add_64(&srp->tx_flow, 1);
229 	slot = bge_atomic_claim(&srp->tx_next, srp->desc.nslots);
230 	mutex_exit(srp->tx_lock);
231 
232 	/*
233 	 * Bump the watchdog counter, thus guaranteeing that it's
234 	 * nonzero (watchdog activated).  Note that non-synchonised
235 	 * access here means we may race with the reclaim() code
236 	 * above, but the outcome will be harmless.  At worst, the
237 	 * counter may not get reset on a partial reclaim; but the
238 	 * large trigger threshold makes false positives unlikely
239 	 */
240 	bgep->watchdog += 1;
241 
242 	return (slot);
243 }
244 
#define	TCP_CKSUM_OFFSET	16
#define	UDP_CKSUM_OFFSET	6

/*
 * Write the folded pseudo-header sum into the TCP/UDP checksum field
 * of an outgoing frame.
 *
 * <buf> points to the start of the frame: an ethernet header (without
 * VLAN tag) followed immediately by an IPv4 header.  The sum covers
 * the payload length (IP total length minus IP header length), the
 * protocol number, and the source and destination addresses.  It is
 * stored in network byte order at the TCP checksum offset if the
 * protocol is TCP, and at the UDP checksum offset otherwise.
 *
 * The result is stored one byte at a time, rather than through a
 * (uint16_t *) cast, so that the store makes no assumption about the
 * alignment of the buffer and does no type-punning.
 */
static void
bge_pseudo_cksum(uint8_t *buf)
{
	uint8_t *iph;
	uint8_t *ckp;
	uint32_t cksum;
	uint16_t iphl;
	uint16_t proto;
	int i;

	/*
	 * Step over the ethernet header to the IP header.
	 */
	iph = buf + sizeof (struct ether_header);

	/*
	 * Header length is held in 32-bit words in the low nibble.
	 */
	iphl = 4 * (iph[0] & 0xF);
	proto = iph[9];

	/*
	 * Payload length, protocol, then the four 16-bit words that
	 * make up the source and destination addresses (bytes 12..19).
	 */
	cksum = (((uint16_t)iph[2]) << 8) + iph[3] - iphl;
	cksum += proto;
	for (i = 12; i < 20; i += 2)
		cksum += (((uint16_t)iph[i]) << 8) + iph[i + 1];

	/*
	 * Fold the carries back in; two rounds are enough to bring
	 * the result down to 16 bits.
	 */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	cksum = (cksum >> 16) + (cksum & 0xFFFF);

	/*
	 * Locate the checksum field in the TCP/UDP header and store
	 * the result, most significant byte first.
	 */
	ckp = iph + iphl;
	ckp += (proto == IPPROTO_TCP) ? TCP_CKSUM_OFFSET : UDP_CKSUM_OFFSET;

	ckp[0] = (uint8_t)(cksum >> 8);
	ckp[1] = (uint8_t)cksum;
}
282 
283 /*
284  * Send a message by copying it into a preallocated (and premapped) buffer
285  */
286 static enum send_status bge_send_copy(bge_t *bgep, mblk_t *mp,
287 	send_ring_t *srp, uint16_t tci);
288 #pragma	inline(bge_send_copy)
289 
290 static enum send_status
291 bge_send_copy(bge_t *bgep, mblk_t *mp, send_ring_t *srp, uint16_t tci)
292 {
293 	bge_sbd_t *hw_sbd_p;
294 	sw_sbd_t *ssbdp;
295 	mblk_t *bp;
296 	char *txb;
297 	uint64_t slot;
298 	size_t totlen;
299 	size_t mblen;
300 	uint32_t pflags;
301 
302 	BGE_TRACE(("bge_send_copy($%p, $%p, $%p, 0x%x)",
303 		(void *)bgep, (void *)mp, (void *)srp));
304 
305 	/*
306 	 * IMPORTANT:
307 	 *	Up to the point where it claims a place, a send_msg()
308 	 *	routine can indicate failure by returning SEND_FAIL.
309 	 *	Once it's claimed a place, it mustn't fail.
310 	 *
311 	 * In this version, there's no setup to be done here, and there's
312 	 * nothing that can fail, so we can go straight to claiming our
313 	 * already-reserved place on the train.
314 	 *
315 	 * This is the point of no return!
316 	 */
317 	slot = bge_send_claim(bgep, srp);
318 	ssbdp = &srp->sw_sbds[slot];
319 
320 	/*
321 	 * Copy the data into a pre-mapped buffer, which avoids the
322 	 * overhead (and complication) of mapping/unmapping STREAMS
323 	 * buffers and keeping hold of them until the DMA has completed.
324 	 *
325 	 * Because all buffers are the same size, and larger than the
326 	 * longest single valid message, we don't have to bother about
327 	 * splitting the message across multiple buffers either.
328 	 */
329 	txb = DMA_VPTR(ssbdp->pbuf);
330 	totlen = 0;
331 	bp = mp;
332 	if (tci != 0) {
333 		mblen = bp->b_wptr - bp->b_rptr;
334 
335 		ASSERT(mblen >= 2 * ETHERADDRL + VLAN_TAGSZ);
336 
337 		bcopy(bp->b_rptr, txb, 2 * ETHERADDRL);
338 		txb += 2 * ETHERADDRL;
339 		totlen = 2 * ETHERADDRL;
340 
341 		if (mblen -= 2 * ETHERADDRL + VLAN_TAGSZ) {
342 			if ((totlen += mblen) <= bgep->chipid.ethmax_size) {
343 				bcopy(bp->b_wptr-mblen, txb, mblen);
344 				txb += mblen;
345 			}
346 		}
347 		bp = bp->b_cont;
348 	}
349 	for (; bp != NULL; bp = bp->b_cont) {
350 		mblen = bp->b_wptr - bp->b_rptr;
351 		if ((totlen += mblen) <= bgep->chipid.ethmax_size) {
352 			bcopy(bp->b_rptr, txb, mblen);
353 			txb += mblen;
354 		}
355 	}
356 
357 	/*
358 	 * We've reached the end of the chain; and we should have
359 	 * collected no more than ETHERMAX bytes into our buffer.
360 	 */
361 	ASSERT(bp == NULL);
362 	ASSERT(totlen <= bgep->chipid.ethmax_size);
363 	DMA_SYNC(ssbdp->pbuf, DDI_DMA_SYNC_FORDEV);
364 
365 	/*
366 	 * Update the hardware send buffer descriptor; then we're done.
367 	 * The return status indicates that the message can be freed
368 	 * right away, as we've already copied the contents ...
369 	 */
370 	hw_sbd_p = DMA_VPTR(ssbdp->desc);
371 	hw_sbd_p->host_buf_addr = ssbdp->pbuf.cookie.dmac_laddress;
372 	hw_sbd_p->len = totlen;
373 	hw_sbd_p->flags = SBD_FLAG_PACKET_END;
374 	if (tci != 0) {
375 		hw_sbd_p->vlan_tci = tci;
376 		hw_sbd_p->flags |= SBD_FLAG_VLAN_TAG;
377 	}
378 
379 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
380 	if (pflags & HCK_IPV4_HDRCKSUM)
381 		hw_sbd_p->flags |= SBD_FLAG_IP_CKSUM;
382 	if (pflags & HCK_FULLCKSUM) {
383 		hw_sbd_p->flags |= SBD_FLAG_TCP_UDP_CKSUM;
384 		if (bgep->chipid.flags & CHIP_FLAG_PARTIAL_CSUM)
385 			bge_pseudo_cksum((uint8_t *)DMA_VPTR(ssbdp->pbuf));
386 	}
387 
388 	return (SEND_FREE);
389 }
390 
391 static boolean_t
392 bge_send(bge_t *bgep, mblk_t *mp)
393 {
394 	send_ring_t *srp;
395 	enum send_status status;
396 	struct ether_vlan_header *ehp;
397 	boolean_t need_strip = B_FALSE;
398 	bge_status_t *bsp;
399 	uint16_t tci;
400 	uint_t ring = 0;
401 
402 	ASSERT(mp->b_next == NULL);
403 
404 	/*
405 	 * Determine if the packet is VLAN tagged.
406 	 */
407 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
408 	ehp = (struct ether_vlan_header *)mp->b_rptr;
409 
410 	if (ehp->ether_tpid == htons(VLAN_TPID)) {
411 		if (MBLKL(mp) < sizeof (struct ether_vlan_header)) {
412 			uint32_t pflags;
413 
414 			/*
415 			 * Need to preserve checksum flags across pullup.
416 			 */
417 			hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL,
418 			    NULL, &pflags);
419 
420 			if (!pullupmsg(mp,
421 			    sizeof (struct ether_vlan_header))) {
422 				BGE_DEBUG(("bge_send: pullup failure"));
423 				bgep->resched_needed = B_TRUE;
424 				return (B_FALSE);
425 			}
426 
427 			(void) hcksum_assoc(mp, NULL, NULL, NULL, NULL, NULL,
428 			    NULL, pflags, KM_NOSLEEP);
429 		}
430 
431 		ehp = (struct ether_vlan_header *)mp->b_rptr;
432 		need_strip = B_TRUE;
433 	}
434 
435 	/*
436 	 * Try to reserve a place in the chosen ring. Shouldn't try next
437 	 * higher-numbered (lower-priority) ring, if there aren't any
438 	 * available. Otherwise, packets with same priority may get
439 	 * transmission starvation.
440 	 */
441 	srp = &bgep->send[ring];
442 	if (!bge_atomic_reserve(&srp->tx_free, 1)) {
443 		BGE_DEBUG(("bge_send: no free slots"));
444 		bgep->resched_needed = B_TRUE;
445 		return (B_FALSE);
446 	}
447 
448 	/*
449 	 * Now that we know that there is space to transmit the packet
450 	 * strip any VLAN tag that is present.
451 	 */
452 	if (need_strip) {
453 		tci = ntohs(ehp->ether_tci);
454 	} else {
455 		tci = 0;
456 	}
457 
458 	if (srp->tx_free <= 16) {
459 		bsp = DMA_VPTR(bgep->status_block);
460 		bge_recycle(bgep, bsp);
461 	}
462 	/*
463 	 * We've reserved a place :-)
464 	 * These ASSERTions check that our invariants still hold:
465 	 *	there must still be at least one free place
466 	 *	there must be at least one place NOT free (ours!)
467 	 */
468 	ASSERT(srp->tx_free > 0);
469 
470 	if ((status = bge_send_copy(bgep, mp, srp, tci)) == SEND_FAIL) {
471 		/*
472 		 * The send routine failed :(  So we have to renounce
473 		 * our reservation before returning the error.
474 		 */
475 		bge_atomic_renounce(&srp->tx_free, 1);
476 		bgep->resched_needed = B_TRUE;
477 		return (B_FALSE);
478 	}
479 
480 	/*
481 	 * The send routine succeeded; it will have updated the
482 	 * h/w ring descriptor, and the <tx_next> and <tx_flow>
483 	 * counters.
484 	 *
485 	 * Because there can be multiple concurrent threads in
486 	 * transit through this code, we only want to prod the
487 	 * hardware once the last one is departing ...
488 	 */
489 	mutex_enter(srp->tx_lock);
490 	if (--srp->tx_flow == 0) {
491 		DMA_SYNC(srp->desc, DDI_DMA_SYNC_FORDEV);
492 		bge_mbx_put(bgep, srp->chip_mbx_reg, srp->tx_next);
493 		if (bge_check_acc_handle(bgep, bgep->io_handle) != DDI_FM_OK)
494 			bgep->bge_chip_state = BGE_CHIP_ERROR;
495 	}
496 	mutex_exit(srp->tx_lock);
497 
498 	if (status == SEND_FREE)
499 		freemsg(mp);
500 	return (B_TRUE);
501 }
502 
503 uint_t
504 bge_reschedule(caddr_t arg)
505 {
506 	bge_t *bgep;
507 
508 	bgep = (bge_t *)arg;
509 
510 	BGE_TRACE(("bge_reschedule($%p)", (void *)bgep));
511 
512 	if (bgep->bge_mac_state == BGE_MAC_STARTED && bgep->resched_needed) {
513 		mac_tx_update(bgep->macp);
514 		bgep->resched_needed = B_FALSE;
515 		bgep->resched_running = B_FALSE;
516 	}
517 
518 	return (DDI_INTR_CLAIMED);
519 }
520 
521 /*
522  * bge_m_tx() - send a chain of packets
523  */
524 mblk_t *
525 bge_m_tx(void *arg, mblk_t *mp)
526 {
527 	bge_t *bgep = arg;		/* private device info	*/
528 	mblk_t *next;
529 
530 	BGE_TRACE(("bge_m_tx($%p, $%p)", arg, (void *)mp));
531 
532 	ASSERT(mp != NULL);
533 	ASSERT(bgep->bge_mac_state == BGE_MAC_STARTED);
534 
535 	if (bgep->bge_chip_state != BGE_CHIP_RUNNING) {
536 		BGE_DEBUG(("bge_m_tx: chip not running"));
537 		return (mp);
538 	}
539 
540 	rw_enter(bgep->errlock, RW_READER);
541 	while (mp != NULL) {
542 		next = mp->b_next;
543 		mp->b_next = NULL;
544 
545 		if (!bge_send(bgep, mp)) {
546 			mp->b_next = next;
547 			break;
548 		}
549 
550 		mp = next;
551 	}
552 	rw_exit(bgep->errlock);
553 
554 	return (mp);
555 }
556