xref: /freebsd/sys/dev/sume/if_sume.c (revision ec994981447e8a974426660b5071bc405280af73)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Bjoern A. Zeeb
5  * Copyright (c) 2020 Denis Salopek
6  *
7  * This software was developed by SRI International and the University of
8  * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-11-C-0249
9  * ("MRC2"), as part of the DARPA MRC research programme.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #include <sys/param.h>
35 #include <sys/bus.h>
36 #include <sys/endian.h>
37 #include <sys/kernel.h>
38 #include <sys/limits.h>
39 #include <sys/module.h>
40 #include <sys/rman.h>
41 #include <sys/socket.h>
42 #include <sys/sockio.h>
43 #include <sys/sysctl.h>
44 #include <sys/taskqueue.h>
45 
46 #include <net/if.h>
47 #include <net/if_media.h>
48 #include <net/if_types.h>
49 #include <net/if_var.h>
50 
51 #include <netinet/in.h>
52 #include <netinet/if_ether.h>
53 
54 #include <dev/pci/pcivar.h>
55 #include <dev/pci/pcireg.h>
56 
57 #include <machine/bus.h>
58 
59 #include "adapter.h"
60 
61 #define	PCI_VENDOR_ID_XILINX	0x10ee
62 #define	PCI_DEVICE_ID_SUME	0x7028
63 
64 /* SUME bus driver interface */
65 static int sume_probe(device_t);
66 static int sume_attach(device_t);
67 static int sume_detach(device_t);
68 
69 static device_method_t sume_methods[] = {
70 	DEVMETHOD(device_probe,		sume_probe),
71 	DEVMETHOD(device_attach,	sume_attach),
72 	DEVMETHOD(device_detach,	sume_detach),
73 	DEVMETHOD_END
74 };
75 
76 static driver_t sume_driver = {
77 	"sume",
78 	sume_methods,
79 	sizeof(struct sume_adapter)
80 };
81 
82 /*
83  * The DMA engine for SUME generates interrupts for each RX/TX transaction.
84  * Depending on the channel (0 if packet transaction, 1 if register transaction)
85  * the used bits of the interrupt vector will be the lowest or the second lowest
86  * 5 bits.
87  *
88  * When receiving packets from SUME (RX):
89  * (1) SUME received a packet on one of the interfaces.
90  * (2) SUME generates an interrupt vector, bit 00001 is set (channel 0 - new RX
91  *     transaction).
92  * (3) We read the length of the incoming packet and the offset along with the
93  *     'last' flag from the SUME registers.
94  * (4) We prepare for the DMA transaction by setting the bouncebuffer on the
95  *     address buf_addr. For now, this is how it's done:
96  *     - First 3*sizeof(uint32_t) bytes are: lower and upper 32 bits of physical
97  *     address where we want the data to arrive (buf_addr[0] and buf_addr[1]),
98  *     and length of incoming data (buf_addr[2]).
99  *     - Data will start right after, at buf_addr+3*sizeof(uint32_t). The
100  *     physical address buf_hw_addr is a block of contiguous memory mapped to
101  *     buf_addr, so we can set the incoming data's physical address (buf_addr[0]
102  *     and buf_addr[1]) to buf_hw_addr+3*sizeof(uint32_t).
103  * (5) We notify SUME that the bouncebuffer is ready for the transaction by
104  *     writing the lower/upper physical address buf_hw_addr to the SUME
105  *     registers RIFFA_TX_SG_ADDR_LO_REG_OFF and RIFFA_TX_SG_ADDR_HI_REG_OFF as
106  *     well as the number of segments to the register RIFFA_TX_SG_LEN_REG_OFF.
107  * (6) SUME generates an interrupt vector, bit 00010 is set (channel 0 -
108  *     bouncebuffer received).
109  * (7) SUME generates an interrupt vector, bit 00100 is set (channel 0 -
110  *     transaction is done).
111  * (8) SUME can do both steps (6) and (7) using the same interrupt.
 * (9) We read the first 16 bytes (metadata) of the received data and note the
113  *     incoming interface so we can later forward it to the right one in the OS
114  *     (sume0, sume1, sume2 or sume3).
115  * (10) We create an mbuf and copy the data from the bouncebuffer to the mbuf
116  *     and set the mbuf rcvif to the incoming interface.
117  * (11) We forward the mbuf to the appropriate interface via ifp->if_input.
118  *
119  * When sending packets to SUME (TX):
120  * (1) The OS calls sume_if_start() function on TX.
121  * (2) We get the mbuf packet data and copy it to the
122  *     buf_addr+3*sizeof(uint32_t) + metadata 16 bytes.
123  * (3) We create the metadata based on the output interface and copy it to the
124  *     buf_addr+3*sizeof(uint32_t).
125  * (4) We write the offset/last and length of the packet to the SUME registers
126  *     RIFFA_RX_OFFLAST_REG_OFF and RIFFA_RX_LEN_REG_OFF.
127  * (5) We fill the bouncebuffer by filling the first 3*sizeof(uint32_t) bytes
128  *     with the physical address and length just as in RX step (4).
129  * (6) We notify SUME that the bouncebuffer is ready by writing to SUME
130  *     registers RIFFA_RX_SG_ADDR_LO_REG_OFF, RIFFA_RX_SG_ADDR_HI_REG_OFF and
131  *     RIFFA_RX_SG_LEN_REG_OFF just as in RX step (5).
132  * (7) SUME generates an interrupt vector, bit 01000 is set (channel 0 -
133  *     bouncebuffer is read).
134  * (8) SUME generates an interrupt vector, bit 10000 is set (channel 0 -
135  *     transaction is done).
136  * (9) SUME can do both steps (7) and (8) using the same interrupt.
137  *
138  * Internal registers
139  * Every module in the SUME hardware has its own set of internal registers
140  * (IDs, for debugging and statistic purposes, etc.). Their base addresses are
141  * defined in 'projects/reference_nic/hw/tcl/reference_nic_defines.tcl' and the
142  * offsets to different memory locations of every module are defined in their
143  * corresponding folder inside the library. These registers can be RO/RW and
144  * there is a special method to fetch/change this data over 1 or 2 DMA
145  * transactions. For writing, by calling the sume_module_reg_write(). For
146  * reading, by calling the sume_module_reg_write() and then
147  * sume_module_reg_read(). Check those functions for more information.
148  */
149 
150 MALLOC_DECLARE(M_SUME);
151 MALLOC_DEFINE(M_SUME, "sume", "NetFPGA SUME device driver");
152 
153 static void check_tx_queues(struct sume_adapter *);
154 static void sume_fill_bb_desc(struct sume_adapter *, struct riffa_chnl_dir *,
155     uint64_t);
156 
157 static struct unrhdr *unr;
158 
159 static struct {
160 	uint16_t device;
161 	char *desc;
162 } sume_pciids[] = {
163 	{PCI_DEVICE_ID_SUME, "NetFPGA SUME reference NIC"},
164 };
165 
166 static inline uint32_t
167 read_reg(struct sume_adapter *adapter, int offset)
168 {
169 
170 	return (bus_space_read_4(adapter->bt, adapter->bh, offset << 2));
171 }
172 
173 static inline void
174 write_reg(struct sume_adapter *adapter, int offset, uint32_t val)
175 {
176 
177 	bus_space_write_4(adapter->bt, adapter->bh, offset << 2, val);
178 }
179 
180 static int
181 sume_probe(device_t dev)
182 {
183 	int i;
184 	uint16_t v = pci_get_vendor(dev);
185 	uint16_t d = pci_get_device(dev);
186 
187 	if (v != PCI_VENDOR_ID_XILINX)
188 		return (ENXIO);
189 
190 	for (i = 0; i < nitems(sume_pciids); i++) {
191 		if (d == sume_pciids[i].device) {
192 			device_set_desc(dev, sume_pciids[i].desc);
193 			return (BUS_PROBE_DEFAULT);
194 		}
195 	}
196 
197 	return (ENXIO);
198 }
199 
200 /*
201  * Building mbuf for packet received from SUME. We expect to receive 'len'
202  * bytes of data (including metadata) written from the bouncebuffer address
203  * buf_addr+3*sizeof(uint32_t). Metadata will tell us which SUME interface
204  * received the packet (sport will be 1, 2, 4 or 8), the packet length (plen),
205  * and the magic word needs to be 0xcafe. When we have the packet data, we
206  * create an mbuf and copy the data to it using m_copyback() function, set the
207  * correct interface to rcvif and return the mbuf to be later sent to the OS
208  * with if_input.
209  */
210 static struct mbuf *
211 sume_rx_build_mbuf(struct sume_adapter *adapter, uint32_t len)
212 {
213 	struct nf_priv *nf_priv;
214 	struct mbuf *m;
215 	if_t ifp = NULL;
216 	int np;
217 	uint16_t dport, plen, magic;
218 	device_t dev = adapter->dev;
219 	uint8_t *indata = (uint8_t *)
220 	    adapter->recv[SUME_RIFFA_CHANNEL_DATA]->buf_addr +
221 	    sizeof(struct nf_bb_desc);
222 	struct nf_metadata *mdata = (struct nf_metadata *) indata;
223 
224 	/* The metadata header is 16 bytes. */
225 	if (len < sizeof(struct nf_metadata)) {
226 		device_printf(dev, "short frame (%d)\n", len);
227 		adapter->packets_err++;
228 		adapter->bytes_err += len;
229 		return (NULL);
230 	}
231 
232 	dport = le16toh(mdata->dport);
233 	plen = le16toh(mdata->plen);
234 	magic = le16toh(mdata->magic);
235 
236 	if (sizeof(struct nf_metadata) + plen > len ||
237 	    magic != SUME_RIFFA_MAGIC) {
238 		device_printf(dev, "corrupted packet (%zd + %d > %d || magic "
239 		    "0x%04x != 0x%04x)\n", sizeof(struct nf_metadata), plen,
240 		    len, magic, SUME_RIFFA_MAGIC);
241 		return (NULL);
242 	}
243 
244 	/* We got the packet from one of the even bits */
245 	np = (ffs(dport & SUME_DPORT_MASK) >> 1) - 1;
246 	if (np > SUME_NPORTS) {
247 		device_printf(dev, "invalid destination port 0x%04x (%d)\n",
248 		    dport, np);
249 		adapter->packets_err++;
250 		adapter->bytes_err += plen;
251 		return (NULL);
252 	}
253 	ifp = adapter->ifp[np];
254 	nf_priv = if_getsoftc(ifp);
255 	nf_priv->stats.rx_packets++;
256 	nf_priv->stats.rx_bytes += plen;
257 
258 	/* If the interface is down, well, we are done. */
259 	if (!(if_getflags(ifp) & IFF_UP)) {
260 		nf_priv->stats.ifc_down_packets++;
261 		nf_priv->stats.ifc_down_bytes += plen;
262 		return (NULL);
263 	}
264 
265 	if (adapter->sume_debug)
266 		printf("Building mbuf with length: %d\n", plen);
267 
268 	m = m_getm(NULL, plen, M_NOWAIT, MT_DATA);
269 	if (m == NULL) {
270 		adapter->packets_err++;
271 		adapter->bytes_err += plen;
272 		return (NULL);
273 	}
274 
275 	/* Copy the data in at the right offset. */
276 	m_copyback(m, 0, plen, (void *) (indata + sizeof(struct nf_metadata)));
277 	m->m_pkthdr.rcvif = ifp;
278 
279 	return (m);
280 }
281 
282 /*
283  * SUME interrupt handler for when we get a valid interrupt from the board.
284  * Theoretically, we can receive interrupt for any of the available channels,
285  * but RIFFA DMA uses only 2: 0 and 1, so we use only vect0. The vector is a 32
286  * bit number, using 5 bits for every channel, the least significant bits
287  * correspond to channel 0 and the next 5 bits correspond to channel 1. Vector
288  * bits for RX/TX are:
289  * RX
290  * bit 0 - new transaction from SUME
291  * bit 1 - SUME received our bouncebuffer address
292  * bit 2 - SUME copied the received data to our bouncebuffer, transaction done
293  * TX
294  * bit 3 - SUME received our bouncebuffer address
295  * bit 4 - SUME copied the data from our bouncebuffer, transaction done
296  *
297  * There are two finite state machines (one for TX, one for RX). We loop
298  * through channels 0 and 1 to check and our current state and which interrupt
299  * bit is set.
300  * TX
301  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the first TX transaction.
302  * SUME_RIFFA_CHAN_STATE_READY: we prepared (filled with data) the bouncebuffer
303  * and triggered the SUME for the TX transaction. Waiting for interrupt bit 3
304  * to go to the next state.
305  * SUME_RIFFA_CHAN_STATE_READ: waiting for interrupt bit 4 (for SUME to send
306  * our packet). Then we get the length of the sent data and go back to the
307  * IDLE state.
308  * RX
309  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the interrupt bit 0 (new RX
310  * transaction). When we get it, we prepare our bouncebuffer for reading and
311  * trigger the SUME to start the transaction. Go to the next state.
312  * SUME_RIFFA_CHAN_STATE_READY: waiting for the interrupt bit 1 (SUME got our
313  * bouncebuffer). Go to the next state.
314  * SUME_RIFFA_CHAN_STATE_READ: SUME copied data and our bouncebuffer is ready,
315  * we can build the mbuf and go back to the IDLE state.
316  */
317 static void
318 sume_intr_handler(void *arg)
319 {
320 	struct sume_adapter *adapter = arg;
321 	uint32_t vect, vect0, len;
322 	int ch, loops;
323 	device_t dev = adapter->dev;
324 	struct mbuf *m = NULL;
325 	if_t ifp = NULL;
326 	struct riffa_chnl_dir *send, *recv;
327 
328 	SUME_LOCK(adapter);
329 
330 	vect0 = read_reg(adapter, RIFFA_IRQ_REG0_OFF);
331 	if ((vect0 & SUME_INVALID_VECT) != 0) {
332 		SUME_UNLOCK(adapter);
333 		return;
334 	}
335 
336 	/*
337 	 * We only have one interrupt for all channels and no way
338 	 * to quickly lookup for which channel(s) we got an interrupt?
339 	 */
340 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
341 		vect = vect0 >> (5 * ch);
342 		send = adapter->send[ch];
343 		recv = adapter->recv[ch];
344 
345 		loops = 0;
346 		while ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
347 		    loops <= 5) {
348 			if (adapter->sume_debug)
349 				device_printf(dev, "TX ch %d state %u vect = "
350 				    "0x%08x\n", ch, send->state, vect);
351 			switch (send->state) {
352 			case SUME_RIFFA_CHAN_STATE_IDLE:
353 				break;
354 			case SUME_RIFFA_CHAN_STATE_READY:
355 				if (!(vect & SUME_MSI_TXBUF)) {
356 					device_printf(dev, "ch %d unexpected "
357 					    "interrupt in send+3 state %u: "
358 					    "vect = 0x%08x\n", ch, send->state,
359 					    vect);
360 					send->recovery = 1;
361 					break;
362 				}
363 				send->state = SUME_RIFFA_CHAN_STATE_READ;
364 				vect &= ~SUME_MSI_TXBUF;
365 				break;
366 			case SUME_RIFFA_CHAN_STATE_READ:
367 				if (!(vect & SUME_MSI_TXDONE)) {
368 					device_printf(dev, "ch %d unexpected "
369 					    "interrupt in send+4 state %u: "
370 					    "vect = 0x%08x\n", ch, send->state,
371 					    vect);
372 					send->recovery = 1;
373 					break;
374 				}
375 				send->state = SUME_RIFFA_CHAN_STATE_LEN;
376 
377 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
378 				    RIFFA_RX_TNFR_LEN_REG_OFF));
379 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
380 					send->state =
381 					    SUME_RIFFA_CHAN_STATE_IDLE;
382 					check_tx_queues(adapter);
383 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
384 					wakeup(&send->event);
385 				else {
386 					device_printf(dev, "ch %d unexpected "
387 					    "interrupt in send+4 state %u: "
388 					    "vect = 0x%08x\n", ch, send->state,
389 					    vect);
390 					send->recovery = 1;
391 				}
392 				vect &= ~SUME_MSI_TXDONE;
393 				break;
394 			case SUME_RIFFA_CHAN_STATE_LEN:
395 				break;
396 			default:
397 				device_printf(dev, "unknown TX state!\n");
398 			}
399 			loops++;
400 		}
401 
402 		if ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
403 		    send->recovery)
404 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
405 			    "during TX; not in recovery; state = %d loops = "
406 			    "%d\n", ch, vect, send->state, loops);
407 
408 		loops = 0;
409 		while ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
410 		    SUME_MSI_RXDONE)) && loops < 5) {
411 			if (adapter->sume_debug)
412 				device_printf(dev, "RX ch %d state %u vect = "
413 				    "0x%08x\n", ch, recv->state, vect);
414 			switch (recv->state) {
415 			case SUME_RIFFA_CHAN_STATE_IDLE:
416 				if (!(vect & SUME_MSI_RXQUE)) {
417 					device_printf(dev, "ch %d unexpected "
418 					    "interrupt in recv+0 state %u: "
419 					    "vect = 0x%08x\n", ch, recv->state,
420 					    vect);
421 					recv->recovery = 1;
422 					break;
423 				}
424 				uint32_t max_ptr;
425 
426 				/* Clear recovery state. */
427 				recv->recovery = 0;
428 
429 				/* Get offset and length. */
430 				recv->offlast = read_reg(adapter,
431 				    RIFFA_CHNL_REG(ch,
432 				    RIFFA_TX_OFFLAST_REG_OFF));
433 				recv->len = read_reg(adapter, RIFFA_CHNL_REG(ch,
434 				    RIFFA_TX_LEN_REG_OFF));
435 
436 				/* Boundary checks. */
437 				max_ptr = (uint32_t)((uintptr_t)recv->buf_addr
438 				    + SUME_RIFFA_OFFSET(recv->offlast)
439 				    + SUME_RIFFA_LEN(recv->len) - 1);
440 				if (max_ptr <
441 				    (uint32_t)((uintptr_t)recv->buf_addr))
442 					device_printf(dev, "receive buffer "
443 					    "wrap-around overflow.\n");
444 				if (SUME_RIFFA_OFFSET(recv->offlast) +
445 				    SUME_RIFFA_LEN(recv->len) >
446 				    adapter->sg_buf_size)
447 					device_printf(dev, "receive buffer too"
448 					    " small.\n");
449 
450 				/* Fill the bouncebuf "descriptor". */
451 				sume_fill_bb_desc(adapter, recv,
452 				    SUME_RIFFA_LEN(recv->len));
453 
454 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
455 				    BUS_DMASYNC_PREREAD |
456 				    BUS_DMASYNC_PREWRITE);
457 				write_reg(adapter, RIFFA_CHNL_REG(ch,
458 				    RIFFA_TX_SG_ADDR_LO_REG_OFF),
459 				    SUME_RIFFA_LO_ADDR(recv->buf_hw_addr));
460 				write_reg(adapter, RIFFA_CHNL_REG(ch,
461 				    RIFFA_TX_SG_ADDR_HI_REG_OFF),
462 				    SUME_RIFFA_HI_ADDR(recv->buf_hw_addr));
463 				write_reg(adapter, RIFFA_CHNL_REG(ch,
464 				    RIFFA_TX_SG_LEN_REG_OFF),
465 				    4 * recv->num_sg);
466 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
467 				    BUS_DMASYNC_POSTREAD |
468 				    BUS_DMASYNC_POSTWRITE);
469 
470 				recv->state = SUME_RIFFA_CHAN_STATE_READY;
471 				vect &= ~SUME_MSI_RXQUE;
472 				break;
473 			case SUME_RIFFA_CHAN_STATE_READY:
474 				if (!(vect & SUME_MSI_RXBUF)) {
475 					device_printf(dev, "ch %d unexpected "
476 					    "interrupt in recv+1 state %u: "
477 					    "vect = 0x%08x\n", ch, recv->state,
478 					    vect);
479 					recv->recovery = 1;
480 					break;
481 				}
482 				recv->state = SUME_RIFFA_CHAN_STATE_READ;
483 				vect &= ~SUME_MSI_RXBUF;
484 				break;
485 			case SUME_RIFFA_CHAN_STATE_READ:
486 				if (!(vect & SUME_MSI_RXDONE)) {
487 					device_printf(dev, "ch %d unexpected "
488 					    "interrupt in recv+2 state %u: "
489 					    "vect = 0x%08x\n", ch, recv->state,
490 					    vect);
491 					recv->recovery = 1;
492 					break;
493 				}
494 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
495 				    RIFFA_TX_TNFR_LEN_REG_OFF));
496 
497 				/* Remember, len and recv->len are words. */
498 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
499 					m = sume_rx_build_mbuf(adapter,
500 					    len << 2);
501 					recv->state =
502 					    SUME_RIFFA_CHAN_STATE_IDLE;
503 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
504 					wakeup(&recv->event);
505 				else {
506 					device_printf(dev, "ch %d unexpected "
507 					    "interrupt in recv+2 state %u: "
508 					    "vect = 0x%08x\n", ch, recv->state,
509 					    vect);
510 					recv->recovery = 1;
511 				}
512 				vect &= ~SUME_MSI_RXDONE;
513 				break;
514 			case SUME_RIFFA_CHAN_STATE_LEN:
515 				break;
516 			default:
517 				device_printf(dev, "unknown RX state!\n");
518 			}
519 			loops++;
520 		}
521 
522 		if ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
523 		    SUME_MSI_RXDONE)) && recv->recovery) {
524 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
525 			    "during RX; not in recovery; state = %d, loops = "
526 			    "%d\n", ch, vect, recv->state, loops);
527 
528 			/* Clean the unfinished transaction. */
529 			if (ch == SUME_RIFFA_CHANNEL_REG &&
530 			    vect & SUME_MSI_RXDONE) {
531 				read_reg(adapter, RIFFA_CHNL_REG(ch,
532 				    RIFFA_TX_TNFR_LEN_REG_OFF));
533 				recv->recovery = 0;
534 			}
535 		}
536 	}
537 	SUME_UNLOCK(adapter);
538 
539 	if (m != NULL) {
540 		ifp = m->m_pkthdr.rcvif;
541 		if_input(ifp, m);
542 	}
543 }
544 
545 /*
546  * As we cannot disable interrupt generation, ignore early interrupts by waiting
547  * for the adapter to go into the 'running' state.
548  */
549 static int
550 sume_intr_filter(void *arg)
551 {
552 	struct sume_adapter *adapter = arg;
553 
554 	if (adapter->running == 0)
555 		return (FILTER_STRAY);
556 
557 	return (FILTER_SCHEDULE_THREAD);
558 }
559 
560 static int
561 sume_probe_riffa_pci(struct sume_adapter *adapter)
562 {
563 	device_t dev = adapter->dev;
564 	int error, count, capmem;
565 	uint32_t reg, devctl, linkctl;
566 
567 	pci_enable_busmaster(dev);
568 
569 	adapter->rid = PCIR_BAR(0);
570 	adapter->bar0_addr = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
571 	    &adapter->rid, RF_ACTIVE);
572 	if (adapter->bar0_addr == NULL) {
573 		device_printf(dev, "unable to allocate bus resource: "
574 		    "BAR0 address\n");
575 		return (ENXIO);
576 	}
577 	adapter->bt = rman_get_bustag(adapter->bar0_addr);
578 	adapter->bh = rman_get_bushandle(adapter->bar0_addr);
579 	adapter->bar0_len = rman_get_size(adapter->bar0_addr);
580 	if (adapter->bar0_len != 1024) {
581 		device_printf(dev, "BAR0 resource length %lu != 1024\n",
582 		    adapter->bar0_len);
583 		return (ENXIO);
584 	}
585 
586 	count = pci_msi_count(dev);
587 	error = pci_alloc_msi(dev, &count);
588 	if (error) {
589 		device_printf(dev, "unable to allocate bus resource: PCI "
590 		    "MSI\n");
591 		return (error);
592 	}
593 
594 	adapter->irq.rid = 1; /* Should be 1, thus says pci_alloc_msi() */
595 	adapter->irq.res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
596 	    &adapter->irq.rid, RF_SHAREABLE | RF_ACTIVE);
597 	if (adapter->irq.res == NULL) {
598 		device_printf(dev, "unable to allocate bus resource: IRQ "
599 		    "memory\n");
600 		return (ENXIO);
601 	}
602 
603 	error = bus_setup_intr(dev, adapter->irq.res, INTR_MPSAFE |
604 	    INTR_TYPE_NET, sume_intr_filter, sume_intr_handler, adapter,
605 	    &adapter->irq.tag);
606 	if (error) {
607 		device_printf(dev, "failed to setup interrupt for rid %d, name"
608 		    " %s: %d\n", adapter->irq.rid, "SUME_INTR", error);
609 		return (ENXIO);
610 	}
611 
612 	if (pci_find_cap(dev, PCIY_EXPRESS, &capmem) != 0) {
613 		device_printf(dev, "PCI not PCIe capable\n");
614 		return (ENXIO);
615 	}
616 
617 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL, 2);
618 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL, (devctl |
619 	    PCIEM_CTL_EXT_TAG_FIELD), 2);
620 
621 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL2, 2);
622 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL2, (devctl |
623 	    PCIEM_CTL2_ID_ORDERED_REQ_EN), 2);
624 
625 	linkctl = pci_read_config(dev, capmem + PCIER_LINK_CTL, 2);
626 	pci_write_config(dev, capmem + PCIER_LINK_CTL, (linkctl |
627 	    PCIEM_LINK_CTL_RCB), 2);
628 
629 	reg = read_reg(adapter, RIFFA_INFO_REG_OFF);
630 	adapter->num_sg = RIFFA_SG_ELEMS * ((reg >> 19) & 0xf);
631 	adapter->sg_buf_size = RIFFA_SG_BUF_SIZE * ((reg >> 19) & 0xf);
632 
633 	error = ENODEV;
634 	/* Check bus master is enabled. */
635 	if (((reg >> 4) & 0x1) != 1) {
636 		device_printf(dev, "bus master not enabled: %d\n",
637 		    (reg >> 4) & 0x1);
638 		return (error);
639 	}
640 	/* Check link parameters are valid. */
641 	if (((reg >> 5) & 0x3f) == 0 || ((reg >> 11) & 0x3) == 0) {
642 		device_printf(dev, "link parameters not valid: %d %d\n",
643 		    (reg >> 5) & 0x3f, (reg >> 11) & 0x3);
644 		return (error);
645 	}
646 	/* Check # of channels are within valid range. */
647 	if ((reg & 0xf) == 0 || (reg & 0xf) > RIFFA_MAX_CHNLS) {
648 		device_printf(dev, "number of channels out of range: %d\n",
649 		    reg & 0xf);
650 		return (error);
651 	}
652 	/* Check bus width. */
653 	if (((reg >> 19) & 0xf) == 0 ||
654 	    ((reg >> 19) & 0xf) > RIFFA_MAX_BUS_WIDTH_PARAM) {
655 		device_printf(dev, "bus width out of range: %d\n",
656 		    (reg >> 19) & 0xf);
657 		return (error);
658 	}
659 
660 	device_printf(dev, "[riffa] # of channels: %d\n",
661 	    reg & 0xf);
662 	device_printf(dev, "[riffa] bus interface width: %d\n",
663 	    ((reg >> 19) & 0xf) << 5);
664 	device_printf(dev, "[riffa] bus master enabled: %d\n",
665 	    (reg >> 4) & 0x1);
666 	device_printf(dev, "[riffa] negotiated link width: %d\n",
667 	    (reg >> 5) & 0x3f);
668 	device_printf(dev, "[riffa] negotiated rate width: %d MTs\n",
669 	    ((reg >> 11) & 0x3) * 2500);
670 	device_printf(dev, "[riffa] max downstream payload: %d B\n",
671 	    128 << ((reg >> 13) & 0x7));
672 	device_printf(dev, "[riffa] max upstream payload: %d B\n",
673 	    128 << ((reg >> 16) & 0x7));
674 
675 	return (0);
676 }
677 
/*
 * Intentionally empty interface init callback: without an init routine the
 * ether_ioctl() path panics.
 */
static void
sume_if_init(void *sc)
{
	/* Nothing to initialize. */
	(void)sc;
}
683 
684 /* Write the address and length for our incoming / outgoing transaction. */
685 static void
686 sume_fill_bb_desc(struct sume_adapter *adapter, struct riffa_chnl_dir *p,
687     uint64_t len)
688 {
689 	struct nf_bb_desc *bouncebuf = (struct nf_bb_desc *) p->buf_addr;
690 
691 	bouncebuf->lower = (p->buf_hw_addr + sizeof(struct nf_bb_desc));
692 	bouncebuf->upper = (p->buf_hw_addr + sizeof(struct nf_bb_desc)) >> 32;
693 	bouncebuf->len = len >> 2;
694 }
695 
696 /* Module register locked write. */
697 static int
698 sume_modreg_write_locked(struct sume_adapter *adapter)
699 {
700 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
701 
702 	/* Let the FPGA know about the transfer. */
703 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
704 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
705 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
706 	    RIFFA_RX_LEN_REG_OFF), send->len);	/* words */
707 
708 	/* Fill the bouncebuf "descriptor". */
709 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
710 
711 	/* Update the state before intiating the DMA to avoid races. */
712 	send->state = SUME_RIFFA_CHAN_STATE_READY;
713 
714 	bus_dmamap_sync(send->ch_tag, send->ch_map,
715 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
716 	/* DMA. */
717 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
718 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
719 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
720 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
721 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
722 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
723 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
724 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
725 	bus_dmamap_sync(send->ch_tag, send->ch_map,
726 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
727 
728 	return (0);
729 }
730 
731 /*
732  * Request a register read or write (depending on optype).
733  * If optype is set (0x1f) this will result in a register write,
734  * otherwise this will result in a register read request at the given
735  * address and the result will need to be DMAed back.
736  */
737 static int
738 sume_module_reg_write(struct nf_priv *nf_priv, struct sume_ifreq *sifr,
739     uint32_t optype)
740 {
741 	struct sume_adapter *adapter = nf_priv->adapter;
742 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
743 	struct nf_regop_data *data;
744 	int error;
745 
746 	/*
747 	 * 1. Make sure the channel is free;  otherwise return EBUSY.
748 	 * 2. Prepare the memory in the bounce buffer (which we always
749 	 *    use for regs).
750 	 * 3. Start the DMA process.
751 	 * 4. Sleep and wait for result and return success or error.
752 	 */
753 	SUME_LOCK(adapter);
754 
755 	if (send->state != SUME_RIFFA_CHAN_STATE_IDLE) {
756 		SUME_UNLOCK(adapter);
757 		return (EBUSY);
758 	}
759 
760 	data = (struct nf_regop_data *) (send->buf_addr +
761 	    sizeof(struct nf_bb_desc));
762 	data->addr = htole32(sifr->addr);
763 	data->val = htole32(sifr->val);
764 	/* Tag to indentify request. */
765 	data->rtag = htole32(++send->rtag);
766 	data->optype = htole32(optype);
767 	send->len = sizeof(struct nf_regop_data) / 4; /* words */
768 
769 	error = sume_modreg_write_locked(adapter);
770 	if (error) {
771 		SUME_UNLOCK(adapter);
772 		return (EFAULT);
773 	}
774 
775 	/* Timeout after 1s. */
776 	if (send->state != SUME_RIFFA_CHAN_STATE_LEN)
777 		error = msleep(&send->event, &adapter->lock, 0,
778 		    "Waiting recv finish", 1 * hz);
779 
780 	/* This was a write so we are done; were interrupted, or timed out. */
781 	if (optype != SUME_MR_READ || error != 0 || error == EWOULDBLOCK) {
782 		send->state = SUME_RIFFA_CHAN_STATE_IDLE;
783 		if (optype == SUME_MR_READ)
784 			error = EWOULDBLOCK;
785 		else
786 			error = 0;
787 	} else
788 		error = 0;
789 
790 	/*
791 	 * For read requests we will update state once we are done
792 	 * having read the result to avoid any two outstanding
793 	 * transactions, or we need a queue and validate tags,
794 	 * which is a lot of work for a low priority, infrequent
795 	 * event.
796 	 */
797 
798 	SUME_UNLOCK(adapter);
799 
800 	return (error);
801 }
802 
803 /* Module register read. */
804 static int
805 sume_module_reg_read(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
806 {
807 	struct sume_adapter *adapter = nf_priv->adapter;
808 	struct riffa_chnl_dir *recv = adapter->recv[SUME_RIFFA_CHANNEL_REG];
809 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
810 	struct nf_regop_data *data;
811 	int error = 0;
812 
813 	/*
814 	 * 0. Sleep waiting for result if needed (unless condition is
815 	 *    true already).
816 	 * 1. Read DMA results.
817 	 * 2. Update state on *TX* to IDLE to allow next read to start.
818 	 */
819 	SUME_LOCK(adapter);
820 
821 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
822 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
823 	/*
824 	 * We only need to be woken up at the end of the transaction.
825 	 * Timeout after 1s.
826 	 */
827 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ)
828 		error = msleep(&recv->event, &adapter->lock, 0,
829 		    "Waiting transaction finish", 1 * hz);
830 
831 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ || error == EWOULDBLOCK) {
832 		SUME_UNLOCK(adapter);
833 		device_printf(adapter->dev, "wait error: %d\n", error);
834 		return (EWOULDBLOCK);
835 	}
836 
837 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
838 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
839 
840 	/*
841 	 * Read reply data and validate address and tag.
842 	 * Note: we do access the send side without lock but the state
843 	 * machine does prevent the data from changing.
844 	 */
845 	data = (struct nf_regop_data *) (recv->buf_addr +
846 	    sizeof(struct nf_bb_desc));
847 
848 	if (le32toh(data->rtag) != send->rtag)
849 		device_printf(adapter->dev, "rtag error: 0x%08x 0x%08x\n",
850 		    le32toh(data->rtag), send->rtag);
851 
852 	sifr->val = le32toh(data->val);
853 	recv->state = SUME_RIFFA_CHAN_STATE_IDLE;
854 
855 	/* We are done. */
856 	send->state = SUME_RIFFA_CHAN_STATE_IDLE;
857 
858 	SUME_UNLOCK(adapter);
859 
860 	return (0);
861 }
862 
863 /* Read value from a module register and return it to a sume_ifreq. */
864 static int
865 get_modreg_value(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
866 {
867 	int error;
868 
869 	error = sume_module_reg_write(nf_priv, sifr, SUME_MR_READ);
870 	if (!error)
871 		error = sume_module_reg_read(nf_priv, sifr);
872 
873 	return (error);
874 }
875 
876 static int
877 sume_if_ioctl(if_t ifp, unsigned long cmd, caddr_t data)
878 {
879 	struct ifreq *ifr = (struct ifreq *) data;
880 	struct nf_priv *nf_priv = if_getsoftc(ifp);
881 	struct sume_ifreq sifr;
882 	int error = 0;
883 
884 	switch (cmd) {
885 	case SIOCGIFMEDIA:
886 	case SIOCGIFXMEDIA:
887 		error = ifmedia_ioctl(ifp, ifr, &nf_priv->media, cmd);
888 		break;
889 
890 	case SUME_IOCTL_CMD_WRITE_REG:
891 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
892 		if (error) {
893 			error = EINVAL;
894 			break;
895 		}
896 		error = sume_module_reg_write(nf_priv, &sifr, SUME_MR_WRITE);
897 		break;
898 
899 	case SUME_IOCTL_CMD_READ_REG:
900 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
901 		if (error) {
902 			error = EINVAL;
903 			break;
904 		}
905 
906 		error = get_modreg_value(nf_priv, &sifr);
907 		if (error)
908 			break;
909 
910 		error = copyout(&sifr, ifr_data_get_ptr(ifr), sizeof(sifr));
911 		if (error)
912 			error = EINVAL;
913 
914 		break;
915 
916 	case SIOCSIFFLAGS:
917 		/* Silence tcpdump 'promisc mode not supported' warning. */
918 		if (if_getflags(ifp) & IFF_PROMISC)
919 			break;
920 
921 	default:
922 		error = ether_ioctl(ifp, cmd, data);
923 		break;
924 	}
925 
926 	return (error);
927 }
928 
929 static int
930 sume_media_change(if_t ifp)
931 {
932 	struct nf_priv *nf_priv = if_getsoftc(ifp);
933 	struct ifmedia *ifm = &nf_priv->media;
934 
935 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
936 		return (EINVAL);
937 
938 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_10G_SR)
939 		if_setbaudrate(ifp, ifmedia_baudrate(IFM_ETHER | IFM_10G_SR));
940 	else
941 		if_setbaudrate(ifp, ifmedia_baudrate(ifm->ifm_media));
942 
943 	return (0);
944 }
945 
946 static void
947 sume_update_link_status(if_t ifp)
948 {
949 	struct nf_priv *nf_priv = if_getsoftc(ifp);
950 	struct sume_adapter *adapter = nf_priv->adapter;
951 	struct sume_ifreq sifr;
952 	int link_status;
953 
954 	sifr.addr = SUME_STATUS_ADDR(nf_priv->port);
955 	sifr.val = 0;
956 
957 	if (get_modreg_value(nf_priv, &sifr))
958 		return;
959 
960 	link_status = SUME_LINK_STATUS(sifr.val);
961 
962 	if (!link_status && nf_priv->link_up) {
963 		if_link_state_change(ifp, LINK_STATE_DOWN);
964 		nf_priv->link_up = 0;
965 		if (adapter->sume_debug)
966 			device_printf(adapter->dev, "port %d link state "
967 			    "changed to DOWN\n", nf_priv->unit);
968 	} else if (link_status && !nf_priv->link_up) {
969 		nf_priv->link_up = 1;
970 		if_link_state_change(ifp, LINK_STATE_UP);
971 		if (adapter->sume_debug)
972 			device_printf(adapter->dev, "port %d link state "
973 			    "changed to UP\n", nf_priv->unit);
974 	}
975 }
976 
977 static void
978 sume_media_status(if_t ifp, struct ifmediareq *ifmr)
979 {
980 	struct nf_priv *nf_priv = if_getsoftc(ifp);
981 	struct ifmedia *ifm = &nf_priv->media;
982 
983 	if (ifm->ifm_cur->ifm_media == (IFM_ETHER | IFM_10G_SR) &&
984 	    (if_getflags(ifp) & IFF_UP))
985 		ifmr->ifm_active = IFM_ETHER | IFM_10G_SR;
986 	else
987 		ifmr->ifm_active = ifm->ifm_cur->ifm_media;
988 
989 	ifmr->ifm_status |= IFM_AVALID;
990 
991 	sume_update_link_status(ifp);
992 
993 	if (nf_priv->link_up)
994 		ifmr->ifm_status |= IFM_ACTIVE;
995 }
996 
997 /*
998  * Packet to transmit. We take the packet data from the mbuf and copy it to the
999  * bouncebuffer address buf_addr+3*sizeof(uint32_t)+16. The 16 bytes before the
1000  * packet data are for metadata: sport/dport (depending on our source
1001  * interface), packet length and magic 0xcafe. We tell the SUME about the
1002  * transfer, fill the first 3*sizeof(uint32_t) bytes of the bouncebuffer with
1003  * the information about the start and length of the packet and trigger the
1004  * transaction.
1005  */
1006 static int
1007 sume_if_start_locked(if_t ifp)
1008 {
1009 	struct mbuf *m;
1010 	struct nf_priv *nf_priv = if_getsoftc(ifp);
1011 	struct sume_adapter *adapter = nf_priv->adapter;
1012 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_DATA];
1013 	uint8_t *outbuf;
1014 	struct nf_metadata *mdata;
1015 	int plen = SUME_MIN_PKT_SIZE;
1016 
1017 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1018 	KASSERT(send->state == SUME_RIFFA_CHAN_STATE_IDLE,
1019 	    ("SUME not in IDLE state"));
1020 
1021 	m = if_dequeue(ifp);
1022 	if (m == NULL)
1023 		return (EINVAL);
1024 
1025 	/* Packets large enough do not need to be padded */
1026 	if (m->m_pkthdr.len > SUME_MIN_PKT_SIZE)
1027 		plen = m->m_pkthdr.len;
1028 
1029 	if (adapter->sume_debug)
1030 		device_printf(adapter->dev, "sending %d bytes to %s%d\n", plen,
1031 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1032 
1033 	outbuf = (uint8_t *) send->buf_addr + sizeof(struct nf_bb_desc);
1034 	mdata = (struct nf_metadata *) outbuf;
1035 
1036 	/* Clear the recovery flag. */
1037 	send->recovery = 0;
1038 
1039 	/* Make sure we fit with the 16 bytes nf_metadata. */
1040 	if (m->m_pkthdr.len + sizeof(struct nf_metadata) >
1041 	    adapter->sg_buf_size) {
1042 		device_printf(adapter->dev, "packet too big for bounce buffer "
1043 		    "(%d)\n", m->m_pkthdr.len);
1044 		m_freem(m);
1045 		nf_priv->stats.tx_dropped++;
1046 		return (ENOMEM);
1047 	}
1048 
1049 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1050 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1051 
1052 	/* Zero out the padded data */
1053 	if (m->m_pkthdr.len < SUME_MIN_PKT_SIZE)
1054 		bzero(outbuf + sizeof(struct nf_metadata), SUME_MIN_PKT_SIZE);
1055 	/* Skip the first 16 bytes for the metadata. */
1056 	m_copydata(m, 0, m->m_pkthdr.len, outbuf + sizeof(struct nf_metadata));
1057 	send->len = (sizeof(struct nf_metadata) + plen + 3) / 4;
1058 
1059 	/* Fill in the metadata: CPU(DMA) ports are odd, MAC ports are even. */
1060 	mdata->sport = htole16(1 << (nf_priv->port * 2 + 1));
1061 	mdata->dport = htole16(1 << (nf_priv->port * 2));
1062 	mdata->plen = htole16(plen);
1063 	mdata->magic = htole16(SUME_RIFFA_MAGIC);
1064 	mdata->t1 = htole32(0);
1065 	mdata->t2 = htole32(0);
1066 
1067 	/* Let the FPGA know about the transfer. */
1068 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1069 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
1070 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1071 	    RIFFA_RX_LEN_REG_OFF), send->len);
1072 
1073 	/* Fill the bouncebuf "descriptor". */
1074 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
1075 
1076 	/* Update the state before intiating the DMA to avoid races. */
1077 	send->state = SUME_RIFFA_CHAN_STATE_READY;
1078 
1079 	/* DMA. */
1080 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1081 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
1082 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
1083 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1084 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
1085 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
1086 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1087 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
1088 
1089 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1090 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1091 
1092 	nf_priv->stats.tx_packets++;
1093 	nf_priv->stats.tx_bytes += plen;
1094 
1095 	/* We can free as long as we use the bounce buffer. */
1096 	m_freem(m);
1097 
1098 	adapter->last_ifc = nf_priv->port;
1099 
1100 	/* Reset watchdog counter. */
1101 	adapter->wd_counter = 0;
1102 
1103 	return (0);
1104 }
1105 
1106 static void
1107 sume_if_start(if_t ifp)
1108 {
1109 	struct nf_priv *nf_priv = if_getsoftc(ifp);
1110 	struct sume_adapter *adapter = nf_priv->adapter;
1111 
1112 	if (!adapter->running || !(if_getflags(ifp) & IFF_UP))
1113 		return;
1114 
1115 	SUME_LOCK(adapter);
1116 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state ==
1117 	    SUME_RIFFA_CHAN_STATE_IDLE)
1118 		sume_if_start_locked(ifp);
1119 	SUME_UNLOCK(adapter);
1120 }
1121 
1122 /*
1123  * We call this function at the end of every TX transaction to check for
1124  * remaining packets in the TX queues for every UP interface.
1125  */
1126 static void
1127 check_tx_queues(struct sume_adapter *adapter)
1128 {
1129 	int i, last_ifc;
1130 
1131 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1132 
1133 	last_ifc = adapter->last_ifc;
1134 
1135 	/* Check all interfaces */
1136 	for (i = last_ifc + 1; i < last_ifc + SUME_NPORTS + 1; i++) {
1137 		if_t ifp = adapter->ifp[i % SUME_NPORTS];
1138 
1139 		if (!(if_getflags(ifp) & IFF_UP))
1140 			continue;
1141 
1142 		if (!sume_if_start_locked(ifp))
1143 			break;
1144 	}
1145 }
1146 
1147 static int
1148 sume_ifp_alloc(struct sume_adapter *adapter, uint32_t port)
1149 {
1150 	if_t ifp;
1151 	struct nf_priv *nf_priv = malloc(sizeof(struct nf_priv), M_SUME,
1152 	    M_ZERO | M_WAITOK);
1153 
1154 	ifp = if_alloc(IFT_ETHER);
1155 	if (ifp == NULL) {
1156 		device_printf(adapter->dev, "cannot allocate ifnet\n");
1157 		return (ENOMEM);
1158 	}
1159 
1160 	adapter->ifp[port] = ifp;
1161 	if_setsoftc(ifp, nf_priv);
1162 
1163 	nf_priv->adapter = adapter;
1164 	nf_priv->unit = alloc_unr(unr);
1165 	nf_priv->port = port;
1166 	nf_priv->link_up = 0;
1167 
1168 	if_initname(ifp, SUME_ETH_DEVICE_NAME, nf_priv->unit);
1169 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
1170 
1171 	if_setinitfn(ifp, sume_if_init);
1172 	if_setstartfn(ifp, sume_if_start);
1173 	if_setioctlfn(ifp, sume_if_ioctl);
1174 
1175 	uint8_t hw_addr[ETHER_ADDR_LEN] = DEFAULT_ETHER_ADDRESS;
1176 	hw_addr[ETHER_ADDR_LEN-1] = nf_priv->unit;
1177 	ether_ifattach(ifp, hw_addr);
1178 
1179 	ifmedia_init(&nf_priv->media, IFM_IMASK, sume_media_change,
1180 	    sume_media_status);
1181 	ifmedia_add(&nf_priv->media, IFM_ETHER | IFM_10G_SR, 0, NULL);
1182 	ifmedia_set(&nf_priv->media, IFM_ETHER | IFM_10G_SR);
1183 
1184 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
1185 
1186 	return (0);
1187 }
1188 
1189 static void
1190 callback_dma(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1191 {
1192 	if (err)
1193 		return;
1194 
1195 	KASSERT(nseg == 1, ("%d segments returned!", nseg));
1196 
1197 	*(bus_addr_t *) arg = segs[0].ds_addr;
1198 }
1199 
1200 static int
1201 sume_probe_riffa_buffer(const struct sume_adapter *adapter,
1202     struct riffa_chnl_dir ***p, const char *dir)
1203 {
1204 	struct riffa_chnl_dir **rp;
1205 	bus_addr_t hw_addr;
1206 	int error, ch;
1207 	device_t dev = adapter->dev;
1208 
1209 	error = ENOMEM;
1210 	*p = malloc(SUME_RIFFA_CHANNELS * sizeof(struct riffa_chnl_dir *),
1211 	    M_SUME, M_ZERO | M_WAITOK);
1212 	if (*p == NULL) {
1213 		device_printf(dev, "malloc(%s) failed.\n", dir);
1214 		return (error);
1215 	}
1216 
1217 	rp = *p;
1218 	/* Allocate the chnl_dir structs themselves. */
1219 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1220 		/* One direction. */
1221 		rp[ch] = malloc(sizeof(struct riffa_chnl_dir), M_SUME,
1222 		    M_ZERO | M_WAITOK);
1223 		if (rp[ch] == NULL) {
1224 			device_printf(dev, "malloc(%s[%d]) riffa_chnl_dir "
1225 			    "failed.\n", dir, ch);
1226 			return (error);
1227 		}
1228 
1229 		int err = bus_dma_tag_create(bus_get_dma_tag(dev),
1230 		    4, 0,
1231 		    BUS_SPACE_MAXADDR,
1232 		    BUS_SPACE_MAXADDR,
1233 		    NULL, NULL,
1234 		    adapter->sg_buf_size,
1235 		    1,
1236 		    adapter->sg_buf_size,
1237 		    0,
1238 		    NULL,
1239 		    NULL,
1240 		    &rp[ch]->ch_tag);
1241 
1242 		if (err) {
1243 			device_printf(dev, "bus_dma_tag_create(%s[%d]) "
1244 			    "failed.\n", dir, ch);
1245 			return (err);
1246 		}
1247 
1248 		err = bus_dmamem_alloc(rp[ch]->ch_tag, (void **)
1249 		    &rp[ch]->buf_addr, BUS_DMA_WAITOK | BUS_DMA_COHERENT |
1250 		    BUS_DMA_ZERO, &rp[ch]->ch_map);
1251 		if (err) {
1252 			device_printf(dev, "bus_dmamem_alloc(%s[%d]) failed.\n",
1253 			    dir, ch);
1254 			return (err);
1255 		}
1256 
1257 		bzero(rp[ch]->buf_addr, adapter->sg_buf_size);
1258 
1259 		err = bus_dmamap_load(rp[ch]->ch_tag, rp[ch]->ch_map,
1260 		    rp[ch]->buf_addr, adapter->sg_buf_size, callback_dma,
1261 		    &hw_addr, BUS_DMA_NOWAIT);
1262 		if (err) {
1263 			device_printf(dev, "bus_dmamap_load(%s[%d]) failed.\n",
1264 			    dir, ch);
1265 			return (err);
1266 		}
1267 		rp[ch]->buf_hw_addr = hw_addr;
1268 		rp[ch]->num_sg = 1;
1269 		rp[ch]->state = SUME_RIFFA_CHAN_STATE_IDLE;
1270 
1271 		rp[ch]->rtag = SUME_INIT_RTAG;
1272 	}
1273 
1274 	return (0);
1275 }
1276 
1277 static int
1278 sume_probe_riffa_buffers(struct sume_adapter *adapter)
1279 {
1280 	int error;
1281 
1282 	error = sume_probe_riffa_buffer(adapter, &adapter->recv, "recv");
1283 	if (error)
1284 		return (error);
1285 
1286 	error = sume_probe_riffa_buffer(adapter, &adapter->send, "send");
1287 
1288 	return (error);
1289 }
1290 
1291 static void
1292 sume_sysctl_init(struct sume_adapter *adapter)
1293 {
1294 	device_t dev = adapter->dev;
1295 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
1296 	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
1297 	struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree);
1298 	struct sysctl_oid *tmp_tree;
1299 	char namebuf[MAX_IFC_NAME_LEN];
1300 	int i;
1301 
1302 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "sume", CTLFLAG_RW,
1303 	    0, "SUME top-level tree");
1304 	if (tree == NULL) {
1305 		device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1306 		return;
1307 	}
1308 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug", CTLFLAG_RW,
1309 	    &adapter->sume_debug, 0, "debug int leaf");
1310 
1311 	/* total RX error stats */
1312 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_epkts",
1313 	    CTLFLAG_RD, &adapter->packets_err, 0, "rx errors");
1314 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_ebytes",
1315 	    CTLFLAG_RD, &adapter->bytes_err, 0, "rx error bytes");
1316 
1317 	for (i = SUME_NPORTS - 1; i >= 0; i--) {
1318 		if_t ifp = adapter->ifp[i];
1319 		if (ifp == NULL)
1320 			continue;
1321 
1322 		struct nf_priv *nf_priv = if_getsoftc(ifp);
1323 
1324 		snprintf(namebuf, MAX_IFC_NAME_LEN, "%s%d",
1325 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1326 		tmp_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
1327 		    CTLFLAG_RW, 0, "SUME ifc tree");
1328 		if (tmp_tree == NULL) {
1329 			device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1330 			return;
1331 		}
1332 
1333 		/* Packets dropped by down interface. */
1334 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1335 		    "ifc_down_bytes", CTLFLAG_RD,
1336 		    &nf_priv->stats.ifc_down_bytes, 0, "ifc_down bytes");
1337 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1338 		    "ifc_down_packets", CTLFLAG_RD,
1339 		    &nf_priv->stats.ifc_down_packets, 0, "ifc_down packets");
1340 
1341 		/* HW RX stats */
1342 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1343 		    "hw_rx_packets", CTLFLAG_RD, &nf_priv->stats.hw_rx_packets,
1344 		    0, "hw_rx packets");
1345 
1346 		/* HW TX stats */
1347 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1348 		    "hw_tx_packets", CTLFLAG_RD, &nf_priv->stats.hw_tx_packets,
1349 		    0, "hw_tx packets");
1350 
1351 		/* RX stats */
1352 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1353 		    "rx_bytes", CTLFLAG_RD, &nf_priv->stats.rx_bytes, 0,
1354 		    "rx bytes");
1355 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1356 		    "rx_dropped", CTLFLAG_RD, &nf_priv->stats.rx_dropped, 0,
1357 		    "rx dropped");
1358 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1359 		    "rx_packets", CTLFLAG_RD, &nf_priv->stats.rx_packets, 0,
1360 		    "rx packets");
1361 
1362 		/* TX stats */
1363 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1364 		    "tx_bytes", CTLFLAG_RD, &nf_priv->stats.tx_bytes, 0,
1365 		    "tx bytes");
1366 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1367 		    "tx_dropped", CTLFLAG_RD, &nf_priv->stats.tx_dropped, 0,
1368 		    "tx dropped");
1369 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1370 		    "tx_packets", CTLFLAG_RD, &nf_priv->stats.tx_packets, 0,
1371 		    "tx packets");
1372 	}
1373 }
1374 
1375 static void
1376 sume_local_timer(void *arg)
1377 {
1378 	struct sume_adapter *adapter = arg;
1379 
1380 	if (!adapter->running)
1381 		return;
1382 
1383 	taskqueue_enqueue(adapter->tq, &adapter->stat_task);
1384 
1385 	SUME_LOCK(adapter);
1386 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state !=
1387 	    SUME_RIFFA_CHAN_STATE_IDLE && ++adapter->wd_counter >= 3) {
1388 		/* Resetting interfaces if stuck for 3 seconds. */
1389 		device_printf(adapter->dev, "TX stuck, resetting adapter.\n");
1390 		read_reg(adapter, RIFFA_INFO_REG_OFF);
1391 
1392 		adapter->send[SUME_RIFFA_CHANNEL_DATA]->state =
1393 		    SUME_RIFFA_CHAN_STATE_IDLE;
1394 		adapter->wd_counter = 0;
1395 
1396 		check_tx_queues(adapter);
1397 	}
1398 	SUME_UNLOCK(adapter);
1399 
1400 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1401 }
1402 
1403 static void
1404 sume_get_stats(void *context, int pending)
1405 {
1406 	struct sume_adapter *adapter = context;
1407 	int i;
1408 
1409 	for (i = 0; i < SUME_NPORTS; i++) {
1410 		if_t ifp = adapter->ifp[i];
1411 
1412 		if (if_getflags(ifp) & IFF_UP) {
1413 			struct nf_priv *nf_priv = if_getsoftc(ifp);
1414 			struct sume_ifreq sifr;
1415 
1416 			sume_update_link_status(ifp);
1417 
1418 			/* Get RX counter. */
1419 			sifr.addr = SUME_STAT_RX_ADDR(nf_priv->port);
1420 			sifr.val = 0;
1421 
1422 			if (!get_modreg_value(nf_priv, &sifr))
1423 				nf_priv->stats.hw_rx_packets += sifr.val;
1424 
1425 			/* Get TX counter. */
1426 			sifr.addr = SUME_STAT_TX_ADDR(nf_priv->port);
1427 			sifr.val = 0;
1428 
1429 			if (!get_modreg_value(nf_priv, &sifr))
1430 				nf_priv->stats.hw_tx_packets += sifr.val;
1431 		}
1432 	}
1433 }
1434 
1435 static int
1436 sume_attach(device_t dev)
1437 {
1438 	struct sume_adapter *adapter = device_get_softc(dev);
1439 	adapter->dev = dev;
1440 	int error, i;
1441 
1442 	mtx_init(&adapter->lock, "Global lock", NULL, MTX_DEF);
1443 
1444 	adapter->running = 0;
1445 
1446 	/* OK finish up RIFFA. */
1447 	error = sume_probe_riffa_pci(adapter);
1448 	if (error != 0)
1449 		goto error;
1450 
1451 	error = sume_probe_riffa_buffers(adapter);
1452 	if (error != 0)
1453 		goto error;
1454 
1455 	/* Now do the network interfaces. */
1456 	for (i = 0; i < SUME_NPORTS; i++) {
1457 		error = sume_ifp_alloc(adapter, i);
1458 		if (error != 0)
1459 			goto error;
1460 	}
1461 
1462 	/*  Register stats and register sysctls. */
1463 	sume_sysctl_init(adapter);
1464 
1465 	/* Reset the HW. */
1466 	read_reg(adapter, RIFFA_INFO_REG_OFF);
1467 
1468 	/* Ready to go, "enable" IRQ. */
1469 	adapter->running = 1;
1470 
1471 	callout_init(&adapter->timer, 1);
1472 	TASK_INIT(&adapter->stat_task, 0, sume_get_stats, adapter);
1473 
1474 	adapter->tq = taskqueue_create("sume_stats", M_NOWAIT,
1475 	    taskqueue_thread_enqueue, &adapter->tq);
1476 	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s stattaskq",
1477 	    device_get_nameunit(adapter->dev));
1478 
1479 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1480 
1481 	return (0);
1482 
1483 error:
1484 	sume_detach(dev);
1485 
1486 	return (error);
1487 }
1488 
1489 static void
1490 sume_remove_riffa_buffer(const struct sume_adapter *adapter,
1491     struct riffa_chnl_dir **pp)
1492 {
1493 	int ch;
1494 
1495 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1496 		if (pp[ch] == NULL)
1497 			continue;
1498 
1499 		if (pp[ch]->buf_hw_addr != 0) {
1500 			bus_dmamem_free(pp[ch]->ch_tag, pp[ch]->buf_addr,
1501 			    pp[ch]->ch_map);
1502 			pp[ch]->buf_hw_addr = 0;
1503 		}
1504 
1505 		free(pp[ch], M_SUME);
1506 	}
1507 }
1508 
1509 static void
1510 sume_remove_riffa_buffers(struct sume_adapter *adapter)
1511 {
1512 	if (adapter->send != NULL) {
1513 		sume_remove_riffa_buffer(adapter, adapter->send);
1514 		free(adapter->send, M_SUME);
1515 		adapter->send = NULL;
1516 	}
1517 	if (adapter->recv != NULL) {
1518 		sume_remove_riffa_buffer(adapter, adapter->recv);
1519 		free(adapter->recv, M_SUME);
1520 		adapter->recv = NULL;
1521 	}
1522 }
1523 
1524 static int
1525 sume_detach(device_t dev)
1526 {
1527 	struct sume_adapter *adapter = device_get_softc(dev);
1528 	int i;
1529 	struct nf_priv *nf_priv;
1530 
1531 	KASSERT(mtx_initialized(&adapter->lock), ("SUME mutex not "
1532 	    "initialized"));
1533 	adapter->running = 0;
1534 
1535 	/* Drain the stats callout and task queue. */
1536 	callout_drain(&adapter->timer);
1537 
1538 	if (adapter->tq) {
1539 		taskqueue_drain(adapter->tq, &adapter->stat_task);
1540 		taskqueue_free(adapter->tq);
1541 	}
1542 
1543 	for (i = 0; i < SUME_NPORTS; i++) {
1544 		if_t ifp = adapter->ifp[i];
1545 		if (ifp == NULL)
1546 			continue;
1547 
1548 		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
1549 		nf_priv = if_getsoftc(ifp);
1550 
1551 		if (if_getflags(ifp) & IFF_UP)
1552 			if_down(ifp);
1553 		ifmedia_removeall(&nf_priv->media);
1554 		free_unr(unr, nf_priv->unit);
1555 
1556 		if_setflagbits(ifp, 0, IFF_UP);
1557 		ether_ifdetach(ifp);
1558 		if_free(ifp);
1559 
1560 		free(nf_priv, M_SUME);
1561 	}
1562 
1563 	sume_remove_riffa_buffers(adapter);
1564 
1565 	if (adapter->irq.tag)
1566 		bus_teardown_intr(dev, adapter->irq.res, adapter->irq.tag);
1567 	if (adapter->irq.res)
1568 		bus_release_resource(dev, SYS_RES_IRQ, adapter->irq.rid,
1569 		    adapter->irq.res);
1570 
1571 	pci_release_msi(dev);
1572 
1573 	if (adapter->bar0_addr)
1574 		bus_release_resource(dev, SYS_RES_MEMORY, adapter->rid,
1575 		    adapter->bar0_addr);
1576 
1577 	mtx_destroy(&adapter->lock);
1578 
1579 	return (0);
1580 }
1581 
1582 static int
1583 mod_event(module_t mod, int cmd, void *arg)
1584 {
1585 	switch (cmd) {
1586 	case MOD_LOAD:
1587 		unr = new_unrhdr(0, INT_MAX, NULL);
1588 		break;
1589 
1590 	case MOD_UNLOAD:
1591 		delete_unrhdr(unr);
1592 		break;
1593 	}
1594 
1595 	return (0);
1596 }
1597 
1598 DRIVER_MODULE(sume, pci, sume_driver, mod_event, NULL);
1599 MODULE_VERSION(sume, 1);
1600