xref: /freebsd/sys/dev/sume/if_sume.c (revision cab6a39d7b343596a5823e65c0f7b426551ec22d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015 Bjoern A. Zeeb
5  * Copyright (c) 2020 Denis Salopek
6  *
7  * This software was developed by SRI International and the University of
8  * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-11-C-0249
9  * ("MRC2"), as part of the DARPA MRC research programme.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/bus.h>
38 #include <sys/endian.h>
39 #include <sys/kernel.h>
40 #include <sys/limits.h>
41 #include <sys/module.h>
42 #include <sys/rman.h>
43 #include <sys/socket.h>
44 #include <sys/sockio.h>
45 #include <sys/sysctl.h>
46 #include <sys/taskqueue.h>
47 
48 #include <net/if.h>
49 #include <net/if_media.h>
50 #include <net/if_types.h>
51 #include <net/if_var.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/if_ether.h>
55 
56 #include <dev/pci/pcivar.h>
57 #include <dev/pci/pcireg.h>
58 
59 #include <machine/bus.h>
60 
61 #include "adapter.h"
62 
/*
 * PCI IDs used for probing: sume_probe() compares the device's vendor ID
 * against PCI_VENDOR_ID_XILINX, and PCI_DEVICE_ID_SUME is the device ID listed
 * in the sume_pciids table.
 */
63 #define	PCI_VENDOR_ID_XILINX	0x10ee
64 #define	PCI_DEVICE_ID_SUME	0x7028
65 
66 /* SUME bus driver interface */
67 static int sume_probe(device_t);
68 static int sume_attach(device_t);
69 static int sume_detach(device_t);
70 
/*
 * Device method table: binds the generic device_probe/device_attach/
 * device_detach entry points to this driver's sume_probe(), sume_attach() and
 * sume_detach() handlers.
 */
71 static device_method_t sume_methods[] = {
72 	DEVMETHOD(device_probe,		sume_probe),
73 	DEVMETHOD(device_attach,	sume_attach),
74 	DEVMETHOD(device_detach,	sume_detach),
75 	DEVMETHOD_END
76 };
77 
/*
 * Driver description: the driver name ("sume"), its method table, and the
 * size of struct sume_adapter (presumably used as the per-device softc size;
 * confirm against the driver_t definition).
 */
78 static driver_t sume_driver = {
79 	"sume",
80 	sume_methods,
81 	sizeof(struct sume_adapter)
82 };
83 
84 /*
85  * The DMA engine for SUME generates interrupts for each RX/TX transaction.
86  * Depending on the channel (0 if packet transaction, 1 if register transaction)
87  * the used bits of the interrupt vector will be the lowest or the second lowest
88  * 5 bits.
89  *
90  * When receiving packets from SUME (RX):
91  * (1) SUME received a packet on one of the interfaces.
92  * (2) SUME generates an interrupt vector, bit 00001 is set (channel 0 - new RX
93  *     transaction).
94  * (3) We read the length of the incoming packet and the offset along with the
95  *     'last' flag from the SUME registers.
96  * (4) We prepare for the DMA transaction by setting the bouncebuffer on the
97  *     address buf_addr. For now, this is how it's done:
98  *     - First 3*sizeof(uint32_t) bytes are: lower and upper 32 bits of physical
99  *     address where we want the data to arrive (buf_addr[0] and buf_addr[1]),
100  *     and length of incoming data (buf_addr[2]).
101  *     - Data will start right after, at buf_addr+3*sizeof(uint32_t). The
102  *     physical address buf_hw_addr is a block of contiguous memory mapped to
103  *     buf_addr, so we can set the incoming data's physical address (buf_addr[0]
104  *     and buf_addr[1]) to buf_hw_addr+3*sizeof(uint32_t).
105  * (5) We notify SUME that the bouncebuffer is ready for the transaction by
106  *     writing the lower/upper physical address buf_hw_addr to the SUME
107  *     registers RIFFA_TX_SG_ADDR_LO_REG_OFF and RIFFA_TX_SG_ADDR_HI_REG_OFF as
108  *     well as the number of segments to the register RIFFA_TX_SG_LEN_REG_OFF.
109  * (6) SUME generates an interrupt vector, bit 00010 is set (channel 0 -
110  *     bouncebuffer received).
111  * (7) SUME generates an interrupt vector, bit 00100 is set (channel 0 -
112  *     transaction is done).
113  * (8) SUME can do both steps (6) and (7) using the same interrupt.
114  * (9) We read the first 16 bytes (metadata) of the received data and note the
115  *     incoming interface so we can later forward it to the right one in the OS
116  *     (sume0, sume1, sume2 or sume3).
117  * (10) We create an mbuf and copy the data from the bouncebuffer to the mbuf
118  *     and set the mbuf rcvif to the incoming interface.
119  * (11) We forward the mbuf to the appropriate interface via ifp->if_input.
120  *
121  * When sending packets to SUME (TX):
122  * (1) The OS calls sume_if_start() function on TX.
123  * (2) We get the mbuf packet data and copy it to the
124  *     buf_addr+3*sizeof(uint32_t) + metadata 16 bytes.
125  * (3) We create the metadata based on the output interface and copy it to the
126  *     buf_addr+3*sizeof(uint32_t).
127  * (4) We write the offset/last and length of the packet to the SUME registers
128  *     RIFFA_RX_OFFLAST_REG_OFF and RIFFA_RX_LEN_REG_OFF.
129  * (5) We fill the bouncebuffer by filling the first 3*sizeof(uint32_t) bytes
130  *     with the physical address and length just as in RX step (4).
131  * (6) We notify SUME that the bouncebuffer is ready by writing to SUME
132  *     registers RIFFA_RX_SG_ADDR_LO_REG_OFF, RIFFA_RX_SG_ADDR_HI_REG_OFF and
133  *     RIFFA_RX_SG_LEN_REG_OFF just as in RX step (5).
134  * (7) SUME generates an interrupt vector, bit 01000 is set (channel 0 -
135  *     bouncebuffer is read).
136  * (8) SUME generates an interrupt vector, bit 10000 is set (channel 0 -
137  *     transaction is done).
138  * (9) SUME can do both steps (7) and (8) using the same interrupt.
139  *
140  * Internal registers
141  * Every module in the SUME hardware has its own set of internal registers
142  * (IDs, for debugging and statistic purposes, etc.). Their base addresses are
143  * defined in 'projects/reference_nic/hw/tcl/reference_nic_defines.tcl' and the
144  * offsets to different memory locations of every module are defined in their
145  * corresponding folder inside the library. These registers can be RO/RW and
146  * there is a special method to fetch/change this data over 1 or 2 DMA
147  * transactions. For writing, by calling the sume_module_reg_write(). For
148  * reading, by calling the sume_module_reg_write() and then
149  * sume_module_reg_read(). Check those functions for more information.
150  */
151 
/* Malloc type under which this driver's allocations are accounted. */
152 MALLOC_DECLARE(M_SUME);
153 MALLOC_DEFINE(M_SUME, "sume", "NetFPGA SUME device driver");
154 
/*
 * Forward declarations: both helpers are called from sume_intr_handler()
 * before their definitions appear in the file.
 */
155 static void check_tx_queues(struct sume_adapter *);
156 static void sume_fill_bb_desc(struct sume_adapter *, struct riffa_chnl_dir *,
157     uint64_t);
158 
/*
 * NOTE(review): presumably the unit-number allocator for the created
 * interfaces; its users are outside this part of the file -- confirm.
 */
159 static struct unrhdr *unr;
160 
161 static struct {
162 	uint16_t device;
163 	char *desc;
164 } sume_pciids[] = {
165 	{PCI_DEVICE_ID_SUME, "NetFPGA SUME reference NIC"},
166 };
167 
168 static inline uint32_t
169 read_reg(struct sume_adapter *adapter, int offset)
170 {
171 
172 	return (bus_space_read_4(adapter->bt, adapter->bh, offset << 2));
173 }
174 
175 static inline void
176 write_reg(struct sume_adapter *adapter, int offset, uint32_t val)
177 {
178 
179 	bus_space_write_4(adapter->bt, adapter->bh, offset << 2, val);
180 }
181 
182 static int
183 sume_probe(device_t dev)
184 {
185 	int i;
186 	uint16_t v = pci_get_vendor(dev);
187 	uint16_t d = pci_get_device(dev);
188 
189 	if (v != PCI_VENDOR_ID_XILINX)
190 		return (ENXIO);
191 
192 	for (i = 0; i < nitems(sume_pciids); i++) {
193 		if (d == sume_pciids[i].device) {
194 			device_set_desc(dev, sume_pciids[i].desc);
195 			return (BUS_PROBE_DEFAULT);
196 		}
197 	}
198 
199 	return (ENXIO);
200 }
201 
202 /*
203  * Building mbuf for packet received from SUME. We expect to receive 'len'
204  * bytes of data (including metadata) written from the bouncebuffer address
205  * buf_addr+3*sizeof(uint32_t). Metadata will tell us which SUME interface
206  * received the packet (sport will be 1, 2, 4 or 8), the packet length (plen),
207  * and the magic word needs to be 0xcafe. When we have the packet data, we
208  * create an mbuf and copy the data to it using m_copyback() function, set the
209  * correct interface to rcvif and return the mbuf to be later sent to the OS
210  * with if_input.
211  */
212 static struct mbuf *
213 sume_rx_build_mbuf(struct sume_adapter *adapter, uint32_t len)
214 {
215 	struct nf_priv *nf_priv;
216 	struct mbuf *m;
217 	struct ifnet *ifp = NULL;
218 	int np;
219 	uint16_t dport, plen, magic;
220 	device_t dev = adapter->dev;
221 	uint8_t *indata = (uint8_t *)
222 	    adapter->recv[SUME_RIFFA_CHANNEL_DATA]->buf_addr +
223 	    sizeof(struct nf_bb_desc);
224 	struct nf_metadata *mdata = (struct nf_metadata *) indata;
225 
226 	/* The metadata header is 16 bytes. */
227 	if (len < sizeof(struct nf_metadata)) {
228 		device_printf(dev, "short frame (%d)\n", len);
229 		adapter->packets_err++;
230 		adapter->bytes_err += len;
231 		return (NULL);
232 	}
233 
234 	dport = le16toh(mdata->dport);
235 	plen = le16toh(mdata->plen);
236 	magic = le16toh(mdata->magic);
237 
238 	if (sizeof(struct nf_metadata) + plen > len ||
239 	    magic != SUME_RIFFA_MAGIC) {
240 		device_printf(dev, "corrupted packet (%zd + %d > %d || magic "
241 		    "0x%04x != 0x%04x)\n", sizeof(struct nf_metadata), plen,
242 		    len, magic, SUME_RIFFA_MAGIC);
243 		return (NULL);
244 	}
245 
246 	/* We got the packet from one of the even bits */
247 	np = (ffs(dport & SUME_DPORT_MASK) >> 1) - 1;
248 	if (np > SUME_NPORTS) {
249 		device_printf(dev, "invalid destination port 0x%04x (%d)\n",
250 		    dport, np);
251 		adapter->packets_err++;
252 		adapter->bytes_err += plen;
253 		return (NULL);
254 	}
255 	ifp = adapter->ifp[np];
256 	nf_priv = ifp->if_softc;
257 	nf_priv->stats.rx_packets++;
258 	nf_priv->stats.rx_bytes += plen;
259 
260 	/* If the interface is down, well, we are done. */
261 	if (!(ifp->if_flags & IFF_UP)) {
262 		nf_priv->stats.ifc_down_packets++;
263 		nf_priv->stats.ifc_down_bytes += plen;
264 		return (NULL);
265 	}
266 
267 	if (adapter->sume_debug)
268 		printf("Building mbuf with length: %d\n", plen);
269 
270 	m = m_getm(NULL, plen, M_NOWAIT, MT_DATA);
271 	if (m == NULL) {
272 		adapter->packets_err++;
273 		adapter->bytes_err += plen;
274 		return (NULL);
275 	}
276 
277 	/* Copy the data in at the right offset. */
278 	m_copyback(m, 0, plen, (void *) (indata + sizeof(struct nf_metadata)));
279 	m->m_pkthdr.rcvif = ifp;
280 
281 	return (m);
282 }
283 
284 /*
285  * SUME interrupt handler for when we get a valid interrupt from the board.
286  * Theoretically, we can receive interrupt for any of the available channels,
287  * but RIFFA DMA uses only 2: 0 and 1, so we use only vect0. The vector is a 32
288  * bit number, using 5 bits for every channel, the least significant bits
289  * correspond to channel 0 and the next 5 bits correspond to channel 1. Vector
290  * bits for RX/TX are:
291  * RX
292  * bit 0 - new transaction from SUME
293  * bit 1 - SUME received our bouncebuffer address
294  * bit 2 - SUME copied the received data to our bouncebuffer, transaction done
295  * TX
296  * bit 3 - SUME received our bouncebuffer address
297  * bit 4 - SUME copied the data from our bouncebuffer, transaction done
298  *
299  * There are two finite state machines (one for TX, one for RX). We loop
300  * through channels 0 and 1 to check our current state and which interrupt
301  * bit is set.
302  * TX
303  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the first TX transaction.
304  * SUME_RIFFA_CHAN_STATE_READY: we prepared (filled with data) the bouncebuffer
305  * and triggered the SUME for the TX transaction. Waiting for interrupt bit 3
306  * to go to the next state.
307  * SUME_RIFFA_CHAN_STATE_READ: waiting for interrupt bit 4 (for SUME to send
308  * our packet). Then we get the length of the sent data and go back to the
309  * IDLE state.
310  * RX
311  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the interrupt bit 0 (new RX
312  * transaction). When we get it, we prepare our bouncebuffer for reading and
313  * trigger the SUME to start the transaction. Go to the next state.
314  * SUME_RIFFA_CHAN_STATE_READY: waiting for the interrupt bit 1 (SUME got our
315  * bouncebuffer). Go to the next state.
316  * SUME_RIFFA_CHAN_STATE_READ: SUME copied data and our bouncebuffer is ready,
317  * we can build the mbuf and go back to the IDLE state.
318  */
319 static void
320 sume_intr_handler(void *arg)
321 {
	/*
	 * Runs the per-channel TX and RX state machines for one interrupt.
	 * All register and channel-state accesses happen under SUME_LOCK; an
	 * mbuf built on the data channel is handed to if_input only after
	 * the lock has been released.
	 */
322 	struct sume_adapter *adapter = arg;
323 	uint32_t vect, vect0, len;
324 	int ch, loops;
325 	device_t dev = adapter->dev;
326 	struct mbuf *m = NULL;
327 	struct ifnet *ifp = NULL;
328 	struct riffa_chnl_dir *send, *recv;
329 
330 	SUME_LOCK(adapter);
331 
	/* Fetch the vector; give up if any SUME_INVALID_VECT bit is set. */
332 	vect0 = read_reg(adapter, RIFFA_IRQ_REG0_OFF);
333 	if ((vect0 & SUME_INVALID_VECT) != 0) {
334 		SUME_UNLOCK(adapter);
335 		return;
336 	}
337 
338 	/*
339 	 * We only have one interrupt for all channels and no way
340 	 * to quickly lookup for which channel(s) we got an interrupt?
341 	 */
342 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
		/* Each channel owns 5 vector bits; move them down to bit 0. */
343 		vect = vect0 >> (5 * ch);
344 		send = adapter->send[ch];
345 		recv = adapter->recv[ch];
346 
		/*
		 * TX state machine.  The iteration cap keeps bits that cannot
		 * be consumed in the current state from spinning forever.
		 */
347 		loops = 0;
348 		while ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
349 		    loops <= 5) {
350 			if (adapter->sume_debug)
351 				device_printf(dev, "TX ch %d state %u vect = "
352 				    "0x%08x\n", ch, send->state, vect);
353 			switch (send->state) {
354 			case SUME_RIFFA_CHAN_STATE_IDLE:
				/* Nothing is consumed here; bits stay set. */
355 				break;
356 			case SUME_RIFFA_CHAN_STATE_READY:
357 				if (!(vect & SUME_MSI_TXBUF)) {
358 					device_printf(dev, "ch %d unexpected "
359 					    "interrupt in send+3 state %u: "
360 					    "vect = 0x%08x\n", ch, send->state,
361 					    vect);
362 					send->recovery = 1;
363 					break;
364 				}
365 				send->state = SUME_RIFFA_CHAN_STATE_READ;
366 				vect &= ~SUME_MSI_TXBUF;
367 				break;
368 			case SUME_RIFFA_CHAN_STATE_READ:
369 				if (!(vect & SUME_MSI_TXDONE)) {
370 					device_printf(dev, "ch %d unexpected "
371 					    "interrupt in send+4 state %u: "
372 					    "vect = 0x%08x\n", ch, send->state,
373 					    vect);
374 					send->recovery = 1;
375 					break;
376 				}
				/*
				 * On the register channel the state stays at
				 * LEN until sume_module_reg_write() or
				 * sume_module_reg_read() resets it to IDLE.
				 */
377 				send->state = SUME_RIFFA_CHAN_STATE_LEN;
378 
				/*
				 * The value read is not used on the TX side.
				 * NOTE(review): presumably the register read
				 * itself is what the hardware needs -- confirm.
				 */
379 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
380 				    RIFFA_RX_TNFR_LEN_REG_OFF));
381 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
382 					send->state =
383 					    SUME_RIFFA_CHAN_STATE_IDLE;
384 					check_tx_queues(adapter);
385 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
386 					wakeup(&send->event);
387 				else {
388 					device_printf(dev, "ch %d unexpected "
389 					    "interrupt in send+4 state %u: "
390 					    "vect = 0x%08x\n", ch, send->state,
391 					    vect);
392 					send->recovery = 1;
393 				}
394 				vect &= ~SUME_MSI_TXDONE;
395 				break;
396 			case SUME_RIFFA_CHAN_STATE_LEN:
397 				break;
398 			default:
399 				device_printf(dev, "unknown TX state!\n");
400 			}
401 			loops++;
402 		}
403 
		/*
		 * NOTE(review): the message says "not in recovery" although
		 * this branch is taken only when send->recovery is set.
		 */
404 		if ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
405 		    send->recovery)
406 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
407 			    "during TX; not in recovery; state = %d loops = "
408 			    "%d\n", ch, vect, send->state, loops);
409 
		/*
		 * RX state machine, bounded the same way (note the cap here is
		 * 'loops < 5' whereas the TX loop uses 'loops <= 5').
		 */
410 		loops = 0;
411 		while ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
412 		    SUME_MSI_RXDONE)) && loops < 5) {
413 			if (adapter->sume_debug)
414 				device_printf(dev, "RX ch %d state %u vect = "
415 				    "0x%08x\n", ch, recv->state, vect);
416 			switch (recv->state) {
417 			case SUME_RIFFA_CHAN_STATE_IDLE:
418 				if (!(vect & SUME_MSI_RXQUE)) {
419 					device_printf(dev, "ch %d unexpected "
420 					    "interrupt in recv+0 state %u: "
421 					    "vect = 0x%08x\n", ch, recv->state,
422 					    vect);
423 					recv->recovery = 1;
424 					break;
425 				}
426 				uint32_t max_ptr;
427 
428 				/* Clear recovery state. */
429 				recv->recovery = 0;
430 
431 				/* Get offset and length. */
432 				recv->offlast = read_reg(adapter,
433 				    RIFFA_CHNL_REG(ch,
434 				    RIFFA_TX_OFFLAST_REG_OFF));
435 				recv->len = read_reg(adapter, RIFFA_CHNL_REG(ch,
436 				    RIFFA_TX_LEN_REG_OFF));
437 
				/*
				 * Both checks below only log a message; the
				 * transfer is still programmed afterwards.
				 */
438 				/* Boundary checks. */
439 				max_ptr = (uint32_t)((uintptr_t)recv->buf_addr
440 				    + SUME_RIFFA_OFFSET(recv->offlast)
441 				    + SUME_RIFFA_LEN(recv->len) - 1);
442 				if (max_ptr <
443 				    (uint32_t)((uintptr_t)recv->buf_addr))
444 					device_printf(dev, "receive buffer "
445 					    "wrap-around overflow.\n");
446 				if (SUME_RIFFA_OFFSET(recv->offlast) +
447 				    SUME_RIFFA_LEN(recv->len) >
448 				    adapter->sg_buf_size)
449 					device_printf(dev, "receive buffer too"
450 					    " small.\n");
451 
452 				/* Fill the bouncebuf "descriptor". */
453 				sume_fill_bb_desc(adapter, recv,
454 				    SUME_RIFFA_LEN(recv->len));
455 
				/*
				 * Hand the bouncebuffer's bus address and the
				 * scatter-gather length to the device.
				 */
456 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
457 				    BUS_DMASYNC_PREREAD |
458 				    BUS_DMASYNC_PREWRITE);
459 				write_reg(adapter, RIFFA_CHNL_REG(ch,
460 				    RIFFA_TX_SG_ADDR_LO_REG_OFF),
461 				    SUME_RIFFA_LO_ADDR(recv->buf_hw_addr));
462 				write_reg(adapter, RIFFA_CHNL_REG(ch,
463 				    RIFFA_TX_SG_ADDR_HI_REG_OFF),
464 				    SUME_RIFFA_HI_ADDR(recv->buf_hw_addr));
465 				write_reg(adapter, RIFFA_CHNL_REG(ch,
466 				    RIFFA_TX_SG_LEN_REG_OFF),
467 				    4 * recv->num_sg);
468 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
469 				    BUS_DMASYNC_POSTREAD |
470 				    BUS_DMASYNC_POSTWRITE);
471 
472 				recv->state = SUME_RIFFA_CHAN_STATE_READY;
473 				vect &= ~SUME_MSI_RXQUE;
474 				break;
475 			case SUME_RIFFA_CHAN_STATE_READY:
476 				if (!(vect & SUME_MSI_RXBUF)) {
477 					device_printf(dev, "ch %d unexpected "
478 					    "interrupt in recv+1 state %u: "
479 					    "vect = 0x%08x\n", ch, recv->state,
480 					    vect);
481 					recv->recovery = 1;
482 					break;
483 				}
484 				recv->state = SUME_RIFFA_CHAN_STATE_READ;
485 				vect &= ~SUME_MSI_RXBUF;
486 				break;
487 			case SUME_RIFFA_CHAN_STATE_READ:
488 				if (!(vect & SUME_MSI_RXDONE)) {
489 					device_printf(dev, "ch %d unexpected "
490 					    "interrupt in recv+2 state %u: "
491 					    "vect = 0x%08x\n", ch, recv->state,
492 					    vect);
493 					recv->recovery = 1;
494 					break;
495 				}
496 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
497 				    RIFFA_TX_TNFR_LEN_REG_OFF));
498 
499 				/* Remember, len and recv->len are words. */
500 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
501 					m = sume_rx_build_mbuf(adapter,
502 					    len << 2);
503 					recv->state =
504 					    SUME_RIFFA_CHAN_STATE_IDLE;
				/*
				 * Register channel: the state is left at READ
				 * so that sume_module_reg_read() sees the
				 * finished transaction and resets it itself.
				 */
505 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
506 					wakeup(&recv->event);
507 				else {
508 					device_printf(dev, "ch %d unexpected "
509 					    "interrupt in recv+2 state %u: "
510 					    "vect = 0x%08x\n", ch, recv->state,
511 					    vect);
512 					recv->recovery = 1;
513 				}
514 				vect &= ~SUME_MSI_RXDONE;
515 				break;
516 			case SUME_RIFFA_CHAN_STATE_LEN:
517 				break;
518 			default:
519 				device_printf(dev, "unknown RX state!\n");
520 			}
521 			loops++;
522 		}
523 
524 		if ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
525 		    SUME_MSI_RXDONE)) && recv->recovery) {
526 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
527 			    "during RX; not in recovery; state = %d, loops = "
528 			    "%d\n", ch, vect, recv->state, loops);
529 
530 			/* Clean the unfinished transaction. */
531 			if (ch == SUME_RIFFA_CHANNEL_REG &&
532 			    vect & SUME_MSI_RXDONE) {
533 				read_reg(adapter, RIFFA_CHNL_REG(ch,
534 				    RIFFA_TX_TNFR_LEN_REG_OFF));
535 				recv->recovery = 0;
536 			}
537 		}
538 	}
539 	SUME_UNLOCK(adapter);
540 
	/* Deliver the packet to the stack without holding the adapter lock. */
541 	if (m != NULL) {
542 		ifp = m->m_pkthdr.rcvif;
543 		(*ifp->if_input)(ifp, m);
544 	}
545 }
546 
547 /*
548  * As we cannot disable interrupt generation, ignore early interrupts by waiting
549  * for the adapter to go into the 'running' state.
550  */
551 static int
552 sume_intr_filter(void *arg)
553 {
554 	struct sume_adapter *adapter = arg;
555 
556 	if (adapter->running == 0)
557 		return (FILTER_STRAY);
558 
559 	return (FILTER_SCHEDULE_THREAD);
560 }
561 
562 static int
563 sume_probe_riffa_pci(struct sume_adapter *adapter)
564 {
565 	device_t dev = adapter->dev;
566 	int error, count, capmem;
567 	uint32_t reg, devctl, linkctl;
568 
569 	pci_enable_busmaster(dev);
570 
571 	adapter->rid = PCIR_BAR(0);
572 	adapter->bar0_addr = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
573 	    &adapter->rid, RF_ACTIVE);
574 	if (adapter->bar0_addr == NULL) {
575 		device_printf(dev, "unable to allocate bus resource: "
576 		    "BAR0 address\n");
577 		return (ENXIO);
578 	}
579 	adapter->bt = rman_get_bustag(adapter->bar0_addr);
580 	adapter->bh = rman_get_bushandle(adapter->bar0_addr);
581 	adapter->bar0_len = rman_get_size(adapter->bar0_addr);
582 	if (adapter->bar0_len != 1024) {
583 		device_printf(dev, "BAR0 resource length %lu != 1024\n",
584 		    adapter->bar0_len);
585 		return (ENXIO);
586 	}
587 
588 	count = pci_msi_count(dev);
589 	error = pci_alloc_msi(dev, &count);
590 	if (error) {
591 		device_printf(dev, "unable to allocate bus resource: PCI "
592 		    "MSI\n");
593 		return (error);
594 	}
595 
596 	adapter->irq.rid = 1; /* Should be 1, thus says pci_alloc_msi() */
597 	adapter->irq.res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
598 	    &adapter->irq.rid, RF_SHAREABLE | RF_ACTIVE);
599 	if (adapter->irq.res == NULL) {
600 		device_printf(dev, "unable to allocate bus resource: IRQ "
601 		    "memory\n");
602 		return (ENXIO);
603 	}
604 
605 	error = bus_setup_intr(dev, adapter->irq.res, INTR_MPSAFE |
606 	    INTR_TYPE_NET, sume_intr_filter, sume_intr_handler, adapter,
607 	    &adapter->irq.tag);
608 	if (error) {
609 		device_printf(dev, "failed to setup interrupt for rid %d, name"
610 		    " %s: %d\n", adapter->irq.rid, "SUME_INTR", error);
611 		return (ENXIO);
612 	}
613 
614 	if (pci_find_cap(dev, PCIY_EXPRESS, &capmem) != 0) {
615 		device_printf(dev, "PCI not PCIe capable\n");
616 		return (ENXIO);
617 	}
618 
619 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL, 2);
620 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL, (devctl |
621 	    PCIEM_CTL_EXT_TAG_FIELD), 2);
622 
623 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL2, 2);
624 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL2, (devctl |
625 	    PCIEM_CTL2_ID_ORDERED_REQ_EN), 2);
626 
627 	linkctl = pci_read_config(dev, capmem + PCIER_LINK_CTL, 2);
628 	pci_write_config(dev, capmem + PCIER_LINK_CTL, (linkctl |
629 	    PCIEM_LINK_CTL_RCB), 2);
630 
631 	reg = read_reg(adapter, RIFFA_INFO_REG_OFF);
632 	adapter->num_sg = RIFFA_SG_ELEMS * ((reg >> 19) & 0xf);
633 	adapter->sg_buf_size = RIFFA_SG_BUF_SIZE * ((reg >> 19) & 0xf);
634 
635 	error = ENODEV;
636 	/* Check bus master is enabled. */
637 	if (((reg >> 4) & 0x1) != 1) {
638 		device_printf(dev, "bus master not enabled: %d\n",
639 		    (reg >> 4) & 0x1);
640 		return (error);
641 	}
642 	/* Check link parameters are valid. */
643 	if (((reg >> 5) & 0x3f) == 0 || ((reg >> 11) & 0x3) == 0) {
644 		device_printf(dev, "link parameters not valid: %d %d\n",
645 		    (reg >> 5) & 0x3f, (reg >> 11) & 0x3);
646 		return (error);
647 	}
648 	/* Check # of channels are within valid range. */
649 	if ((reg & 0xf) == 0 || (reg & 0xf) > RIFFA_MAX_CHNLS) {
650 		device_printf(dev, "number of channels out of range: %d\n",
651 		    reg & 0xf);
652 		return (error);
653 	}
654 	/* Check bus width. */
655 	if (((reg >> 19) & 0xf) == 0 ||
656 	    ((reg >> 19) & 0xf) > RIFFA_MAX_BUS_WIDTH_PARAM) {
657 		device_printf(dev, "bus width out of range: %d\n",
658 		    (reg >> 19) & 0xf);
659 		return (error);
660 	}
661 
662 	device_printf(dev, "[riffa] # of channels: %d\n",
663 	    reg & 0xf);
664 	device_printf(dev, "[riffa] bus interface width: %d\n",
665 	    ((reg >> 19) & 0xf) << 5);
666 	device_printf(dev, "[riffa] bus master enabled: %d\n",
667 	    (reg >> 4) & 0x1);
668 	device_printf(dev, "[riffa] negotiated link width: %d\n",
669 	    (reg >> 5) & 0x3f);
670 	device_printf(dev, "[riffa] negotiated rate width: %d MTs\n",
671 	    ((reg >> 11) & 0x3) * 2500);
672 	device_printf(dev, "[riffa] max downstream payload: %d B\n",
673 	    128 << ((reg >> 13) & 0x7));
674 	device_printf(dev, "[riffa] max upstream payload: %d B\n",
675 	    128 << ((reg >> 16) & 0x7));
676 
677 	return (0);
678 }
679 
/*
 * Interface init callback.  Intentionally does nothing; it exists because
 * ether_ioctl panics when no init routine is provided.
 */
static void
sume_if_init(void *sc)
{

	(void)sc;
}
685 
686 /* Write the address and length for our incoming / outgoing transaction. */
687 static void
688 sume_fill_bb_desc(struct sume_adapter *adapter, struct riffa_chnl_dir *p,
689     uint64_t len)
690 {
691 	struct nf_bb_desc *bouncebuf = (struct nf_bb_desc *) p->buf_addr;
692 
693 	bouncebuf->lower = (p->buf_hw_addr + sizeof(struct nf_bb_desc));
694 	bouncebuf->upper = (p->buf_hw_addr + sizeof(struct nf_bb_desc)) >> 32;
695 	bouncebuf->len = len >> 2;
696 }
697 
698 /* Module register locked write. */
699 static int
700 sume_modreg_write_locked(struct sume_adapter *adapter)
701 {
702 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
703 
704 	/* Let the FPGA know about the transfer. */
705 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
706 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
707 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
708 	    RIFFA_RX_LEN_REG_OFF), send->len);	/* words */
709 
710 	/* Fill the bouncebuf "descriptor". */
711 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
712 
713 	/* Update the state before intiating the DMA to avoid races. */
714 	send->state = SUME_RIFFA_CHAN_STATE_READY;
715 
716 	bus_dmamap_sync(send->ch_tag, send->ch_map,
717 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
718 	/* DMA. */
719 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
720 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
721 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
722 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
723 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
724 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
725 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
726 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
727 	bus_dmamap_sync(send->ch_tag, send->ch_map,
728 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
729 
730 	return (0);
731 }
732 
733 /*
734  * Request a register read or write (depending on optype).
735  * If optype is set (0x1f) this will result in a register write,
736  * otherwise this will result in a register read request at the given
737  * address and the result will need to be DMAed back.
738  */
739 static int
740 sume_module_reg_write(struct nf_priv *nf_priv, struct sume_ifreq *sifr,
741     uint32_t optype)
742 {
743 	struct sume_adapter *adapter = nf_priv->adapter;
744 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
745 	struct nf_regop_data *data;
746 	int error;
747 
748 	/*
749 	 * 1. Make sure the channel is free;  otherwise return EBUSY.
750 	 * 2. Prepare the memory in the bounce buffer (which we always
751 	 *    use for regs).
752 	 * 3. Start the DMA process.
753 	 * 4. Sleep and wait for result and return success or error.
754 	 */
755 	SUME_LOCK(adapter);
756 
757 	if (send->state != SUME_RIFFA_CHAN_STATE_IDLE) {
758 		SUME_UNLOCK(adapter);
759 		return (EBUSY);
760 	}
761 
762 	data = (struct nf_regop_data *) (send->buf_addr +
763 	    sizeof(struct nf_bb_desc));
764 	data->addr = htole32(sifr->addr);
765 	data->val = htole32(sifr->val);
766 	/* Tag to indentify request. */
767 	data->rtag = htole32(++send->rtag);
768 	data->optype = htole32(optype);
769 	send->len = sizeof(struct nf_regop_data) / 4; /* words */
770 
771 	error = sume_modreg_write_locked(adapter);
772 	if (error) {
773 		SUME_UNLOCK(adapter);
774 		return (EFAULT);
775 	}
776 
777 	/* Timeout after 1s. */
778 	if (send->state != SUME_RIFFA_CHAN_STATE_LEN)
779 		error = msleep(&send->event, &adapter->lock, 0,
780 		    "Waiting recv finish", 1 * hz);
781 
782 	/* This was a write so we are done; were interrupted, or timed out. */
783 	if (optype != SUME_MR_READ || error != 0 || error == EWOULDBLOCK) {
784 		send->state = SUME_RIFFA_CHAN_STATE_IDLE;
785 		if (optype == SUME_MR_READ)
786 			error = EWOULDBLOCK;
787 		else
788 			error = 0;
789 	} else
790 		error = 0;
791 
792 	/*
793 	 * For read requests we will update state once we are done
794 	 * having read the result to avoid any two outstanding
795 	 * transactions, or we need a queue and validate tags,
796 	 * which is a lot of work for a low priority, infrequent
797 	 * event.
798 	 */
799 
800 	SUME_UNLOCK(adapter);
801 
802 	return (error);
803 }
804 
805 /* Module register read. */
806 static int
807 sume_module_reg_read(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
808 {
809 	struct sume_adapter *adapter = nf_priv->adapter;
810 	struct riffa_chnl_dir *recv = adapter->recv[SUME_RIFFA_CHANNEL_REG];
811 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
812 	struct nf_regop_data *data;
813 	int error = 0;
814 
815 	/*
816 	 * 0. Sleep waiting for result if needed (unless condition is
817 	 *    true already).
818 	 * 1. Read DMA results.
819 	 * 2. Update state on *TX* to IDLE to allow next read to start.
820 	 */
821 	SUME_LOCK(adapter);
822 
823 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
824 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
825 	/*
826 	 * We only need to be woken up at the end of the transaction.
827 	 * Timeout after 1s.
828 	 */
829 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ)
830 		error = msleep(&recv->event, &adapter->lock, 0,
831 		    "Waiting transaction finish", 1 * hz);
832 
833 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ || error == EWOULDBLOCK) {
834 		SUME_UNLOCK(adapter);
835 		device_printf(adapter->dev, "wait error: %d\n", error);
836 		return (EWOULDBLOCK);
837 	}
838 
839 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
840 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
841 
842 	/*
843 	 * Read reply data and validate address and tag.
844 	 * Note: we do access the send side without lock but the state
845 	 * machine does prevent the data from changing.
846 	 */
847 	data = (struct nf_regop_data *) (recv->buf_addr +
848 	    sizeof(struct nf_bb_desc));
849 
850 	if (le32toh(data->rtag) != send->rtag)
851 		device_printf(adapter->dev, "rtag error: 0x%08x 0x%08x\n",
852 		    le32toh(data->rtag), send->rtag);
853 
854 	sifr->val = le32toh(data->val);
855 	recv->state = SUME_RIFFA_CHAN_STATE_IDLE;
856 
857 	/* We are done. */
858 	send->state = SUME_RIFFA_CHAN_STATE_IDLE;
859 
860 	SUME_UNLOCK(adapter);
861 
862 	return (0);
863 }
864 
865 /* Read value from a module register and return it to a sume_ifreq. */
866 static int
867 get_modreg_value(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
868 {
869 	int error;
870 
871 	error = sume_module_reg_write(nf_priv, sifr, SUME_MR_READ);
872 	if (!error)
873 		error = sume_module_reg_read(nf_priv, sifr);
874 
875 	return (error);
876 }
877 
878 static int
879 sume_if_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
880 {
881 	struct ifreq *ifr = (struct ifreq *) data;
882 	struct nf_priv *nf_priv = ifp->if_softc;
883 	struct sume_ifreq sifr;
884 	int error = 0;
885 
886 	switch (cmd) {
887 	case SIOCGIFMEDIA:
888 	case SIOCGIFXMEDIA:
889 		error = ifmedia_ioctl(ifp, ifr, &nf_priv->media, cmd);
890 		break;
891 
892 	case SUME_IOCTL_CMD_WRITE_REG:
893 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
894 		if (error) {
895 			error = EINVAL;
896 			break;
897 		}
898 		error = sume_module_reg_write(nf_priv, &sifr, SUME_MR_WRITE);
899 		break;
900 
901 	case SUME_IOCTL_CMD_READ_REG:
902 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
903 		if (error) {
904 			error = EINVAL;
905 			break;
906 		}
907 
908 		error = get_modreg_value(nf_priv, &sifr);
909 		if (error)
910 			break;
911 
912 		error = copyout(&sifr, ifr_data_get_ptr(ifr), sizeof(sifr));
913 		if (error)
914 			error = EINVAL;
915 
916 		break;
917 
918 	case SIOCSIFFLAGS:
919 		/* Silence tcpdump 'promisc mode not supported' warning. */
920 		if (ifp->if_flags & IFF_PROMISC)
921 			break;
922 
923 	default:
924 		error = ether_ioctl(ifp, cmd, data);
925 		break;
926 	}
927 
928 	return (error);
929 }
930 
931 static int
932 sume_media_change(struct ifnet *ifp)
933 {
934 	struct nf_priv *nf_priv = ifp->if_softc;
935 	struct ifmedia *ifm = &nf_priv->media;
936 
937 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
938 		return (EINVAL);
939 
940 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_10G_SR)
941 		ifp->if_baudrate = ifmedia_baudrate(IFM_ETHER | IFM_10G_SR);
942 	else
943 		ifp->if_baudrate = ifmedia_baudrate(ifm->ifm_media);
944 
945 	return (0);
946 }
947 
948 static void
949 sume_update_link_status(struct ifnet *ifp)
950 {
951 	struct nf_priv *nf_priv = ifp->if_softc;
952 	struct sume_adapter *adapter = nf_priv->adapter;
953 	struct sume_ifreq sifr;
954 	int link_status;
955 
956 	sifr.addr = SUME_STATUS_ADDR(nf_priv->port);
957 	sifr.val = 0;
958 
959 	if (get_modreg_value(nf_priv, &sifr))
960 		return;
961 
962 	link_status = SUME_LINK_STATUS(sifr.val);
963 
964 	if (!link_status && nf_priv->link_up) {
965 		if_link_state_change(ifp, LINK_STATE_DOWN);
966 		nf_priv->link_up = 0;
967 		if (adapter->sume_debug)
968 			device_printf(adapter->dev, "port %d link state "
969 			    "changed to DOWN\n", nf_priv->unit);
970 	} else if (link_status && !nf_priv->link_up) {
971 		nf_priv->link_up = 1;
972 		if_link_state_change(ifp, LINK_STATE_UP);
973 		if (adapter->sume_debug)
974 			device_printf(adapter->dev, "port %d link state "
975 			    "changed to UP\n", nf_priv->unit);
976 	}
977 }
978 
979 static void
980 sume_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
981 {
982 	struct nf_priv *nf_priv = ifp->if_softc;
983 	struct ifmedia *ifm = &nf_priv->media;
984 
985 	if (ifm->ifm_cur->ifm_media == (IFM_ETHER | IFM_10G_SR) &&
986 	    (ifp->if_flags & IFF_UP))
987 		ifmr->ifm_active = IFM_ETHER | IFM_10G_SR;
988 	else
989 		ifmr->ifm_active = ifm->ifm_cur->ifm_media;
990 
991 	ifmr->ifm_status |= IFM_AVALID;
992 
993 	sume_update_link_status(ifp);
994 
995 	if (nf_priv->link_up)
996 		ifmr->ifm_status |= IFM_ACTIVE;
997 }
998 
999 /*
1000  * Packet to transmit. We take the packet data from the mbuf and copy it to the
1001  * bouncebuffer address buf_addr+3*sizeof(uint32_t)+16. The 16 bytes before the
1002  * packet data are for metadata: sport/dport (depending on our source
1003  * interface), packet length and magic 0xcafe. We tell the SUME about the
1004  * transfer, fill the first 3*sizeof(uint32_t) bytes of the bouncebuffer with
1005  * the information about the start and length of the packet and trigger the
1006  * transaction.
1007  */
1008 static int
1009 sume_if_start_locked(struct ifnet *ifp)
1010 {
1011 	struct mbuf *m;
1012 	struct nf_priv *nf_priv = ifp->if_softc;
1013 	struct sume_adapter *adapter = nf_priv->adapter;
1014 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_DATA];
1015 	uint8_t *outbuf;
1016 	struct nf_metadata *mdata;
1017 	int plen = SUME_MIN_PKT_SIZE;
1018 
1019 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1020 	KASSERT(send->state == SUME_RIFFA_CHAN_STATE_IDLE,
1021 	    ("SUME not in IDLE state"));
1022 
1023 	IFQ_DEQUEUE(&ifp->if_snd, m);
1024 	if (m == NULL)
1025 		return (EINVAL);
1026 
1027 	/* Packets large enough do not need to be padded */
1028 	if (m->m_pkthdr.len > SUME_MIN_PKT_SIZE)
1029 		plen = m->m_pkthdr.len;
1030 
1031 	if (adapter->sume_debug)
1032 		device_printf(adapter->dev, "sending %d bytes to %s%d\n", plen,
1033 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1034 
1035 	outbuf = (uint8_t *) send->buf_addr + sizeof(struct nf_bb_desc);
1036 	mdata = (struct nf_metadata *) outbuf;
1037 
1038 	/* Clear the recovery flag. */
1039 	send->recovery = 0;
1040 
1041 	/* Make sure we fit with the 16 bytes nf_metadata. */
1042 	if (m->m_pkthdr.len + sizeof(struct nf_metadata) >
1043 	    adapter->sg_buf_size) {
1044 		device_printf(adapter->dev, "packet too big for bounce buffer "
1045 		    "(%d)\n", m->m_pkthdr.len);
1046 		m_freem(m);
1047 		nf_priv->stats.tx_dropped++;
1048 		return (ENOMEM);
1049 	}
1050 
1051 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1052 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1053 
1054 	/* Zero out the padded data */
1055 	if (m->m_pkthdr.len < SUME_MIN_PKT_SIZE)
1056 		bzero(outbuf + sizeof(struct nf_metadata), SUME_MIN_PKT_SIZE);
1057 	/* Skip the first 16 bytes for the metadata. */
1058 	m_copydata(m, 0, m->m_pkthdr.len, outbuf + sizeof(struct nf_metadata));
1059 	send->len = (sizeof(struct nf_metadata) + plen + 3) / 4;
1060 
1061 	/* Fill in the metadata: CPU(DMA) ports are odd, MAC ports are even. */
1062 	mdata->sport = htole16(1 << (nf_priv->port * 2 + 1));
1063 	mdata->dport = htole16(1 << (nf_priv->port * 2));
1064 	mdata->plen = htole16(plen);
1065 	mdata->magic = htole16(SUME_RIFFA_MAGIC);
1066 	mdata->t1 = htole32(0);
1067 	mdata->t2 = htole32(0);
1068 
1069 	/* Let the FPGA know about the transfer. */
1070 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1071 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
1072 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1073 	    RIFFA_RX_LEN_REG_OFF), send->len);
1074 
1075 	/* Fill the bouncebuf "descriptor". */
1076 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
1077 
1078 	/* Update the state before intiating the DMA to avoid races. */
1079 	send->state = SUME_RIFFA_CHAN_STATE_READY;
1080 
1081 	/* DMA. */
1082 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1083 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
1084 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
1085 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1086 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
1087 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
1088 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1089 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
1090 
1091 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1092 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1093 
1094 	nf_priv->stats.tx_packets++;
1095 	nf_priv->stats.tx_bytes += plen;
1096 
1097 	/* We can free as long as we use the bounce buffer. */
1098 	m_freem(m);
1099 
1100 	adapter->last_ifc = nf_priv->port;
1101 
1102 	/* Reset watchdog counter. */
1103 	adapter->wd_counter = 0;
1104 
1105 	return (0);
1106 }
1107 
1108 static void
1109 sume_if_start(struct ifnet *ifp)
1110 {
1111 	struct nf_priv *nf_priv = ifp->if_softc;
1112 	struct sume_adapter *adapter = nf_priv->adapter;
1113 
1114 	if (!adapter->running || !(ifp->if_flags & IFF_UP))
1115 		return;
1116 
1117 	SUME_LOCK(adapter);
1118 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state ==
1119 	    SUME_RIFFA_CHAN_STATE_IDLE)
1120 		sume_if_start_locked(ifp);
1121 	SUME_UNLOCK(adapter);
1122 }
1123 
1124 /*
1125  * We call this function at the end of every TX transaction to check for
1126  * remaining packets in the TX queues for every UP interface.
1127  */
1128 static void
1129 check_tx_queues(struct sume_adapter *adapter)
1130 {
1131 	int i, last_ifc;
1132 
1133 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1134 
1135 	last_ifc = adapter->last_ifc;
1136 
1137 	/* Check all interfaces */
1138 	for (i = last_ifc + 1; i < last_ifc + SUME_NPORTS + 1; i++) {
1139 		struct ifnet *ifp = adapter->ifp[i % SUME_NPORTS];
1140 
1141 		if (!(ifp->if_flags & IFF_UP))
1142 			continue;
1143 
1144 		if (!sume_if_start_locked(ifp))
1145 			break;
1146 	}
1147 }
1148 
1149 static int
1150 sume_ifp_alloc(struct sume_adapter *adapter, uint32_t port)
1151 {
1152 	struct ifnet *ifp;
1153 	struct nf_priv *nf_priv = malloc(sizeof(struct nf_priv), M_SUME,
1154 	    M_ZERO | M_WAITOK);
1155 
1156 	ifp = if_alloc(IFT_ETHER);
1157 	if (ifp == NULL) {
1158 		device_printf(adapter->dev, "cannot allocate ifnet\n");
1159 		return (ENOMEM);
1160 	}
1161 
1162 	adapter->ifp[port] = ifp;
1163 	ifp->if_softc = nf_priv;
1164 
1165 	nf_priv->adapter = adapter;
1166 	nf_priv->unit = alloc_unr(unr);
1167 	nf_priv->port = port;
1168 	nf_priv->link_up = 0;
1169 
1170 	if_initname(ifp, SUME_ETH_DEVICE_NAME, nf_priv->unit);
1171 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1172 
1173 	ifp->if_init = sume_if_init;
1174 	ifp->if_start = sume_if_start;
1175 	ifp->if_ioctl = sume_if_ioctl;
1176 
1177 	uint8_t hw_addr[ETHER_ADDR_LEN] = DEFAULT_ETHER_ADDRESS;
1178 	hw_addr[ETHER_ADDR_LEN-1] = nf_priv->unit;
1179 	ether_ifattach(ifp, hw_addr);
1180 
1181 	ifmedia_init(&nf_priv->media, IFM_IMASK, sume_media_change,
1182 	    sume_media_status);
1183 	ifmedia_add(&nf_priv->media, IFM_ETHER | IFM_10G_SR, 0, NULL);
1184 	ifmedia_set(&nf_priv->media, IFM_ETHER | IFM_10G_SR);
1185 
1186 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1187 
1188 	return (0);
1189 }
1190 
1191 static void
1192 callback_dma(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1193 {
1194 	if (err)
1195 		return;
1196 
1197 	KASSERT(nseg == 1, ("%d segments returned!", nseg));
1198 
1199 	*(bus_addr_t *) arg = segs[0].ds_addr;
1200 }
1201 
1202 static int
1203 sume_probe_riffa_buffer(const struct sume_adapter *adapter,
1204     struct riffa_chnl_dir ***p, const char *dir)
1205 {
1206 	struct riffa_chnl_dir **rp;
1207 	bus_addr_t hw_addr;
1208 	int error, ch;
1209 	device_t dev = adapter->dev;
1210 
1211 	error = ENOMEM;
1212 	*p = malloc(SUME_RIFFA_CHANNELS * sizeof(struct riffa_chnl_dir *),
1213 	    M_SUME, M_ZERO | M_WAITOK);
1214 	if (*p == NULL) {
1215 		device_printf(dev, "malloc(%s) failed.\n", dir);
1216 		return (error);
1217 	}
1218 
1219 	rp = *p;
1220 	/* Allocate the chnl_dir structs themselves. */
1221 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1222 		/* One direction. */
1223 		rp[ch] = malloc(sizeof(struct riffa_chnl_dir), M_SUME,
1224 		    M_ZERO | M_WAITOK);
1225 		if (rp[ch] == NULL) {
1226 			device_printf(dev, "malloc(%s[%d]) riffa_chnl_dir "
1227 			    "failed.\n", dir, ch);
1228 			return (error);
1229 		}
1230 
1231 		int err = bus_dma_tag_create(bus_get_dma_tag(dev),
1232 		    4, 0,
1233 		    BUS_SPACE_MAXADDR,
1234 		    BUS_SPACE_MAXADDR,
1235 		    NULL, NULL,
1236 		    adapter->sg_buf_size,
1237 		    1,
1238 		    adapter->sg_buf_size,
1239 		    0,
1240 		    NULL,
1241 		    NULL,
1242 		    &rp[ch]->ch_tag);
1243 
1244 		if (err) {
1245 			device_printf(dev, "bus_dma_tag_create(%s[%d]) "
1246 			    "failed.\n", dir, ch);
1247 			return (err);
1248 		}
1249 
1250 		err = bus_dmamem_alloc(rp[ch]->ch_tag, (void **)
1251 		    &rp[ch]->buf_addr, BUS_DMA_WAITOK | BUS_DMA_COHERENT |
1252 		    BUS_DMA_ZERO, &rp[ch]->ch_map);
1253 		if (err) {
1254 			device_printf(dev, "bus_dmamem_alloc(%s[%d]) failed.\n",
1255 			    dir, ch);
1256 			return (err);
1257 		}
1258 
1259 		bzero(rp[ch]->buf_addr, adapter->sg_buf_size);
1260 
1261 		err = bus_dmamap_load(rp[ch]->ch_tag, rp[ch]->ch_map,
1262 		    rp[ch]->buf_addr, adapter->sg_buf_size, callback_dma,
1263 		    &hw_addr, BUS_DMA_NOWAIT);
1264 		if (err) {
1265 			device_printf(dev, "bus_dmamap_load(%s[%d]) failed.\n",
1266 			    dir, ch);
1267 			return (err);
1268 		}
1269 		rp[ch]->buf_hw_addr = hw_addr;
1270 		rp[ch]->num_sg = 1;
1271 		rp[ch]->state = SUME_RIFFA_CHAN_STATE_IDLE;
1272 
1273 		rp[ch]->rtag = SUME_INIT_RTAG;
1274 	}
1275 
1276 	return (0);
1277 }
1278 
1279 static int
1280 sume_probe_riffa_buffers(struct sume_adapter *adapter)
1281 {
1282 	int error;
1283 
1284 	error = sume_probe_riffa_buffer(adapter, &adapter->recv, "recv");
1285 	if (error)
1286 		return (error);
1287 
1288 	error = sume_probe_riffa_buffer(adapter, &adapter->send, "send");
1289 
1290 	return (error);
1291 }
1292 
1293 static void
1294 sume_sysctl_init(struct sume_adapter *adapter)
1295 {
1296 	device_t dev = adapter->dev;
1297 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
1298 	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
1299 	struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree);
1300 	struct sysctl_oid *tmp_tree;
1301 	char namebuf[MAX_IFC_NAME_LEN];
1302 	int i;
1303 
1304 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "sume", CTLFLAG_RW,
1305 	    0, "SUME top-level tree");
1306 	if (tree == NULL) {
1307 		device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1308 		return;
1309 	}
1310 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug", CTLFLAG_RW,
1311 	    &adapter->sume_debug, 0, "debug int leaf");
1312 
1313 	/* total RX error stats */
1314 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_epkts",
1315 	    CTLFLAG_RD, &adapter->packets_err, 0, "rx errors");
1316 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_ebytes",
1317 	    CTLFLAG_RD, &adapter->bytes_err, 0, "rx error bytes");
1318 
1319 	for (i = SUME_NPORTS - 1; i >= 0; i--) {
1320 		struct ifnet *ifp = adapter->ifp[i];
1321 		if (ifp == NULL)
1322 			continue;
1323 
1324 		struct nf_priv *nf_priv = ifp->if_softc;
1325 
1326 		snprintf(namebuf, MAX_IFC_NAME_LEN, "%s%d",
1327 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1328 		tmp_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
1329 		    CTLFLAG_RW, 0, "SUME ifc tree");
1330 		if (tmp_tree == NULL) {
1331 			device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1332 			return;
1333 		}
1334 
1335 		/* Packets dropped by down interface. */
1336 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1337 		    "ifc_down_bytes", CTLFLAG_RD,
1338 		    &nf_priv->stats.ifc_down_bytes, 0, "ifc_down bytes");
1339 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1340 		    "ifc_down_packets", CTLFLAG_RD,
1341 		    &nf_priv->stats.ifc_down_packets, 0, "ifc_down packets");
1342 
1343 		/* HW RX stats */
1344 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1345 		    "hw_rx_packets", CTLFLAG_RD, &nf_priv->stats.hw_rx_packets,
1346 		    0, "hw_rx packets");
1347 
1348 		/* HW TX stats */
1349 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1350 		    "hw_tx_packets", CTLFLAG_RD, &nf_priv->stats.hw_tx_packets,
1351 		    0, "hw_tx packets");
1352 
1353 		/* RX stats */
1354 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1355 		    "rx_bytes", CTLFLAG_RD, &nf_priv->stats.rx_bytes, 0,
1356 		    "rx bytes");
1357 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1358 		    "rx_dropped", CTLFLAG_RD, &nf_priv->stats.rx_dropped, 0,
1359 		    "rx dropped");
1360 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1361 		    "rx_packets", CTLFLAG_RD, &nf_priv->stats.rx_packets, 0,
1362 		    "rx packets");
1363 
1364 		/* TX stats */
1365 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1366 		    "tx_bytes", CTLFLAG_RD, &nf_priv->stats.tx_bytes, 0,
1367 		    "tx bytes");
1368 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1369 		    "tx_dropped", CTLFLAG_RD, &nf_priv->stats.tx_dropped, 0,
1370 		    "tx dropped");
1371 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1372 		    "tx_packets", CTLFLAG_RD, &nf_priv->stats.tx_packets, 0,
1373 		    "tx packets");
1374 	}
1375 }
1376 
1377 static void
1378 sume_local_timer(void *arg)
1379 {
1380 	struct sume_adapter *adapter = arg;
1381 
1382 	if (!adapter->running)
1383 		return;
1384 
1385 	taskqueue_enqueue(adapter->tq, &adapter->stat_task);
1386 
1387 	SUME_LOCK(adapter);
1388 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state !=
1389 	    SUME_RIFFA_CHAN_STATE_IDLE && ++adapter->wd_counter >= 3) {
1390 		/* Resetting interfaces if stuck for 3 seconds. */
1391 		device_printf(adapter->dev, "TX stuck, resetting adapter.\n");
1392 		read_reg(adapter, RIFFA_INFO_REG_OFF);
1393 
1394 		adapter->send[SUME_RIFFA_CHANNEL_DATA]->state =
1395 		    SUME_RIFFA_CHAN_STATE_IDLE;
1396 		adapter->wd_counter = 0;
1397 
1398 		check_tx_queues(adapter);
1399 	}
1400 	SUME_UNLOCK(adapter);
1401 
1402 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1403 }
1404 
1405 static void
1406 sume_get_stats(void *context, int pending)
1407 {
1408 	struct sume_adapter *adapter = context;
1409 	int i;
1410 
1411 	for (i = 0; i < SUME_NPORTS; i++) {
1412 		struct ifnet *ifp = adapter->ifp[i];
1413 
1414 		if (ifp->if_flags & IFF_UP) {
1415 			struct nf_priv *nf_priv = ifp->if_softc;
1416 			struct sume_ifreq sifr;
1417 
1418 			sume_update_link_status(ifp);
1419 
1420 			/* Get RX counter. */
1421 			sifr.addr = SUME_STAT_RX_ADDR(nf_priv->port);
1422 			sifr.val = 0;
1423 
1424 			if (!get_modreg_value(nf_priv, &sifr))
1425 				nf_priv->stats.hw_rx_packets += sifr.val;
1426 
1427 			/* Get TX counter. */
1428 			sifr.addr = SUME_STAT_TX_ADDR(nf_priv->port);
1429 			sifr.val = 0;
1430 
1431 			if (!get_modreg_value(nf_priv, &sifr))
1432 				nf_priv->stats.hw_tx_packets += sifr.val;
1433 		}
1434 	}
1435 }
1436 
1437 static int
1438 sume_attach(device_t dev)
1439 {
1440 	struct sume_adapter *adapter = device_get_softc(dev);
1441 	adapter->dev = dev;
1442 	int error, i;
1443 
1444 	mtx_init(&adapter->lock, "Global lock", NULL, MTX_DEF);
1445 
1446 	adapter->running = 0;
1447 
1448 	/* OK finish up RIFFA. */
1449 	error = sume_probe_riffa_pci(adapter);
1450 	if (error != 0)
1451 		goto error;
1452 
1453 	error = sume_probe_riffa_buffers(adapter);
1454 	if (error != 0)
1455 		goto error;
1456 
1457 	/* Now do the network interfaces. */
1458 	for (i = 0; i < SUME_NPORTS; i++) {
1459 		error = sume_ifp_alloc(adapter, i);
1460 		if (error != 0)
1461 			goto error;
1462 	}
1463 
1464 	/*  Register stats and register sysctls. */
1465 	sume_sysctl_init(adapter);
1466 
1467 	/* Reset the HW. */
1468 	read_reg(adapter, RIFFA_INFO_REG_OFF);
1469 
1470 	/* Ready to go, "enable" IRQ. */
1471 	adapter->running = 1;
1472 
1473 	callout_init(&adapter->timer, 1);
1474 	TASK_INIT(&adapter->stat_task, 0, sume_get_stats, adapter);
1475 
1476 	adapter->tq = taskqueue_create("sume_stats", M_NOWAIT,
1477 	    taskqueue_thread_enqueue, &adapter->tq);
1478 	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s stattaskq",
1479 	    device_get_nameunit(adapter->dev));
1480 
1481 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1482 
1483 	return (0);
1484 
1485 error:
1486 	sume_detach(dev);
1487 
1488 	return (error);
1489 }
1490 
1491 static void
1492 sume_remove_riffa_buffer(const struct sume_adapter *adapter,
1493     struct riffa_chnl_dir **pp)
1494 {
1495 	int ch;
1496 
1497 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1498 		if (pp[ch] == NULL)
1499 			continue;
1500 
1501 		if (pp[ch]->buf_hw_addr != 0) {
1502 			bus_dmamem_free(pp[ch]->ch_tag, pp[ch]->buf_addr,
1503 			    pp[ch]->ch_map);
1504 			pp[ch]->buf_hw_addr = 0;
1505 		}
1506 
1507 		free(pp[ch], M_SUME);
1508 	}
1509 }
1510 
1511 static void
1512 sume_remove_riffa_buffers(struct sume_adapter *adapter)
1513 {
1514 	if (adapter->send != NULL) {
1515 		sume_remove_riffa_buffer(adapter, adapter->send);
1516 		free(adapter->send, M_SUME);
1517 		adapter->send = NULL;
1518 	}
1519 	if (adapter->recv != NULL) {
1520 		sume_remove_riffa_buffer(adapter, adapter->recv);
1521 		free(adapter->recv, M_SUME);
1522 		adapter->recv = NULL;
1523 	}
1524 }
1525 
1526 static int
1527 sume_detach(device_t dev)
1528 {
1529 	struct sume_adapter *adapter = device_get_softc(dev);
1530 	int i;
1531 	struct nf_priv *nf_priv;
1532 
1533 	KASSERT(mtx_initialized(&adapter->lock), ("SUME mutex not "
1534 	    "initialized"));
1535 	adapter->running = 0;
1536 
1537 	/* Drain the stats callout and task queue. */
1538 	callout_drain(&adapter->timer);
1539 
1540 	if (adapter->tq) {
1541 		taskqueue_drain(adapter->tq, &adapter->stat_task);
1542 		taskqueue_free(adapter->tq);
1543 	}
1544 
1545 	for (i = 0; i < SUME_NPORTS; i++) {
1546 		struct ifnet *ifp = adapter->ifp[i];
1547 		if (ifp == NULL)
1548 			continue;
1549 
1550 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1551 		nf_priv = ifp->if_softc;
1552 
1553 		if (ifp->if_flags & IFF_UP)
1554 			if_down(ifp);
1555 		ifmedia_removeall(&nf_priv->media);
1556 		free_unr(unr, nf_priv->unit);
1557 
1558 		ifp->if_flags &= ~IFF_UP;
1559 		ether_ifdetach(ifp);
1560 		if_free(ifp);
1561 
1562 		free(nf_priv, M_SUME);
1563 	}
1564 
1565 	sume_remove_riffa_buffers(adapter);
1566 
1567 	if (adapter->irq.tag)
1568 		bus_teardown_intr(dev, adapter->irq.res, adapter->irq.tag);
1569 	if (adapter->irq.res)
1570 		bus_release_resource(dev, SYS_RES_IRQ, adapter->irq.rid,
1571 		    adapter->irq.res);
1572 
1573 	pci_release_msi(dev);
1574 
1575 	if (adapter->bar0_addr)
1576 		bus_release_resource(dev, SYS_RES_MEMORY, adapter->rid,
1577 		    adapter->bar0_addr);
1578 
1579 	mtx_destroy(&adapter->lock);
1580 
1581 	return (0);
1582 }
1583 
1584 static int
1585 mod_event(module_t mod, int cmd, void *arg)
1586 {
1587 	switch (cmd) {
1588 	case MOD_LOAD:
1589 		unr = new_unrhdr(0, INT_MAX, NULL);
1590 		break;
1591 
1592 	case MOD_UNLOAD:
1593 		delete_unrhdr(unr);
1594 		break;
1595 	}
1596 
1597 	return (0);
1598 }
/* Device class storage handed to DRIVER_MODULE() below. */
static devclass_t sume_devclass;

/*
 * Register the driver on the pci bus, with mod_event() as the module event
 * handler, and declare the module version.
 */
DRIVER_MODULE(sume, pci, sume_driver, sume_devclass, mod_event, 0);
MODULE_VERSION(sume, 1);
1602 MODULE_VERSION(sume, 1);
1603