xref: /freebsd/sys/dev/sume/if_sume.c (revision d59a76183470685bdf0b88013d2baad1f04f030f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Bjoern A. Zeeb
5  * Copyright (c) 2020 Denis Salopek
6  *
7  * This software was developed by SRI International and the University of
8  * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-11-C-0249
9  * ("MRC2"), as part of the DARPA MRC research programme.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/bus.h>
35 #include <sys/endian.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
38 #include <sys/module.h>
39 #include <sys/rman.h>
40 #include <sys/socket.h>
41 #include <sys/sockio.h>
42 #include <sys/sysctl.h>
43 #include <sys/taskqueue.h>
44 
45 #include <net/if.h>
46 #include <net/if_media.h>
47 #include <net/if_types.h>
48 #include <net/if_var.h>
49 
50 #include <netinet/in.h>
51 #include <netinet/if_ether.h>
52 
53 #include <dev/pci/pcivar.h>
54 #include <dev/pci/pcireg.h>
55 
56 #include <machine/bus.h>
57 
58 #include "adapter.h"
59 
60 #define	PCI_VENDOR_ID_XILINX	0x10ee
61 #define	PCI_DEVICE_ID_SUME	0x7028
62 
63 /* SUME bus driver interface */
64 static int sume_probe(device_t);
65 static int sume_attach(device_t);
66 static int sume_detach(device_t);
67 
68 static device_method_t sume_methods[] = {
69 	DEVMETHOD(device_probe,		sume_probe),
70 	DEVMETHOD(device_attach,	sume_attach),
71 	DEVMETHOD(device_detach,	sume_detach),
72 	DEVMETHOD_END
73 };
74 
75 static driver_t sume_driver = {
76 	"sume",
77 	sume_methods,
78 	sizeof(struct sume_adapter)
79 };
80 
81 /*
82  * The DMA engine for SUME generates interrupts for each RX/TX transaction.
83  * Depending on the channel (0 if packet transaction, 1 if register transaction)
84  * the used bits of the interrupt vector will be the lowest or the second lowest
85  * 5 bits.
86  *
87  * When receiving packets from SUME (RX):
88  * (1) SUME received a packet on one of the interfaces.
89  * (2) SUME generates an interrupt vector, bit 00001 is set (channel 0 - new RX
90  *     transaction).
91  * (3) We read the length of the incoming packet and the offset along with the
92  *     'last' flag from the SUME registers.
93  * (4) We prepare for the DMA transaction by setting the bouncebuffer on the
94  *     address buf_addr. For now, this is how it's done:
95  *     - First 3*sizeof(uint32_t) bytes are: lower and upper 32 bits of physical
96  *     address where we want the data to arrive (buf_addr[0] and buf_addr[1]),
97  *     and length of incoming data (buf_addr[2]).
98  *     - Data will start right after, at buf_addr+3*sizeof(uint32_t). The
99  *     physical address buf_hw_addr is a block of contiguous memory mapped to
100  *     buf_addr, so we can set the incoming data's physical address (buf_addr[0]
101  *     and buf_addr[1]) to buf_hw_addr+3*sizeof(uint32_t).
102  * (5) We notify SUME that the bouncebuffer is ready for the transaction by
103  *     writing the lower/upper physical address buf_hw_addr to the SUME
104  *     registers RIFFA_TX_SG_ADDR_LO_REG_OFF and RIFFA_TX_SG_ADDR_HI_REG_OFF as
105  *     well as the number of segments to the register RIFFA_TX_SG_LEN_REG_OFF.
106  * (6) SUME generates an interrupt vector, bit 00010 is set (channel 0 -
107  *     bouncebuffer received).
108  * (7) SUME generates an interrupt vector, bit 00100 is set (channel 0 -
109  *     transaction is done).
110  * (8) SUME can do both steps (6) and (7) using the same interrupt.
111  * (9) We read the first 16 bytes (metadata) of the received data and note the
112  *     incoming interface so we can later forward it to the right one in the OS
113  *     (sume0, sume1, sume2 or sume3).
114  * (10) We create an mbuf and copy the data from the bouncebuffer to the mbuf
115  *     and set the mbuf rcvif to the incoming interface.
116  * (11) We forward the mbuf to the appropriate interface via ifp->if_input.
117  *
118  * When sending packets to SUME (TX):
119  * (1) The OS calls sume_if_start() function on TX.
120  * (2) We get the mbuf packet data and copy it to the
121  *     buf_addr+3*sizeof(uint32_t) + metadata 16 bytes.
122  * (3) We create the metadata based on the output interface and copy it to the
123  *     buf_addr+3*sizeof(uint32_t).
124  * (4) We write the offset/last and length of the packet to the SUME registers
125  *     RIFFA_RX_OFFLAST_REG_OFF and RIFFA_RX_LEN_REG_OFF.
126  * (5) We fill the bouncebuffer by filling the first 3*sizeof(uint32_t) bytes
127  *     with the physical address and length just as in RX step (4).
128  * (6) We notify SUME that the bouncebuffer is ready by writing to SUME
129  *     registers RIFFA_RX_SG_ADDR_LO_REG_OFF, RIFFA_RX_SG_ADDR_HI_REG_OFF and
130  *     RIFFA_RX_SG_LEN_REG_OFF just as in RX step (5).
131  * (7) SUME generates an interrupt vector, bit 01000 is set (channel 0 -
132  *     bouncebuffer is read).
133  * (8) SUME generates an interrupt vector, bit 10000 is set (channel 0 -
134  *     transaction is done).
135  * (9) SUME can do both steps (7) and (8) using the same interrupt.
136  *
137  * Internal registers
138  * Every module in the SUME hardware has its own set of internal registers
139  * (IDs, for debugging and statistic purposes, etc.). Their base addresses are
140  * defined in 'projects/reference_nic/hw/tcl/reference_nic_defines.tcl' and the
141  * offsets to different memory locations of every module are defined in their
142  * corresponding folder inside the library. These registers can be RO/RW and
143  * there is a special method to fetch/change this data over 1 or 2 DMA
144  * transactions. For writing, by calling the sume_module_reg_write(). For
145  * reading, by calling the sume_module_reg_write() and then
146  * sume_module_reg_read(). Check those functions for more information.
147  */
148 
149 MALLOC_DECLARE(M_SUME);
150 MALLOC_DEFINE(M_SUME, "sume", "NetFPGA SUME device driver");
151 
152 static void check_tx_queues(struct sume_adapter *);
153 static void sume_fill_bb_desc(struct sume_adapter *, struct riffa_chnl_dir *,
154     uint64_t);
155 
156 static struct unrhdr *unr;
157 
158 static struct {
159 	uint16_t device;
160 	char *desc;
161 } sume_pciids[] = {
162 	{PCI_DEVICE_ID_SUME, "NetFPGA SUME reference NIC"},
163 };
164 
165 static inline uint32_t
166 read_reg(struct sume_adapter *adapter, int offset)
167 {
168 
169 	return (bus_space_read_4(adapter->bt, adapter->bh, offset << 2));
170 }
171 
172 static inline void
173 write_reg(struct sume_adapter *adapter, int offset, uint32_t val)
174 {
175 
176 	bus_space_write_4(adapter->bt, adapter->bh, offset << 2, val);
177 }
178 
179 static int
180 sume_probe(device_t dev)
181 {
182 	int i;
183 	uint16_t v = pci_get_vendor(dev);
184 	uint16_t d = pci_get_device(dev);
185 
186 	if (v != PCI_VENDOR_ID_XILINX)
187 		return (ENXIO);
188 
189 	for (i = 0; i < nitems(sume_pciids); i++) {
190 		if (d == sume_pciids[i].device) {
191 			device_set_desc(dev, sume_pciids[i].desc);
192 			return (BUS_PROBE_DEFAULT);
193 		}
194 	}
195 
196 	return (ENXIO);
197 }
198 
199 /*
200  * Building mbuf for packet received from SUME. We expect to receive 'len'
201  * bytes of data (including metadata) written from the bouncebuffer address
202  * buf_addr+3*sizeof(uint32_t). Metadata will tell us which SUME interface
203  * received the packet (sport will be 1, 2, 4 or 8), the packet length (plen),
204  * and the magic word needs to be 0xcafe. When we have the packet data, we
205  * create an mbuf and copy the data to it using m_copyback() function, set the
206  * correct interface to rcvif and return the mbuf to be later sent to the OS
207  * with if_input.
208  */
209 static struct mbuf *
210 sume_rx_build_mbuf(struct sume_adapter *adapter, uint32_t len)
211 {
212 	struct nf_priv *nf_priv;
213 	struct mbuf *m;
214 	if_t ifp = NULL;
215 	int np;
216 	uint16_t dport, plen, magic;
217 	device_t dev = adapter->dev;
218 	uint8_t *indata = (uint8_t *)
219 	    adapter->recv[SUME_RIFFA_CHANNEL_DATA]->buf_addr +
220 	    sizeof(struct nf_bb_desc);
221 	struct nf_metadata *mdata = (struct nf_metadata *) indata;
222 
223 	/* The metadata header is 16 bytes. */
224 	if (len < sizeof(struct nf_metadata)) {
225 		device_printf(dev, "short frame (%d)\n", len);
226 		adapter->packets_err++;
227 		adapter->bytes_err += len;
228 		return (NULL);
229 	}
230 
231 	dport = le16toh(mdata->dport);
232 	plen = le16toh(mdata->plen);
233 	magic = le16toh(mdata->magic);
234 
235 	if (sizeof(struct nf_metadata) + plen > len ||
236 	    magic != SUME_RIFFA_MAGIC) {
237 		device_printf(dev, "corrupted packet (%zd + %d > %d || magic "
238 		    "0x%04x != 0x%04x)\n", sizeof(struct nf_metadata), plen,
239 		    len, magic, SUME_RIFFA_MAGIC);
240 		return (NULL);
241 	}
242 
243 	/* We got the packet from one of the even bits */
244 	np = (ffs(dport & SUME_DPORT_MASK) >> 1) - 1;
245 	if (np > SUME_NPORTS) {
246 		device_printf(dev, "invalid destination port 0x%04x (%d)\n",
247 		    dport, np);
248 		adapter->packets_err++;
249 		adapter->bytes_err += plen;
250 		return (NULL);
251 	}
252 	ifp = adapter->ifp[np];
253 	nf_priv = if_getsoftc(ifp);
254 	nf_priv->stats.rx_packets++;
255 	nf_priv->stats.rx_bytes += plen;
256 
257 	/* If the interface is down, well, we are done. */
258 	if (!(if_getflags(ifp) & IFF_UP)) {
259 		nf_priv->stats.ifc_down_packets++;
260 		nf_priv->stats.ifc_down_bytes += plen;
261 		return (NULL);
262 	}
263 
264 	if (adapter->sume_debug)
265 		printf("Building mbuf with length: %d\n", plen);
266 
267 	m = m_getm(NULL, plen, M_NOWAIT, MT_DATA);
268 	if (m == NULL) {
269 		adapter->packets_err++;
270 		adapter->bytes_err += plen;
271 		return (NULL);
272 	}
273 
274 	/* Copy the data in at the right offset. */
275 	m_copyback(m, 0, plen, (void *) (indata + sizeof(struct nf_metadata)));
276 	m->m_pkthdr.rcvif = ifp;
277 
278 	return (m);
279 }
280 
281 /*
282  * SUME interrupt handler for when we get a valid interrupt from the board.
283  * Theoretically, we can receive interrupt for any of the available channels,
284  * but RIFFA DMA uses only 2: 0 and 1, so we use only vect0. The vector is a 32
285  * bit number, using 5 bits for every channel, the least significant bits
286  * correspond to channel 0 and the next 5 bits correspond to channel 1. Vector
287  * bits for RX/TX are:
288  * RX
289  * bit 0 - new transaction from SUME
290  * bit 1 - SUME received our bouncebuffer address
291  * bit 2 - SUME copied the received data to our bouncebuffer, transaction done
292  * TX
293  * bit 3 - SUME received our bouncebuffer address
294  * bit 4 - SUME copied the data from our bouncebuffer, transaction done
295  *
296  * There are two finite state machines (one for TX, one for RX). We loop
297  * through channels 0 and 1 to check and our current state and which interrupt
298  * bit is set.
299  * TX
300  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the first TX transaction.
301  * SUME_RIFFA_CHAN_STATE_READY: we prepared (filled with data) the bouncebuffer
302  * and triggered the SUME for the TX transaction. Waiting for interrupt bit 3
303  * to go to the next state.
304  * SUME_RIFFA_CHAN_STATE_READ: waiting for interrupt bit 4 (for SUME to send
305  * our packet). Then we get the length of the sent data and go back to the
306  * IDLE state.
307  * RX
308  * SUME_RIFFA_CHAN_STATE_IDLE: waiting for the interrupt bit 0 (new RX
309  * transaction). When we get it, we prepare our bouncebuffer for reading and
310  * trigger the SUME to start the transaction. Go to the next state.
311  * SUME_RIFFA_CHAN_STATE_READY: waiting for the interrupt bit 1 (SUME got our
312  * bouncebuffer). Go to the next state.
313  * SUME_RIFFA_CHAN_STATE_READ: SUME copied data and our bouncebuffer is ready,
314  * we can build the mbuf and go back to the IDLE state.
315  */
316 static void
317 sume_intr_handler(void *arg)
318 {
319 	struct sume_adapter *adapter = arg;
320 	uint32_t vect, vect0, len;
321 	int ch, loops;
322 	device_t dev = adapter->dev;
323 	struct mbuf *m = NULL;
324 	if_t ifp = NULL;
325 	struct riffa_chnl_dir *send, *recv;
326 
327 	SUME_LOCK(adapter);
328 
329 	vect0 = read_reg(adapter, RIFFA_IRQ_REG0_OFF);
330 	if ((vect0 & SUME_INVALID_VECT) != 0) {
331 		SUME_UNLOCK(adapter);
332 		return;
333 	}
334 
335 	/*
336 	 * We only have one interrupt for all channels and no way
337 	 * to quickly lookup for which channel(s) we got an interrupt?
338 	 */
339 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
340 		vect = vect0 >> (5 * ch);
341 		send = adapter->send[ch];
342 		recv = adapter->recv[ch];
343 
344 		loops = 0;
345 		while ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
346 		    loops <= 5) {
347 			if (adapter->sume_debug)
348 				device_printf(dev, "TX ch %d state %u vect = "
349 				    "0x%08x\n", ch, send->state, vect);
350 			switch (send->state) {
351 			case SUME_RIFFA_CHAN_STATE_IDLE:
352 				break;
353 			case SUME_RIFFA_CHAN_STATE_READY:
354 				if (!(vect & SUME_MSI_TXBUF)) {
355 					device_printf(dev, "ch %d unexpected "
356 					    "interrupt in send+3 state %u: "
357 					    "vect = 0x%08x\n", ch, send->state,
358 					    vect);
359 					send->recovery = 1;
360 					break;
361 				}
362 				send->state = SUME_RIFFA_CHAN_STATE_READ;
363 				vect &= ~SUME_MSI_TXBUF;
364 				break;
365 			case SUME_RIFFA_CHAN_STATE_READ:
366 				if (!(vect & SUME_MSI_TXDONE)) {
367 					device_printf(dev, "ch %d unexpected "
368 					    "interrupt in send+4 state %u: "
369 					    "vect = 0x%08x\n", ch, send->state,
370 					    vect);
371 					send->recovery = 1;
372 					break;
373 				}
374 				send->state = SUME_RIFFA_CHAN_STATE_LEN;
375 
376 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
377 				    RIFFA_RX_TNFR_LEN_REG_OFF));
378 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
379 					send->state =
380 					    SUME_RIFFA_CHAN_STATE_IDLE;
381 					check_tx_queues(adapter);
382 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
383 					wakeup(&send->event);
384 				else {
385 					device_printf(dev, "ch %d unexpected "
386 					    "interrupt in send+4 state %u: "
387 					    "vect = 0x%08x\n", ch, send->state,
388 					    vect);
389 					send->recovery = 1;
390 				}
391 				vect &= ~SUME_MSI_TXDONE;
392 				break;
393 			case SUME_RIFFA_CHAN_STATE_LEN:
394 				break;
395 			default:
396 				device_printf(dev, "unknown TX state!\n");
397 			}
398 			loops++;
399 		}
400 
401 		if ((vect & (SUME_MSI_TXBUF | SUME_MSI_TXDONE)) &&
402 		    send->recovery)
403 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
404 			    "during TX; not in recovery; state = %d loops = "
405 			    "%d\n", ch, vect, send->state, loops);
406 
407 		loops = 0;
408 		while ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
409 		    SUME_MSI_RXDONE)) && loops < 5) {
410 			if (adapter->sume_debug)
411 				device_printf(dev, "RX ch %d state %u vect = "
412 				    "0x%08x\n", ch, recv->state, vect);
413 			switch (recv->state) {
414 			case SUME_RIFFA_CHAN_STATE_IDLE:
415 				if (!(vect & SUME_MSI_RXQUE)) {
416 					device_printf(dev, "ch %d unexpected "
417 					    "interrupt in recv+0 state %u: "
418 					    "vect = 0x%08x\n", ch, recv->state,
419 					    vect);
420 					recv->recovery = 1;
421 					break;
422 				}
423 				uint32_t max_ptr;
424 
425 				/* Clear recovery state. */
426 				recv->recovery = 0;
427 
428 				/* Get offset and length. */
429 				recv->offlast = read_reg(adapter,
430 				    RIFFA_CHNL_REG(ch,
431 				    RIFFA_TX_OFFLAST_REG_OFF));
432 				recv->len = read_reg(adapter, RIFFA_CHNL_REG(ch,
433 				    RIFFA_TX_LEN_REG_OFF));
434 
435 				/* Boundary checks. */
436 				max_ptr = (uint32_t)((uintptr_t)recv->buf_addr
437 				    + SUME_RIFFA_OFFSET(recv->offlast)
438 				    + SUME_RIFFA_LEN(recv->len) - 1);
439 				if (max_ptr <
440 				    (uint32_t)((uintptr_t)recv->buf_addr))
441 					device_printf(dev, "receive buffer "
442 					    "wrap-around overflow.\n");
443 				if (SUME_RIFFA_OFFSET(recv->offlast) +
444 				    SUME_RIFFA_LEN(recv->len) >
445 				    adapter->sg_buf_size)
446 					device_printf(dev, "receive buffer too"
447 					    " small.\n");
448 
449 				/* Fill the bouncebuf "descriptor". */
450 				sume_fill_bb_desc(adapter, recv,
451 				    SUME_RIFFA_LEN(recv->len));
452 
453 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
454 				    BUS_DMASYNC_PREREAD |
455 				    BUS_DMASYNC_PREWRITE);
456 				write_reg(adapter, RIFFA_CHNL_REG(ch,
457 				    RIFFA_TX_SG_ADDR_LO_REG_OFF),
458 				    SUME_RIFFA_LO_ADDR(recv->buf_hw_addr));
459 				write_reg(adapter, RIFFA_CHNL_REG(ch,
460 				    RIFFA_TX_SG_ADDR_HI_REG_OFF),
461 				    SUME_RIFFA_HI_ADDR(recv->buf_hw_addr));
462 				write_reg(adapter, RIFFA_CHNL_REG(ch,
463 				    RIFFA_TX_SG_LEN_REG_OFF),
464 				    4 * recv->num_sg);
465 				bus_dmamap_sync(recv->ch_tag, recv->ch_map,
466 				    BUS_DMASYNC_POSTREAD |
467 				    BUS_DMASYNC_POSTWRITE);
468 
469 				recv->state = SUME_RIFFA_CHAN_STATE_READY;
470 				vect &= ~SUME_MSI_RXQUE;
471 				break;
472 			case SUME_RIFFA_CHAN_STATE_READY:
473 				if (!(vect & SUME_MSI_RXBUF)) {
474 					device_printf(dev, "ch %d unexpected "
475 					    "interrupt in recv+1 state %u: "
476 					    "vect = 0x%08x\n", ch, recv->state,
477 					    vect);
478 					recv->recovery = 1;
479 					break;
480 				}
481 				recv->state = SUME_RIFFA_CHAN_STATE_READ;
482 				vect &= ~SUME_MSI_RXBUF;
483 				break;
484 			case SUME_RIFFA_CHAN_STATE_READ:
485 				if (!(vect & SUME_MSI_RXDONE)) {
486 					device_printf(dev, "ch %d unexpected "
487 					    "interrupt in recv+2 state %u: "
488 					    "vect = 0x%08x\n", ch, recv->state,
489 					    vect);
490 					recv->recovery = 1;
491 					break;
492 				}
493 				len = read_reg(adapter, RIFFA_CHNL_REG(ch,
494 				    RIFFA_TX_TNFR_LEN_REG_OFF));
495 
496 				/* Remember, len and recv->len are words. */
497 				if (ch == SUME_RIFFA_CHANNEL_DATA) {
498 					m = sume_rx_build_mbuf(adapter,
499 					    len << 2);
500 					recv->state =
501 					    SUME_RIFFA_CHAN_STATE_IDLE;
502 				} else if (ch == SUME_RIFFA_CHANNEL_REG)
503 					wakeup(&recv->event);
504 				else {
505 					device_printf(dev, "ch %d unexpected "
506 					    "interrupt in recv+2 state %u: "
507 					    "vect = 0x%08x\n", ch, recv->state,
508 					    vect);
509 					recv->recovery = 1;
510 				}
511 				vect &= ~SUME_MSI_RXDONE;
512 				break;
513 			case SUME_RIFFA_CHAN_STATE_LEN:
514 				break;
515 			default:
516 				device_printf(dev, "unknown RX state!\n");
517 			}
518 			loops++;
519 		}
520 
521 		if ((vect & (SUME_MSI_RXQUE | SUME_MSI_RXBUF |
522 		    SUME_MSI_RXDONE)) && recv->recovery) {
523 			device_printf(dev, "ch %d ignoring vect = 0x%08x "
524 			    "during RX; not in recovery; state = %d, loops = "
525 			    "%d\n", ch, vect, recv->state, loops);
526 
527 			/* Clean the unfinished transaction. */
528 			if (ch == SUME_RIFFA_CHANNEL_REG &&
529 			    vect & SUME_MSI_RXDONE) {
530 				read_reg(adapter, RIFFA_CHNL_REG(ch,
531 				    RIFFA_TX_TNFR_LEN_REG_OFF));
532 				recv->recovery = 0;
533 			}
534 		}
535 	}
536 	SUME_UNLOCK(adapter);
537 
538 	if (m != NULL) {
539 		ifp = m->m_pkthdr.rcvif;
540 		if_input(ifp, m);
541 	}
542 }
543 
544 /*
545  * As we cannot disable interrupt generation, ignore early interrupts by waiting
546  * for the adapter to go into the 'running' state.
547  */
548 static int
549 sume_intr_filter(void *arg)
550 {
551 	struct sume_adapter *adapter = arg;
552 
553 	if (adapter->running == 0)
554 		return (FILTER_STRAY);
555 
556 	return (FILTER_SCHEDULE_THREAD);
557 }
558 
559 static int
560 sume_probe_riffa_pci(struct sume_adapter *adapter)
561 {
562 	device_t dev = adapter->dev;
563 	int error, count, capmem;
564 	uint32_t reg, devctl, linkctl;
565 
566 	pci_enable_busmaster(dev);
567 
568 	adapter->rid = PCIR_BAR(0);
569 	adapter->bar0_addr = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
570 	    &adapter->rid, RF_ACTIVE);
571 	if (adapter->bar0_addr == NULL) {
572 		device_printf(dev, "unable to allocate bus resource: "
573 		    "BAR0 address\n");
574 		return (ENXIO);
575 	}
576 	adapter->bt = rman_get_bustag(adapter->bar0_addr);
577 	adapter->bh = rman_get_bushandle(adapter->bar0_addr);
578 	adapter->bar0_len = rman_get_size(adapter->bar0_addr);
579 	if (adapter->bar0_len != 1024) {
580 		device_printf(dev, "BAR0 resource length %lu != 1024\n",
581 		    adapter->bar0_len);
582 		return (ENXIO);
583 	}
584 
585 	count = pci_msi_count(dev);
586 	error = pci_alloc_msi(dev, &count);
587 	if (error) {
588 		device_printf(dev, "unable to allocate bus resource: PCI "
589 		    "MSI\n");
590 		return (error);
591 	}
592 
593 	adapter->irq.rid = 1; /* Should be 1, thus says pci_alloc_msi() */
594 	adapter->irq.res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
595 	    &adapter->irq.rid, RF_SHAREABLE | RF_ACTIVE);
596 	if (adapter->irq.res == NULL) {
597 		device_printf(dev, "unable to allocate bus resource: IRQ "
598 		    "memory\n");
599 		return (ENXIO);
600 	}
601 
602 	error = bus_setup_intr(dev, adapter->irq.res, INTR_MPSAFE |
603 	    INTR_TYPE_NET, sume_intr_filter, sume_intr_handler, adapter,
604 	    &adapter->irq.tag);
605 	if (error) {
606 		device_printf(dev, "failed to setup interrupt for rid %d, name"
607 		    " %s: %d\n", adapter->irq.rid, "SUME_INTR", error);
608 		return (ENXIO);
609 	}
610 
611 	if (pci_find_cap(dev, PCIY_EXPRESS, &capmem) != 0) {
612 		device_printf(dev, "PCI not PCIe capable\n");
613 		return (ENXIO);
614 	}
615 
616 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL, 2);
617 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL, (devctl |
618 	    PCIEM_CTL_EXT_TAG_FIELD), 2);
619 
620 	devctl = pci_read_config(dev, capmem + PCIER_DEVICE_CTL2, 2);
621 	pci_write_config(dev, capmem + PCIER_DEVICE_CTL2, (devctl |
622 	    PCIEM_CTL2_ID_ORDERED_REQ_EN), 2);
623 
624 	linkctl = pci_read_config(dev, capmem + PCIER_LINK_CTL, 2);
625 	pci_write_config(dev, capmem + PCIER_LINK_CTL, (linkctl |
626 	    PCIEM_LINK_CTL_RCB), 2);
627 
628 	reg = read_reg(adapter, RIFFA_INFO_REG_OFF);
629 	adapter->num_sg = RIFFA_SG_ELEMS * ((reg >> 19) & 0xf);
630 	adapter->sg_buf_size = RIFFA_SG_BUF_SIZE * ((reg >> 19) & 0xf);
631 
632 	error = ENODEV;
633 	/* Check bus master is enabled. */
634 	if (((reg >> 4) & 0x1) != 1) {
635 		device_printf(dev, "bus master not enabled: %d\n",
636 		    (reg >> 4) & 0x1);
637 		return (error);
638 	}
639 	/* Check link parameters are valid. */
640 	if (((reg >> 5) & 0x3f) == 0 || ((reg >> 11) & 0x3) == 0) {
641 		device_printf(dev, "link parameters not valid: %d %d\n",
642 		    (reg >> 5) & 0x3f, (reg >> 11) & 0x3);
643 		return (error);
644 	}
645 	/* Check # of channels are within valid range. */
646 	if ((reg & 0xf) == 0 || (reg & 0xf) > RIFFA_MAX_CHNLS) {
647 		device_printf(dev, "number of channels out of range: %d\n",
648 		    reg & 0xf);
649 		return (error);
650 	}
651 	/* Check bus width. */
652 	if (((reg >> 19) & 0xf) == 0 ||
653 	    ((reg >> 19) & 0xf) > RIFFA_MAX_BUS_WIDTH_PARAM) {
654 		device_printf(dev, "bus width out of range: %d\n",
655 		    (reg >> 19) & 0xf);
656 		return (error);
657 	}
658 
659 	device_printf(dev, "[riffa] # of channels: %d\n",
660 	    reg & 0xf);
661 	device_printf(dev, "[riffa] bus interface width: %d\n",
662 	    ((reg >> 19) & 0xf) << 5);
663 	device_printf(dev, "[riffa] bus master enabled: %d\n",
664 	    (reg >> 4) & 0x1);
665 	device_printf(dev, "[riffa] negotiated link width: %d\n",
666 	    (reg >> 5) & 0x3f);
667 	device_printf(dev, "[riffa] negotiated rate width: %d MTs\n",
668 	    ((reg >> 11) & 0x3) * 2500);
669 	device_printf(dev, "[riffa] max downstream payload: %d B\n",
670 	    128 << ((reg >> 13) & 0x7));
671 	device_printf(dev, "[riffa] max upstream payload: %d B\n",
672 	    128 << ((reg >> 16) & 0x7));
673 
674 	return (0);
675 }
676 
/*
 * Intentionally empty interface init routine. It only exists because
 * ether_ioctl() panics when no init routine is present.
 */
static void
sume_if_init(void *sc)
{
	(void)sc;
}
682 
683 /* Write the address and length for our incoming / outgoing transaction. */
684 static void
685 sume_fill_bb_desc(struct sume_adapter *adapter, struct riffa_chnl_dir *p,
686     uint64_t len)
687 {
688 	struct nf_bb_desc *bouncebuf = (struct nf_bb_desc *) p->buf_addr;
689 
690 	bouncebuf->lower = (p->buf_hw_addr + sizeof(struct nf_bb_desc));
691 	bouncebuf->upper = (p->buf_hw_addr + sizeof(struct nf_bb_desc)) >> 32;
692 	bouncebuf->len = len >> 2;
693 }
694 
695 /* Module register locked write. */
696 static int
697 sume_modreg_write_locked(struct sume_adapter *adapter)
698 {
699 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
700 
701 	/* Let the FPGA know about the transfer. */
702 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
703 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
704 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
705 	    RIFFA_RX_LEN_REG_OFF), send->len);	/* words */
706 
707 	/* Fill the bouncebuf "descriptor". */
708 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
709 
710 	/* Update the state before intiating the DMA to avoid races. */
711 	send->state = SUME_RIFFA_CHAN_STATE_READY;
712 
713 	bus_dmamap_sync(send->ch_tag, send->ch_map,
714 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
715 	/* DMA. */
716 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
717 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
718 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
719 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
720 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
721 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
722 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_REG,
723 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
724 	bus_dmamap_sync(send->ch_tag, send->ch_map,
725 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
726 
727 	return (0);
728 }
729 
730 /*
731  * Request a register read or write (depending on optype).
732  * If optype is set (0x1f) this will result in a register write,
733  * otherwise this will result in a register read request at the given
734  * address and the result will need to be DMAed back.
735  */
736 static int
737 sume_module_reg_write(struct nf_priv *nf_priv, struct sume_ifreq *sifr,
738     uint32_t optype)
739 {
740 	struct sume_adapter *adapter = nf_priv->adapter;
741 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
742 	struct nf_regop_data *data;
743 	int error;
744 
745 	/*
746 	 * 1. Make sure the channel is free;  otherwise return EBUSY.
747 	 * 2. Prepare the memory in the bounce buffer (which we always
748 	 *    use for regs).
749 	 * 3. Start the DMA process.
750 	 * 4. Sleep and wait for result and return success or error.
751 	 */
752 	SUME_LOCK(adapter);
753 
754 	if (send->state != SUME_RIFFA_CHAN_STATE_IDLE) {
755 		SUME_UNLOCK(adapter);
756 		return (EBUSY);
757 	}
758 
759 	data = (struct nf_regop_data *) (send->buf_addr +
760 	    sizeof(struct nf_bb_desc));
761 	data->addr = htole32(sifr->addr);
762 	data->val = htole32(sifr->val);
763 	/* Tag to indentify request. */
764 	data->rtag = htole32(++send->rtag);
765 	data->optype = htole32(optype);
766 	send->len = sizeof(struct nf_regop_data) / 4; /* words */
767 
768 	error = sume_modreg_write_locked(adapter);
769 	if (error) {
770 		SUME_UNLOCK(adapter);
771 		return (EFAULT);
772 	}
773 
774 	/* Timeout after 1s. */
775 	if (send->state != SUME_RIFFA_CHAN_STATE_LEN)
776 		error = msleep(&send->event, &adapter->lock, 0,
777 		    "Waiting recv finish", 1 * hz);
778 
779 	/* This was a write so we are done; were interrupted, or timed out. */
780 	if (optype != SUME_MR_READ || error != 0 || error == EWOULDBLOCK) {
781 		send->state = SUME_RIFFA_CHAN_STATE_IDLE;
782 		if (optype == SUME_MR_READ)
783 			error = EWOULDBLOCK;
784 		else
785 			error = 0;
786 	} else
787 		error = 0;
788 
789 	/*
790 	 * For read requests we will update state once we are done
791 	 * having read the result to avoid any two outstanding
792 	 * transactions, or we need a queue and validate tags,
793 	 * which is a lot of work for a low priority, infrequent
794 	 * event.
795 	 */
796 
797 	SUME_UNLOCK(adapter);
798 
799 	return (error);
800 }
801 
802 /* Module register read. */
803 static int
804 sume_module_reg_read(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
805 {
806 	struct sume_adapter *adapter = nf_priv->adapter;
807 	struct riffa_chnl_dir *recv = adapter->recv[SUME_RIFFA_CHANNEL_REG];
808 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_REG];
809 	struct nf_regop_data *data;
810 	int error = 0;
811 
812 	/*
813 	 * 0. Sleep waiting for result if needed (unless condition is
814 	 *    true already).
815 	 * 1. Read DMA results.
816 	 * 2. Update state on *TX* to IDLE to allow next read to start.
817 	 */
818 	SUME_LOCK(adapter);
819 
820 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
821 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
822 	/*
823 	 * We only need to be woken up at the end of the transaction.
824 	 * Timeout after 1s.
825 	 */
826 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ)
827 		error = msleep(&recv->event, &adapter->lock, 0,
828 		    "Waiting transaction finish", 1 * hz);
829 
830 	if (recv->state != SUME_RIFFA_CHAN_STATE_READ || error == EWOULDBLOCK) {
831 		SUME_UNLOCK(adapter);
832 		device_printf(adapter->dev, "wait error: %d\n", error);
833 		return (EWOULDBLOCK);
834 	}
835 
836 	bus_dmamap_sync(recv->ch_tag, recv->ch_map,
837 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
838 
839 	/*
840 	 * Read reply data and validate address and tag.
841 	 * Note: we do access the send side without lock but the state
842 	 * machine does prevent the data from changing.
843 	 */
844 	data = (struct nf_regop_data *) (recv->buf_addr +
845 	    sizeof(struct nf_bb_desc));
846 
847 	if (le32toh(data->rtag) != send->rtag)
848 		device_printf(adapter->dev, "rtag error: 0x%08x 0x%08x\n",
849 		    le32toh(data->rtag), send->rtag);
850 
851 	sifr->val = le32toh(data->val);
852 	recv->state = SUME_RIFFA_CHAN_STATE_IDLE;
853 
854 	/* We are done. */
855 	send->state = SUME_RIFFA_CHAN_STATE_IDLE;
856 
857 	SUME_UNLOCK(adapter);
858 
859 	return (0);
860 }
861 
862 /* Read value from a module register and return it to a sume_ifreq. */
863 static int
864 get_modreg_value(struct nf_priv *nf_priv, struct sume_ifreq *sifr)
865 {
866 	int error;
867 
868 	error = sume_module_reg_write(nf_priv, sifr, SUME_MR_READ);
869 	if (!error)
870 		error = sume_module_reg_read(nf_priv, sifr);
871 
872 	return (error);
873 }
874 
875 static int
876 sume_if_ioctl(if_t ifp, unsigned long cmd, caddr_t data)
877 {
878 	struct ifreq *ifr = (struct ifreq *) data;
879 	struct nf_priv *nf_priv = if_getsoftc(ifp);
880 	struct sume_ifreq sifr;
881 	int error = 0;
882 
883 	switch (cmd) {
884 	case SIOCGIFMEDIA:
885 	case SIOCGIFXMEDIA:
886 		error = ifmedia_ioctl(ifp, ifr, &nf_priv->media, cmd);
887 		break;
888 
889 	case SUME_IOCTL_CMD_WRITE_REG:
890 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
891 		if (error) {
892 			error = EINVAL;
893 			break;
894 		}
895 		error = sume_module_reg_write(nf_priv, &sifr, SUME_MR_WRITE);
896 		break;
897 
898 	case SUME_IOCTL_CMD_READ_REG:
899 		error = copyin(ifr_data_get_ptr(ifr), &sifr, sizeof(sifr));
900 		if (error) {
901 			error = EINVAL;
902 			break;
903 		}
904 
905 		error = get_modreg_value(nf_priv, &sifr);
906 		if (error)
907 			break;
908 
909 		error = copyout(&sifr, ifr_data_get_ptr(ifr), sizeof(sifr));
910 		if (error)
911 			error = EINVAL;
912 
913 		break;
914 
915 	case SIOCSIFFLAGS:
916 		/* Silence tcpdump 'promisc mode not supported' warning. */
917 		if (if_getflags(ifp) & IFF_PROMISC)
918 			break;
919 
920 	default:
921 		error = ether_ioctl(ifp, cmd, data);
922 		break;
923 	}
924 
925 	return (error);
926 }
927 
928 static int
929 sume_media_change(if_t ifp)
930 {
931 	struct nf_priv *nf_priv = if_getsoftc(ifp);
932 	struct ifmedia *ifm = &nf_priv->media;
933 
934 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
935 		return (EINVAL);
936 
937 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_10G_SR)
938 		if_setbaudrate(ifp, ifmedia_baudrate(IFM_ETHER | IFM_10G_SR));
939 	else
940 		if_setbaudrate(ifp, ifmedia_baudrate(ifm->ifm_media));
941 
942 	return (0);
943 }
944 
945 static void
946 sume_update_link_status(if_t ifp)
947 {
948 	struct nf_priv *nf_priv = if_getsoftc(ifp);
949 	struct sume_adapter *adapter = nf_priv->adapter;
950 	struct sume_ifreq sifr;
951 	int link_status;
952 
953 	sifr.addr = SUME_STATUS_ADDR(nf_priv->port);
954 	sifr.val = 0;
955 
956 	if (get_modreg_value(nf_priv, &sifr))
957 		return;
958 
959 	link_status = SUME_LINK_STATUS(sifr.val);
960 
961 	if (!link_status && nf_priv->link_up) {
962 		if_link_state_change(ifp, LINK_STATE_DOWN);
963 		nf_priv->link_up = 0;
964 		if (adapter->sume_debug)
965 			device_printf(adapter->dev, "port %d link state "
966 			    "changed to DOWN\n", nf_priv->unit);
967 	} else if (link_status && !nf_priv->link_up) {
968 		nf_priv->link_up = 1;
969 		if_link_state_change(ifp, LINK_STATE_UP);
970 		if (adapter->sume_debug)
971 			device_printf(adapter->dev, "port %d link state "
972 			    "changed to UP\n", nf_priv->unit);
973 	}
974 }
975 
976 static void
977 sume_media_status(if_t ifp, struct ifmediareq *ifmr)
978 {
979 	struct nf_priv *nf_priv = if_getsoftc(ifp);
980 	struct ifmedia *ifm = &nf_priv->media;
981 
982 	if (ifm->ifm_cur->ifm_media == (IFM_ETHER | IFM_10G_SR) &&
983 	    (if_getflags(ifp) & IFF_UP))
984 		ifmr->ifm_active = IFM_ETHER | IFM_10G_SR;
985 	else
986 		ifmr->ifm_active = ifm->ifm_cur->ifm_media;
987 
988 	ifmr->ifm_status |= IFM_AVALID;
989 
990 	sume_update_link_status(ifp);
991 
992 	if (nf_priv->link_up)
993 		ifmr->ifm_status |= IFM_ACTIVE;
994 }
995 
996 /*
997  * Packet to transmit. We take the packet data from the mbuf and copy it to the
998  * bouncebuffer address buf_addr+3*sizeof(uint32_t)+16. The 16 bytes before the
999  * packet data are for metadata: sport/dport (depending on our source
1000  * interface), packet length and magic 0xcafe. We tell the SUME about the
1001  * transfer, fill the first 3*sizeof(uint32_t) bytes of the bouncebuffer with
1002  * the information about the start and length of the packet and trigger the
1003  * transaction.
1004  */
1005 static int
1006 sume_if_start_locked(if_t ifp)
1007 {
1008 	struct mbuf *m;
1009 	struct nf_priv *nf_priv = if_getsoftc(ifp);
1010 	struct sume_adapter *adapter = nf_priv->adapter;
1011 	struct riffa_chnl_dir *send = adapter->send[SUME_RIFFA_CHANNEL_DATA];
1012 	uint8_t *outbuf;
1013 	struct nf_metadata *mdata;
1014 	int plen = SUME_MIN_PKT_SIZE;
1015 
1016 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1017 	KASSERT(send->state == SUME_RIFFA_CHAN_STATE_IDLE,
1018 	    ("SUME not in IDLE state"));
1019 
1020 	m = if_dequeue(ifp);
1021 	if (m == NULL)
1022 		return (EINVAL);
1023 
1024 	/* Packets large enough do not need to be padded */
1025 	if (m->m_pkthdr.len > SUME_MIN_PKT_SIZE)
1026 		plen = m->m_pkthdr.len;
1027 
1028 	if (adapter->sume_debug)
1029 		device_printf(adapter->dev, "sending %d bytes to %s%d\n", plen,
1030 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1031 
1032 	outbuf = (uint8_t *) send->buf_addr + sizeof(struct nf_bb_desc);
1033 	mdata = (struct nf_metadata *) outbuf;
1034 
1035 	/* Clear the recovery flag. */
1036 	send->recovery = 0;
1037 
1038 	/* Make sure we fit with the 16 bytes nf_metadata. */
1039 	if (m->m_pkthdr.len + sizeof(struct nf_metadata) >
1040 	    adapter->sg_buf_size) {
1041 		device_printf(adapter->dev, "packet too big for bounce buffer "
1042 		    "(%d)\n", m->m_pkthdr.len);
1043 		m_freem(m);
1044 		nf_priv->stats.tx_dropped++;
1045 		return (ENOMEM);
1046 	}
1047 
1048 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1049 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1050 
1051 	/* Zero out the padded data */
1052 	if (m->m_pkthdr.len < SUME_MIN_PKT_SIZE)
1053 		bzero(outbuf + sizeof(struct nf_metadata), SUME_MIN_PKT_SIZE);
1054 	/* Skip the first 16 bytes for the metadata. */
1055 	m_copydata(m, 0, m->m_pkthdr.len, outbuf + sizeof(struct nf_metadata));
1056 	send->len = (sizeof(struct nf_metadata) + plen + 3) / 4;
1057 
1058 	/* Fill in the metadata: CPU(DMA) ports are odd, MAC ports are even. */
1059 	mdata->sport = htole16(1 << (nf_priv->port * 2 + 1));
1060 	mdata->dport = htole16(1 << (nf_priv->port * 2));
1061 	mdata->plen = htole16(plen);
1062 	mdata->magic = htole16(SUME_RIFFA_MAGIC);
1063 	mdata->t1 = htole32(0);
1064 	mdata->t2 = htole32(0);
1065 
1066 	/* Let the FPGA know about the transfer. */
1067 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1068 	    RIFFA_RX_OFFLAST_REG_OFF), SUME_OFFLAST);
1069 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1070 	    RIFFA_RX_LEN_REG_OFF), send->len);
1071 
1072 	/* Fill the bouncebuf "descriptor". */
1073 	sume_fill_bb_desc(adapter, send, SUME_RIFFA_LEN(send->len));
1074 
1075 	/* Update the state before intiating the DMA to avoid races. */
1076 	send->state = SUME_RIFFA_CHAN_STATE_READY;
1077 
1078 	/* DMA. */
1079 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1080 	    RIFFA_RX_SG_ADDR_LO_REG_OFF),
1081 	    SUME_RIFFA_LO_ADDR(send->buf_hw_addr));
1082 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1083 	    RIFFA_RX_SG_ADDR_HI_REG_OFF),
1084 	    SUME_RIFFA_HI_ADDR(send->buf_hw_addr));
1085 	write_reg(adapter, RIFFA_CHNL_REG(SUME_RIFFA_CHANNEL_DATA,
1086 	    RIFFA_RX_SG_LEN_REG_OFF), 4 * send->num_sg);
1087 
1088 	bus_dmamap_sync(send->ch_tag, send->ch_map,
1089 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1090 
1091 	nf_priv->stats.tx_packets++;
1092 	nf_priv->stats.tx_bytes += plen;
1093 
1094 	/* We can free as long as we use the bounce buffer. */
1095 	m_freem(m);
1096 
1097 	adapter->last_ifc = nf_priv->port;
1098 
1099 	/* Reset watchdog counter. */
1100 	adapter->wd_counter = 0;
1101 
1102 	return (0);
1103 }
1104 
1105 static void
1106 sume_if_start(if_t ifp)
1107 {
1108 	struct nf_priv *nf_priv = if_getsoftc(ifp);
1109 	struct sume_adapter *adapter = nf_priv->adapter;
1110 
1111 	if (!adapter->running || !(if_getflags(ifp) & IFF_UP))
1112 		return;
1113 
1114 	SUME_LOCK(adapter);
1115 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state ==
1116 	    SUME_RIFFA_CHAN_STATE_IDLE)
1117 		sume_if_start_locked(ifp);
1118 	SUME_UNLOCK(adapter);
1119 }
1120 
1121 /*
1122  * We call this function at the end of every TX transaction to check for
1123  * remaining packets in the TX queues for every UP interface.
1124  */
1125 static void
1126 check_tx_queues(struct sume_adapter *adapter)
1127 {
1128 	int i, last_ifc;
1129 
1130 	KASSERT(mtx_owned(&adapter->lock), ("SUME lock not owned"));
1131 
1132 	last_ifc = adapter->last_ifc;
1133 
1134 	/* Check all interfaces */
1135 	for (i = last_ifc + 1; i < last_ifc + SUME_NPORTS + 1; i++) {
1136 		if_t ifp = adapter->ifp[i % SUME_NPORTS];
1137 
1138 		if (!(if_getflags(ifp) & IFF_UP))
1139 			continue;
1140 
1141 		if (!sume_if_start_locked(ifp))
1142 			break;
1143 	}
1144 }
1145 
1146 static void
1147 sume_ifp_alloc(struct sume_adapter *adapter, uint32_t port)
1148 {
1149 	if_t ifp;
1150 	struct nf_priv *nf_priv = malloc(sizeof(struct nf_priv), M_SUME,
1151 	    M_ZERO | M_WAITOK);
1152 
1153 	ifp = if_alloc(IFT_ETHER);
1154 	adapter->ifp[port] = ifp;
1155 	if_setsoftc(ifp, nf_priv);
1156 
1157 	nf_priv->adapter = adapter;
1158 	nf_priv->unit = alloc_unr(unr);
1159 	nf_priv->port = port;
1160 	nf_priv->link_up = 0;
1161 
1162 	if_initname(ifp, SUME_ETH_DEVICE_NAME, nf_priv->unit);
1163 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
1164 
1165 	if_setinitfn(ifp, sume_if_init);
1166 	if_setstartfn(ifp, sume_if_start);
1167 	if_setioctlfn(ifp, sume_if_ioctl);
1168 
1169 	uint8_t hw_addr[ETHER_ADDR_LEN] = DEFAULT_ETHER_ADDRESS;
1170 	hw_addr[ETHER_ADDR_LEN-1] = nf_priv->unit;
1171 	ether_ifattach(ifp, hw_addr);
1172 
1173 	ifmedia_init(&nf_priv->media, IFM_IMASK, sume_media_change,
1174 	    sume_media_status);
1175 	ifmedia_add(&nf_priv->media, IFM_ETHER | IFM_10G_SR, 0, NULL);
1176 	ifmedia_set(&nf_priv->media, IFM_ETHER | IFM_10G_SR);
1177 
1178 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
1179 }
1180 
1181 static void
1182 callback_dma(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1183 {
1184 	if (err)
1185 		return;
1186 
1187 	KASSERT(nseg == 1, ("%d segments returned!", nseg));
1188 
1189 	*(bus_addr_t *) arg = segs[0].ds_addr;
1190 }
1191 
1192 static int
1193 sume_probe_riffa_buffer(const struct sume_adapter *adapter,
1194     struct riffa_chnl_dir ***p, const char *dir)
1195 {
1196 	struct riffa_chnl_dir **rp;
1197 	bus_addr_t hw_addr;
1198 	int ch;
1199 	device_t dev = adapter->dev;
1200 
1201 	*p = malloc(SUME_RIFFA_CHANNELS * sizeof(struct riffa_chnl_dir *),
1202 	    M_SUME, M_ZERO | M_WAITOK);
1203 
1204 	rp = *p;
1205 	/* Allocate the chnl_dir structs themselves. */
1206 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1207 		/* One direction. */
1208 		rp[ch] = malloc(sizeof(struct riffa_chnl_dir), M_SUME,
1209 		    M_ZERO | M_WAITOK);
1210 
1211 		int err = bus_dma_tag_create(bus_get_dma_tag(dev),
1212 		    4, 0,
1213 		    BUS_SPACE_MAXADDR,
1214 		    BUS_SPACE_MAXADDR,
1215 		    NULL, NULL,
1216 		    adapter->sg_buf_size,
1217 		    1,
1218 		    adapter->sg_buf_size,
1219 		    0,
1220 		    NULL,
1221 		    NULL,
1222 		    &rp[ch]->ch_tag);
1223 
1224 		if (err) {
1225 			device_printf(dev, "bus_dma_tag_create(%s[%d]) "
1226 			    "failed.\n", dir, ch);
1227 			return (err);
1228 		}
1229 
1230 		err = bus_dmamem_alloc(rp[ch]->ch_tag, (void **)
1231 		    &rp[ch]->buf_addr, BUS_DMA_WAITOK | BUS_DMA_COHERENT |
1232 		    BUS_DMA_ZERO, &rp[ch]->ch_map);
1233 		if (err) {
1234 			device_printf(dev, "bus_dmamem_alloc(%s[%d]) failed.\n",
1235 			    dir, ch);
1236 			return (err);
1237 		}
1238 
1239 		bzero(rp[ch]->buf_addr, adapter->sg_buf_size);
1240 
1241 		err = bus_dmamap_load(rp[ch]->ch_tag, rp[ch]->ch_map,
1242 		    rp[ch]->buf_addr, adapter->sg_buf_size, callback_dma,
1243 		    &hw_addr, BUS_DMA_NOWAIT);
1244 		if (err) {
1245 			device_printf(dev, "bus_dmamap_load(%s[%d]) failed.\n",
1246 			    dir, ch);
1247 			return (err);
1248 		}
1249 		rp[ch]->buf_hw_addr = hw_addr;
1250 		rp[ch]->num_sg = 1;
1251 		rp[ch]->state = SUME_RIFFA_CHAN_STATE_IDLE;
1252 
1253 		rp[ch]->rtag = SUME_INIT_RTAG;
1254 	}
1255 
1256 	return (0);
1257 }
1258 
1259 static int
1260 sume_probe_riffa_buffers(struct sume_adapter *adapter)
1261 {
1262 	int error;
1263 
1264 	error = sume_probe_riffa_buffer(adapter, &adapter->recv, "recv");
1265 	if (error)
1266 		return (error);
1267 
1268 	error = sume_probe_riffa_buffer(adapter, &adapter->send, "send");
1269 
1270 	return (error);
1271 }
1272 
1273 static void
1274 sume_sysctl_init(struct sume_adapter *adapter)
1275 {
1276 	device_t dev = adapter->dev;
1277 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
1278 	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
1279 	struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree);
1280 	struct sysctl_oid *tmp_tree;
1281 	char namebuf[MAX_IFC_NAME_LEN];
1282 	int i;
1283 
1284 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "sume", CTLFLAG_RW,
1285 	    0, "SUME top-level tree");
1286 	if (tree == NULL) {
1287 		device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1288 		return;
1289 	}
1290 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug", CTLFLAG_RW,
1291 	    &adapter->sume_debug, 0, "debug int leaf");
1292 
1293 	/* total RX error stats */
1294 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_epkts",
1295 	    CTLFLAG_RD, &adapter->packets_err, 0, "rx errors");
1296 	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "rx_ebytes",
1297 	    CTLFLAG_RD, &adapter->bytes_err, 0, "rx error bytes");
1298 
1299 	for (i = SUME_NPORTS - 1; i >= 0; i--) {
1300 		if_t ifp = adapter->ifp[i];
1301 		if (ifp == NULL)
1302 			continue;
1303 
1304 		struct nf_priv *nf_priv = if_getsoftc(ifp);
1305 
1306 		snprintf(namebuf, MAX_IFC_NAME_LEN, "%s%d",
1307 		    SUME_ETH_DEVICE_NAME, nf_priv->unit);
1308 		tmp_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
1309 		    CTLFLAG_RW, 0, "SUME ifc tree");
1310 		if (tmp_tree == NULL) {
1311 			device_printf(dev, "SYSCTL_ADD_NODE failed.\n");
1312 			return;
1313 		}
1314 
1315 		/* Packets dropped by down interface. */
1316 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1317 		    "ifc_down_bytes", CTLFLAG_RD,
1318 		    &nf_priv->stats.ifc_down_bytes, 0, "ifc_down bytes");
1319 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1320 		    "ifc_down_packets", CTLFLAG_RD,
1321 		    &nf_priv->stats.ifc_down_packets, 0, "ifc_down packets");
1322 
1323 		/* HW RX stats */
1324 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1325 		    "hw_rx_packets", CTLFLAG_RD, &nf_priv->stats.hw_rx_packets,
1326 		    0, "hw_rx packets");
1327 
1328 		/* HW TX stats */
1329 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1330 		    "hw_tx_packets", CTLFLAG_RD, &nf_priv->stats.hw_tx_packets,
1331 		    0, "hw_tx packets");
1332 
1333 		/* RX stats */
1334 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1335 		    "rx_bytes", CTLFLAG_RD, &nf_priv->stats.rx_bytes, 0,
1336 		    "rx bytes");
1337 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1338 		    "rx_dropped", CTLFLAG_RD, &nf_priv->stats.rx_dropped, 0,
1339 		    "rx dropped");
1340 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1341 		    "rx_packets", CTLFLAG_RD, &nf_priv->stats.rx_packets, 0,
1342 		    "rx packets");
1343 
1344 		/* TX stats */
1345 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1346 		    "tx_bytes", CTLFLAG_RD, &nf_priv->stats.tx_bytes, 0,
1347 		    "tx bytes");
1348 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1349 		    "tx_dropped", CTLFLAG_RD, &nf_priv->stats.tx_dropped, 0,
1350 		    "tx dropped");
1351 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(tmp_tree), OID_AUTO,
1352 		    "tx_packets", CTLFLAG_RD, &nf_priv->stats.tx_packets, 0,
1353 		    "tx packets");
1354 	}
1355 }
1356 
1357 static void
1358 sume_local_timer(void *arg)
1359 {
1360 	struct sume_adapter *adapter = arg;
1361 
1362 	if (!adapter->running)
1363 		return;
1364 
1365 	taskqueue_enqueue(adapter->tq, &adapter->stat_task);
1366 
1367 	SUME_LOCK(adapter);
1368 	if (adapter->send[SUME_RIFFA_CHANNEL_DATA]->state !=
1369 	    SUME_RIFFA_CHAN_STATE_IDLE && ++adapter->wd_counter >= 3) {
1370 		/* Resetting interfaces if stuck for 3 seconds. */
1371 		device_printf(adapter->dev, "TX stuck, resetting adapter.\n");
1372 		read_reg(adapter, RIFFA_INFO_REG_OFF);
1373 
1374 		adapter->send[SUME_RIFFA_CHANNEL_DATA]->state =
1375 		    SUME_RIFFA_CHAN_STATE_IDLE;
1376 		adapter->wd_counter = 0;
1377 
1378 		check_tx_queues(adapter);
1379 	}
1380 	SUME_UNLOCK(adapter);
1381 
1382 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1383 }
1384 
1385 static void
1386 sume_get_stats(void *context, int pending)
1387 {
1388 	struct sume_adapter *adapter = context;
1389 	int i;
1390 
1391 	for (i = 0; i < SUME_NPORTS; i++) {
1392 		if_t ifp = adapter->ifp[i];
1393 
1394 		if (if_getflags(ifp) & IFF_UP) {
1395 			struct nf_priv *nf_priv = if_getsoftc(ifp);
1396 			struct sume_ifreq sifr;
1397 
1398 			sume_update_link_status(ifp);
1399 
1400 			/* Get RX counter. */
1401 			sifr.addr = SUME_STAT_RX_ADDR(nf_priv->port);
1402 			sifr.val = 0;
1403 
1404 			if (!get_modreg_value(nf_priv, &sifr))
1405 				nf_priv->stats.hw_rx_packets += sifr.val;
1406 
1407 			/* Get TX counter. */
1408 			sifr.addr = SUME_STAT_TX_ADDR(nf_priv->port);
1409 			sifr.val = 0;
1410 
1411 			if (!get_modreg_value(nf_priv, &sifr))
1412 				nf_priv->stats.hw_tx_packets += sifr.val;
1413 		}
1414 	}
1415 }
1416 
1417 static int
1418 sume_attach(device_t dev)
1419 {
1420 	struct sume_adapter *adapter = device_get_softc(dev);
1421 	adapter->dev = dev;
1422 	int error, i;
1423 
1424 	mtx_init(&adapter->lock, "Global lock", NULL, MTX_DEF);
1425 
1426 	adapter->running = 0;
1427 
1428 	/* OK finish up RIFFA. */
1429 	error = sume_probe_riffa_pci(adapter);
1430 	if (error != 0)
1431 		goto error;
1432 
1433 	error = sume_probe_riffa_buffers(adapter);
1434 	if (error != 0)
1435 		goto error;
1436 
1437 	/* Now do the network interfaces. */
1438 	for (i = 0; i < SUME_NPORTS; i++)
1439 		sume_ifp_alloc(adapter, i);
1440 
1441 	/*  Register stats and register sysctls. */
1442 	sume_sysctl_init(adapter);
1443 
1444 	/* Reset the HW. */
1445 	read_reg(adapter, RIFFA_INFO_REG_OFF);
1446 
1447 	/* Ready to go, "enable" IRQ. */
1448 	adapter->running = 1;
1449 
1450 	callout_init(&adapter->timer, 1);
1451 	TASK_INIT(&adapter->stat_task, 0, sume_get_stats, adapter);
1452 
1453 	adapter->tq = taskqueue_create("sume_stats", M_NOWAIT,
1454 	    taskqueue_thread_enqueue, &adapter->tq);
1455 	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s stattaskq",
1456 	    device_get_nameunit(adapter->dev));
1457 
1458 	callout_reset(&adapter->timer, 1 * hz, sume_local_timer, adapter);
1459 
1460 	return (0);
1461 
1462 error:
1463 	sume_detach(dev);
1464 
1465 	return (error);
1466 }
1467 
1468 static void
1469 sume_remove_riffa_buffer(const struct sume_adapter *adapter,
1470     struct riffa_chnl_dir **pp)
1471 {
1472 	int ch;
1473 
1474 	for (ch = 0; ch < SUME_RIFFA_CHANNELS; ch++) {
1475 		if (pp[ch] == NULL)
1476 			continue;
1477 
1478 		if (pp[ch]->buf_hw_addr != 0) {
1479 			bus_dmamem_free(pp[ch]->ch_tag, pp[ch]->buf_addr,
1480 			    pp[ch]->ch_map);
1481 			pp[ch]->buf_hw_addr = 0;
1482 		}
1483 
1484 		free(pp[ch], M_SUME);
1485 	}
1486 }
1487 
1488 static void
1489 sume_remove_riffa_buffers(struct sume_adapter *adapter)
1490 {
1491 	if (adapter->send != NULL) {
1492 		sume_remove_riffa_buffer(adapter, adapter->send);
1493 		free(adapter->send, M_SUME);
1494 		adapter->send = NULL;
1495 	}
1496 	if (adapter->recv != NULL) {
1497 		sume_remove_riffa_buffer(adapter, adapter->recv);
1498 		free(adapter->recv, M_SUME);
1499 		adapter->recv = NULL;
1500 	}
1501 }
1502 
1503 static int
1504 sume_detach(device_t dev)
1505 {
1506 	struct sume_adapter *adapter = device_get_softc(dev);
1507 	int i;
1508 	struct nf_priv *nf_priv;
1509 
1510 	KASSERT(mtx_initialized(&adapter->lock), ("SUME mutex not "
1511 	    "initialized"));
1512 	adapter->running = 0;
1513 
1514 	/* Drain the stats callout and task queue. */
1515 	callout_drain(&adapter->timer);
1516 
1517 	if (adapter->tq) {
1518 		taskqueue_drain(adapter->tq, &adapter->stat_task);
1519 		taskqueue_free(adapter->tq);
1520 	}
1521 
1522 	for (i = 0; i < SUME_NPORTS; i++) {
1523 		if_t ifp = adapter->ifp[i];
1524 		if (ifp == NULL)
1525 			continue;
1526 
1527 		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
1528 		nf_priv = if_getsoftc(ifp);
1529 
1530 		if (if_getflags(ifp) & IFF_UP)
1531 			if_down(ifp);
1532 		ifmedia_removeall(&nf_priv->media);
1533 		free_unr(unr, nf_priv->unit);
1534 
1535 		if_setflagbits(ifp, 0, IFF_UP);
1536 		ether_ifdetach(ifp);
1537 		if_free(ifp);
1538 
1539 		free(nf_priv, M_SUME);
1540 	}
1541 
1542 	sume_remove_riffa_buffers(adapter);
1543 
1544 	if (adapter->irq.tag)
1545 		bus_teardown_intr(dev, adapter->irq.res, adapter->irq.tag);
1546 	if (adapter->irq.res)
1547 		bus_release_resource(dev, SYS_RES_IRQ, adapter->irq.rid,
1548 		    adapter->irq.res);
1549 
1550 	pci_release_msi(dev);
1551 
1552 	if (adapter->bar0_addr)
1553 		bus_release_resource(dev, SYS_RES_MEMORY, adapter->rid,
1554 		    adapter->bar0_addr);
1555 
1556 	mtx_destroy(&adapter->lock);
1557 
1558 	return (0);
1559 }
1560 
1561 static int
1562 mod_event(module_t mod, int cmd, void *arg)
1563 {
1564 	switch (cmd) {
1565 	case MOD_LOAD:
1566 		unr = new_unrhdr(0, INT_MAX, NULL);
1567 		break;
1568 
1569 	case MOD_UNLOAD:
1570 		delete_unrhdr(unr);
1571 		break;
1572 	}
1573 
1574 	return (0);
1575 }
1576 
1577 DRIVER_MODULE(sume, pci, sume_driver, mod_event, NULL);
1578 MODULE_VERSION(sume, 1);
1579