xref: /freebsd/contrib/libpcap/pcap-dpdk.c (revision b59017c5cad90d0f09a59e68c00457b7faf93e7c)
1 /*
2  * Copyright (C) 2018 jingle YANG. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28 Date: Dec 16, 2018
29 
30 Description:
31 1. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
32 2. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/)
33 3. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11.
34 
35 Limitations:
36 1. DPDK support will be on if DPDK is available. Please set DIR for --with-dpdk[=DIR] with ./configure or -DDPDK_DIR[=DIR] with cmake if DPDK is installed manually.
2. Only dynamic linking against libdpdk.so is supported, because linking with libdpdk.a does not work correctly.
3. Only reading packets is supported; packet injection has not been implemented yet.
39 
40 Usage:
41 1. Compile DPDK as shared library and install.(https://github.com/DPDK/dpdk.git)
42 
43 You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set:
44 CONFIG_RTE_BUILD_SHARED_LIB=y
45 By the following command:
46 sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config
47 
48 2. Launch l2fwd that is one of DPDK examples correctly, and get device information.
49 
50 You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio.
51 And enable hugepages by dpdk-setup.sh
52 
53 Then launch the l2fwd with dynamic driver support. For example:
54 $RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1
55 
56 3. Compile libpcap with dpdk options.
57 
If DPDK has not been found automatically, you shall export the DPDK environment variables that are used for compiling DPDK, and then pass $RTE_SDK/$RTE_TARGET to --with-dpdk or -DDPDK_DIR
59 
60 export RTE_SDK={your DPDK base directory}
61 export RTE_TARGET={your target name}
62 
63 3.1 With configure
64 
65 ./configure --with-dpdk=$RTE_SDK/$RTE_TARGET && make -s all && make -s testprogs && make install
66 
67 3.2 With cmake
68 
69 mkdir -p build && cd build && cmake -DDPDK_DIR=$RTE_SDK/$RTE_TARGET ../ && make -s all && make -s testprogs && make install
70 
71 4. Link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
72 And you shall set DPDK configure options by environment variable DPDK_CFG
73 For example, the testprogs/capturetest could be launched by:
74 
75 env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0
76 */
77 
78 #include <config.h>
79 
80 #include <errno.h>
81 #include <netdb.h>
82 #include <stdio.h>
83 #include <stdlib.h>
84 #include <string.h>
85 #include <unistd.h>
86 #include <limits.h> /* for INT_MAX */
87 #include <time.h>
88 
89 #include <sys/time.h>
90 
91 //header for calling dpdk
92 #include <rte_config.h>
93 #include <rte_common.h>
94 #include <rte_errno.h>
95 #include <rte_log.h>
96 #include <rte_malloc.h>
97 #include <rte_memory.h>
98 #include <rte_eal.h>
99 #include <rte_launch.h>
100 #include <rte_atomic.h>
101 #include <rte_cycles.h>
102 #include <rte_lcore.h>
103 #include <rte_per_lcore.h>
104 #include <rte_branch_prediction.h>
105 #include <rte_interrupts.h>
106 #include <rte_random.h>
107 #include <rte_debug.h>
108 #include <rte_ether.h>
109 #include <rte_ethdev.h>
110 #include <rte_mempool.h>
111 #include <rte_mbuf.h>
112 #include <rte_bus.h>
113 
114 #include "pcap-int.h"
115 #include "pcap-dpdk.h"
116 
/*
 * Deal with API changes that break source compatibility.
 */

// ETHER_ADDR_TYPE names the Ethernet address structure; which spelling is
// used depends on the build-time HAVE_STRUCT_RTE_ETHER_ADDR check.
#ifdef HAVE_STRUCT_RTE_ETHER_ADDR
#define ETHER_ADDR_TYPE	struct rte_ether_addr
#else
#define ETHER_ADDR_TYPE	struct ether_addr
#endif

// Global log level that dpdk_pre_init() installs before starting the EAL.
#define DPDK_DEF_LOG_LEV RTE_LOG_ERR
//
// This is set to 0 if we haven't initialized DPDK yet, 1 if we've
// successfully initialized it, a negative value, which is the negative
// of the rte_errno from rte_eal_init(), if we tried to initialize it
// and got an error.
//
static int is_dpdk_pre_inited=0;
// First token of the EAL command line built by dpdk_pre_init().
#define DPDK_LIB_NAME "libpcap_dpdk"
#define DPDK_DESC "Data Plane Development Kit (DPDK) Interface"
// Logged by dpdk_pre_init() when rte_eal_init() fails with EACCES.
#define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission"
// Capacity of the argv array handed to rte_eal_init(), including the
// terminating NULL entry written by parse_dpdk_cfg().
#define DPDK_ARGC_MAX 64
// Size in bytes of dpdk_cfg_buf.
#define DPDK_CFG_MAX_LEN 1024
#define DPDK_DEV_NAME_MAX 32
#define DPDK_DEV_DESC_MAX 512
// Environment variable from which dpdk_pre_init() reads the EAL options.
#define DPDK_CFG_ENV_NAME "DPDK_CFG"
// Polling interval, in milliseconds, used by dpdk_read_with_timeout() and
// for the required select timeout set up in pcap_dpdk_activate().
#define DPDK_DEF_MIN_SLEEP_MS 1
// Holds the EAL command line; parse_dpdk_cfg() splits it in place and the
// argv entries passed to rte_eal_init() point into it.
static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN];
// Size of the textual MAC address buffer in struct pcap_dpdk.
#define DPDK_MAC_ADDR_SIZE 32
// Text written by eth_addr_str() when it is given a NULL address.
#define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00"
#define DPDK_PCI_ADDR_SIZE 16
// EAL options used when the DPDK_CFG environment variable is unset.
#define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so"
// Prefix that identifies a DPDK device name, e.g. "dpdk:0".
#define DPDK_PREFIX "dpdk:"
// Returned by portid_by_device() for an invalid device name, so this value
// itself is never accepted as a port id.
#define DPDK_PORTID_MAX 65535U
#define MBUF_POOL_NAME "mbuf_pool"
#define DPDK_TX_BUF_NAME "tx_buffer"
//The number of elements in the mbuf pool.
#define DPDK_NB_MBUFS 8192U
#define MEMPOOL_CACHE_SIZE 256
// Maximum number of packets requested from the NIC per receive burst.
#define MAX_PKT_BURST 32
// Configurable number of RX/TX ring descriptors
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 1024

// Descriptor counts; pcap_dpdk_activate() lets
// rte_eth_dev_adjust_nb_rx_tx_desc() adjust them and then uses them for
// queue setup.
static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;

// Size of the per-handle buffer used to linearize multi-segment packets.
#ifdef RTE_ETHER_MAX_JUMBO_FRAME_LEN
#define RTE_ETH_PCAP_SNAPLEN RTE_ETHER_MAX_JUMBO_FRAME_LEN
#else
#define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN
#endif

// TX buffer allocated in pcap_dpdk_activate(); shared by the whole file.
static struct rte_eth_dev_tx_buffer *tx_buffer;
170 static struct rte_eth_dev_tx_buffer *tx_buffer;
171 
// Reference point used by calculate_timestamp() to convert DPDK timer
// cycles into wall-clock timestamps; filled in by dpdk_init_timer().
struct dpdk_ts_helper{
	struct timeval start_time; // gettimeofday() value sampled at init
	uint64_t start_cycles; // rte_get_timer_cycles() value sampled at init
	uint64_t hz; // rte_get_timer_hz(): timer cycles per second
};
177 struct pcap_dpdk{
178 	pcap_t * orig;
179 	uint16_t portid; // portid of DPDK
180 	int must_clear_promisc;
181 	uint64_t bpf_drop;
182 	int nonblock;
183 	struct timeval required_select_timeout;
184 	struct timeval prev_ts;
185 	struct rte_eth_stats prev_stats;
186 	struct timeval curr_ts;
187 	struct rte_eth_stats curr_stats;
188 	uint64_t pps;
189 	uint64_t bps;
190 	struct rte_mempool * pktmbuf_pool;
191 	struct dpdk_ts_helper ts_helper;
192 	ETHER_ADDR_TYPE eth_addr;
193 	char mac_addr[DPDK_MAC_ADDR_SIZE];
194 	char pci_addr[DPDK_PCI_ADDR_SIZE];
195 	unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN];
196 };
197 
// Base port configuration; pcap_dpdk_activate() copies it and then adds
// the fast-free TX offload flag when the device reports support for it.
static struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};
206 
207 static void	dpdk_fmt_errmsg_for_rte_errno(char *, size_t, int,
208     PCAP_FORMAT_STRING(const char *), ...) PCAP_PRINTFLIKE(4, 5);
209 
210 /*
211  * Generate an error message based on a format, arguments, and an
212  * rte_errno, with a message for the rte_errno after the formatted output.
213  */
214 static void dpdk_fmt_errmsg_for_rte_errno(char *errbuf, size_t errbuflen,
215     int errnum, const char *fmt, ...)
216 {
217 	va_list ap;
218 	size_t msglen;
219 	char *p;
220 	size_t errbuflen_remaining;
221 
222 	va_start(ap, fmt);
223 	vsnprintf(errbuf, errbuflen, fmt, ap);
224 	va_end(ap);
225 	msglen = strlen(errbuf);
226 
227 	/*
228 	 * Do we have enough space to append ": "?
229 	 * Including the terminating '\0', that's 3 bytes.
230 	 */
231 	if (msglen + 3 > errbuflen) {
232 		/* No - just give them what we've produced. */
233 		return;
234 	}
235 	p = errbuf + msglen;
236 	errbuflen_remaining = errbuflen - msglen;
237 	*p++ = ':';
238 	*p++ = ' ';
239 	*p = '\0';
240 	msglen += 2;
241 	errbuflen_remaining -= 2;
242 
243 	/*
244 	 * Now append the string for the error code.
245 	 * rte_strerror() is thread-safe, at least as of dpdk 18.11,
246 	 * unlike strerror() - it uses strerror_r() rather than strerror()
247 	 * for UN*X errno values, and prints to what I assume is a per-thread
248 	 * buffer (based on the "PER_LCORE" in "RTE_DEFINE_PER_LCORE" used
249 	 * to declare the buffers statically) for DPDK errors.
250 	 */
251 	snprintf(p, errbuflen_remaining, "%s", rte_strerror(errnum));
252 }
253 
254 static int dpdk_init_timer(struct pcap_dpdk *pd){
255 	gettimeofday(&(pd->ts_helper.start_time),NULL);
256 	pd->ts_helper.start_cycles = rte_get_timer_cycles();
257 	pd->ts_helper.hz = rte_get_timer_hz();
258 	if (pd->ts_helper.hz == 0){
259 		return -1;
260 	}
261 	return 0;
262 }
263 static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts)
264 {
265 	uint64_t cycles;
266 	// delta
267 	struct timeval cur_time;
268 	cycles = rte_get_timer_cycles() - helper->start_cycles;
269 	cur_time.tv_sec = (time_t)(cycles/helper->hz);
270 	cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz);
271 	timeradd(&(helper->start_time), &cur_time, ts);
272 }
273 
274 static uint32_t dpdk_gather_data(unsigned char *data, uint32_t len, struct rte_mbuf *mbuf)
275 {
276 	uint32_t total_len = 0;
277 	while (mbuf && (total_len+mbuf->data_len) < len ){
278 		rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len);
279 		total_len+=mbuf->data_len;
280 		mbuf=mbuf->next;
281 	}
282 	return total_len;
283 }
284 
285 
286 static int dpdk_read_with_timeout(pcap_t *p, struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){
287 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
288 	int nb_rx = 0;
289 	int timeout_ms = p->opt.timeout;
290 	int sleep_ms = 0;
291 	if (pd->nonblock){
292 		// In non-blocking mode, just read once, no matter how many packets are captured.
293 		nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
294 	}else{
295 		// In blocking mode, read many times until packets are captured or timeout or break_loop is set.
296 		// if timeout_ms == 0, it may be blocked forever.
297 		while (timeout_ms == 0 || sleep_ms < timeout_ms){
298 			nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
299 			if (nb_rx){ // got packets within timeout_ms
300 				break;
301 			}else{ // no packet arrives at this round.
302 				if (p->break_loop){
303 					break;
304 				}
305 				// sleep for a very short while.
306 				// block sleep is the only choice, since usleep() will impact performance dramatically.
307 				rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000);
308 				sleep_ms += DPDK_DEF_MIN_SLEEP_MS;
309 			}
310 		}
311 	}
312 	return nb_rx;
313 }
314 
315 static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg)
316 {
317 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
318 	int burst_cnt = 0;
319 	int nb_rx = 0;
320 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
321 	struct rte_mbuf *m;
322 	struct pcap_pkthdr pcap_header;
323 	// In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment
324 	uint32_t pkt_len = 0;
325 	uint32_t caplen = 0;
326 	u_char *bp = NULL;
327 	int i=0;
328 	unsigned int gather_len =0;
329 	int pkt_cnt = 0;
330 	u_char *large_buffer=NULL;
331 	int timeout_ms = p->opt.timeout;
332 
333 	/*
334 	 * This can conceivably process more than INT_MAX packets,
335 	 * which would overflow the packet count, causing it either
336 	 * to look like a negative number, and thus cause us to
337 	 * return a value that looks like an error, or overflow
338 	 * back into positive territory, and thus cause us to
339 	 * return a too-low count.
340 	 *
341 	 * Therefore, if the packet count is unlimited, we clip
342 	 * it at INT_MAX; this routine is not expected to
343 	 * process packets indefinitely, so that's not an issue.
344 	 */
345 	if (PACKET_COUNT_IS_UNLIMITED(max_cnt))
346 		max_cnt = INT_MAX;
347 
348 	if (max_cnt < MAX_PKT_BURST){
349 		burst_cnt = max_cnt;
350 	}else{
351 		burst_cnt = MAX_PKT_BURST;
352 	}
353 
354 	while( pkt_cnt < max_cnt){
355 		if (p->break_loop){
356 			p->break_loop = 0;
357 			return PCAP_ERROR_BREAK;
358 		}
359 		// read once in non-blocking mode, or try many times waiting for timeout_ms.
360 		// if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is set.
361 		nb_rx = dpdk_read_with_timeout(p, pkts_burst, burst_cnt);
362 		if (nb_rx == 0){
363 			if (pd->nonblock){
364 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n");
365 			}else{
366 				if (p->break_loop){
367 					RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is set in blocking mode.\n");
368 					p->break_loop = 0;
369 					return PCAP_ERROR_BREAK;
370 
371 				}
372 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms);
373 			}
374 			// break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode.
375 			break;
376 		}
377 		pkt_cnt += nb_rx;
378 		for ( i = 0; i < nb_rx; i++) {
379 			m = pkts_burst[i];
380 			calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts));
381 			pkt_len = rte_pktmbuf_pkt_len(m);
382 			// caplen = min(pkt_len, p->snapshot);
383 			// caplen will not be changed, no matter how long the rte_pktmbuf
384 			caplen = pkt_len < (uint32_t)p->snapshot ? pkt_len: (uint32_t)p->snapshot;
385 			pcap_header.caplen = caplen;
386 			pcap_header.len = pkt_len;
387 			// volatile prefetch
388 			rte_prefetch0(rte_pktmbuf_mtod(m, void *));
389 			bp = NULL;
390 			if (m->nb_segs == 1)
391 			{
392 				bp = rte_pktmbuf_mtod(m, u_char *);
393 			}else{
394 				// use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free
395 				if ( pkt_len <= RTE_ETH_PCAP_SNAPLEN)
396 				{
397 					gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m);
398 					bp = pd->pcap_tmp_buf;
399 				}else{
400 					// need call free later
401 					large_buffer = (u_char *)malloc(caplen*sizeof(u_char));
402 					gather_len = dpdk_gather_data(large_buffer, caplen, m);
403 					bp = large_buffer;
404 				}
405 
406 			}
407 			if (bp){
408 				if (p->fcode.bf_insns==NULL || pcapint_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){
409 					cb(cb_arg, &pcap_header, bp);
410 				}else{
411 					pd->bpf_drop++;
412 				}
413 			}
414 			//free all pktmbuf
415 			rte_pktmbuf_free(m);
416 			if (large_buffer){
417 				free(large_buffer);
418 				large_buffer=NULL;
419 			}
420 		}
421 	}
422 	return pkt_cnt;
423 }
424 
425 static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_)
426 {
427 	//not implemented yet
428 	pcapint_strlcpy(p->errbuf,
429 	    "dpdk error: Inject function has not been implemented yet",
430 	    PCAP_ERRBUF_SIZE);
431 	return PCAP_ERROR;
432 }
433 
434 static void pcap_dpdk_close(pcap_t *p)
435 {
436 	struct pcap_dpdk *pd = p->priv;
437 	if (pd==NULL)
438 	{
439 		return;
440 	}
441 	if (pd->must_clear_promisc)
442 	{
443 		rte_eth_promiscuous_disable(pd->portid);
444 	}
445 	rte_eth_dev_stop(pd->portid);
446 	rte_eth_dev_close(pd->portid);
447 	pcapint_cleanup_live_common(p);
448 }
449 
450 static void nic_stats_display(struct pcap_dpdk *pd)
451 {
452 	uint16_t portid = pd->portid;
453 	struct rte_eth_stats stats;
454 	rte_eth_stats_get(portid, &stats);
455 	RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64"  RX-errors:  %-10"PRIu64
456 	       "  RX-bytes:  %-10"PRIu64"  RX-Imissed:  %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors,
457 	       stats.ibytes,stats.imissed);
458 	RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f );
459 }
460 
461 static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps)
462 {
463 	struct pcap_dpdk *pd = p->priv;
464 	calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts));
465 	rte_eth_stats_get(pd->portid,&(pd->curr_stats));
466 	if (ps){
467 		ps->ps_recv = pd->curr_stats.ipackets;
468 		ps->ps_drop = pd->curr_stats.ierrors;
469 		ps->ps_drop += pd->bpf_drop;
470 		ps->ps_ifdrop = pd->curr_stats.imissed;
471 	}
472 	uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets;
473 	struct timeval delta_tm;
474 	timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm);
475 	uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec;
476 	uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8;
477 	RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit);
478 	pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec);
479 	pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec);
480 	nic_stats_display(pd);
481 	pd->prev_stats = pd->curr_stats;
482 	pd->prev_ts = pd->curr_ts;
483 	return 0;
484 }
485 
486 static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){
487 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
488 	pd->nonblock = nonblock;
489 	return 0;
490 }
491 
492 static int pcap_dpdk_getnonblock(pcap_t *p){
493 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
494 	return pd->nonblock;
495 }
496 static int check_link_status(uint16_t portid, struct rte_eth_link *plink)
497 {
498 	// wait up to 9 seconds to get link status
499 	rte_eth_link_get(portid, plink);
500 	return plink->link_status == ETH_LINK_UP;
501 }
502 static void eth_addr_str(ETHER_ADDR_TYPE *addrp, char* mac_str, int len)
503 {
504 	int offset=0;
505 	if (addrp == NULL){
506 		snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR);
507 		return;
508 	}
509 	for (int i=0; i<6; i++)
510 	{
511 		if (offset >= len)
512 		{ // buffer overflow
513 			return;
514 		}
515 		if (i==0)
516 		{
517 			snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]);
518 			offset+=2; // FF
519 		}else{
520 			snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]);
521 			offset+=3; // :FF
522 		}
523 	}
524 	return;
525 }
526 // return portid by device name, otherwise return -1
527 static uint16_t portid_by_device(char * device)
528 {
529 	uint16_t ret = DPDK_PORTID_MAX;
530 	size_t len = strlen(device);
531 	size_t prefix_len = strlen(DPDK_PREFIX);
532 	unsigned long ret_ul = 0L;
533 	char *pEnd;
534 	if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk:
535 	{
536 		return ret;
537 	}
538 	//check all chars are digital
539 	for (int i=prefix_len; device[i]; i++){
540 		if (device[i]<'0' || device[i]>'9'){
541 			return ret;
542 		}
543 	}
544 	ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10);
545 	if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){
546 		return ret;
547 	}
548 	// too large for portid
549 	if (ret_ul >= DPDK_PORTID_MAX){
550 		return ret;
551 	}
552 	ret = (uint16_t)ret_ul;
553 	return ret;
554 }
555 
556 static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv)
557 {
558 	int cnt=0;
559 	memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX);
560 	//current process name
561 	int skip_space = 1;
562 	int i=0;
563 	RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg);
564 	// find first non space char
565 	// The last opt is NULL
566 	for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){
567 		if (skip_space && dpdk_cfg[i]!=' '){ // not space
568 			skip_space=!skip_space; // skip normal char
569 			dargv[cnt++] = dpdk_cfg+i;
570 		}
571 		if (!skip_space && dpdk_cfg[i]==' '){ // find a space
572 			dpdk_cfg[i]=0x00; // end of this opt
573 			skip_space=!skip_space; // skip space char
574 		}
575 	}
576 	dargv[cnt]=NULL;
577 	return cnt;
578 }
579 
580 // only called once
581 // Returns:
582 //
583 //    1 on success;
584 //
585 //    0 if "the EAL cannot initialize on this system", which we treat as
586 //    meaning "DPDK isn't available";
587 //
588 //    a PCAP_ERROR_ code for other errors.
589 //
590 // If eaccess_not_fatal is non-zero, treat "a permissions issue" the way
591 // we treat "the EAL cannot initialize on this system".  We use that
592 // when trying to find DPDK devices, as we don't want to fail to return
593 // *any* devices just because we can't support DPDK; when we're trying
594 // to open a device, we need to return a permissions error in that case.
595 static int dpdk_pre_init(char * ebuf, int eaccess_not_fatal)
596 {
597 	int dargv_cnt=0;
598 	char *dargv[DPDK_ARGC_MAX];
599 	char *ptr_dpdk_cfg = NULL;
600 	int ret;
601 	// global var
602 	if (is_dpdk_pre_inited != 0)
603 	{
604 		// already inited; did that succeed?
605 		if (is_dpdk_pre_inited < 0)
606 		{
607 			// failed
608 			goto error;
609 		}
610 		else
611 		{
612 			// succeeded
613 			return 1;
614 		}
615 	}
616 	// init EAL
617 	ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME);
618 	// set default log level to debug
619 	rte_log_set_global_level(DPDK_DEF_LOG_LEV);
620 	if (ptr_dpdk_cfg == NULL)
621 	{
622 		RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG);
623 		ptr_dpdk_cfg = DPDK_DEF_CFG;
624 	}
625 	memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf));
626 	snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg);
627 	dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv);
628 	ret = rte_eal_init(dargv_cnt,dargv);
629 	if (ret == -1)
630 	{
631 		// Indicate that we've called rte_eal_init() by setting
632 		// is_dpdk_pre_inited to the negative of the error code,
633 		// and process the error.
634 		is_dpdk_pre_inited = -rte_errno;
635 		goto error;
636 	}
637 	// init succeeded, so we do not need to do it again later.
638 	is_dpdk_pre_inited = 1;
639 	return 1;
640 
641 error:
642 	switch (-is_dpdk_pre_inited)
643 	{
644 		case EACCES:
645 			// This "indicates a permissions issue.".
646 			RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG);
647 			// If we were told to treat this as just meaning
648 			// DPDK isn't available, do so.
649 			if (eaccess_not_fatal)
650 				return 0;
651 			// Otherwise report a fatal error.
652 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
653 			    "DPDK requires that it run as root");
654 			return PCAP_ERROR_PERM_DENIED;
655 
656 		case EAGAIN:
657 			// This "indicates either a bus or system
658 			// resource was not available, setup may
659 			// be attempted again."
660 			// There's no such error in pcap, so I'm
661 			// not sure what we should do here.
662 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
663 			    "Bus or system resource was not available");
664 			break;
665 
666 		case EALREADY:
667 			// This "indicates that the rte_eal_init
668 			// function has already been called, and
669 			// cannot be called again."
670 			// That's not an error; set the "we've
671 			// been here before" flag and return
672 			// success.
673 			is_dpdk_pre_inited = 1;
674 			return 1;
675 
676 		case EFAULT:
677 			// This "indicates the tailq configuration
678 			// name was not found in memory configuration."
679 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
680 			    "The tailq configuration name was not found in the memory configuration");
681 			return PCAP_ERROR;
682 
683 		case EINVAL:
684 			// This "indicates invalid parameters were
685 			// passed as argv/argc."  Those came from
686 			// the configuration file.
687 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
688 			    "The configuration file has invalid parameters");
689 			break;
690 
691 		case ENOMEM:
692 			// This "indicates failure likely caused by
693 			// an out-of-memory condition."
694 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
695 			    "Out of memory");
696 			break;
697 
698 		case ENODEV:
699 			// This "indicates memory setup issues."
700 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
701 			    "An error occurred setting up memory");
702 			break;
703 
704 		case ENOTSUP:
705 			// This "indicates that the EAL cannot
706 			// initialize on this system."  We treat
707 			// that as meaning DPDK isn't available
708 			// on this machine, rather than as a
709 			// fatal error, and let our caller decide
710 			// whether that's a fatal error (if trying
711 			// to activate a DPDK device) or not (if
712 			// trying to enumerate devices).
713 			return 0;
714 
715 		case EPROTO:
716 			// This "indicates that the PCI bus is
717 			// either not present, or is not readable
718 			// by the eal."  Does "the PCI bus is not
719 			// present" mean "this machine has no PCI
720 			// bus", which strikes me as a "not available"
721 			// case?  If so, should "is not readable by
722 			// the EAL" also something we should treat
723 			// as a "not available" case?  If not, we
724 			// can't distinguish between the two, so
725 			// we're stuck.
726 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
727 			    "PCI bus is not present or not readable by the EAL");
728 			break;
729 
730 		case ENOEXEC:
731 			// This "indicates that a service core
732 			// failed to launch successfully."
733 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
734 			    "A service core failed to launch successfully");
735 			break;
736 
737 		default:
738 			//
739 			// That's not in the list of errors in
740 			// the documentation; let it be reported
741 			// as an error.
742 			//
743 			dpdk_fmt_errmsg_for_rte_errno(ebuf,
744 			    PCAP_ERRBUF_SIZE, -is_dpdk_pre_inited,
745 			    "dpdk error: dpdk_pre_init failed");
746 			break;
747 	}
748 	// Error.
749 	return PCAP_ERROR;
750 }
751 
752 static int pcap_dpdk_activate(pcap_t *p)
753 {
754 	struct pcap_dpdk *pd = p->priv;
755 	pd->orig = p;
756 	int ret = PCAP_ERROR;
757 	uint16_t nb_ports=0;
758 	uint16_t portid= DPDK_PORTID_MAX;
759 	unsigned nb_mbufs = DPDK_NB_MBUFS;
760 	struct rte_eth_rxconf rxq_conf;
761 	struct rte_eth_txconf txq_conf;
762 	struct rte_eth_conf local_port_conf = port_conf;
763 	struct rte_eth_dev_info dev_info;
764 	int is_port_up = 0;
765 	struct rte_eth_link link;
766 	do{
767 		//init EAL; fail if we have insufficient permission
768 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
769 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 0);
770 		if (ret < 0)
771 		{
772 			// This returns a negative value on an error.
773 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
774 			    "Can't open device %s: %s",
775 			    p->opt.device, dpdk_pre_init_errbuf);
776 			// ret is set to the correct error
777 			break;
778 		}
779 		if (ret == 0)
780 		{
781 			// This means DPDK isn't available on this machine.
782 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
783 			    "Can't open device %s: DPDK is not available on this machine",
784 			    p->opt.device);
785 			return PCAP_ERROR_NO_SUCH_DEVICE;
786 		}
787 
788 		ret = dpdk_init_timer(pd);
789 		if (ret<0)
790 		{
791 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
792 				"dpdk error: Init timer is zero with device %s",
793 				p->opt.device);
794 			ret = PCAP_ERROR;
795 			break;
796 		}
797 
798 		nb_ports = rte_eth_dev_count_avail();
799 		if (nb_ports == 0)
800 		{
801 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
802 			    "dpdk error: No Ethernet ports");
803 			ret = PCAP_ERROR;
804 			break;
805 		}
806 
807 		portid = portid_by_device(p->opt.device);
808 		if (portid == DPDK_PORTID_MAX){
809 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
810 			    "dpdk error: portid is invalid. device %s",
811 			    p->opt.device);
812 			ret = PCAP_ERROR_NO_SUCH_DEVICE;
813 			break;
814 		}
815 
816 		pd->portid = portid;
817 
818 		if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN)
819 		{
820 			p->snapshot = MAXIMUM_SNAPLEN;
821 		}
822 		// create the mbuf pool
823 		pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs,
824 			MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
825 			rte_socket_id());
826 		if (pd->pktmbuf_pool == NULL)
827 		{
828 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
829 			    PCAP_ERRBUF_SIZE, rte_errno,
830 			    "dpdk error: Cannot init mbuf pool");
831 			ret = PCAP_ERROR;
832 			break;
833 		}
834 		// config dev
835 		rte_eth_dev_info_get(portid, &dev_info);
836 		if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
837 		{
838 			local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE;
839 		}
840 		// only support 1 queue
841 		ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
842 		if (ret < 0)
843 		{
844 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
845 			    PCAP_ERRBUF_SIZE, -ret,
846 			    "dpdk error: Cannot configure device: port=%u",
847 			    portid);
848 			ret = PCAP_ERROR;
849 			break;
850 		}
851 		// adjust rx tx
852 		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
853 		if (ret < 0)
854 		{
855 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
856 			    PCAP_ERRBUF_SIZE, -ret,
857 			    "dpdk error: Cannot adjust number of descriptors: port=%u",
858 			    portid);
859 			ret = PCAP_ERROR;
860 			break;
861 		}
862 		// get MAC addr
863 		rte_eth_macaddr_get(portid, &(pd->eth_addr));
864 		eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1);
865 
866 		// init one RX queue
867 		rxq_conf = dev_info.default_rxconf;
868 		rxq_conf.offloads = local_port_conf.rxmode.offloads;
869 		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
870 					     rte_eth_dev_socket_id(portid),
871 					     &rxq_conf,
872 					     pd->pktmbuf_pool);
873 		if (ret < 0)
874 		{
875 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
876 			    PCAP_ERRBUF_SIZE, -ret,
877 			    "dpdk error: rte_eth_rx_queue_setup:port=%u",
878 			    portid);
879 			ret = PCAP_ERROR;
880 			break;
881 		}
882 
883 		// init one TX queue
884 		txq_conf = dev_info.default_txconf;
885 		txq_conf.offloads = local_port_conf.txmode.offloads;
886 		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
887 				rte_eth_dev_socket_id(portid),
888 				&txq_conf);
889 		if (ret < 0)
890 		{
891 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
892 			    PCAP_ERRBUF_SIZE, -ret,
893 			    "dpdk error: rte_eth_tx_queue_setup:port=%u",
894 			    portid);
895 			ret = PCAP_ERROR;
896 			break;
897 		}
898 		// Initialize TX buffers
899 		tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME,
900 				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
901 				rte_eth_dev_socket_id(portid));
902 		if (tx_buffer == NULL)
903 		{
904 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
905 			    "dpdk error: Cannot allocate buffer for tx on port %u", portid);
906 			ret = PCAP_ERROR;
907 			break;
908 		}
909 		rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST);
910 		// Start device
911 		ret = rte_eth_dev_start(portid);
912 		if (ret < 0)
913 		{
914 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
915 			    PCAP_ERRBUF_SIZE, -ret,
916 			    "dpdk error: rte_eth_dev_start:port=%u",
917 			    portid);
918 			ret = PCAP_ERROR;
919 			break;
920 		}
921 		// set promiscuous mode
922 		if (p->opt.promisc){
923 			pd->must_clear_promisc=1;
924 			rte_eth_promiscuous_enable(portid);
925 		}
926 		// check link status
927 		is_port_up = check_link_status(portid, &link);
928 		if (!is_port_up){
929 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
930 			    "dpdk error: link is down, port=%u",portid);
931 			ret = PCAP_ERROR_IFACE_NOT_UP;
932 			break;
933 		}
934 		// reset statistics
935 		rte_eth_stats_reset(pd->portid);
936 		calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts));
937 		rte_eth_stats_get(pd->portid,&(pd->prev_stats));
938 		// format pcap_t
939 		pd->portid = portid;
940 		p->fd = pd->portid;
941 		if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN)
942 		{
943 			p->snapshot = MAXIMUM_SNAPLEN;
944 		}
945 		p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical.
946 		p->selectable_fd = p->fd;
947 		p->read_op = pcap_dpdk_dispatch;
948 		p->inject_op = pcap_dpdk_inject;
949 		// using pcapint_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter.
950 		p->setfilter_op = pcapint_install_bpf_program;
951 		p->setdirection_op = NULL;
952 		p->set_datalink_op = NULL;
953 		p->getnonblock_op = pcap_dpdk_getnonblock;
954 		p->setnonblock_op = pcap_dpdk_setnonblock;
955 		p->stats_op = pcap_dpdk_stats;
956 		p->cleanup_op = pcap_dpdk_close;
957 		p->breakloop_op = pcapint_breakloop_common;
958 		// set default timeout
959 		pd->required_select_timeout.tv_sec = 0;
960 		pd->required_select_timeout.tv_usec = DPDK_DEF_MIN_SLEEP_MS*1000;
961 		p->required_select_timeout = &pd->required_select_timeout;
962 		ret = 0; // OK
963 	}while(0);
964 
965 	if (ret <= PCAP_ERROR) // all kinds of error code
966 	{
967 		pcapint_cleanup_live_common(p);
968 	}else{
969 		rte_eth_dev_get_name_by_port(portid,pd->pci_addr);
970 		RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr);
971 		RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n",
972 							portid, link.link_speed,
973 					(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
974 						("full-duplex") : ("half-duplex\n"));
975 	}
976 	return ret;
977 }
978 
979 // device name for dpdk should be in the form as dpdk:number, such as dpdk:0
980 pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours)
981 {
982 	pcap_t *p=NULL;
983 	*is_ours = 0;
984 
985 	*is_ours = !strncmp(device, "dpdk:", 5);
986 	if (! *is_ours)
987 		return NULL;
988 	//memset will happen
989 	p = PCAP_CREATE_COMMON(ebuf, struct pcap_dpdk);
990 
991 	if (p == NULL)
992 		return NULL;
993 	p->activate_op = pcap_dpdk_activate;
994 	return p;
995 }
996 
997 int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf)
998 {
999 	int ret=0;
1000 	unsigned int nb_ports = 0;
1001 	char dpdk_name[DPDK_DEV_NAME_MAX];
1002 	char dpdk_desc[DPDK_DEV_DESC_MAX];
1003 	ETHER_ADDR_TYPE eth_addr;
1004 	char mac_addr[DPDK_MAC_ADDR_SIZE];
1005 	char pci_addr[DPDK_PCI_ADDR_SIZE];
1006 	do{
1007 		// init EAL; return "DPDK not available" if we
1008 		// have insufficient permission
1009 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
1010 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 1);
1011 		if (ret < 0)
1012 		{
1013 			// This returns a negative value on an error.
1014 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
1015 			    "Can't look for DPDK devices: %s",
1016 			    dpdk_pre_init_errbuf);
1017 			ret = PCAP_ERROR;
1018 			break;
1019 		}
1020 		if (ret == 0)
1021 		{
1022 			// This means DPDK isn't available on this machine.
1023 			// That just means "don't return any devices".
1024 			break;
1025 		}
1026 		nb_ports = rte_eth_dev_count_avail();
1027 		if (nb_ports == 0)
1028 		{
1029 			// That just means "don't return any devices".
1030 			ret = 0;
1031 			break;
1032 		}
1033 		for (unsigned int i=0; i<nb_ports; i++){
1034 			snprintf(dpdk_name, DPDK_DEV_NAME_MAX-1,
1035 			    "%s%u", DPDK_PREFIX, i);
1036 			// mac addr
1037 			rte_eth_macaddr_get(i, &eth_addr);
1038 			eth_addr_str(&eth_addr,mac_addr,DPDK_MAC_ADDR_SIZE);
1039 			// PCI addr
1040 			rte_eth_dev_get_name_by_port(i,pci_addr);
1041 			snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr);
1042 			if (pcapint_add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){
1043 				ret = PCAP_ERROR;
1044 				break;
1045 			}
1046 		}
1047 	}while(0);
1048 	return ret;
1049 }
1050 
1051 #ifdef DPDK_ONLY
1052 /*
1053  * This libpcap build supports only DPDK, not regular network interfaces.
1054  */
1055 
1056 /*
1057  * There are no regular interfaces, just DPDK interfaces.
1058  */
1059 int
1060 pcapint_platform_finddevs(pcap_if_list_t *devlistp _U_, char *errbuf)
1061 {
1062 	return (0);
1063 }
1064 
1065 /*
1066  * Attempts to open a regular interface fail.
1067  */
1068 pcap_t *
1069 pcapint_create_interface(const char *device, char *errbuf)
1070 {
1071 	snprintf(errbuf, PCAP_ERRBUF_SIZE,
1072 	    "This version of libpcap only supports DPDK");
1073 	return NULL;
1074 }
1075 
1076 /*
1077  * Libpcap version string.
1078  */
1079 const char *
1080 pcap_lib_version(void)
1081 {
1082 	return (PCAP_VERSION_STRING " (DPDK-only)");
1083 }
1084 #endif
1085