xref: /freebsd/contrib/libpcap/pcap-dpdk.c (revision b3e7694832e81d7a904a10f525f8797b753bf0d3)
1 /*
2  * Copyright (C) 2018 jingle YANG. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28 Date: Dec 16, 2018
29 
30 Description:
31 1. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
32 2. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/)
33 3. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11.
34 
35 Limitations:
36 1. DPDK support will be on if DPDK is available. Please set DIR for --with-dpdk[=DIR] with ./configure or -DDPDK_DIR[=DIR] with cmake if DPDK is installed manually.
37 2. Only dynamic linking against libdpdk.so is supported, because libdpdk.a will not work correctly.
38 3. Only read operations are supported; packet injection has not been implemented yet.
39 
40 Usage:
41 1. Compile DPDK as shared library and install.(https://github.com/DPDK/dpdk.git)
42 
43 You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set:
44 CONFIG_RTE_BUILD_SHARED_LIB=y
45 By the following command:
46 sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config
47 
48 2. Launch l2fwd that is one of DPDK examples correctly, and get device information.
49 
50 You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio.
51 And enable hugepages by dpdk-setup.sh
52 
53 Then launch the l2fwd with dynamic driver support. For example:
54 $RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1
55 
56 3. Compile libpcap with dpdk options.
57 
58 If DPDK has not been found automatically, you shall export DPDK environment variable which are used for compiling DPDK. And then pass $RTE_SDK/$RTE_TARGET to --with-dpdk or -DDPDK_DIR
59 
60 export RTE_SDK={your DPDK base directory}
61 export RTE_TARGET={your target name}
62 
63 3.1 With configure
64 
65 ./configure --with-dpdk=$RTE_SDK/$RTE_TARGET && make -s all && make -s testprogs && make install
66 
67 3.2 With cmake
68 
69 mkdir -p build && cd build && cmake -DDPDK_DIR=$RTE_SDK/$RTE_TARGET ../ && make -s all && make -s testprogs && make install
70 
71 4. Link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
72 And you shall set DPDK configure options by environment variable DPDK_CFG
73 For example, the testprogs/capturetest could be launched by:
74 
75 env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0
76 */
77 
78 #ifdef HAVE_CONFIG_H
79 #include <config.h>
80 #endif
81 
82 #include <errno.h>
83 #include <netdb.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <string.h>
87 #include <unistd.h>
88 #include <limits.h> /* for INT_MAX */
89 #include <time.h>
90 
91 #include <sys/time.h>
92 
93 //header for calling dpdk
94 #include <rte_config.h>
95 #include <rte_common.h>
96 #include <rte_errno.h>
97 #include <rte_log.h>
98 #include <rte_malloc.h>
99 #include <rte_memory.h>
100 #include <rte_eal.h>
101 #include <rte_launch.h>
102 #include <rte_atomic.h>
103 #include <rte_cycles.h>
104 #include <rte_lcore.h>
105 #include <rte_per_lcore.h>
106 #include <rte_branch_prediction.h>
107 #include <rte_interrupts.h>
108 #include <rte_random.h>
109 #include <rte_debug.h>
110 #include <rte_ether.h>
111 #include <rte_ethdev.h>
112 #include <rte_mempool.h>
113 #include <rte_mbuf.h>
114 #include <rte_bus.h>
115 
116 #include "pcap-int.h"
117 #include "pcap-dpdk.h"
118 
/*
 * Deal with API changes that break source compatibility.
 */

// Ethernet address structure name; selected by the build-time
// HAVE_STRUCT_RTE_ETHER_ADDR check.
#ifdef HAVE_STRUCT_RTE_ETHER_ADDR
#define ETHER_ADDR_TYPE	struct rte_ether_addr
#else
#define ETHER_ADDR_TYPE	struct ether_addr
#endif

// Global log level applied by dpdk_pre_init() before the EAL is initialized.
#define DPDK_DEF_LOG_LEV RTE_LOG_ERR
//
// This is set to 0 if we haven't initialized DPDK yet, 1 if we've
// successfully initialized it, a negative value, which is the negative
// of the rte_errno from rte_eal_init(), if we tried to initialize it
// and got an error.
//
static int is_dpdk_pre_inited=0;
// Used as argv[0] of the argument vector handed to rte_eal_init().
#define DPDK_LIB_NAME "libpcap_dpdk"
#define DPDK_DESC "Data Plane Development Kit (DPDK) Interface"
#define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission"
// Capacity of the EAL argument vector, including the terminating NULL entry.
#define DPDK_ARGC_MAX 64
// Size of dpdk_cfg_buf, which holds the EAL command line being parsed.
#define DPDK_CFG_MAX_LEN 1024
// NOTE(review): the next two sizes are not used in the visible part of the
// file; presumably they size buffers in the device-enumeration code.
#define DPDK_DEV_NAME_MAX 32
#define DPDK_DEV_DESC_MAX 512
// Environment variable from which the EAL options are read.
#define DPDK_CFG_ENV_NAME "DPDK_CFG"
// Polling interval, in milliseconds, used while waiting for packets.
#define DPDK_DEF_MIN_SLEEP_MS 1
// Backing storage for the EAL argument strings; parse_dpdk_cfg() splits it
// in place and dargv[] points into it.
static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN];
#define DPDK_MAC_ADDR_SIZE 32
#define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00"
#define DPDK_PCI_ADDR_SIZE 16
// EAL options used when $DPDK_CFG is not set.
#define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so"
// Device names handled by this module start with this prefix.
#define DPDK_PREFIX "dpdk:"
// Doubles as the "invalid port" value returned by portid_by_device().
#define DPDK_PORTID_MAX 65535U
#define MBUF_POOL_NAME "mbuf_pool"
#define DPDK_TX_BUF_NAME "tx_buffer"
//The number of elements in the mbuf pool.
#define DPDK_NB_MBUFS 8192U
#define MEMPOOL_CACHE_SIZE 256
// Maximum number of mbufs requested from rte_eth_rx_burst() in one call.
#define MAX_PKT_BURST 32
// Configurable number of RX/TX ring descriptors
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 1024

// Descriptor counts; passed by address to rte_eth_dev_adjust_nb_rx_tx_desc()
// during activation and then used for the queue setup calls.
static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;

// Size of the per-handle reassembly buffer for multi-segment packets.
#ifdef RTE_ETHER_MAX_JUMBO_FRAME_LEN
#define RTE_ETH_PCAP_SNAPLEN RTE_ETHER_MAX_JUMBO_FRAME_LEN
#else
#define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN
#endif

// TX buffer allocated and initialized during activation; nothing in the
// visible part of the file transmits through it.
static struct rte_eth_dev_tx_buffer *tx_buffer;
173 
// Reference point used to turn the DPDK timer cycle counter into
// wall-clock timestamps; filled in by dpdk_init_timer().
struct dpdk_ts_helper{
	struct timeval start_time; // gettimeofday() value sampled at init
	uint64_t start_cycles; // rte_get_timer_cycles() value sampled at init
	uint64_t hz; // rte_get_timer_hz(); cycles per second, used as a divisor
};
179 struct pcap_dpdk{
180 	pcap_t * orig;
181 	uint16_t portid; // portid of DPDK
182 	int must_clear_promisc;
183 	uint64_t bpf_drop;
184 	int nonblock;
185 	struct timeval required_select_timeout;
186 	struct timeval prev_ts;
187 	struct rte_eth_stats prev_stats;
188 	struct timeval curr_ts;
189 	struct rte_eth_stats curr_stats;
190 	uint64_t pps;
191 	uint64_t bps;
192 	struct rte_mempool * pktmbuf_pool;
193 	struct dpdk_ts_helper ts_helper;
194 	ETHER_ADDR_TYPE eth_addr;
195 	char mac_addr[DPDK_MAC_ADDR_SIZE];
196 	char pci_addr[DPDK_PCI_ADDR_SIZE];
197 	unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN];
198 };
199 
// Template port configuration.  pcap_dpdk_activate() copies it into a local
// rte_eth_conf, may add TX offload flags to the copy, and passes the copy to
// rte_eth_dev_configure(); fields not listed here are zero-initialized.
static struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};
208 
209 static void	dpdk_fmt_errmsg_for_rte_errno(char *, size_t, int,
210     PCAP_FORMAT_STRING(const char *), ...) PCAP_PRINTFLIKE(4, 5);
211 
/*
 * Build an error message in errbuf from a printf-style format and its
 * arguments, then append ": " and the rte_strerror() text for errnum.
 *
 * If the formatted text leaves no room for the ": " separator (plus the
 * terminating '\0'), the buffer is returned with the formatted text only.
 */
static void dpdk_fmt_errmsg_for_rte_errno(char *errbuf, size_t errbuflen,
    int errnum, const char *fmt, ...)
{
	va_list args;
	size_t used;
	char *tail;

	va_start(args, fmt);
	vsnprintf(errbuf, errbuflen, fmt, args);
	va_end(args);

	used = strlen(errbuf);

	/* The separator and the terminating '\0' need 3 bytes in total. */
	if (used + 3 > errbuflen)
		return;

	tail = errbuf + used;
	tail[0] = ':';
	tail[1] = ' ';
	tail[2] = '\0';
	tail += 2;
	used += 2;

	/*
	 * Append the text for the error code.  rte_strerror() is used
	 * instead of strerror(); the original author notes that, at least
	 * as of DPDK 18.11, it is thread-safe (strerror_r() for UN*X errno
	 * values and what appears to be a per-lcore buffer for DPDK-specific
	 * errors).
	 */
	snprintf(tail, errbuflen - used, "%s", rte_strerror(errnum));
}
255 
256 static int dpdk_init_timer(struct pcap_dpdk *pd){
257 	gettimeofday(&(pd->ts_helper.start_time),NULL);
258 	pd->ts_helper.start_cycles = rte_get_timer_cycles();
259 	pd->ts_helper.hz = rte_get_timer_hz();
260 	if (pd->ts_helper.hz == 0){
261 		return -1;
262 	}
263 	return 0;
264 }
265 static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts)
266 {
267 	uint64_t cycles;
268 	// delta
269 	struct timeval cur_time;
270 	cycles = rte_get_timer_cycles() - helper->start_cycles;
271 	cur_time.tv_sec = (time_t)(cycles/helper->hz);
272 	cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz);
273 	timeradd(&(helper->start_time), &cur_time, ts);
274 }
275 
276 static uint32_t dpdk_gather_data(unsigned char *data, uint32_t len, struct rte_mbuf *mbuf)
277 {
278 	uint32_t total_len = 0;
279 	while (mbuf && (total_len+mbuf->data_len) < len ){
280 		rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len);
281 		total_len+=mbuf->data_len;
282 		mbuf=mbuf->next;
283 	}
284 	return total_len;
285 }
286 
287 
288 static int dpdk_read_with_timeout(pcap_t *p, struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){
289 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
290 	int nb_rx = 0;
291 	int timeout_ms = p->opt.timeout;
292 	int sleep_ms = 0;
293 	if (pd->nonblock){
294 		// In non-blocking mode, just read once, no matter how many packets are captured.
295 		nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
296 	}else{
297 		// In blocking mode, read many times until packets are captured or timeout or break_loop is set.
298 		// if timeout_ms == 0, it may be blocked forever.
299 		while (timeout_ms == 0 || sleep_ms < timeout_ms){
300 			nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
301 			if (nb_rx){ // got packets within timeout_ms
302 				break;
303 			}else{ // no packet arrives at this round.
304 				if (p->break_loop){
305 					break;
306 				}
307 				// sleep for a very short while.
308 				// block sleep is the only choice, since usleep() will impact performance dramatically.
309 				rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000);
310 				sleep_ms += DPDK_DEF_MIN_SLEEP_MS;
311 			}
312 		}
313 	}
314 	return nb_rx;
315 }
316 
317 static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg)
318 {
319 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
320 	int burst_cnt = 0;
321 	int nb_rx = 0;
322 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
323 	struct rte_mbuf *m;
324 	struct pcap_pkthdr pcap_header;
325 	// In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment
326 	uint32_t pkt_len = 0;
327 	uint32_t caplen = 0;
328 	u_char *bp = NULL;
329 	int i=0;
330 	unsigned int gather_len =0;
331 	int pkt_cnt = 0;
332 	u_char *large_buffer=NULL;
333 	int timeout_ms = p->opt.timeout;
334 
335 	/*
336 	 * This can conceivably process more than INT_MAX packets,
337 	 * which would overflow the packet count, causing it either
338 	 * to look like a negative number, and thus cause us to
339 	 * return a value that looks like an error, or overflow
340 	 * back into positive territory, and thus cause us to
341 	 * return a too-low count.
342 	 *
343 	 * Therefore, if the packet count is unlimited, we clip
344 	 * it at INT_MAX; this routine is not expected to
345 	 * process packets indefinitely, so that's not an issue.
346 	 */
347 	if (PACKET_COUNT_IS_UNLIMITED(max_cnt))
348 		max_cnt = INT_MAX;
349 
350 	if (max_cnt < MAX_PKT_BURST){
351 		burst_cnt = max_cnt;
352 	}else{
353 		burst_cnt = MAX_PKT_BURST;
354 	}
355 
356 	while( pkt_cnt < max_cnt){
357 		if (p->break_loop){
358 			p->break_loop = 0;
359 			return PCAP_ERROR_BREAK;
360 		}
361 		// read once in non-blocking mode, or try many times waiting for timeout_ms.
362 		// if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is set.
363 		nb_rx = dpdk_read_with_timeout(p, pkts_burst, burst_cnt);
364 		if (nb_rx == 0){
365 			if (pd->nonblock){
366 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n");
367 			}else{
368 				if (p->break_loop){
369 					RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is set in blocking mode.\n");
370 					p->break_loop = 0;
371 					return PCAP_ERROR_BREAK;
372 
373 				}
374 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms);
375 			}
376 			// break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode.
377 			break;
378 		}
379 		pkt_cnt += nb_rx;
380 		for ( i = 0; i < nb_rx; i++) {
381 			m = pkts_burst[i];
382 			calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts));
383 			pkt_len = rte_pktmbuf_pkt_len(m);
384 			// caplen = min(pkt_len, p->snapshot);
385 			// caplen will not be changed, no matter how long the rte_pktmbuf
386 			caplen = pkt_len < (uint32_t)p->snapshot ? pkt_len: (uint32_t)p->snapshot;
387 			pcap_header.caplen = caplen;
388 			pcap_header.len = pkt_len;
389 			// volatile prefetch
390 			rte_prefetch0(rte_pktmbuf_mtod(m, void *));
391 			bp = NULL;
392 			if (m->nb_segs == 1)
393 			{
394 				bp = rte_pktmbuf_mtod(m, u_char *);
395 			}else{
396 				// use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free
397 				if ( pkt_len <= RTE_ETH_PCAP_SNAPLEN)
398 				{
399 					gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m);
400 					bp = pd->pcap_tmp_buf;
401 				}else{
402 					// need call free later
403 					large_buffer = (u_char *)malloc(caplen*sizeof(u_char));
404 					gather_len = dpdk_gather_data(large_buffer, caplen, m);
405 					bp = large_buffer;
406 				}
407 
408 			}
409 			if (bp){
410 				if (p->fcode.bf_insns==NULL || pcap_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){
411 					cb(cb_arg, &pcap_header, bp);
412 				}else{
413 					pd->bpf_drop++;
414 				}
415 			}
416 			//free all pktmbuf
417 			rte_pktmbuf_free(m);
418 			if (large_buffer){
419 				free(large_buffer);
420 				large_buffer=NULL;
421 			}
422 		}
423 	}
424 	return pkt_cnt;
425 }
426 
427 static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_)
428 {
429 	//not implemented yet
430 	pcap_strlcpy(p->errbuf,
431 	    "dpdk error: Inject function has not been implemented yet",
432 	    PCAP_ERRBUF_SIZE);
433 	return PCAP_ERROR;
434 }
435 
436 static void pcap_dpdk_close(pcap_t *p)
437 {
438 	struct pcap_dpdk *pd = p->priv;
439 	if (pd==NULL)
440 	{
441 		return;
442 	}
443 	if (pd->must_clear_promisc)
444 	{
445 		rte_eth_promiscuous_disable(pd->portid);
446 	}
447 	rte_eth_dev_stop(pd->portid);
448 	rte_eth_dev_close(pd->portid);
449 	pcap_cleanup_live_common(p);
450 }
451 
452 static void nic_stats_display(struct pcap_dpdk *pd)
453 {
454 	uint16_t portid = pd->portid;
455 	struct rte_eth_stats stats;
456 	rte_eth_stats_get(portid, &stats);
457 	RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64"  RX-errors:  %-10"PRIu64
458 	       "  RX-bytes:  %-10"PRIu64"  RX-Imissed:  %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors,
459 	       stats.ibytes,stats.imissed);
460 	RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f );
461 }
462 
463 static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps)
464 {
465 	struct pcap_dpdk *pd = p->priv;
466 	calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts));
467 	rte_eth_stats_get(pd->portid,&(pd->curr_stats));
468 	if (ps){
469 		ps->ps_recv = pd->curr_stats.ipackets;
470 		ps->ps_drop = pd->curr_stats.ierrors;
471 		ps->ps_drop += pd->bpf_drop;
472 		ps->ps_ifdrop = pd->curr_stats.imissed;
473 	}
474 	uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets;
475 	struct timeval delta_tm;
476 	timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm);
477 	uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec;
478 	uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8;
479 	RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit);
480 	pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec);
481 	pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec);
482 	nic_stats_display(pd);
483 	pd->prev_stats = pd->curr_stats;
484 	pd->prev_ts = pd->curr_ts;
485 	return 0;
486 }
487 
488 static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){
489 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
490 	pd->nonblock = nonblock;
491 	return 0;
492 }
493 
494 static int pcap_dpdk_getnonblock(pcap_t *p){
495 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
496 	return pd->nonblock;
497 }
498 static int check_link_status(uint16_t portid, struct rte_eth_link *plink)
499 {
500 	// wait up to 9 seconds to get link status
501 	rte_eth_link_get(portid, plink);
502 	return plink->link_status == ETH_LINK_UP;
503 }
504 static void eth_addr_str(ETHER_ADDR_TYPE *addrp, char* mac_str, int len)
505 {
506 	int offset=0;
507 	if (addrp == NULL){
508 		snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR);
509 		return;
510 	}
511 	for (int i=0; i<6; i++)
512 	{
513 		if (offset >= len)
514 		{ // buffer overflow
515 			return;
516 		}
517 		if (i==0)
518 		{
519 			snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]);
520 			offset+=2; // FF
521 		}else{
522 			snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]);
523 			offset+=3; // :FF
524 		}
525 	}
526 	return;
527 }
528 // return portid by device name, otherwise return -1
529 static uint16_t portid_by_device(char * device)
530 {
531 	uint16_t ret = DPDK_PORTID_MAX;
532 	int len = strlen(device);
533 	int prefix_len = strlen(DPDK_PREFIX);
534 	unsigned long ret_ul = 0L;
535 	char *pEnd;
536 	if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk:
537 	{
538 		return ret;
539 	}
540 	//check all chars are digital
541 	for (int i=prefix_len; device[i]; i++){
542 		if (device[i]<'0' || device[i]>'9'){
543 			return ret;
544 		}
545 	}
546 	ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10);
547 	if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){
548 		return ret;
549 	}
550 	// too large for portid
551 	if (ret_ul >= DPDK_PORTID_MAX){
552 		return ret;
553 	}
554 	ret = (uint16_t)ret_ul;
555 	return ret;
556 }
557 
558 static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv)
559 {
560 	int cnt=0;
561 	memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX);
562 	//current process name
563 	int skip_space = 1;
564 	int i=0;
565 	RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg);
566 	// find first non space char
567 	// The last opt is NULL
568 	for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){
569 		if (skip_space && dpdk_cfg[i]!=' '){ // not space
570 			skip_space=!skip_space; // skip normal char
571 			dargv[cnt++] = dpdk_cfg+i;
572 		}
573 		if (!skip_space && dpdk_cfg[i]==' '){ // fint a space
574 			dpdk_cfg[i]=0x00; // end of this opt
575 			skip_space=!skip_space; // skip space char
576 		}
577 	}
578 	dargv[cnt]=NULL;
579 	return cnt;
580 }
581 
582 // only called once
583 // Returns:
584 //
585 //    1 on success;
586 //
587 //    0 if "the EAL cannot initialize on this system", which we treat as
588 //    meaning "DPDK isn't available";
589 //
590 //    a PCAP_ERROR_ code for other errors.
591 //
592 // If eaccess_not_fatal is non-zero, treat "a permissions issue" the way
593 // we treat "the EAL cannot initialize on this system".  We use that
594 // when trying to find DPDK devices, as we don't want to fail to return
595 // *any* devices just because we can't support DPDK; when we're trying
596 // to open a device, we need to return a permissions error in that case.
597 static int dpdk_pre_init(char * ebuf, int eaccess_not_fatal)
598 {
599 	int dargv_cnt=0;
600 	char *dargv[DPDK_ARGC_MAX];
601 	char *ptr_dpdk_cfg = NULL;
602 	int ret;
603 	// globale var
604 	if (is_dpdk_pre_inited != 0)
605 	{
606 		// already inited; did that succeed?
607 		if (is_dpdk_pre_inited < 0)
608 		{
609 			// failed
610 			goto error;
611 		}
612 		else
613 		{
614 			// succeeded
615 			return 1;
616 		}
617 	}
618 	// init EAL
619 	ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME);
620 	// set default log level to debug
621 	rte_log_set_global_level(DPDK_DEF_LOG_LEV);
622 	if (ptr_dpdk_cfg == NULL)
623 	{
624 		RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG);
625 		ptr_dpdk_cfg = DPDK_DEF_CFG;
626 	}
627 	memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf));
628 	snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg);
629 	dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv);
630 	ret = rte_eal_init(dargv_cnt,dargv);
631 	if (ret == -1)
632 	{
633 		// Indicate that we've called rte_eal_init() by setting
634 		// is_dpdk_pre_inited to the negative of the error code,
635 		// and process the error.
636 		is_dpdk_pre_inited = -rte_errno;
637 		goto error;
638 	}
639 	// init succeeded, so we do not need to do it again later.
640 	is_dpdk_pre_inited = 1;
641 	return 1;
642 
643 error:
644 	switch (-is_dpdk_pre_inited)
645 	{
646 		case EACCES:
647 			// This "indicates a permissions issue.".
648 			RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG);
649 			// If we were told to treat this as just meaning
650 			// DPDK isn't available, do so.
651 			if (eaccess_not_fatal)
652 				return 0;
653 			// Otherwise report a fatal error.
654 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
655 			    "DPDK requires that it run as root");
656 			return PCAP_ERROR_PERM_DENIED;
657 
658 		case EAGAIN:
659 			// This "indicates either a bus or system
660 			// resource was not available, setup may
661 			// be attempted again."
662 			// There's no such error in pcap, so I'm
663 			// not sure what we should do here.
664 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
665 			    "Bus or system resource was not available");
666 			break;
667 
668 		case EALREADY:
669 			// This "indicates that the rte_eal_init
670 			// function has already been called, and
671 			// cannot be called again."
672 			// That's not an error; set the "we've
673 			// been here before" flag and return
674 			// success.
675 			is_dpdk_pre_inited = 1;
676 			return 1;
677 
678 		case EFAULT:
679 			// This "indicates the tailq configuration
680 			// name was not found in memory configuration."
681 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
682 			    "The tailq configuration name was not found in the memory configuration");
683 			return PCAP_ERROR;
684 
685 		case EINVAL:
686 			// This "indicates invalid parameters were
687 			// passed as argv/argc."  Those came from
688 			// the configuration file.
689 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
690 			    "The configuration file has invalid parameters");
691 			break;
692 
693 		case ENOMEM:
694 			// This "indicates failure likely caused by
695 			// an out-of-memory condition."
696 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
697 			    "Out of memory");
698 			break;
699 
700 		case ENODEV:
701 			// This "indicates memory setup issues."
702 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
703 			    "An error occurred setting up memory");
704 			break;
705 
706 		case ENOTSUP:
707 			// This "indicates that the EAL cannot
708 			// initialize on this system."  We treat
709 			// that as meaning DPDK isn't available
710 			// on this machine, rather than as a
711 			// fatal error, and let our caller decide
712 			// whether that's a fatal error (if trying
713 			// to activate a DPDK device) or not (if
714 			// trying to enumerate devices).
715 			return 0;
716 
717 		case EPROTO:
718 			// This "indicates that the PCI bus is
719 			// either not present, or is not readable
720 			// by the eal."  Does "the PCI bus is not
721 			// present" mean "this machine has no PCI
722 			// bus", which strikes me as a "not available"
723 			// case?  If so, should "is not readable by
724 			// the EAL" also something we should treat
725 			// as a "not available" case?  If not, we
726 			// can't distinguish between the two, so
727 			// we're stuck.
728 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
729 			    "PCI bus is not present or not readable by the EAL");
730 			break;
731 
732 		case ENOEXEC:
733 			// This "indicates that a service core
734 			// failed to launch successfully."
735 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
736 			    "A service core failed to launch successfully");
737 			break;
738 
739 		default:
740 			//
741 			// That's not in the list of errors in
742 			// the documentation; let it be reported
743 			// as an error.
744 			//
745 			dpdk_fmt_errmsg_for_rte_errno(ebuf,
746 			    PCAP_ERRBUF_SIZE, -is_dpdk_pre_inited,
747 			    "dpdk error: dpdk_pre_init failed");
748 			break;
749 	}
750 	// Error.
751 	return PCAP_ERROR;
752 }
753 
754 static int pcap_dpdk_activate(pcap_t *p)
755 {
756 	struct pcap_dpdk *pd = p->priv;
757 	pd->orig = p;
758 	int ret = PCAP_ERROR;
759 	uint16_t nb_ports=0;
760 	uint16_t portid= DPDK_PORTID_MAX;
761 	unsigned nb_mbufs = DPDK_NB_MBUFS;
762 	struct rte_eth_rxconf rxq_conf;
763 	struct rte_eth_txconf txq_conf;
764 	struct rte_eth_conf local_port_conf = port_conf;
765 	struct rte_eth_dev_info dev_info;
766 	int is_port_up = 0;
767 	struct rte_eth_link link;
768 	do{
769 		//init EAL; fail if we have insufficient permission
770 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
771 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 0);
772 		if (ret < 0)
773 		{
774 			// This returns a negative value on an error.
775 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
776 			    "Can't open device %s: %s",
777 			    p->opt.device, dpdk_pre_init_errbuf);
778 			// ret is set to the correct error
779 			break;
780 		}
781 		if (ret == 0)
782 		{
783 			// This means DPDK isn't available on this machine.
784 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
785 			    "Can't open device %s: DPDK is not available on this machine",
786 			    p->opt.device);
787 			return PCAP_ERROR_NO_SUCH_DEVICE;
788 		}
789 
790 		ret = dpdk_init_timer(pd);
791 		if (ret<0)
792 		{
793 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
794 				"dpdk error: Init timer is zero with device %s",
795 				p->opt.device);
796 			ret = PCAP_ERROR;
797 			break;
798 		}
799 
800 		nb_ports = rte_eth_dev_count_avail();
801 		if (nb_ports == 0)
802 		{
803 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
804 			    "dpdk error: No Ethernet ports");
805 			ret = PCAP_ERROR;
806 			break;
807 		}
808 
809 		portid = portid_by_device(p->opt.device);
810 		if (portid == DPDK_PORTID_MAX){
811 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
812 			    "dpdk error: portid is invalid. device %s",
813 			    p->opt.device);
814 			ret = PCAP_ERROR_NO_SUCH_DEVICE;
815 			break;
816 		}
817 
818 		pd->portid = portid;
819 
820 		if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN)
821 		{
822 			p->snapshot = MAXIMUM_SNAPLEN;
823 		}
824 		// create the mbuf pool
825 		pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs,
826 			MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
827 			rte_socket_id());
828 		if (pd->pktmbuf_pool == NULL)
829 		{
830 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
831 			    PCAP_ERRBUF_SIZE, rte_errno,
832 			    "dpdk error: Cannot init mbuf pool");
833 			ret = PCAP_ERROR;
834 			break;
835 		}
836 		// config dev
837 		rte_eth_dev_info_get(portid, &dev_info);
838 		if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
839 		{
840 			local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE;
841 		}
842 		// only support 1 queue
843 		ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
844 		if (ret < 0)
845 		{
846 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
847 			    PCAP_ERRBUF_SIZE, -ret,
848 			    "dpdk error: Cannot configure device: port=%u",
849 			    portid);
850 			ret = PCAP_ERROR;
851 			break;
852 		}
853 		// adjust rx tx
854 		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
855 		if (ret < 0)
856 		{
857 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
858 			    PCAP_ERRBUF_SIZE, -ret,
859 			    "dpdk error: Cannot adjust number of descriptors: port=%u",
860 			    portid);
861 			ret = PCAP_ERROR;
862 			break;
863 		}
864 		// get MAC addr
865 		rte_eth_macaddr_get(portid, &(pd->eth_addr));
866 		eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1);
867 
868 		// init one RX queue
869 		rxq_conf = dev_info.default_rxconf;
870 		rxq_conf.offloads = local_port_conf.rxmode.offloads;
871 		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
872 					     rte_eth_dev_socket_id(portid),
873 					     &rxq_conf,
874 					     pd->pktmbuf_pool);
875 		if (ret < 0)
876 		{
877 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
878 			    PCAP_ERRBUF_SIZE, -ret,
879 			    "dpdk error: rte_eth_rx_queue_setup:port=%u",
880 			    portid);
881 			ret = PCAP_ERROR;
882 			break;
883 		}
884 
885 		// init one TX queue
886 		txq_conf = dev_info.default_txconf;
887 		txq_conf.offloads = local_port_conf.txmode.offloads;
888 		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
889 				rte_eth_dev_socket_id(portid),
890 				&txq_conf);
891 		if (ret < 0)
892 		{
893 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
894 			    PCAP_ERRBUF_SIZE, -ret,
895 			    "dpdk error: rte_eth_tx_queue_setup:port=%u",
896 			    portid);
897 			ret = PCAP_ERROR;
898 			break;
899 		}
900 		// Initialize TX buffers
901 		tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME,
902 				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
903 				rte_eth_dev_socket_id(portid));
904 		if (tx_buffer == NULL)
905 		{
906 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
907 			    "dpdk error: Cannot allocate buffer for tx on port %u", portid);
908 			ret = PCAP_ERROR;
909 			break;
910 		}
911 		rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST);
912 		// Start device
913 		ret = rte_eth_dev_start(portid);
914 		if (ret < 0)
915 		{
916 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
917 			    PCAP_ERRBUF_SIZE, -ret,
918 			    "dpdk error: rte_eth_dev_start:port=%u",
919 			    portid);
920 			ret = PCAP_ERROR;
921 			break;
922 		}
923 		// set promiscuous mode
924 		if (p->opt.promisc){
925 			pd->must_clear_promisc=1;
926 			rte_eth_promiscuous_enable(portid);
927 		}
928 		// check link status
929 		is_port_up = check_link_status(portid, &link);
930 		if (!is_port_up){
931 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
932 			    "dpdk error: link is down, port=%u",portid);
933 			ret = PCAP_ERROR_IFACE_NOT_UP;
934 			break;
935 		}
936 		// reset statistics
937 		rte_eth_stats_reset(pd->portid);
938 		calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts));
939 		rte_eth_stats_get(pd->portid,&(pd->prev_stats));
940 		// format pcap_t
941 		pd->portid = portid;
942 		p->fd = pd->portid;
943 		if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN)
944 		{
945 			p->snapshot = MAXIMUM_SNAPLEN;
946 		}
947 		p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical.
948 		p->selectable_fd = p->fd;
949 		p->read_op = pcap_dpdk_dispatch;
950 		p->inject_op = pcap_dpdk_inject;
951 		// using pcap_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter.
952 		p->setfilter_op = install_bpf_program;
953 		p->setdirection_op = NULL;
954 		p->set_datalink_op = NULL;
955 		p->getnonblock_op = pcap_dpdk_getnonblock;
956 		p->setnonblock_op = pcap_dpdk_setnonblock;
957 		p->stats_op = pcap_dpdk_stats;
958 		p->cleanup_op = pcap_dpdk_close;
959 		p->breakloop_op = pcap_breakloop_common;
960 		// set default timeout
961 		pd->required_select_timeout.tv_sec = 0;
962 		pd->required_select_timeout.tv_usec = DPDK_DEF_MIN_SLEEP_MS*1000;
963 		p->required_select_timeout = &pd->required_select_timeout;
964 		ret = 0; // OK
965 	}while(0);
966 
967 	if (ret <= PCAP_ERROR) // all kinds of error code
968 	{
969 		pcap_cleanup_live_common(p);
970 	}else{
971 		rte_eth_dev_get_name_by_port(portid,pd->pci_addr);
972 		RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr);
973 		RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n",
974 							portid, link.link_speed,
975 					(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
976 						("full-duplex") : ("half-duplex\n"));
977 	}
978 	return ret;
979 }
980 
981 // device name for dpdk should be in the form as dpdk:number, such as dpdk:0
982 pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours)
983 {
984 	pcap_t *p=NULL;
985 	*is_ours = 0;
986 
987 	*is_ours = !strncmp(device, "dpdk:", 5);
988 	if (! *is_ours)
989 		return NULL;
990 	//memset will happen
991 	p = PCAP_CREATE_COMMON(ebuf, struct pcap_dpdk);
992 
993 	if (p == NULL)
994 		return NULL;
995 	p->activate_op = pcap_dpdk_activate;
996 	return p;
997 }
998 
999 int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf)
1000 {
1001 	int ret=0;
1002 	unsigned int nb_ports = 0;
1003 	char dpdk_name[DPDK_DEV_NAME_MAX];
1004 	char dpdk_desc[DPDK_DEV_DESC_MAX];
1005 	ETHER_ADDR_TYPE eth_addr;
1006 	char mac_addr[DPDK_MAC_ADDR_SIZE];
1007 	char pci_addr[DPDK_PCI_ADDR_SIZE];
1008 	do{
1009 		// init EAL; return "DPDK not available" if we
1010 		// have insufficient permission
1011 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
1012 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 1);
1013 		if (ret < 0)
1014 		{
1015 			// This returns a negative value on an error.
1016 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
1017 			    "Can't look for DPDK devices: %s",
1018 			    dpdk_pre_init_errbuf);
1019 			ret = PCAP_ERROR;
1020 			break;
1021 		}
1022 		if (ret == 0)
1023 		{
1024 			// This means DPDK isn't available on this machine.
1025 			// That just means "don't return any devices".
1026 			break;
1027 		}
1028 		nb_ports = rte_eth_dev_count_avail();
1029 		if (nb_ports == 0)
1030 		{
1031 			// That just means "don't return any devices".
1032 			ret = 0;
1033 			break;
1034 		}
1035 		for (unsigned int i=0; i<nb_ports; i++){
1036 			snprintf(dpdk_name, DPDK_DEV_NAME_MAX-1,
1037 			    "%s%u", DPDK_PREFIX, i);
1038 			// mac addr
1039 			rte_eth_macaddr_get(i, &eth_addr);
1040 			eth_addr_str(&eth_addr,mac_addr,DPDK_MAC_ADDR_SIZE);
1041 			// PCI addr
1042 			rte_eth_dev_get_name_by_port(i,pci_addr);
1043 			snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr);
1044 			if (add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){
1045 				ret = PCAP_ERROR;
1046 				break;
1047 			}
1048 		}
1049 	}while(0);
1050 	return ret;
1051 }
1052 
1053 #ifdef DPDK_ONLY
1054 /*
1055  * This libpcap build supports only DPDK, not regular network interfaces.
1056  */
1057 
1058 /*
1059  * There are no regular interfaces, just DPDK interfaces.
1060  */
1061 int
1062 pcap_platform_finddevs(pcap_if_list_t *devlistp _U_, char *errbuf)
1063 {
1064 	return (0);
1065 }
1066 
1067 /*
1068  * Attempts to open a regular interface fail.
1069  */
1070 pcap_t *
1071 pcap_create_interface(const char *device, char *errbuf)
1072 {
1073 	snprintf(errbuf, PCAP_ERRBUF_SIZE,
1074 	    "This version of libpcap only supports DPDK");
1075 	return NULL;
1076 }
1077 
1078 /*
1079  * Libpcap version string.
1080  */
1081 const char *
1082 pcap_lib_version(void)
1083 {
1084 	return (PCAP_VERSION_STRING " (DPDK-only)");
1085 }
1086 #endif
1087