1 /* 2 * Copyright (c) 2017 Pure Storage, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. The name of the author may not be used to endorse or promote 15 * products derived from this software without specific prior written 16 * permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include <config.h> 32 33 #include "pcap-int.h" 34 #include "pcap-rdmasniff.h" 35 36 #include <infiniband/verbs.h> 37 #include <stdlib.h> 38 #include <string.h> 39 #include <limits.h> /* for INT_MAX */ 40 #include <sys/time.h> 41 42 #if !defined(IBV_FLOW_ATTR_SNIFFER) 43 #define IBV_FLOW_ATTR_SNIFFER 3 44 #endif 45 46 static const int RDMASNIFF_NUM_RECEIVES = 128; 47 static const int RDMASNIFF_RECEIVE_SIZE = 10000; 48 49 struct pcap_rdmasniff { 50 struct ibv_device * rdma_device; 51 struct ibv_context * context; 52 struct ibv_comp_channel * channel; 53 struct ibv_pd * pd; 54 struct ibv_cq * cq; 55 struct ibv_qp * qp; 56 struct ibv_flow * flow; 57 struct ibv_mr * mr; 58 u_char * oneshot_buffer; 59 unsigned long port_num; 60 int cq_event; 61 u_int packets_recv; 62 }; 63 64 static int 65 rdmasniff_stats(pcap_t *handle, struct pcap_stat *stat) 66 { 67 struct pcap_rdmasniff *priv = handle->priv; 68 69 stat->ps_recv = priv->packets_recv; 70 stat->ps_drop = 0; 71 stat->ps_ifdrop = 0; 72 73 return 0; 74 } 75 76 static void 77 rdmasniff_cleanup(pcap_t *handle) 78 { 79 struct pcap_rdmasniff *priv = handle->priv; 80 81 ibv_dereg_mr(priv->mr); 82 ibv_destroy_flow(priv->flow); 83 ibv_destroy_qp(priv->qp); 84 ibv_destroy_cq(priv->cq); 85 ibv_dealloc_pd(priv->pd); 86 ibv_destroy_comp_channel(priv->channel); 87 ibv_close_device(priv->context); 88 free(priv->oneshot_buffer); 89 90 pcapint_cleanup_live_common(handle); 91 } 92 93 static void 94 rdmasniff_post_recv(pcap_t *handle, uint64_t wr_id) 95 { 96 struct pcap_rdmasniff *priv = handle->priv; 97 struct ibv_sge sg_entry; 98 struct ibv_recv_wr wr, *bad_wr; 99 100 sg_entry.length = RDMASNIFF_RECEIVE_SIZE; 101 sg_entry.addr = (uintptr_t) handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id; 102 sg_entry.lkey = priv->mr->lkey; 103 104 wr.wr_id = wr_id; 105 wr.num_sge = 1; 106 wr.sg_list = &sg_entry; 107 wr.next = NULL; 108 109 ibv_post_recv(priv->qp, &wr, &bad_wr); 110 } 111 112 static int 113 rdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user) 114 { 115 struct pcap_rdmasniff *priv = handle->priv; 116 struct ibv_cq *ev_cq; 117 void *ev_ctx; 118 struct ibv_wc wc; 119 struct pcap_pkthdr pkth; 120 u_char *pktd; 121 int count = 0; 122 123 if (!priv->cq_event) { 124 while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) { 125 if (errno != EINTR) { 126 return PCAP_ERROR; 127 } 128 if (handle->break_loop) { 129 handle->break_loop = 0; 130 return PCAP_ERROR_BREAK; 131 } 132 } 133 ibv_ack_cq_events(priv->cq, 1); 134 ibv_req_notify_cq(priv->cq, 0); 135 priv->cq_event = 1; 136 } 137 138 /* 139 * This can conceivably process more than INT_MAX packets, 140 * which would overflow the packet count, causing it either 141 * to look like a negative number, and thus cause us to 142 * return a value that looks like an error, or overflow 143 * back into positive territory, and thus cause us to 144 * return a too-low count. 145 * 146 * Therefore, if the packet count is unlimited, we clip 147 * it at INT_MAX; this routine is not expected to 148 * process packets indefinitely, so that's not an issue. 149 */ 150 if (PACKET_COUNT_IS_UNLIMITED(max_packets)) 151 max_packets = INT_MAX; 152 153 while (count < max_packets) { 154 if (ibv_poll_cq(priv->cq, 1, &wc) != 1) { 155 priv->cq_event = 0; 156 break; 157 } 158 159 if (wc.status != IBV_WC_SUCCESS) { 160 fprintf(stderr, "failed WC wr_id %" PRIu64 " status %d/%s\n", 161 wc.wr_id, 162 wc.status, ibv_wc_status_str(wc.status)); 163 continue; 164 } 165 166 pkth.len = wc.byte_len; 167 pkth.caplen = min(pkth.len, (u_int)handle->snapshot); 168 gettimeofday(&pkth.ts, NULL); 169 170 pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE; 171 172 if (handle->fcode.bf_insns == NULL || 173 pcapint_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) { 174 callback(user, &pkth, pktd); 175 ++priv->packets_recv; 176 ++count; 177 } 178 179 rdmasniff_post_recv(handle, wc.wr_id); 180 181 if (handle->break_loop) { 182 handle->break_loop = 0; 183 return PCAP_ERROR_BREAK; 184 } 185 } 186 187 return count; 188 } 189 190 static void 191 rdmasniff_oneshot(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes) 192 { 193 struct oneshot_userdata *sp = (struct oneshot_userdata *) user; 194 pcap_t *handle = sp->pd; 195 struct pcap_rdmasniff *priv = handle->priv; 196 197 *sp->hdr = *h; 198 memcpy(priv->oneshot_buffer, bytes, h->caplen); 199 *sp->pkt = priv->oneshot_buffer; 200 } 201 202 static int 203 rdmasniff_activate(pcap_t *handle) 204 { 205 struct pcap_rdmasniff *priv = handle->priv; 206 struct ibv_qp_init_attr qp_init_attr; 207 struct ibv_qp_attr qp_attr; 208 struct ibv_flow_attr flow_attr; 209 struct ibv_port_attr port_attr; 210 int i; 211 212 priv->context = ibv_open_device(priv->rdma_device); 213 if (!priv->context) { 214 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 215 "Failed to open device %s", handle->opt.device); 216 goto error; 217 } 218 219 priv->pd = ibv_alloc_pd(priv->context); 220 if (!priv->pd) { 221 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 222 "Failed to alloc PD for device %s", handle->opt.device); 223 goto error; 224 } 225 226 priv->channel = ibv_create_comp_channel(priv->context); 227 if (!priv->channel) { 228 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 229 "Failed to create comp channel for device %s", handle->opt.device); 230 goto error; 231 } 232 233 priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES, 234 NULL, priv->channel, 0); 235 if (!priv->cq) { 236 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 237 "Failed to create CQ for device %s", handle->opt.device); 238 goto error; 239 } 240 241 ibv_req_notify_cq(priv->cq, 0); 242 243 memset(&qp_init_attr, 0, sizeof qp_init_attr); 244 qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq; 245 qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES; 246 qp_init_attr.cap.max_recv_sge = 1; 247 qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; 248 priv->qp = ibv_create_qp(priv->pd, &qp_init_attr); 249 if (!priv->qp) { 250 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 251 "Failed to create QP for device %s", handle->opt.device); 252 goto error; 253 } 254 255 memset(&qp_attr, 0, sizeof qp_attr); 256 qp_attr.qp_state = IBV_QPS_INIT; 257 qp_attr.port_num = priv->port_num; 258 if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) { 259 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 260 "Failed to modify QP to INIT for device %s", handle->opt.device); 261 goto error; 262 } 263 264 memset(&qp_attr, 0, sizeof qp_attr); 265 qp_attr.qp_state = IBV_QPS_RTR; 266 if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) { 267 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 268 "Failed to modify QP to RTR for device %s", handle->opt.device); 269 goto error; 270 } 271 272 memset(&flow_attr, 0, sizeof flow_attr); 273 flow_attr.type = IBV_FLOW_ATTR_SNIFFER; 274 flow_attr.size = sizeof flow_attr; 275 flow_attr.port = priv->port_num; 276 priv->flow = ibv_create_flow(priv->qp, &flow_attr); 277 if (!priv->flow) { 278 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 279 "Failed to create flow for device %s", handle->opt.device); 280 goto error; 281 } 282 283 handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE; 284 handle->buffer = malloc(handle->bufsize); 285 if (!handle->buffer) { 286 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 287 "Failed to allocate receive buffer for device %s", handle->opt.device); 288 goto error; 289 } 290 291 priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE); 292 if (!priv->oneshot_buffer) { 293 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 294 "Failed to allocate oneshot buffer for device %s", handle->opt.device); 295 goto error; 296 } 297 298 priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE); 299 if (!priv->mr) { 300 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 301 "Failed to register MR for device %s", handle->opt.device); 302 goto error; 303 } 304 305 306 for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) { 307 rdmasniff_post_recv(handle, i); 308 } 309 310 if (!ibv_query_port(priv->context, priv->port_num, &port_attr) && 311 port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { 312 handle->linktype = DLT_INFINIBAND; 313 } else { 314 handle->linktype = DLT_EN10MB; 315 } 316 317 if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE) 318 handle->snapshot = RDMASNIFF_RECEIVE_SIZE; 319 320 handle->offset = 0; 321 handle->read_op = rdmasniff_read; 322 handle->stats_op = rdmasniff_stats; 323 handle->cleanup_op = rdmasniff_cleanup; 324 handle->setfilter_op = pcapint_install_bpf_program; 325 handle->setdirection_op = NULL; 326 handle->set_datalink_op = NULL; 327 handle->getnonblock_op = pcapint_getnonblock_fd; 328 handle->setnonblock_op = pcapint_setnonblock_fd; 329 handle->oneshot_callback = rdmasniff_oneshot; 330 handle->selectable_fd = priv->channel->fd; 331 332 return 0; 333 334 error: 335 if (priv->mr) { 336 ibv_dereg_mr(priv->mr); 337 } 338 339 if (priv->flow) { 340 ibv_destroy_flow(priv->flow); 341 } 342 343 if (priv->qp) { 344 ibv_destroy_qp(priv->qp); 345 } 346 347 if (priv->cq) { 348 ibv_destroy_cq(priv->cq); 349 } 350 351 if (priv->channel) { 352 ibv_destroy_comp_channel(priv->channel); 353 } 354 355 if (priv->pd) { 356 ibv_dealloc_pd(priv->pd); 357 } 358 359 if (priv->context) { 360 ibv_close_device(priv->context); 361 } 362 363 if (priv->oneshot_buffer) { 364 free(priv->oneshot_buffer); 365 } 366 367 return PCAP_ERROR; 368 } 369 370 pcap_t * 371 rdmasniff_create(const char *device, char *ebuf, int *is_ours) 372 { 373 struct pcap_rdmasniff *priv; 374 struct ibv_device **dev_list; 375 int numdev; 376 size_t namelen; 377 const char *port; 378 unsigned long port_num; 379 int i; 380 pcap_t *p = NULL; 381 382 *is_ours = 0; 383 384 dev_list = ibv_get_device_list(&numdev); 385 if (!dev_list) { 386 return NULL; 387 } 388 if (!numdev) { 389 ibv_free_device_list(dev_list); 390 return NULL; 391 } 392 393 namelen = strlen(device); 394 395 port = strchr(device, ':'); 396 if (port) { 397 port_num = strtoul(port + 1, NULL, 10); 398 if (port_num > 0) { 399 namelen = port - device; 400 } else { 401 port_num = 1; 402 } 403 } else { 404 port_num = 1; 405 } 406 407 for (i = 0; i < numdev; ++i) { 408 if (strlen(dev_list[i]->name) == namelen && 409 !strncmp(device, dev_list[i]->name, namelen)) { 410 *is_ours = 1; 411 412 p = PCAP_CREATE_COMMON(ebuf, struct pcap_rdmasniff); 413 if (p) { 414 p->activate_op = rdmasniff_activate; 415 priv = p->priv; 416 priv->rdma_device = dev_list[i]; 417 priv->port_num = port_num; 418 } 419 break; 420 } 421 } 422 423 ibv_free_device_list(dev_list); 424 return p; 425 } 426 427 int 428 rdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str) 429 { 430 struct ibv_device **dev_list; 431 int numdev; 432 int i; 433 int ret = 0; 434 435 dev_list = ibv_get_device_list(&numdev); 436 if (!dev_list) { 437 return 0; 438 } 439 440 for (i = 0; i < numdev; ++i) { 441 /* 442 * XXX - do the notions of "up", "running", or 443 * "connected" apply here? 444 */ 445 if (!pcapint_add_dev(devlistp, dev_list[i]->name, 0, "RDMA sniffer", err_str)) { 446 ret = -1; 447 break; 448 } 449 } 450 451 ibv_free_device_list(dev_list); 452 return ret; 453 } 454