xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision 4f52dfbb8d6c4d446500c5b097e3806ec219fbd4)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/slab.h>
40 #include <linux/err.h>
41 #include <linux/string.h>
42 #include <linux/list.h>
43 #include <linux/in.h>
44 #include <linux/device.h>
45 #include <linux/pci.h>
46 #include <linux/sched.h>
47 #include <linux/wait.h>
48 
49 #include <asm/atomic.h>
50 
51 #include <rdma/ib_verbs.h>
52 #include <rdma/rdma_cm.h>
53 
54 #include "krping.h"
55 #include "getopt.h"
56 
/* Prefix placed in front of every krping printk message. */
#define PFX "krping: "

/* Debug switch defined outside this file; non-zero enables DEBUG_LOG output. */
extern int krping_debug;
/* Emit a LOG_INFO message, but only while krping_debug is non-zero. */
#define DEBUG_LOG(...) do { if (krping_debug) log(LOG_INFO, __VA_ARGS__); } while (0)
/* NOTE(review): not referenced in the visible part of this file — confirm it is still needed. */
#define BIND_INFO 1
62 
/* Module metadata: author, description, license, version and linuxkpi dependency. */
MODULE_AUTHOR("Steve Wise");
MODULE_DESCRIPTION("RDMA ping server");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(krping, 1);
MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
68 
/*
 * Sample the CPU time-stamp counter using the x86 "rdtsc" instruction.
 * The instruction delivers the counter in two 32-bit halves (EAX = low,
 * EDX = high); they are merged into a single 64-bit value here.
 */
static __inline uint64_t
get_cycles(void)
{
	uint32_t tsc_lo, tsc_hi;

	__asm __volatile("rdtsc" : "=a" (tsc_lo), "=d" (tsc_hi));
	return (((uint64_t)tsc_hi << 32) | tsc_lo);
}

/* Type of one cycle-counter sample as returned by get_cycles(). */
typedef uint64_t cycles_t;
78 
/*
 * Memory registration mode selectors.
 * NOTE(review): no code in the visible part of this file references these
 * values — confirm whether they are still used.
 */
enum mem_type {
	DMA = 1,
	REG = 2,
};
83 
/*
 * Table of recognised command options.  Each entry holds the option
 * keyword, the kind of argument it takes (OPT_INT, OPT_STRING or
 * OPT_NOPARAM) and a one-character code identifying the option.  The
 * table is terminated by an all-zero entry.
 * NOTE(review): the parser consuming this table is outside this chunk;
 * presumably the one-character code is what it hands back — confirm.
 */
static const struct krping_option krping_opts[] = {
	{"count", OPT_INT, 'C'},
	{"size", OPT_INT, 'S'},
	{"addr", OPT_STRING, 'a'},
	{"addr6", OPT_STRING, 'A'},
	{"port", OPT_INT, 'p'},
	{"verbose", OPT_NOPARAM, 'v'},
	{"validate", OPT_NOPARAM, 'V'},
	{"server", OPT_NOPARAM, 's'},
	{"client", OPT_NOPARAM, 'c'},
	{"server_inv", OPT_NOPARAM, 'I'},
 	{"wlat", OPT_NOPARAM, 'l'},
 	{"rlat", OPT_NOPARAM, 'L'},
 	{"bw", OPT_NOPARAM, 'B'},
 	{"duplex", OPT_NOPARAM, 'd'},
 	{"txdepth", OPT_INT, 'T'},
 	{"poll", OPT_NOPARAM, 'P'},
 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
 	{"read_inv", OPT_NOPARAM, 'R'},
 	{"fr", OPT_NOPARAM, 'f'},
	{NULL, 0, 0}	/* end-of-table sentinel */
};
106 
/*
 * 64-bit host <-> network (big-endian) byte order conversions.
 * htonll() converts host order to network order; ntohll() converts
 * network order back to host order and therefore uses the be64-to-cpu
 * direction rather than repeating the cpu-to-be64 one.
 */
#define htonll(x) cpu_to_be64((x))
#define ntohll(x) be64_to_cpu((x))
109 
/* NOTE(review): presumably guards krping_cbs; its users are outside this chunk — confirm. */
static DEFINE_MUTEX(krping_mutex);

/*
 * List of running krping threads.
 */
static LIST_HEAD(krping_cbs);
116 
117 /*
118  * Invoke like this, one on each side, using the server's address on
119  * the RDMA device (iw%d):
120  *
121  * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping
122  * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping
123  * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping
124  *
125  * krping "ping/pong" loop:
126  * 	client sends source rkey/addr/len
 *	server receives source rkey/addr/len
128  *	server rdma reads "ping" data from source
129  * 	server sends "go ahead" on rdma read completion
130  *	client sends sink rkey/addr/len
131  * 	server receives sink rkey/addr/len
132  * 	server rdma writes "pong" data to sink
133  * 	server sends "go ahead" on rdma write completion
134  * 	<repeat loop>
135  */
136 
137 /*
138  * These states are used to signal events between the completion handler
139  * and the main client or server thread.
140  *
141  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
142  * and RDMA_WRITE_COMPLETE for each ping.
143  */
/*
 * The numeric order of these values matters: waiters in this file block on
 * conditions such as "cb->state >= CONNECTED" and server_recv() tests
 * "cb->state <= CONNECTED".  ERROR is the largest value, so setting it
 * satisfies every ">=" wait condition.
 */
enum test_state {
	IDLE = 1,
	CONNECT_REQUEST,	/* set on RDMA_CM_EVENT_CONNECT_REQUEST */
	ADDR_RESOLVED,		/* set on RDMA_CM_EVENT_ADDR_RESOLVED */
	ROUTE_RESOLVED,		/* set on RDMA_CM_EVENT_ROUTE_RESOLVED */
	CONNECTED,		/* set on RDMA_CM_EVENT_ESTABLISHED (client side only) */
	RDMA_READ_ADV,
	RDMA_READ_COMPLETE,	/* set on an IB_WC_RDMA_READ completion */
	RDMA_WRITE_ADV,
	RDMA_WRITE_COMPLETE,	/* set on an IB_WC_RDMA_WRITE completion */
	ERROR			/* CM error/disconnect or failed completion */
};
156 
/*
 * Buffer advertisement carried in the send/recv messages.  The sender
 * fills the fields with htonll()/htonl() and the receiver decodes them
 * with ntohll()/ntohl(), so all three are in network byte order.
 */
struct krping_rdma_info {
	uint64_t buf;	/* address of the advertised buffer */
	uint32_t rkey;	/* rkey to use for RDMA access to that buffer */
	uint32_t size;	/* length of the advertised buffer */
};
162 
/*
 * Default max buffer size for IO...
 *
 * The expression is parenthesized so the macro expands to a single
 * operand in every context (e.g. "x % RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128*1024)
/* Default send queue depth. */
#define RPING_SQ_DEPTH 64
168 
/*
 * Control block struct.
 *
 * Holds the complete state of one krping run: the verbs objects, the
 * work requests and buffers used for the ping/pong exchange, the state
 * shared with the event handlers, and the parsed test options.
 */
struct krping_cb {
	int server;			/* 0 iff client */
	struct ib_cq *cq;		/* CQ used for both send and recv */
	struct ib_pd *pd;		/* protection domain */
	struct ib_qp *qp;		/* QP created through rdma_create_qp() */

	struct ib_mr *dma_mr;

	struct ib_fast_reg_page_list *page_list;
	int page_list_len;		/* max pages passed to ib_alloc_mr() */
	struct ib_reg_wr reg_mr_wr;	/* IB_WR_REG_MR work request */
	struct ib_send_wr invalidate_wr; /* LOCAL_INV, chained to reg_mr_wr */
	struct ib_mr *reg_mr;		/* MR allocated with ib_alloc_mr() */
	int server_invalidate;		/* server replies with SEND_WITH_INV */
	int read_inv;			/* server uses RDMA_READ_WITH_INV */
	u8 key;				/* bumped for each ib_update_fast_reg_key() */

	struct ib_recv_wr rq_wr;	/* recv work request record */
	struct ib_sge recv_sgl;		/* recv single SGE */
	struct krping_rdma_info recv_buf __aligned(16);	/* recv buffer, embedded in the cb */
	u64 recv_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(recv_mapping)

	struct ib_send_wr sq_wr;	/* send work request record */
	struct ib_sge send_sgl;
	struct krping_rdma_info send_buf __aligned(16); /* single send buf */
	u64 send_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(send_mapping)

	struct ib_rdma_wr rdma_sq_wr;	/* rdma work request record */
	struct ib_sge rdma_sgl;		/* rdma single SGE */
	char *rdma_buf;			/* used as rdma sink */
	u64  rdma_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
	struct ib_mr *rdma_mr;

	uint32_t remote_rkey;		/* remote guys RKEY */
	uint64_t remote_addr;		/* remote guys TO */
	uint32_t remote_len;		/* remote guys LEN */

	char *start_buf;		/* rdma read src */
	u64  start_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(start_mapping)
	struct ib_mr *start_mr;

	enum test_state state;		/* used for cond/signalling */
	wait_queue_head_t sem;		/* woken by the CM and CQ handlers */
	struct krping_stats stats;

	uint16_t port;			/* dst port in NBO */
	u8 addr[16] __aligned(8);	/* dst addr in NBO */
	char *addr_str;			/* dst addr string */
	uint8_t addr_type;		/* ADDR_FAMILY - IPv4/V6 */
	int verbose;			/* verbose logging */
	int count;			/* ping count */
	int size;			/* ping data size */
	int validate;			/* validate ping data */
	int wlat;			/* run wlat test */
	int rlat;			/* run rlat test */
	int bw;				/* run bw test */
	int duplex;			/* run bw full duplex test */
	int poll;			/* poll or block for rlat test */
	int txdepth;			/* SQ depth */
	int local_dma_lkey;		/* use 0 for lkey */
	int frtest;			/* reg test */

	/* CM stuff */
	struct rdma_cm_id *cm_id;	/* connection on client side,*/
					/* listener on server side. */
	struct rdma_cm_id *child_cm_id;	/* connection on server side */
	struct list_head list;
};
244 
245 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
246 				   struct rdma_cm_event *event)
247 {
248 	int ret;
249 	struct krping_cb *cb = cma_id->context;
250 
251 	DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
252 		  (cma_id == cb->cm_id) ? "parent" : "child");
253 
254 	switch (event->event) {
255 	case RDMA_CM_EVENT_ADDR_RESOLVED:
256 		cb->state = ADDR_RESOLVED;
257 		ret = rdma_resolve_route(cma_id, 2000);
258 		if (ret) {
259 			printk(KERN_ERR PFX "rdma_resolve_route error %d\n",
260 			       ret);
261 			wake_up_interruptible(&cb->sem);
262 		}
263 		break;
264 
265 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
266 		cb->state = ROUTE_RESOLVED;
267 		wake_up_interruptible(&cb->sem);
268 		break;
269 
270 	case RDMA_CM_EVENT_CONNECT_REQUEST:
271 		cb->state = CONNECT_REQUEST;
272 		cb->child_cm_id = cma_id;
273 		DEBUG_LOG("child cma %p\n", cb->child_cm_id);
274 		wake_up_interruptible(&cb->sem);
275 		break;
276 
277 	case RDMA_CM_EVENT_ESTABLISHED:
278 		DEBUG_LOG("ESTABLISHED\n");
279 		if (!cb->server) {
280 			cb->state = CONNECTED;
281 		}
282 		wake_up_interruptible(&cb->sem);
283 		break;
284 
285 	case RDMA_CM_EVENT_ADDR_ERROR:
286 	case RDMA_CM_EVENT_ROUTE_ERROR:
287 	case RDMA_CM_EVENT_CONNECT_ERROR:
288 	case RDMA_CM_EVENT_UNREACHABLE:
289 	case RDMA_CM_EVENT_REJECTED:
290 		printk(KERN_ERR PFX "cma event %d, error %d\n", event->event,
291 		       event->status);
292 		cb->state = ERROR;
293 		wake_up_interruptible(&cb->sem);
294 		break;
295 
296 	case RDMA_CM_EVENT_DISCONNECTED:
297 		printk(KERN_ERR PFX "DISCONNECT EVENT...\n");
298 		cb->state = ERROR;
299 		wake_up_interruptible(&cb->sem);
300 		break;
301 
302 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
303 		printk(KERN_ERR PFX "cma detected device removal!!!!\n");
304 		cb->state = ERROR;
305 		wake_up_interruptible(&cb->sem);
306 		break;
307 
308 	default:
309 		printk(KERN_ERR PFX "oof bad type!\n");
310 		wake_up_interruptible(&cb->sem);
311 		break;
312 	}
313 	return 0;
314 }
315 
316 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
317 {
318 	if (wc->byte_len != sizeof(cb->recv_buf)) {
319 		printk(KERN_ERR PFX "Received bogus data, size %d\n",
320 		       wc->byte_len);
321 		return -1;
322 	}
323 
324 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
325 	cb->remote_addr = ntohll(cb->recv_buf.buf);
326 	cb->remote_len  = ntohl(cb->recv_buf.size);
327 	DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n",
328 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
329 		  cb->remote_len);
330 
331 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
332 		cb->state = RDMA_READ_ADV;
333 	else
334 		cb->state = RDMA_WRITE_ADV;
335 
336 	return 0;
337 }
338 
339 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
340 {
341 	if (wc->byte_len != sizeof(cb->recv_buf)) {
342 		printk(KERN_ERR PFX "Received bogus data, size %d\n",
343 		       wc->byte_len);
344 		return -1;
345 	}
346 
347 	if (cb->state == RDMA_READ_ADV)
348 		cb->state = RDMA_WRITE_ADV;
349 	else
350 		cb->state = RDMA_WRITE_COMPLETE;
351 
352 	return 0;
353 }
354 
/*
 * Completion queue callback.
 *
 * ctx is the krping_cb that owns cq.  Drains the CQ one completion at a
 * time, updates the statistics and cb->state according to the completed
 * opcode, and wakes any thread sleeping on cb->sem.  Any failed
 * completion (other than a flush), unexpected opcode, receive processing
 * error or poll error puts the control block into the ERROR state.
 */
static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
	struct krping_cb *cb = ctx;
	struct ib_wc wc;
	struct ib_recv_wr *bad_wr;
	int ret;

	BUG_ON(cb->cq != cq);
	if (cb->state == ERROR) {
		printk(KERN_ERR PFX "cq completion in ERROR state\n");
		return;
	}
	if (cb->frtest) {
		printk(KERN_ERR PFX "cq completion event in frtest!\n");
		return;
	}
	/* Re-arm the CQ before polling, except in the wlat/rlat/bw tests. */
	if (!cb->wlat && !cb->rlat && !cb->bw)
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
		if (wc.status) {
			/* Flushed work requests are skipped, not treated as errors. */
			if (wc.status == IB_WC_WR_FLUSH_ERR) {
				DEBUG_LOG("cq flushed\n");
				continue;
			} else {
				printk(KERN_ERR PFX "cq completion failed with "
				       "wr_id %jx status %d opcode %d vender_err %x\n",
					(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
				goto error;
			}
		}

		switch (wc.opcode) {
		case IB_WC_SEND:
			/* Statistics only; nobody is woken for a send completion. */
			DEBUG_LOG("send completion\n");
			cb->stats.send_bytes += cb->send_sgl.length;
			cb->stats.send_msgs++;
			break;

		case IB_WC_RDMA_WRITE:
			DEBUG_LOG("rdma write completion\n");
			cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length;
			cb->stats.write_msgs++;
			cb->state = RDMA_WRITE_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RDMA_READ:
			DEBUG_LOG("rdma read completion\n");
			cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length;
			cb->stats.read_msgs++;
			cb->state = RDMA_READ_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RECV:
			DEBUG_LOG("recv completion\n");
			cb->stats.recv_bytes += sizeof(cb->recv_buf);
			cb->stats.recv_msgs++;
			/*
			 * In the wlat/rlat/bw tests both sides decode the
			 * message as a buffer advertisement; otherwise the
			 * role decides how it is interpreted.
			 */
			if (cb->wlat || cb->rlat || cb->bw)
				ret = server_recv(cb, &wc);
			else
				ret = cb->server ? server_recv(cb, &wc) :
						   client_recv(cb, &wc);
			if (ret) {
				printk(KERN_ERR PFX "recv wc error: %d\n", ret);
				goto error;
			}

			/* Repost the receive before waking the test thread. */
			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
			if (ret) {
				printk(KERN_ERR PFX "post recv error: %d\n",
				       ret);
				goto error;
			}
			wake_up_interruptible(&cb->sem);
			break;

		default:
			printk(KERN_ERR PFX
			       "%s:%d Unexpected opcode %d, Shutting down\n",
			       __func__, __LINE__, wc.opcode);
			goto error;
		}
	}
	/* The loop ends with ret == 0 (CQ empty) or a poll error. */
	if (ret) {
		printk(KERN_ERR PFX "poll error %d\n", ret);
		goto error;
	}
	return;
error:
	cb->state = ERROR;
	wake_up_interruptible(&cb->sem);
}
448 
449 static int krping_accept(struct krping_cb *cb)
450 {
451 	struct rdma_conn_param conn_param;
452 	int ret;
453 
454 	DEBUG_LOG("accepting client connection request\n");
455 
456 	memset(&conn_param, 0, sizeof conn_param);
457 	conn_param.responder_resources = 1;
458 	conn_param.initiator_depth = 1;
459 
460 	ret = rdma_accept(cb->child_cm_id, &conn_param);
461 	if (ret) {
462 		printk(KERN_ERR PFX "rdma_accept error: %d\n", ret);
463 		return ret;
464 	}
465 
466 	if (!cb->wlat && !cb->rlat && !cb->bw) {
467 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
468 		if (cb->state == ERROR) {
469 			printk(KERN_ERR PFX "wait for CONNECTED state %d\n",
470 				cb->state);
471 			return -1;
472 		}
473 	}
474 	return 0;
475 }
476 
477 static void krping_setup_wr(struct krping_cb *cb)
478 {
479 	cb->recv_sgl.addr = cb->recv_dma_addr;
480 	cb->recv_sgl.length = sizeof cb->recv_buf;
481 	cb->recv_sgl.lkey = cb->pd->local_dma_lkey;
482 	cb->rq_wr.sg_list = &cb->recv_sgl;
483 	cb->rq_wr.num_sge = 1;
484 
485 	cb->send_sgl.addr = cb->send_dma_addr;
486 	cb->send_sgl.length = sizeof cb->send_buf;
487 	cb->send_sgl.lkey = cb->pd->local_dma_lkey;
488 
489 	cb->sq_wr.opcode = IB_WR_SEND;
490 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
491 	cb->sq_wr.sg_list = &cb->send_sgl;
492 	cb->sq_wr.num_sge = 1;
493 
494 	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
495 		cb->rdma_sgl.addr = cb->rdma_dma_addr;
496 		cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED;
497 		cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl;
498 		cb->rdma_sq_wr.wr.num_sge = 1;
499 	}
500 
501 	/*
502 	 * A chain of 2 WRs, INVALDATE_MR + REG_MR.
503 	 * both unsignaled.  The client uses them to reregister
504 	 * the rdma buffers with a new key each iteration.
505 	 */
506 	cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR;
507 	cb->reg_mr_wr.mr = cb->reg_mr;
508 
509 	cb->invalidate_wr.next = &cb->reg_mr_wr.wr;
510 	cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
511 }
512 
513 static int krping_setup_buffers(struct krping_cb *cb)
514 {
515 	int ret;
516 
517 	DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
518 
519 	cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
520 				   &cb->recv_buf,
521 				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
522 	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
523 	cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
524 					   &cb->send_buf, sizeof(cb->send_buf),
525 					   DMA_BIDIRECTIONAL);
526 	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
527 
528 	cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
529 					     &cb->rdma_dma_addr,
530 					     GFP_KERNEL);
531 	if (!cb->rdma_buf) {
532 		DEBUG_LOG(PFX "rdma_buf allocation failed\n");
533 		ret = -ENOMEM;
534 		goto bail;
535 	}
536 	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
537 	cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE)
538 				>> PAGE_SHIFT;
539 	cb->reg_mr = ib_alloc_mr(cb->pd,  IB_MR_TYPE_MEM_REG,
540 				 cb->page_list_len);
541 	if (IS_ERR(cb->reg_mr)) {
542 		ret = PTR_ERR(cb->reg_mr);
543 		DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret);
544 		goto bail;
545 	}
546 	DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n",
547 		cb->reg_mr->rkey, cb->page_list_len);
548 
549 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
550 
551 		cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
552 						      &cb->start_dma_addr,
553 						      GFP_KERNEL);
554 		if (!cb->start_buf) {
555 			DEBUG_LOG(PFX "start_buf malloc failed\n");
556 			ret = -ENOMEM;
557 			goto bail;
558 		}
559 		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
560 	}
561 
562 	krping_setup_wr(cb);
563 	DEBUG_LOG(PFX "allocated & registered buffers...\n");
564 	return 0;
565 bail:
566 	if (cb->reg_mr && !IS_ERR(cb->reg_mr))
567 		ib_dereg_mr(cb->reg_mr);
568 	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
569 		ib_dereg_mr(cb->rdma_mr);
570 	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
571 		ib_dereg_mr(cb->dma_mr);
572 	if (cb->rdma_buf) {
573 		ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
574 				     cb->rdma_dma_addr);
575 	}
576 	if (cb->start_buf) {
577 		ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
578 				     cb->start_dma_addr);
579 	}
580 	return ret;
581 }
582 
583 static void krping_free_buffers(struct krping_cb *cb)
584 {
585 	DEBUG_LOG("krping_free_buffers called on cb %p\n", cb);
586 
587 	if (cb->dma_mr)
588 		ib_dereg_mr(cb->dma_mr);
589 	if (cb->rdma_mr)
590 		ib_dereg_mr(cb->rdma_mr);
591 	if (cb->start_mr)
592 		ib_dereg_mr(cb->start_mr);
593 	if (cb->reg_mr)
594 		ib_dereg_mr(cb->reg_mr);
595 
596 	dma_unmap_single(cb->pd->device->dma_device,
597 			 pci_unmap_addr(cb, recv_mapping),
598 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
599 	dma_unmap_single(cb->pd->device->dma_device,
600 			 pci_unmap_addr(cb, send_mapping),
601 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
602 
603 	ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
604 			     cb->rdma_dma_addr);
605 
606 	if (cb->start_buf) {
607 		ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
608 				     cb->start_dma_addr);
609 	}
610 }
611 
612 static int krping_create_qp(struct krping_cb *cb)
613 {
614 	struct ib_qp_init_attr init_attr;
615 	int ret;
616 
617 	memset(&init_attr, 0, sizeof(init_attr));
618 	init_attr.cap.max_send_wr = cb->txdepth;
619 	init_attr.cap.max_recv_wr = 2;
620 
621 	/* For flush_qp() */
622 	init_attr.cap.max_send_wr++;
623 	init_attr.cap.max_recv_wr++;
624 
625 	init_attr.cap.max_recv_sge = 1;
626 	init_attr.cap.max_send_sge = 1;
627 	init_attr.qp_type = IB_QPT_RC;
628 	init_attr.send_cq = cb->cq;
629 	init_attr.recv_cq = cb->cq;
630 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
631 
632 	if (cb->server) {
633 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
634 		if (!ret)
635 			cb->qp = cb->child_cm_id->qp;
636 	} else {
637 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
638 		if (!ret)
639 			cb->qp = cb->cm_id->qp;
640 	}
641 
642 	return ret;
643 }
644 
/*
 * Destroy the queue pair, the completion queue and the protection
 * domain held in the control block, in that order.
 */
static void krping_free_qp(struct krping_cb *cb)
{
	ib_destroy_qp(cb->qp);
	ib_destroy_cq(cb->cq);
	ib_dealloc_pd(cb->pd);
}
651 
652 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
653 {
654 	int ret;
655 	struct ib_cq_init_attr attr = {0};
656 
657 	cb->pd = ib_alloc_pd(cm_id->device, 0);
658 	if (IS_ERR(cb->pd)) {
659 		printk(KERN_ERR PFX "ib_alloc_pd failed\n");
660 		return PTR_ERR(cb->pd);
661 	}
662 	DEBUG_LOG("created pd %p\n", cb->pd);
663 
664 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
665 
666 	attr.cqe = cb->txdepth * 2;
667 	attr.comp_vector = 0;
668 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
669 			      cb, &attr);
670 	if (IS_ERR(cb->cq)) {
671 		printk(KERN_ERR PFX "ib_create_cq failed\n");
672 		ret = PTR_ERR(cb->cq);
673 		goto err1;
674 	}
675 	DEBUG_LOG("created cq %p\n", cb->cq);
676 
677 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
678 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
679 		if (ret) {
680 			printk(KERN_ERR PFX "ib_create_cq failed\n");
681 			goto err2;
682 		}
683 	}
684 
685 	ret = krping_create_qp(cb);
686 	if (ret) {
687 		printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret);
688 		goto err2;
689 	}
690 	DEBUG_LOG("created qp %p\n", cb->qp);
691 	return 0;
692 err2:
693 	ib_destroy_cq(cb->cq);
694 err1:
695 	ib_dealloc_pd(cb->pd);
696 	return ret;
697 }
698 
699 /*
700  * return the (possibly rebound) rkey for the rdma buffer.
701  * REG mode: invalidate and rebind via reg wr.
702  * other modes: just return the mr rkey.
703  */
704 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
705 {
706 	u32 rkey;
707 	struct ib_send_wr *bad_wr;
708 	int ret;
709 	struct scatterlist sg = {0};
710 
711 	cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey;
712 
713 	/*
714 	 * Update the reg key.
715 	 */
716 	ib_update_fast_reg_key(cb->reg_mr, ++cb->key);
717 	cb->reg_mr_wr.key = cb->reg_mr->rkey;
718 
719 	/*
720 	 * Update the reg WR with new buf info.
721 	 */
722 	if (buf == (u64)cb->start_dma_addr)
723 		cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ;
724 	else
725 		cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
726 	sg_dma_address(&sg) = buf;
727 	sg_dma_len(&sg) = cb->size;
728 
729 	ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE);
730 	BUG_ON(ret <= 0 || ret > cb->page_list_len);
731 
732 	DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u"
733 		" iova_start %llx\n",
734 		post_inv,
735 		cb->reg_mr_wr.key,
736 		cb->reg_mr->page_size,
737 		cb->reg_mr->length,
738 	        (unsigned long long)cb->reg_mr->iova);
739 
740 	if (post_inv)
741 		ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
742 	else
743 		ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr);
744 	if (ret) {
745 		printk(KERN_ERR PFX "post send error %d\n", ret);
746 		cb->state = ERROR;
747 	}
748 	rkey = cb->reg_mr->rkey;
749 	return rkey;
750 }
751 
752 static void krping_format_send(struct krping_cb *cb, u64 buf)
753 {
754 	struct krping_rdma_info *info = &cb->send_buf;
755 	u32 rkey;
756 
757 	/*
758 	 * Client side will do reg or mw bind before
759 	 * advertising the rdma buffer.  Server side
760 	 * sends have no data.
761 	 */
762 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
763 		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
764 		info->buf = htonll(buf);
765 		info->rkey = htonl(rkey);
766 		info->size = htonl(cb->size);
767 		DEBUG_LOG("RDMA addr %llx rkey %x len %d\n",
768 			  (unsigned long long)buf, rkey, cb->size);
769 	}
770 }
771 
772 static void krping_test_server(struct krping_cb *cb)
773 {
774 	struct ib_send_wr *bad_wr, inv;
775 	int ret;
776 
777 	while (1) {
778 		/* Wait for client's Start STAG/TO/Len */
779 		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
780 		if (cb->state != RDMA_READ_ADV) {
781 			printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n",
782 				cb->state);
783 			break;
784 		}
785 
786 		DEBUG_LOG("server received sink adv\n");
787 
788 		cb->rdma_sq_wr.rkey = cb->remote_rkey;
789 		cb->rdma_sq_wr.remote_addr = cb->remote_addr;
790 		cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len;
791 		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv);
792 		cb->rdma_sq_wr.wr.next = NULL;
793 
794 		/* Issue RDMA Read. */
795 		if (cb->read_inv)
796 			cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
797 		else {
798 
799 			cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
800 			/*
801 			 * Immediately follow the read with a
802 			 * fenced LOCAL_INV.
803 			 */
804 			cb->rdma_sq_wr.wr.next = &inv;
805 			memset(&inv, 0, sizeof inv);
806 			inv.opcode = IB_WR_LOCAL_INV;
807 			inv.ex.invalidate_rkey = cb->reg_mr->rkey;
808 			inv.send_flags = IB_SEND_FENCE;
809 		}
810 
811 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
812 		if (ret) {
813 			printk(KERN_ERR PFX "post send error %d\n", ret);
814 			break;
815 		}
816 		cb->rdma_sq_wr.wr.next = NULL;
817 
818 		DEBUG_LOG("server posted rdma read req \n");
819 
820 		/* Wait for read completion */
821 		wait_event_interruptible(cb->sem,
822 					 cb->state >= RDMA_READ_COMPLETE);
823 		if (cb->state != RDMA_READ_COMPLETE) {
824 			printk(KERN_ERR PFX
825 			       "wait for RDMA_READ_COMPLETE state %d\n",
826 			       cb->state);
827 			break;
828 		}
829 		DEBUG_LOG("server received read complete\n");
830 
831 		/* Display data in recv buf */
832 		if (cb->verbose)
833 			printk(KERN_INFO PFX "server ping data: %s\n",
834 				cb->rdma_buf);
835 
836 		/* Tell client to continue */
837 		if (cb->server && cb->server_invalidate) {
838 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
839 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
840 			DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
841 		}
842 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
843 		if (ret) {
844 			printk(KERN_ERR PFX "post send error %d\n", ret);
845 			break;
846 		}
847 		DEBUG_LOG("server posted go ahead\n");
848 
849 		/* Wait for client's RDMA STAG/TO/Len */
850 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
851 		if (cb->state != RDMA_WRITE_ADV) {
852 			printk(KERN_ERR PFX
853 			       "wait for RDMA_WRITE_ADV state %d\n",
854 			       cb->state);
855 			break;
856 		}
857 		DEBUG_LOG("server received sink adv\n");
858 
859 		/* RDMA Write echo data */
860 		cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
861 		cb->rdma_sq_wr.rkey = cb->remote_rkey;
862 		cb->rdma_sq_wr.remote_addr = cb->remote_addr;
863 		cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1;
864 		if (cb->local_dma_lkey)
865 			cb->rdma_sgl.lkey = cb->pd->local_dma_lkey;
866 		else
867 			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
868 
869 		DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n",
870 			  cb->rdma_sq_wr.wr.sg_list->lkey,
871 			  (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr,
872 			  cb->rdma_sq_wr.wr.sg_list->length);
873 
874 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
875 		if (ret) {
876 			printk(KERN_ERR PFX "post send error %d\n", ret);
877 			break;
878 		}
879 
880 		/* Wait for completion */
881 		ret = wait_event_interruptible(cb->sem, cb->state >=
882 							 RDMA_WRITE_COMPLETE);
883 		if (cb->state != RDMA_WRITE_COMPLETE) {
884 			printk(KERN_ERR PFX
885 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
886 			       cb->state);
887 			break;
888 		}
889 		DEBUG_LOG("server rdma write complete \n");
890 
891 		cb->state = CONNECTED;
892 
893 		/* Tell client to begin again */
894 		if (cb->server && cb->server_invalidate) {
895 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
896 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
897 			DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
898 		}
899 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
900 		if (ret) {
901 			printk(KERN_ERR PFX "post send error %d\n", ret);
902 			break;
903 		}
904 		DEBUG_LOG("server posted go ahead\n");
905 	}
906 }
907 
/*
 * RDMA READ latency test.
 *
 * Posts cb->count RDMA READs of cb->size bytes against the peer's
 * advertised buffer, one at a time, waiting for each to complete before
 * posting the next.  Completion is detected either by polling the CQ
 * directly (cb->poll set) or by sleeping until the CQ handler changes
 * cb->state.  The elapsed wall-clock time for the whole run is printed
 * at the end.  Any error ends the test early without printing a result.
 */
static void rlat_test(struct krping_cb *cb)
{
	int scnt;
	int iters = cb->count;
	struct timeval start_tv, stop_tv;
	int ret;
	struct ib_wc wc;
	struct ib_send_wr *bad_wr;
	int ne;

	scnt = 0;
	cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
	cb->rdma_sq_wr.rkey = cb->remote_rkey;
	cb->rdma_sq_wr.remote_addr = cb->remote_addr;
	cb->rdma_sq_wr.wr.sg_list->length = cb->size;

	microtime(&start_tv);
	/* Event-driven mode: arm the CQ so the handler gets called. */
	if (!cb->poll) {
		cb->state = RDMA_READ_ADV;
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	}
	while (scnt < iters) {

		/* Reset the state so the wait below sees the next change. */
		cb->state = RDMA_READ_ADV;
		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
		if (ret) {
			printk(KERN_ERR PFX
				"Couldn't post send: ret=%d scnt %d\n",
				ret, scnt);
			return;
		}

		do {
			if (!cb->poll) {
				wait_event_interruptible(cb->sem,
					cb->state != RDMA_READ_ADV);
				if (cb->state == RDMA_READ_COMPLETE) {
					ne = 1;
					/* Re-arm for the next iteration. */
					ib_req_notify_cq(cb->cq,
						IB_CQ_NEXT_COMP);
				} else {
					/* Any other state counts as a failure. */
					ne = -1;
				}
			} else
				ne = ib_poll_cq(cb->cq, 1, &wc);
			if (cb->state == ERROR) {
				printk(KERN_ERR PFX
					"state == ERROR...bailing scnt %d\n",
					scnt);
				return;
			}
		} while (ne == 0);

		if (ne < 0) {
			printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
			return;
		}
		/* wc is only filled in when the CQ was polled here. */
		if (cb->poll && wc.status != IB_WC_SUCCESS) {
			printk(KERN_ERR PFX "Completion wth error at %s:\n",
				cb->server ? "server" : "client");
			printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
				wc.status, (int) wc.wr_id);
			return;
		}
		++scnt;
	}
	microtime(&stop_tv);

	/* Borrow a second so the microsecond difference is non-negative. */
        if (stop_tv.tv_usec < start_tv.tv_usec) {
                stop_tv.tv_usec += 1000000;
                stop_tv.tv_sec  -= 1;
        }

	printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n",
		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
		scnt, cb->size);
}
986 
987 static void wlat_test(struct krping_cb *cb)
988 {
989 	int ccnt, scnt, rcnt;
990 	int iters=cb->count;
991 	volatile char *poll_buf = (char *) cb->start_buf;
992 	char *buf = (char *)cb->rdma_buf;
993 	struct timeval start_tv, stop_tv;
994 	cycles_t *post_cycles_start, *post_cycles_stop;
995 	cycles_t *poll_cycles_start, *poll_cycles_stop;
996 	cycles_t *last_poll_cycles_start;
997 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
998 	int i;
999 	int cycle_iters = 1000;
1000 
1001 	ccnt = 0;
1002 	scnt = 0;
1003 	rcnt = 0;
1004 
1005 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1006 	if (!post_cycles_start) {
1007 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1008 		return;
1009 	}
1010 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1011 	if (!post_cycles_stop) {
1012 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1013 		return;
1014 	}
1015 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1016 	if (!poll_cycles_start) {
1017 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1018 		return;
1019 	}
1020 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1021 	if (!poll_cycles_stop) {
1022 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1023 		return;
1024 	}
1025 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1026 		GFP_KERNEL);
1027 	if (!last_poll_cycles_start) {
1028 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1029 		return;
1030 	}
1031 	cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1032 	cb->rdma_sq_wr.rkey = cb->remote_rkey;
1033 	cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1034 	cb->rdma_sq_wr.wr.sg_list->length = cb->size;
1035 
1036 	if (cycle_iters > iters)
1037 		cycle_iters = iters;
1038 	microtime(&start_tv);
1039 	while (scnt < iters || ccnt < iters || rcnt < iters) {
1040 
1041 		/* Wait till buffer changes. */
1042 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1043 			++rcnt;
1044 			while (*poll_buf != (char)rcnt) {
1045 				if (cb->state == ERROR) {
1046 					printk(KERN_ERR PFX
1047 						"state = ERROR, bailing\n");
1048 					return;
1049 				}
1050 			}
1051 		}
1052 
1053 		if (scnt < iters) {
1054 			struct ib_send_wr *bad_wr;
1055 
1056 			*buf = (char)scnt+1;
1057 			if (scnt < cycle_iters)
1058 				post_cycles_start[scnt] = get_cycles();
1059 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1060 				printk(KERN_ERR PFX
1061 					"Couldn't post send: scnt=%d\n",
1062 					scnt);
1063 				return;
1064 			}
1065 			if (scnt < cycle_iters)
1066 				post_cycles_stop[scnt] = get_cycles();
1067 			scnt++;
1068 		}
1069 
1070 		if (ccnt < iters) {
1071 			struct ib_wc wc;
1072 			int ne;
1073 
1074 			if (ccnt < cycle_iters)
1075 				poll_cycles_start[ccnt] = get_cycles();
1076 			do {
1077 				if (ccnt < cycle_iters)
1078 					last_poll_cycles_start[ccnt] =
1079 						get_cycles();
1080 				ne = ib_poll_cq(cb->cq, 1, &wc);
1081 			} while (ne == 0);
1082 			if (ccnt < cycle_iters)
1083 				poll_cycles_stop[ccnt] = get_cycles();
1084 			++ccnt;
1085 
1086 			if (ne < 0) {
1087 				printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1088 				return;
1089 			}
1090 			if (wc.status != IB_WC_SUCCESS) {
1091 				printk(KERN_ERR PFX
1092 					"Completion wth error at %s:\n",
1093 					cb->server ? "server" : "client");
1094 				printk(KERN_ERR PFX
1095 					"Failed status %d: wr_id %d\n",
1096 					wc.status, (int) wc.wr_id);
1097 				printk(KERN_ERR PFX
1098 					"scnt=%d, rcnt=%d, ccnt=%d\n",
1099 					scnt, rcnt, ccnt);
1100 				return;
1101 			}
1102 		}
1103 	}
1104 	microtime(&stop_tv);
1105 
1106         if (stop_tv.tv_usec < start_tv.tv_usec) {
1107                 stop_tv.tv_usec += 1000000;
1108                 stop_tv.tv_sec  -= 1;
1109         }
1110 
1111 	for (i=0; i < cycle_iters; i++) {
1112 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1113 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1114 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1115 	}
1116 	printk(KERN_ERR PFX
1117 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1118 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1119 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1120 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1121 		scnt, cb->size, cycle_iters,
1122 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1123 		(unsigned long long)sum_last_poll);
1124 	kfree(post_cycles_start);
1125 	kfree(post_cycles_stop);
1126 	kfree(poll_cycles_start);
1127 	kfree(poll_cycles_stop);
1128 	kfree(last_poll_cycles_start);
1129 }
1130 
1131 static void bw_test(struct krping_cb *cb)
1132 {
1133 	int ccnt, scnt, rcnt;
1134 	int iters=cb->count;
1135 	struct timeval start_tv, stop_tv;
1136 	cycles_t *post_cycles_start, *post_cycles_stop;
1137 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1138 	cycles_t *last_poll_cycles_start;
1139 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1140 	int i;
1141 	int cycle_iters = 1000;
1142 
1143 	ccnt = 0;
1144 	scnt = 0;
1145 	rcnt = 0;
1146 
1147 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1148 	if (!post_cycles_start) {
1149 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1150 		return;
1151 	}
1152 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1153 	if (!post_cycles_stop) {
1154 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1155 		return;
1156 	}
1157 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1158 	if (!poll_cycles_start) {
1159 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1160 		return;
1161 	}
1162 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1163 	if (!poll_cycles_stop) {
1164 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1165 		return;
1166 	}
1167 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1168 		GFP_KERNEL);
1169 	if (!last_poll_cycles_start) {
1170 		printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1171 		return;
1172 	}
1173 	cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1174 	cb->rdma_sq_wr.rkey = cb->remote_rkey;
1175 	cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1176 	cb->rdma_sq_wr.wr.sg_list->length = cb->size;
1177 
1178 	if (cycle_iters > iters)
1179 		cycle_iters = iters;
1180 	microtime(&start_tv);
1181 	while (scnt < iters || ccnt < iters) {
1182 
1183 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1184 			struct ib_send_wr *bad_wr;
1185 
1186 			if (scnt < cycle_iters)
1187 				post_cycles_start[scnt] = get_cycles();
1188 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1189 				printk(KERN_ERR PFX
1190 					"Couldn't post send: scnt=%d\n",
1191 					scnt);
1192 				return;
1193 			}
1194 			if (scnt < cycle_iters)
1195 				post_cycles_stop[scnt] = get_cycles();
1196 			++scnt;
1197 		}
1198 
1199 		if (ccnt < iters) {
1200 			int ne;
1201 			struct ib_wc wc;
1202 
1203 			if (ccnt < cycle_iters)
1204 				poll_cycles_start[ccnt] = get_cycles();
1205 			do {
1206 				if (ccnt < cycle_iters)
1207 					last_poll_cycles_start[ccnt] =
1208 						get_cycles();
1209 				ne = ib_poll_cq(cb->cq, 1, &wc);
1210 			} while (ne == 0);
1211 			if (ccnt < cycle_iters)
1212 				poll_cycles_stop[ccnt] = get_cycles();
1213 			ccnt += 1;
1214 
1215 			if (ne < 0) {
1216 				printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1217 				return;
1218 			}
1219 			if (wc.status != IB_WC_SUCCESS) {
1220 				printk(KERN_ERR PFX
1221 					"Completion wth error at %s:\n",
1222 					cb->server ? "server" : "client");
1223 				printk(KERN_ERR PFX
1224 					"Failed status %d: wr_id %d\n",
1225 					wc.status, (int) wc.wr_id);
1226 				return;
1227 			}
1228 		}
1229 	}
1230 	microtime(&stop_tv);
1231 
1232         if (stop_tv.tv_usec < start_tv.tv_usec) {
1233                 stop_tv.tv_usec += 1000000;
1234                 stop_tv.tv_sec  -= 1;
1235         }
1236 
1237 	for (i=0; i < cycle_iters; i++) {
1238 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1239 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1240 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1241 	}
1242 	printk(KERN_ERR PFX
1243 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1244 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1245 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1246 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1247 		scnt, cb->size, cycle_iters,
1248 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1249 		(unsigned long long)sum_last_poll);
1250 	kfree(post_cycles_start);
1251 	kfree(post_cycles_stop);
1252 	kfree(poll_cycles_start);
1253 	kfree(poll_cycles_stop);
1254 	kfree(last_poll_cycles_start);
1255 }
1256 
1257 static void krping_rlat_test_server(struct krping_cb *cb)
1258 {
1259 	struct ib_send_wr *bad_wr;
1260 	struct ib_wc wc;
1261 	int ret;
1262 
1263 	/* Spin waiting for client's Start STAG/TO/Len */
1264 	while (cb->state < RDMA_READ_ADV) {
1265 		krping_cq_event_handler(cb->cq, cb);
1266 	}
1267 
1268 	/* Send STAG/TO/Len to client */
1269 	krping_format_send(cb, cb->start_dma_addr);
1270 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1271 	if (ret) {
1272 		printk(KERN_ERR PFX "post send error %d\n", ret);
1273 		return;
1274 	}
1275 
1276 	/* Spin waiting for send completion */
1277 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1278 	if (ret < 0) {
1279 		printk(KERN_ERR PFX "poll error %d\n", ret);
1280 		return;
1281 	}
1282 	if (wc.status) {
1283 		printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1284 		return;
1285 	}
1286 
1287 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1288 }
1289 
1290 static void krping_wlat_test_server(struct krping_cb *cb)
1291 {
1292 	struct ib_send_wr *bad_wr;
1293 	struct ib_wc wc;
1294 	int ret;
1295 
1296 	/* Spin waiting for client's Start STAG/TO/Len */
1297 	while (cb->state < RDMA_READ_ADV) {
1298 		krping_cq_event_handler(cb->cq, cb);
1299 	}
1300 
1301 	/* Send STAG/TO/Len to client */
1302 	krping_format_send(cb, cb->start_dma_addr);
1303 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1304 	if (ret) {
1305 		printk(KERN_ERR PFX "post send error %d\n", ret);
1306 		return;
1307 	}
1308 
1309 	/* Spin waiting for send completion */
1310 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1311 	if (ret < 0) {
1312 		printk(KERN_ERR PFX "poll error %d\n", ret);
1313 		return;
1314 	}
1315 	if (wc.status) {
1316 		printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1317 		return;
1318 	}
1319 
1320 	wlat_test(cb);
1321 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1322 }
1323 
1324 static void krping_bw_test_server(struct krping_cb *cb)
1325 {
1326 	struct ib_send_wr *bad_wr;
1327 	struct ib_wc wc;
1328 	int ret;
1329 
1330 	/* Spin waiting for client's Start STAG/TO/Len */
1331 	while (cb->state < RDMA_READ_ADV) {
1332 		krping_cq_event_handler(cb->cq, cb);
1333 	}
1334 
1335 	/* Send STAG/TO/Len to client */
1336 	krping_format_send(cb, cb->start_dma_addr);
1337 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1338 	if (ret) {
1339 		printk(KERN_ERR PFX "post send error %d\n", ret);
1340 		return;
1341 	}
1342 
1343 	/* Spin waiting for send completion */
1344 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1345 	if (ret < 0) {
1346 		printk(KERN_ERR PFX "poll error %d\n", ret);
1347 		return;
1348 	}
1349 	if (wc.status) {
1350 		printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1351 		return;
1352 	}
1353 
1354 	if (cb->duplex)
1355 		bw_test(cb);
1356 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1357 }
1358 
1359 static int reg_supported(struct ib_device *dev)
1360 {
1361 	u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
1362 
1363 	if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) {
1364 		printk(KERN_ERR PFX
1365 			"Fastreg not supported - device_cap_flags 0x%llx\n",
1366 			(unsigned long long)dev->attrs.device_cap_flags);
1367 		return 0;
1368 	}
1369 	DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n",
1370 		(unsigned long long)dev->attrs.device_cap_flags);
1371 	return 1;
1372 }
1373 
1374 static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb)
1375 {
1376 	memset(sin, 0, sizeof(*sin));
1377 
1378 	if (cb->addr_type == AF_INET) {
1379 		struct sockaddr_in *sin4 = (struct sockaddr_in *)sin;
1380 		sin4->sin_len = sizeof(*sin4);
1381 		sin4->sin_family = AF_INET;
1382 		memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4);
1383 		sin4->sin_port = cb->port;
1384 	} else if (cb->addr_type == AF_INET6) {
1385 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
1386 		sin6->sin6_len = sizeof(*sin6);
1387 		sin6->sin6_family = AF_INET6;
1388 		memcpy((void *)&sin6->sin6_addr, cb->addr, 16);
1389 		sin6->sin6_port = cb->port;
1390 	}
1391 }
1392 
1393 static int krping_bind_server(struct krping_cb *cb)
1394 {
1395 	struct sockaddr_storage sin;
1396 	int ret;
1397 
1398 
1399 	fill_sockaddr(&sin, cb);
1400 
1401 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin);
1402 	if (ret) {
1403 		printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret);
1404 		return ret;
1405 	}
1406 	DEBUG_LOG("rdma_bind_addr successful\n");
1407 
1408 	DEBUG_LOG("rdma_listen\n");
1409 	ret = rdma_listen(cb->cm_id, 3);
1410 	if (ret) {
1411 		printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret);
1412 		return ret;
1413 	}
1414 
1415 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1416 	if (cb->state != CONNECT_REQUEST) {
1417 		printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n",
1418 			cb->state);
1419 		return -1;
1420 	}
1421 
1422 	if (!reg_supported(cb->child_cm_id->device))
1423 		return -EINVAL;
1424 
1425 	return 0;
1426 }
1427 
1428 static void krping_run_server(struct krping_cb *cb)
1429 {
1430 	struct ib_recv_wr *bad_wr;
1431 	int ret;
1432 
1433 	ret = krping_bind_server(cb);
1434 	if (ret)
1435 		return;
1436 
1437 	ret = krping_setup_qp(cb, cb->child_cm_id);
1438 	if (ret) {
1439 		printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
1440 		goto err0;
1441 	}
1442 
1443 	ret = krping_setup_buffers(cb);
1444 	if (ret) {
1445 		printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
1446 		goto err1;
1447 	}
1448 
1449 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1450 	if (ret) {
1451 		printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
1452 		goto err2;
1453 	}
1454 
1455 	ret = krping_accept(cb);
1456 	if (ret) {
1457 		printk(KERN_ERR PFX "connect error %d\n", ret);
1458 		goto err2;
1459 	}
1460 
1461 	if (cb->wlat)
1462 		krping_wlat_test_server(cb);
1463 	else if (cb->rlat)
1464 		krping_rlat_test_server(cb);
1465 	else if (cb->bw)
1466 		krping_bw_test_server(cb);
1467 	else
1468 		krping_test_server(cb);
1469 	rdma_disconnect(cb->child_cm_id);
1470 err2:
1471 	krping_free_buffers(cb);
1472 err1:
1473 	krping_free_qp(cb);
1474 err0:
1475 	rdma_destroy_id(cb->child_cm_id);
1476 }
1477 
1478 static void krping_test_client(struct krping_cb *cb)
1479 {
1480 	int ping, start, cc, i, ret;
1481 	struct ib_send_wr *bad_wr;
1482 	unsigned char c;
1483 
1484 	start = 65;
1485 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1486 		cb->state = RDMA_READ_ADV;
1487 
1488 		/* Put some ascii text in the buffer. */
1489 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1490 		for (i = cc, c = start; i < cb->size; i++) {
1491 			cb->start_buf[i] = c;
1492 			c++;
1493 			if (c > 122)
1494 				c = 65;
1495 		}
1496 		start++;
1497 		if (start > 122)
1498 			start = 65;
1499 		cb->start_buf[cb->size - 1] = 0;
1500 
1501 		krping_format_send(cb, cb->start_dma_addr);
1502 		if (cb->state == ERROR) {
1503 			printk(KERN_ERR PFX "krping_format_send failed\n");
1504 			break;
1505 		}
1506 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1507 		if (ret) {
1508 			printk(KERN_ERR PFX "post send error %d\n", ret);
1509 			break;
1510 		}
1511 
1512 		/* Wait for server to ACK */
1513 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1514 		if (cb->state != RDMA_WRITE_ADV) {
1515 			printk(KERN_ERR PFX
1516 			       "wait for RDMA_WRITE_ADV state %d\n",
1517 			       cb->state);
1518 			break;
1519 		}
1520 
1521 		krping_format_send(cb, cb->rdma_dma_addr);
1522 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1523 		if (ret) {
1524 			printk(KERN_ERR PFX "post send error %d\n", ret);
1525 			break;
1526 		}
1527 
1528 		/* Wait for the server to say the RDMA Write is complete. */
1529 		wait_event_interruptible(cb->sem,
1530 					 cb->state >= RDMA_WRITE_COMPLETE);
1531 		if (cb->state != RDMA_WRITE_COMPLETE) {
1532 			printk(KERN_ERR PFX
1533 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1534 			       cb->state);
1535 			break;
1536 		}
1537 
1538 		if (cb->validate)
1539 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1540 				printk(KERN_ERR PFX "data mismatch!\n");
1541 				break;
1542 			}
1543 
1544 		if (cb->verbose)
1545 			printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf);
1546 #ifdef SLOW_KRPING
1547 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1548 #endif
1549 	}
1550 }
1551 
1552 static void krping_rlat_test_client(struct krping_cb *cb)
1553 {
1554 	struct ib_send_wr *bad_wr;
1555 	struct ib_wc wc;
1556 	int ret;
1557 
1558 	cb->state = RDMA_READ_ADV;
1559 
1560 	/* Send STAG/TO/Len to client */
1561 	krping_format_send(cb, cb->start_dma_addr);
1562 	if (cb->state == ERROR) {
1563 		printk(KERN_ERR PFX "krping_format_send failed\n");
1564 		return;
1565 	}
1566 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1567 	if (ret) {
1568 		printk(KERN_ERR PFX "post send error %d\n", ret);
1569 		return;
1570 	}
1571 
1572 	/* Spin waiting for send completion */
1573 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1574 	if (ret < 0) {
1575 		printk(KERN_ERR PFX "poll error %d\n", ret);
1576 		return;
1577 	}
1578 	if (wc.status) {
1579 		printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1580 		return;
1581 	}
1582 
1583 	/* Spin waiting for server's Start STAG/TO/Len */
1584 	while (cb->state < RDMA_WRITE_ADV) {
1585 		krping_cq_event_handler(cb->cq, cb);
1586 	}
1587 
1588 #if 0
1589 {
1590 	int i;
1591 	struct timeval start, stop;
1592 	time_t sec;
1593 	suseconds_t usec;
1594 	unsigned long long elapsed;
1595 	struct ib_wc wc;
1596 	struct ib_send_wr *bad_wr;
1597 	int ne;
1598 
1599 	cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1600 	cb->rdma_sq_wr.rkey = cb->remote_rkey;
1601 	cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1602 	cb->rdma_sq_wr.wr.sg_list->length = 0;
1603 	cb->rdma_sq_wr.wr.num_sge = 0;
1604 
1605 	microtime(&start);
1606 	for (i=0; i < 100000; i++) {
1607 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1608 			printk(KERN_ERR PFX  "Couldn't post send\n");
1609 			return;
1610 		}
1611 		do {
1612 			ne = ib_poll_cq(cb->cq, 1, &wc);
1613 		} while (ne == 0);
1614 		if (ne < 0) {
1615 			printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1616 			return;
1617 		}
1618 		if (wc.status != IB_WC_SUCCESS) {
1619 			printk(KERN_ERR PFX "Completion wth error at %s:\n",
1620 				cb->server ? "server" : "client");
1621 			printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
1622 				wc.status, (int) wc.wr_id);
1623 			return;
1624 		}
1625 	}
1626 	microtime(&stop);
1627 
1628 	if (stop.tv_usec < start.tv_usec) {
1629 		stop.tv_usec += 1000000;
1630 		stop.tv_sec  -= 1;
1631 	}
1632 	sec     = stop.tv_sec - start.tv_sec;
1633 	usec    = stop.tv_usec - start.tv_usec;
1634 	elapsed = sec * 1000000 + usec;
1635 	printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed);
1636 }
1637 #endif
1638 
1639 	rlat_test(cb);
1640 }
1641 
1642 static void krping_wlat_test_client(struct krping_cb *cb)
1643 {
1644 	struct ib_send_wr *bad_wr;
1645 	struct ib_wc wc;
1646 	int ret;
1647 
1648 	cb->state = RDMA_READ_ADV;
1649 
1650 	/* Send STAG/TO/Len to client */
1651 	krping_format_send(cb, cb->start_dma_addr);
1652 	if (cb->state == ERROR) {
1653 		printk(KERN_ERR PFX "krping_format_send failed\n");
1654 		return;
1655 	}
1656 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1657 	if (ret) {
1658 		printk(KERN_ERR PFX "post send error %d\n", ret);
1659 		return;
1660 	}
1661 
1662 	/* Spin waiting for send completion */
1663 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1664 	if (ret < 0) {
1665 		printk(KERN_ERR PFX "poll error %d\n", ret);
1666 		return;
1667 	}
1668 	if (wc.status) {
1669 		printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1670 		return;
1671 	}
1672 
1673 	/* Spin waiting for server's Start STAG/TO/Len */
1674 	while (cb->state < RDMA_WRITE_ADV) {
1675 		krping_cq_event_handler(cb->cq, cb);
1676 	}
1677 
1678 	wlat_test(cb);
1679 }
1680 
1681 static void krping_bw_test_client(struct krping_cb *cb)
1682 {
1683 	struct ib_send_wr *bad_wr;
1684 	struct ib_wc wc;
1685 	int ret;
1686 
1687 	cb->state = RDMA_READ_ADV;
1688 
1689 	/* Send STAG/TO/Len to client */
1690 	krping_format_send(cb, cb->start_dma_addr);
1691 	if (cb->state == ERROR) {
1692 		printk(KERN_ERR PFX "krping_format_send failed\n");
1693 		return;
1694 	}
1695 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1696 	if (ret) {
1697 		printk(KERN_ERR PFX "post send error %d\n", ret);
1698 		return;
1699 	}
1700 
1701 	/* Spin waiting for send completion */
1702 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1703 	if (ret < 0) {
1704 		printk(KERN_ERR PFX "poll error %d\n", ret);
1705 		return;
1706 	}
1707 	if (wc.status) {
1708 		printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1709 		return;
1710 	}
1711 
1712 	/* Spin waiting for server's Start STAG/TO/Len */
1713 	while (cb->state < RDMA_WRITE_ADV) {
1714 		krping_cq_event_handler(cb->cq, cb);
1715 	}
1716 
1717 	bw_test(cb);
1718 }
1719 
1720 /*
1721  * Manual qp flush test
1722  */
1723 static void flush_qp(struct krping_cb *cb)
1724 {
1725 	struct ib_send_wr wr = { 0 }, *bad;
1726 	struct ib_recv_wr recv_wr = { 0 }, *recv_bad;
1727 	struct ib_wc wc;
1728 	int ret;
1729 	int flushed = 0;
1730 	int ccnt = 0;
1731 
1732 	rdma_disconnect(cb->cm_id);
1733 	DEBUG_LOG("disconnected!\n");
1734 
1735 	wr.opcode = IB_WR_SEND;
1736 	wr.wr_id = 0xdeadbeefcafebabe;
1737 	ret = ib_post_send(cb->qp, &wr, &bad);
1738 	if (ret) {
1739 		printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret);
1740 		return;
1741 	}
1742 
1743 	recv_wr.wr_id = 0xcafebabedeadbeef;
1744 	ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad);
1745 	if (ret) {
1746 		printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret);
1747 		return;
1748 	}
1749 
1750 	/* poll until the flush WRs complete */
1751 	do {
1752 		ret = ib_poll_cq(cb->cq, 1, &wc);
1753 		if (ret < 0) {
1754 			printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
1755 			return;
1756 		}
1757 		if (ret == 0)
1758 			continue;
1759 		ccnt++;
1760 		if (wc.wr_id == 0xdeadbeefcafebabe ||
1761 		    wc.wr_id == 0xcafebabedeadbeef)
1762 			flushed++;
1763 	} while (flushed != 2);
1764 	DEBUG_LOG("qp_flushed! ccnt %u\n", ccnt);
1765 }
1766 
1767 static void krping_fr_test(struct krping_cb *cb)
1768 {
1769 	struct ib_send_wr inv, *bad;
1770 	struct ib_reg_wr fr;
1771 	struct ib_wc wc;
1772 	u8 key = 0;
1773 	struct ib_mr *mr;
1774 	int ret;
1775 	int size = cb->size;
1776 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1777 	unsigned long start;
1778 	int count = 0;
1779 	int scnt = 0;
1780 	struct scatterlist sg = {0};
1781 
1782 	mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen);
1783 	if (IS_ERR(mr)) {
1784 		printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr));
1785 		return;
1786 	}
1787 
1788 	sg_dma_address(&sg) = (dma_addr_t)0xcafebabe0000ULL;
1789 	sg_dma_len(&sg) = size;
1790 	ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
1791 	if (ret <= 0) {
1792 		printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret);
1793 		goto err2;
1794 	}
1795 
1796 	memset(&fr, 0, sizeof fr);
1797 	fr.wr.opcode = IB_WR_REG_MR;
1798 	fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1799 	fr.mr = mr;
1800 	fr.wr.next = &inv;
1801 
1802 	memset(&inv, 0, sizeof inv);
1803 	inv.opcode = IB_WR_LOCAL_INV;
1804 	inv.send_flags = IB_SEND_SIGNALED;
1805 
1806 	DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1807 	start = time_uptime;
1808 	while (!cb->count || count <= cb->count) {
1809 		if (SIGPENDING(curthread)) {
1810 			printk(KERN_ERR PFX "signal!\n");
1811 			break;
1812 		}
1813 		if ((time_uptime - start) >= 9) {
1814 			DEBUG_LOG("fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1815 			wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1816 			if (cb->state == ERROR)
1817 				break;
1818 			start = time_uptime;
1819 		}
1820 		while (scnt < (cb->txdepth>>1)) {
1821 			ib_update_fast_reg_key(mr, ++key);
1822 			fr.key = mr->rkey;
1823 			inv.ex.invalidate_rkey = mr->rkey;
1824 
1825 			size = arc4random() % cb->size;
1826 			if (size == 0)
1827 				size = cb->size;
1828 			sg_dma_len(&sg) = size;
1829 			ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
1830 			if (ret <= 0) {
1831 				printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret);
1832 				goto err2;
1833 			}
1834 			ret = ib_post_send(cb->qp, &fr.wr, &bad);
1835 			if (ret) {
1836 				printk(KERN_ERR PFX "ib_post_send failed %d\n", ret);
1837 				goto err2;
1838 			}
1839 			scnt++;
1840 		}
1841 
1842 		ret = ib_poll_cq(cb->cq, 1, &wc);
1843 		if (ret < 0) {
1844 			printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
1845 			goto err2;
1846 		}
1847 		if (ret == 1) {
1848 			if (wc.status) {
1849 				printk(KERN_ERR PFX "completion error %u\n", wc.status);
1850 				goto err2;
1851 			}
1852 			count++;
1853 			scnt--;
1854 		}
1855 	}
1856 err2:
1857 	flush_qp(cb);
1858 	DEBUG_LOG("fr_test: done!\n");
1859 	ib_dereg_mr(mr);
1860 }
1861 
1862 static int krping_connect_client(struct krping_cb *cb)
1863 {
1864 	struct rdma_conn_param conn_param;
1865 	int ret;
1866 
1867 	memset(&conn_param, 0, sizeof conn_param);
1868 	conn_param.responder_resources = 1;
1869 	conn_param.initiator_depth = 1;
1870 	conn_param.retry_count = 10;
1871 
1872 	ret = rdma_connect(cb->cm_id, &conn_param);
1873 	if (ret) {
1874 		printk(KERN_ERR PFX "rdma_connect error %d\n", ret);
1875 		return ret;
1876 	}
1877 
1878 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
1879 	if (cb->state == ERROR) {
1880 		printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state);
1881 		return -1;
1882 	}
1883 
1884 	DEBUG_LOG("rdma_connect successful\n");
1885 	return 0;
1886 }
1887 
1888 static int krping_bind_client(struct krping_cb *cb)
1889 {
1890 	struct sockaddr_storage sin;
1891 	int ret;
1892 
1893 	fill_sockaddr(&sin, cb);
1894 
1895 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000);
1896 	if (ret) {
1897 		printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret);
1898 		return ret;
1899 	}
1900 
1901 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
1902 	if (cb->state != ROUTE_RESOLVED) {
1903 		printk(KERN_ERR PFX
1904 		       "addr/route resolution did not resolve: state %d\n",
1905 		       cb->state);
1906 		return -EINTR;
1907 	}
1908 
1909 	if (!reg_supported(cb->cm_id->device))
1910 		return -EINVAL;
1911 
1912 	DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n");
1913 	return 0;
1914 }
1915 
1916 static void krping_run_client(struct krping_cb *cb)
1917 {
1918 	struct ib_recv_wr *bad_wr;
1919 	int ret;
1920 
1921 	ret = krping_bind_client(cb);
1922 	if (ret)
1923 		return;
1924 
1925 	ret = krping_setup_qp(cb, cb->cm_id);
1926 	if (ret) {
1927 		printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
1928 		return;
1929 	}
1930 
1931 	ret = krping_setup_buffers(cb);
1932 	if (ret) {
1933 		printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
1934 		goto err1;
1935 	}
1936 
1937 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1938 	if (ret) {
1939 		printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
1940 		goto err2;
1941 	}
1942 
1943 	ret = krping_connect_client(cb);
1944 	if (ret) {
1945 		printk(KERN_ERR PFX "connect error %d\n", ret);
1946 		goto err2;
1947 	}
1948 
1949 	if (cb->wlat)
1950 		krping_wlat_test_client(cb);
1951 	else if (cb->rlat)
1952 		krping_rlat_test_client(cb);
1953 	else if (cb->bw)
1954 		krping_bw_test_client(cb);
1955 	else if (cb->frtest)
1956 		krping_fr_test(cb);
1957 	else
1958 		krping_test_client(cb);
1959 	rdma_disconnect(cb->cm_id);
1960 err2:
1961 	krping_free_buffers(cb);
1962 err1:
1963 	krping_free_qp(cb);
1964 }
1965 
1966 static uint16_t
1967 krping_get_ipv6_scope_id(char *name)
1968 {
1969 	struct ifnet *ifp;
1970 	uint16_t retval;
1971 
1972 	if (name == NULL)
1973 		return (0);
1974 	CURVNET_SET_QUIET(TD_TO_VNET(curthread));
1975 	ifp = ifunit_ref(name);
1976 	CURVNET_RESTORE();
1977 	if (ifp == NULL)
1978 		return (0);
1979 	retval = ifp->if_index;
1980 	if_rele(ifp);
1981 	return (retval);
1982 }
1983 
1984 int krping_doit(char *cmd)
1985 {
1986 	struct krping_cb *cb;
1987 	int op;
1988 	int ret = 0;
1989 	char *optarg;
1990 	char *scope;
1991 	unsigned long optint;
1992 
1993 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
1994 	if (!cb)
1995 		return -ENOMEM;
1996 
1997 	mutex_lock(&krping_mutex);
1998 	list_add_tail(&cb->list, &krping_cbs);
1999 	mutex_unlock(&krping_mutex);
2000 
2001 	cb->server = -1;
2002 	cb->state = IDLE;
2003 	cb->size = 64;
2004 	cb->txdepth = RPING_SQ_DEPTH;
2005 	init_waitqueue_head(&cb->sem);
2006 
2007 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2008 			      &optint)) != 0) {
2009 		switch (op) {
2010 		case 'a':
2011 			cb->addr_str = optarg;
2012 			cb->addr_type = AF_INET;
2013 			DEBUG_LOG("ipaddr (%s)\n", optarg);
2014 			if (inet_pton(AF_INET, optarg, cb->addr) != 1) {
2015 				printk(KERN_ERR PFX "bad addr string %s\n",
2016 				    optarg);
2017 				ret = EINVAL;
2018 			}
2019 			break;
2020 		case 'A':
2021 			cb->addr_str = optarg;
2022 			cb->addr_type = AF_INET6;
2023 			DEBUG_LOG("ipv6addr (%s)\n", optarg);
2024 			scope = strstr(optarg, "%");
2025 			/* extract scope ID, if any */
2026 			if (scope != NULL)
2027 				*scope++ = 0;
2028 			/* extract IPv6 network address */
2029 			if (inet_pton(AF_INET6, optarg, cb->addr) != 1) {
2030 				printk(KERN_ERR PFX "bad addr string %s\n",
2031 				    optarg);
2032 				ret = EINVAL;
2033 			} else if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)cb->addr) ||
2034 			    IN6_IS_ADDR_MC_INTFACELOCAL((struct in6_addr *)cb->addr)) {
2035 				uint16_t scope_id = krping_get_ipv6_scope_id(scope);
2036 				DEBUG_LOG("ipv6 scope ID = %d\n", scope_id);
2037 				cb->addr[2] = scope_id >> 8;
2038 				cb->addr[3] = scope_id & 0xFF;
2039 			}
2040 			break;
2041 		case 'p':
2042 			cb->port = htons(optint);
2043 			DEBUG_LOG("port %d\n", (int)optint);
2044 			break;
2045 		case 'P':
2046 			cb->poll = 1;
2047 			DEBUG_LOG("server\n");
2048 			break;
2049 		case 's':
2050 			cb->server = 1;
2051 			DEBUG_LOG("server\n");
2052 			break;
2053 		case 'c':
2054 			cb->server = 0;
2055 			DEBUG_LOG("client\n");
2056 			break;
2057 		case 'S':
2058 			cb->size = optint;
2059 			if ((cb->size < 1) ||
2060 			    (cb->size > RPING_BUFSIZE)) {
2061 				printk(KERN_ERR PFX "Invalid size %d "
2062 				       "(valid range is 1 to %d)\n",
2063 				       cb->size, RPING_BUFSIZE);
2064 				ret = EINVAL;
2065 			} else
2066 				DEBUG_LOG("size %d\n", (int)optint);
2067 			break;
2068 		case 'C':
2069 			cb->count = optint;
2070 			if (cb->count < 0) {
2071 				printk(KERN_ERR PFX "Invalid count %d\n",
2072 					cb->count);
2073 				ret = EINVAL;
2074 			} else
2075 				DEBUG_LOG("count %d\n", (int) cb->count);
2076 			break;
2077 		case 'v':
2078 			cb->verbose++;
2079 			DEBUG_LOG("verbose\n");
2080 			break;
2081 		case 'V':
2082 			cb->validate++;
2083 			DEBUG_LOG("validate data\n");
2084 			break;
2085 		case 'l':
2086 			cb->wlat++;
2087 			break;
2088 		case 'L':
2089 			cb->rlat++;
2090 			break;
2091 		case 'B':
2092 			cb->bw++;
2093 			break;
2094 		case 'd':
2095 			cb->duplex++;
2096 			break;
2097 		case 'I':
2098 			cb->server_invalidate = 1;
2099 			break;
2100 		case 'T':
2101 			cb->txdepth = optint;
2102 			DEBUG_LOG("txdepth %d\n", (int) cb->txdepth);
2103 			break;
2104 		case 'Z':
2105 			cb->local_dma_lkey = 1;
2106 			DEBUG_LOG("using local dma lkey\n");
2107 			break;
2108 		case 'R':
2109 			cb->read_inv = 1;
2110 			DEBUG_LOG("using read-with-inv\n");
2111 			break;
2112 		case 'f':
2113 			cb->frtest = 1;
2114 			DEBUG_LOG("fast-reg test!\n");
2115 			break;
2116 		default:
2117 			printk(KERN_ERR PFX "unknown opt %s\n", optarg);
2118 			ret = -EINVAL;
2119 			break;
2120 		}
2121 	}
2122 	if (ret)
2123 		goto out;
2124 
2125 	if (cb->server == -1) {
2126 		printk(KERN_ERR PFX "must be either client or server\n");
2127 		ret = -EINVAL;
2128 		goto out;
2129 	}
2130 
2131 	if (cb->server && cb->frtest) {
2132 		printk(KERN_ERR PFX "must be client to run frtest\n");
2133 		ret = -EINVAL;
2134 		goto out;
2135 	}
2136 
2137 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2138 		printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n");
2139 		ret = -EINVAL;
2140 		goto out;
2141 	}
2142 
2143 	if (cb->wlat || cb->rlat || cb->bw) {
2144 		printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n");
2145 		ret = -EINVAL;
2146 		goto out;
2147 	}
2148 
2149 	cb->cm_id = rdma_create_id(&init_net, krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
2150 	if (IS_ERR(cb->cm_id)) {
2151 		ret = PTR_ERR(cb->cm_id);
2152 		printk(KERN_ERR PFX "rdma_create_id error %d\n", ret);
2153 		goto out;
2154 	}
2155 	DEBUG_LOG("created cm_id %p\n", cb->cm_id);
2156 
2157 	if (cb->server)
2158 		krping_run_server(cb);
2159 	else
2160 		krping_run_client(cb);
2161 
2162 	DEBUG_LOG("destroy cm_id %p\n", cb->cm_id);
2163 	rdma_destroy_id(cb->cm_id);
2164 out:
2165 	mutex_lock(&krping_mutex);
2166 	list_del(&cb->list);
2167 	mutex_unlock(&krping_mutex);
2168 	kfree(cb);
2169 	return ret;
2170 }
2171 
2172 void
2173 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2174 {
2175 	struct krping_cb *cb;
2176 
2177 	mutex_lock(&krping_mutex);
2178 	list_for_each_entry(cb, &krping_cbs, list)
2179 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2180 	mutex_unlock(&krping_mutex);
2181 }
2182