/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blk-mq.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
#include "aoe.h"

static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}

/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and must be responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */
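
/*
 * Illustrative sketch only (not part of the driver): the pairing
 * implied by the comment above is that a caller looks the device
 * up, uses it, and then drops its reference, e.g.
 *
 *	d = aoedev_by_aoeaddr(maj, min, 0);	// 0: do not allocate
 *	if (d) {
 *		... use d ...
 *		aoedev_put(d);
 *	}
 */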

void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

static void
dummy_timer(struct timer_list *t)
{
	struct aoedev *d;

	d = timer_container_of(d, t, timer);
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct aoe_req *req;
	struct bio *bio;

	aoe_failbuf(d, d->ip.buf);
	rq = d->ip.rq;
	if (rq == NULL)
		return;

	req = blk_mq_rq_to_pdu(rq);
	while ((bio = d->ip.nxbio)) {
		bio->bi_status = BLK_STS_IOERR;
		d->ip.nxbio = bio->bi_next;
		req->nr_bios--;
	}

	if (!req->nr_bios)
		aoe_end_request(d, rq, 0);
}

static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	struct request *rq, *rqnext;
	int i;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	d->flags &= ~(DEVFL_UP | DEVFL_DEAD);
	spin_unlock_irqrestore(&d->lock, flags);

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* clean out any queued block requests */
	list_for_each_entry_safe(rq, rqnext, &d->rq_list, queuelist) {
		list_del_init(&rq->queuelist);
		blk_mq_start_request(rq);
		blk_mq_end_request(rq, BLK_STS_IOERR);
	}

	/* fast fail all pending I/O */
	if (d->blkq) {
		/* UP is cleared, freeze+quiesce to ensure all are errored */
		unsigned int memflags = blk_mq_freeze_queue(d->blkq);

		blk_mq_quiesce_queue(d->blkq);
		blk_mq_unquiesce_queue(d->blkq);
		blk_mq_unfreeze_queue(d->blkq, memflags);
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	const char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = kbasename(d->gd->disk_name);
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	timer_delete_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_debugfs(d);
		del_gendisk(d->gd);
		put_disk(d->gd);
		blk_mq_free_tag_set(&d->tag_set);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);

	mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}

enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};
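
/*
 * Descriptive summary of the teardown sequence below: flush() makes
 * three passes over devlist.  Pass one calls aoedev_downdev(), which
 * can sleep, and marks each selected device with DEVFL_TKILL.  Pass
 * two calls freedev(), which can also sleep, on DEVFL_TKILL devices;
 * freedev() sets DEVFL_FREEING while it works and DEVFL_FREED once
 * the device's resources are released.  Pass three unlinks
 * DEVFL_FREED devices from devlist and frees the aoedev structures.
 */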
static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_workqueue(aoe_wq);
	/* pass one: do aoedev_downdev, which might sleep */
restart1:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL)
			goto cont;

		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		spin_unlock(&d->lock);
		spin_unlock_irqrestore(&devlist_lock, flags);
		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
		goto restart1;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart2:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart2;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}
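
/*
 * With Sms == 250 and Tms == 30 * 1000 in skbfree() below, the wait
 * loop polls in 250 ms steps for at most Tms / Sms == 120 iterations,
 * i.e. about 30 seconds, for the skb's dataref to drop to 1 before
 * giving up and deliberately leaking the skb.
 */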

/* This has been confirmed to occur once with Tms=3*1000 due to the
 * driver changing link and not processing its transmit ring.  The
 * problem is hard enough to solve by returning an error that I'm
 * still punting on "solving" this.
 */
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}

/* find it or allocate it */
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d = devlist; d; d = d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	INIT_LIST_HEAD(&d->rq_list);
	skb_queue_head_init(&d->skbpool);
	timer_setup(&d->timer, dummy_timer, 0);
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

void
aoedev_exit(void)
{
	flush_workqueue(aoe_wq);
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}