1 #include <linux/errno.h> 2 #include <linux/numa.h> 3 #include <linux/slab.h> 4 #include <linux/rculist.h> 5 #include <linux/threads.h> 6 #include <linux/preempt.h> 7 #include <linux/irqflags.h> 8 #include <linux/vmalloc.h> 9 #include <linux/mm.h> 10 #include <linux/module.h> 11 #include <linux/device-mapper.h> 12 13 #include "dm-core.h" 14 #include "dm-stats.h" 15 16 #define DM_MSG_PREFIX "stats" 17 18 static int dm_stat_need_rcu_barrier; 19 20 /* 21 * Using 64-bit values to avoid overflow (which is a 22 * problem that block/genhd.c's IO accounting has). 23 */ 24 struct dm_stat_percpu { 25 unsigned long long sectors[2]; 26 unsigned long long ios[2]; 27 unsigned long long merges[2]; 28 unsigned long long ticks[2]; 29 unsigned long long io_ticks[2]; 30 unsigned long long io_ticks_total; 31 unsigned long long time_in_queue; 32 unsigned long long *histogram; 33 }; 34 35 struct dm_stat_shared { 36 atomic_t in_flight[2]; 37 unsigned long long stamp; 38 struct dm_stat_percpu tmp; 39 }; 40 41 struct dm_stat { 42 struct list_head list_entry; 43 int id; 44 unsigned stat_flags; 45 size_t n_entries; 46 sector_t start; 47 sector_t end; 48 sector_t step; 49 unsigned n_histogram_entries; 50 unsigned long long *histogram_boundaries; 51 const char *program_id; 52 const char *aux_data; 53 struct rcu_head rcu_head; 54 size_t shared_alloc_size; 55 size_t percpu_alloc_size; 56 size_t histogram_alloc_size; 57 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 58 struct dm_stat_shared stat_shared[0]; 59 }; 60 61 #define STAT_PRECISE_TIMESTAMPS 1 62 63 struct dm_stats_last_position { 64 sector_t last_sector; 65 unsigned last_rw; 66 }; 67 68 /* 69 * A typo on the command line could possibly make the kernel run out of memory 70 * and crash. To prevent the crash we account all used memory. We fail if we 71 * exhaust 1/4 of all memory or 1/2 of vmalloc space. 72 */ 73 #define DM_STATS_MEMORY_FACTOR 4 74 #define DM_STATS_VMALLOC_FACTOR 2 75 76 static DEFINE_SPINLOCK(shared_memory_lock); 77 78 static unsigned long shared_memory_amount; 79 80 static bool __check_shared_memory(size_t alloc_size) 81 { 82 size_t a; 83 84 a = shared_memory_amount + alloc_size; 85 if (a < shared_memory_amount) 86 return false; 87 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) 88 return false; 89 #ifdef CONFIG_MMU 90 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) 91 return false; 92 #endif 93 return true; 94 } 95 96 static bool check_shared_memory(size_t alloc_size) 97 { 98 bool ret; 99 100 spin_lock_irq(&shared_memory_lock); 101 102 ret = __check_shared_memory(alloc_size); 103 104 spin_unlock_irq(&shared_memory_lock); 105 106 return ret; 107 } 108 109 static bool claim_shared_memory(size_t alloc_size) 110 { 111 spin_lock_irq(&shared_memory_lock); 112 113 if (!__check_shared_memory(alloc_size)) { 114 spin_unlock_irq(&shared_memory_lock); 115 return false; 116 } 117 118 shared_memory_amount += alloc_size; 119 120 spin_unlock_irq(&shared_memory_lock); 121 122 return true; 123 } 124 125 static void free_shared_memory(size_t alloc_size) 126 { 127 unsigned long flags; 128 129 spin_lock_irqsave(&shared_memory_lock, flags); 130 131 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { 132 spin_unlock_irqrestore(&shared_memory_lock, flags); 133 DMCRIT("Memory usage accounting bug."); 134 return; 135 } 136 137 shared_memory_amount -= alloc_size; 138 139 spin_unlock_irqrestore(&shared_memory_lock, flags); 140 } 141 142 static void *dm_kvzalloc(size_t alloc_size, int node) 143 { 144 void *p; 145 146 if (!claim_shared_memory(alloc_size)) 147 return NULL; 148 149 p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); 150 if (p) 151 return p; 152 153 free_shared_memory(alloc_size); 154 155 return NULL; 156 } 157 158 static void dm_kvfree(void *ptr, size_t alloc_size) 159 { 160 if (!ptr) 161 return; 162 163 free_shared_memory(alloc_size); 164 165 kvfree(ptr); 166 } 167 168 static void dm_stat_free(struct rcu_head *head) 169 { 170 int cpu; 171 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 172 173 kfree(s->histogram_boundaries); 174 kfree(s->program_id); 175 kfree(s->aux_data); 176 for_each_possible_cpu(cpu) { 177 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 178 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 179 } 180 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 181 dm_kvfree(s, s->shared_alloc_size); 182 } 183 184 static int dm_stat_in_flight(struct dm_stat_shared *shared) 185 { 186 return atomic_read(&shared->in_flight[READ]) + 187 atomic_read(&shared->in_flight[WRITE]); 188 } 189 190 void dm_stats_init(struct dm_stats *stats) 191 { 192 int cpu; 193 struct dm_stats_last_position *last; 194 195 mutex_init(&stats->mutex); 196 INIT_LIST_HEAD(&stats->list); 197 stats->last = alloc_percpu(struct dm_stats_last_position); 198 for_each_possible_cpu(cpu) { 199 last = per_cpu_ptr(stats->last, cpu); 200 last->last_sector = (sector_t)ULLONG_MAX; 201 last->last_rw = UINT_MAX; 202 } 203 } 204 205 void dm_stats_cleanup(struct dm_stats *stats) 206 { 207 size_t ni; 208 struct dm_stat *s; 209 struct dm_stat_shared *shared; 210 211 while (!list_empty(&stats->list)) { 212 s = container_of(stats->list.next, struct dm_stat, list_entry); 213 list_del(&s->list_entry); 214 for (ni = 0; ni < s->n_entries; ni++) { 215 shared = &s->stat_shared[ni]; 216 if (WARN_ON(dm_stat_in_flight(shared))) { 217 DMCRIT("leaked in-flight counter at index %lu " 218 "(start %llu, end %llu, step %llu): reads %d, writes %d", 219 (unsigned long)ni, 220 (unsigned long long)s->start, 221 (unsigned long long)s->end, 222 (unsigned long long)s->step, 223 atomic_read(&shared->in_flight[READ]), 224 atomic_read(&shared->in_flight[WRITE])); 225 } 226 } 227 dm_stat_free(&s->rcu_head); 228 } 229 free_percpu(stats->last); 230 } 231 232 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 233 sector_t step, unsigned stat_flags, 234 unsigned n_histogram_entries, 235 unsigned long long *histogram_boundaries, 236 const char *program_id, const char *aux_data, 237 void (*suspend_callback)(struct mapped_device *), 238 void (*resume_callback)(struct mapped_device *), 239 struct mapped_device *md) 240 { 241 struct list_head *l; 242 struct dm_stat *s, *tmp_s; 243 sector_t n_entries; 244 size_t ni; 245 size_t shared_alloc_size; 246 size_t percpu_alloc_size; 247 size_t histogram_alloc_size; 248 struct dm_stat_percpu *p; 249 int cpu; 250 int ret_id; 251 int r; 252 253 if (end < start || !step) 254 return -EINVAL; 255 256 n_entries = end - start; 257 if (dm_sector_div64(n_entries, step)) 258 n_entries++; 259 260 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 261 return -EOVERFLOW; 262 263 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); 264 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 265 return -EOVERFLOW; 266 267 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); 268 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 269 return -EOVERFLOW; 270 271 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 272 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 273 return -EOVERFLOW; 274 275 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 276 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 277 return -ENOMEM; 278 279 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); 280 if (!s) 281 return -ENOMEM; 282 283 s->stat_flags = stat_flags; 284 s->n_entries = n_entries; 285 s->start = start; 286 s->end = end; 287 s->step = step; 288 s->shared_alloc_size = shared_alloc_size; 289 s->percpu_alloc_size = percpu_alloc_size; 290 s->histogram_alloc_size = histogram_alloc_size; 291 292 s->n_histogram_entries = n_histogram_entries; 293 s->histogram_boundaries = kmemdup(histogram_boundaries, 294 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 295 if (!s->histogram_boundaries) { 296 r = -ENOMEM; 297 goto out; 298 } 299 300 s->program_id = kstrdup(program_id, GFP_KERNEL); 301 if (!s->program_id) { 302 r = -ENOMEM; 303 goto out; 304 } 305 s->aux_data = kstrdup(aux_data, GFP_KERNEL); 306 if (!s->aux_data) { 307 r = -ENOMEM; 308 goto out; 309 } 310 311 for (ni = 0; ni < n_entries; ni++) { 312 atomic_set(&s->stat_shared[ni].in_flight[READ], 0); 313 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 314 } 315 316 if (s->n_histogram_entries) { 317 unsigned long long *hi; 318 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 319 if (!hi) { 320 r = -ENOMEM; 321 goto out; 322 } 323 for (ni = 0; ni < n_entries; ni++) { 324 s->stat_shared[ni].tmp.histogram = hi; 325 hi += s->n_histogram_entries + 1; 326 } 327 } 328 329 for_each_possible_cpu(cpu) { 330 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 331 if (!p) { 332 r = -ENOMEM; 333 goto out; 334 } 335 s->stat_percpu[cpu] = p; 336 if (s->n_histogram_entries) { 337 unsigned long long *hi; 338 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 339 if (!hi) { 340 r = -ENOMEM; 341 goto out; 342 } 343 for (ni = 0; ni < n_entries; ni++) { 344 p[ni].histogram = hi; 345 hi += s->n_histogram_entries + 1; 346 } 347 } 348 } 349 350 /* 351 * Suspend/resume to make sure there is no i/o in flight, 352 * so that newly created statistics will be exact. 353 * 354 * (note: we couldn't suspend earlier because we must not 355 * allocate memory while suspended) 356 */ 357 suspend_callback(md); 358 359 mutex_lock(&stats->mutex); 360 s->id = 0; 361 list_for_each(l, &stats->list) { 362 tmp_s = container_of(l, struct dm_stat, list_entry); 363 if (WARN_ON(tmp_s->id < s->id)) { 364 r = -EINVAL; 365 goto out_unlock_resume; 366 } 367 if (tmp_s->id > s->id) 368 break; 369 if (unlikely(s->id == INT_MAX)) { 370 r = -ENFILE; 371 goto out_unlock_resume; 372 } 373 s->id++; 374 } 375 ret_id = s->id; 376 list_add_tail_rcu(&s->list_entry, l); 377 mutex_unlock(&stats->mutex); 378 379 resume_callback(md); 380 381 return ret_id; 382 383 out_unlock_resume: 384 mutex_unlock(&stats->mutex); 385 resume_callback(md); 386 out: 387 dm_stat_free(&s->rcu_head); 388 return r; 389 } 390 391 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) 392 { 393 struct dm_stat *s; 394 395 list_for_each_entry(s, &stats->list, list_entry) { 396 if (s->id > id) 397 break; 398 if (s->id == id) 399 return s; 400 } 401 402 return NULL; 403 } 404 405 static int dm_stats_delete(struct dm_stats *stats, int id) 406 { 407 struct dm_stat *s; 408 int cpu; 409 410 mutex_lock(&stats->mutex); 411 412 s = __dm_stats_find(stats, id); 413 if (!s) { 414 mutex_unlock(&stats->mutex); 415 return -ENOENT; 416 } 417 418 list_del_rcu(&s->list_entry); 419 mutex_unlock(&stats->mutex); 420 421 /* 422 * vfree can't be called from RCU callback 423 */ 424 for_each_possible_cpu(cpu) 425 if (is_vmalloc_addr(s->stat_percpu) || 426 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 427 goto do_sync_free; 428 if (is_vmalloc_addr(s) || 429 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 430 do_sync_free: 431 synchronize_rcu_expedited(); 432 dm_stat_free(&s->rcu_head); 433 } else { 434 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; 435 call_rcu(&s->rcu_head, dm_stat_free); 436 } 437 return 0; 438 } 439 440 static int dm_stats_list(struct dm_stats *stats, const char *program, 441 char *result, unsigned maxlen) 442 { 443 struct dm_stat *s; 444 sector_t len; 445 unsigned sz = 0; 446 447 /* 448 * Output format: 449 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> 450 */ 451 452 mutex_lock(&stats->mutex); 453 list_for_each_entry(s, &stats->list, list_entry) { 454 if (!program || !strcmp(program, s->program_id)) { 455 len = s->end - s->start; 456 DMEMIT("%d: %llu+%llu %llu %s %s", s->id, 457 (unsigned long long)s->start, 458 (unsigned long long)len, 459 (unsigned long long)s->step, 460 s->program_id, 461 s->aux_data); 462 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 463 DMEMIT(" precise_timestamps"); 464 if (s->n_histogram_entries) { 465 unsigned i; 466 DMEMIT(" histogram:"); 467 for (i = 0; i < s->n_histogram_entries; i++) { 468 if (i) 469 DMEMIT(","); 470 DMEMIT("%llu", s->histogram_boundaries[i]); 471 } 472 } 473 DMEMIT("\n"); 474 } 475 } 476 mutex_unlock(&stats->mutex); 477 478 return 1; 479 } 480 481 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 482 struct dm_stat_percpu *p) 483 { 484 /* 485 * This is racy, but so is part_round_stats_single. 486 */ 487 unsigned long long now, difference; 488 unsigned in_flight_read, in_flight_write; 489 490 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 491 now = jiffies; 492 else 493 now = ktime_to_ns(ktime_get()); 494 495 difference = now - shared->stamp; 496 if (!difference) 497 return; 498 499 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); 500 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); 501 if (in_flight_read) 502 p->io_ticks[READ] += difference; 503 if (in_flight_write) 504 p->io_ticks[WRITE] += difference; 505 if (in_flight_read + in_flight_write) { 506 p->io_ticks_total += difference; 507 p->time_in_queue += (in_flight_read + in_flight_write) * difference; 508 } 509 shared->stamp = now; 510 } 511 512 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 513 int idx, sector_t len, 514 struct dm_stats_aux *stats_aux, bool end, 515 unsigned long duration_jiffies) 516 { 517 struct dm_stat_shared *shared = &s->stat_shared[entry]; 518 struct dm_stat_percpu *p; 519 520 /* 521 * For strict correctness we should use local_irq_save/restore 522 * instead of preempt_disable/enable. 523 * 524 * preempt_disable/enable is racy if the driver finishes bios 525 * from non-interrupt context as well as from interrupt context 526 * or from more different interrupts. 527 * 528 * On 64-bit architectures the race only results in not counting some 529 * events, so it is acceptable. On 32-bit architectures the race could 530 * cause the counter going off by 2^32, so we need to do proper locking 531 * there. 532 * 533 * part_stat_lock()/part_stat_unlock() have this race too. 534 */ 535 #if BITS_PER_LONG == 32 536 unsigned long flags; 537 local_irq_save(flags); 538 #else 539 preempt_disable(); 540 #endif 541 p = &s->stat_percpu[smp_processor_id()][entry]; 542 543 if (!end) { 544 dm_stat_round(s, shared, p); 545 atomic_inc(&shared->in_flight[idx]); 546 } else { 547 unsigned long long duration; 548 dm_stat_round(s, shared, p); 549 atomic_dec(&shared->in_flight[idx]); 550 p->sectors[idx] += len; 551 p->ios[idx] += 1; 552 p->merges[idx] += stats_aux->merged; 553 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 554 p->ticks[idx] += duration_jiffies; 555 duration = jiffies_to_msecs(duration_jiffies); 556 } else { 557 p->ticks[idx] += stats_aux->duration_ns; 558 duration = stats_aux->duration_ns; 559 } 560 if (s->n_histogram_entries) { 561 unsigned lo = 0, hi = s->n_histogram_entries + 1; 562 while (lo + 1 < hi) { 563 unsigned mid = (lo + hi) / 2; 564 if (s->histogram_boundaries[mid - 1] > duration) { 565 hi = mid; 566 } else { 567 lo = mid; 568 } 569 570 } 571 p->histogram[lo]++; 572 } 573 } 574 575 #if BITS_PER_LONG == 32 576 local_irq_restore(flags); 577 #else 578 preempt_enable(); 579 #endif 580 } 581 582 static void __dm_stat_bio(struct dm_stat *s, int bi_rw, 583 sector_t bi_sector, sector_t end_sector, 584 bool end, unsigned long duration_jiffies, 585 struct dm_stats_aux *stats_aux) 586 { 587 sector_t rel_sector, offset, todo, fragment_len; 588 size_t entry; 589 590 if (end_sector <= s->start || bi_sector >= s->end) 591 return; 592 if (unlikely(bi_sector < s->start)) { 593 rel_sector = 0; 594 todo = end_sector - s->start; 595 } else { 596 rel_sector = bi_sector - s->start; 597 todo = end_sector - bi_sector; 598 } 599 if (unlikely(end_sector > s->end)) 600 todo -= (end_sector - s->end); 601 602 offset = dm_sector_div64(rel_sector, s->step); 603 entry = rel_sector; 604 do { 605 if (WARN_ON_ONCE(entry >= s->n_entries)) { 606 DMCRIT("Invalid area access in region id %d", s->id); 607 return; 608 } 609 fragment_len = todo; 610 if (fragment_len > s->step - offset) 611 fragment_len = s->step - offset; 612 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 613 stats_aux, end, duration_jiffies); 614 todo -= fragment_len; 615 entry++; 616 offset = 0; 617 } while (unlikely(todo != 0)); 618 } 619 620 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 621 sector_t bi_sector, unsigned bi_sectors, bool end, 622 unsigned long duration_jiffies, 623 struct dm_stats_aux *stats_aux) 624 { 625 struct dm_stat *s; 626 sector_t end_sector; 627 struct dm_stats_last_position *last; 628 bool got_precise_time; 629 630 if (unlikely(!bi_sectors)) 631 return; 632 633 end_sector = bi_sector + bi_sectors; 634 635 if (!end) { 636 /* 637 * A race condition can at worst result in the merged flag being 638 * misrepresented, so we don't have to disable preemption here. 639 */ 640 last = raw_cpu_ptr(stats->last); 641 stats_aux->merged = 642 (bi_sector == (ACCESS_ONCE(last->last_sector) && 643 ((bi_rw == WRITE) == 644 (ACCESS_ONCE(last->last_rw) == WRITE)) 645 )); 646 ACCESS_ONCE(last->last_sector) = end_sector; 647 ACCESS_ONCE(last->last_rw) = bi_rw; 648 } 649 650 rcu_read_lock(); 651 652 got_precise_time = false; 653 list_for_each_entry_rcu(s, &stats->list, list_entry) { 654 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 655 if (!end) 656 stats_aux->duration_ns = ktime_to_ns(ktime_get()); 657 else 658 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 659 got_precise_time = true; 660 } 661 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 662 } 663 664 rcu_read_unlock(); 665 } 666 667 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 668 struct dm_stat *s, size_t x) 669 { 670 int cpu; 671 struct dm_stat_percpu *p; 672 673 local_irq_disable(); 674 p = &s->stat_percpu[smp_processor_id()][x]; 675 dm_stat_round(s, shared, p); 676 local_irq_enable(); 677 678 shared->tmp.sectors[READ] = 0; 679 shared->tmp.sectors[WRITE] = 0; 680 shared->tmp.ios[READ] = 0; 681 shared->tmp.ios[WRITE] = 0; 682 shared->tmp.merges[READ] = 0; 683 shared->tmp.merges[WRITE] = 0; 684 shared->tmp.ticks[READ] = 0; 685 shared->tmp.ticks[WRITE] = 0; 686 shared->tmp.io_ticks[READ] = 0; 687 shared->tmp.io_ticks[WRITE] = 0; 688 shared->tmp.io_ticks_total = 0; 689 shared->tmp.time_in_queue = 0; 690 691 if (s->n_histogram_entries) 692 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 693 694 for_each_possible_cpu(cpu) { 695 p = &s->stat_percpu[cpu][x]; 696 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); 697 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); 698 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); 699 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); 700 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); 701 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); 702 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); 703 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); 704 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); 705 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 706 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 707 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); 708 if (s->n_histogram_entries) { 709 unsigned i; 710 for (i = 0; i < s->n_histogram_entries + 1; i++) 711 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); 712 } 713 } 714 } 715 716 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 717 bool init_tmp_percpu_totals) 718 { 719 size_t x; 720 struct dm_stat_shared *shared; 721 struct dm_stat_percpu *p; 722 723 for (x = idx_start; x < idx_end; x++) { 724 shared = &s->stat_shared[x]; 725 if (init_tmp_percpu_totals) 726 __dm_stat_init_temporary_percpu_totals(shared, s, x); 727 local_irq_disable(); 728 p = &s->stat_percpu[smp_processor_id()][x]; 729 p->sectors[READ] -= shared->tmp.sectors[READ]; 730 p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 731 p->ios[READ] -= shared->tmp.ios[READ]; 732 p->ios[WRITE] -= shared->tmp.ios[WRITE]; 733 p->merges[READ] -= shared->tmp.merges[READ]; 734 p->merges[WRITE] -= shared->tmp.merges[WRITE]; 735 p->ticks[READ] -= shared->tmp.ticks[READ]; 736 p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 737 p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 738 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 739 p->io_ticks_total -= shared->tmp.io_ticks_total; 740 p->time_in_queue -= shared->tmp.time_in_queue; 741 local_irq_enable(); 742 if (s->n_histogram_entries) { 743 unsigned i; 744 for (i = 0; i < s->n_histogram_entries + 1; i++) { 745 local_irq_disable(); 746 p = &s->stat_percpu[smp_processor_id()][x]; 747 p->histogram[i] -= shared->tmp.histogram[i]; 748 local_irq_enable(); 749 } 750 } 751 } 752 } 753 754 static int dm_stats_clear(struct dm_stats *stats, int id) 755 { 756 struct dm_stat *s; 757 758 mutex_lock(&stats->mutex); 759 760 s = __dm_stats_find(stats, id); 761 if (!s) { 762 mutex_unlock(&stats->mutex); 763 return -ENOENT; 764 } 765 766 __dm_stat_clear(s, 0, s->n_entries, true); 767 768 mutex_unlock(&stats->mutex); 769 770 return 1; 771 } 772 773 /* 774 * This is like jiffies_to_msec, but works for 64-bit values. 775 */ 776 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 777 { 778 unsigned long long result; 779 unsigned mult; 780 781 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 782 return j; 783 784 result = 0; 785 if (j) 786 result = jiffies_to_msecs(j & 0x3fffff); 787 if (j >= 1 << 22) { 788 mult = jiffies_to_msecs(1 << 22); 789 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 790 } 791 if (j >= 1ULL << 44) 792 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 793 794 return result; 795 } 796 797 static int dm_stats_print(struct dm_stats *stats, int id, 798 size_t idx_start, size_t idx_len, 799 bool clear, char *result, unsigned maxlen) 800 { 801 unsigned sz = 0; 802 struct dm_stat *s; 803 size_t x; 804 sector_t start, end, step; 805 size_t idx_end; 806 struct dm_stat_shared *shared; 807 808 /* 809 * Output format: 810 * <start_sector>+<length> counters 811 */ 812 813 mutex_lock(&stats->mutex); 814 815 s = __dm_stats_find(stats, id); 816 if (!s) { 817 mutex_unlock(&stats->mutex); 818 return -ENOENT; 819 } 820 821 idx_end = idx_start + idx_len; 822 if (idx_end < idx_start || 823 idx_end > s->n_entries) 824 idx_end = s->n_entries; 825 826 if (idx_start > idx_end) 827 idx_start = idx_end; 828 829 step = s->step; 830 start = s->start + (step * idx_start); 831 832 for (x = idx_start; x < idx_end; x++, start = end) { 833 shared = &s->stat_shared[x]; 834 end = start + step; 835 if (unlikely(end > s->end)) 836 end = s->end; 837 838 __dm_stat_init_temporary_percpu_totals(shared, s, x); 839 840 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 841 (unsigned long long)start, 842 (unsigned long long)step, 843 shared->tmp.ios[READ], 844 shared->tmp.merges[READ], 845 shared->tmp.sectors[READ], 846 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 847 shared->tmp.ios[WRITE], 848 shared->tmp.merges[WRITE], 849 shared->tmp.sectors[WRITE], 850 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 851 dm_stat_in_flight(shared), 852 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 853 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 854 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 855 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 856 if (s->n_histogram_entries) { 857 unsigned i; 858 for (i = 0; i < s->n_histogram_entries + 1; i++) { 859 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); 860 } 861 } 862 DMEMIT("\n"); 863 864 if (unlikely(sz + 1 >= maxlen)) 865 goto buffer_overflow; 866 } 867 868 if (clear) 869 __dm_stat_clear(s, idx_start, idx_end, false); 870 871 buffer_overflow: 872 mutex_unlock(&stats->mutex); 873 874 return 1; 875 } 876 877 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 878 { 879 struct dm_stat *s; 880 const char *new_aux_data; 881 882 mutex_lock(&stats->mutex); 883 884 s = __dm_stats_find(stats, id); 885 if (!s) { 886 mutex_unlock(&stats->mutex); 887 return -ENOENT; 888 } 889 890 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 891 if (!new_aux_data) { 892 mutex_unlock(&stats->mutex); 893 return -ENOMEM; 894 } 895 896 kfree(s->aux_data); 897 s->aux_data = new_aux_data; 898 899 mutex_unlock(&stats->mutex); 900 901 return 0; 902 } 903 904 static int parse_histogram(const char *h, unsigned *n_histogram_entries, 905 unsigned long long **histogram_boundaries) 906 { 907 const char *q; 908 unsigned n; 909 unsigned long long last; 910 911 *n_histogram_entries = 1; 912 for (q = h; *q; q++) 913 if (*q == ',') 914 (*n_histogram_entries)++; 915 916 *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 917 if (!*histogram_boundaries) 918 return -ENOMEM; 919 920 n = 0; 921 last = 0; 922 while (1) { 923 unsigned long long hi; 924 int s; 925 char ch; 926 s = sscanf(h, "%llu%c", &hi, &ch); 927 if (!s || (s == 2 && ch != ',')) 928 return -EINVAL; 929 if (hi <= last) 930 return -EINVAL; 931 last = hi; 932 (*histogram_boundaries)[n] = hi; 933 if (s == 1) 934 return 0; 935 h = strchr(h, ',') + 1; 936 n++; 937 } 938 } 939 940 static int message_stats_create(struct mapped_device *md, 941 unsigned argc, char **argv, 942 char *result, unsigned maxlen) 943 { 944 int r; 945 int id; 946 char dummy; 947 unsigned long long start, end, len, step; 948 unsigned divisor; 949 const char *program_id, *aux_data; 950 unsigned stat_flags = 0; 951 952 unsigned n_histogram_entries = 0; 953 unsigned long long *histogram_boundaries = NULL; 954 955 struct dm_arg_set as, as_backup; 956 const char *a; 957 unsigned feature_args; 958 959 /* 960 * Input format: 961 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 962 */ 963 964 if (argc < 3) 965 goto ret_einval; 966 967 as.argc = argc; 968 as.argv = argv; 969 dm_consume_args(&as, 1); 970 971 a = dm_shift_arg(&as); 972 if (!strcmp(a, "-")) { 973 start = 0; 974 len = dm_get_size(md); 975 if (!len) 976 len = 1; 977 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 978 start != (sector_t)start || len != (sector_t)len) 979 goto ret_einval; 980 981 end = start + len; 982 if (start >= end) 983 goto ret_einval; 984 985 a = dm_shift_arg(&as); 986 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 987 if (!divisor) 988 return -EINVAL; 989 step = end - start; 990 if (do_div(step, divisor)) 991 step++; 992 if (!step) 993 step = 1; 994 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 995 step != (sector_t)step || !step) 996 goto ret_einval; 997 998 as_backup = as; 999 a = dm_shift_arg(&as); 1000 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 1001 while (feature_args--) { 1002 a = dm_shift_arg(&as); 1003 if (!a) 1004 goto ret_einval; 1005 if (!strcasecmp(a, "precise_timestamps")) 1006 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1007 else if (!strncasecmp(a, "histogram:", 10)) { 1008 if (n_histogram_entries) 1009 goto ret_einval; 1010 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) 1011 goto ret; 1012 } else 1013 goto ret_einval; 1014 } 1015 } else { 1016 as = as_backup; 1017 } 1018 1019 program_id = "-"; 1020 aux_data = "-"; 1021 1022 a = dm_shift_arg(&as); 1023 if (a) 1024 program_id = a; 1025 1026 a = dm_shift_arg(&as); 1027 if (a) 1028 aux_data = a; 1029 1030 if (as.argc) 1031 goto ret_einval; 1032 1033 /* 1034 * If a buffer overflow happens after we created the region, 1035 * it's too late (the userspace would retry with a larger 1036 * buffer, but the region id that caused the overflow is already 1037 * leaked). So we must detect buffer overflow in advance. 1038 */ 1039 snprintf(result, maxlen, "%d", INT_MAX); 1040 if (dm_message_test_buffer_overflow(result, maxlen)) { 1041 r = 1; 1042 goto ret; 1043 } 1044 1045 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 1046 n_histogram_entries, histogram_boundaries, program_id, aux_data, 1047 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1048 if (id < 0) { 1049 r = id; 1050 goto ret; 1051 } 1052 1053 snprintf(result, maxlen, "%d", id); 1054 1055 r = 1; 1056 goto ret; 1057 1058 ret_einval: 1059 r = -EINVAL; 1060 ret: 1061 kfree(histogram_boundaries); 1062 return r; 1063 } 1064 1065 static int message_stats_delete(struct mapped_device *md, 1066 unsigned argc, char **argv) 1067 { 1068 int id; 1069 char dummy; 1070 1071 if (argc != 2) 1072 return -EINVAL; 1073 1074 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1075 return -EINVAL; 1076 1077 return dm_stats_delete(dm_get_stats(md), id); 1078 } 1079 1080 static int message_stats_clear(struct mapped_device *md, 1081 unsigned argc, char **argv) 1082 { 1083 int id; 1084 char dummy; 1085 1086 if (argc != 2) 1087 return -EINVAL; 1088 1089 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1090 return -EINVAL; 1091 1092 return dm_stats_clear(dm_get_stats(md), id); 1093 } 1094 1095 static int message_stats_list(struct mapped_device *md, 1096 unsigned argc, char **argv, 1097 char *result, unsigned maxlen) 1098 { 1099 int r; 1100 const char *program = NULL; 1101 1102 if (argc < 1 || argc > 2) 1103 return -EINVAL; 1104 1105 if (argc > 1) { 1106 program = kstrdup(argv[1], GFP_KERNEL); 1107 if (!program) 1108 return -ENOMEM; 1109 } 1110 1111 r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 1112 1113 kfree(program); 1114 1115 return r; 1116 } 1117 1118 static int message_stats_print(struct mapped_device *md, 1119 unsigned argc, char **argv, bool clear, 1120 char *result, unsigned maxlen) 1121 { 1122 int id; 1123 char dummy; 1124 unsigned long idx_start = 0, idx_len = ULONG_MAX; 1125 1126 if (argc != 2 && argc != 4) 1127 return -EINVAL; 1128 1129 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1130 return -EINVAL; 1131 1132 if (argc > 3) { 1133 if (strcmp(argv[2], "-") && 1134 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 1135 return -EINVAL; 1136 if (strcmp(argv[3], "-") && 1137 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 1138 return -EINVAL; 1139 } 1140 1141 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 1142 result, maxlen); 1143 } 1144 1145 static int message_stats_set_aux(struct mapped_device *md, 1146 unsigned argc, char **argv) 1147 { 1148 int id; 1149 char dummy; 1150 1151 if (argc != 3) 1152 return -EINVAL; 1153 1154 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1155 return -EINVAL; 1156 1157 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 1158 } 1159 1160 int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 1161 char *result, unsigned maxlen) 1162 { 1163 int r; 1164 1165 /* All messages here must start with '@' */ 1166 if (!strcasecmp(argv[0], "@stats_create")) 1167 r = message_stats_create(md, argc, argv, result, maxlen); 1168 else if (!strcasecmp(argv[0], "@stats_delete")) 1169 r = message_stats_delete(md, argc, argv); 1170 else if (!strcasecmp(argv[0], "@stats_clear")) 1171 r = message_stats_clear(md, argc, argv); 1172 else if (!strcasecmp(argv[0], "@stats_list")) 1173 r = message_stats_list(md, argc, argv, result, maxlen); 1174 else if (!strcasecmp(argv[0], "@stats_print")) 1175 r = message_stats_print(md, argc, argv, false, result, maxlen); 1176 else if (!strcasecmp(argv[0], "@stats_print_clear")) 1177 r = message_stats_print(md, argc, argv, true, result, maxlen); 1178 else if (!strcasecmp(argv[0], "@stats_set_aux")) 1179 r = message_stats_set_aux(md, argc, argv); 1180 else 1181 return 2; /* this wasn't a stats message */ 1182 1183 if (r == -EINVAL) 1184 DMWARN("Invalid parameters for message %s", argv[0]); 1185 1186 return r; 1187 } 1188 1189 int __init dm_statistics_init(void) 1190 { 1191 shared_memory_amount = 0; 1192 dm_stat_need_rcu_barrier = 0; 1193 return 0; 1194 } 1195 1196 void dm_statistics_exit(void) 1197 { 1198 if (dm_stat_need_rcu_barrier) 1199 rcu_barrier(); 1200 if (WARN_ON(shared_memory_amount)) 1201 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 1202 } 1203 1204 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); 1205 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); 1206