1 #include <linux/errno.h> 2 #include <linux/numa.h> 3 #include <linux/slab.h> 4 #include <linux/rculist.h> 5 #include <linux/threads.h> 6 #include <linux/preempt.h> 7 #include <linux/irqflags.h> 8 #include <linux/vmalloc.h> 9 #include <linux/mm.h> 10 #include <linux/module.h> 11 #include <linux/device-mapper.h> 12 13 #include "dm-core.h" 14 #include "dm-stats.h" 15 16 #define DM_MSG_PREFIX "stats" 17 18 static int dm_stat_need_rcu_barrier; 19 20 /* 21 * Using 64-bit values to avoid overflow (which is a 22 * problem that block/genhd.c's IO accounting has). 23 */ 24 struct dm_stat_percpu { 25 unsigned long long sectors[2]; 26 unsigned long long ios[2]; 27 unsigned long long merges[2]; 28 unsigned long long ticks[2]; 29 unsigned long long io_ticks[2]; 30 unsigned long long io_ticks_total; 31 unsigned long long time_in_queue; 32 unsigned long long *histogram; 33 }; 34 35 struct dm_stat_shared { 36 atomic_t in_flight[2]; 37 unsigned long long stamp; 38 struct dm_stat_percpu tmp; 39 }; 40 41 struct dm_stat { 42 struct list_head list_entry; 43 int id; 44 unsigned stat_flags; 45 size_t n_entries; 46 sector_t start; 47 sector_t end; 48 sector_t step; 49 unsigned n_histogram_entries; 50 unsigned long long *histogram_boundaries; 51 const char *program_id; 52 const char *aux_data; 53 struct rcu_head rcu_head; 54 size_t shared_alloc_size; 55 size_t percpu_alloc_size; 56 size_t histogram_alloc_size; 57 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 58 struct dm_stat_shared stat_shared[0]; 59 }; 60 61 #define STAT_PRECISE_TIMESTAMPS 1 62 63 struct dm_stats_last_position { 64 sector_t last_sector; 65 unsigned last_rw; 66 }; 67 68 /* 69 * A typo on the command line could possibly make the kernel run out of memory 70 * and crash. To prevent the crash we account all used memory. We fail if we 71 * exhaust 1/4 of all memory or 1/2 of vmalloc space. 72 */ 73 #define DM_STATS_MEMORY_FACTOR 4 74 #define DM_STATS_VMALLOC_FACTOR 2 75 76 static DEFINE_SPINLOCK(shared_memory_lock); 77 78 static unsigned long shared_memory_amount; 79 80 static bool __check_shared_memory(size_t alloc_size) 81 { 82 size_t a; 83 84 a = shared_memory_amount + alloc_size; 85 if (a < shared_memory_amount) 86 return false; 87 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) 88 return false; 89 #ifdef CONFIG_MMU 90 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) 91 return false; 92 #endif 93 return true; 94 } 95 96 static bool check_shared_memory(size_t alloc_size) 97 { 98 bool ret; 99 100 spin_lock_irq(&shared_memory_lock); 101 102 ret = __check_shared_memory(alloc_size); 103 104 spin_unlock_irq(&shared_memory_lock); 105 106 return ret; 107 } 108 109 static bool claim_shared_memory(size_t alloc_size) 110 { 111 spin_lock_irq(&shared_memory_lock); 112 113 if (!__check_shared_memory(alloc_size)) { 114 spin_unlock_irq(&shared_memory_lock); 115 return false; 116 } 117 118 shared_memory_amount += alloc_size; 119 120 spin_unlock_irq(&shared_memory_lock); 121 122 return true; 123 } 124 125 static void free_shared_memory(size_t alloc_size) 126 { 127 unsigned long flags; 128 129 spin_lock_irqsave(&shared_memory_lock, flags); 130 131 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { 132 spin_unlock_irqrestore(&shared_memory_lock, flags); 133 DMCRIT("Memory usage accounting bug."); 134 return; 135 } 136 137 shared_memory_amount -= alloc_size; 138 139 spin_unlock_irqrestore(&shared_memory_lock, flags); 140 } 141 142 static void *dm_kvzalloc(size_t alloc_size, int node) 143 { 144 void *p; 145 146 if (!claim_shared_memory(alloc_size)) 147 return NULL; 148 149 if (alloc_size <= KMALLOC_MAX_SIZE) { 150 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); 151 if (p) 152 return p; 153 } 154 p = vzalloc_node(alloc_size, node); 155 if (p) 156 return p; 157 158 free_shared_memory(alloc_size); 159 160 return NULL; 161 } 162 163 static void dm_kvfree(void *ptr, size_t alloc_size) 164 { 165 if (!ptr) 166 return; 167 168 free_shared_memory(alloc_size); 169 170 kvfree(ptr); 171 } 172 173 static void dm_stat_free(struct rcu_head *head) 174 { 175 int cpu; 176 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 177 178 kfree(s->program_id); 179 kfree(s->aux_data); 180 for_each_possible_cpu(cpu) { 181 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 182 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 183 } 184 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 185 dm_kvfree(s, s->shared_alloc_size); 186 } 187 188 static int dm_stat_in_flight(struct dm_stat_shared *shared) 189 { 190 return atomic_read(&shared->in_flight[READ]) + 191 atomic_read(&shared->in_flight[WRITE]); 192 } 193 194 void dm_stats_init(struct dm_stats *stats) 195 { 196 int cpu; 197 struct dm_stats_last_position *last; 198 199 mutex_init(&stats->mutex); 200 INIT_LIST_HEAD(&stats->list); 201 stats->last = alloc_percpu(struct dm_stats_last_position); 202 for_each_possible_cpu(cpu) { 203 last = per_cpu_ptr(stats->last, cpu); 204 last->last_sector = (sector_t)ULLONG_MAX; 205 last->last_rw = UINT_MAX; 206 } 207 } 208 209 void dm_stats_cleanup(struct dm_stats *stats) 210 { 211 size_t ni; 212 struct dm_stat *s; 213 struct dm_stat_shared *shared; 214 215 while (!list_empty(&stats->list)) { 216 s = container_of(stats->list.next, struct dm_stat, list_entry); 217 list_del(&s->list_entry); 218 for (ni = 0; ni < s->n_entries; ni++) { 219 shared = &s->stat_shared[ni]; 220 if (WARN_ON(dm_stat_in_flight(shared))) { 221 DMCRIT("leaked in-flight counter at index %lu " 222 "(start %llu, end %llu, step %llu): reads %d, writes %d", 223 (unsigned long)ni, 224 (unsigned long long)s->start, 225 (unsigned long long)s->end, 226 (unsigned long long)s->step, 227 atomic_read(&shared->in_flight[READ]), 228 atomic_read(&shared->in_flight[WRITE])); 229 } 230 } 231 dm_stat_free(&s->rcu_head); 232 } 233 free_percpu(stats->last); 234 } 235 236 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 237 sector_t step, unsigned stat_flags, 238 unsigned n_histogram_entries, 239 unsigned long long *histogram_boundaries, 240 const char *program_id, const char *aux_data, 241 void (*suspend_callback)(struct mapped_device *), 242 void (*resume_callback)(struct mapped_device *), 243 struct mapped_device *md) 244 { 245 struct list_head *l; 246 struct dm_stat *s, *tmp_s; 247 sector_t n_entries; 248 size_t ni; 249 size_t shared_alloc_size; 250 size_t percpu_alloc_size; 251 size_t histogram_alloc_size; 252 struct dm_stat_percpu *p; 253 int cpu; 254 int ret_id; 255 int r; 256 257 if (end < start || !step) 258 return -EINVAL; 259 260 n_entries = end - start; 261 if (dm_sector_div64(n_entries, step)) 262 n_entries++; 263 264 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 265 return -EOVERFLOW; 266 267 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); 268 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 269 return -EOVERFLOW; 270 271 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); 272 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 273 return -EOVERFLOW; 274 275 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 276 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 277 return -EOVERFLOW; 278 279 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 280 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 281 return -ENOMEM; 282 283 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); 284 if (!s) 285 return -ENOMEM; 286 287 s->stat_flags = stat_flags; 288 s->n_entries = n_entries; 289 s->start = start; 290 s->end = end; 291 s->step = step; 292 s->shared_alloc_size = shared_alloc_size; 293 s->percpu_alloc_size = percpu_alloc_size; 294 s->histogram_alloc_size = histogram_alloc_size; 295 296 s->n_histogram_entries = n_histogram_entries; 297 s->histogram_boundaries = kmemdup(histogram_boundaries, 298 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 299 if (!s->histogram_boundaries) { 300 r = -ENOMEM; 301 goto out; 302 } 303 304 s->program_id = kstrdup(program_id, GFP_KERNEL); 305 if (!s->program_id) { 306 r = -ENOMEM; 307 goto out; 308 } 309 s->aux_data = kstrdup(aux_data, GFP_KERNEL); 310 if (!s->aux_data) { 311 r = -ENOMEM; 312 goto out; 313 } 314 315 for (ni = 0; ni < n_entries; ni++) { 316 atomic_set(&s->stat_shared[ni].in_flight[READ], 0); 317 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 318 } 319 320 if (s->n_histogram_entries) { 321 unsigned long long *hi; 322 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 323 if (!hi) { 324 r = -ENOMEM; 325 goto out; 326 } 327 for (ni = 0; ni < n_entries; ni++) { 328 s->stat_shared[ni].tmp.histogram = hi; 329 hi += s->n_histogram_entries + 1; 330 } 331 } 332 333 for_each_possible_cpu(cpu) { 334 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 335 if (!p) { 336 r = -ENOMEM; 337 goto out; 338 } 339 s->stat_percpu[cpu] = p; 340 if (s->n_histogram_entries) { 341 unsigned long long *hi; 342 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 343 if (!hi) { 344 r = -ENOMEM; 345 goto out; 346 } 347 for (ni = 0; ni < n_entries; ni++) { 348 p[ni].histogram = hi; 349 hi += s->n_histogram_entries + 1; 350 } 351 } 352 } 353 354 /* 355 * Suspend/resume to make sure there is no i/o in flight, 356 * so that newly created statistics will be exact. 357 * 358 * (note: we couldn't suspend earlier because we must not 359 * allocate memory while suspended) 360 */ 361 suspend_callback(md); 362 363 mutex_lock(&stats->mutex); 364 s->id = 0; 365 list_for_each(l, &stats->list) { 366 tmp_s = container_of(l, struct dm_stat, list_entry); 367 if (WARN_ON(tmp_s->id < s->id)) { 368 r = -EINVAL; 369 goto out_unlock_resume; 370 } 371 if (tmp_s->id > s->id) 372 break; 373 if (unlikely(s->id == INT_MAX)) { 374 r = -ENFILE; 375 goto out_unlock_resume; 376 } 377 s->id++; 378 } 379 ret_id = s->id; 380 list_add_tail_rcu(&s->list_entry, l); 381 mutex_unlock(&stats->mutex); 382 383 resume_callback(md); 384 385 return ret_id; 386 387 out_unlock_resume: 388 mutex_unlock(&stats->mutex); 389 resume_callback(md); 390 out: 391 dm_stat_free(&s->rcu_head); 392 return r; 393 } 394 395 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) 396 { 397 struct dm_stat *s; 398 399 list_for_each_entry(s, &stats->list, list_entry) { 400 if (s->id > id) 401 break; 402 if (s->id == id) 403 return s; 404 } 405 406 return NULL; 407 } 408 409 static int dm_stats_delete(struct dm_stats *stats, int id) 410 { 411 struct dm_stat *s; 412 int cpu; 413 414 mutex_lock(&stats->mutex); 415 416 s = __dm_stats_find(stats, id); 417 if (!s) { 418 mutex_unlock(&stats->mutex); 419 return -ENOENT; 420 } 421 422 list_del_rcu(&s->list_entry); 423 mutex_unlock(&stats->mutex); 424 425 /* 426 * vfree can't be called from RCU callback 427 */ 428 for_each_possible_cpu(cpu) 429 if (is_vmalloc_addr(s->stat_percpu) || 430 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 431 goto do_sync_free; 432 if (is_vmalloc_addr(s) || 433 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 434 do_sync_free: 435 synchronize_rcu_expedited(); 436 dm_stat_free(&s->rcu_head); 437 } else { 438 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; 439 call_rcu(&s->rcu_head, dm_stat_free); 440 } 441 return 0; 442 } 443 444 static int dm_stats_list(struct dm_stats *stats, const char *program, 445 char *result, unsigned maxlen) 446 { 447 struct dm_stat *s; 448 sector_t len; 449 unsigned sz = 0; 450 451 /* 452 * Output format: 453 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> 454 */ 455 456 mutex_lock(&stats->mutex); 457 list_for_each_entry(s, &stats->list, list_entry) { 458 if (!program || !strcmp(program, s->program_id)) { 459 len = s->end - s->start; 460 DMEMIT("%d: %llu+%llu %llu %s %s", s->id, 461 (unsigned long long)s->start, 462 (unsigned long long)len, 463 (unsigned long long)s->step, 464 s->program_id, 465 s->aux_data); 466 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 467 DMEMIT(" precise_timestamps"); 468 if (s->n_histogram_entries) { 469 unsigned i; 470 DMEMIT(" histogram:"); 471 for (i = 0; i < s->n_histogram_entries; i++) { 472 if (i) 473 DMEMIT(","); 474 DMEMIT("%llu", s->histogram_boundaries[i]); 475 } 476 } 477 DMEMIT("\n"); 478 } 479 } 480 mutex_unlock(&stats->mutex); 481 482 return 1; 483 } 484 485 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 486 struct dm_stat_percpu *p) 487 { 488 /* 489 * This is racy, but so is part_round_stats_single. 490 */ 491 unsigned long long now, difference; 492 unsigned in_flight_read, in_flight_write; 493 494 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 495 now = jiffies; 496 else 497 now = ktime_to_ns(ktime_get()); 498 499 difference = now - shared->stamp; 500 if (!difference) 501 return; 502 503 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); 504 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); 505 if (in_flight_read) 506 p->io_ticks[READ] += difference; 507 if (in_flight_write) 508 p->io_ticks[WRITE] += difference; 509 if (in_flight_read + in_flight_write) { 510 p->io_ticks_total += difference; 511 p->time_in_queue += (in_flight_read + in_flight_write) * difference; 512 } 513 shared->stamp = now; 514 } 515 516 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 517 int idx, sector_t len, 518 struct dm_stats_aux *stats_aux, bool end, 519 unsigned long duration_jiffies) 520 { 521 struct dm_stat_shared *shared = &s->stat_shared[entry]; 522 struct dm_stat_percpu *p; 523 524 /* 525 * For strict correctness we should use local_irq_save/restore 526 * instead of preempt_disable/enable. 527 * 528 * preempt_disable/enable is racy if the driver finishes bios 529 * from non-interrupt context as well as from interrupt context 530 * or from more different interrupts. 531 * 532 * On 64-bit architectures the race only results in not counting some 533 * events, so it is acceptable. On 32-bit architectures the race could 534 * cause the counter going off by 2^32, so we need to do proper locking 535 * there. 536 * 537 * part_stat_lock()/part_stat_unlock() have this race too. 538 */ 539 #if BITS_PER_LONG == 32 540 unsigned long flags; 541 local_irq_save(flags); 542 #else 543 preempt_disable(); 544 #endif 545 p = &s->stat_percpu[smp_processor_id()][entry]; 546 547 if (!end) { 548 dm_stat_round(s, shared, p); 549 atomic_inc(&shared->in_flight[idx]); 550 } else { 551 unsigned long long duration; 552 dm_stat_round(s, shared, p); 553 atomic_dec(&shared->in_flight[idx]); 554 p->sectors[idx] += len; 555 p->ios[idx] += 1; 556 p->merges[idx] += stats_aux->merged; 557 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 558 p->ticks[idx] += duration_jiffies; 559 duration = jiffies_to_msecs(duration_jiffies); 560 } else { 561 p->ticks[idx] += stats_aux->duration_ns; 562 duration = stats_aux->duration_ns; 563 } 564 if (s->n_histogram_entries) { 565 unsigned lo = 0, hi = s->n_histogram_entries + 1; 566 while (lo + 1 < hi) { 567 unsigned mid = (lo + hi) / 2; 568 if (s->histogram_boundaries[mid - 1] > duration) { 569 hi = mid; 570 } else { 571 lo = mid; 572 } 573 574 } 575 p->histogram[lo]++; 576 } 577 } 578 579 #if BITS_PER_LONG == 32 580 local_irq_restore(flags); 581 #else 582 preempt_enable(); 583 #endif 584 } 585 586 static void __dm_stat_bio(struct dm_stat *s, int bi_rw, 587 sector_t bi_sector, sector_t end_sector, 588 bool end, unsigned long duration_jiffies, 589 struct dm_stats_aux *stats_aux) 590 { 591 sector_t rel_sector, offset, todo, fragment_len; 592 size_t entry; 593 594 if (end_sector <= s->start || bi_sector >= s->end) 595 return; 596 if (unlikely(bi_sector < s->start)) { 597 rel_sector = 0; 598 todo = end_sector - s->start; 599 } else { 600 rel_sector = bi_sector - s->start; 601 todo = end_sector - bi_sector; 602 } 603 if (unlikely(end_sector > s->end)) 604 todo -= (end_sector - s->end); 605 606 offset = dm_sector_div64(rel_sector, s->step); 607 entry = rel_sector; 608 do { 609 if (WARN_ON_ONCE(entry >= s->n_entries)) { 610 DMCRIT("Invalid area access in region id %d", s->id); 611 return; 612 } 613 fragment_len = todo; 614 if (fragment_len > s->step - offset) 615 fragment_len = s->step - offset; 616 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 617 stats_aux, end, duration_jiffies); 618 todo -= fragment_len; 619 entry++; 620 offset = 0; 621 } while (unlikely(todo != 0)); 622 } 623 624 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 625 sector_t bi_sector, unsigned bi_sectors, bool end, 626 unsigned long duration_jiffies, 627 struct dm_stats_aux *stats_aux) 628 { 629 struct dm_stat *s; 630 sector_t end_sector; 631 struct dm_stats_last_position *last; 632 bool got_precise_time; 633 634 if (unlikely(!bi_sectors)) 635 return; 636 637 end_sector = bi_sector + bi_sectors; 638 639 if (!end) { 640 /* 641 * A race condition can at worst result in the merged flag being 642 * misrepresented, so we don't have to disable preemption here. 643 */ 644 last = raw_cpu_ptr(stats->last); 645 stats_aux->merged = 646 (bi_sector == (ACCESS_ONCE(last->last_sector) && 647 ((bi_rw == WRITE) == 648 (ACCESS_ONCE(last->last_rw) == WRITE)) 649 )); 650 ACCESS_ONCE(last->last_sector) = end_sector; 651 ACCESS_ONCE(last->last_rw) = bi_rw; 652 } 653 654 rcu_read_lock(); 655 656 got_precise_time = false; 657 list_for_each_entry_rcu(s, &stats->list, list_entry) { 658 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 659 if (!end) 660 stats_aux->duration_ns = ktime_to_ns(ktime_get()); 661 else 662 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 663 got_precise_time = true; 664 } 665 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 666 } 667 668 rcu_read_unlock(); 669 } 670 671 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 672 struct dm_stat *s, size_t x) 673 { 674 int cpu; 675 struct dm_stat_percpu *p; 676 677 local_irq_disable(); 678 p = &s->stat_percpu[smp_processor_id()][x]; 679 dm_stat_round(s, shared, p); 680 local_irq_enable(); 681 682 shared->tmp.sectors[READ] = 0; 683 shared->tmp.sectors[WRITE] = 0; 684 shared->tmp.ios[READ] = 0; 685 shared->tmp.ios[WRITE] = 0; 686 shared->tmp.merges[READ] = 0; 687 shared->tmp.merges[WRITE] = 0; 688 shared->tmp.ticks[READ] = 0; 689 shared->tmp.ticks[WRITE] = 0; 690 shared->tmp.io_ticks[READ] = 0; 691 shared->tmp.io_ticks[WRITE] = 0; 692 shared->tmp.io_ticks_total = 0; 693 shared->tmp.time_in_queue = 0; 694 695 if (s->n_histogram_entries) 696 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 697 698 for_each_possible_cpu(cpu) { 699 p = &s->stat_percpu[cpu][x]; 700 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); 701 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); 702 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); 703 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); 704 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); 705 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); 706 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); 707 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); 708 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); 709 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 710 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 711 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); 712 if (s->n_histogram_entries) { 713 unsigned i; 714 for (i = 0; i < s->n_histogram_entries + 1; i++) 715 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); 716 } 717 } 718 } 719 720 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 721 bool init_tmp_percpu_totals) 722 { 723 size_t x; 724 struct dm_stat_shared *shared; 725 struct dm_stat_percpu *p; 726 727 for (x = idx_start; x < idx_end; x++) { 728 shared = &s->stat_shared[x]; 729 if (init_tmp_percpu_totals) 730 __dm_stat_init_temporary_percpu_totals(shared, s, x); 731 local_irq_disable(); 732 p = &s->stat_percpu[smp_processor_id()][x]; 733 p->sectors[READ] -= shared->tmp.sectors[READ]; 734 p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 735 p->ios[READ] -= shared->tmp.ios[READ]; 736 p->ios[WRITE] -= shared->tmp.ios[WRITE]; 737 p->merges[READ] -= shared->tmp.merges[READ]; 738 p->merges[WRITE] -= shared->tmp.merges[WRITE]; 739 p->ticks[READ] -= shared->tmp.ticks[READ]; 740 p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 741 p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 742 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 743 p->io_ticks_total -= shared->tmp.io_ticks_total; 744 p->time_in_queue -= shared->tmp.time_in_queue; 745 local_irq_enable(); 746 if (s->n_histogram_entries) { 747 unsigned i; 748 for (i = 0; i < s->n_histogram_entries + 1; i++) { 749 local_irq_disable(); 750 p = &s->stat_percpu[smp_processor_id()][x]; 751 p->histogram[i] -= shared->tmp.histogram[i]; 752 local_irq_enable(); 753 } 754 } 755 } 756 } 757 758 static int dm_stats_clear(struct dm_stats *stats, int id) 759 { 760 struct dm_stat *s; 761 762 mutex_lock(&stats->mutex); 763 764 s = __dm_stats_find(stats, id); 765 if (!s) { 766 mutex_unlock(&stats->mutex); 767 return -ENOENT; 768 } 769 770 __dm_stat_clear(s, 0, s->n_entries, true); 771 772 mutex_unlock(&stats->mutex); 773 774 return 1; 775 } 776 777 /* 778 * This is like jiffies_to_msec, but works for 64-bit values. 779 */ 780 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 781 { 782 unsigned long long result; 783 unsigned mult; 784 785 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 786 return j; 787 788 result = 0; 789 if (j) 790 result = jiffies_to_msecs(j & 0x3fffff); 791 if (j >= 1 << 22) { 792 mult = jiffies_to_msecs(1 << 22); 793 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 794 } 795 if (j >= 1ULL << 44) 796 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 797 798 return result; 799 } 800 801 static int dm_stats_print(struct dm_stats *stats, int id, 802 size_t idx_start, size_t idx_len, 803 bool clear, char *result, unsigned maxlen) 804 { 805 unsigned sz = 0; 806 struct dm_stat *s; 807 size_t x; 808 sector_t start, end, step; 809 size_t idx_end; 810 struct dm_stat_shared *shared; 811 812 /* 813 * Output format: 814 * <start_sector>+<length> counters 815 */ 816 817 mutex_lock(&stats->mutex); 818 819 s = __dm_stats_find(stats, id); 820 if (!s) { 821 mutex_unlock(&stats->mutex); 822 return -ENOENT; 823 } 824 825 idx_end = idx_start + idx_len; 826 if (idx_end < idx_start || 827 idx_end > s->n_entries) 828 idx_end = s->n_entries; 829 830 if (idx_start > idx_end) 831 idx_start = idx_end; 832 833 step = s->step; 834 start = s->start + (step * idx_start); 835 836 for (x = idx_start; x < idx_end; x++, start = end) { 837 shared = &s->stat_shared[x]; 838 end = start + step; 839 if (unlikely(end > s->end)) 840 end = s->end; 841 842 __dm_stat_init_temporary_percpu_totals(shared, s, x); 843 844 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 845 (unsigned long long)start, 846 (unsigned long long)step, 847 shared->tmp.ios[READ], 848 shared->tmp.merges[READ], 849 shared->tmp.sectors[READ], 850 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 851 shared->tmp.ios[WRITE], 852 shared->tmp.merges[WRITE], 853 shared->tmp.sectors[WRITE], 854 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 855 dm_stat_in_flight(shared), 856 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 857 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 858 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 859 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 860 if (s->n_histogram_entries) { 861 unsigned i; 862 for (i = 0; i < s->n_histogram_entries + 1; i++) { 863 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); 864 } 865 } 866 DMEMIT("\n"); 867 868 if (unlikely(sz + 1 >= maxlen)) 869 goto buffer_overflow; 870 } 871 872 if (clear) 873 __dm_stat_clear(s, idx_start, idx_end, false); 874 875 buffer_overflow: 876 mutex_unlock(&stats->mutex); 877 878 return 1; 879 } 880 881 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 882 { 883 struct dm_stat *s; 884 const char *new_aux_data; 885 886 mutex_lock(&stats->mutex); 887 888 s = __dm_stats_find(stats, id); 889 if (!s) { 890 mutex_unlock(&stats->mutex); 891 return -ENOENT; 892 } 893 894 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 895 if (!new_aux_data) { 896 mutex_unlock(&stats->mutex); 897 return -ENOMEM; 898 } 899 900 kfree(s->aux_data); 901 s->aux_data = new_aux_data; 902 903 mutex_unlock(&stats->mutex); 904 905 return 0; 906 } 907 908 static int parse_histogram(const char *h, unsigned *n_histogram_entries, 909 unsigned long long **histogram_boundaries) 910 { 911 const char *q; 912 unsigned n; 913 unsigned long long last; 914 915 *n_histogram_entries = 1; 916 for (q = h; *q; q++) 917 if (*q == ',') 918 (*n_histogram_entries)++; 919 920 *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 921 if (!*histogram_boundaries) 922 return -ENOMEM; 923 924 n = 0; 925 last = 0; 926 while (1) { 927 unsigned long long hi; 928 int s; 929 char ch; 930 s = sscanf(h, "%llu%c", &hi, &ch); 931 if (!s || (s == 2 && ch != ',')) 932 return -EINVAL; 933 if (hi <= last) 934 return -EINVAL; 935 last = hi; 936 (*histogram_boundaries)[n] = hi; 937 if (s == 1) 938 return 0; 939 h = strchr(h, ',') + 1; 940 n++; 941 } 942 } 943 944 static int message_stats_create(struct mapped_device *md, 945 unsigned argc, char **argv, 946 char *result, unsigned maxlen) 947 { 948 int r; 949 int id; 950 char dummy; 951 unsigned long long start, end, len, step; 952 unsigned divisor; 953 const char *program_id, *aux_data; 954 unsigned stat_flags = 0; 955 956 unsigned n_histogram_entries = 0; 957 unsigned long long *histogram_boundaries = NULL; 958 959 struct dm_arg_set as, as_backup; 960 const char *a; 961 unsigned feature_args; 962 963 /* 964 * Input format: 965 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 966 */ 967 968 if (argc < 3) 969 goto ret_einval; 970 971 as.argc = argc; 972 as.argv = argv; 973 dm_consume_args(&as, 1); 974 975 a = dm_shift_arg(&as); 976 if (!strcmp(a, "-")) { 977 start = 0; 978 len = dm_get_size(md); 979 if (!len) 980 len = 1; 981 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 982 start != (sector_t)start || len != (sector_t)len) 983 goto ret_einval; 984 985 end = start + len; 986 if (start >= end) 987 goto ret_einval; 988 989 a = dm_shift_arg(&as); 990 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 991 if (!divisor) 992 return -EINVAL; 993 step = end - start; 994 if (do_div(step, divisor)) 995 step++; 996 if (!step) 997 step = 1; 998 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 999 step != (sector_t)step || !step) 1000 goto ret_einval; 1001 1002 as_backup = as; 1003 a = dm_shift_arg(&as); 1004 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 1005 while (feature_args--) { 1006 a = dm_shift_arg(&as); 1007 if (!a) 1008 goto ret_einval; 1009 if (!strcasecmp(a, "precise_timestamps")) 1010 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1011 else if (!strncasecmp(a, "histogram:", 10)) { 1012 if (n_histogram_entries) 1013 goto ret_einval; 1014 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) 1015 goto ret; 1016 } else 1017 goto ret_einval; 1018 } 1019 } else { 1020 as = as_backup; 1021 } 1022 1023 program_id = "-"; 1024 aux_data = "-"; 1025 1026 a = dm_shift_arg(&as); 1027 if (a) 1028 program_id = a; 1029 1030 a = dm_shift_arg(&as); 1031 if (a) 1032 aux_data = a; 1033 1034 if (as.argc) 1035 goto ret_einval; 1036 1037 /* 1038 * If a buffer overflow happens after we created the region, 1039 * it's too late (the userspace would retry with a larger 1040 * buffer, but the region id that caused the overflow is already 1041 * leaked). So we must detect buffer overflow in advance. 1042 */ 1043 snprintf(result, maxlen, "%d", INT_MAX); 1044 if (dm_message_test_buffer_overflow(result, maxlen)) { 1045 r = 1; 1046 goto ret; 1047 } 1048 1049 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 1050 n_histogram_entries, histogram_boundaries, program_id, aux_data, 1051 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1052 if (id < 0) { 1053 r = id; 1054 goto ret; 1055 } 1056 1057 snprintf(result, maxlen, "%d", id); 1058 1059 r = 1; 1060 goto ret; 1061 1062 ret_einval: 1063 r = -EINVAL; 1064 ret: 1065 kfree(histogram_boundaries); 1066 return r; 1067 } 1068 1069 static int message_stats_delete(struct mapped_device *md, 1070 unsigned argc, char **argv) 1071 { 1072 int id; 1073 char dummy; 1074 1075 if (argc != 2) 1076 return -EINVAL; 1077 1078 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1079 return -EINVAL; 1080 1081 return dm_stats_delete(dm_get_stats(md), id); 1082 } 1083 1084 static int message_stats_clear(struct mapped_device *md, 1085 unsigned argc, char **argv) 1086 { 1087 int id; 1088 char dummy; 1089 1090 if (argc != 2) 1091 return -EINVAL; 1092 1093 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1094 return -EINVAL; 1095 1096 return dm_stats_clear(dm_get_stats(md), id); 1097 } 1098 1099 static int message_stats_list(struct mapped_device *md, 1100 unsigned argc, char **argv, 1101 char *result, unsigned maxlen) 1102 { 1103 int r; 1104 const char *program = NULL; 1105 1106 if (argc < 1 || argc > 2) 1107 return -EINVAL; 1108 1109 if (argc > 1) { 1110 program = kstrdup(argv[1], GFP_KERNEL); 1111 if (!program) 1112 return -ENOMEM; 1113 } 1114 1115 r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 1116 1117 kfree(program); 1118 1119 return r; 1120 } 1121 1122 static int message_stats_print(struct mapped_device *md, 1123 unsigned argc, char **argv, bool clear, 1124 char *result, unsigned maxlen) 1125 { 1126 int id; 1127 char dummy; 1128 unsigned long idx_start = 0, idx_len = ULONG_MAX; 1129 1130 if (argc != 2 && argc != 4) 1131 return -EINVAL; 1132 1133 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1134 return -EINVAL; 1135 1136 if (argc > 3) { 1137 if (strcmp(argv[2], "-") && 1138 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 1139 return -EINVAL; 1140 if (strcmp(argv[3], "-") && 1141 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 1142 return -EINVAL; 1143 } 1144 1145 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 1146 result, maxlen); 1147 } 1148 1149 static int message_stats_set_aux(struct mapped_device *md, 1150 unsigned argc, char **argv) 1151 { 1152 int id; 1153 char dummy; 1154 1155 if (argc != 3) 1156 return -EINVAL; 1157 1158 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1159 return -EINVAL; 1160 1161 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 1162 } 1163 1164 int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 1165 char *result, unsigned maxlen) 1166 { 1167 int r; 1168 1169 /* All messages here must start with '@' */ 1170 if (!strcasecmp(argv[0], "@stats_create")) 1171 r = message_stats_create(md, argc, argv, result, maxlen); 1172 else if (!strcasecmp(argv[0], "@stats_delete")) 1173 r = message_stats_delete(md, argc, argv); 1174 else if (!strcasecmp(argv[0], "@stats_clear")) 1175 r = message_stats_clear(md, argc, argv); 1176 else if (!strcasecmp(argv[0], "@stats_list")) 1177 r = message_stats_list(md, argc, argv, result, maxlen); 1178 else if (!strcasecmp(argv[0], "@stats_print")) 1179 r = message_stats_print(md, argc, argv, false, result, maxlen); 1180 else if (!strcasecmp(argv[0], "@stats_print_clear")) 1181 r = message_stats_print(md, argc, argv, true, result, maxlen); 1182 else if (!strcasecmp(argv[0], "@stats_set_aux")) 1183 r = message_stats_set_aux(md, argc, argv); 1184 else 1185 return 2; /* this wasn't a stats message */ 1186 1187 if (r == -EINVAL) 1188 DMWARN("Invalid parameters for message %s", argv[0]); 1189 1190 return r; 1191 } 1192 1193 int __init dm_statistics_init(void) 1194 { 1195 shared_memory_amount = 0; 1196 dm_stat_need_rcu_barrier = 0; 1197 return 0; 1198 } 1199 1200 void dm_statistics_exit(void) 1201 { 1202 if (dm_stat_need_rcu_barrier) 1203 rcu_barrier(); 1204 if (WARN_ON(shared_memory_amount)) 1205 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 1206 } 1207 1208 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); 1209 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); 1210