1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/errno.h> 3 #include <linux/numa.h> 4 #include <linux/slab.h> 5 #include <linux/rculist.h> 6 #include <linux/threads.h> 7 #include <linux/preempt.h> 8 #include <linux/irqflags.h> 9 #include <linux/vmalloc.h> 10 #include <linux/mm.h> 11 #include <linux/module.h> 12 #include <linux/device-mapper.h> 13 14 #include "dm-core.h" 15 #include "dm-stats.h" 16 17 #define DM_MSG_PREFIX "stats" 18 19 static int dm_stat_need_rcu_barrier; 20 21 /* 22 * Using 64-bit values to avoid overflow (which is a 23 * problem that block/genhd.c's IO accounting has). 24 */ 25 struct dm_stat_percpu { 26 unsigned long long sectors[2]; 27 unsigned long long ios[2]; 28 unsigned long long merges[2]; 29 unsigned long long ticks[2]; 30 unsigned long long io_ticks[2]; 31 unsigned long long io_ticks_total; 32 unsigned long long time_in_queue; 33 unsigned long long *histogram; 34 }; 35 36 struct dm_stat_shared { 37 atomic_t in_flight[2]; 38 unsigned long long stamp; 39 struct dm_stat_percpu tmp; 40 }; 41 42 struct dm_stat { 43 struct list_head list_entry; 44 int id; 45 unsigned int stat_flags; 46 size_t n_entries; 47 sector_t start; 48 sector_t end; 49 sector_t step; 50 unsigned int n_histogram_entries; 51 unsigned long long *histogram_boundaries; 52 const char *program_id; 53 const char *aux_data; 54 struct rcu_head rcu_head; 55 size_t shared_alloc_size; 56 size_t percpu_alloc_size; 57 size_t histogram_alloc_size; 58 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 59 struct dm_stat_shared stat_shared[] __counted_by(n_entries); 60 }; 61 62 #define STAT_PRECISE_TIMESTAMPS 1 63 64 struct dm_stats_last_position { 65 sector_t last_sector; 66 unsigned int last_rw; 67 }; 68 69 #define DM_STAT_MAX_ENTRIES 8388608 70 #define DM_STAT_MAX_HISTOGRAM_ENTRIES 134217728 71 72 /* 73 * A typo on the command line could possibly make the kernel run out of memory 74 * and crash. To prevent the crash we account all used memory. We fail if we 75 * exhaust 1/4 of all memory or 1/2 of vmalloc space. 76 */ 77 #define DM_STATS_MEMORY_FACTOR 4 78 #define DM_STATS_VMALLOC_FACTOR 2 79 80 static DEFINE_SPINLOCK(shared_memory_lock); 81 82 static unsigned long shared_memory_amount; 83 84 static bool __check_shared_memory(size_t alloc_size) 85 { 86 size_t a; 87 88 a = shared_memory_amount + alloc_size; 89 if (a < shared_memory_amount) 90 return false; 91 if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) 92 return false; 93 #ifdef CONFIG_MMU 94 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) 95 return false; 96 #endif 97 return true; 98 } 99 100 static bool check_shared_memory(size_t alloc_size) 101 { 102 bool ret; 103 104 spin_lock_irq(&shared_memory_lock); 105 106 ret = __check_shared_memory(alloc_size); 107 108 spin_unlock_irq(&shared_memory_lock); 109 110 return ret; 111 } 112 113 static bool claim_shared_memory(size_t alloc_size) 114 { 115 spin_lock_irq(&shared_memory_lock); 116 117 if (!__check_shared_memory(alloc_size)) { 118 spin_unlock_irq(&shared_memory_lock); 119 return false; 120 } 121 122 shared_memory_amount += alloc_size; 123 124 spin_unlock_irq(&shared_memory_lock); 125 126 return true; 127 } 128 129 static void free_shared_memory(size_t alloc_size) 130 { 131 unsigned long flags; 132 133 spin_lock_irqsave(&shared_memory_lock, flags); 134 135 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { 136 spin_unlock_irqrestore(&shared_memory_lock, flags); 137 DMCRIT("Memory usage accounting bug."); 138 return; 139 } 140 141 shared_memory_amount -= alloc_size; 142 143 spin_unlock_irqrestore(&shared_memory_lock, flags); 144 } 145 146 static void *dm_kvzalloc(size_t alloc_size, int node) 147 { 148 void *p; 149 150 if (!claim_shared_memory(alloc_size)) 151 return NULL; 152 153 p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); 154 if (p) 155 return p; 156 157 free_shared_memory(alloc_size); 158 159 return NULL; 160 } 161 162 static void dm_kvfree(void *ptr, size_t alloc_size) 163 { 164 if (!ptr) 165 return; 166 167 free_shared_memory(alloc_size); 168 169 kvfree(ptr); 170 } 171 172 static void dm_stat_free(struct rcu_head *head) 173 { 174 int cpu; 175 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 176 177 kfree(s->histogram_boundaries); 178 kfree(s->program_id); 179 kfree(s->aux_data); 180 for_each_possible_cpu(cpu) { 181 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 182 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 183 } 184 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 185 dm_kvfree(s, s->shared_alloc_size); 186 } 187 188 static int dm_stat_in_flight(struct dm_stat_shared *shared) 189 { 190 return atomic_read(&shared->in_flight[READ]) + 191 atomic_read(&shared->in_flight[WRITE]); 192 } 193 194 int dm_stats_init(struct dm_stats *stats) 195 { 196 int cpu; 197 struct dm_stats_last_position *last; 198 199 mutex_init(&stats->mutex); 200 INIT_LIST_HEAD(&stats->list); 201 stats->precise_timestamps = false; 202 stats->last = alloc_percpu(struct dm_stats_last_position); 203 if (!stats->last) 204 return -ENOMEM; 205 206 for_each_possible_cpu(cpu) { 207 last = per_cpu_ptr(stats->last, cpu); 208 last->last_sector = (sector_t)ULLONG_MAX; 209 last->last_rw = UINT_MAX; 210 } 211 212 return 0; 213 } 214 215 void dm_stats_cleanup(struct dm_stats *stats) 216 { 217 size_t ni; 218 struct dm_stat *s; 219 struct dm_stat_shared *shared; 220 221 while (!list_empty(&stats->list)) { 222 s = container_of(stats->list.next, struct dm_stat, list_entry); 223 list_del(&s->list_entry); 224 for (ni = 0; ni < s->n_entries; ni++) { 225 shared = &s->stat_shared[ni]; 226 if (WARN_ON(dm_stat_in_flight(shared))) { 227 DMCRIT("leaked in-flight counter at index %lu " 228 "(start %llu, end %llu, step %llu): reads %d, writes %d", 229 (unsigned long)ni, 230 (unsigned long long)s->start, 231 (unsigned long long)s->end, 232 (unsigned long long)s->step, 233 atomic_read(&shared->in_flight[READ]), 234 atomic_read(&shared->in_flight[WRITE])); 235 } 236 cond_resched(); 237 } 238 dm_stat_free(&s->rcu_head); 239 } 240 free_percpu(stats->last); 241 mutex_destroy(&stats->mutex); 242 } 243 244 static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats) 245 { 246 struct list_head *l; 247 struct dm_stat *tmp_s; 248 bool precise_timestamps = false; 249 250 list_for_each(l, &stats->list) { 251 tmp_s = container_of(l, struct dm_stat, list_entry); 252 if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) { 253 precise_timestamps = true; 254 break; 255 } 256 } 257 stats->precise_timestamps = precise_timestamps; 258 } 259 260 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 261 sector_t step, unsigned int stat_flags, 262 unsigned int n_histogram_entries, 263 unsigned long long *histogram_boundaries, 264 const char *program_id, const char *aux_data, 265 void (*suspend_callback)(struct mapped_device *), 266 void (*resume_callback)(struct mapped_device *), 267 struct mapped_device *md) 268 { 269 struct list_head *l; 270 struct dm_stat *s, *tmp_s; 271 sector_t n_entries; 272 size_t ni; 273 size_t shared_alloc_size; 274 size_t percpu_alloc_size; 275 size_t histogram_alloc_size; 276 struct dm_stat_percpu *p; 277 int cpu; 278 int ret_id; 279 int r; 280 281 if (end < start || !step) 282 return -EINVAL; 283 284 n_entries = end - start; 285 if (dm_sector_div64(n_entries, step)) 286 n_entries++; 287 288 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 289 return -EOVERFLOW; 290 291 if (n_entries > DM_STAT_MAX_ENTRIES) 292 return -EOVERFLOW; 293 294 shared_alloc_size = struct_size(s, stat_shared, n_entries); 295 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 296 return -EOVERFLOW; 297 298 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); 299 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 300 return -EOVERFLOW; 301 302 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 303 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 304 return -EOVERFLOW; 305 306 if ((n_histogram_entries + 1) * (size_t)n_entries > DM_STAT_MAX_HISTOGRAM_ENTRIES) 307 return -EOVERFLOW; 308 309 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 310 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 311 return -ENOMEM; 312 313 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); 314 if (!s) 315 return -ENOMEM; 316 317 s->stat_flags = stat_flags; 318 s->n_entries = n_entries; 319 s->start = start; 320 s->end = end; 321 s->step = step; 322 s->shared_alloc_size = shared_alloc_size; 323 s->percpu_alloc_size = percpu_alloc_size; 324 s->histogram_alloc_size = histogram_alloc_size; 325 326 s->n_histogram_entries = n_histogram_entries; 327 s->histogram_boundaries = kmemdup(histogram_boundaries, 328 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 329 if (!s->histogram_boundaries) { 330 r = -ENOMEM; 331 goto out; 332 } 333 334 s->program_id = kstrdup(program_id, GFP_KERNEL); 335 if (!s->program_id) { 336 r = -ENOMEM; 337 goto out; 338 } 339 s->aux_data = kstrdup(aux_data, GFP_KERNEL); 340 if (!s->aux_data) { 341 r = -ENOMEM; 342 goto out; 343 } 344 345 for (ni = 0; ni < n_entries; ni++) { 346 atomic_set(&s->stat_shared[ni].in_flight[READ], 0); 347 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 348 cond_resched(); 349 } 350 351 if (s->n_histogram_entries) { 352 unsigned long long *hi; 353 354 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 355 if (!hi) { 356 r = -ENOMEM; 357 goto out; 358 } 359 for (ni = 0; ni < n_entries; ni++) { 360 s->stat_shared[ni].tmp.histogram = hi; 361 hi += s->n_histogram_entries + 1; 362 cond_resched(); 363 } 364 } 365 366 for_each_possible_cpu(cpu) { 367 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 368 if (!p) { 369 r = -ENOMEM; 370 goto out; 371 } 372 s->stat_percpu[cpu] = p; 373 if (s->n_histogram_entries) { 374 unsigned long long *hi; 375 376 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 377 if (!hi) { 378 r = -ENOMEM; 379 goto out; 380 } 381 for (ni = 0; ni < n_entries; ni++) { 382 p[ni].histogram = hi; 383 hi += s->n_histogram_entries + 1; 384 cond_resched(); 385 } 386 } 387 } 388 389 /* 390 * Suspend/resume to make sure there is no i/o in flight, 391 * so that newly created statistics will be exact. 392 * 393 * (note: we couldn't suspend earlier because we must not 394 * allocate memory while suspended) 395 */ 396 suspend_callback(md); 397 398 mutex_lock(&stats->mutex); 399 s->id = 0; 400 list_for_each(l, &stats->list) { 401 tmp_s = container_of(l, struct dm_stat, list_entry); 402 if (WARN_ON(tmp_s->id < s->id)) { 403 r = -EINVAL; 404 goto out_unlock_resume; 405 } 406 if (tmp_s->id > s->id) 407 break; 408 if (unlikely(s->id == INT_MAX)) { 409 r = -ENFILE; 410 goto out_unlock_resume; 411 } 412 s->id++; 413 } 414 ret_id = s->id; 415 list_add_tail_rcu(&s->list_entry, l); 416 417 dm_stats_recalc_precise_timestamps(stats); 418 419 if (!static_key_enabled(&stats_enabled.key)) 420 static_branch_enable(&stats_enabled); 421 422 mutex_unlock(&stats->mutex); 423 424 resume_callback(md); 425 426 return ret_id; 427 428 out_unlock_resume: 429 mutex_unlock(&stats->mutex); 430 resume_callback(md); 431 out: 432 dm_stat_free(&s->rcu_head); 433 return r; 434 } 435 436 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) 437 { 438 struct dm_stat *s; 439 440 list_for_each_entry(s, &stats->list, list_entry) { 441 if (s->id > id) 442 break; 443 if (s->id == id) 444 return s; 445 } 446 447 return NULL; 448 } 449 450 static int dm_stats_delete(struct dm_stats *stats, int id) 451 { 452 struct dm_stat *s; 453 int cpu; 454 455 mutex_lock(&stats->mutex); 456 457 s = __dm_stats_find(stats, id); 458 if (!s) { 459 mutex_unlock(&stats->mutex); 460 return -ENOENT; 461 } 462 463 list_del_rcu(&s->list_entry); 464 465 dm_stats_recalc_precise_timestamps(stats); 466 467 mutex_unlock(&stats->mutex); 468 469 /* 470 * vfree can't be called from RCU callback 471 */ 472 for_each_possible_cpu(cpu) 473 if (is_vmalloc_addr(s->stat_percpu) || 474 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 475 goto do_sync_free; 476 if (is_vmalloc_addr(s) || 477 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 478 do_sync_free: 479 synchronize_rcu_expedited(); 480 dm_stat_free(&s->rcu_head); 481 } else { 482 WRITE_ONCE(dm_stat_need_rcu_barrier, 1); 483 call_rcu(&s->rcu_head, dm_stat_free); 484 } 485 return 0; 486 } 487 488 static int dm_stats_list(struct dm_stats *stats, const char *program, 489 char *result, unsigned int maxlen) 490 { 491 struct dm_stat *s; 492 sector_t len; 493 unsigned int sz = 0; 494 495 /* 496 * Output format: 497 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> 498 */ 499 500 mutex_lock(&stats->mutex); 501 list_for_each_entry(s, &stats->list, list_entry) { 502 if (!program || !strcmp(program, s->program_id)) { 503 len = s->end - s->start; 504 DMEMIT("%d: %llu+%llu %llu %s %s", s->id, 505 (unsigned long long)s->start, 506 (unsigned long long)len, 507 (unsigned long long)s->step, 508 s->program_id, 509 s->aux_data); 510 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 511 DMEMIT(" precise_timestamps"); 512 if (s->n_histogram_entries) { 513 unsigned int i; 514 515 DMEMIT(" histogram:"); 516 for (i = 0; i < s->n_histogram_entries; i++) { 517 if (i) 518 DMEMIT(","); 519 DMEMIT("%llu", s->histogram_boundaries[i]); 520 } 521 } 522 DMEMIT("\n"); 523 } 524 cond_resched(); 525 } 526 mutex_unlock(&stats->mutex); 527 528 return 1; 529 } 530 531 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 532 struct dm_stat_percpu *p) 533 { 534 /* 535 * This is racy, but so is part_round_stats_single. 536 */ 537 unsigned long long now, difference; 538 unsigned int in_flight_read, in_flight_write; 539 540 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 541 now = jiffies; 542 else 543 now = ktime_to_ns(ktime_get()); 544 545 difference = now - shared->stamp; 546 if (!difference) 547 return; 548 549 in_flight_read = (unsigned int)atomic_read(&shared->in_flight[READ]); 550 in_flight_write = (unsigned int)atomic_read(&shared->in_flight[WRITE]); 551 if (in_flight_read) 552 p->io_ticks[READ] += difference; 553 if (in_flight_write) 554 p->io_ticks[WRITE] += difference; 555 if (in_flight_read + in_flight_write) { 556 p->io_ticks_total += difference; 557 p->time_in_queue += (in_flight_read + in_flight_write) * difference; 558 } 559 shared->stamp = now; 560 } 561 562 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 563 int idx, sector_t len, 564 struct dm_stats_aux *stats_aux, bool end, 565 unsigned long duration_jiffies) 566 { 567 struct dm_stat_shared *shared = &s->stat_shared[entry]; 568 struct dm_stat_percpu *p; 569 570 /* 571 * For strict correctness we should use local_irq_save/restore 572 * instead of preempt_disable/enable. 573 * 574 * preempt_disable/enable is racy if the driver finishes bios 575 * from non-interrupt context as well as from interrupt context 576 * or from more different interrupts. 577 * 578 * On 64-bit architectures the race only results in not counting some 579 * events, so it is acceptable. On 32-bit architectures the race could 580 * cause the counter going off by 2^32, so we need to do proper locking 581 * there. 582 * 583 * part_stat_lock()/part_stat_unlock() have this race too. 584 */ 585 #if BITS_PER_LONG == 32 586 unsigned long flags; 587 588 local_irq_save(flags); 589 #else 590 preempt_disable(); 591 #endif 592 p = &s->stat_percpu[smp_processor_id()][entry]; 593 594 if (!end) { 595 dm_stat_round(s, shared, p); 596 atomic_inc(&shared->in_flight[idx]); 597 } else { 598 unsigned long long duration; 599 600 dm_stat_round(s, shared, p); 601 atomic_dec(&shared->in_flight[idx]); 602 p->sectors[idx] += len; 603 p->ios[idx] += 1; 604 p->merges[idx] += stats_aux->merged; 605 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 606 p->ticks[idx] += duration_jiffies; 607 duration = jiffies_to_msecs(duration_jiffies); 608 } else { 609 p->ticks[idx] += stats_aux->duration_ns; 610 duration = stats_aux->duration_ns; 611 } 612 if (s->n_histogram_entries) { 613 unsigned int lo = 0, hi = s->n_histogram_entries + 1; 614 615 while (lo + 1 < hi) { 616 unsigned int mid = (lo + hi) / 2; 617 618 if (s->histogram_boundaries[mid - 1] > duration) 619 hi = mid; 620 else 621 lo = mid; 622 } 623 p->histogram[lo]++; 624 } 625 } 626 627 #if BITS_PER_LONG == 32 628 local_irq_restore(flags); 629 #else 630 preempt_enable(); 631 #endif 632 } 633 634 static void __dm_stat_bio(struct dm_stat *s, int bi_rw, 635 sector_t bi_sector, sector_t end_sector, 636 bool end, unsigned long duration_jiffies, 637 struct dm_stats_aux *stats_aux) 638 { 639 sector_t rel_sector, offset, todo, fragment_len; 640 size_t entry; 641 642 if (end_sector <= s->start || bi_sector >= s->end) 643 return; 644 if (unlikely(bi_sector < s->start)) { 645 rel_sector = 0; 646 todo = end_sector - s->start; 647 } else { 648 rel_sector = bi_sector - s->start; 649 todo = end_sector - bi_sector; 650 } 651 if (unlikely(end_sector > s->end)) 652 todo -= (end_sector - s->end); 653 654 offset = dm_sector_div64(rel_sector, s->step); 655 entry = rel_sector; 656 do { 657 if (WARN_ON_ONCE(entry >= s->n_entries)) { 658 DMCRIT("Invalid area access in region id %d", s->id); 659 return; 660 } 661 fragment_len = todo; 662 if (fragment_len > s->step - offset) 663 fragment_len = s->step - offset; 664 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 665 stats_aux, end, duration_jiffies); 666 todo -= fragment_len; 667 entry++; 668 offset = 0; 669 } while (unlikely(todo != 0)); 670 } 671 672 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 673 sector_t bi_sector, unsigned int bi_sectors, bool end, 674 unsigned long start_time, 675 struct dm_stats_aux *stats_aux) 676 { 677 struct dm_stat *s; 678 sector_t end_sector; 679 struct dm_stats_last_position *last; 680 bool got_precise_time; 681 unsigned long duration_jiffies = 0; 682 683 if (unlikely(!bi_sectors)) 684 return; 685 686 end_sector = bi_sector + bi_sectors; 687 688 if (!end) { 689 /* 690 * A race condition can at worst result in the merged flag being 691 * misrepresented, so we don't have to disable preemption here. 692 */ 693 last = raw_cpu_ptr(stats->last); 694 stats_aux->merged = 695 (bi_sector == (READ_ONCE(last->last_sector) && 696 ((bi_rw == WRITE) == 697 (READ_ONCE(last->last_rw) == WRITE)) 698 )); 699 WRITE_ONCE(last->last_sector, end_sector); 700 WRITE_ONCE(last->last_rw, bi_rw); 701 } else 702 duration_jiffies = jiffies - start_time; 703 704 rcu_read_lock(); 705 706 got_precise_time = false; 707 list_for_each_entry_rcu(s, &stats->list, list_entry) { 708 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 709 /* start (!end) duration_ns is set by DM core's alloc_io() */ 710 if (end) 711 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 712 got_precise_time = true; 713 } 714 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 715 } 716 717 rcu_read_unlock(); 718 } 719 720 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 721 struct dm_stat *s, size_t x) 722 { 723 int cpu; 724 struct dm_stat_percpu *p; 725 726 local_irq_disable(); 727 p = &s->stat_percpu[smp_processor_id()][x]; 728 dm_stat_round(s, shared, p); 729 local_irq_enable(); 730 731 shared->tmp.sectors[READ] = 0; 732 shared->tmp.sectors[WRITE] = 0; 733 shared->tmp.ios[READ] = 0; 734 shared->tmp.ios[WRITE] = 0; 735 shared->tmp.merges[READ] = 0; 736 shared->tmp.merges[WRITE] = 0; 737 shared->tmp.ticks[READ] = 0; 738 shared->tmp.ticks[WRITE] = 0; 739 shared->tmp.io_ticks[READ] = 0; 740 shared->tmp.io_ticks[WRITE] = 0; 741 shared->tmp.io_ticks_total = 0; 742 shared->tmp.time_in_queue = 0; 743 744 if (s->n_histogram_entries) 745 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 746 747 for_each_possible_cpu(cpu) { 748 p = &s->stat_percpu[cpu][x]; 749 shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); 750 shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); 751 shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); 752 shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); 753 shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); 754 shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); 755 shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); 756 shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); 757 shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); 758 shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); 759 shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); 760 shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); 761 if (s->n_histogram_entries) { 762 unsigned int i; 763 764 for (i = 0; i < s->n_histogram_entries + 1; i++) 765 shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); 766 } 767 } 768 } 769 770 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 771 bool init_tmp_percpu_totals) 772 { 773 size_t x; 774 struct dm_stat_shared *shared; 775 struct dm_stat_percpu *p; 776 777 for (x = idx_start; x < idx_end; x++) { 778 shared = &s->stat_shared[x]; 779 if (init_tmp_percpu_totals) 780 __dm_stat_init_temporary_percpu_totals(shared, s, x); 781 local_irq_disable(); 782 p = &s->stat_percpu[smp_processor_id()][x]; 783 p->sectors[READ] -= shared->tmp.sectors[READ]; 784 p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 785 p->ios[READ] -= shared->tmp.ios[READ]; 786 p->ios[WRITE] -= shared->tmp.ios[WRITE]; 787 p->merges[READ] -= shared->tmp.merges[READ]; 788 p->merges[WRITE] -= shared->tmp.merges[WRITE]; 789 p->ticks[READ] -= shared->tmp.ticks[READ]; 790 p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 791 p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 792 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 793 p->io_ticks_total -= shared->tmp.io_ticks_total; 794 p->time_in_queue -= shared->tmp.time_in_queue; 795 local_irq_enable(); 796 if (s->n_histogram_entries) { 797 unsigned int i; 798 799 for (i = 0; i < s->n_histogram_entries + 1; i++) { 800 local_irq_disable(); 801 p = &s->stat_percpu[smp_processor_id()][x]; 802 p->histogram[i] -= shared->tmp.histogram[i]; 803 local_irq_enable(); 804 } 805 } 806 cond_resched(); 807 } 808 } 809 810 static int dm_stats_clear(struct dm_stats *stats, int id) 811 { 812 struct dm_stat *s; 813 814 mutex_lock(&stats->mutex); 815 816 s = __dm_stats_find(stats, id); 817 if (!s) { 818 mutex_unlock(&stats->mutex); 819 return -ENOENT; 820 } 821 822 __dm_stat_clear(s, 0, s->n_entries, true); 823 824 mutex_unlock(&stats->mutex); 825 826 return 1; 827 } 828 829 /* 830 * This is like jiffies_to_msec, but works for 64-bit values. 831 */ 832 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 833 { 834 unsigned long long result; 835 unsigned int mult; 836 837 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 838 return j; 839 840 result = 0; 841 if (j) 842 result = jiffies_to_msecs(j & 0x3fffff); 843 if (j >= 1 << 22) { 844 mult = jiffies_to_msecs(1 << 22); 845 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 846 } 847 if (j >= 1ULL << 44) 848 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 849 850 return result; 851 } 852 853 static int dm_stats_print(struct dm_stats *stats, int id, 854 size_t idx_start, size_t idx_len, 855 bool clear, char *result, unsigned int maxlen) 856 { 857 unsigned int sz = 0; 858 struct dm_stat *s; 859 size_t x; 860 sector_t start, end, step; 861 size_t idx_end; 862 struct dm_stat_shared *shared; 863 864 /* 865 * Output format: 866 * <start_sector>+<length> counters 867 */ 868 869 mutex_lock(&stats->mutex); 870 871 s = __dm_stats_find(stats, id); 872 if (!s) { 873 mutex_unlock(&stats->mutex); 874 return -ENOENT; 875 } 876 877 idx_end = idx_start + idx_len; 878 if (idx_end < idx_start || 879 idx_end > s->n_entries) 880 idx_end = s->n_entries; 881 882 if (idx_start > idx_end) 883 idx_start = idx_end; 884 885 step = s->step; 886 start = s->start + (step * idx_start); 887 888 for (x = idx_start; x < idx_end; x++, start = end) { 889 shared = &s->stat_shared[x]; 890 end = start + step; 891 if (unlikely(end > s->end)) 892 end = s->end; 893 894 __dm_stat_init_temporary_percpu_totals(shared, s, x); 895 896 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 897 (unsigned long long)start, 898 (unsigned long long)step, 899 shared->tmp.ios[READ], 900 shared->tmp.merges[READ], 901 shared->tmp.sectors[READ], 902 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 903 shared->tmp.ios[WRITE], 904 shared->tmp.merges[WRITE], 905 shared->tmp.sectors[WRITE], 906 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 907 dm_stat_in_flight(shared), 908 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 909 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 910 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 911 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 912 if (s->n_histogram_entries) { 913 unsigned int i; 914 915 for (i = 0; i < s->n_histogram_entries + 1; i++) 916 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]); 917 } 918 DMEMIT("\n"); 919 920 if (unlikely(sz + 1 >= maxlen)) 921 goto buffer_overflow; 922 923 cond_resched(); 924 } 925 926 if (clear) 927 __dm_stat_clear(s, idx_start, idx_end, false); 928 929 buffer_overflow: 930 mutex_unlock(&stats->mutex); 931 932 return 1; 933 } 934 935 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 936 { 937 struct dm_stat *s; 938 const char *new_aux_data; 939 940 mutex_lock(&stats->mutex); 941 942 s = __dm_stats_find(stats, id); 943 if (!s) { 944 mutex_unlock(&stats->mutex); 945 return -ENOENT; 946 } 947 948 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 949 if (!new_aux_data) { 950 mutex_unlock(&stats->mutex); 951 return -ENOMEM; 952 } 953 954 kfree(s->aux_data); 955 s->aux_data = new_aux_data; 956 957 mutex_unlock(&stats->mutex); 958 959 return 0; 960 } 961 962 static int parse_histogram(const char *h, unsigned int *n_histogram_entries, 963 unsigned long long **histogram_boundaries) 964 { 965 const char *q; 966 unsigned int n; 967 unsigned long long last; 968 969 *n_histogram_entries = 1; 970 for (q = h; *q; q++) 971 if (*q == ',') 972 (*n_histogram_entries)++; 973 974 *histogram_boundaries = kmalloc_array(*n_histogram_entries, 975 sizeof(unsigned long long), 976 GFP_KERNEL); 977 if (!*histogram_boundaries) 978 return -ENOMEM; 979 980 n = 0; 981 last = 0; 982 while (1) { 983 unsigned long long hi; 984 int s; 985 char ch; 986 987 s = sscanf(h, "%llu%c", &hi, &ch); 988 if (!s || (s == 2 && ch != ',')) 989 return -EINVAL; 990 if (hi <= last) 991 return -EINVAL; 992 last = hi; 993 (*histogram_boundaries)[n] = hi; 994 if (s == 1) 995 return 0; 996 h = strchr(h, ',') + 1; 997 n++; 998 } 999 } 1000 1001 static int message_stats_create(struct mapped_device *md, 1002 unsigned int argc, char **argv, 1003 char *result, unsigned int maxlen) 1004 { 1005 int r; 1006 int id; 1007 char dummy; 1008 unsigned long long start, end, len, step; 1009 unsigned int divisor; 1010 const char *program_id, *aux_data; 1011 unsigned int stat_flags = 0; 1012 unsigned int n_histogram_entries = 0; 1013 unsigned long long *histogram_boundaries = NULL; 1014 struct dm_arg_set as, as_backup; 1015 const char *a; 1016 unsigned int feature_args; 1017 1018 /* 1019 * Input format: 1020 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 1021 */ 1022 1023 if (argc < 3) 1024 goto ret_einval; 1025 1026 as.argc = argc; 1027 as.argv = argv; 1028 dm_consume_args(&as, 1); 1029 1030 a = dm_shift_arg(&as); 1031 if (!strcmp(a, "-")) { 1032 start = 0; 1033 len = dm_get_size(md); 1034 if (!len) 1035 len = 1; 1036 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 1037 start != (sector_t)start || len != (sector_t)len) 1038 goto ret_einval; 1039 1040 end = start + len; 1041 if (start >= end) 1042 goto ret_einval; 1043 1044 a = dm_shift_arg(&as); 1045 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 1046 if (!divisor) 1047 return -EINVAL; 1048 step = end - start; 1049 if (do_div(step, divisor)) 1050 step++; 1051 if (!step) 1052 step = 1; 1053 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 1054 step != (sector_t)step || !step) 1055 goto ret_einval; 1056 1057 as_backup = as; 1058 a = dm_shift_arg(&as); 1059 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 1060 while (feature_args--) { 1061 a = dm_shift_arg(&as); 1062 if (!a) 1063 goto ret_einval; 1064 if (!strcasecmp(a, "precise_timestamps")) 1065 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1066 else if (!strncasecmp(a, "histogram:", 10)) { 1067 if (n_histogram_entries) 1068 goto ret_einval; 1069 r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries); 1070 if (r) 1071 goto ret; 1072 } else 1073 goto ret_einval; 1074 } 1075 } else { 1076 as = as_backup; 1077 } 1078 1079 program_id = "-"; 1080 aux_data = "-"; 1081 1082 a = dm_shift_arg(&as); 1083 if (a) 1084 program_id = a; 1085 1086 a = dm_shift_arg(&as); 1087 if (a) 1088 aux_data = a; 1089 1090 if (as.argc) 1091 goto ret_einval; 1092 1093 /* 1094 * If a buffer overflow happens after we created the region, 1095 * it's too late (the userspace would retry with a larger 1096 * buffer, but the region id that caused the overflow is already 1097 * leaked). So we must detect buffer overflow in advance. 1098 */ 1099 snprintf(result, maxlen, "%d", INT_MAX); 1100 if (dm_message_test_buffer_overflow(result, maxlen)) { 1101 r = 1; 1102 goto ret; 1103 } 1104 1105 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 1106 n_histogram_entries, histogram_boundaries, program_id, aux_data, 1107 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1108 if (id < 0) { 1109 r = id; 1110 goto ret; 1111 } 1112 1113 snprintf(result, maxlen, "%d", id); 1114 1115 r = 1; 1116 goto ret; 1117 1118 ret_einval: 1119 r = -EINVAL; 1120 ret: 1121 kfree(histogram_boundaries); 1122 return r; 1123 } 1124 1125 static int message_stats_delete(struct mapped_device *md, 1126 unsigned int argc, char **argv) 1127 { 1128 int id; 1129 char dummy; 1130 1131 if (argc != 2) 1132 return -EINVAL; 1133 1134 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1135 return -EINVAL; 1136 1137 return dm_stats_delete(dm_get_stats(md), id); 1138 } 1139 1140 static int message_stats_clear(struct mapped_device *md, 1141 unsigned int argc, char **argv) 1142 { 1143 int id; 1144 char dummy; 1145 1146 if (argc != 2) 1147 return -EINVAL; 1148 1149 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1150 return -EINVAL; 1151 1152 return dm_stats_clear(dm_get_stats(md), id); 1153 } 1154 1155 static int message_stats_list(struct mapped_device *md, 1156 unsigned int argc, char **argv, 1157 char *result, unsigned int maxlen) 1158 { 1159 int r; 1160 const char *program = NULL; 1161 1162 if (argc < 1 || argc > 2) 1163 return -EINVAL; 1164 1165 if (argc > 1) { 1166 program = kstrdup(argv[1], GFP_KERNEL); 1167 if (!program) 1168 return -ENOMEM; 1169 } 1170 1171 r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 1172 1173 kfree(program); 1174 1175 return r; 1176 } 1177 1178 static int message_stats_print(struct mapped_device *md, 1179 unsigned int argc, char **argv, bool clear, 1180 char *result, unsigned int maxlen) 1181 { 1182 int id; 1183 char dummy; 1184 unsigned long idx_start = 0, idx_len = ULONG_MAX; 1185 1186 if (argc != 2 && argc != 4) 1187 return -EINVAL; 1188 1189 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1190 return -EINVAL; 1191 1192 if (argc > 3) { 1193 if (strcmp(argv[2], "-") && 1194 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 1195 return -EINVAL; 1196 if (strcmp(argv[3], "-") && 1197 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 1198 return -EINVAL; 1199 } 1200 1201 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 1202 result, maxlen); 1203 } 1204 1205 static int message_stats_set_aux(struct mapped_device *md, 1206 unsigned int argc, char **argv) 1207 { 1208 int id; 1209 char dummy; 1210 1211 if (argc != 3) 1212 return -EINVAL; 1213 1214 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 1215 return -EINVAL; 1216 1217 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 1218 } 1219 1220 int dm_stats_message(struct mapped_device *md, unsigned int argc, char **argv, 1221 char *result, unsigned int maxlen) 1222 { 1223 int r; 1224 1225 /* All messages here must start with '@' */ 1226 if (!strcasecmp(argv[0], "@stats_create")) 1227 r = message_stats_create(md, argc, argv, result, maxlen); 1228 else if (!strcasecmp(argv[0], "@stats_delete")) 1229 r = message_stats_delete(md, argc, argv); 1230 else if (!strcasecmp(argv[0], "@stats_clear")) 1231 r = message_stats_clear(md, argc, argv); 1232 else if (!strcasecmp(argv[0], "@stats_list")) 1233 r = message_stats_list(md, argc, argv, result, maxlen); 1234 else if (!strcasecmp(argv[0], "@stats_print")) 1235 r = message_stats_print(md, argc, argv, false, result, maxlen); 1236 else if (!strcasecmp(argv[0], "@stats_print_clear")) 1237 r = message_stats_print(md, argc, argv, true, result, maxlen); 1238 else if (!strcasecmp(argv[0], "@stats_set_aux")) 1239 r = message_stats_set_aux(md, argc, argv); 1240 else 1241 return 2; /* this wasn't a stats message */ 1242 1243 if (r == -EINVAL) 1244 DMCRIT("Invalid parameters for message %s", argv[0]); 1245 1246 return r; 1247 } 1248 1249 int __init dm_statistics_init(void) 1250 { 1251 shared_memory_amount = 0; 1252 dm_stat_need_rcu_barrier = 0; 1253 return 0; 1254 } 1255 1256 void dm_statistics_exit(void) 1257 { 1258 if (dm_stat_need_rcu_barrier) 1259 rcu_barrier(); 1260 if (WARN_ON(shared_memory_amount)) 1261 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 1262 } 1263 1264 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, 0444); 1265 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); 1266