// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further
 * copyright notes.
 */

#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
#include <linux/zalloc.h>
#ifdef HAVE_LIBNUMA_SUPPORT
#include <numaif.h>
#endif
#include "debug.h"
#include "event.h"
#include "mmap.h"
#include "../perf.h"
#include "util.h" /* page_size */

/*
 * Total length of the mmap'ed region: the ring buffer proper (mask + 1
 * bytes, a power of two) plus one leading page holding the
 * perf_event_mmap_page control structure.
 */
size_t perf_mmap__mmap_len(struct perf_mmap *map)
{
	return map->mask + 1 + page_size;
}

/*
 * Fetch the event at *startp from the ring buffer, advancing *startp past
 * it.  'end' must point at a valid entry boundary; returns NULL when fewer
 * than a full event is available in [*startp, end).
 *
 * If the event wraps around the end of the buffer it is reassembled into
 * map->event_copy so the caller always sees a contiguous record.
 */
static union perf_event *perf_mmap__read(struct perf_mmap *map,
					 u64 *startp, u64 end)
{
	/* Payload starts one page in, after the control page. */
	unsigned char *data = map->base + page_size;
	union perf_event *event = NULL;
	int diff = end - *startp;

	if (diff >= (int)sizeof(event->header)) {
		size_t size;

		event = (union perf_event *)&data[*startp & map->mask];
		size = event->header.size;

		/* Reject corrupt sizes and partially-written events. */
		if (size < sizeof(event->header) || diff < (int)size)
			return NULL;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
			unsigned int offset = *startp;
			/* Copy at most sizeof(*event): event_copy is that big. */
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = map->event_copy;

			/* Piecewise copy: tail of the buffer, then the wrap. */
			do {
				cpy = min(map->mask + 1 - (offset & map->mask), len);
				memcpy(dst, &data[offset & map->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = (union perf_event *)map->event_copy;
		}

		*startp += size;
	}

	return event;
}

/*
 * Read event from ring buffer one by one.
 * Return one event for each call.
 *
 * Usage:
 * perf_mmap__read_init()
 * while(event = perf_mmap__read_event()) {
 *	//process the event
 *	perf_mmap__consume()
 * }
 * perf_mmap__read_done()
 */
union perf_event *perf_mmap__read_event(struct perf_mmap *map)
{
	union perf_event *event;

	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return NULL;

	/* non-overwrite doesn't pause the ringbuffer */
	if (!map->overwrite)
		map->end = perf_mmap__read_head(map);

	event = perf_mmap__read(map, &map->start, map->end);

	/* In non-overwrite mode track consumption so consume() can publish it. */
	if (!map->overwrite)
		map->prev = map->start;

	return event;
}

/*
 * True when everything published by the kernel has been consumed and no
 * auxtrace area keeps the map busy.
 */
static bool perf_mmap__empty(struct perf_mmap *map)
{
	return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
}

/* Take a reference on the map. */
void perf_mmap__get(struct perf_mmap *map)
{
	refcount_inc(&map->refcnt);
}

/* Drop a reference; the final put unmaps the ring buffer. */
void perf_mmap__put(struct perf_mmap *map)
{
	BUG_ON(map->base && refcount_read(&map->refcnt) == 0);

	if (refcount_dec_and_test(&map->refcnt))
		perf_mmap__munmap(map);
}

/*
 * Tell the kernel we are done with the events read so far (writes the tail
 * pointer in non-overwrite mode), and release the deferred reference taken
 * at mmap time once the buffer has fully drained.
 */
void perf_mmap__consume(struct perf_mmap *map)
{
	if (!map->overwrite) {
		u64 old = map->prev;

		perf_mmap__write_tail(map, old);
	}

	/* Last reference (the one from perf_mmap__mmap()) and nothing left? */
	if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
		perf_mmap__put(map);
}

/* Weak no-op stubs, overridden when auxtrace support is built in. */
int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
			       struct auxtrace_mmap_params *mp __maybe_unused,
			       void *userpg __maybe_unused,
			       int fd __maybe_unused)
{
	return 0;
}

void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
{
}

void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused,
				       off_t auxtrace_offset __maybe_unused,
				       unsigned int auxtrace_pages __maybe_unused,
				       bool auxtrace_overwrite __maybe_unused)
{
}

void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused,
					  struct evlist *evlist __maybe_unused,
					  int idx __maybe_unused,
					  bool per_cpu __maybe_unused)
{
}

#ifdef HAVE_AIO_SUPPORT
/* Non-zero when the user requested AIO trace writing (--aio). */
static int perf_mmap__aio_enabled(struct perf_mmap *map)
{
	return map->aio.nr_cblocks > 0;
}

#ifdef HAVE_LIBNUMA_SUPPORT
/*
 * With libnuma, allocate the AIO bounce buffer with mmap() so it can later
 * be NUMA-bound via mbind() in perf_mmap__aio_bind().
 */
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	/*
	 * NOTE(review): fd is 0 here; POSIX suggests -1 with MAP_ANONYMOUS
	 * (Linux ignores the fd, so this works, but -1 is the portable form).
	 */
	map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				  MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (map->aio.data[idx] == MAP_FAILED) {
		map->aio.data[idx] = NULL;
		return -1;
	}

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	if (map->aio.data[idx]) {
		munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
		map->aio.data[idx] = NULL;
	}
}

/*
 * Bind the AIO buffer to the NUMA node of the mmap'ed CPU, so the copy
 * from the ring buffer stays node-local.  Skipped for system-wide
 * affinity or single-node machines.
 */
static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity)
{
	void *data;
	size_t mmap_len;
	unsigned long node_mask;

	if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
		data = map->aio.data[idx];
		mmap_len = perf_mmap__mmap_len(map);
		node_mask = 1UL << cpu__get_node(cpu);
		if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
			pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
				data, data + mmap_len, cpu__get_node(cpu));
			return -1;
		}
	}

	return 0;
}
#else /* !HAVE_LIBNUMA_SUPPORT */
/* Without libnuma: plain heap allocation, no node binding possible. */
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
	if (map->aio.data[idx] == NULL)
		return -1;

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	zfree(&(map->aio.data[idx]));
}

static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused, int idx __maybe_unused,
		int cpu __maybe_unused, int affinity __maybe_unused)
{
	return 0;
}
#endif

/*
 * Set up AIO state for the map: one aiocb/control block/bounce buffer per
 * requested cblock.  Returns 0 on success, -1 on allocation/bind failure
 * (partially-allocated state is released later by perf_mmap__aio_munmap()
 * via perf_mmap__munmap()).
 */
static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
{
	int delta_max, i, prio, ret;

	map->aio.nr_cblocks = mp->nr_cblocks;
	if (map->aio.nr_cblocks) {
		map->aio.aiocb = calloc(map->aio.nr_cblocks, sizeof(struct aiocb *));
		if (!map->aio.aiocb) {
			pr_debug2("failed to allocate aiocb for data buffer, error %m\n");
			return -1;
		}
		map->aio.cblocks = calloc(map->aio.nr_cblocks, sizeof(struct aiocb));
		if (!map->aio.cblocks) {
			pr_debug2("failed to allocate cblocks for data buffer, error %m\n");
			return -1;
		}
		map->aio.data = calloc(map->aio.nr_cblocks, sizeof(void *));
		if (!map->aio.data) {
			pr_debug2("failed to allocate data buffer, error %m\n");
			return -1;
		}
		delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
		for (i = 0; i < map->aio.nr_cblocks; ++i) {
			ret = perf_mmap__aio_alloc(map, i);
			if (ret == -1) {
				pr_debug2("failed to allocate data buffer area, error %m");
				return -1;
			}
			ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity);
			if (ret == -1)
				return -1;
			/*
			 * Use cblock.aio_fildes value different from -1
			 * to denote started aio write operation on the
			 * cblock so it requires explicit record__aio_sync()
			 * call prior the cblock may be reused again.
			 */
			map->aio.cblocks[i].aio_fildes = -1;
			/*
			 * Allocate cblocks with priority delta to have
			 * faster aio write system calls because queued requests
			 * are kept in separate per-prio queues and adding
			 * a new request will iterate thru shorter per-prio
			 * list. Blocks with numbers higher than
			 *  _SC_AIO_PRIO_DELTA_MAX go with priority 0.
			 */
			prio = delta_max - i;
			map->aio.cblocks[i].aio_reqprio = prio >= 0 ? prio : 0;
		}
	}

	return 0;
}

/* Release everything perf_mmap__aio_mmap() allocated; safe on partial state. */
static void perf_mmap__aio_munmap(struct perf_mmap *map)
{
	int i;

	for (i = 0; i < map->aio.nr_cblocks; ++i)
		perf_mmap__aio_free(map, i);
	if (map->aio.data)
		zfree(&map->aio.data);
	zfree(&map->aio.cblocks);
	zfree(&map->aio.aiocb);
}
#else /* !HAVE_AIO_SUPPORT */
/* AIO compiled out: report disabled and make mmap/munmap no-ops. */
static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused)
{
	return 0;
}

static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
			       struct mmap_params *mp __maybe_unused)
{
	return 0;
}

static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused)
{
}
#endif

/*
 * Undo perf_mmap__mmap(): tear down AIO state, the compression data buffer,
 * the ring buffer mapping itself, and any auxtrace area.
 */
void perf_mmap__munmap(struct perf_mmap *map)
{
	perf_mmap__aio_munmap(map);
	if (map->data != NULL) {
		munmap(map->data, perf_mmap__mmap_len(map));
		map->data = NULL;
	}
	if (map->base != NULL) {
		munmap(map->base, perf_mmap__mmap_len(map));
		map->base = NULL;
		map->fd = -1;
		refcount_set(&map->refcnt, 0);
	}
	auxtrace_mmap__munmap(&map->auxtrace_mmap);
}

/* Build the cpu_set_t of all online CPUs belonging to NUMA 'node'. */
static void build_node_mask(int node, cpu_set_t *mask)
{
	int c, cpu, nr_cpus;
	const struct perf_cpu_map *cpu_map = NULL;

	cpu_map = cpu_map__online();
	if (!cpu_map)
		return;

	nr_cpus = perf_cpu_map__nr(cpu_map);
	for (c = 0; c < nr_cpus; c++) {
		cpu = cpu_map->map[c]; /* map c index to online cpu index */
		if (cpu__get_node(cpu) == node)
			CPU_SET(cpu, mask);
	}
}

/*
 * Compute the CPU affinity mask used while draining this map: the whole
 * NUMA node, the single CPU, or (PERF_AFFINITY_SYS) an empty mask.
 */
static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp)
{
	CPU_ZERO(&map->affinity_mask);
	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
		build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask);
	else if (mp->affinity == PERF_AFFINITY_CPU)
		CPU_SET(map->cpu, &map->affinity_mask);
}

/*
 * mmap the perf event ring buffer for 'fd' and initialize the map state
 * (affinity, flush threshold, compression buffer, auxtrace, AIO).
 * Returns 0 on success, -1 on failure.
 */
int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
{
	/*
	 * The last one will be done at perf_mmap__consume(), so that we
	 * make sure we don't prevent tools from consuming every last event in
	 * the ring buffer.
	 *
	 * I.e. we can get the POLLHUP meaning that the fd doesn't exist
	 * anymore, but the last events for it are still in the ring buffer,
	 * waiting to be consumed.
	 *
	 * Tools can choose to ignore this at their own discretion, but the
	 * evlist layer can't just drop it when filtering events in
	 * perf_evlist__filter_pollfd().
	 */
	refcount_set(&map->refcnt, 2);
	map->prev = 0;
	map->mask = mp->mask;
	map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
			 MAP_SHARED, fd, 0);
	if (map->base == MAP_FAILED) {
		pr_debug2("failed to mmap perf event ring buffer, error %d\n",
			  errno);
		map->base = NULL;
		return -1;
	}
	map->fd = fd;
	map->cpu = cpu;

	perf_mmap__setup_affinity_mask(map, mp);

	map->flush = mp->flush;

	map->comp_level = mp->comp_level;

	/*
	 * With compression but without AIO, a separate anonymous buffer is
	 * needed to hold the compressed data.
	 */
	if (map->comp_level && !perf_mmap__aio_enabled(map)) {
		map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				 MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
		if (map->data == MAP_FAILED) {
			pr_debug2("failed to mmap data buffer, error %d\n",
					errno);
			map->data = NULL;
			return -1;
		}
	}

	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
				&mp->auxtrace_mp, map->base, fd))
		return -1;

	return perf_mmap__aio_mmap(map, mp);
}

/*
 * For a full overwrite (backward) ring buffer, walk event headers forward
 * from *start to locate the usable end of data, stopping after one full
 * buffer's worth or at a zero-sized header.  Writes the result to *end.
 * Returns 0 on success (the -1 path is unreachable by construction).
 */
static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
	struct perf_event_header *pheader;
	u64 evt_head = *start;
	int size = mask + 1;

	pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
	pheader = (struct perf_event_header *)(buf + (*start & mask));
	while (true) {
		if (evt_head - *start >= (unsigned int)size) {
			pr_debug("Finished reading overwrite ring buffer: rewind\n");
			/* Step back if we overshot past exactly one buffer. */
			if (evt_head - *start > (unsigned int)size)
				evt_head -= pheader->size;
			*end = evt_head;
			return 0;
		}

		pheader = (struct perf_event_header *)(buf + (evt_head & mask));

		if (pheader->size == 0) {
			pr_debug("Finished reading overwrite ring buffer: get start\n");
			*end = evt_head;
			return 0;
		}

		evt_head += pheader->size;
		pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
	}
	WARN_ONCE(1, "Shouldn't get here\n");
	return -1;
}

/*
 * Report the start and end of the available data in ringbuffer
 */
static int __perf_mmap__read_init(struct perf_mmap *md)
{
	u64 head = perf_mmap__read_head(md);
	u64 old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;

	/* Overwrite mode reads backward: from head back to prev. */
	md->start = md->overwrite ? head : old;
	md->end = md->overwrite ? old : head;

	/* Not enough data yet to be worth waking the consumer. */
	if ((md->end - md->start) < md->flush)
		return -EAGAIN;

	size = md->end - md->start;
	if (size > (unsigned long)(md->mask) + 1) {
		if (!md->overwrite) {
			/* Writer lapped the reader: data was lost. */
			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");

			md->prev = head;
			perf_mmap__consume(md);
			return -EAGAIN;
		}

		/*
		 * Backward ring buffer is full. We still have a chance to read
		 * most of data from it.
		 */
		if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
			return -EINVAL;
	}

	return 0;
}

/* Public wrapper: bail out if the map was already unmapped. */
int perf_mmap__read_init(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return -ENOENT;

	return __perf_mmap__read_init(map);
}

/*
 * Drain the ring buffer through the 'push' callback (at most two calls:
 * the chunk up to the buffer wrap, then the remainder).
 * Returns 0 on success, 1 when there was nothing to do (-EAGAIN from
 * read_init), and -1 on error.
 */
int perf_mmap__push(struct perf_mmap *md, void *to,
		    int push(struct perf_mmap *map, void *to, void *buf, size_t size))
{
	u64 head = perf_mmap__read_head(md);
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	rc = perf_mmap__read_init(md);
	if (rc < 0)
		return (rc == -EAGAIN) ? 1 : -1;

	size = md->end - md->start;

	/* Data wraps around the end of the buffer: push the tail part first. */
	if ((md->start & md->mask) + size != (md->end & md->mask)) {
		buf = &data[md->start & md->mask];
		size = md->mask + 1 - (md->start & md->mask);
		md->start += size;

		if (push(md, to, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[md->start & md->mask];
	size = md->end - md->start;
	md->start += size;

	if (push(md, to, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = head;
	perf_mmap__consume(md);
out:
	return rc;
}

/*
 * Mandatory for overwrite mode
 * The direction of overwrite mode is backward.
 * The last perf_mmap__read() will set tail to map->prev.
 * Need to correct the map->prev to head which is the end of next read.
 */
void perf_mmap__read_done(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return;

	map->prev = perf_mmap__read_head(map);
}