/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * PMEM - Direct mapping physical memory pages to userland process
 *
 * Provide functions used for directly (w/o occupying kernel virtual address
 * space) allocating and exporting physical memory pages to userland.
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/sunddi.h>
#include <sys/ddidevmap.h>
#include <sys/vnode.h>
#include <sys/sysmacros.h>
#include <vm/seg_dev.h>
#include <sys/pmem.h>
#include <vm/hat_i86.h>
#include <sys/task.h>
#include <sys/sdt.h>

/*
 * The routines in this file allocate memory which will be accessed through
 * the AGP GART hardware.  The GART is programmed with the PFNs for this
 * memory, and the only mechanism for removing these entries is by an
 * explicit process operation (ioctl/close of the driver, or process exit).
 * As such, the pages need to remain locked to ensure that they won't be
 * relocated or paged out.
 *
 * To prevent these locked pages from getting in the way of page
 * coalescing, we try to allocate large pages from the system, and carve
 * them up to satisfy pmem allocation requests.  This will keep the locked
 * pages within a constrained area of physical memory, limiting the number
 * of large pages that would be pinned by our locked pages.  This is, of
 * course, another take on the infamous kernel cage, and it has many of the
 * downsides of the original cage.  It also interferes with system-wide
 * resource management decisions, as it maintains its own pool of unused
 * pages which can't be easily reclaimed and used during low-memory
 * situations.
 *
 * The right solution is for pmem to register a callback that the VM system
 * could call, which would temporarily remove any GART entries for pages
 * that were being relocated.  This would let us leave the pages unlocked,
 * which would remove the need for using large pages, which would simplify
 * this code a great deal.  Unfortunately, the support for these callbacks
 * only exists on some SPARC platforms right now.
 *
 * Note that this is the *only* reason that large pages are used here.  The
 * GART can't perform large-page translations, and the code appropriately
 * falls back to using small pages if page_create_va_large() fails.
 */
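
/*
 * Rough sketch of the expected call sequence from a driver; the xx_* names,
 * the length and the access attributes below are illustrative only and are
 * not part of this interface:
 *
 *	devmap_pmem_cookie_t cookie;
 *
 *	(ioctl/attach)	devmap_pmem_alloc(len, PMEM_SLEEP, &cookie);
 *	(devmap(9E))	devmap_pmem_setup(dhc, dip, &xx_callbackops, cookie,
 *			    0, len, PROT_ALL,
 *			    DEVMAP_DEFAULTS | IOMEM_DATA_UC_WR_COMBINE,
 *			    &xx_acc_attr);
 *	(program GART)	devmap_pmem_getpfns(cookie, 0, btop(len), xx_pfns);
 *	(close/cleanup)	devmap_pmem_free(cookie);
 *
 * Each call except devmap_pmem_free() returns DDI_SUCCESS or DDI_FAILURE
 * and must be checked accordingly.
 */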
#define	HOLD_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_enter(&dhp->dh_lock); }

#define	RELE_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_exit(&dhp->dh_lock); }

#define	FROM_LPG(pp) (pp->p_szc != 0)
#define	PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))

/*
 * Structs and static variables used for pmem only.
 */
typedef struct pmem_lpg {
	page_t	*pl_pp;		/* start pp */
	ulong_t	*pl_bitmap;	/* allocation status for each page */
	ushort_t pl_pfree;	/* this large page might be fully freed */
	struct pmem_lpg *pl_next;
	struct pmem_lpg *pl_prev;
} pmem_lpg_t;
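
/*
 * Note on the allocation records above: each bit in pl_bitmap tracks one
 * small page of the large page, with a set bit meaning "free" and a clear
 * bit meaning "allocated" (pmem_lpg_alloc() zeroes the bitmap, so a new
 * record starts out fully allocated, and lpg_isfree() looks for all ones).
 * PFIND() relies on large pages being naturally aligned, so the low bits
 * of a page's page number are its index within the enclosing large page.
 */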

static size_t	pmem_lpgsize;	/* the size of one large page */
static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
static uint_t	pmem_lszc;	/* page size code of the large page */
/* The segment to be associated with all the allocated pages. */
static struct seg	pmem_seg;
/* Fully occupied large pages allocated for pmem. */
static pmem_lpg_t	*pmem_occ_lpgs;
/* Memory pool to store residual small pages from large pages. */
static page_t	*pmem_mpool = NULL;
/* Number of small pages residing in pmem_mpool currently. */
static pgcnt_t	pmem_nmpages = 0;
/* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t	pmem_mutex;

static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, proc_t *p);

/*
 * Called by the driver devmap routine to pass physical memory mapping info to
 * the seg_dev framework; used only for physical memory allocated from
 * devmap_pmem_alloc().
 */
/* ARGSUSED */
int
devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
    struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
    offset_t off, size_t len, uint_t maxprot, uint_t flags,
    const ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * First check whether this function has already been called on
	 * this dhp.
	 */
	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Note that only
	 * uncacheable or write-combining mappings are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	if (flags & DEVMAP_MAPPING_INVALID) {
		/*
		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
		 * remap permission.
		 */
		if (!(flags & DEVMAP_ALLOW_REMAP))
			return (DDI_FAILURE);
	} else {
		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
		/* dh_roff is the offset inside the dh_pcookie. */
		dhp->dh_roff = ptob(btop(off));
		/* Set the cache attributes correctly */
		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
	}

	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
	dhp->dh_len = ptob(btopr(len));

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);

	if (callbackops != NULL) {
		bcopy(callbackops, &dhp->dh_callbackops,
		    sizeof (struct devmap_callback_ctl));
	}

	/*
	 * Initialize dh_lock if we want to do remap.
	 */
	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
		dhp->dh_flags |= DEVMAP_LOCK_INITED;
	}

	dhp->dh_flags |= DEVMAP_SETUP_DONE;

	return (DDI_SUCCESS);
}

/*
 * Replace the existing mapping using a new cookie; mainly called when doing
 * fork(). Should be called from the associated devmap_dup(9E).
 */
/* ARGSUSED */
int
devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
    devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
    uint_t flags, const ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	/*
	 * Return failure if setup has not been done or no remap permission
	 * has been granted during the setup.
	 */
	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
		return (DDI_FAILURE);

	/* No flags supported for remap yet. */
	if (flags != 0)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Note that only
	 * uncacheable or write-combining mappings are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	HOLD_DHP_LOCK(dhp);
	/*
	 * Unload the old mapping of pages related to this dhp, so the next
	 * fault will set up the new mappings. It is segdev_faultpage() that
	 * calls hat_devload() to establish the mapping. Do this while holding
	 * the dhp lock so other faults don't reestablish the mappings.
	 */
	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);

	/* Set the cache attributes correctly */
	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);

	dhp->dh_pcookie = cookie;
	dhp->dh_roff = ptob(btop(off));
	dhp->dh_len = ptob(btopr(len));

	/* Clear the large page size flag. */
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
	RELE_DHP_LOCK(dhp);
	return (DDI_SUCCESS);
}

/*
 * Directly (i.e., without occupying kernel virtual address space) allocate
 * 'npages' physical memory pages for exporting to userland. The allocated
 * page_t pointers will be recorded in the cookie.
 */
int
devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
{
	u_offset_t pmem_off = 0;
	page_t *pp = NULL;
	page_t *lpp = NULL;
	page_t *tlist = NULL;
	pgcnt_t i = 0;
	pgcnt_t rpages = 0;
	pgcnt_t lpages = 0;
	pgcnt_t tpages = 0;
	pgcnt_t npages = btopr(size);
	pmem_lpg_t *plp = NULL;
	struct devmap_pmem_cookie *pcp;
	uint_t reserved = 0;
	uint_t locked = 0;
	uint_t pflags, kflags;

	*cookiep = NULL;

	/*
	 * A number larger than this will cause page_create_va() to loop
	 * infinitely.
	 */
	if (npages == 0 || npages >= total_pages / 2)
		return (DDI_FAILURE);
	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
		return (DDI_FAILURE);
	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;

	/* Allocate pmem cookie. */
	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
		return (DDI_FAILURE);
	pcp->dp_npages = npages;

	/*
	 * See if the requested memory can be locked.
	 */
	pcp->dp_proc = curproc;
	if (pmem_lock(npages, curproc) == DDI_FAILURE)
		goto alloc_fail;
	locked = 1;
	/*
	 * First, grab as many pages as possible from pmem_mpool. If the
	 * pages in pmem_mpool are enough for this request, we are done.
	 */
	mutex_enter(&pmem_mutex);
	tpages = mpool_break(&tlist, npages);
	/* IOlock and hashin them into the new offset. */
	if (tpages)
		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
	mutex_exit(&pmem_mutex);

	if (tpages == npages)
		goto done;

	rpages = npages - tpages;
	/* Quit now if memory cannot be reserved. */
	if (!page_resv(rpages, kflags))
		goto alloc_fail;
	reserved = 1;

	/* If we have large pages */
	if (pmem_lpgsize > PAGESIZE) {
		/* Try to alloc large pages first to decrease fragmentation. */
		i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
		if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
		    kflags) == DDI_FAILURE)
			goto alloc_fail;
		ASSERT(lpages == 0 ? lpp == NULL : 1);
	}

	/*
	 * If the large pages hold more pages than requested, put the
	 * residual pages into pmem_mpool.
	 */
	if (lpages >= rpages) {
		lpp_break(&lpp, lpages, lpages - rpages, plp);
		goto done;
	}

	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
	i = rpages - lpages;
	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
		goto alloc_fail;

done:
	page_list_concat(&tlist, &lpp);
	page_list_concat(&tlist, &pp);
	/* Set those small pages from large pages as allocated. */
	mutex_enter(&pmem_mutex);
	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
	mutex_exit(&pmem_mutex);

	/*
	 * Now tlist holds all the pages for this cookie. Record these pages
	 * in the pmem cookie.
	 */
376 */ 377 for (pp = tlist, i = 0; i < npages; i++) { 378 pcp->dp_pparray[i] = pp; 379 page_io_unlock(pp); 380 pp = pp->p_next; 381 page_sub(&tlist, pp->p_prev); 382 } 383 ASSERT(tlist == NULL); 384 *cookiep = (devmap_pmem_cookie_t)pcp; 385 386 return (DDI_SUCCESS); 387 388 alloc_fail: 389 DTRACE_PROBE(pmem__alloc__fail); 390 /* Free large pages and the associated allocation records. */ 391 if (lpp) 392 lpp_free(lpp, lpages / pmem_pgcnt, &plp); 393 if (reserved == 1) 394 page_unresv(rpages); 395 /* Put those pages in tlist back into pmem_mpool. */ 396 if (tpages != 0) { 397 mutex_enter(&pmem_mutex); 398 /* IOunlock, hashout and update the allocation records. */ 399 tlist_out(tlist, tpages); 400 mpool_append(&tlist, tpages); 401 mutex_exit(&pmem_mutex); 402 } 403 if (locked == 1) 404 i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages)); 405 /* Freeing pmem_cookie. */ 406 kmem_free(pcp->dp_vnp, sizeof (vnode_t)); 407 kmem_free(pcp->dp_pparray, npages * sizeof (page_t *)); 408 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 409 return (DDI_FAILURE); 410 } 411 412 /* 413 * Free all small pages inside cookie, and return pages from large pages into 414 * mpool, if all the pages from one large page is in mpool, free it as a whole. 415 */ 416 void 417 devmap_pmem_free(devmap_pmem_cookie_t cookie) 418 { 419 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 420 pgcnt_t i; 421 pgcnt_t tpages = 0; 422 page_t *pp; 423 pmem_lpg_t *pl1, *plp; 424 pmem_lpg_t *pf_lpgs = NULL; 425 uint_t npls = 0; 426 pmem_lpg_t *last_pl = NULL; 427 pmem_lpg_t *plast_pl = NULL; 428 429 ASSERT(pcp); 430 mutex_enter(&pmem_mutex); 431 /* Free small pages and return them to memory pool. */ 432 for (i = pcp->dp_npages; i > 0; i--) { 433 pp = pcp->dp_pparray[i - 1]; 434 page_hashout(pp, NULL); 435 /* 436 * Remove the mapping of this single page, this mapping is 437 * created using hat_devload() in segdev_faultpage(). 438 */ 439 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 440 if (!FROM_LPG(pp)) { 441 /* Normal small page. */ 442 page_free(pp, 1); 443 page_unresv(1); 444 } else { 445 /* Small page from large pages. */ 446 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl); 447 if (plp && !(plp->pl_pfree)) { 448 /* 449 * Move this record to pf_lpgs list, this large 450 * page may be able to be freed as a whole. 451 */ 452 pmem_lpg_sub(&pmem_occ_lpgs, plp); 453 pmem_lpg_concat(&pf_lpgs, &plp); 454 plp->pl_pfree = 1; 455 npls++; 456 last_pl = NULL; 457 } else { 458 /* Search in pf_lpgs list. */ 459 plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl); 460 } 461 ASSERT(plp); 462 /* Mark this page as free. */ 463 BT_SET(plp->pl_bitmap, PFIND(pp)); 464 /* Record this page in pmem_mpool. */ 465 mpool_append(&pp, 1); 466 } 467 } 468 469 /* 470 * Find out the large pages whose pages have been freed, remove them 471 * from plp list, free them and the associated pmem_lpg struct. 472 */ 473 for (plp = pf_lpgs; npls != 0; npls--) { 474 pl1 = plp; 475 plp = plp->pl_next; 476 if (lpg_isfree(pl1)) { 477 /* 478 * Get one free large page. Find all pages in this 479 * large page and remove them from pmem_mpool. 480 */ 481 lpg_free(pl1->pl_pp); 482 /* Remove associated allocation records. */ 483 pmem_lpg_sub(&pf_lpgs, pl1); 484 pmem_lpg_free(&pf_lpgs, pl1); 485 tpages -= pmem_pgcnt; 486 } else 487 pl1->pl_pfree = 0; 488 } 489 /* Update allocation records accordingly. 
	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
	mutex_exit(&pmem_mutex);

	if (curproc == pcp->dp_proc)
		i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
}

/*
 * Extract the page frame numbers from the specified range in a cookie.
 */
int
devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
    pfn_t *pfnarray)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;

	if (pcp == NULL || start + npages > pcp->dp_npages)
		return (DDI_FAILURE);

	for (i = start; i < start + npages; i++)
		pfnarray[i - start] = pfn_to_mfn(pcp->dp_pparray[i]->p_pagenum);

	return (DDI_SUCCESS);
}

void
pmem_init()
{
	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
	pmem_lpgsize = page_get_pagesize(pmem_lszc);
	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
	bzero(&pmem_seg, sizeof (struct seg));
	pmem_seg.s_as = &kas;
}
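
/*
 * A note on pmem_init() above: pmem_lszc selects the smallest large page
 * size code the platform offers (or 0 if only the base page size exists,
 * in which case pmem_lpgsize == PAGESIZE and devmap_pmem_alloc() skips the
 * large page path entirely). On typical x86 hardware this is a 2 MB or
 * 4 MB page, i.e. a pmem_pgcnt of 512 or 1024 small pages.
 */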

/* Allocate kernel memory for one pmem cookie with n pages. */
static int
pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
{
	struct devmap_pmem_cookie *pcp;

	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
	    kflags)) == NULL)
		return (DDI_FAILURE);
	pcp = *pcpp;
	if ((pcp->dp_vnp =
	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	if ((pcp->dp_pparray =
	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Try to charge n pages against the process's locked-memory resource. */
static int
pmem_lock(pgcnt_t n, proc_t *p)
{
	if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Check whether all the pages in a large page have been freed. */
static int
lpg_isfree(pmem_lpg_t *plp)
{
	uint_t i;

	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
			return (0);
	/* All 1 means all pages are freed. */
	return (1);
}

/*
 * Use pp to find the associated large page allocation record, searching the
 * splp linked list with *last as a heuristic starting pointer. Return NULL
 * if not found.
 */
static pmem_lpg_t *
pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
{
	pmem_lpg_t *plp;
	pgcnt_t root_pfn;

	ASSERT(pp);
	if (splp == NULL)
		return (NULL);
	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);

	/* Try last winner first. */
	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
		goto pl_found;

	/* Else search the whole pmem_lpg list. */
	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
		plp = plp->pl_next;
		if (plp == splp) {
			plp = NULL;
			break;
		}
		ASSERT(plp->pl_pp);
	}

	*last = plp;

pl_found:
	return (*last);
}

/*
 * Remove one pmem_lpg plp from the oplpp list.
 */
static void
pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
{
	if (*oplpp == plp)
		*oplpp = plp->pl_next;		/* go to next pmem_lpg */

	if (*oplpp == plp)
		*oplpp = NULL;			/* pmem_lpg list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
}

/*
 * Concatenate the pmem_lpg list nplpp onto the end of list plpp.
 */
static void
pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
{
	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;

	if (*nplpp == NULL) {
		return;
	}
	if (*plpp == NULL) {
		*plpp = *nplpp;
		return;
	}
	s1p = *plpp;
	e1p = s1p->pl_prev;
	s2p = *nplpp;
	e2p = s2p->pl_prev;
	s1p->pl_prev = e2p;
	e2p->pl_next = s1p;
	e1p->pl_next = s2p;
	s2p->pl_prev = e1p;
}

/*
 * Allocate and initialize the allocation record of one large page; the
 * initial state of every page in it is 'allocated'.
 */
static pmem_lpg_t *
pmem_lpg_alloc(uint_t kflags)
{
	pmem_lpg_t *plp;

	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
	if (plp == NULL)
		return (NULL);
	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
	if (plp->pl_bitmap == NULL) {
		kmem_free(plp, sizeof (*plp));
		return (NULL);
	}
	plp->pl_next = plp->pl_prev = plp;
	return (plp);
}

/* Free one allocation record pointed to by plp. */
static void
pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
{
	if (*headp == plp)
		*headp = plp->pl_next;		/* go to next pmem_lpg_t */

	if (*headp == plp)
		*headp = NULL;			/* this list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
	kmem_free(plp, sizeof (*plp));
}

/* Free one large page headed by spp from pmem_mpool. */
static void
lpg_free(page_t *spp)
{
	page_t *pp1 = spp;
	uint_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (i = 0; i < pmem_pgcnt; i++) {
		/* Break pp1 from pmem_mpool. */
		page_sub(&pmem_mpool, pp1);
		pp1++;
	}
	/* Free pages in this large page. */
	page_free_pages(spp);
	page_unresv(pmem_pgcnt);
	pmem_nmpages -= pmem_pgcnt;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/* Put n pages in the *ppp list back into pmem_mpool. */
static void
mpool_append(page_t **ppp, pgcnt_t n)
{
	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Put back pages. */
	page_list_concat(&pmem_mpool, ppp);
	pmem_nmpages += n;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into the
 * *ppp list, and return the number of grabbed pages.
 */
static pgcnt_t
mpool_break(page_t **ppp, pgcnt_t n)
{
	pgcnt_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Grab the pages. */
	i = MIN(pmem_nmpages, n);
	*ppp = pmem_mpool;
	page_list_break(ppp, &pmem_mpool, i);
	pmem_nmpages -= i;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
	return (i);
}

/*
 * Create n large pages; on return *lpages holds the number of small pages
 * created and *plpp the list of their allocation records.
 */
static int
lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
    vnode_t *vnp, u_offset_t *offp, uint_t kflags)
{
	pgcnt_t i;
	pmem_lpg_t *plp;
	page_t *pp;

	for (i = 0, *lpages = 0; i < n; i++) {
		/* Allocate one large page at a time. */
		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
		if (pp == NULL)
			break;
		*offp += pmem_lpgsize;
		page_list_concat(lppp, &pp);
		*lpages += pmem_pgcnt;
		/* Add one allocation record for this large page. */
		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
			return (DDI_FAILURE);
		plp->pl_pp = pp;
		pmem_lpg_concat(plpp, &plp);
	}
	return (DDI_SUCCESS);
}

/*
 * Break the last r small pages from the large page list *lppp (n small pages
 * in total) and put them into pmem_mpool.
 */
static void
lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
{
	page_t *pp, *pp1;
	pgcnt_t i;
	pmem_lpg_t *plp;

	if (r == 0)
		return;
	ASSERT(*lppp != NULL && r < pmem_pgcnt);
	page_list_break(lppp, &pp, n - r);

	/* The residual should reside in the last large page. */
	plp = oplp->pl_prev;
	/* IOunlock and hashout the residual pages. */
	for (pp1 = pp, i = 0; i < r; i++) {
		page_io_unlock(pp1);
		page_hashout(pp1, NULL);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp1));
		pp1 = pp1->p_next;
	}
	ASSERT(pp1 == pp);
	/* Put these residual pages into the memory pool. */
	mutex_enter(&pmem_mutex);
	mpool_append(&pp, r);
	mutex_exit(&pmem_mutex);
}

/* Free large pages in lpp and the associated allocation records in plpp. */
static void
lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
{
	pgcnt_t i, j;
	page_t *pp = lpp, *pp1;
	pmem_lpg_t *plp1, *plp2;

	for (i = 0; i < lpgs; i++) {
		for (j = 0; j < pmem_pgcnt; j++) {
			/* IO unlock and hashout this small page. */
			page_io_unlock(pp);
			page_hashout(pp, NULL);
			pp1 = pp->p_next;
			pp->p_prev = pp->p_next = pp;
			pp = pp1;
		}
		/* Free one large page at a time. */
		page_free_pages(lpp);
		lpp = pp;
	}
	/* Free the associated pmem large page allocation records. */
	for (plp1 = *plpp; *plpp; plp1 = plp2) {
		plp2 = plp1->pl_next;
		pmem_lpg_free(plpp, plp1);
	}
}

/*
 * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
 * and offsets starting at *poffp. Update the allocation records accordingly
 * at the same time.
 */
static void
tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_lock(pp);
		(void) page_hashin(pp, pvnp, *poffp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as allocated. */
		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
		*poffp += PAGESIZE;
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}

/*
 * IOunlock and hashout all pages in tlist, updating the allocation records
 * accordingly at the same time.
 */
static void
tlist_out(page_t *tlist, pgcnt_t tpages)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_unlock(pp);
		page_hashout(pp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp));
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}