/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * PMEM - Direct mapping physical memory pages to userland process
 *
 * Provide functions used for directly (w/o occupying kernel virtual address
 * space) allocating and exporting physical memory pages to userland.
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/sunddi.h>
#include <sys/ddidevmap.h>
#include <sys/vnode.h>
#include <sys/sysmacros.h>
#include <vm/seg_dev.h>
#include <sys/pmem.h>
#include <vm/hat_i86.h>
#include <sys/task.h>
#include <sys/sdt.h>

/*
 * The routines in this file allocate memory which will be accessed through
 * the AGP GART hardware. The GART is programmed with the PFNs for this
 * memory, and the only mechanism for removing these entries is by an
 * explicit process operation (ioctl/close of the driver, or process exit).
 * As such, the pages need to remain locked to ensure that they won't be
 * relocated or paged out.
 *
 * To prevent these locked pages from getting in the way of page
 * coalescing, we try to allocate large pages from the system, and carve
 * them up to satisfy pmem allocation requests. This will keep the locked
 * pages within a constrained area of physical memory, limiting the number
 * of large pages that would be pinned by our locked pages. This is, of
 * course, another take on the infamous kernel cage, and it has many of the
 * downsides of the original cage. It also interferes with system-wide
 * resource management decisions, as it maintains its own pool of unused
 * pages which can't be easily reclaimed and used during low-memory
 * situations.
 *
 * The right solution is for pmem to register a callback that the VM system
 * could call, which would temporarily remove any GART entries for pages
 * that were being relocated. This would let us leave the pages unlocked,
 * which would remove the need for using large pages, which would simplify
 * this code a great deal. Unfortunately, the support for these callbacks
 * only exists on some SPARC platforms right now.
 *
 * Note that this is the *only* reason that large pages are used here. The
 * GART can't perform large-page translations, and the code appropriately
 * falls back to using small pages if page_create_va_large() fails.
 */
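
/*
 * Illustrative sketch only (the xx_* names, lengths and error codes are
 * hypothetical, not part of this file): a driver's devmap(9E) entry point
 * might export pmem-backed pages to a user mapping roughly as follows.
 *
 *	static int
 *	xx_devmap(dev_t dev, devmap_cookie_t dhc, offset_t off, size_t len,
 *	    size_t *maplen, uint_t model)
 *	{
 *		devmap_pmem_cookie_t cookie;
 *		size_t length = ptob(btopr(len));
 *
 *		if (devmap_pmem_alloc(length, PMEM_SLEEP, &cookie) !=
 *		    DDI_SUCCESS)
 *			return (ENOMEM);
 *		if (devmap_pmem_setup(dhc, xx_dip, &xx_callbackops, cookie,
 *		    0, length, PROT_ALL, IOMEM_DATA_UC_WR_COMBINE, NULL) !=
 *		    DDI_SUCCESS) {
 *			devmap_pmem_free(cookie);
 *			return (EINVAL);
 *		}
 *		*maplen = length;
 *		return (0);
 *	}
 *
 * The PFNs needed to program the GART can then be pulled from the cookie
 * with devmap_pmem_getpfns(), and devmap_pmem_free() releases the pages
 * once the corresponding GART entries have been removed.
 */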

#define	HOLD_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_enter(&dhp->dh_lock); }

#define	RELE_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_exit(&dhp->dh_lock); }

#define	FROM_LPG(pp)	(pp->p_szc != 0)
#define	PFIND(pp)	(page_pptonum(pp) & (pmem_pgcnt - 1))

/*
 * Structs and static variables used for pmem only.
 */
typedef struct pmem_lpg {
	page_t	*pl_pp;		/* start pp */
	ulong_t	*pl_bitmap;	/* allocation status for each page */
	ushort_t pl_pfree;	/* this large page might be fully freed */
	struct pmem_lpg *pl_next;
	struct pmem_lpg *pl_prev;
} pmem_lpg_t;

static size_t	pmem_lpgsize;	/* the size of one large page */
static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
static uint_t	pmem_lszc;	/* page size code of the large page */
/* The segment to be associated with all the allocated pages. */
static struct seg	pmem_seg;
/* Fully occupied large pages allocated for pmem. */
static pmem_lpg_t	*pmem_occ_lpgs;
/* Memory pool to store residual small pages from large pages. */
static page_t	*pmem_mpool = NULL;
/* Number of small pages residing in pmem_mpool currently. */
static pgcnt_t	pmem_nmpages = 0;
/* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t	pmem_mutex;

static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, proc_t *p);

/*
 * Called by driver devmap routine to pass physical memory mapping info to
 * seg_dev framework, used only for physical memory allocated from
 * devmap_pmem_alloc().
 */
/* ARGSUSED */
int
devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
    struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
    offset_t off, size_t len, uint_t maxprot, uint_t flags,
    ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * First, check whether this function has already been called for
	 * this dhp.
	 */
	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Note that only
	 * uncachable or write-combining mappings are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	if (flags & DEVMAP_MAPPING_INVALID) {
		/*
		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
		 * remap permission.
		 */
		if (!(flags & DEVMAP_ALLOW_REMAP))
			return (DDI_FAILURE);
	} else {
		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
		/* dh_roff is the offset inside the dh_pcookie. */
		dhp->dh_roff = ptob(btop(off));
		/* Set the cache attributes correctly */
		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
	}

	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
	dhp->dh_len = ptob(btopr(len));

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);

	if (callbackops != NULL) {
		bcopy(callbackops, &dhp->dh_callbackops,
		    sizeof (struct devmap_callback_ctl));
	}

	/*
	 * Initialize dh_lock if we want to do remap.
	 */
	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
		dhp->dh_flags |= DEVMAP_LOCK_INITED;
	}

	dhp->dh_flags |= DEVMAP_SETUP_DONE;

	return (DDI_SUCCESS);
}

/*
 * Replace existing mapping using a new cookie, mainly gets called when doing
 * fork(). Should be called in associated devmap_dup(9E).
 */
/* ARGSUSED */
int
devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
    devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
    uint_t flags, ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);

	/*
	 * Return failure if setup has not been done or no remap permission
	 * was granted during the setup.
	 */
	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
		return (DDI_FAILURE);

	/* No flags supported for remap yet. */
	if (flags != 0)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Note that only
	 * uncachable or write-combining mappings are permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	HOLD_DHP_LOCK(dhp);
	/*
	 * Unload the old mapping of the pages related to this dhp, so the
	 * next fault will set up the new mappings. The mapping is established
	 * by hat_devload(), called from segdev_faultpage(). Do this while
	 * holding the dhp lock so other faults don't reestablish the mappings.
	 */
	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);

	/* Set the cache attributes correctly */
	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);

	dhp->dh_pcookie = cookie;
	dhp->dh_roff = ptob(btop(off));
	dhp->dh_len = ptob(btopr(len));

	/* Clear the large page size flag. */
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
	RELE_DHP_LOCK(dhp);
	return (DDI_SUCCESS);
}
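
/*
 * Illustrative sketch only (the xx_* names are hypothetical, not part of
 * this file): on fork(), a driver that wants the child to get its own pages
 * rather than share the parent's might, in its devmap_dup(9E) entry,
 * allocate a fresh cookie and remap the child's handle to it:
 *
 *	static int
 *	xx_devmap_dup(devmap_cookie_t dhc, void *pvtp,
 *	    devmap_cookie_t new_dhc, void **new_pvtp)
 *	{
 *		devmap_pmem_cookie_t ncookie;
 *
 *		if (devmap_pmem_alloc(xx_len, PMEM_SLEEP, &ncookie) !=
 *		    DDI_SUCCESS)
 *			return (ENOMEM);
 *		if (devmap_pmem_remap(new_dhc, xx_dip, ncookie, 0, xx_len,
 *		    PROT_ALL, 0, NULL) != DDI_SUCCESS) {
 *			devmap_pmem_free(ncookie);
 *			return (EINVAL);
 *		}
 *		*new_pvtp = pvtp;
 *		return (0);
 *	}
 *
 * This requires that DEVMAP_ALLOW_REMAP was passed to devmap_pmem_setup(),
 * since devmap_pmem_remap() fails otherwise.
 */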

/*
 * Directly (i.e., without occupying kernel virtual address space) allocate
 * 'npages' physical memory pages for exporting to user land. The allocated
 * page_t pointer will be recorded in cookie.
 */
int
devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
{
	u_offset_t	pmem_off = 0;
	page_t		*pp = NULL;
	page_t		*lpp = NULL;
	page_t		*tlist = NULL;
	pgcnt_t		i = 0;
	pgcnt_t		rpages = 0;
	pgcnt_t		lpages = 0;
	pgcnt_t		tpages = 0;
	pgcnt_t		npages = btopr(size);
	pmem_lpg_t	*plp = NULL;
	struct devmap_pmem_cookie	*pcp;
	uint_t		reserved = 0;
	uint_t		locked = 0;
	uint_t		pflags, kflags;

	*cookiep = NULL;

	/*
	 * A number larger than this will cause page_create_va() to loop
	 * infinitely.
	 */
	if (npages == 0 || npages >= total_pages / 2)
		return (DDI_FAILURE);
	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
		return (DDI_FAILURE);
	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;

	/* Allocate pmem cookie. */
	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
		return (DDI_FAILURE);
	pcp->dp_npages = npages;

	/*
	 * See if the requested memory can be locked.
	 */
	pcp->dp_proc = curproc;
	if (pmem_lock(npages, curproc) == DDI_FAILURE)
		goto alloc_fail;
	locked = 1;
	/*
	 * First, grab as many pages as possible from pmem_mpool. If the
	 * pages in pmem_mpool are enough for this request, we are done.
	 */
	mutex_enter(&pmem_mutex);
	tpages = mpool_break(&tlist, npages);
	/* IOlock and hashin them into the new offset. */
	if (tpages)
		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
	mutex_exit(&pmem_mutex);

	if (tpages == npages)
		goto done;

	rpages = npages - tpages;
	/* Quit now if memory cannot be reserved. */
	if (!page_resv(rpages, kflags))
		goto alloc_fail;
	reserved = 1;

	/* Try to allocate large pages first to decrease fragmentation. */
	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
	    kflags) == DDI_FAILURE)
		goto alloc_fail;
	ASSERT(lpages == 0 ? lpp == NULL : 1);

	/*
	 * If the large pages contain more pages than requested, put the
	 * residual pages into pmem_mpool.
	 */
	if (lpages >= rpages) {
		lpp_break(&lpp, lpages, lpages - rpages, plp);
		goto done;
	}

	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
	i = rpages - lpages;
	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
		goto alloc_fail;

done:
	page_list_concat(&tlist, &lpp);
	page_list_concat(&tlist, &pp);
	/* Set those small pages from large pages as allocated. */
	mutex_enter(&pmem_mutex);
	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
	mutex_exit(&pmem_mutex);

	/*
	 * Now tlist holds all the pages for this cookie. Record these pages
	 * in the pmem cookie.
375 */ 376 for (pp = tlist, i = 0; i < npages; i++) { 377 pcp->dp_pparray[i] = pp; 378 page_io_unlock(pp); 379 pp = pp->p_next; 380 page_sub(&tlist, pp->p_prev); 381 } 382 ASSERT(tlist == NULL); 383 *cookiep = (devmap_pmem_cookie_t)pcp; 384 385 return (DDI_SUCCESS); 386 387 alloc_fail: 388 DTRACE_PROBE(pmem__alloc__fail); 389 /* Free large pages and the associated allocation records. */ 390 if (lpp) 391 lpp_free(lpp, lpages / pmem_pgcnt, &plp); 392 if (reserved == 1) 393 page_unresv(rpages); 394 /* Put those pages in tlist back into pmem_mpool. */ 395 if (tpages != 0) { 396 mutex_enter(&pmem_mutex); 397 /* IOunlock, hashout and update the allocation records. */ 398 tlist_out(tlist, tpages); 399 mpool_append(&tlist, tpages); 400 mutex_exit(&pmem_mutex); 401 } 402 if (locked == 1) 403 i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages)); 404 /* Freeing pmem_cookie. */ 405 kmem_free(pcp->dp_vnp, sizeof (vnode_t)); 406 kmem_free(pcp->dp_pparray, npages * sizeof (page_t *)); 407 kmem_free(pcp, sizeof (struct devmap_pmem_cookie)); 408 return (DDI_FAILURE); 409 } 410 411 /* 412 * Free all small pages inside cookie, and return pages from large pages into 413 * mpool, if all the pages from one large page is in mpool, free it as a whole. 414 */ 415 void 416 devmap_pmem_free(devmap_pmem_cookie_t cookie) 417 { 418 struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie; 419 pgcnt_t i; 420 pgcnt_t tpages = 0; 421 page_t *pp; 422 pmem_lpg_t *pl1, *plp; 423 pmem_lpg_t *pf_lpgs = NULL; 424 uint_t npls = 0; 425 pmem_lpg_t *last_pl = NULL; 426 pmem_lpg_t *plast_pl = NULL; 427 428 ASSERT(pcp); 429 mutex_enter(&pmem_mutex); 430 /* Free small pages and return them to memory pool. */ 431 for (i = pcp->dp_npages; i > 0; i--) { 432 pp = pcp->dp_pparray[i - 1]; 433 page_hashout(pp, NULL); 434 /* 435 * Remove the mapping of this single page, this mapping is 436 * created using hat_devload() in segdev_faultpage(). 437 */ 438 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 439 if (!FROM_LPG(pp)) { 440 /* Normal small page. */ 441 page_free(pp, 1); 442 page_unresv(1); 443 } else { 444 /* Small page from large pages. */ 445 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl); 446 if (plp && !(plp->pl_pfree)) { 447 /* 448 * Move this record to pf_lpgs list, this large 449 * page may be able to be freed as a whole. 450 */ 451 pmem_lpg_sub(&pmem_occ_lpgs, plp); 452 pmem_lpg_concat(&pf_lpgs, &plp); 453 plp->pl_pfree = 1; 454 npls++; 455 last_pl = NULL; 456 } else { 457 /* Search in pf_lpgs list. */ 458 plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl); 459 } 460 ASSERT(plp); 461 /* Mark this page as free. */ 462 BT_SET(plp->pl_bitmap, PFIND(pp)); 463 /* Record this page in pmem_mpool. */ 464 mpool_append(&pp, 1); 465 } 466 } 467 468 /* 469 * Find out the large pages whose pages have been freed, remove them 470 * from plp list, free them and the associated pmem_lpg struct. 471 */ 472 for (plp = pf_lpgs; npls != 0; npls--) { 473 pl1 = plp; 474 plp = plp->pl_next; 475 if (lpg_isfree(pl1)) { 476 /* 477 * Get one free large page. Find all pages in this 478 * large page and remove them from pmem_mpool. 479 */ 480 lpg_free(pl1->pl_pp); 481 /* Remove associated allocation records. */ 482 pmem_lpg_sub(&pf_lpgs, pl1); 483 pmem_lpg_free(&pf_lpgs, pl1); 484 tpages -= pmem_pgcnt; 485 } else 486 pl1->pl_pfree = 0; 487 } 488 /* Update allocation records accordingly. 
	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
	mutex_exit(&pmem_mutex);

	if (curproc == pcp->dp_proc)
		i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
}

/*
 * Extract the page frame numbers from a specified range in a cookie.
 */
int
devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
    pfn_t *pfnarray)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;

	if (pcp == NULL || start + npages > pcp->dp_npages)
		return (DDI_FAILURE);

	for (i = start; i < start + npages; i++)
		pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum;
	return (DDI_SUCCESS);
}

void
pmem_init()
{
	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
	pmem_lpgsize = page_get_pagesize(pmem_lszc);
	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
	bzero(&pmem_seg, sizeof (struct seg));
	pmem_seg.s_as = &kas;
}

/* Allocate kernel memory for one pmem cookie with n pages. */
static int
pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
{
	struct devmap_pmem_cookie *pcp;

	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
	    kflags)) == NULL)
		return (DDI_FAILURE);
	pcp = *pcpp;
	if ((pcp->dp_vnp =
	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	if ((pcp->dp_pparray =
	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Try to lock down the resource for n pages. */
static int
pmem_lock(pgcnt_t n, proc_t *p)
{
	if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Check if all the pages in a large page are freed. */
static int
lpg_isfree(pmem_lpg_t *plp)
{
	uint_t i;

	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
			return (0);
	/* All 1 means all pages are freed. */
	return (1);
}

/*
 * Use pp to find the associated large page allocation record, searching the
 * splp linked list with *last as a heuristic hint. Return NULL if not found.
 */
static pmem_lpg_t *
pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
{
	pmem_lpg_t *plp;
	pgcnt_t root_pfn;

	ASSERT(pp);
	if (splp == NULL)
		return (NULL);
	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);

	/* Try last winner first. */
	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
		goto pl_found;

	/* Else search the whole pmem_lpg list. */
	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
		plp = plp->pl_next;
		if (plp == splp) {
			plp = NULL;
			break;
		}
		ASSERT(plp->pl_pp);
	}

	*last = plp;

pl_found:
	return (*last);
}
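
/*
 * The pmem_lpg_t allocation records above and below are kept on circular,
 * doubly linked lists: a single record points to itself through
 * pl_next/pl_prev, and the tail of a list is always reachable as
 * (*headp)->pl_prev. pmem_lpg_sub(), pmem_lpg_concat(), pmem_lpg_alloc()
 * and pmem_lpg_free() all maintain this convention.
 */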

/*
 * Remove one pmem_lpg plp from the oplpp list.
 */
static void
pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
{
	if (*oplpp == plp)
		*oplpp = plp->pl_next;		/* go to next pmem_lpg */

	if (*oplpp == plp)
		*oplpp = NULL;			/* pmem_lpg list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
}

/*
 * Concatenate page list nplpp onto the end of list plpp.
 */
static void
pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
{
	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;

	if (*nplpp == NULL) {
		return;
	}
	if (*plpp == NULL) {
		*plpp = *nplpp;
		return;
	}
	s1p = *plpp;
	e1p = s1p->pl_prev;
	s2p = *nplpp;
	e2p = s2p->pl_prev;
	s1p->pl_prev = e2p;
	e2p->pl_next = s1p;
	e1p->pl_next = s2p;
	s2p->pl_prev = e1p;
}

/*
 * Allocate and initialize the allocation record of one large page; the
 * initial value is 'allocated'.
 */
static pmem_lpg_t *
pmem_lpg_alloc(uint_t kflags)
{
	pmem_lpg_t *plp;

	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
	if (plp == NULL)
		return (NULL);
	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
	if (plp->pl_bitmap == NULL) {
		kmem_free(plp, sizeof (*plp));
		return (NULL);
	}
	plp->pl_next = plp->pl_prev = plp;
	return (plp);
}

/* Free allocation record plp, removing it from the *headp list. */
static void
pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
{
	if (*headp == plp)
		*headp = plp->pl_next;		/* go to next pmem_lpg_t */

	if (*headp == plp)
		*headp = NULL;			/* this list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
	kmem_free(plp, sizeof (*plp));
}

/* Free one large page headed by spp from pmem_mpool. */
static void
lpg_free(page_t *spp)
{
	page_t *pp1 = spp;
	uint_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (i = 0; i < pmem_pgcnt; i++) {
		/* Break pp1 from pmem_mpool. */
		page_sub(&pmem_mpool, pp1);
		pp1++;
	}
	/* Free pages in this large page. */
	page_free_pages(spp);
	page_unresv(pmem_pgcnt);
	pmem_nmpages -= pmem_pgcnt;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/* Put n pages in *ppp list back into pmem_mpool. */
static void
mpool_append(page_t **ppp, pgcnt_t n)
{
	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Put back pages. */
	page_list_concat(&pmem_mpool, ppp);
	pmem_nmpages += n;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
 * list, and return the number of grabbed pages.
 */
static pgcnt_t
mpool_break(page_t **ppp, pgcnt_t n)
{
	pgcnt_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Grab the pages. */
	i = MIN(pmem_nmpages, n);
	*ppp = pmem_mpool;
	page_list_break(ppp, &pmem_mpool, i);
	pmem_nmpages -= i;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
	return (i);
}

/*
 * Create n large pages; lpages and plpp return the number of small pages
 * and the list of allocation records, respectively.
 */
static int
lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
    vnode_t *vnp, u_offset_t *offp, uint_t kflags)
{
	pgcnt_t i;
	pmem_lpg_t *plp;
	page_t *pp;

	for (i = 0, *lpages = 0; i < n; i++) {
		/* Allocate one large page each time. */
		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
		if (pp == NULL)
			break;
		*offp += pmem_lpgsize;
		page_list_concat(lppp, &pp);
		*lpages += pmem_pgcnt;
		/* Add one allocation record for this large page. */
		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
			return (DDI_FAILURE);
		plp->pl_pp = pp;
		pmem_lpg_concat(plpp, &plp);
	}
	return (DDI_SUCCESS);
}

/*
 * Break the last r small pages off the large page list *lppp (which contains
 * n small pages in total) and put them into pmem_mpool.
 */
static void
lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
{
	page_t *pp, *pp1;
	pgcnt_t i;
	pmem_lpg_t *plp;

	if (r == 0)
		return;
	ASSERT(*lppp != NULL && r < pmem_pgcnt);
	page_list_break(lppp, &pp, n - r);

	/* The residual should reside in the last large page. */
	plp = oplp->pl_prev;
	/* IOunlock and hashout the residual pages. */
	for (pp1 = pp, i = 0; i < r; i++) {
		page_io_unlock(pp1);
		page_hashout(pp1, NULL);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp1));
		pp1 = pp1->p_next;
	}
	ASSERT(pp1 == pp);
	/* Put these residual pages into memory pool. */
	mutex_enter(&pmem_mutex);
	mpool_append(&pp, r);
	mutex_exit(&pmem_mutex);
}

/* Free large pages in lpp and the associated allocation records in plpp. */
static void
lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
{
	pgcnt_t i, j;
	page_t *pp = lpp, *pp1;
	pmem_lpg_t *plp1, *plp2;

	for (i = 0; i < lpgs; i++) {
		for (j = 0; j < pmem_pgcnt; j++) {
			/* IO unlock and hashout this small page. */
			page_io_unlock(pp);
			page_hashout(pp, NULL);
			pp1 = pp->p_next;
			pp->p_prev = pp->p_next = pp;
			pp = pp1;
		}
		/* Free one large page at one time. */
		page_free_pages(lpp);
		lpp = pp;
	}
	/* Free the associated pmem large page allocation records. */
	for (plp1 = *plpp; *plpp; plp1 = plp2) {
		plp2 = plp1->pl_next;
		pmem_lpg_free(plpp, plp1);
	}
}

/*
 * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
 * and offset starting with *poffp. Update allocation records accordingly at
 * the same time.
 */
static void
tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_lock(pp);
		(void) page_hashin(pp, pvnp, *poffp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as allocated. */
		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
		*poffp += PAGESIZE;
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}

/*
 * IOunlock and hashout all pages in tlist, update allocation records
 * accordingly at the same time.
 */
static void
tlist_out(page_t *tlist, pgcnt_t tpages)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_unlock(pp);
		page_hashout(pp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp));
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}