/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * PMEM - Direct mapping physical memory pages to userland process
 *
 * Provide functions used for directly (w/o occupying kernel virtual address
 * space) allocating and exporting physical memory pages to userland.
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/sunddi.h>
#include <sys/ddidevmap.h>
#include <sys/vnode.h>
#include <sys/sysmacros.h>
#include <sys/project.h>
#include <vm/seg_dev.h>
#include <sys/pmem.h>
#include <vm/hat_i86.h>
#include <sys/task.h>
#include <sys/sdt.h>

/*
 * The routines in this file allocate memory which will be accessed through
 * the AGP GART hardware.  The GART is programmed with the PFNs for this
 * memory, and the only mechanism for removing these entries is by an
 * explicit process operation (ioctl/close of the driver, or process exit).
 * As such, the pages need to remain locked to ensure that they won't be
 * relocated or paged out.
 *
 * To prevent these locked pages from getting in the way of page
 * coalescing, we try to allocate large pages from the system, and carve
 * them up to satisfy pmem allocation requests.  This will keep the locked
 * pages within a constrained area of physical memory, limiting the number
 * of large pages that would be pinned by our locked pages.  This is, of
 * course, another take on the infamous kernel cage, and it has many of the
 * downsides of the original cage.  It also interferes with system-wide
 * resource management decisions, as it maintains its own pool of unused
 * pages which can't be easily reclaimed and used during low-memory
 * situations.
 *
 * The right solution is for pmem to register a callback that the VM system
 * could call, which would temporarily remove any GART entries for pages
 * that were being relocated.  This would let us leave the pages unlocked,
 * which would remove the need for using large pages, which would simplify
 * this code a great deal.  Unfortunately, the support for these callbacks
 * only exists on some SPARC platforms right now.
 *
 * Note that this is the *only* reason that large pages are used here.  The
 * GART can't perform large-page translations, and the code appropriately
 * falls back to using small pages if page_create_va_large() fails.
 */
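
/*
 * Rough usage sketch from a consuming driver's point of view (the my_*()
 * names, error values and callback/attribute structures below are
 * illustrative assumptions, not part of this file):
 *
 *	devmap_pmem_cookie_t cookie;
 *
 *	if (devmap_pmem_alloc(len, PMEM_SLEEP, &cookie) != DDI_SUCCESS)
 *		return (ENOMEM);
 *	if (devmap_pmem_setup(dhc, my_dip, &my_callback_ctl, cookie, 0, len,
 *	    PROT_ALL, DEVMAP_DEFAULTS, &my_acc_attr) != DDI_SUCCESS) {
 *		devmap_pmem_free(cookie);
 *		return (EIO);
 *	}
 *
 * The cookie must eventually be released with devmap_pmem_free(), for
 * example from the driver's ioctl/close path.
 */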

#define	HOLD_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_enter(&dhp->dh_lock); }

#define	RELE_DHP_LOCK(dhp)	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
			{ mutex_exit(&dhp->dh_lock); }

#define	FROM_LPG(pp)	(pp->p_szc != 0)
#define	PFIND(pp)	(page_pptonum(pp) & (pmem_pgcnt - 1))

/*
 * Structs and static variables used for pmem only.
 */
typedef struct pmem_lpg {
	page_t	*pl_pp;		/* start pp */
	ulong_t	*pl_bitmap;	/* allocation status for each page */
	ushort_t pl_pfree;	/* this large page might be fully freed */
	struct pmem_lpg *pl_next;
	struct pmem_lpg *pl_prev;
} pmem_lpg_t;

static size_t	pmem_lpgsize;	/* the size of one large page */
static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
static uint_t	pmem_lszc;	/* page size code of the large page */
/* The segment to be associated with all the allocated pages. */
static struct seg	pmem_seg;
/* Fully occupied large pages allocated for pmem. */
static pmem_lpg_t	*pmem_occ_lpgs;
/* Memory pool to store residual small pages from large pages. */
static page_t	*pmem_mpool = NULL;
/* Number of small pages residing in pmem_mpool currently. */
static pgcnt_t	pmem_nmpages = 0;
/* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t	pmem_mutex;

static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, kproject_t **);

/*
 * Called by the driver devmap routine to pass physical memory mapping info
 * to the seg_dev framework; used only for physical memory allocated from
 * devmap_pmem_alloc().
 */
/* ARGSUSED */
int
devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
    struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
    offset_t off, size_t len, uint_t maxprot, uint_t flags,
    ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	/*
	 * First check whether this function has already been called for
	 * this dhp.
	 */
	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	if (flags & DEVMAP_MAPPING_INVALID) {
		/*
		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
		 * remap permission.
		 */
		if (!(flags & DEVMAP_ALLOW_REMAP))
			return (DDI_FAILURE);
	} else {
		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
		/* dh_roff is the offset inside the dh_pcookie. */
		dhp->dh_roff = ptob(btop(off));
	}

	/*
	 * Only "No Cache" and "Write Combining" are supported.  If any other
	 * cache type is specified, override with "No Cache".
	 */
	if (accattrp->devacc_attr_dataorder == DDI_MERGING_OK_ACC)
		dhp->dh_hat_attr = HAT_PLAT_NOCACHE | HAT_MERGING_OK;
	else
		dhp->dh_hat_attr = HAT_PLAT_NOCACHE | HAT_STRICTORDER;
	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
	dhp->dh_len = ptob(btopr(len));

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);

	if (callbackops != NULL) {
		bcopy(callbackops, &dhp->dh_callbackops,
		    sizeof (struct devmap_callback_ctl));
	}

	/*
	 * Initialize dh_lock if we want to do remap.
	 */
	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
		dhp->dh_flags |= DEVMAP_LOCK_INITED;
	}

	dhp->dh_flags |= DEVMAP_SETUP_DONE;

	return (DDI_SUCCESS);
}

/*
 * Replace an existing mapping using a new cookie; mainly gets called when
 * doing fork().  Should be called from the associated devmap_dup(9E).
 */
/* ARGSUSED */
int
devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
    devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
    uint_t flags, ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;

	/*
	 * Return failure if setup has not been done or no remap permission
	 * has been granted during the setup.
	 */
	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
		return (DDI_FAILURE);

	/* No flags supported for remap yet. */
	if (flags != 0)
		return (DDI_FAILURE);

	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
		return (DDI_FAILURE);

	HOLD_DHP_LOCK(dhp);
	/*
	 * Unload the old mapping of pages associated with this dhp, so the
	 * next fault will set up the new mappings.  It is segdev_faultpage
	 * that calls hat_devload() to establish the mapping.  Do this while
	 * holding the dhp lock so other faults don't reestablish the
	 * mappings.
	 */
	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);

	/*
	 * Only "No Cache" and "Write Combining" are supported; if any other
	 * cache type is specified, override with "No Cache".
	 */
	if (accattrp->devacc_attr_dataorder == DDI_MERGING_OK_ACC)
		dhp->dh_hat_attr = HAT_MERGING_OK;
	else
		dhp->dh_hat_attr = HAT_STRICTORDER;
	dhp->dh_pcookie = cookie;
	dhp->dh_roff = ptob(btop(off));
	dhp->dh_len = ptob(btopr(len));

	/* Clear the large page size flag. */
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
	RELE_DHP_LOCK(dhp);
	return (DDI_SUCCESS);
}

/*
 * Directly (i.e., without occupying kernel virtual address space) allocate
 * 'npages' physical memory pages for exporting to user land.  The allocated
 * page_t pointers will be recorded in the cookie.
 */
int
devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
{
	u_offset_t	pmem_off = 0;
	page_t		*pp = NULL;
	page_t		*lpp = NULL;
	page_t		*tlist = NULL;
	pgcnt_t		i = 0;
	pgcnt_t		rpages = 0;
	pgcnt_t		lpages = 0;
	pgcnt_t		tpages = 0;
	pgcnt_t		npages = btopr(size);
	pmem_lpg_t	*plp = NULL;
	struct devmap_pmem_cookie	*pcp;
	uint_t		reserved = 0;
	uint_t		locked = 0;
	uint_t		pflags, kflags;

	*cookiep = NULL;

	/*
	 * A number larger than this will cause page_create_va() to loop
	 * infinitely.
	 */
	if (npages == 0 || npages >= total_pages / 2)
		return (DDI_FAILURE);
	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
		return (DDI_FAILURE);
	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;

	/* Allocate pmem cookie. */
	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
		return (DDI_FAILURE);
	pcp->dp_npages = npages;

	/*
	 * See if the requested memory can be locked.  Currently we do
	 * resource controls at the project level only.
	 */
	if (pmem_lock(npages, &(pcp->dp_projp)) == DDI_FAILURE)
		goto alloc_fail;
	locked = 1;

	/*
	 * First, grab as many pages as possible from pmem_mpool.  If the
	 * pages in pmem_mpool are enough for this request, we are done.
	 */
	mutex_enter(&pmem_mutex);
	tpages = mpool_break(&tlist, npages);
	/* IOlock and hashin them into the new offset. */
	if (tpages)
		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
	mutex_exit(&pmem_mutex);

	if (tpages == npages)
		goto done;

	rpages = npages - tpages;
	/* Quit now if memory cannot be reserved. */
	if (!page_resv(rpages, kflags))
		goto alloc_fail;
	reserved = 1;

	/* Try to allocate large pages first to decrease fragmentation. */
	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
	    kflags) == DDI_FAILURE)
		goto alloc_fail;
	ASSERT(lpages == 0 ? lpp == NULL : 1);

	/*
	 * If the large pages hold more pages than the request needs, put
	 * the residual pages into pmem_mpool.
	 */
	if (lpages >= rpages) {
		lpp_break(&lpp, lpages, lpages - rpages, plp);
		goto done;
	}

	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
	i = rpages - lpages;
	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
	    pflags, &pmem_seg, (caddr_t)pmem_off)) == NULL)
		goto alloc_fail;

done:
	page_list_concat(&tlist, &lpp);
	page_list_concat(&tlist, &pp);
	/* Set those small pages from large pages as allocated. */
	mutex_enter(&pmem_mutex);
	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
	mutex_exit(&pmem_mutex);

	/*
	 * Now tlist holds all the pages for this cookie.  Record these pages
	 * in the pmem cookie.
	 */
	for (pp = tlist, i = 0; i < npages; i++) {
		pcp->dp_pparray[i] = pp;
		page_io_unlock(pp);
		pp = pp->p_next;
		page_sub(&tlist, pp->p_prev);
	}
	ASSERT(tlist == NULL);
	*cookiep = (devmap_pmem_cookie_t)pcp;

	return (DDI_SUCCESS);

alloc_fail:
	DTRACE_PROBE(pmem__alloc__fail);
	/* Free large pages and the associated allocation records. */
	if (lpp)
		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
	if (reserved == 1)
		page_unresv(rpages);
	/* Put those pages in tlist back into pmem_mpool. */
	if (tpages != 0) {
		mutex_enter(&pmem_mutex);
		/* IOunlock, hashout and update the allocation records. */
		tlist_out(tlist, tpages);
		mpool_append(&tlist, tpages);
		mutex_exit(&pmem_mutex);
	}
	if (locked == 1)
		i_ddi_decr_locked_memory(NULL, NULL, pcp->dp_projp, NULL,
		    ptob(pcp->dp_npages));
	/* Free the pmem cookie. */
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
	return (DDI_FAILURE);
}

/*
 * Free all small pages inside the cookie, and return pages from large pages
 * into mpool; if all the pages from one large page are in mpool, free it as
 * a whole.
 */
void
devmap_pmem_free(devmap_pmem_cookie_t cookie)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t		i;
	pgcnt_t		tpages = 0;
	page_t		*pp;
	pmem_lpg_t	*pl1, *plp;
	pmem_lpg_t	*pf_lpgs = NULL;
	uint_t		npls = 0;
	pmem_lpg_t	*last_pl = NULL;
	pmem_lpg_t	*plast_pl = NULL;

	ASSERT(pcp);
	mutex_enter(&pmem_mutex);
	/* Free small pages and return them to the memory pool. */
	for (i = pcp->dp_npages; i > 0; i--) {
		pp = pcp->dp_pparray[i - 1];
		page_hashout(pp, NULL);
		/*
		 * Remove the mapping of this single page; this mapping was
		 * created using hat_devload() in segdev_faultpage().
		 */
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
		if (!FROM_LPG(pp)) {
			/* Normal small page. */
			page_free(pp, 1);
			page_unresv(1);
		} else {
			/* Small page from large pages. */
			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
			if (plp && !(plp->pl_pfree)) {
				/*
				 * Move this record to the pf_lpgs list; this
				 * large page may be able to be freed as a
				 * whole.
				 */
				pmem_lpg_sub(&pmem_occ_lpgs, plp);
				pmem_lpg_concat(&pf_lpgs, &plp);
				plp->pl_pfree = 1;
				npls++;
				last_pl = NULL;
			} else {
				/* Search in the pf_lpgs list. */
				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
			}
			ASSERT(plp);
			/* Mark this page as free. */
			BT_SET(plp->pl_bitmap, PFIND(pp));
			/* Record this page in pmem_mpool. */
			mpool_append(&pp, 1);
		}
	}

	/*
	 * Find the large pages whose pages have all been freed, remove them
	 * from the plp list, and free them along with the associated
	 * pmem_lpg structs.
	 */
	for (plp = pf_lpgs; npls != 0; npls--) {
		pl1 = plp;
		plp = plp->pl_next;
		if (lpg_isfree(pl1)) {
			/*
			 * Got one free large page.  Find all pages in this
			 * large page and remove them from pmem_mpool.
			 */
			lpg_free(pl1->pl_pp);
			/* Remove the associated allocation records. */
			pmem_lpg_sub(&pf_lpgs, pl1);
			pmem_lpg_free(&pf_lpgs, pl1);
			tpages -= pmem_pgcnt;
		} else
			pl1->pl_pfree = 0;
	}
	/* Update allocation records accordingly. */
	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
	mutex_exit(&pmem_mutex);

	i_ddi_decr_locked_memory(NULL, NULL, (kproject_t *)pcp->dp_projp, NULL,
	    ptob(pcp->dp_npages));
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
}
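
/*
 * Usage sketch for the PFN-extraction interface below (the
 * gart_write_entry() helper and the fixed batch size are hypothetical,
 * shown only to illustrate how a GART driver might consume a cookie):
 *
 *	pfn_t	pfns[16];
 *	uint_t	i;
 *
 *	if (devmap_pmem_getpfns(cookie, 0, 16, pfns) == DDI_SUCCESS) {
 *		for (i = 0; i < 16; i++)
 *			gart_write_entry(i, pfns[i]);
 *	}
 */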

/*
 * Extract page frame numbers from the specified range in a cookie.
 */
int
devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
    pfn_t *pfnarray)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;

	if (pcp == NULL || start + npages > pcp->dp_npages)
		return (DDI_FAILURE);

	for (i = start; i < start + npages; i++)
		pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum;
	return (DDI_SUCCESS);
}

void
pmem_init()
{
	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
	pmem_lpgsize = page_get_pagesize(pmem_lszc);
	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
	bzero(&pmem_seg, sizeof (struct seg));
	pmem_seg.s_as = &kas;
}

/* Allocate kernel memory for one pmem cookie with n pages. */
static int
pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
{
	struct devmap_pmem_cookie *pcp;

	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
	    kflags)) == NULL)
		return (DDI_FAILURE);
	pcp = *pcpp;
	if ((pcp->dp_vnp =
	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	if ((pcp->dp_pparray =
	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

/* Try to lock down n pages of resource for the current project. */
static int
pmem_lock(pgcnt_t n, kproject_t **prjpp)
{
	mutex_enter(&curproc->p_lock);
	if (i_ddi_incr_locked_memory(curproc, NULL, NULL, NULL,
	    ptob(n)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (DDI_FAILURE);
	}
	/* Store this project in the cookie for later lock/unlock. */
	*prjpp = curproc->p_task->tk_proj;
	mutex_exit(&curproc->p_lock);
	return (DDI_SUCCESS);
}

/* Check whether all the pages in a large page are freed. */
static int
lpg_isfree(pmem_lpg_t *plp)
{
	uint_t i;

	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
			return (0);
	/* All 1s means all pages are freed. */
	return (1);
}

/*
 * Use pp to find the associated large page allocation record, searching the
 * splp linked list with *last as the heuristic pointer.  Return NULL if not
 * found.
 */
static pmem_lpg_t *
pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
{
	pmem_lpg_t *plp;
	pgcnt_t root_pfn;

	ASSERT(pp);
	if (splp == NULL)
		return (NULL);
	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);

	/* Try the last winner first. */
	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
		goto pl_found;

	/* Else search the whole pmem_lpg list. */
	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
		plp = plp->pl_next;
		if (plp == splp) {
			plp = NULL;
			break;
		}
		ASSERT(plp->pl_pp);
	}

	*last = plp;

pl_found:
	return (*last);
}

/*
 * Remove one pmem_lpg plp from the oplpp list.
 */
static void
pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
{
	if (*oplpp == plp)
		*oplpp = plp->pl_next;		/* go to next pmem_lpg */

	if (*oplpp == plp)
		*oplpp = NULL;			/* pmem_lpg list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
}

/*
 * Concatenate page list nplpp onto the end of list plpp.
 */
static void
pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
{
	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;

	if (*nplpp == NULL) {
		return;
	}
	if (*plpp == NULL) {
		*plpp = *nplpp;
		return;
	}
	s1p = *plpp;
	e1p = s1p->pl_prev;
	s2p = *nplpp;
	e2p = s2p->pl_prev;
	s1p->pl_prev = e2p;
	e2p->pl_next = s1p;
	e1p->pl_next = s2p;
	s2p->pl_prev = e1p;
}

/*
 * Allocate and initialize the allocation record of one large page; the
 * initial value is 'allocated'.
 */
static pmem_lpg_t *
pmem_lpg_alloc(uint_t kflags)
{
	pmem_lpg_t *plp;

	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
	if (plp == NULL)
		return (NULL);
	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
	if (plp->pl_bitmap == NULL) {
		kmem_free(plp, sizeof (*plp));
		return (NULL);
	}
	plp->pl_next = plp->pl_prev = plp;
	return (plp);
}

/* Free the allocation record pointed to by plp. */
static void
pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
{
	if (*headp == plp)
		*headp = plp->pl_next;		/* go to next pmem_lpg_t */

	if (*headp == plp)
		*headp = NULL;			/* this list is gone */
	else {
		plp->pl_prev->pl_next = plp->pl_next;
		plp->pl_next->pl_prev = plp->pl_prev;
	}
	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
	kmem_free(plp, sizeof (*plp));
}

/* Free one large page headed by spp from pmem_mpool. */
static void
lpg_free(page_t *spp)
{
	page_t *pp1 = spp;
	uint_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (i = 0; i < pmem_pgcnt; i++) {
		/* Break pp1 from pmem_mpool. */
		page_sub(&pmem_mpool, pp1);
		pp1++;
	}
	/* Free pages in this large page. */
	page_free_pages(spp);
	page_unresv(pmem_pgcnt);
	pmem_nmpages -= pmem_pgcnt;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/* Put n pages in the *ppp list back into pmem_mpool. */
static void
mpool_append(page_t **ppp, pgcnt_t n)
{
	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Put back pages. */
	page_list_concat(&pmem_mpool, ppp);
	pmem_nmpages += n;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into the
 * *ppp list, and return the number of grabbed pages.
 */
static pgcnt_t
mpool_break(page_t **ppp, pgcnt_t n)
{
	pgcnt_t i;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	/* Grab the pages. */
	i = MIN(pmem_nmpages, n);
	*ppp = pmem_mpool;
	page_list_break(ppp, &pmem_mpool, i);
	pmem_nmpages -= i;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
	return (i);
}

/*
 * Create n large pages; lpages and plpp return the number of small pages
 * and the allocation records list, respectively.
 */
static int
lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
    vnode_t *vnp, u_offset_t *offp, uint_t kflags)
{
	pgcnt_t i;
	pmem_lpg_t *plp;
	page_t *pp;

	for (i = 0, *lpages = 0; i < n; i++) {
		/* Allocate one large page each time. */
		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
		    PG_EXCL, &pmem_seg, (caddr_t)*offp, NULL);
		if (pp == NULL)
			break;
		*offp += pmem_lpgsize;
		page_list_concat(lppp, &pp);
		*lpages += pmem_pgcnt;
		/* Add one allocation record for this large page. */
		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
			return (DDI_FAILURE);
		plp->pl_pp = pp;
		pmem_lpg_concat(plpp, &plp);
	}
	return (DDI_SUCCESS);
}

/*
 * Break the last r small pages from the large page list *lppp (with n small
 * pages in total) and put them into pmem_mpool.
 */
static void
lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
{
	page_t *pp, *pp1;
	pgcnt_t i;
	pmem_lpg_t *plp;

	if (r == 0)
		return;
	ASSERT(*lppp != NULL && r < pmem_pgcnt);
	page_list_break(lppp, &pp, n - r);

	/* The residual pages should reside in the last large page. */
	plp = oplp->pl_prev;
	/* IOunlock and hashout the residual pages. */
	for (pp1 = pp, i = 0; i < r; i++) {
		page_io_unlock(pp1);
		page_hashout(pp1, NULL);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp1));
		pp1 = pp1->p_next;
	}
	ASSERT(pp1 == pp);
	/* Put these residual pages into the memory pool. */
	mutex_enter(&pmem_mutex);
	mpool_append(&pp, r);
	mutex_exit(&pmem_mutex);
}

/* Free large pages in lpp and the associated allocation records in plp. */
static void
lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
{
	pgcnt_t i, j;
	page_t *pp = lpp, *pp1;
	pmem_lpg_t *plp1, *plp2;

	for (i = 0; i < lpgs; i++) {
		for (j = 0; j < pmem_pgcnt; j++) {
			/* IO unlock and hashout this small page. */
			page_io_unlock(pp);
			page_hashout(pp, NULL);
			pp1 = pp->p_next;
			pp->p_prev = pp->p_next = pp;
			pp = pp1;
		}
		/* Free one large page at a time. */
		page_free_pages(lpp);
		lpp = pp;
	}
	/* Free the associated pmem large page allocation records. */
	for (plp1 = *plpp; *plpp; plp1 = plp2) {
		plp2 = plp1->pl_next;
		pmem_lpg_free(plpp, plp1);
	}
}

/*
 * IOlock and hashin all pages in tlist, associating them with vnode *pvnp
 * and offsets starting at *poffp.  Update allocation records accordingly at
 * the same time.
 */
static void
tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_lock(pp);
		(void) page_hashin(pp, pvnp, *poffp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as allocated. */
		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
		*poffp += PAGESIZE;
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}

/*
 * IOunlock and hashout all pages in tlist, update allocation records
 * accordingly at the same time.
 */
static void
tlist_out(page_t *tlist, pgcnt_t tpages)
{
	page_t *pp;
	pgcnt_t i = 0;
	pmem_lpg_t *plp, *last_pl = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (pp = tlist; i < tpages; i++) {
		ASSERT(FROM_LPG(pp));
		page_io_unlock(pp);
		page_hashout(pp, NULL);
		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
		/* Mark this page as free. */
		BT_SET(plp->pl_bitmap, PFIND(pp));
		pp = pp->p_next;
	}
	ASSERT(pp == tlist);
}