/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>

static mempool_t *page_pool, *isa_page_pool;

static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}

/*
 * Virtual_count is not a pure "count".
 *  0 means that it is not mapped, and has not been mapped
 *    since a TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
#ifdef CONFIG_HIGHMEM

static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);

static void flush_all_zero_pkmaps(void)
{
	int i;

	flush_cache_kmaps();

	for (i = 0; i < LAST_PKMAP; i++) {
		struct page *page;

		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;
		pkmap_count[i] = 0;

		/* sanity check */
		BUG_ON(pte_none(pkmap_page_table[i]));

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&init_mm, (unsigned long)page_address(page),
			  &pkmap_page_table[i]);

		set_page_address(page, NULL);
	}
	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}

static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;

start:
	count = LAST_PKMAP;
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();
			count = LAST_PKMAP;
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		/*
		 * Sleep for somebody else to unmap their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);

			__set_current_state(TASK_UNINTERRUPTIBLE);
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
			if (page_address(page))
				return (unsigned long)page_address(page);

			/* Re-start */
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte_at(&init_mm, vaddr,
		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	set_page_address(page, (void *)vaddr);

	return vaddr;
}

void fastcall *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	if (!vaddr)
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
	spin_unlock(&kmap_lock);
	return (void*) vaddr;
}

EXPORT_SYMBOL(kmap_high);

void fastcall kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	BUG_ON(!vaddr);
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock.  As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock.  Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}

EXPORT_SYMBOL(kunmap_high);
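
/*
 * Illustrative sketch (hypothetical helper, not part of this file): callers
 * normally reach kmap_high()/kunmap_high() through kmap()/kunmap(), which
 * short-circuit for lowmem pages.  kmap() may sleep in map_new_virtual(),
 * so this pattern is only valid in process context.
 */
static inline void example_zero_highmem_page(struct page *page)
{
	char *vaddr = kmap(page);	/* bumps pkmap_count, may block */

	memset(vaddr, 0, PAGE_SIZE);
	kunmap(page);			/* drops the count; a later flush reclaims the slot */
}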

#define POOL_SIZE	64

static __init int init_emergency_pool(void)
{
	struct sysinfo i;
	si_meminfo(&i);
	si_swapinfo(&i);

	if (!i.totalhigh)
		return 0;

	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
	BUG_ON(!page_pool);
	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);

	return 0;
}

__initcall(init_emergency_pool);

/*
 * highmem version, map in to vec
 */
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
	unsigned long flags;
	unsigned char *vto;

	local_irq_save(flags);
	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
	kunmap_atomic(vto, KM_BOUNCE_READ);
	local_irq_restore(flags);
}

#else /* CONFIG_HIGHMEM */

#define bounce_copy_vec(to, vfrom)	\
	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)

#endif

#define ISA_POOL_SIZE	16

/*
 * Gets called every time someone initializes a queue with BLK_BOUNCE_ISA
 * as the max address, so check whether the pool has already been created.
 */
int init_emergency_isa_pool(void)
{
	if (isa_page_pool)
		return 0;

	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
				       mempool_free_pages, (void *) 0);
	BUG_ON(!isa_page_pool);

	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
	return 0;
}
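
/*
 * Illustrative sketch (hypothetical driver code, not part of this file): a
 * block driver normally does not call init_emergency_isa_pool() directly;
 * it declares its DMA reach via blk_queue_bounce_limit(), and the block
 * layer sets up the ISA pool and q->bounce_pfn/q->bounce_gfp, after which
 * blk_queue_bounce() below kicks in for pages above the limit.
 */
static inline void example_limit_queue_to_isa(request_queue_t *q)
{
	blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);	/* DMA only below 16MB */
}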

/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 */
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
	unsigned char *vfrom;
	struct bio_vec *tovec, *fromvec;
	int i;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = from->bi_io_vec + i;

		/*
		 * not bounced
		 */
		if (tovec->bv_page == fromvec->bv_page)
			continue;

		/*
		 * fromvec->bv_offset and fromvec->bv_len might have been
		 * modified by the block layer, so use the original copy,
		 * bounce_copy_vec already uses tovec->bv_len
		 */
		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;

		flush_dcache_page(tovec->bv_page);
		bounce_copy_vec(tovec, vfrom);
	}
}

static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;
	struct bio_vec *bvec, *org_vec;
	int i;

	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = bio_orig->bi_io_vec + i;
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
		mempool_free(bvec->bv_page, pool);
	}

	bio_endio(bio_orig, bio_orig->bi_size, err);
	bio_put(bio);
}

static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bounce_end_io(bio, page_pool, err);
	return 0;
}

static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bounce_end_io(bio, isa_page_pool, err);
	return 0;
}

static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	bounce_end_io(bio, pool, err);
}

static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	__bounce_end_io_read(bio, page_pool, err);
	return 0;
}

static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	__bounce_end_io_read(bio, isa_page_pool, err);
	return 0;
}

static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
			       mempool_t *pool)
{
	struct page *page;
	struct bio *bio = NULL;
	int i, rw = bio_data_dir(*bio_orig);
	struct bio_vec *to, *from;

	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;

		/*
		 * is destination page below bounce pfn?
		 */
		if (page_to_pfn(page) < q->bounce_pfn)
			continue;

		/*
		 * irk, bounce it
		 */
		if (!bio)
			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);

		to = bio->bi_io_vec + i;

		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;
		inc_zone_page_state(to->bv_page, NR_BOUNCE);

		if (rw == WRITE) {
			char *vto, *vfrom;

			flush_dcache_page(from->bv_page);
			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
		}
	}

	/*
	 * no pages bounced
	 */
	if (!bio)
		return;

	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	__bio_for_each_segment(from, *bio_orig, i, 0) {
		to = bio_iovec_idx(bio, i);
		if (!to->bv_page) {
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
		}
	}

	bio->bi_bdev = (*bio_orig)->bi_bdev;
	bio->bi_flags |= (1 << BIO_BOUNCED);
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;

	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_idx = (*bio_orig)->bi_idx;
	bio->bi_size = (*bio_orig)->bi_size;

	if (pool == page_pool) {
		bio->bi_end_io = bounce_end_io_write;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read;
	} else {
		bio->bi_end_io = bounce_end_io_write_isa;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read_isa;
	}

	bio->bi_private = *bio_orig;
	*bio_orig = bio;
}

void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
{
	mempool_t *pool;

	/*
	 * for non-isa bounce case, just check if the bounce pfn is equal
	 * to or bigger than the highest pfn in the system -- in that case,
	 * don't waste time iterating over bio segments
	 */
	if (!(q->bounce_gfp & GFP_DMA)) {
		if (q->bounce_pfn >= blk_max_pfn)
			return;
		pool = page_pool;
	} else {
		BUG_ON(!isa_page_pool);
		pool = isa_page_pool;
	}

	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);

	/*
	 * slow path
	 */
	__blk_queue_bounce(q, bio_orig, pool);
}

EXPORT_SYMBOL(blk_queue_bounce);

#if defined(HASHED_PAGE_VIRTUAL)

#define PA_HASH_ORDER	7

/*
 * Describes one page->virtual association
 */
struct page_address_map {
	struct page *page;
	void *virtual;
	struct list_head list;
};
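
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the hash
 * table below backs page_address(), which kmap_high() uses as its fast path.
 * Without holding kmap_lock the answer is only a snapshot.
 */
static inline int example_page_has_kernel_mapping(struct page *page)
{
	return page_address(page) != NULL;
}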

/*
 * page_address_map freelist, allocated from page_address_maps.
 */
static struct list_head page_address_pool;	/* freelist */
static spinlock_t pool_lock;			/* protects page_address_pool */

/*
 * Hash table bucket
 */
static struct page_address_slot {
	struct list_head lh;			/* List of page_address_maps */
	spinlock_t lock;			/* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];

static struct page_address_slot *page_slot(struct page *page)
{
	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}

void *page_address(struct page *page)
{
	unsigned long flags;
	void *ret;
	struct page_address_slot *pas;

	if (!PageHighMem(page))
		return lowmem_page_address(page);

	pas = page_slot(page);
	ret = NULL;
	spin_lock_irqsave(&pas->lock, flags);
	if (!list_empty(&pas->lh)) {
		struct page_address_map *pam;

		list_for_each_entry(pam, &pas->lh, list) {
			if (pam->page == page) {
				ret = pam->virtual;
				goto done;
			}
		}
	}
done:
	spin_unlock_irqrestore(&pas->lock, flags);
	return ret;
}

EXPORT_SYMBOL(page_address);

void set_page_address(struct page *page, void *virtual)
{
	unsigned long flags;
	struct page_address_slot *pas;
	struct page_address_map *pam;

	BUG_ON(!PageHighMem(page));

	pas = page_slot(page);
	if (virtual) {		/* Add */
		BUG_ON(list_empty(&page_address_pool));

		spin_lock_irqsave(&pool_lock, flags);
		pam = list_entry(page_address_pool.next,
				 struct page_address_map, list);
		list_del(&pam->list);
		spin_unlock_irqrestore(&pool_lock, flags);

		pam->page = page;
		pam->virtual = virtual;

		spin_lock_irqsave(&pas->lock, flags);
		list_add_tail(&pam->list, &pas->lh);
		spin_unlock_irqrestore(&pas->lock, flags);
	} else {		/* Remove */
		spin_lock_irqsave(&pas->lock, flags);
		list_for_each_entry(pam, &pas->lh, list) {
			if (pam->page == page) {
				list_del(&pam->list);
				spin_unlock_irqrestore(&pas->lock, flags);
				spin_lock_irqsave(&pool_lock, flags);
				list_add_tail(&pam->list, &page_address_pool);
				spin_unlock_irqrestore(&pool_lock, flags);
				goto done;
			}
		}
		spin_unlock_irqrestore(&pas->lock, flags);
	}
done:
	return;
}

static struct page_address_map page_address_maps[LAST_PKMAP];

void __init page_address_init(void)
{
	int i;

	INIT_LIST_HEAD(&page_address_pool);
	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
		list_add(&page_address_maps[i].list, &page_address_pool);
	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
		INIT_LIST_HEAD(&page_address_htable[i].lh);
		spin_lock_init(&page_address_htable[i].lock);
	}
	spin_lock_init(&pool_lock);
}

#endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
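
/*
 * Illustrative sketch (hypothetical helper, not part of this file): code that
 * cannot sleep must not use the schedulable kmap_high() path; the per-CPU
 * kmap_atomic() slots are the non-blocking alternative, as bounce_copy_vec()
 * does with KM_BOUNCE_READ above.
 */
static inline void example_clear_page_atomic(struct page *page)
{
	void *vaddr = kmap_atomic(page, KM_USER0);	/* never sleeps */

	memset(vaddr, 0, PAGE_SIZE);
	kunmap_atomic(vaddr, KM_USER0);
}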