xref: /linux/mm/highmem.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
1 /*
2  * High memory handling common code and variables.
3  *
4  * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
5  *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
6  *
7  *
8  * Redesigned the x86 32-bit VM architecture to deal with
9  * 64-bit physical space. With current x86 CPUs this
10  * means up to 64 Gigabytes physical RAM.
11  *
12  * Rewrote high memory support to move the page cache into
13  * high memory. Implemented permanent (schedulable) kmaps
14  * based on Linus' idea.
15  *
16  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17  */
18 
19 #include <linux/mm.h>
20 #include <linux/module.h>
21 #include <linux/swap.h>
22 #include <linux/bio.h>
23 #include <linux/pagemap.h>
24 #include <linux/mempool.h>
25 #include <linux/blkdev.h>
26 #include <linux/init.h>
27 #include <linux/hash.h>
28 #include <linux/highmem.h>
29 #include <linux/blktrace_api.h>
30 #include <asm/tlbflush.h>
31 
32 static mempool_t *page_pool, *isa_page_pool;
33 
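/*
 * Allocator for the ISA bounce pool: identical to mempool_alloc_pages(),
 * but forces GFP_DMA so the pages come from ZONE_DMA and are therefore
 * addressable by ISA devices.
 */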
34 static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
35 {
36 	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
37 }
38 
39 /*
40  * pkmap_count[] is not a pure reference count.
41  *  0 means that the slot is not mapped, and has not been mapped
42  *    since the last TLB flush - it is immediately usable.
43  *  1 means that there are no users, but it has been mapped
44  *    since the last TLB flush - so it cannot be reused yet.
45  *  n means that there are (n-1) current users of it.
46  */
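/*
 * Life cycle of a slot, as implemented below:
 *
 *	pkmap_count[i] == 0      unused since the last TLB flush
 *	kmap_high()              map_new_virtual() sets the count to 1,
 *	                         the caller's reference bumps it to 2
 *	kunmap_high()            drops it back to 1 (no users, stale TLB)
 *	flush_all_zero_pkmaps()  unmaps the slot and resets it to 0
 */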
47 #ifdef CONFIG_HIGHMEM
48 
49 static int pkmap_count[LAST_PKMAP];
50 static unsigned int last_pkmap_nr;
51 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
52 
53 pte_t *pkmap_page_table;
54 
55 static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
56 
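/*
 * Unmap every pkmap slot whose count has dropped to 1 (mapped, but no
 * users), reset its count to 0 and clear its page->virtual binding, then
 * flush the TLB once over the whole pkmap range.  Must be called with
 * kmap_lock held.
 */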
57 static void flush_all_zero_pkmaps(void)
58 {
59 	int i;
60 
61 	flush_cache_kmaps();
62 
63 	for (i = 0; i < LAST_PKMAP; i++) {
64 		struct page *page;
65 
66 		/*
67 		 * zero means we don't have anything to do,
68 		 * >1 means that it is still in use. Only
69 		 * a count of 1 means that it is free but
70 		 * needs to be unmapped
71 		 */
72 		if (pkmap_count[i] != 1)
73 			continue;
74 		pkmap_count[i] = 0;
75 
76 		/* sanity check */
77 		BUG_ON(pte_none(pkmap_page_table[i]));
78 
79 		/*
80 		 * Don't need an atomic fetch-and-clear op here;
81 		 * no-one has the page mapped, and cannot get at
82 		 * its virtual address (and hence PTE) without first
83 		 * getting the kmap_lock (which is held here).
84 		 * So no dangers, even with speculative execution.
85 		 */
86 		page = pte_page(pkmap_page_table[i]);
87 		pte_clear(&init_mm, (unsigned long)page_address(page),
88 			  &pkmap_page_table[i]);
89 
90 		set_page_address(page, NULL);
91 	}
92 	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
93 }
94 
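/*
 * Find an unused pkmap slot for @page, install its pte and record the
 * page->virtual binding.  May drop kmap_lock and sleep on pkmap_map_wait
 * until another mapper releases a slot.  Returns the new virtual address
 * with the slot's count set to 1; kmap_high() bumps it to 2 for the first
 * user.  Must be called with kmap_lock held.
 */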
95 static inline unsigned long map_new_virtual(struct page *page)
96 {
97 	unsigned long vaddr;
98 	int count;
99 
100 start:
101 	count = LAST_PKMAP;
102 	/* Find an empty entry */
103 	for (;;) {
104 		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
105 		if (!last_pkmap_nr) {
106 			flush_all_zero_pkmaps();
107 			count = LAST_PKMAP;
108 		}
109 		if (!pkmap_count[last_pkmap_nr])
110 			break;	/* Found a usable entry */
111 		if (--count)
112 			continue;
113 
114 		/*
115 		 * Sleep until somebody else unmaps one of their entries
116 		 */
117 		{
118 			DECLARE_WAITQUEUE(wait, current);
119 
120 			__set_current_state(TASK_UNINTERRUPTIBLE);
121 			add_wait_queue(&pkmap_map_wait, &wait);
122 			spin_unlock(&kmap_lock);
123 			schedule();
124 			remove_wait_queue(&pkmap_map_wait, &wait);
125 			spin_lock(&kmap_lock);
126 
127 			/* Somebody else might have mapped it while we slept */
128 			if (page_address(page))
129 				return (unsigned long)page_address(page);
130 
131 			/* Re-start */
132 			goto start;
133 		}
134 	}
135 	vaddr = PKMAP_ADDR(last_pkmap_nr);
136 	set_pte_at(&init_mm, vaddr,
137 		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
138 
139 	pkmap_count[last_pkmap_nr] = 1;
140 	set_page_address(page, (void *)vaddr);
141 
142 	return vaddr;
143 }
144 
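/*
 * kmap_high - map a highmem page into the permanent kmap area
 * @page: the page to map
 *
 * Returns the page's kernel virtual address, creating a new pkmap entry
 * if the page is not already mapped.  Each call takes a reference that
 * must be dropped with kunmap_high().  May sleep, so it must not be used
 * from interrupt context - use kmap_atomic() there instead.
 */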
145 void fastcall *kmap_high(struct page *page)
146 {
147 	unsigned long vaddr;
148 
149 	/*
150 	 * For highmem pages, we can't trust "virtual" until
151 	 * after we have the lock.
152 	 *
153 	 * We cannot call this from interrupts, as it may block
154 	 */
155 	spin_lock(&kmap_lock);
156 	vaddr = (unsigned long)page_address(page);
157 	if (!vaddr)
158 		vaddr = map_new_virtual(page);
159 	pkmap_count[PKMAP_NR(vaddr)]++;
160 	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
161 	spin_unlock(&kmap_lock);
162 	return (void *)vaddr;
163 }
164 
165 EXPORT_SYMBOL(kmap_high);
166 
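/*
 * kunmap_high - drop a reference on a permanently kmapped page
 * @page: the page to unmap
 *
 * The slot is not torn down here; once its count reaches 1 it is reclaimed
 * lazily by flush_all_zero_pkmaps().  If anyone is sleeping in
 * map_new_virtual() waiting for a free slot, wake them up.
 */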
167 void fastcall kunmap_high(struct page *page)
168 {
169 	unsigned long vaddr;
170 	unsigned long nr;
171 	int need_wakeup;
172 
173 	spin_lock(&kmap_lock);
174 	vaddr = (unsigned long)page_address(page);
175 	BUG_ON(!vaddr);
176 	nr = PKMAP_NR(vaddr);
177 
178 	/*
179 	 * A count must never go down to zero
180 	 * without a TLB flush!
181 	 */
182 	need_wakeup = 0;
183 	switch (--pkmap_count[nr]) {
184 	case 0:
185 		BUG();
186 	case 1:
187 		/*
188 		 * Avoid an unnecessary wake_up() function call.
189 		 * The common case is pkmap_count[] == 1 with
190 		 * no waiters.
191 		 * The tasks queued in the wait-queue are guarded
192 		 * by both the lock in the wait-queue-head and by
193 		 * the kmap_lock.  As the kmap_lock is held here,
194 		 * no need for the wait-queue-head's lock.  Simply
195 		 * test if the queue is empty.
196 		 */
197 		need_wakeup = waitqueue_active(&pkmap_map_wait);
198 	}
199 	spin_unlock(&kmap_lock);
200 
201 	/* do wake-up, if needed, race-free outside of the spin lock */
202 	if (need_wakeup)
203 		wake_up(&pkmap_map_wait);
204 }
205 
206 EXPORT_SYMBOL(kunmap_high);
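
/*
 * For illustration, callers normally reach kmap_high()/kunmap_high()
 * through the kmap()/kunmap() wrappers pulled in via <linux/highmem.h>,
 * roughly:
 *
 *	void *vaddr = kmap(page);	(may sleep; calls kmap_high() only
 *					 if the page is highmem)
 *	memcpy(vaddr, buf, len);
 *	kunmap(page);
 *
 * buf and len above are just placeholders for the caller's own data.
 */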
207 
208 #define POOL_SIZE	64
209 
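/*
 * Set up the bounce pool used to shuttle data between highmem pages and
 * pages a device can actually address.  Skipped entirely on machines
 * without highmem.
 */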
210 static __init int init_emergency_pool(void)
211 {
212 	struct sysinfo i;
213 	si_meminfo(&i);
214 	si_swapinfo(&i);
215 
216 	if (!i.totalhigh)
217 		return 0;
218 
219 	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
220 	BUG_ON(!page_pool);
221 	printk(KERN_INFO "highmem bounce pool size: %d pages\n", POOL_SIZE);
222 
223 	return 0;
224 }
225 
226 __initcall(init_emergency_pool);
227 
228 /*
229  * highmem version: kmap the destination bio_vec and copy into it
230  */
231 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
232 {
233 	unsigned long flags;
234 	unsigned char *vto;
235 
236 	local_irq_save(flags);
237 	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
238 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
239 	kunmap_atomic(vto, KM_BOUNCE_READ);
240 	local_irq_restore(flags);
241 }
242 
243 #else /* CONFIG_HIGHMEM */
244 
245 #define bounce_copy_vec(to, vfrom)	\
246 	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
247 
248 #endif
249 
250 #define ISA_POOL_SIZE	16
251 
252 /*
253  * Gets called every time someone initializes a queue with BLK_BOUNCE_ISA
254  * as the max address, so check whether the pool has already been created.
255  */
256 int init_emergency_isa_pool(void)
257 {
258 	if (isa_page_pool)
259 		return 0;
260 
261 	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
262 				       mempool_free_pages, (void *) 0);
263 	BUG_ON(!isa_page_pool);
264 
265 	printk(KERN_INFO "isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
266 	return 0;
267 }
268 
269 /*
270  * Simple bounce buffer support for highmem pages. Depending on the
271  * queue gfp mask set, *to may or may not be a highmem page. Always
272  * kmap it; that does the right thing in either case.
273  */
274 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
275 {
276 	unsigned char *vfrom;
277 	struct bio_vec *tovec, *fromvec;
278 	int i;
279 
280 	__bio_for_each_segment(tovec, to, i, 0) {
281 		fromvec = from->bi_io_vec + i;
282 
283 		/*
284 		 * not bounced
285 		 */
286 		if (tovec->bv_page == fromvec->bv_page)
287 			continue;
288 
289 		/*
290 		 * fromvec->bv_offset and fromvec->bv_len might have been
291 		 * modified by the block layer, so take the offset and length from
292 		 * the original vec (tovec); bounce_copy_vec already uses tovec->bv_len.
293 		 */
294 		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
295 
296 		flush_dcache_page(tovec->bv_page);
297 		bounce_copy_vec(tovec, vfrom);
298 	}
299 }
300 
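/*
 * Common completion for a bounced bio: propagate the BIO_EOPNOTSUPP flag
 * to the original bio, return every page that was actually bounced to
 * @pool, then complete the original bio and drop the bounce bio.
 */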
301 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
302 {
303 	struct bio *bio_orig = bio->bi_private;
304 	struct bio_vec *bvec, *org_vec;
305 	int i;
306 
307 	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
308 		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
309 
310 	/*
311 	 * free up the bounce pages that were used for the copy
312 	 */
313 	__bio_for_each_segment(bvec, bio, i, 0) {
314 		org_vec = bio_orig->bi_io_vec + i;
315 		if (bvec->bv_page == org_vec->bv_page)
316 			continue;
317 
318 		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
319 		mempool_free(bvec->bv_page, pool);
320 	}
321 
322 	bio_endio(bio_orig, bio_orig->bi_size, err);
323 	bio_put(bio);
324 }
325 
326 static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
327 {
328 	if (bio->bi_size)
329 		return 1;
330 
331 	bounce_end_io(bio, page_pool, err);
332 	return 0;
333 }
334 
335 static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
336 {
337 	if (bio->bi_size)
338 		return 1;
339 
340 	bounce_end_io(bio, isa_page_pool, err);
341 	return 0;
342 }
343 
344 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
345 {
346 	struct bio *bio_orig = bio->bi_private;
347 
348 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
349 		copy_to_high_bio_irq(bio_orig, bio);
350 
351 	bounce_end_io(bio, pool, err);
352 }
353 
354 static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
355 {
356 	if (bio->bi_size)
357 		return 1;
358 
359 	__bounce_end_io_read(bio, page_pool, err);
360 	return 0;
361 }
362 
363 static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
364 {
365 	if (bio->bi_size)
366 		return 1;
367 
368 	__bounce_end_io_read(bio, isa_page_pool, err);
369 	return 0;
370 }
371 
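/*
 * Walk the original bio and replace every segment whose page sits above
 * q->bounce_pfn with a page from @pool, copying the data immediately for
 * writes.  If nothing needed bouncing, *bio_orig is left untouched.
 * Otherwise the remaining fields are cloned, bi_end_io is pointed at the
 * matching bounce completion (which copies reads back and frees the
 * bounce pages), and the bounce bio is swapped into *bio_orig.
 */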
372 static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
373 			       mempool_t *pool)
374 {
375 	struct page *page;
376 	struct bio *bio = NULL;
377 	int i, rw = bio_data_dir(*bio_orig);
378 	struct bio_vec *to, *from;
379 
380 	bio_for_each_segment(from, *bio_orig, i) {
381 		page = from->bv_page;
382 
383 		/*
384 		 * is destination page below bounce pfn?
385 		 */
386 		if (page_to_pfn(page) < q->bounce_pfn)
387 			continue;
388 
389 		/*
390 		 * irk, bounce it
391 		 */
392 		if (!bio)
393 			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
394 
395 		to = bio->bi_io_vec + i;
396 
397 		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
398 		to->bv_len = from->bv_len;
399 		to->bv_offset = from->bv_offset;
400 		inc_zone_page_state(to->bv_page, NR_BOUNCE);
401 
402 		if (rw == WRITE) {
403 			char *vto, *vfrom;
404 
405 			flush_dcache_page(from->bv_page);
406 			vto = page_address(to->bv_page) + to->bv_offset;
407 			vfrom = kmap(from->bv_page) + from->bv_offset;
408 			memcpy(vto, vfrom, to->bv_len);
409 			kunmap(from->bv_page);
410 		}
411 	}
412 
413 	/*
414 	 * no pages bounced
415 	 */
416 	if (!bio)
417 		return;
418 
419 	/*
420 	 * at least one page was bounced; fill in the segments that did
421 	 * not need bouncing
422 	 */
423 	__bio_for_each_segment(from, *bio_orig, i, 0) {
424 		to = bio_iovec_idx(bio, i);
425 		if (!to->bv_page) {
426 			to->bv_page = from->bv_page;
427 			to->bv_len = from->bv_len;
428 			to->bv_offset = from->bv_offset;
429 		}
430 	}
431 
432 	bio->bi_bdev = (*bio_orig)->bi_bdev;
433 	bio->bi_flags |= (1 << BIO_BOUNCED);
434 	bio->bi_sector = (*bio_orig)->bi_sector;
435 	bio->bi_rw = (*bio_orig)->bi_rw;
436 
437 	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
438 	bio->bi_idx = (*bio_orig)->bi_idx;
439 	bio->bi_size = (*bio_orig)->bi_size;
440 
441 	if (pool == page_pool) {
442 		bio->bi_end_io = bounce_end_io_write;
443 		if (rw == READ)
444 			bio->bi_end_io = bounce_end_io_read;
445 	} else {
446 		bio->bi_end_io = bounce_end_io_write_isa;
447 		if (rw == READ)
448 			bio->bi_end_io = bounce_end_io_read_isa;
449 	}
450 
451 	bio->bi_private = *bio_orig;
452 	*bio_orig = bio;
453 }
454 
455 void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
456 {
457 	mempool_t *pool;
458 
459 	/*
460 	 * For the non-ISA bounce case, just check whether the bounce pfn is
461 	 * greater than or equal to the highest pfn in the system; if so,
462 	 * don't waste time iterating over bio segments.
463 	 */
464 	if (!(q->bounce_gfp & GFP_DMA)) {
465 		if (q->bounce_pfn >= blk_max_pfn)
466 			return;
467 		pool = page_pool;
468 	} else {
469 		BUG_ON(!isa_page_pool);
470 		pool = isa_page_pool;
471 	}
472 
473 	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
474 
475 	/*
476 	 * slow path
477 	 */
478 	__blk_queue_bounce(q, bio_orig, pool);
479 }
480 
481 EXPORT_SYMBOL(blk_queue_bounce);
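
/*
 * For illustration, the block layer calls this from its make_request path
 * before a bio is handed to the driver, roughly:
 *
 *	blk_queue_bounce(q, &bio);
 *
 * Drivers themselves only pick the limit, e.g. via
 * blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH) or BLK_BOUNCE_ISA.
 */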
482 
483 #if defined(HASHED_PAGE_VIRTUAL)
484 
485 #define PA_HASH_ORDER	7
486 
487 /*
488  * Describes one page->virtual association
489  */
490 struct page_address_map {
491 	struct page *page;
492 	void *virtual;
493 	struct list_head list;
494 };
495 
496 /*
497  * page_address_map freelist, allocated from page_address_maps.
498  */
499 static struct list_head page_address_pool;	/* freelist */
500 static spinlock_t pool_lock;			/* protects page_address_pool */
501 
502 /*
503  * Hash table bucket
504  */
505 static struct page_address_slot {
506 	struct list_head lh;			/* List of page_address_maps */
507 	spinlock_t lock;			/* Protect this bucket's list */
508 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
509 
510 static struct page_address_slot *page_slot(struct page *page)
511 {
512 	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
513 }
514 
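/*
 * page_address - return the kernel virtual address of a page
 * @page: the page in question
 *
 * Lowmem pages translate directly; highmem pages are looked up in
 * page_address_htable, and NULL is returned if the page is not
 * currently kmapped.
 */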
515 void *page_address(struct page *page)
516 {
517 	unsigned long flags;
518 	void *ret;
519 	struct page_address_slot *pas;
520 
521 	if (!PageHighMem(page))
522 		return lowmem_page_address(page);
523 
524 	pas = page_slot(page);
525 	ret = NULL;
526 	spin_lock_irqsave(&pas->lock, flags);
527 	if (!list_empty(&pas->lh)) {
528 		struct page_address_map *pam;
529 
530 		list_for_each_entry(pam, &pas->lh, list) {
531 			if (pam->page == page) {
532 				ret = pam->virtual;
533 				goto done;
534 			}
535 		}
536 	}
537 done:
538 	spin_unlock_irqrestore(&pas->lock, flags);
539 	return ret;
540 }
541 
542 EXPORT_SYMBOL(page_address);
543 
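/*
 * set_page_address - record or clear a highmem page's virtual address
 * @page: the page in question
 * @virtual: the new kernel virtual address, or NULL to remove the binding
 *
 * Hash-table entries come from page_address_pool, which is pre-filled
 * with LAST_PKMAP entries - one per pkmap slot - so an add can never run
 * the pool dry.
 */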
544 void set_page_address(struct page *page, void *virtual)
545 {
546 	unsigned long flags;
547 	struct page_address_slot *pas;
548 	struct page_address_map *pam;
549 
550 	BUG_ON(!PageHighMem(page));
551 
552 	pas = page_slot(page);
553 	if (virtual) {		/* Add */
554 		BUG_ON(list_empty(&page_address_pool));
555 
556 		spin_lock_irqsave(&pool_lock, flags);
557 		pam = list_entry(page_address_pool.next,
558 				struct page_address_map, list);
559 		list_del(&pam->list);
560 		spin_unlock_irqrestore(&pool_lock, flags);
561 
562 		pam->page = page;
563 		pam->virtual = virtual;
564 
565 		spin_lock_irqsave(&pas->lock, flags);
566 		list_add_tail(&pam->list, &pas->lh);
567 		spin_unlock_irqrestore(&pas->lock, flags);
568 	} else {		/* Remove */
569 		spin_lock_irqsave(&pas->lock, flags);
570 		list_for_each_entry(pam, &pas->lh, list) {
571 			if (pam->page == page) {
572 				list_del(&pam->list);
573 				spin_unlock_irqrestore(&pas->lock, flags);
574 				spin_lock_irqsave(&pool_lock, flags);
575 				list_add_tail(&pam->list, &page_address_pool);
576 				spin_unlock_irqrestore(&pool_lock, flags);
577 				goto done;
578 			}
579 		}
580 		spin_unlock_irqrestore(&pas->lock, flags);
581 	}
582 done:
583 	return;
584 }
585 
586 static struct page_address_map page_address_maps[LAST_PKMAP];
587 
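/*
 * Build the free list of page_address_map entries and initialize every
 * hash bucket and its lock.  Runs once, early at boot, before the first
 * kmap of a highmem page.
 */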
588 void __init page_address_init(void)
589 {
590 	int i;
591 
592 	INIT_LIST_HEAD(&page_address_pool);
593 	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
594 		list_add(&page_address_maps[i].list, &page_address_pool);
595 	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
596 		INIT_LIST_HEAD(&page_address_htable[i].lh);
597 		spin_lock_init(&page_address_htable[i].lock);
598 	}
599 	spin_lock_init(&pool_lock);
600 }
601 
602 #endif	/* defined(HASHED_PAGE_VIRTUAL) */
603