xref: /linux/mm/shmem.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *		 2000 Transmeta Corp.
6  *		 2000-2001 Christoph Rohland
7  *		 2000-2001 SAP AG
8  *		 2002 Red Hat Inc.
9  * Copyright (C) 2002-2005 Hugh Dickins.
10  * Copyright (C) 2002-2005 VERITAS Software Corporation.
11  * Copyright (C) 2004 Andi Kleen, SuSE Labs
12  *
13  * Extended attribute support for tmpfs:
14  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16  *
17  * This file is released under the GPL.
18  */
19 
20 /*
21  * This virtual memory filesystem is heavily based on the ramfs. It
22  * extends ramfs by the ability to use swap and honor resource limits
23  * which makes it a completely usable filesystem.
24  */
25 
26 #include <linux/config.h>
27 #include <linux/module.h>
28 #include <linux/init.h>
29 #include <linux/devfs_fs_kernel.h>
30 #include <linux/fs.h>
31 #include <linux/mm.h>
32 #include <linux/mman.h>
33 #include <linux/file.h>
34 #include <linux/swap.h>
35 #include <linux/pagemap.h>
36 #include <linux/string.h>
37 #include <linux/slab.h>
38 #include <linux/backing-dev.h>
39 #include <linux/shmem_fs.h>
40 #include <linux/mount.h>
41 #include <linux/writeback.h>
42 #include <linux/vfs.h>
43 #include <linux/blkdev.h>
44 #include <linux/security.h>
45 #include <linux/swapops.h>
46 #include <linux/mempolicy.h>
47 #include <linux/namei.h>
48 #include <linux/ctype.h>
49 #include <linux/migrate.h>
50 
51 #include <asm/uaccess.h>
52 #include <asm/div64.h>
53 #include <asm/pgtable.h>
54 
55 /* This magic number is used in glibc for posix shared memory */
56 #define TMPFS_MAGIC	0x01021994
57 
58 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
59 #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
60 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
61 
62 #define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
63 #define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
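/*
 * Rough scale of the limits above (an illustrative calculation, taking
 * SHMEM_NR_DIRECT as 16 and 4K pages): with a 4-byte unsigned long,
 * ENTRIES_PER_PAGE is 1024, so SHMEM_MAX_INDEX is
 * 16 + (1024*1024/2)*1025, about 537 million pages, giving
 * SHMEM_MAX_BYTES of roughly 2TB; with an 8-byte unsigned long,
 * ENTRIES_PER_PAGE is 512 and SHMEM_MAX_BYTES comes to roughly 256GB.
 */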
64 
65 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
66 
67 /* info->flags needs VM_flags to handle pagein/truncate races efficiently */
68 #define SHMEM_PAGEIN	 VM_READ
69 #define SHMEM_TRUNCATE	 VM_WRITE
70 
71 /* Definition to limit shmem_truncate's steps between cond_rescheds */
72 #define LATENCY_LIMIT	 64
73 
74 /* Pretend that each entry is of this size in directory's i_size */
75 #define BOGO_DIRENT_SIZE 20
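/*
 * Illustration, derived from the uses further down: a new directory is
 * created with i_size = 2 * BOGO_DIRENT_SIZE = 40, and each mknod, link
 * or symlink adds another BOGO_DIRENT_SIZE to the parent's i_size, so
 * that stat() on a tmpfs directory reports a plausible nonzero size.
 */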
76 
77 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
78 enum sgp_type {
79 	SGP_QUICK,	/* don't try more than file page cache lookup */
80 	SGP_READ,	/* don't exceed i_size, don't allocate page */
81 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
82 	SGP_WRITE,	/* may exceed i_size, may allocate page */
83 };
84 
85 static int shmem_getpage(struct inode *inode, unsigned long idx,
86 			 struct page **pagep, enum sgp_type sgp, int *type);
87 
88 static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
89 {
90 	/*
91 	 * The above definition of ENTRIES_PER_PAGE, and the use of
92 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
93 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
94 	 */
95 	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
96 }
97 
98 static inline void shmem_dir_free(struct page *page)
99 {
100 	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
101 }
102 
103 static struct page **shmem_dir_map(struct page *page)
104 {
105 	return (struct page **)kmap_atomic(page, KM_USER0);
106 }
107 
108 static inline void shmem_dir_unmap(struct page **dir)
109 {
110 	kunmap_atomic(dir, KM_USER0);
111 }
112 
113 static swp_entry_t *shmem_swp_map(struct page *page)
114 {
115 	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
116 }
117 
118 static inline void shmem_swp_balance_unmap(void)
119 {
120 	/*
121 	 * When passing a pointer to an i_direct entry, to code which
122 	 * also handles indirect entries and so will shmem_swp_unmap,
123 	 * we must arrange for the preempt count to remain in balance.
124 	 * What kmap_atomic of a lowmem page does depends on config
125 	 * and architecture, so pretend to kmap_atomic some lowmem page.
126 	 */
127 	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
128 }
129 
130 static inline void shmem_swp_unmap(swp_entry_t *entry)
131 {
132 	kunmap_atomic(entry, KM_USER1);
133 }
134 
135 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
136 {
137 	return sb->s_fs_info;
138 }
139 
140 /*
141  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
142  * for shared memory and for shared anonymous (/dev/zero) mappings
143  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
144  * consistent with the pre-accounting of private mappings ...
145  */
146 static inline int shmem_acct_size(unsigned long flags, loff_t size)
147 {
148 	return (flags & VM_ACCOUNT)?
149 		security_vm_enough_memory(VM_ACCT(size)): 0;
150 }
151 
152 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
153 {
154 	if (flags & VM_ACCOUNT)
155 		vm_unacct_memory(VM_ACCT(size));
156 }
157 
158 /*
159  * ... whereas tmpfs objects are accounted incrementally as
160  * pages are allocated, in order to allow huge sparse files.
161  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
162  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
163  */
164 static inline int shmem_acct_block(unsigned long flags)
165 {
166 	return (flags & VM_ACCOUNT)?
167 		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
168 }
169 
170 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
171 {
172 	if (!(flags & VM_ACCOUNT))
173 		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
174 }
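/*
 * Worked example of the two accounting paths (a sketch, assuming 4K
 * pages): a 1MB shared memory object has VM_ACCOUNT set, so
 * shmem_acct_size charges VM_ACCT(1MB) = 256 pages up front and
 * shmem_acct_block then charges nothing; a 1MB tmpfs file has
 * VM_ACCOUNT clear, so nothing is charged at setup and each page is
 * charged singly as it is allocated -- a sparse file with one page
 * written is only ever charged one page.
 */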
175 
176 static struct super_operations shmem_ops;
177 static struct address_space_operations shmem_aops;
178 static struct file_operations shmem_file_operations;
179 static struct inode_operations shmem_inode_operations;
180 static struct inode_operations shmem_dir_inode_operations;
181 static struct vm_operations_struct shmem_vm_ops;
182 
183 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
184 	.ra_pages	= 0,	/* No readahead */
185 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
186 	.unplug_io_fn	= default_unplug_io_fn,
187 };
188 
189 static LIST_HEAD(shmem_swaplist);
190 static DEFINE_SPINLOCK(shmem_swaplist_lock);
191 
192 static void shmem_free_blocks(struct inode *inode, long pages)
193 {
194 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
195 	if (sbinfo->max_blocks) {
196 		spin_lock(&sbinfo->stat_lock);
197 		sbinfo->free_blocks += pages;
198 		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
199 		spin_unlock(&sbinfo->stat_lock);
200 	}
201 }
202 
203 /*
204  * shmem_recalc_inode - recalculate the size of an inode
205  *
206  * @inode: inode to recalc
207  *
208  * We have to calculate the free blocks since the mm can drop
209  * undirtied hole pages behind our back.
210  *
211  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
212  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
213  *
214  * It has to be called with the spinlock held.
215  */
216 static void shmem_recalc_inode(struct inode *inode)
217 {
218 	struct shmem_inode_info *info = SHMEM_I(inode);
219 	long freed;
220 
221 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
222 	if (freed > 0) {
223 		info->alloced -= freed;
224 		shmem_unacct_blocks(info->flags, freed);
225 		shmem_free_blocks(inode, freed);
226 	}
227 }
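/*
 * Numeric sketch of the recalculation above: if info->alloced is 8 but
 * the mapping now holds nrpages == 5 with info->swapped == 2, the mm
 * has dropped 8 - (5 + 2) = 1 undirtied hole page behind our back, so
 * one block is credited back to the superblock and to VM accounting.
 */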
228 
229 /*
230  * shmem_swp_entry - find the swap vector position in the info structure
231  *
232  * @info:  info structure for the inode
233  * @index: index of the page to find
234  * @page:  optional page to add to the structure. Has to be preset to
235  *         all zeros
236  *
237  * If no space is allocated yet, it returns NULL when page is NULL;
238  * otherwise it uses the page for the needed block, setting it to
239  * NULL on return to indicate that it has been consumed.
240  *
241  * The swap vector is organized the following way:
242  *
243  * There are SHMEM_NR_DIRECT entries directly stored in the
244  * shmem_inode_info structure. So small files do not need an additional
245  * allocation.
246  *
247  * For pages with index > SHMEM_NR_DIRECT there is the pointer
248  * i_indirect which points to a page which holds in the first half
249  * doubly indirect blocks, in the second half triple indirect blocks:
250  *
251  * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
252  * following layout (for SHMEM_NR_DIRECT == 16):
253  *
254  * i_indirect -> dir --> 16-19
255  * 	      |	     +-> 20-23
256  * 	      |
257  * 	      +-->dir2 --> 24-27
258  * 	      |	       +-> 28-31
259  * 	      |	       +-> 32-35
260  * 	      |	       +-> 36-39
261  * 	      |
262  * 	      +-->dir3 --> 40-43
263  * 	       	       +-> 44-47
264  * 	      	       +-> 48-51
265  * 	      	       +-> 52-55
266  */
267 static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
268 {
269 	unsigned long offset;
270 	struct page **dir;
271 	struct page *subdir;
272 
273 	if (index < SHMEM_NR_DIRECT) {
274 		shmem_swp_balance_unmap();
275 		return info->i_direct+index;
276 	}
277 	if (!info->i_indirect) {
278 		if (page) {
279 			info->i_indirect = *page;
280 			*page = NULL;
281 		}
282 		return NULL;			/* need another page */
283 	}
284 
285 	index -= SHMEM_NR_DIRECT;
286 	offset = index % ENTRIES_PER_PAGE;
287 	index /= ENTRIES_PER_PAGE;
288 	dir = shmem_dir_map(info->i_indirect);
289 
290 	if (index >= ENTRIES_PER_PAGE/2) {
291 		index -= ENTRIES_PER_PAGE/2;
292 		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
293 		index %= ENTRIES_PER_PAGE;
294 		subdir = *dir;
295 		if (!subdir) {
296 			if (page) {
297 				*dir = *page;
298 				*page = NULL;
299 			}
300 			shmem_dir_unmap(dir);
301 			return NULL;		/* need another page */
302 		}
303 		shmem_dir_unmap(dir);
304 		dir = shmem_dir_map(subdir);
305 	}
306 
307 	dir += index;
308 	subdir = *dir;
309 	if (!subdir) {
310 		if (!page || !(subdir = *page)) {
311 			shmem_dir_unmap(dir);
312 			return NULL;		/* need a page */
313 		}
314 		*dir = subdir;
315 		*page = NULL;
316 	}
317 	shmem_dir_unmap(dir);
318 	return shmem_swp_map(subdir) + offset;
319 }
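/*
 * Worked lookups for the function above (a sketch, taking
 * ENTRIES_PER_PAGE as 1024 and SHMEM_NR_DIRECT as 16): index 10 is
 * simply info->i_direct[10].  Index 5000 becomes 4984 after the direct
 * entries, i.e. sub-page 4, offset 888; 4 < 512, so i_indirect slot 4
 * points straight at the swp_entry page and the entry is at offset 888
 * within it.  Index 1000016 becomes sub-page 976, offset 576; since
 * 976 >= 512 the triple indirect half is used: i_indirect slot 512
 * points at a middle directory whose slot 464 points at the swp_entry
 * page holding offset 576.
 */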
320 
321 static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
322 {
323 	long incdec = value? 1: -1;
324 
325 	entry->val = value;
326 	info->swapped += incdec;
327 	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
328 		struct page *page = kmap_atomic_to_page(entry);
329 		set_page_private(page, page_private(page) + incdec);
330 	}
331 }
332 
333 /*
334  * shmem_swp_alloc - get the position of the swap entry for the page.
335  *                   If it does not exist allocate the entry.
336  *
337  * @info:	info structure for the inode
338  * @index:	index of the page to find
339  * @sgp:	check and recheck i_size? skip allocation?
340  */
341 static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
342 {
343 	struct inode *inode = &info->vfs_inode;
344 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
345 	struct page *page = NULL;
346 	swp_entry_t *entry;
347 
348 	if (sgp != SGP_WRITE &&
349 	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
350 		return ERR_PTR(-EINVAL);
351 
352 	while (!(entry = shmem_swp_entry(info, index, &page))) {
353 		if (sgp == SGP_READ)
354 			return shmem_swp_map(ZERO_PAGE(0));
355 		/*
356 		 * Test free_blocks against 1 not 0, since we have 1 data
357 		 * page (and perhaps indirect index pages) yet to allocate:
358 		 * a waste to allocate index if we cannot allocate data.
359 		 */
360 		if (sbinfo->max_blocks) {
361 			spin_lock(&sbinfo->stat_lock);
362 			if (sbinfo->free_blocks <= 1) {
363 				spin_unlock(&sbinfo->stat_lock);
364 				return ERR_PTR(-ENOSPC);
365 			}
366 			sbinfo->free_blocks--;
367 			inode->i_blocks += BLOCKS_PER_PAGE;
368 			spin_unlock(&sbinfo->stat_lock);
369 		}
370 
371 		spin_unlock(&info->lock);
372 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
373 		if (page)
374 			set_page_private(page, 0);
375 		spin_lock(&info->lock);
376 
377 		if (!page) {
378 			shmem_free_blocks(inode, 1);
379 			return ERR_PTR(-ENOMEM);
380 		}
381 		if (sgp != SGP_WRITE &&
382 		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
383 			entry = ERR_PTR(-EINVAL);
384 			break;
385 		}
386 		if (info->next_index <= index)
387 			info->next_index = index + 1;
388 	}
389 	if (page) {
390 		/* another task gave its page, or truncated the file */
391 		shmem_free_blocks(inode, 1);
392 		shmem_dir_free(page);
393 	}
394 	if (info->next_index <= index && !IS_ERR(entry))
395 		info->next_index = index + 1;
396 	return entry;
397 }
398 
399 /*
400  * shmem_free_swp - free some swap entries in a directory
401  *
402  * @dir:   pointer to the directory
403  * @edir:  pointer after last entry of the directory
404  */
405 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
406 {
407 	swp_entry_t *ptr;
408 	int freed = 0;
409 
410 	for (ptr = dir; ptr < edir; ptr++) {
411 		if (ptr->val) {
412 			free_swap_and_cache(*ptr);
413 			*ptr = (swp_entry_t){0};
414 			freed++;
415 		}
416 	}
417 	return freed;
418 }
419 
420 static int shmem_map_and_free_swp(struct page *subdir,
421 		int offset, int limit, struct page ***dir)
422 {
423 	swp_entry_t *ptr;
424 	int freed = 0;
425 
426 	ptr = shmem_swp_map(subdir);
427 	for (; offset < limit; offset += LATENCY_LIMIT) {
428 		int size = limit - offset;
429 		if (size > LATENCY_LIMIT)
430 			size = LATENCY_LIMIT;
431 		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
432 		if (need_resched()) {
433 			shmem_swp_unmap(ptr);
434 			if (*dir) {
435 				shmem_dir_unmap(*dir);
436 				*dir = NULL;
437 			}
438 			cond_resched();
439 			ptr = shmem_swp_map(subdir);
440 		}
441 	}
442 	shmem_swp_unmap(ptr);
443 	return freed;
444 }
445 
446 static void shmem_free_pages(struct list_head *next)
447 {
448 	struct page *page;
449 	int freed = 0;
450 
451 	do {
452 		page = container_of(next, struct page, lru);
453 		next = next->next;
454 		shmem_dir_free(page);
455 		freed++;
456 		if (freed >= LATENCY_LIMIT) {
457 			cond_resched();
458 			freed = 0;
459 		}
460 	} while (next);
461 }
462 
463 static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
464 {
465 	struct shmem_inode_info *info = SHMEM_I(inode);
466 	unsigned long idx;
467 	unsigned long size;
468 	unsigned long limit;
469 	unsigned long stage;
470 	unsigned long diroff;
471 	struct page **dir;
472 	struct page *topdir;
473 	struct page *middir;
474 	struct page *subdir;
475 	swp_entry_t *ptr;
476 	LIST_HEAD(pages_to_free);
477 	long nr_pages_to_free = 0;
478 	long nr_swaps_freed = 0;
479 	int offset;
480 	int freed;
481 	int punch_hole = 0;
482 
483 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
484 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
485 	if (idx >= info->next_index)
486 		return;
487 
488 	spin_lock(&info->lock);
489 	info->flags |= SHMEM_TRUNCATE;
490 	if (likely(end == (loff_t) -1)) {
491 		limit = info->next_index;
492 		info->next_index = idx;
493 	} else {
494 		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
495 		if (limit > info->next_index)
496 			limit = info->next_index;
497 		punch_hole = 1;
498 	}
499 
500 	topdir = info->i_indirect;
501 	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
502 		info->i_indirect = NULL;
503 		nr_pages_to_free++;
504 		list_add(&topdir->lru, &pages_to_free);
505 	}
506 	spin_unlock(&info->lock);
507 
508 	if (info->swapped && idx < SHMEM_NR_DIRECT) {
509 		ptr = info->i_direct;
510 		size = limit;
511 		if (size > SHMEM_NR_DIRECT)
512 			size = SHMEM_NR_DIRECT;
513 		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
514 	}
515 	if (!topdir)
516 		goto done2;
517 
518 	BUG_ON(limit <= SHMEM_NR_DIRECT);
519 	limit -= SHMEM_NR_DIRECT;
520 	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
521 	offset = idx % ENTRIES_PER_PAGE;
522 	idx -= offset;
523 
524 	dir = shmem_dir_map(topdir);
525 	stage = ENTRIES_PER_PAGEPAGE/2;
526 	if (idx < ENTRIES_PER_PAGEPAGE/2) {
527 		middir = topdir;
528 		diroff = idx/ENTRIES_PER_PAGE;
529 	} else {
530 		dir += ENTRIES_PER_PAGE/2;
531 		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
532 		while (stage <= idx)
533 			stage += ENTRIES_PER_PAGEPAGE;
534 		middir = *dir;
535 		if (*dir) {
536 			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
537 				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
538 			if (!diroff && !offset) {
539 				*dir = NULL;
540 				nr_pages_to_free++;
541 				list_add(&middir->lru, &pages_to_free);
542 			}
543 			shmem_dir_unmap(dir);
544 			dir = shmem_dir_map(middir);
545 		} else {
546 			diroff = 0;
547 			offset = 0;
548 			idx = stage;
549 		}
550 	}
551 
552 	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
553 		if (unlikely(idx == stage)) {
554 			shmem_dir_unmap(dir);
555 			dir = shmem_dir_map(topdir) +
556 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
557 			while (!*dir) {
558 				dir++;
559 				idx += ENTRIES_PER_PAGEPAGE;
560 				if (idx >= limit)
561 					goto done1;
562 			}
563 			stage = idx + ENTRIES_PER_PAGEPAGE;
564 			middir = *dir;
565 			*dir = NULL;
566 			nr_pages_to_free++;
567 			list_add(&middir->lru, &pages_to_free);
568 			shmem_dir_unmap(dir);
569 			cond_resched();
570 			dir = shmem_dir_map(middir);
571 			diroff = 0;
572 		}
573 		subdir = dir[diroff];
574 		if (subdir && page_private(subdir)) {
575 			size = limit - idx;
576 			if (size > ENTRIES_PER_PAGE)
577 				size = ENTRIES_PER_PAGE;
578 			freed = shmem_map_and_free_swp(subdir,
579 						offset, size, &dir);
580 			if (!dir)
581 				dir = shmem_dir_map(middir);
582 			nr_swaps_freed += freed;
583 			if (offset)
584 				spin_lock(&info->lock);
585 			set_page_private(subdir, page_private(subdir) - freed);
586 			if (offset)
587 				spin_unlock(&info->lock);
588 			if (!punch_hole)
589 				BUG_ON(page_private(subdir) > offset);
590 		}
591 		if (offset)
592 			offset = 0;
593 		else if (subdir && !page_private(subdir)) {
594 			dir[diroff] = NULL;
595 			nr_pages_to_free++;
596 			list_add(&subdir->lru, &pages_to_free);
597 		}
598 	}
599 done1:
600 	shmem_dir_unmap(dir);
601 done2:
602 	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
603 		/*
604 		 * Call truncate_inode_pages again: racing shmem_unuse_inode
605 		 * may have swizzled a page in from swap since vmtruncate or
606 		 * generic_delete_inode did it, before we lowered next_index.
607 		 * Also, though shmem_getpage checks i_size before adding to
608 		 * cache, no recheck after: so fix the narrow window there too.
609 		 */
610 		truncate_inode_pages_range(inode->i_mapping, start, end);
611 	}
612 
613 	spin_lock(&info->lock);
614 	info->flags &= ~SHMEM_TRUNCATE;
615 	info->swapped -= nr_swaps_freed;
616 	if (nr_pages_to_free)
617 		shmem_free_blocks(inode, nr_pages_to_free);
618 	shmem_recalc_inode(inode);
619 	spin_unlock(&info->lock);
620 
621 	/*
622 	 * Empty swap vector directory pages to be freed?
623 	 */
624 	if (!list_empty(&pages_to_free)) {
625 		pages_to_free.prev->next = NULL;
626 		shmem_free_pages(pages_to_free.next);
627 	}
628 }
629 
630 static void shmem_truncate(struct inode *inode)
631 {
632 	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
633 }
634 
635 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
636 {
637 	struct inode *inode = dentry->d_inode;
638 	struct page *page = NULL;
639 	int error;
640 
641 	if (attr->ia_valid & ATTR_SIZE) {
642 		if (attr->ia_size < inode->i_size) {
643 			/*
644 			 * If truncating down to a partial page, then
645 			 * if that page is already allocated, hold it
646 			 * in memory until the truncation is over, so
647 			 * truncate_partial_page cannot miss it were
648 			 * it assigned to swap.
649 			 */
650 			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
651 				(void) shmem_getpage(inode,
652 					attr->ia_size>>PAGE_CACHE_SHIFT,
653 						&page, SGP_READ, NULL);
654 			}
655 			/*
656 			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
657 			 * detect if any pages might have been added to cache
658 			 * after truncate_inode_pages.  But we needn't bother
659 			 * if it's being fully truncated to zero-length: the
660 			 * nrpages check is efficient enough in that case.
661 			 */
662 			if (attr->ia_size) {
663 				struct shmem_inode_info *info = SHMEM_I(inode);
664 				spin_lock(&info->lock);
665 				info->flags &= ~SHMEM_PAGEIN;
666 				spin_unlock(&info->lock);
667 			}
668 		}
669 	}
670 
671 	error = inode_change_ok(inode, attr);
672 	if (!error)
673 		error = inode_setattr(inode, attr);
674 	if (page)
675 		page_cache_release(page);
676 	return error;
677 }
678 
679 static void shmem_delete_inode(struct inode *inode)
680 {
681 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
682 	struct shmem_inode_info *info = SHMEM_I(inode);
683 
684 	if (inode->i_op->truncate == shmem_truncate) {
685 		truncate_inode_pages(inode->i_mapping, 0);
686 		shmem_unacct_size(info->flags, inode->i_size);
687 		inode->i_size = 0;
688 		shmem_truncate(inode);
689 		if (!list_empty(&info->swaplist)) {
690 			spin_lock(&shmem_swaplist_lock);
691 			list_del_init(&info->swaplist);
692 			spin_unlock(&shmem_swaplist_lock);
693 		}
694 	}
695 	BUG_ON(inode->i_blocks);
696 	if (sbinfo->max_inodes) {
697 		spin_lock(&sbinfo->stat_lock);
698 		sbinfo->free_inodes++;
699 		spin_unlock(&sbinfo->stat_lock);
700 	}
701 	clear_inode(inode);
702 }
703 
704 static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
705 {
706 	swp_entry_t *ptr;
707 
708 	for (ptr = dir; ptr < edir; ptr++) {
709 		if (ptr->val == entry.val)
710 			return ptr - dir;
711 	}
712 	return -1;
713 }
714 
715 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
716 {
717 	struct inode *inode;
718 	unsigned long idx;
719 	unsigned long size;
720 	unsigned long limit;
721 	unsigned long stage;
722 	struct page **dir;
723 	struct page *subdir;
724 	swp_entry_t *ptr;
725 	int offset;
726 
727 	idx = 0;
728 	ptr = info->i_direct;
729 	spin_lock(&info->lock);
730 	limit = info->next_index;
731 	size = limit;
732 	if (size > SHMEM_NR_DIRECT)
733 		size = SHMEM_NR_DIRECT;
734 	offset = shmem_find_swp(entry, ptr, ptr+size);
735 	if (offset >= 0) {
736 		shmem_swp_balance_unmap();
737 		goto found;
738 	}
739 	if (!info->i_indirect)
740 		goto lost2;
741 
742 	dir = shmem_dir_map(info->i_indirect);
743 	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
744 
745 	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
746 		if (unlikely(idx == stage)) {
747 			shmem_dir_unmap(dir-1);
748 			dir = shmem_dir_map(info->i_indirect) +
749 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
750 			while (!*dir) {
751 				dir++;
752 				idx += ENTRIES_PER_PAGEPAGE;
753 				if (idx >= limit)
754 					goto lost1;
755 			}
756 			stage = idx + ENTRIES_PER_PAGEPAGE;
757 			subdir = *dir;
758 			shmem_dir_unmap(dir);
759 			dir = shmem_dir_map(subdir);
760 		}
761 		subdir = *dir;
762 		if (subdir && page_private(subdir)) {
763 			ptr = shmem_swp_map(subdir);
764 			size = limit - idx;
765 			if (size > ENTRIES_PER_PAGE)
766 				size = ENTRIES_PER_PAGE;
767 			offset = shmem_find_swp(entry, ptr, ptr+size);
768 			if (offset >= 0) {
769 				shmem_dir_unmap(dir);
770 				goto found;
771 			}
772 			shmem_swp_unmap(ptr);
773 		}
774 	}
775 lost1:
776 	shmem_dir_unmap(dir-1);
777 lost2:
778 	spin_unlock(&info->lock);
779 	return 0;
780 found:
781 	idx += offset;
782 	inode = &info->vfs_inode;
783 	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
784 		info->flags |= SHMEM_PAGEIN;
785 		shmem_swp_set(info, ptr + offset, 0);
786 	}
787 	shmem_swp_unmap(ptr);
788 	spin_unlock(&info->lock);
789 	/*
790 	 * Decrement swap count even when the entry is left behind:
791 	 * try_to_unuse will skip over mms, then reincrement count.
792 	 */
793 	swap_free(entry);
794 	return 1;
795 }
796 
797 /*
798  * shmem_unuse() searches for a shmem page that has been swapped out to the given entry.
799  */
800 int shmem_unuse(swp_entry_t entry, struct page *page)
801 {
802 	struct list_head *p, *next;
803 	struct shmem_inode_info *info;
804 	int found = 0;
805 
806 	spin_lock(&shmem_swaplist_lock);
807 	list_for_each_safe(p, next, &shmem_swaplist) {
808 		info = list_entry(p, struct shmem_inode_info, swaplist);
809 		if (!info->swapped)
810 			list_del_init(&info->swaplist);
811 		else if (shmem_unuse_inode(info, entry, page)) {
812 			/* move head to start search for next from here */
813 			list_move_tail(&shmem_swaplist, &info->swaplist);
814 			found = 1;
815 			break;
816 		}
817 	}
818 	spin_unlock(&shmem_swaplist_lock);
819 	return found;
820 }
821 
822 /*
823  * Move the page from the page cache to the swap cache.
824  */
825 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
826 {
827 	struct shmem_inode_info *info;
828 	swp_entry_t *entry, swap;
829 	struct address_space *mapping;
830 	unsigned long index;
831 	struct inode *inode;
832 
833 	BUG_ON(!PageLocked(page));
834 	BUG_ON(page_mapped(page));
835 
836 	mapping = page->mapping;
837 	index = page->index;
838 	inode = mapping->host;
839 	info = SHMEM_I(inode);
840 	if (info->flags & VM_LOCKED)
841 		goto redirty;
842 	swap = get_swap_page();
843 	if (!swap.val)
844 		goto redirty;
845 
846 	spin_lock(&info->lock);
847 	shmem_recalc_inode(inode);
848 	if (index >= info->next_index) {
849 		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
850 		goto unlock;
851 	}
852 	entry = shmem_swp_entry(info, index, NULL);
853 	BUG_ON(!entry);
854 	BUG_ON(entry->val);
855 
856 	if (move_to_swap_cache(page, swap) == 0) {
857 		shmem_swp_set(info, entry, swap.val);
858 		shmem_swp_unmap(entry);
859 		spin_unlock(&info->lock);
860 		if (list_empty(&info->swaplist)) {
861 			spin_lock(&shmem_swaplist_lock);
862 			/* move instead of add in case we're racing */
863 			list_move_tail(&info->swaplist, &shmem_swaplist);
864 			spin_unlock(&shmem_swaplist_lock);
865 		}
866 		unlock_page(page);
867 		return 0;
868 	}
869 
870 	shmem_swp_unmap(entry);
871 unlock:
872 	spin_unlock(&info->lock);
873 	swap_free(swap);
874 redirty:
875 	set_page_dirty(page);
876 	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
877 }
878 
879 #ifdef CONFIG_NUMA
880 static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
881 {
882 	char *nodelist = strchr(value, ':');
883 	int err = 1;
884 
885 	if (nodelist) {
886 		/* NUL-terminate policy string */
887 		*nodelist++ = '\0';
888 		if (nodelist_parse(nodelist, *policy_nodes))
889 			goto out;
890 	}
891 	if (!strcmp(value, "default")) {
892 		*policy = MPOL_DEFAULT;
893 		/* Don't allow a nodelist */
894 		if (!nodelist)
895 			err = 0;
896 	} else if (!strcmp(value, "prefer")) {
897 		*policy = MPOL_PREFERRED;
898 		/* Insist on a nodelist of one node only */
899 		if (nodelist) {
900 			char *rest = nodelist;
901 			while (isdigit(*rest))
902 				rest++;
903 			if (!*rest)
904 				err = 0;
905 		}
906 	} else if (!strcmp(value, "bind")) {
907 		*policy = MPOL_BIND;
908 		/* Insist on a nodelist */
909 		if (nodelist)
910 			err = 0;
911 	} else if (!strcmp(value, "interleave")) {
912 		*policy = MPOL_INTERLEAVE;
913 		/* Default to nodes online if no nodelist */
914 		if (!nodelist)
915 			*policy_nodes = node_online_map;
916 		err = 0;
917 	}
918 out:
919 	/* Restore string for error message */
920 	if (nodelist)
921 		*--nodelist = ':';
922 	return err;
923 }
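/*
 * Examples of strings accepted above (illustrative): "default" (no
 * nodelist allowed), "prefer:2" (exactly one node, digits only),
 * "bind:0-3" or "bind:0,2" (nodelist required, handled by
 * nodelist_parse), "interleave" or "interleave:0-3" (nodelist optional,
 * defaulting to node_online_map).  Returns 0 on success, 1 on any parse
 * error, with the ':' restored for the caller's error message.
 */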
924 
925 static struct page *shmem_swapin_async(struct shared_policy *p,
926 				       swp_entry_t entry, unsigned long idx)
927 {
928 	struct page *page;
929 	struct vm_area_struct pvma;
930 
931 	/* Create a pseudo vma that just contains the policy */
932 	memset(&pvma, 0, sizeof(struct vm_area_struct));
933 	pvma.vm_end = PAGE_SIZE;
934 	pvma.vm_pgoff = idx;
935 	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
936 	page = read_swap_cache_async(entry, &pvma, 0);
937 	mpol_free(pvma.vm_policy);
938 	return page;
939 }
940 
941 struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
942 			  unsigned long idx)
943 {
944 	struct shared_policy *p = &info->policy;
945 	int i, num;
946 	struct page *page;
947 	unsigned long offset;
948 
949 	num = valid_swaphandles(entry, &offset);
950 	for (i = 0; i < num; offset++, i++) {
951 		page = shmem_swapin_async(p,
952 				swp_entry(swp_type(entry), offset), idx);
953 		if (!page)
954 			break;
955 		page_cache_release(page);
956 	}
957 	lru_add_drain();	/* Push any new pages onto the LRU now */
958 	return shmem_swapin_async(p, entry, idx);
959 }
960 
961 static struct page *
962 shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
963 		 unsigned long idx)
964 {
965 	struct vm_area_struct pvma;
966 	struct page *page;
967 
968 	memset(&pvma, 0, sizeof(struct vm_area_struct));
969 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
970 	pvma.vm_pgoff = idx;
971 	pvma.vm_end = PAGE_SIZE;
972 	page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
973 	mpol_free(pvma.vm_policy);
974 	return page;
975 }
976 #else
977 static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
978 {
979 	return 1;
980 }
981 
982 static inline struct page *
983 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
984 {
985 	swapin_readahead(entry, 0, NULL);
986 	return read_swap_cache_async(entry, NULL, 0);
987 }
988 
989 static inline struct page *
990 shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
991 {
992 	return alloc_page(gfp | __GFP_ZERO);
993 }
994 #endif
995 
996 /*
997  * shmem_getpage - either get the page from swap or allocate a new one
998  *
999  * If we allocate a new one we do not mark it dirty. That's up to the
1000  * vm. If we swap it in we mark it dirty and also free the swap
1001  * entry, since a page cannot live in both swap and the page cache.
1002  */
1003 static int shmem_getpage(struct inode *inode, unsigned long idx,
1004 			struct page **pagep, enum sgp_type sgp, int *type)
1005 {
1006 	struct address_space *mapping = inode->i_mapping;
1007 	struct shmem_inode_info *info = SHMEM_I(inode);
1008 	struct shmem_sb_info *sbinfo;
1009 	struct page *filepage = *pagep;
1010 	struct page *swappage;
1011 	swp_entry_t *entry;
1012 	swp_entry_t swap;
1013 	int error;
1014 
1015 	if (idx >= SHMEM_MAX_INDEX)
1016 		return -EFBIG;
1017 	/*
1018 	 * Normally, filepage is NULL on entry, and either found
1019 	 * uptodate immediately, or allocated and zeroed, or read
1020 	 * in under swappage, which is then assigned to filepage.
1021 	 * But shmem_prepare_write passes in a locked filepage,
1022 	 * which may be found not uptodate by other callers too,
1023 	 * and may need to be copied from the swappage read in.
1024 	 */
1025 repeat:
1026 	if (!filepage)
1027 		filepage = find_lock_page(mapping, idx);
1028 	if (filepage && PageUptodate(filepage))
1029 		goto done;
1030 	error = 0;
1031 	if (sgp == SGP_QUICK)
1032 		goto failed;
1033 
1034 	spin_lock(&info->lock);
1035 	shmem_recalc_inode(inode);
1036 	entry = shmem_swp_alloc(info, idx, sgp);
1037 	if (IS_ERR(entry)) {
1038 		spin_unlock(&info->lock);
1039 		error = PTR_ERR(entry);
1040 		goto failed;
1041 	}
1042 	swap = *entry;
1043 
1044 	if (swap.val) {
1045 		/* Look it up and read it in.. */
1046 		swappage = lookup_swap_cache(swap);
1047 		if (!swappage) {
1048 			shmem_swp_unmap(entry);
1049 			spin_unlock(&info->lock);
1050 			/* here we actually do the io */
1051 			if (type && *type == VM_FAULT_MINOR) {
1052 				inc_page_state(pgmajfault);
1053 				*type = VM_FAULT_MAJOR;
1054 			}
1055 			swappage = shmem_swapin(info, swap, idx);
1056 			if (!swappage) {
1057 				spin_lock(&info->lock);
1058 				entry = shmem_swp_alloc(info, idx, sgp);
1059 				if (IS_ERR(entry))
1060 					error = PTR_ERR(entry);
1061 				else {
1062 					if (entry->val == swap.val)
1063 						error = -ENOMEM;
1064 					shmem_swp_unmap(entry);
1065 				}
1066 				spin_unlock(&info->lock);
1067 				if (error)
1068 					goto failed;
1069 				goto repeat;
1070 			}
1071 			wait_on_page_locked(swappage);
1072 			page_cache_release(swappage);
1073 			goto repeat;
1074 		}
1075 
1076 		/* We have to do this with page locked to prevent races */
1077 		if (TestSetPageLocked(swappage)) {
1078 			shmem_swp_unmap(entry);
1079 			spin_unlock(&info->lock);
1080 			wait_on_page_locked(swappage);
1081 			page_cache_release(swappage);
1082 			goto repeat;
1083 		}
1084 		if (!PageSwapCache(swappage)) {
1085 			/* Page migration has occurred */
1086 			shmem_swp_unmap(entry);
1087 			spin_unlock(&info->lock);
1088 			unlock_page(swappage);
1089 			page_cache_release(swappage);
1090 			goto repeat;
1091 		}
1092 		if (PageWriteback(swappage)) {
1093 			shmem_swp_unmap(entry);
1094 			spin_unlock(&info->lock);
1095 			wait_on_page_writeback(swappage);
1096 			unlock_page(swappage);
1097 			page_cache_release(swappage);
1098 			goto repeat;
1099 		}
1100 		if (!PageUptodate(swappage)) {
1101 			shmem_swp_unmap(entry);
1102 			spin_unlock(&info->lock);
1103 			unlock_page(swappage);
1104 			page_cache_release(swappage);
1105 			error = -EIO;
1106 			goto failed;
1107 		}
1108 
1109 		if (filepage) {
1110 			shmem_swp_set(info, entry, 0);
1111 			shmem_swp_unmap(entry);
1112 			delete_from_swap_cache(swappage);
1113 			spin_unlock(&info->lock);
1114 			copy_highpage(filepage, swappage);
1115 			unlock_page(swappage);
1116 			page_cache_release(swappage);
1117 			flush_dcache_page(filepage);
1118 			SetPageUptodate(filepage);
1119 			set_page_dirty(filepage);
1120 			swap_free(swap);
1121 		} else if (!(error = move_from_swap_cache(
1122 				swappage, idx, mapping))) {
1123 			info->flags |= SHMEM_PAGEIN;
1124 			shmem_swp_set(info, entry, 0);
1125 			shmem_swp_unmap(entry);
1126 			spin_unlock(&info->lock);
1127 			filepage = swappage;
1128 			swap_free(swap);
1129 		} else {
1130 			shmem_swp_unmap(entry);
1131 			spin_unlock(&info->lock);
1132 			unlock_page(swappage);
1133 			page_cache_release(swappage);
1134 			if (error == -ENOMEM) {
1135 				/* let kswapd refresh zone for GFP_ATOMICs */
1136 				blk_congestion_wait(WRITE, HZ/50);
1137 			}
1138 			goto repeat;
1139 		}
1140 	} else if (sgp == SGP_READ && !filepage) {
1141 		shmem_swp_unmap(entry);
1142 		filepage = find_get_page(mapping, idx);
1143 		if (filepage &&
1144 		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1145 			spin_unlock(&info->lock);
1146 			wait_on_page_locked(filepage);
1147 			page_cache_release(filepage);
1148 			filepage = NULL;
1149 			goto repeat;
1150 		}
1151 		spin_unlock(&info->lock);
1152 	} else {
1153 		shmem_swp_unmap(entry);
1154 		sbinfo = SHMEM_SB(inode->i_sb);
1155 		if (sbinfo->max_blocks) {
1156 			spin_lock(&sbinfo->stat_lock);
1157 			if (sbinfo->free_blocks == 0 ||
1158 			    shmem_acct_block(info->flags)) {
1159 				spin_unlock(&sbinfo->stat_lock);
1160 				spin_unlock(&info->lock);
1161 				error = -ENOSPC;
1162 				goto failed;
1163 			}
1164 			sbinfo->free_blocks--;
1165 			inode->i_blocks += BLOCKS_PER_PAGE;
1166 			spin_unlock(&sbinfo->stat_lock);
1167 		} else if (shmem_acct_block(info->flags)) {
1168 			spin_unlock(&info->lock);
1169 			error = -ENOSPC;
1170 			goto failed;
1171 		}
1172 
1173 		if (!filepage) {
1174 			spin_unlock(&info->lock);
1175 			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1176 						    info,
1177 						    idx);
1178 			if (!filepage) {
1179 				shmem_unacct_blocks(info->flags, 1);
1180 				shmem_free_blocks(inode, 1);
1181 				error = -ENOMEM;
1182 				goto failed;
1183 			}
1184 
1185 			spin_lock(&info->lock);
1186 			entry = shmem_swp_alloc(info, idx, sgp);
1187 			if (IS_ERR(entry))
1188 				error = PTR_ERR(entry);
1189 			else {
1190 				swap = *entry;
1191 				shmem_swp_unmap(entry);
1192 			}
1193 			if (error || swap.val || 0 != add_to_page_cache_lru(
1194 					filepage, mapping, idx, GFP_ATOMIC)) {
1195 				spin_unlock(&info->lock);
1196 				page_cache_release(filepage);
1197 				shmem_unacct_blocks(info->flags, 1);
1198 				shmem_free_blocks(inode, 1);
1199 				filepage = NULL;
1200 				if (error)
1201 					goto failed;
1202 				goto repeat;
1203 			}
1204 			info->flags |= SHMEM_PAGEIN;
1205 		}
1206 
1207 		info->alloced++;
1208 		spin_unlock(&info->lock);
1209 		flush_dcache_page(filepage);
1210 		SetPageUptodate(filepage);
1211 	}
1212 done:
1213 	if (*pagep != filepage) {
1214 		unlock_page(filepage);
1215 		*pagep = filepage;
1216 	}
1217 	return 0;
1218 
1219 failed:
1220 	if (*pagep != filepage) {
1221 		unlock_page(filepage);
1222 		page_cache_release(filepage);
1223 	}
1224 	return error;
1225 }
1226 
1227 struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1228 {
1229 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
1230 	struct page *page = NULL;
1231 	unsigned long idx;
1232 	int error;
1233 
1234 	idx = (address - vma->vm_start) >> PAGE_SHIFT;
1235 	idx += vma->vm_pgoff;
1236 	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1237 	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1238 		return NOPAGE_SIGBUS;
1239 
1240 	error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1241 	if (error)
1242 		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1243 
1244 	mark_page_accessed(page);
1245 	return page;
1246 }
1247 
1248 static int shmem_populate(struct vm_area_struct *vma,
1249 	unsigned long addr, unsigned long len,
1250 	pgprot_t prot, unsigned long pgoff, int nonblock)
1251 {
1252 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
1253 	struct mm_struct *mm = vma->vm_mm;
1254 	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1255 	unsigned long size;
1256 
1257 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1258 	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1259 		return -EINVAL;
1260 
1261 	while ((long) len > 0) {
1262 		struct page *page = NULL;
1263 		int err;
1264 		/*
1265 		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1266 		 */
1267 		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1268 		if (err)
1269 			return err;
1270 		/* Page may still be null, but only if nonblock was set. */
1271 		if (page) {
1272 			mark_page_accessed(page);
1273 			err = install_page(mm, vma, addr, page, prot);
1274 			if (err) {
1275 				page_cache_release(page);
1276 				return err;
1277 			}
1278 		} else if (vma->vm_flags & VM_NONLINEAR) {
1279 			/* No page was found just because we can't read it in
1280 			 * now (being here implies nonblock != 0), but the page
1281 			 * may exist, so set the PTE to fault it in later. */
1282 			err = install_file_pte(mm, vma, addr, pgoff, prot);
1283 			if (err)
1284 				return err;
1285 		}
1286 
1287 		len -= PAGE_SIZE;
1288 		addr += PAGE_SIZE;
1289 		pgoff++;
1290 	}
1291 	return 0;
1292 }
1293 
1294 #ifdef CONFIG_NUMA
1295 int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1296 {
1297 	struct inode *i = vma->vm_file->f_dentry->d_inode;
1298 	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1299 }
1300 
1301 struct mempolicy *
1302 shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1303 {
1304 	struct inode *i = vma->vm_file->f_dentry->d_inode;
1305 	unsigned long idx;
1306 
1307 	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1308 	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1309 }
1310 #endif
1311 
1312 int shmem_lock(struct file *file, int lock, struct user_struct *user)
1313 {
1314 	struct inode *inode = file->f_dentry->d_inode;
1315 	struct shmem_inode_info *info = SHMEM_I(inode);
1316 	int retval = -ENOMEM;
1317 
1318 	spin_lock(&info->lock);
1319 	if (lock && !(info->flags & VM_LOCKED)) {
1320 		if (!user_shm_lock(inode->i_size, user))
1321 			goto out_nomem;
1322 		info->flags |= VM_LOCKED;
1323 	}
1324 	if (!lock && (info->flags & VM_LOCKED) && user) {
1325 		user_shm_unlock(inode->i_size, user);
1326 		info->flags &= ~VM_LOCKED;
1327 	}
1328 	retval = 0;
1329 out_nomem:
1330 	spin_unlock(&info->lock);
1331 	return retval;
1332 }
1333 
1334 int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1335 {
1336 	file_accessed(file);
1337 	vma->vm_ops = &shmem_vm_ops;
1338 	return 0;
1339 }
1340 
1341 static struct inode *
1342 shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1343 {
1344 	struct inode *inode;
1345 	struct shmem_inode_info *info;
1346 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1347 
1348 	if (sbinfo->max_inodes) {
1349 		spin_lock(&sbinfo->stat_lock);
1350 		if (!sbinfo->free_inodes) {
1351 			spin_unlock(&sbinfo->stat_lock);
1352 			return NULL;
1353 		}
1354 		sbinfo->free_inodes--;
1355 		spin_unlock(&sbinfo->stat_lock);
1356 	}
1357 
1358 	inode = new_inode(sb);
1359 	if (inode) {
1360 		inode->i_mode = mode;
1361 		inode->i_uid = current->fsuid;
1362 		inode->i_gid = current->fsgid;
1363 		inode->i_blksize = PAGE_CACHE_SIZE;
1364 		inode->i_blocks = 0;
1365 		inode->i_mapping->a_ops = &shmem_aops;
1366 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1367 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1368 		info = SHMEM_I(inode);
1369 		memset(info, 0, (char *)inode - (char *)info);
1370 		spin_lock_init(&info->lock);
1371 		INIT_LIST_HEAD(&info->swaplist);
1372 
1373 		switch (mode & S_IFMT) {
1374 		default:
1375 			init_special_inode(inode, mode, dev);
1376 			break;
1377 		case S_IFREG:
1378 			inode->i_op = &shmem_inode_operations;
1379 			inode->i_fop = &shmem_file_operations;
1380 			mpol_shared_policy_init(&info->policy, sbinfo->policy,
1381 							&sbinfo->policy_nodes);
1382 			break;
1383 		case S_IFDIR:
1384 			inode->i_nlink++;
1385 			/* Some things misbehave if size == 0 on a directory */
1386 			inode->i_size = 2 * BOGO_DIRENT_SIZE;
1387 			inode->i_op = &shmem_dir_inode_operations;
1388 			inode->i_fop = &simple_dir_operations;
1389 			break;
1390 		case S_IFLNK:
1391 			/*
1392 			 * Must not load anything in the rbtree,
1393 			 * mpol_free_shared_policy will not be called.
1394 			 */
1395 			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1396 						NULL);
1397 			break;
1398 		}
1399 	} else if (sbinfo->max_inodes) {
1400 		spin_lock(&sbinfo->stat_lock);
1401 		sbinfo->free_inodes++;
1402 		spin_unlock(&sbinfo->stat_lock);
1403 	}
1404 	return inode;
1405 }
1406 
1407 #ifdef CONFIG_TMPFS
1408 static struct inode_operations shmem_symlink_inode_operations;
1409 static struct inode_operations shmem_symlink_inline_operations;
1410 
1411 /*
1412  * Normally tmpfs makes no use of shmem_prepare_write, but it
1413  * lets a tmpfs file be used read-write below the loop driver.
1414  */
1415 static int
1416 shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1417 {
1418 	struct inode *inode = page->mapping->host;
1419 	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1420 }
1421 
1422 static ssize_t
1423 shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1424 {
1425 	struct inode	*inode = file->f_dentry->d_inode;
1426 	loff_t		pos;
1427 	unsigned long	written;
1428 	ssize_t		err;
1429 
1430 	if ((ssize_t) count < 0)
1431 		return -EINVAL;
1432 
1433 	if (!access_ok(VERIFY_READ, buf, count))
1434 		return -EFAULT;
1435 
1436 	mutex_lock(&inode->i_mutex);
1437 
1438 	pos = *ppos;
1439 	written = 0;
1440 
1441 	err = generic_write_checks(file, &pos, &count, 0);
1442 	if (err || !count)
1443 		goto out;
1444 
1445 	err = remove_suid(file->f_dentry);
1446 	if (err)
1447 		goto out;
1448 
1449 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1450 
1451 	do {
1452 		struct page *page = NULL;
1453 		unsigned long bytes, index, offset;
1454 		char *kaddr;
1455 		int left;
1456 
1457 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1458 		index = pos >> PAGE_CACHE_SHIFT;
1459 		bytes = PAGE_CACHE_SIZE - offset;
1460 		if (bytes > count)
1461 			bytes = count;
1462 
1463 		/*
1464 		 * We don't hold page lock across copy from user -
1465 		 * what would it guard against? - so no deadlock here.
1466 		 * But it still may be a good idea to prefault below.
1467 		 */
1468 
1469 		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1470 		if (err)
1471 			break;
1472 
1473 		left = bytes;
1474 		if (PageHighMem(page)) {
1475 			volatile unsigned char dummy;
1476 			__get_user(dummy, buf);
1477 			__get_user(dummy, buf + bytes - 1);
1478 
1479 			kaddr = kmap_atomic(page, KM_USER0);
1480 			left = __copy_from_user_inatomic(kaddr + offset,
1481 							buf, bytes);
1482 			kunmap_atomic(kaddr, KM_USER0);
1483 		}
1484 		if (left) {
1485 			kaddr = kmap(page);
1486 			left = __copy_from_user(kaddr + offset, buf, bytes);
1487 			kunmap(page);
1488 		}
1489 
1490 		written += bytes;
1491 		count -= bytes;
1492 		pos += bytes;
1493 		buf += bytes;
1494 		if (pos > inode->i_size)
1495 			i_size_write(inode, pos);
1496 
1497 		flush_dcache_page(page);
1498 		set_page_dirty(page);
1499 		mark_page_accessed(page);
1500 		page_cache_release(page);
1501 
1502 		if (left) {
1503 			pos -= left;
1504 			written -= left;
1505 			err = -EFAULT;
1506 			break;
1507 		}
1508 
1509 		/*
1510 		 * Our dirty pages are not counted in nr_dirty,
1511 		 * and we do not attempt to balance dirty pages.
1512 		 */
1513 
1514 		cond_resched();
1515 	} while (count);
1516 
1517 	*ppos = pos;
1518 	if (written)
1519 		err = written;
1520 out:
1521 	mutex_unlock(&inode->i_mutex);
1522 	return err;
1523 }
1524 
1525 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1526 {
1527 	struct inode *inode = filp->f_dentry->d_inode;
1528 	struct address_space *mapping = inode->i_mapping;
1529 	unsigned long index, offset;
1530 
1531 	index = *ppos >> PAGE_CACHE_SHIFT;
1532 	offset = *ppos & ~PAGE_CACHE_MASK;
1533 
1534 	for (;;) {
1535 		struct page *page = NULL;
1536 		unsigned long end_index, nr, ret;
1537 		loff_t i_size = i_size_read(inode);
1538 
1539 		end_index = i_size >> PAGE_CACHE_SHIFT;
1540 		if (index > end_index)
1541 			break;
1542 		if (index == end_index) {
1543 			nr = i_size & ~PAGE_CACHE_MASK;
1544 			if (nr <= offset)
1545 				break;
1546 		}
1547 
1548 		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1549 		if (desc->error) {
1550 			if (desc->error == -EINVAL)
1551 				desc->error = 0;
1552 			break;
1553 		}
1554 
1555 		/*
1556 		 * We must evaluate after, since reads (unlike writes)
1557 		 * are called without i_mutex protection against truncate
1558 		 */
1559 		nr = PAGE_CACHE_SIZE;
1560 		i_size = i_size_read(inode);
1561 		end_index = i_size >> PAGE_CACHE_SHIFT;
1562 		if (index == end_index) {
1563 			nr = i_size & ~PAGE_CACHE_MASK;
1564 			if (nr <= offset) {
1565 				if (page)
1566 					page_cache_release(page);
1567 				break;
1568 			}
1569 		}
1570 		nr -= offset;
1571 
1572 		if (page) {
1573 			/*
1574 			 * If users can be writing to this page using arbitrary
1575 			 * virtual addresses, take care about potential aliasing
1576 			 * before reading the page on the kernel side.
1577 			 */
1578 			if (mapping_writably_mapped(mapping))
1579 				flush_dcache_page(page);
1580 			/*
1581 			 * Mark the page accessed if we read the beginning.
1582 			 */
1583 			if (!offset)
1584 				mark_page_accessed(page);
1585 		} else {
1586 			page = ZERO_PAGE(0);
1587 			page_cache_get(page);
1588 		}
1589 
1590 		/*
1591 		 * Ok, we have the page, and it's up-to-date, so
1592 		 * now we can copy it to user space...
1593 		 *
1594 		 * The actor routine returns how many bytes were actually used..
1595 		 * NOTE! This may not be the same as how much of a user buffer
1596 		 * we filled up (we may be padding etc), so we can only update
1597 		 * "pos" here (the actor routine has to update the user buffer
1598 		 * pointers and the remaining count).
1599 		 */
1600 		ret = actor(desc, page, offset, nr);
1601 		offset += ret;
1602 		index += offset >> PAGE_CACHE_SHIFT;
1603 		offset &= ~PAGE_CACHE_MASK;
1604 
1605 		page_cache_release(page);
1606 		if (ret != nr || !desc->count)
1607 			break;
1608 
1609 		cond_resched();
1610 	}
1611 
1612 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1613 	file_accessed(filp);
1614 }
1615 
1616 static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1617 {
1618 	read_descriptor_t desc;
1619 
1620 	if ((ssize_t) count < 0)
1621 		return -EINVAL;
1622 	if (!access_ok(VERIFY_WRITE, buf, count))
1623 		return -EFAULT;
1624 	if (!count)
1625 		return 0;
1626 
1627 	desc.written = 0;
1628 	desc.count = count;
1629 	desc.arg.buf = buf;
1630 	desc.error = 0;
1631 
1632 	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1633 	if (desc.written)
1634 		return desc.written;
1635 	return desc.error;
1636 }
1637 
1638 static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1639 			 size_t count, read_actor_t actor, void *target)
1640 {
1641 	read_descriptor_t desc;
1642 
1643 	if (!count)
1644 		return 0;
1645 
1646 	desc.written = 0;
1647 	desc.count = count;
1648 	desc.arg.data = target;
1649 	desc.error = 0;
1650 
1651 	do_shmem_file_read(in_file, ppos, &desc, actor);
1652 	if (desc.written)
1653 		return desc.written;
1654 	return desc.error;
1655 }
1656 
1657 static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1658 {
1659 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1660 
1661 	buf->f_type = TMPFS_MAGIC;
1662 	buf->f_bsize = PAGE_CACHE_SIZE;
1663 	buf->f_namelen = NAME_MAX;
1664 	spin_lock(&sbinfo->stat_lock);
1665 	if (sbinfo->max_blocks) {
1666 		buf->f_blocks = sbinfo->max_blocks;
1667 		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1668 	}
1669 	if (sbinfo->max_inodes) {
1670 		buf->f_files = sbinfo->max_inodes;
1671 		buf->f_ffree = sbinfo->free_inodes;
1672 	}
1673 	/* else leave those fields 0 like simple_statfs */
1674 	spin_unlock(&sbinfo->stat_lock);
1675 	return 0;
1676 }
1677 
1678 /*
1679  * File creation. Allocate an inode, and we're done..
1680  */
1681 static int
1682 shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1683 {
1684 	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1685 	int error = -ENOSPC;
1686 
1687 	if (inode) {
1688 		error = security_inode_init_security(inode, dir, NULL, NULL,
1689 						     NULL);
1690 		if (error) {
1691 			if (error != -EOPNOTSUPP) {
1692 				iput(inode);
1693 				return error;
1694 			}
1695 			error = 0;
1696 		}
1697 		if (dir->i_mode & S_ISGID) {
1698 			inode->i_gid = dir->i_gid;
1699 			if (S_ISDIR(mode))
1700 				inode->i_mode |= S_ISGID;
1701 		}
1702 		dir->i_size += BOGO_DIRENT_SIZE;
1703 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1704 		d_instantiate(dentry, inode);
1705 		dget(dentry); /* Extra count - pin the dentry in core */
1706 	}
1707 	return error;
1708 }
1709 
1710 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1711 {
1712 	int error;
1713 
1714 	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1715 		return error;
1716 	dir->i_nlink++;
1717 	return 0;
1718 }
1719 
1720 static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1721 		struct nameidata *nd)
1722 {
1723 	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1724 }
1725 
1726 /*
1727  * Link a file..
1728  */
1729 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1730 {
1731 	struct inode *inode = old_dentry->d_inode;
1732 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1733 
1734 	/*
1735 	 * No ordinary (disk based) filesystem counts links as inodes;
1736 	 * but each new link needs a new dentry, pinning lowmem, and
1737 	 * tmpfs dentries cannot be pruned until they are unlinked.
1738 	 */
1739 	if (sbinfo->max_inodes) {
1740 		spin_lock(&sbinfo->stat_lock);
1741 		if (!sbinfo->free_inodes) {
1742 			spin_unlock(&sbinfo->stat_lock);
1743 			return -ENOSPC;
1744 		}
1745 		sbinfo->free_inodes--;
1746 		spin_unlock(&sbinfo->stat_lock);
1747 	}
1748 
1749 	dir->i_size += BOGO_DIRENT_SIZE;
1750 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1751 	inode->i_nlink++;
1752 	atomic_inc(&inode->i_count);	/* New dentry reference */
1753 	dget(dentry);		/* Extra pinning count for the created dentry */
1754 	d_instantiate(dentry, inode);
1755 	return 0;
1756 }
1757 
1758 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1759 {
1760 	struct inode *inode = dentry->d_inode;
1761 
1762 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1763 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1764 		if (sbinfo->max_inodes) {
1765 			spin_lock(&sbinfo->stat_lock);
1766 			sbinfo->free_inodes++;
1767 			spin_unlock(&sbinfo->stat_lock);
1768 		}
1769 	}
1770 
1771 	dir->i_size -= BOGO_DIRENT_SIZE;
1772 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1773 	inode->i_nlink--;
1774 	dput(dentry);	/* Undo the count from "create" - this does all the work */
1775 	return 0;
1776 }
1777 
1778 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1779 {
1780 	if (!simple_empty(dentry))
1781 		return -ENOTEMPTY;
1782 
1783 	dentry->d_inode->i_nlink--;
1784 	dir->i_nlink--;
1785 	return shmem_unlink(dir, dentry);
1786 }
1787 
1788 /*
1789  * The VFS layer already does all the dentry stuff for rename,
1790  * we just have to decrement the usage count for the target if
1791  * it exists so that the VFS layer correctly frees it when it
1792  * gets overwritten.
1793  */
1794 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1795 {
1796 	struct inode *inode = old_dentry->d_inode;
1797 	int they_are_dirs = S_ISDIR(inode->i_mode);
1798 
1799 	if (!simple_empty(new_dentry))
1800 		return -ENOTEMPTY;
1801 
1802 	if (new_dentry->d_inode) {
1803 		(void) shmem_unlink(new_dir, new_dentry);
1804 		if (they_are_dirs)
1805 			old_dir->i_nlink--;
1806 	} else if (they_are_dirs) {
1807 		old_dir->i_nlink--;
1808 		new_dir->i_nlink++;
1809 	}
1810 
1811 	old_dir->i_size -= BOGO_DIRENT_SIZE;
1812 	new_dir->i_size += BOGO_DIRENT_SIZE;
1813 	old_dir->i_ctime = old_dir->i_mtime =
1814 	new_dir->i_ctime = new_dir->i_mtime =
1815 	inode->i_ctime = CURRENT_TIME;
1816 	return 0;
1817 }
1818 
1819 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1820 {
1821 	int error;
1822 	int len;
1823 	struct inode *inode;
1824 	struct page *page = NULL;
1825 	char *kaddr;
1826 	struct shmem_inode_info *info;
1827 
1828 	len = strlen(symname) + 1;
1829 	if (len > PAGE_CACHE_SIZE)
1830 		return -ENAMETOOLONG;
1831 
1832 	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1833 	if (!inode)
1834 		return -ENOSPC;
1835 
1836 	error = security_inode_init_security(inode, dir, NULL, NULL,
1837 					     NULL);
1838 	if (error) {
1839 		if (error != -EOPNOTSUPP) {
1840 			iput(inode);
1841 			return error;
1842 		}
1843 		error = 0;
1844 	}
1845 
1846 	info = SHMEM_I(inode);
1847 	inode->i_size = len-1;
1848 	if (len <= (char *)inode - (char *)info) {
1849 		/* do it inline */
1850 		memcpy(info, symname, len);
1851 		inode->i_op = &shmem_symlink_inline_operations;
1852 	} else {
1853 		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1854 		if (error) {
1855 			iput(inode);
1856 			return error;
1857 		}
1858 		inode->i_op = &shmem_symlink_inode_operations;
1859 		kaddr = kmap_atomic(page, KM_USER0);
1860 		memcpy(kaddr, symname, len);
1861 		kunmap_atomic(kaddr, KM_USER0);
1862 		set_page_dirty(page);
1863 		page_cache_release(page);
1864 	}
1865 	if (dir->i_mode & S_ISGID)
1866 		inode->i_gid = dir->i_gid;
1867 	dir->i_size += BOGO_DIRENT_SIZE;
1868 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1869 	d_instantiate(dentry, inode);
1870 	dget(dentry);
1871 	return 0;
1872 }
1873 
1874 static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1875 {
1876 	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1877 	return NULL;
1878 }
1879 
1880 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1881 {
1882 	struct page *page = NULL;
1883 	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1884 	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1885 	return page;
1886 }
1887 
1888 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1889 {
1890 	if (!IS_ERR(nd_get_link(nd))) {
1891 		struct page *page = cookie;
1892 		kunmap(page);
1893 		mark_page_accessed(page);
1894 		page_cache_release(page);
1895 	}
1896 }
1897 
1898 static struct inode_operations shmem_symlink_inline_operations = {
1899 	.readlink	= generic_readlink,
1900 	.follow_link	= shmem_follow_link_inline,
1901 };
1902 
1903 static struct inode_operations shmem_symlink_inode_operations = {
1904 	.truncate	= shmem_truncate,
1905 	.readlink	= generic_readlink,
1906 	.follow_link	= shmem_follow_link,
1907 	.put_link	= shmem_put_link,
1908 };
1909 
1910 static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1911 	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1912 	int *policy, nodemask_t *policy_nodes)
1913 {
1914 	char *this_char, *value, *rest;
1915 
1916 	while (options != NULL) {
1917 		this_char = options;
1918 		for (;;) {
1919 			/*
1920 			 * NUL-terminate this option: unfortunately,
1921 			 * mount options form a comma-separated list,
1922 			 * but mpol's nodelist may also contain commas.
1923 			 */
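			/*
			 * For instance (nodelist syntax assumed for the
			 * sake of illustration): given the option string
			 * "size=50%,mpol=bind:0,2,mode=1777", the first
			 * pass cuts off "size=50%" at the comma, while
			 * the second pass skips the comma after the
			 * digit '0', leaving "mpol=bind:0,2" intact
			 * rather than truncating it at "mpol=bind:0".
			 */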
1924 			options = strchr(options, ',');
1925 			if (options == NULL)
1926 				break;
1927 			options++;
1928 			if (!isdigit(*options)) {
1929 				options[-1] = '\0';
1930 				break;
1931 			}
1932 		}
1933 		if (!*this_char)
1934 			continue;
1935 		if ((value = strchr(this_char,'=')) != NULL) {
1936 			*value++ = 0;
1937 		} else {
1938 			printk(KERN_ERR
1939 			    "tmpfs: No value for mount option '%s'\n",
1940 			    this_char);
1941 			return 1;
1942 		}
1943 
1944 		if (!strcmp(this_char,"size")) {
1945 			unsigned long long size;
1946 			size = memparse(value,&rest);
1947 			if (*rest == '%') {
1948 				size <<= PAGE_SHIFT;
1949 				size *= totalram_pages;
1950 				do_div(size, 100);
1951 				rest++;
1952 			}
1953 			if (*rest)
1954 				goto bad_val;
1955 			*blocks = size >> PAGE_CACHE_SHIFT;
1956 		} else if (!strcmp(this_char,"nr_blocks")) {
1957 			*blocks = memparse(value,&rest);
1958 			if (*rest)
1959 				goto bad_val;
1960 		} else if (!strcmp(this_char,"nr_inodes")) {
1961 			*inodes = memparse(value,&rest);
1962 			if (*rest)
1963 				goto bad_val;
1964 		} else if (!strcmp(this_char,"mode")) {
1965 			if (!mode)
1966 				continue;
1967 			*mode = simple_strtoul(value,&rest,8);
1968 			if (*rest)
1969 				goto bad_val;
1970 		} else if (!strcmp(this_char,"uid")) {
1971 			if (!uid)
1972 				continue;
1973 			*uid = simple_strtoul(value,&rest,0);
1974 			if (*rest)
1975 				goto bad_val;
1976 		} else if (!strcmp(this_char,"gid")) {
1977 			if (!gid)
1978 				continue;
1979 			*gid = simple_strtoul(value,&rest,0);
1980 			if (*rest)
1981 				goto bad_val;
1982 		} else if (!strcmp(this_char,"mpol")) {
1983 			if (shmem_parse_mpol(value,policy,policy_nodes))
1984 				goto bad_val;
1985 		} else {
1986 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1987 			       this_char);
1988 			return 1;
1989 		}
1990 	}
1991 	return 0;
1992 
1993 bad_val:
1994 	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
1995 	       value, this_char);
1996 	return 1;
1997 
1998 }
1999 
2000 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2001 {
2002 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2003 	unsigned long max_blocks = sbinfo->max_blocks;
2004 	unsigned long max_inodes = sbinfo->max_inodes;
2005 	int policy = sbinfo->policy;
2006 	nodemask_t policy_nodes = sbinfo->policy_nodes;
2007 	unsigned long blocks;
2008 	unsigned long inodes;
2009 	int error = -EINVAL;
2010 
2011 	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
2012 				&max_inodes, &policy, &policy_nodes))
2013 		return error;
2014 
2015 	spin_lock(&sbinfo->stat_lock);
2016 	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2017 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2018 	if (max_blocks < blocks)
2019 		goto out;
2020 	if (max_inodes < inodes)
2021 		goto out;
2022 	/*
2023 	 * Those tests also disallow limited->unlimited while any are in
2024 	 * use, so i_blocks will always be zero when max_blocks is zero;
2025 	 * but we must separately disallow unlimited->limited, because
2026 	 * in that case we have no record of how much is already in use.
2027 	 */
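	/*
	 * Concretely: a tmpfs mounted with a size limit may be remounted
	 * to "size=0" (unlimited) only while nothing is in use, and an
	 * unlimited instance can never be remounted to a finite size,
	 * since there is no record of how much is already in use; the
	 * same applies to nr_inodes.
	 */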
2028 	if (max_blocks && !sbinfo->max_blocks)
2029 		goto out;
2030 	if (max_inodes && !sbinfo->max_inodes)
2031 		goto out;
2032 
2033 	error = 0;
2034 	sbinfo->max_blocks  = max_blocks;
2035 	sbinfo->free_blocks = max_blocks - blocks;
2036 	sbinfo->max_inodes  = max_inodes;
2037 	sbinfo->free_inodes = max_inodes - inodes;
2038 	sbinfo->policy = policy;
2039 	sbinfo->policy_nodes = policy_nodes;
2040 out:
2041 	spin_unlock(&sbinfo->stat_lock);
2042 	return error;
2043 }
2044 #endif
2045 
2046 static void shmem_put_super(struct super_block *sb)
2047 {
2048 	kfree(sb->s_fs_info);
2049 	sb->s_fs_info = NULL;
2050 }
2051 
2052 static int shmem_fill_super(struct super_block *sb,
2053 			    void *data, int silent)
2054 {
2055 	struct inode *inode;
2056 	struct dentry *root;
2057 	int mode   = S_IRWXUGO | S_ISVTX;
2058 	uid_t uid = current->fsuid;
2059 	gid_t gid = current->fsgid;
2060 	int err = -ENOMEM;
2061 	struct shmem_sb_info *sbinfo;
2062 	unsigned long blocks = 0;
2063 	unsigned long inodes = 0;
2064 	int policy = MPOL_DEFAULT;
2065 	nodemask_t policy_nodes = node_online_map;
2066 
2067 #ifdef CONFIG_TMPFS
2068 	/*
2069 	 * By default we only allow half of the physical RAM per
2070 	 * tmpfs instance, limiting inodes to one per page of lowmem;
2071 	 * but the internal instance is left unlimited.
2072 	 */
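	/*
	 * A worked example (figures are illustrative only): with 1 GiB of
	 * RAM and 4 KiB pages, totalram_pages is about 262144, so the
	 * default limit is 131072 blocks (512 MiB) and at most 131072
	 * inodes, fewer if lowmem amounts to less than that.
	 */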
2073 	if (!(sb->s_flags & MS_NOUSER)) {
2074 		blocks = totalram_pages / 2;
2075 		inodes = totalram_pages - totalhigh_pages;
2076 		if (inodes > blocks)
2077 			inodes = blocks;
2078 		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
2079 					&inodes, &policy, &policy_nodes))
2080 			return -EINVAL;
2081 	}
2082 #else
2083 	sb->s_flags |= MS_NOUSER;
2084 #endif
2085 
2086 	/* Round up to L1_CACHE_BYTES to resist false sharing */
2087 	sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
2088 				L1_CACHE_BYTES), GFP_KERNEL);
2089 	if (!sbinfo)
2090 		return -ENOMEM;
2091 
2092 	spin_lock_init(&sbinfo->stat_lock);
2093 	sbinfo->max_blocks = blocks;
2094 	sbinfo->free_blocks = blocks;
2095 	sbinfo->max_inodes = inodes;
2096 	sbinfo->free_inodes = inodes;
2097 	sbinfo->policy = policy;
2098 	sbinfo->policy_nodes = policy_nodes;
2099 
2100 	sb->s_fs_info = sbinfo;
2101 	sb->s_maxbytes = SHMEM_MAX_BYTES;
2102 	sb->s_blocksize = PAGE_CACHE_SIZE;
2103 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2104 	sb->s_magic = TMPFS_MAGIC;
2105 	sb->s_op = &shmem_ops;
2106 	sb->s_time_gran = 1;
2107 
2108 	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2109 	if (!inode)
2110 		goto failed;
2111 	inode->i_uid = uid;
2112 	inode->i_gid = gid;
2113 	root = d_alloc_root(inode);
2114 	if (!root)
2115 		goto failed_iput;
2116 	sb->s_root = root;
2117 	return 0;
2118 
2119 failed_iput:
2120 	iput(inode);
2121 failed:
2122 	shmem_put_super(sb);
2123 	return err;
2124 }
2125 
2126 static struct kmem_cache *shmem_inode_cachep;
2127 
2128 static struct inode *shmem_alloc_inode(struct super_block *sb)
2129 {
2130 	struct shmem_inode_info *p;
2131 	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
2132 	if (!p)
2133 		return NULL;
2134 	return &p->vfs_inode;
2135 }
2136 
2137 static void shmem_destroy_inode(struct inode *inode)
2138 {
2139 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2140 		/* only struct inode is valid if it's an inline symlink */
2141 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2142 	}
2143 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2144 }
2145 
2146 static void init_once(void *foo, struct kmem_cache *cachep,
2147 		      unsigned long flags)
2148 {
2149 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2150 
2151 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2152 	    SLAB_CTOR_CONSTRUCTOR) {
2153 		inode_init_once(&p->vfs_inode);
2154 	}
2155 }
2156 
2157 static int init_inodecache(void)
2158 {
2159 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2160 				sizeof(struct shmem_inode_info),
2161 				0, 0, init_once, NULL);
2162 	if (shmem_inode_cachep == NULL)
2163 		return -ENOMEM;
2164 	return 0;
2165 }
2166 
2167 static void destroy_inodecache(void)
2168 {
2169 	if (kmem_cache_destroy(shmem_inode_cachep))
2170 		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2171 }
2172 
2173 static struct address_space_operations shmem_aops = {
2174 	.writepage	= shmem_writepage,
2175 	.set_page_dirty	= __set_page_dirty_nobuffers,
2176 #ifdef CONFIG_TMPFS
2177 	.prepare_write	= shmem_prepare_write,
2178 	.commit_write	= simple_commit_write,
2179 #endif
2180 	.migratepage	= migrate_page,
2181 };
2182 
2183 static struct file_operations shmem_file_operations = {
2184 	.mmap		= shmem_mmap,
2185 #ifdef CONFIG_TMPFS
2186 	.llseek		= generic_file_llseek,
2187 	.read		= shmem_file_read,
2188 	.write		= shmem_file_write,
2189 	.fsync		= simple_sync_file,
2190 	.sendfile	= shmem_file_sendfile,
2191 #endif
2192 };
2193 
2194 static struct inode_operations shmem_inode_operations = {
2195 	.truncate	= shmem_truncate,
2196 	.setattr	= shmem_notify_change,
2197 	.truncate_range	= shmem_truncate_range,
2198 };
2199 
2200 static struct inode_operations shmem_dir_inode_operations = {
2201 #ifdef CONFIG_TMPFS
2202 	.create		= shmem_create,
2203 	.lookup		= simple_lookup,
2204 	.link		= shmem_link,
2205 	.unlink		= shmem_unlink,
2206 	.symlink	= shmem_symlink,
2207 	.mkdir		= shmem_mkdir,
2208 	.rmdir		= shmem_rmdir,
2209 	.mknod		= shmem_mknod,
2210 	.rename		= shmem_rename,
2211 #endif
2212 };
2213 
2214 static struct super_operations shmem_ops = {
2215 	.alloc_inode	= shmem_alloc_inode,
2216 	.destroy_inode	= shmem_destroy_inode,
2217 #ifdef CONFIG_TMPFS
2218 	.statfs		= shmem_statfs,
2219 	.remount_fs	= shmem_remount_fs,
2220 #endif
2221 	.delete_inode	= shmem_delete_inode,
2222 	.drop_inode	= generic_delete_inode,
2223 	.put_super	= shmem_put_super,
2224 };
2225 
2226 static struct vm_operations_struct shmem_vm_ops = {
2227 	.nopage		= shmem_nopage,
2228 	.populate	= shmem_populate,
2229 #ifdef CONFIG_NUMA
2230 	.set_policy     = shmem_set_policy,
2231 	.get_policy     = shmem_get_policy,
2232 #endif
2233 };
2234 
2235 
2236 static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
2237 	int flags, const char *dev_name, void *data)
2238 {
2239 	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
2240 }
2241 
2242 static struct file_system_type tmpfs_fs_type = {
2243 	.owner		= THIS_MODULE,
2244 	.name		= "tmpfs",
2245 	.get_sb		= shmem_get_sb,
2246 	.kill_sb	= kill_litter_super,
2247 };
2248 static struct vfsmount *shm_mnt;
2249 
2250 static int __init init_tmpfs(void)
2251 {
2252 	int error;
2253 
2254 	error = init_inodecache();
2255 	if (error)
2256 		goto out3;
2257 
2258 	error = register_filesystem(&tmpfs_fs_type);
2259 	if (error) {
2260 		printk(KERN_ERR "Could not register tmpfs\n");
2261 		goto out2;
2262 	}
2263 #ifdef CONFIG_TMPFS
2264 	devfs_mk_dir("shm");
2265 #endif
2266 	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
2267 				tmpfs_fs_type.name, NULL);
2268 	if (IS_ERR(shm_mnt)) {
2269 		error = PTR_ERR(shm_mnt);
2270 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2271 		goto out1;
2272 	}
2273 	return 0;
2274 
2275 out1:
2276 	unregister_filesystem(&tmpfs_fs_type);
2277 out2:
2278 	destroy_inodecache();
2279 out3:
2280 	shm_mnt = ERR_PTR(error);
2281 	return error;
2282 }
2283 module_init(init_tmpfs)
2284 
2285 /*
2286  * shmem_file_setup - get an unlinked file living in tmpfs
2287  *
2288  * @name: name for dentry (to be seen in /proc/<pid>/maps)
2289  * @size: size to be set for the file
2290  *
2291  */
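/*
 * A minimal usage sketch (a hypothetical caller; the name, size and flags
 * below are illustrative assumptions, not taken from this file):
 *
 *	struct file *file;
 *
 *	file = shmem_file_setup("dev/example", 1 << 20, VM_ACCOUNT);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	...
 *	fput(file);
 *
 * shmem_zero_setup() below is one real caller.
 */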
2292 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2293 {
2294 	int error;
2295 	struct file *file;
2296 	struct inode *inode;
2297 	struct dentry *dentry, *root;
2298 	struct qstr this;
2299 
2300 	if (IS_ERR(shm_mnt))
2301 		return (void *)shm_mnt;
2302 
2303 	if (size < 0 || size > SHMEM_MAX_BYTES)
2304 		return ERR_PTR(-EINVAL);
2305 
2306 	if (shmem_acct_size(flags, size))
2307 		return ERR_PTR(-ENOMEM);
2308 
2309 	error = -ENOMEM;
2310 	this.name = name;
2311 	this.len = strlen(name);
2312 	this.hash = 0; /* will go */
2313 	root = shm_mnt->mnt_root;
2314 	dentry = d_alloc(root, &this);
2315 	if (!dentry)
2316 		goto put_memory;
2317 
2318 	error = -ENFILE;
2319 	file = get_empty_filp();
2320 	if (!file)
2321 		goto put_dentry;
2322 
2323 	error = -ENOSPC;
2324 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2325 	if (!inode)
2326 		goto close_file;
2327 
2328 	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2329 	d_instantiate(dentry, inode);
2330 	inode->i_size = size;
2331 	inode->i_nlink = 0;	/* It is unlinked */
2332 	file->f_vfsmnt = mntget(shm_mnt);
2333 	file->f_dentry = dentry;
2334 	file->f_mapping = inode->i_mapping;
2335 	file->f_op = &shmem_file_operations;
2336 	file->f_mode = FMODE_WRITE | FMODE_READ;
2337 	return file;
2338 
2339 close_file:
2340 	put_filp(file);
2341 put_dentry:
2342 	dput(dentry);
2343 put_memory:
2344 	shmem_unacct_size(flags, size);
2345 	return ERR_PTR(error);
2346 }
2347 
2348 /*
2349  * shmem_zero_setup - setup a shared anonymous mapping
2350  *
2351  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2352  */
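/*
 * For illustration: a userspace mapping such as
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * is prepared by do_mmap_pgoff(), which then calls shmem_zero_setup() so
 * that the shared anonymous region is backed by an unlinked tmpfs file.
 */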
2353 int shmem_zero_setup(struct vm_area_struct *vma)
2354 {
2355 	struct file *file;
2356 	loff_t size = vma->vm_end - vma->vm_start;
2357 
2358 	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2359 	if (IS_ERR(file))
2360 		return PTR_ERR(file);
2361 
2362 	if (vma->vm_file)
2363 		fput(vma->vm_file);
2364 	vma->vm_file = file;
2365 	vma->vm_ops = &shmem_vm_ops;
2366 	return 0;
2367 }
2368