1 /* 2 * Generic pidhash and scalable, time-bounded PID allocator 3 * 4 * (C) 2002-2003 William Irwin, IBM 5 * (C) 2004 William Irwin, Oracle 6 * (C) 2002-2004 Ingo Molnar, Red Hat 7 * 8 * pid-structures are backing objects for tasks sharing a given ID to chain 9 * against. There is very little to them aside from hashing them and 10 * parking tasks using given ID's on a list. 11 * 12 * The hash is always changed with the tasklist_lock write-acquired, 13 * and the hash is only accessed with the tasklist_lock at least 14 * read-acquired, so there's no additional SMP locking needed here. 15 * 16 * We have a list of bitmap pages, which bitmaps represent the PID space. 17 * Allocating and freeing PIDs is completely lockless. The worst-case 18 * allocation scenario when all but one out of 1 million PIDs possible are 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 21 */ 22 23 #include <linux/mm.h> 24 #include <linux/module.h> 25 #include <linux/slab.h> 26 #include <linux/init.h> 27 #include <linux/bootmem.h> 28 #include <linux/hash.h> 29 #include <linux/pid_namespace.h> 30 31 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 32 static struct hlist_head *pid_hash; 33 static int pidhash_shift; 34 static struct kmem_cache *pid_cachep; 35 36 int pid_max = PID_MAX_DEFAULT; 37 38 #define RESERVED_PIDS 300 39 40 int pid_max_min = RESERVED_PIDS + 1; 41 int pid_max_max = PID_MAX_LIMIT; 42 43 #define BITS_PER_PAGE (PAGE_SIZE*8) 44 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 45 46 static inline int mk_pid(struct pid_namespace *pid_ns, 47 struct pidmap *map, int off) 48 { 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; 50 } 51 52 #define find_next_offset(map, off) \ 53 find_next_zero_bit((map)->page, BITS_PER_PAGE, off) 54 55 /* 56 * PID-map pages start out as NULL, they get allocated upon 57 * first use and are never deallocated. This way a low pid_max 58 * value does not cause lots of bitmaps to be allocated, but 59 * the scheme scales to up to 4 million PIDs, runtime. 60 */ 61 struct pid_namespace init_pid_ns = { 62 .kref = { 63 .refcount = ATOMIC_INIT(2), 64 }, 65 .pidmap = { 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 67 }, 68 .last_pid = 0, 69 .child_reaper = &init_task 70 }; 71 72 /* 73 * Note: disable interrupts while the pidmap_lock is held as an 74 * interrupt might come in and do read_lock(&tasklist_lock). 75 * 76 * If we don't disable interrupts there is a nasty deadlock between 77 * detach_pid()->free_pid() and another cpu that does 78 * spin_lock(&pidmap_lock) followed by an interrupt routine that does 79 * read_lock(&tasklist_lock); 80 * 81 * After we clean up the tasklist_lock and know there are no 82 * irq handlers that take it we can leave the interrupts enabled. 83 * For now it is easier to be safe than to prove it can't happen. 84 */ 85 86 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 87 88 static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) 89 { 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 91 int offset = pid & BITS_PER_PAGE_MASK; 92 93 clear_bit(offset, map->page); 94 atomic_inc(&map->nr_free); 95 } 96 97 static int alloc_pidmap(struct pid_namespace *pid_ns) 98 { 99 int i, offset, max_scan, pid, last = pid_ns->last_pid; 100 struct pidmap *map; 101 102 pid = last + 1; 103 if (pid >= pid_max) 104 pid = RESERVED_PIDS; 105 offset = pid & BITS_PER_PAGE_MASK; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 108 for (i = 0; i <= max_scan; ++i) { 109 if (unlikely(!map->page)) { 110 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 111 /* 112 * Free the page if someone raced with us 113 * installing it: 114 */ 115 spin_lock_irq(&pidmap_lock); 116 if (map->page) 117 kfree(page); 118 else 119 map->page = page; 120 spin_unlock_irq(&pidmap_lock); 121 if (unlikely(!map->page)) 122 break; 123 } 124 if (likely(atomic_read(&map->nr_free))) { 125 do { 126 if (!test_and_set_bit(offset, map->page)) { 127 atomic_dec(&map->nr_free); 128 pid_ns->last_pid = pid; 129 return pid; 130 } 131 offset = find_next_offset(map, offset); 132 pid = mk_pid(pid_ns, map, offset); 133 /* 134 * find_next_offset() found a bit, the pid from it 135 * is in-bounds, and if we fell back to the last 136 * bitmap block and the final block was the same 137 * as the starting point, pid is before last_pid. 138 */ 139 } while (offset < BITS_PER_PAGE && pid < pid_max && 140 (i != max_scan || pid < last || 141 !((last+1) & BITS_PER_PAGE_MASK))); 142 } 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 144 ++map; 145 offset = 0; 146 } else { 147 map = &pid_ns->pidmap[0]; 148 offset = RESERVED_PIDS; 149 if (unlikely(last == offset)) 150 break; 151 } 152 pid = mk_pid(pid_ns, map, offset); 153 } 154 return -1; 155 } 156 157 static int next_pidmap(struct pid_namespace *pid_ns, int last) 158 { 159 int offset; 160 struct pidmap *map, *end; 161 162 offset = (last + 1) & BITS_PER_PAGE_MASK; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 165 for (; map < end; map++, offset = 0) { 166 if (unlikely(!map->page)) 167 continue; 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 169 if (offset < BITS_PER_PAGE) 170 return mk_pid(pid_ns, map, offset); 171 } 172 return -1; 173 } 174 175 fastcall void put_pid(struct pid *pid) 176 { 177 if (!pid) 178 return; 179 if ((atomic_read(&pid->count) == 1) || 180 atomic_dec_and_test(&pid->count)) 181 kmem_cache_free(pid_cachep, pid); 182 } 183 EXPORT_SYMBOL_GPL(put_pid); 184 185 static void delayed_put_pid(struct rcu_head *rhp) 186 { 187 struct pid *pid = container_of(rhp, struct pid, rcu); 188 put_pid(pid); 189 } 190 191 fastcall void free_pid(struct pid *pid) 192 { 193 /* We can be called with write_lock_irq(&tasklist_lock) held */ 194 unsigned long flags; 195 196 spin_lock_irqsave(&pidmap_lock, flags); 197 hlist_del_rcu(&pid->pid_chain); 198 spin_unlock_irqrestore(&pidmap_lock, flags); 199 200 free_pidmap(&init_pid_ns, pid->nr); 201 call_rcu(&pid->rcu, delayed_put_pid); 202 } 203 204 struct pid *alloc_pid(void) 205 { 206 struct pid *pid; 207 enum pid_type type; 208 int nr = -1; 209 210 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); 211 if (!pid) 212 goto out; 213 214 nr = alloc_pidmap(current->nsproxy->pid_ns); 215 if (nr < 0) 216 goto out_free; 217 218 atomic_set(&pid->count, 1); 219 pid->nr = nr; 220 for (type = 0; type < PIDTYPE_MAX; ++type) 221 INIT_HLIST_HEAD(&pid->tasks[type]); 222 223 spin_lock_irq(&pidmap_lock); 224 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); 225 spin_unlock_irq(&pidmap_lock); 226 227 out: 228 return pid; 229 230 out_free: 231 kmem_cache_free(pid_cachep, pid); 232 pid = NULL; 233 goto out; 234 } 235 236 struct pid * fastcall find_pid(int nr) 237 { 238 struct hlist_node *elem; 239 struct pid *pid; 240 241 hlist_for_each_entry_rcu(pid, elem, 242 &pid_hash[pid_hashfn(nr)], pid_chain) { 243 if (pid->nr == nr) 244 return pid; 245 } 246 return NULL; 247 } 248 EXPORT_SYMBOL_GPL(find_pid); 249 250 int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) 251 { 252 struct pid_link *link; 253 struct pid *pid; 254 255 link = &task->pids[type]; 256 link->pid = pid = find_pid(nr); 257 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 258 259 return 0; 260 } 261 262 void fastcall detach_pid(struct task_struct *task, enum pid_type type) 263 { 264 struct pid_link *link; 265 struct pid *pid; 266 int tmp; 267 268 link = &task->pids[type]; 269 pid = link->pid; 270 271 hlist_del_rcu(&link->node); 272 link->pid = NULL; 273 274 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 275 if (!hlist_empty(&pid->tasks[tmp])) 276 return; 277 278 free_pid(pid); 279 } 280 281 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 282 void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, 283 enum pid_type type) 284 { 285 new->pids[type].pid = old->pids[type].pid; 286 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 287 old->pids[type].pid = NULL; 288 } 289 290 struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 291 { 292 struct task_struct *result = NULL; 293 if (pid) { 294 struct hlist_node *first; 295 first = rcu_dereference(pid->tasks[type].first); 296 if (first) 297 result = hlist_entry(first, struct task_struct, pids[(type)].node); 298 } 299 return result; 300 } 301 302 /* 303 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 304 */ 305 struct task_struct *find_task_by_pid_type(int type, int nr) 306 { 307 return pid_task(find_pid(nr), type); 308 } 309 310 EXPORT_SYMBOL(find_task_by_pid_type); 311 312 struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 313 { 314 struct pid *pid; 315 rcu_read_lock(); 316 pid = get_pid(task->pids[type].pid); 317 rcu_read_unlock(); 318 return pid; 319 } 320 321 struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) 322 { 323 struct task_struct *result; 324 rcu_read_lock(); 325 result = pid_task(pid, type); 326 if (result) 327 get_task_struct(result); 328 rcu_read_unlock(); 329 return result; 330 } 331 332 struct pid *find_get_pid(pid_t nr) 333 { 334 struct pid *pid; 335 336 rcu_read_lock(); 337 pid = get_pid(find_pid(nr)); 338 rcu_read_unlock(); 339 340 return pid; 341 } 342 343 /* 344 * Used by proc to find the first pid that is greater then or equal to nr. 345 * 346 * If there is a pid at nr this function is exactly the same as find_pid. 347 */ 348 struct pid *find_ge_pid(int nr) 349 { 350 struct pid *pid; 351 352 do { 353 pid = find_pid(nr); 354 if (pid) 355 break; 356 nr = next_pidmap(current->nsproxy->pid_ns, nr); 357 } while (nr > 0); 358 359 return pid; 360 } 361 EXPORT_SYMBOL_GPL(find_get_pid); 362 363 int copy_pid_ns(int flags, struct task_struct *tsk) 364 { 365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; 366 int err = 0; 367 368 if (!old_ns) 369 return 0; 370 371 get_pid_ns(old_ns); 372 return err; 373 } 374 375 void free_pid_ns(struct kref *kref) 376 { 377 struct pid_namespace *ns; 378 379 ns = container_of(kref, struct pid_namespace, kref); 380 kfree(ns); 381 } 382 383 /* 384 * The pid hash table is scaled according to the amount of memory in the 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 386 * more. 387 */ 388 void __init pidhash_init(void) 389 { 390 int i, pidhash_size; 391 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 392 393 pidhash_shift = max(4, fls(megabytes * 4)); 394 pidhash_shift = min(12, pidhash_shift); 395 pidhash_size = 1 << pidhash_shift; 396 397 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 398 pidhash_size, pidhash_shift, 399 pidhash_size * sizeof(struct hlist_head)); 400 401 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); 402 if (!pid_hash) 403 panic("Could not alloc pidhash!\n"); 404 for (i = 0; i < pidhash_size; i++) 405 INIT_HLIST_HEAD(&pid_hash[i]); 406 } 407 408 void __init pidmap_init(void) 409 { 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 411 /* Reserve PID 0. We never call free_pidmap(0) */ 412 set_bit(0, init_pid_ns.pidmap[0].page); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 414 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 416 __alignof__(struct pid), 417 SLAB_PANIC, NULL, NULL); 418 } 419