/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 William Irwin, IBM
 * (C) 2004 William Irwin, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>

#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash;
static int pidhash_shift;
static kmem_cache_t *pid_cachep;

int pid_max = PID_MAX_DEFAULT;
int last_pid;

#define RESERVED_PIDS		300

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

#define PIDMAP_ENTRIES		((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
#define BITS_PER_PAGE		(PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
#define mk_pid(map, off)	(((map) - pidmap_array)*BITS_PER_PAGE + (off))
#define find_next_offset(map, off)					\
		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
typedef struct pidmap {
	atomic_t nr_free;
	void *page;
} pidmap_t;

static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
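
/*
 * Illustrative example (assuming 4 KiB pages and a PID_MAX_LIMIT of
 * 4*1024*1024; neither value is guaranteed by this file): BITS_PER_PAGE
 * is then 32768 and pidmap_array has 128 entries, so PID 40000 lives in
 * pidmap_array[1] at bit offset 40000 & BITS_PER_PAGE_MASK == 7232, and
 * mk_pid(&pidmap_array[1], 7232) recovers 1*32768 + 7232 == 40000.
 */
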
/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

static fastcall void free_pidmap(int pid)
{
	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
	int offset = pid & BITS_PER_PAGE_MASK;

	clear_bit(offset, map->page);
	atomic_inc(&map->nr_free);
}

static int alloc_pidmap(void)
{
	int i, offset, max_scan, pid, last = last_pid;
	pidmap_t *map;

	pid = last + 1;
	if (pid >= pid_max)
		pid = RESERVED_PIDS;
	offset = pid & BITS_PER_PAGE_MASK;
	map = &pidmap_array[pid/BITS_PER_PAGE];
	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
	for (i = 0; i <= max_scan; ++i) {
		if (unlikely(!map->page)) {
			unsigned long page = get_zeroed_page(GFP_KERNEL);
			/*
			 * Free the page if someone raced with us
			 * installing it:
			 */
			spin_lock_irq(&pidmap_lock);
			if (map->page)
				free_page(page);
			else
				map->page = (void *)page;
			spin_unlock_irq(&pidmap_lock);
			if (unlikely(!map->page))
				break;
		}
		if (likely(atomic_read(&map->nr_free))) {
			do {
				if (!test_and_set_bit(offset, map->page)) {
					atomic_dec(&map->nr_free);
					last_pid = pid;
					return pid;
				}
				offset = find_next_offset(map, offset);
				pid = mk_pid(map, offset);
			/*
			 * find_next_offset() found a bit, the pid from it
			 * is in-bounds, and if we fell back to the last
			 * bitmap block and the final block was the same
			 * as the starting point, pid is before last_pid.
			 */
			} while (offset < BITS_PER_PAGE && pid < pid_max &&
					(i != max_scan || pid < last ||
					    !((last+1) & BITS_PER_PAGE_MASK)));
		}
		if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
			++map;
			offset = 0;
		} else {
			map = &pidmap_array[0];
			offset = RESERVED_PIDS;
			if (unlikely(last == offset))
				break;
		}
		pid = mk_pid(map, offset);
	}
	return -1;
}

fastcall void put_pid(struct pid *pid)
{
	if (!pid)
		return;
	if ((atomic_read(&pid->count) == 1) ||
	     atomic_dec_and_test(&pid->count))
		kmem_cache_free(pid_cachep, pid);
}

static void delayed_put_pid(struct rcu_head *rhp)
{
	struct pid *pid = container_of(rhp, struct pid, rcu);
	put_pid(pid);
}

fastcall void free_pid(struct pid *pid)
{
	/* We can be called with write_lock_irq(&tasklist_lock) held */
	unsigned long flags;

	spin_lock_irqsave(&pidmap_lock, flags);
	hlist_del_rcu(&pid->pid_chain);
	spin_unlock_irqrestore(&pidmap_lock, flags);

	free_pidmap(pid->nr);
	call_rcu(&pid->rcu, delayed_put_pid);
}

struct pid *alloc_pid(void)
{
	struct pid *pid;
	enum pid_type type;
	int nr = -1;

	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
	if (!pid)
		goto out;

	nr = alloc_pidmap();
	if (nr < 0)
		goto out_free;

	atomic_set(&pid->count, 1);
	pid->nr = nr;
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	spin_lock_irq(&pidmap_lock);
	hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
	spin_unlock_irq(&pidmap_lock);

out:
	return pid;

out_free:
	kmem_cache_free(pid_cachep, pid);
	pid = NULL;
	goto out;
}

struct pid * fastcall find_pid(int nr)
{
	struct hlist_node *elem;
	struct pid *pid;

	hlist_for_each_entry_rcu(pid, elem,
			&pid_hash[pid_hashfn(nr)], pid_chain) {
		if (pid->nr == nr)
			return pid;
	}
	return NULL;
}
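
/*
 * Illustrative lifecycle sketch (hypothetical caller, based only on the
 * functions in this file): a PID typically flows through
 *
 *	struct pid *pid = alloc_pid();
 *	if (pid)
 *		attach_pid(task, PIDTYPE_PID, pid->nr);
 *	...
 *	detach_pid(task, PIDTYPE_PID);
 *
 * attach_pid()/detach_pid() below link tasks onto pid->tasks[type]. Once
 * detach_pid() finds every per-type list empty it calls free_pid(), which
 * unhashes the struct pid, returns its bitmap bit and drops a reference
 * from an RCU callback; the final put_pid() hands the object back to the
 * slab cache, so concurrent lookups through find_pid() remain safe.
 */
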
int fastcall
attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
	struct pid_link *link;
	struct pid *pid;

	WARN_ON(!task->pid); /* to be removed soon */
	WARN_ON(!nr); /* to be removed soon */

	link = &task->pids[type];
	link->pid = pid = find_pid(nr);
	hlist_add_head_rcu(&link->node, &pid->tasks[type]);

	return 0;
}

void fastcall detach_pid(struct task_struct *task, enum pid_type type)
{
	struct pid_link *link;
	struct pid *pid;
	int tmp;

	link = &task->pids[type];
	pid = link->pid;

	hlist_del_rcu(&link->node);
	link->pid = NULL;

	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
		if (!hlist_empty(&pid->tasks[tmp]))
			return;

	free_pid(pid);
}

struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result = NULL;
	if (pid) {
		struct hlist_node *first;
		first = rcu_dereference(pid->tasks[type].first);
		if (first)
			result = hlist_entry(first, struct task_struct, pids[(type)].node);
	}
	return result;
}

/*
 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
 */
struct task_struct *find_task_by_pid_type(int type, int nr)
{
	return pid_task(find_pid(nr), type);
}

EXPORT_SYMBOL(find_task_by_pid_type);

struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result;
	rcu_read_lock();
	result = pid_task(pid, type);
	if (result)
		get_task_struct(result);
	rcu_read_unlock();
	return result;
}

struct pid *find_get_pid(pid_t nr)
{
	struct pid *pid;

	rcu_read_lock();
	pid = get_pid(find_pid(nr));
	rcu_read_unlock();

	return pid;
}

/*
 * The pid hash table is scaled according to the amount of memory in the
 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
 * more.
 */
void __init pidhash_init(void)
{
	int i, pidhash_size;
	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);

	pidhash_shift = max(4, fls(megabytes * 4));
	pidhash_shift = min(12, pidhash_shift);
	pidhash_size = 1 << pidhash_shift;

	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
		pidhash_size, pidhash_shift,
		pidhash_size * sizeof(struct hlist_head));

	pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
	if (!pid_hash)
		panic("Could not alloc pidhash!\n");
	for (i = 0; i < pidhash_size; i++)
		INIT_HLIST_HEAD(&pid_hash[i]);
}

void __init pidmap_init(void)
{
	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
	/* Reserve PID 0. We never call free_pidmap(0) */
	set_bit(0, pidmap_array->page);
	atomic_dec(&pidmap_array->nr_free);

	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
					__alignof__(struct pid),
					SLAB_PANIC, NULL, NULL);
}
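
/*
 * Sizing sketch for pidhash_init() above (illustrative numbers, assuming
 * nr_kernel_pages corresponds to roughly 128 MB of lowmem): megabytes == 128,
 * fls(128 * 4) == 10, so pidhash_shift becomes 10 and the table gets
 * 1 << 10 == 1024 hlist heads. The max(4, ...) and min(12, ...) clamps
 * give the 16..4096 slot range mentioned in its comment.
 */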