xref: /titanic_52/usr/src/uts/common/inet/ipd/ipd.c (revision b52c8fbe58cfd9f349fdfa6cc5cb6f5731946a2b)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /*
12  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
13  */
14 
15 /*
16  * ipd: Internet packet disturber
17  *
18  * The purpose of ipd is to simulate congested and lossy networks when they
19  * don't actually exist. The features of these congested and lossy networks are
20  * events that end up leading to retransmits and thus kicking us out of the
21  * TCP/IP fastpath. Since normally this would require us to have an actually
22  * congested network, which can be problematic, we instead simulate this
23  * behavior.
24  *
25  * 1. ipd's operations and restrictions
26  *
27  * ipd currently has facilities to cause IP traffic to be:
28  *
29  *   - Corrupted with some probability.
30  *   - Delayed for a set number of microseconds.
31  *   - Dropped with some probability.
32  *
33  * Each of these features is enabled on a per-zone basis. The current
34  * implementation restricts this specifically to exclusive stack zones.
35  * Enabling ipd on a given zone causes pfhooks to be installed for that zone's
36  * netstack. Because of the nature of ipd, it currently only supports exclusive
37  * stack zones and as a further restriction, it only allows the global zone
38  * administrative access. ipd can be enabled for the global zone, but doing so
39  * will cause all shared-stack zones to also be affected.
40  *
41  * 2. General architecture and Locking
42  *
43  * ipd consists of a few components. There is a per netstack data structure that
44  * is created and destroyed with the creation and destruction of each exclusive
45  * stack zone. Each of these netstacks is stored in a global list which is
46  * accessed for control of ipd via ioctls. The following diagram touches on the
47  * data structures that are used throughout ipd.
48  *
49  *   ADMINISTRATIVE			         DATA PATH
50  *
51  *    +--------+                          +------+       +------+
52  *    | ipdadm |                          |  ip  |       | nics |
53  *    +--------+                          +------+       +------+
54  *       |  ^                                |               |
55  *       |  | ioctl(2)                       |               |
56  *       V  |                                V               V
57  *    +----------+                     +-------------------------+
58  *    | /dev/ipd |                     | pfhooks packet callback | == ipd_hook()
59  *    +----------+                     +-------------------------+
60  *         |                                         |
61  *         |                                         |
62  *         V                                         |
63  *    +----------------+                             |
64  *    | list_t ipd_nsl |------+                      |
65  *    +----------------+      |                      |
66  *                            |                      |
67  *                            V     per netstack     V
68  *                         +----------------------------+
69  *                         |       ipd_netstack_t       |
70  *                         +----------------------------+
71  *
72  * ipd has two different entry points, one is administrative, the other is the
73  * data path. The administrative path is accessed by a userland component called
74  * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd.
75  * If the administrative path enables a specific zone, then the data path will
76  * become active for that zone. Any packet that leaves that zone's IP stack or
77  * is going to enter it, comes through the callback specified in the hook_t(9S)
78  * structure. This will cause each packet to go through ipd_hook().
79  *
80  * While the locking inside of ipd should be straightforward, unfortunately, the
81  * pfhooks subsystem necessarily complicates this a little bit. There are
82  * currently three different sets of locks in ipd.
83  *
84  *   - Global lock N on the netstack list.
85  *   - Global lock A on the active count.
86  *   - Per-netstack data structure lock Z.
87  *
88  * # Locking rules
89  *
90  * L.1a N must always be acquired first and released last
91  *
92  * If you need to acquire the netstack list lock, either for reading or writing,
93  * then N must be acquired first and before any other locks. It may not be
94  * dropped before any other lock.
95  *
96  * L.1b N must only be acquired from the administrative path and zone creation,
97  *      shutdown, and destruct callbacks.
98  *
99  * The data path, e.g. receiving the per-packet callbacks, should never be
100  * grabbing the list lock. If it is, then the architecture here needs to be
101  * reconsidered.
102  *
103  * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks
104  *     are active.
105  *
106  * The way the pfhooks subsystem is designed is that a reference count is
107  * present on the hook_t while it is active. As long as that reference count is
108  * non-zero, a call to net_hook_unregister will block until it is lowered.
109  * Because the callbacks want the same lock for the netstack that is held by the
110  * administrative path calling into net_hook_unregister, we deadlock.
111  *
112  *  ioctl from ipdadm remove      hook_t cb (from nic)       hook_t cb (from IP)
113  *  -----------------------       --------------------       -------------------
114  *       |                             |                             |
115  *       |                        bump hook_t refcount               |
116  *  mutex_enter(ipd_nsl_lock);    enter ipd_hook()          bump hook_t refcount
117  *  mutex acquired                mutex_enter(ins->ipdn_lock);       |
118  *       |                        mutex acquired            enter ipd_hook()
119  *  mutex_enter(ins->ipdn_lock);       |            mutex_enter(ins->ipdn_lock);
120  *       |                             |                             |
121  *       |                             |                             |
122  *       |                        mutex_exit(ins->ipdn_lock);        |
123  *       |                             |                             |
124  *  mutex acquired                leave ipd_hook()                   |
125  *       |                        decrement hook_t refcount          |
126  *       |                             |                             |
127  *  ipd_teardown_hooks()               |                             |
128  *  net_hook_unregister()              |                             |
129  *  cv_wait() if refcount              |                             |
130  *       |                             |                             |
131  *  ---------------------------------------------------------------------------
132  *
133  * At this point, we can see that the second hook callback still doesn't have
134  * the mutex, but it has bumped the hook_t refcount. However, it will never
135  * acquire the mutex that it needs to finish its operation and decrement the
136  * refcount.
137  *
138  * Obviously, deadlocking is not acceptable, thus the following corollary to the
139  * second locking rule:
140  *
141  * L.2 Corollary: If Z is being released across a call to the pfhooks subsystem,
142  *                N must be held.
143  *
144  * There is currently only one path where we have to worry about this. That is
145  * when we are removing a hook while the zone is not being shut down, which
146  * means that the hooks are still active. The only place that this currently
147  * happens is in ipd_check_hooks().
148  *
149  */
150 
151 #include <sys/types.h>
152 #include <sys/file.h>
153 #include <sys/errno.h>
154 #include <sys/open.h>
155 #include <sys/cred.h>
156 #include <sys/ddi.h>
157 #include <sys/sunddi.h>
158 #include <sys/kmem.h>
159 #include <sys/conf.h>
160 #include <sys/stat.h>
161 #include <sys/cmn_err.h>
162 #include <sys/ddi.h>
163 #include <sys/sunddi.h>
164 #include <sys/modctl.h>
165 #include <sys/kstat.h>
166 #include <sys/neti.h>
167 #include <sys/list.h>
168 #include <sys/ksynch.h>
169 #include <sys/sysmacros.h>
170 #include <sys/policy.h>
171 #include <sys/atomic.h>
172 #include <sys/model.h>
173 #include <sys/strsun.h>
174 
175 #include <sys/netstack.h>
176 #include <sys/hook.h>
177 #include <sys/hook_event.h>
178 
179 #include <sys/ipd.h>
180 
/*
 * Status bits stored in the ipdn_status member of each ipd_netstack_t.
 */
#define	IPDN_STATUS_DISABLED	(1 << 0)
#define	IPDN_STATUS_ENABLED	(1 << 1)
#define	IPDN_STATUS_CONDEMNED	(1 << 2)
184 
/*
 * Bits recorded in ipdn_hooked: one per packet hook that is currently
 * registered with pfhooks. IPDN_HOOK_ALL is the union of all four.
 */
#define	IPDN_HOOK_NONE		0
#define	IPDN_HOOK_V4IN		(1 << 0)
#define	IPDN_HOOK_V4OUT		(1 << 1)
#define	IPDN_HOOK_V6IN		(1 << 2)
#define	IPDN_HOOK_V6OUT		(1 << 3)
#define	IPDN_HOOK_ALL		(IPDN_HOOK_V4IN | IPDN_HOOK_V4OUT | \
				IPDN_HOOK_V6IN | IPDN_HOOK_V6OUT)
194 
195 /*
196  * Per-netstack kstats.
197  */
198 typedef struct ipd_nskstat {
199 	kstat_named_t	ink_ndrops;
200 	kstat_named_t	ink_ncorrupts;
201 	kstat_named_t	ink_ndelays;
202 } ipd_nskstat_t;
203 
204 /*
205  * Different parts of this structure have different locking semantics. The list
206  * node is not normally referenced, if it is, one has to hold the ipd_nsl_lock.
207  * The following members are read only: ipdn_netid and ipdn_zoneid. The members
208  * of the kstat structure are always accessible in the data path, but the
209  * counters must be bumped with atomic operations. The ipdn_lock protects every
210  * other aspect of this structure. Please see the big theory statement on the
211  * requirements for lock ordering.
212  */
213 typedef struct ipd_netstack {
214 	list_node_t	ipdn_link;		/* link on ipd_nsl */
215 	netid_t		ipdn_netid;		/* netstack id */
216 	zoneid_t	ipdn_zoneid;		/* zone id */
217 	kstat_t		*ipdn_kstat;		/* kstat_t ptr */
218 	ipd_nskstat_t	ipdn_ksdata;		/* kstat data */
219 	kmutex_t	ipdn_lock;		/* protects following members */
220 	int		ipdn_status;		/* status flags */
221 	net_handle_t	ipdn_v4hdl;		/* IPv4 net handle */
222 	net_handle_t	ipdn_v6hdl;		/* IPv4 net handle */
223 	int		ipdn_hooked;		/* are hooks registered */
224 	hook_t		*ipdn_v4in;		/* IPv4 traffic in hook */
225 	hook_t		*ipdn_v4out;		/* IPv4 traffice out hook */
226 	hook_t		*ipdn_v6in;		/* IPv6 traffic in hook */
227 	hook_t		*ipdn_v6out;		/* IPv6 traffic out hook */
228 	int		ipdn_enabled;		/* which perturbs are on */
229 	int		ipdn_corrupt;		/* corrupt percentage */
230 	int		ipdn_drop;		/* drop percentage */
231 	uint_t		ipdn_delay;		/* delay us */
232 	long		ipdn_rand;		/* random seed */
233 } ipd_netstack_t;
234 
235 /*
236  * ipd internal variables
237  */
238 static dev_info_t	*ipd_devi;		/* device info */
239 static net_instance_t	*ipd_neti;		/* net_instance for hooks */
240 static unsigned int	ipd_max_delay = IPD_MAX_DELAY;	/* max delay in us */
241 static kmutex_t		ipd_nsl_lock;		/* lock for the nestack list */
242 static list_t		ipd_nsl;		/* list of netstacks */
243 static kmutex_t		ipd_nactive_lock;	/* lock for nactive */
244 static unsigned int	ipd_nactive; 		/* number of active netstacks */
245 static int		ipd_nactive_fudge = 4;	/* amount to fudge by in list */
246 
247 /*
248  * Note that this random number implementation is based upon the old BSD 4.1
249  * rand. It's good enough for us!
250  */
251 static int
252 ipd_nextrand(ipd_netstack_t *ins)
253 {
254 	ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345;
255 	return (ins->ipdn_rand & 0x7fffffff);
256 }
257 
258 static void
259 ipd_ksbump(kstat_named_t *nkp)
260 {
261 	atomic_inc_64(&nkp->value.ui64);
262 }
263 
264 /*
265  * This is where all the magic actually happens. The way that this works is we
266  * grab the ins lock to basically get a copy of all the data that we need to do
267  * our job and then let it go to minimize contention. In terms of actual work on
268  * the packet we do them in the following order:
269  *
270  * - drop
271  * - delay
272  * - corrupt
273  */
274 /*ARGSUSED*/
275 static int
276 ipd_hook(hook_event_token_t event, hook_data_t data, void *arg)
277 {
278 	unsigned char *crp;
279 	int dwait, corrupt, drop, rand, off, status;
280 	mblk_t *mbp;
281 	ipd_netstack_t *ins = arg;
282 	hook_pkt_event_t *pkt = (hook_pkt_event_t *)data;
283 
284 	mutex_enter(&ins->ipdn_lock);
285 	status = ins->ipdn_status;
286 	dwait = ins->ipdn_delay;
287 	corrupt = ins->ipdn_corrupt;
288 	drop = ins->ipdn_drop;
289 	rand = ipd_nextrand(ins);
290 	mutex_exit(&ins->ipdn_lock);
291 
292 	/*
293 	 * This probably cannot happen, but we'll do an extra guard just in
294 	 * case.
295 	 */
296 	if (status & IPDN_STATUS_CONDEMNED)
297 		return (0);
298 
299 	if (drop != 0 && rand % 100 < drop) {
300 		freemsg(*pkt->hpe_mp);
301 		*pkt->hpe_mp = NULL;
302 		pkt->hpe_mb = NULL;
303 		pkt->hpe_hdr = NULL;
304 		ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops);
305 
306 		return (1);
307 	}
308 
309 	if (dwait != 0) {
310 		if (dwait < TICK_TO_USEC(1))
311 			drv_usecwait(dwait);
312 		else
313 			delay(drv_usectohz(dwait));
314 		ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays);
315 	}
316 
317 	if (corrupt != 0 && rand % 100 < corrupt) {
318 		/*
319 		 * Since we're corrupting the mblk, just corrupt everything in
320 		 * the chain. While we could corrupt the entire packet, that's a
321 		 * little strong. Instead we're going to just change one of the
322 		 * bytes in each mblock.
323 		 */
324 		mbp = *pkt->hpe_mp;
325 		while (mbp != NULL) {
326 			if (mbp->b_wptr == mbp->b_rptr)
327 				continue;
328 
329 			/*
330 			 * While pfhooks probably won't send us anything else,
331 			 * let's just be extra careful. The stack probably isn't
332 			 * as resiliant to corruption of control messages.
333 			 */
334 			if (DB_TYPE(mbp) != M_DATA)
335 				continue;
336 
337 			off = rand % ((uintptr_t)mbp->b_wptr -
338 			    (uintptr_t)mbp->b_rptr);
339 			crp = mbp->b_rptr + off;
340 			off = rand % 8;
341 			*crp = *crp ^ (1 << off);
342 
343 			mbp = mbp->b_cont;
344 		}
345 		ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts);
346 	}
347 
348 	return (0);
349 }
350 
351 /*
352  * Sets up and registers all the proper hooks needed for the netstack to capture
353  * packets. Callers are assumed to already be holding the ipd_netstack_t's lock.
354  * If there is a failure in setting something up, it is the responsibility of
355  * this function to clean it up. Once this function has been called, it should
356  * not be called until a corresponding call to tear down the hooks has been
357  * done.
358  */
359 static int
360 ipd_setup_hooks(ipd_netstack_t *ins)
361 {
362 	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
363 	ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET);
364 	if (ins->ipdn_v4hdl == NULL)
365 		goto cleanup;
366 
367 	ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6);
368 	if (ins->ipdn_v6hdl == NULL)
369 		goto cleanup;
370 
371 	ins->ipdn_v4in = hook_alloc(HOOK_VERSION);
372 	if (ins->ipdn_v4in == NULL)
373 		goto cleanup;
374 
375 	ins->ipdn_v4in->h_flags = 0;
376 	ins->ipdn_v4in->h_hint = HH_NONE;
377 	ins->ipdn_v4in->h_hintvalue = 0;
378 	ins->ipdn_v4in->h_func = ipd_hook;
379 	ins->ipdn_v4in->h_arg = ins;
380 	ins->ipdn_v4in->h_name = "ipd IPv4 in";
381 
382 	if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
383 	    ins->ipdn_v4in) != 0)
384 		goto cleanup;
385 	ins->ipdn_hooked |= IPDN_HOOK_V4IN;
386 
387 	ins->ipdn_v4out = hook_alloc(HOOK_VERSION);
388 	if (ins->ipdn_v4out == NULL)
389 		goto cleanup;
390 	ins->ipdn_v4out->h_flags = 0;
391 	ins->ipdn_v4out->h_hint = HH_NONE;
392 	ins->ipdn_v4out->h_hintvalue = 0;
393 	ins->ipdn_v4out->h_func = ipd_hook;
394 	ins->ipdn_v4out->h_arg = ins;
395 	ins->ipdn_v4out->h_name = "ipd IPv4 out";
396 
397 	if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
398 	    ins->ipdn_v4out) != 0)
399 		goto cleanup;
400 	ins->ipdn_hooked |= IPDN_HOOK_V4OUT;
401 
402 	ins->ipdn_v6in = hook_alloc(HOOK_VERSION);
403 	if (ins->ipdn_v6in == NULL)
404 		goto cleanup;
405 	ins->ipdn_v6in->h_flags = 0;
406 	ins->ipdn_v6in->h_hint = HH_NONE;
407 	ins->ipdn_v6in->h_hintvalue = 0;
408 	ins->ipdn_v6in->h_func = ipd_hook;
409 	ins->ipdn_v6in->h_arg = ins;
410 	ins->ipdn_v6in->h_name = "ipd IPv6 in";
411 
412 	if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
413 	    ins->ipdn_v6in) != 0)
414 		goto cleanup;
415 	ins->ipdn_hooked |= IPDN_HOOK_V6IN;
416 
417 	ins->ipdn_v6out = hook_alloc(HOOK_VERSION);
418 	if (ins->ipdn_v6out == NULL)
419 		goto cleanup;
420 	ins->ipdn_v6out->h_flags = 0;
421 	ins->ipdn_v6out->h_hint = HH_NONE;
422 	ins->ipdn_v6out->h_hintvalue = 0;
423 	ins->ipdn_v6out->h_func = ipd_hook;
424 	ins->ipdn_v6out->h_arg = ins;
425 	ins->ipdn_v6out->h_name = "ipd IPv6 out";
426 
427 	if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
428 	    ins->ipdn_v6out) != 0)
429 		goto cleanup;
430 	ins->ipdn_hooked |= IPDN_HOOK_V6OUT;
431 	mutex_enter(&ipd_nactive_lock);
432 	ipd_nactive++;
433 	mutex_exit(&ipd_nactive_lock);
434 
435 	return (0);
436 
437 cleanup:
438 	if (ins->ipdn_hooked & IPDN_HOOK_V6OUT)
439 		(void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
440 		    ins->ipdn_v6out);
441 
442 	if (ins->ipdn_hooked & IPDN_HOOK_V6IN)
443 		(void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
444 		    ins->ipdn_v6in);
445 
446 	if (ins->ipdn_hooked & IPDN_HOOK_V4OUT)
447 		(void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
448 		    ins->ipdn_v4out);
449 
450 	if (ins->ipdn_hooked & IPDN_HOOK_V4IN)
451 		(void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
452 		    ins->ipdn_v4in);
453 
454 	ins->ipdn_hooked = IPDN_HOOK_NONE;
455 
456 	if (ins->ipdn_v6out != NULL)
457 		hook_free(ins->ipdn_v6out);
458 
459 	if (ins->ipdn_v6in != NULL)
460 		hook_free(ins->ipdn_v6in);
461 
462 	if (ins->ipdn_v4out != NULL)
463 		hook_free(ins->ipdn_v4out);
464 
465 	if (ins->ipdn_v4in != NULL)
466 		hook_free(ins->ipdn_v4in);
467 
468 	if (ins->ipdn_v6hdl != NULL)
469 		(void) net_protocol_release(ins->ipdn_v6hdl);
470 
471 	if (ins->ipdn_v4hdl != NULL)
472 		(void) net_protocol_release(ins->ipdn_v4hdl);
473 
474 	return (1);
475 }
476 
477 static void
478 ipd_teardown_hooks(ipd_netstack_t *ins)
479 {
480 	ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL);
481 	VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
482 	    ins->ipdn_v6out) == 0);
483 	VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
484 	    ins->ipdn_v6in) == 0);
485 	VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
486 	    ins->ipdn_v4out) == 0);
487 	VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
488 	    ins->ipdn_v4in) == 0);
489 
490 	ins->ipdn_hooked = IPDN_HOOK_NONE;
491 
492 	hook_free(ins->ipdn_v6out);
493 	hook_free(ins->ipdn_v6in);
494 	hook_free(ins->ipdn_v4out);
495 	hook_free(ins->ipdn_v4in);
496 
497 	VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0);
498 	VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0);
499 
500 	mutex_enter(&ipd_nactive_lock);
501 	ipd_nactive--;
502 	mutex_exit(&ipd_nactive_lock);
503 }
504 
505 static int
506 ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable)
507 {
508 	int olden, rval;
509 	olden = ins->ipdn_enabled;
510 
511 	if (enable)
512 		ins->ipdn_enabled |= type;
513 	else
514 		ins->ipdn_enabled &= ~type;
515 
516 	/*
517 	 * If hooks were previously enabled.
518 	 */
519 	if (olden == 0 && ins->ipdn_enabled != 0) {
520 		rval = ipd_setup_hooks(ins);
521 		if (rval != 0) {
522 			ins->ipdn_enabled &= ~type;
523 			ASSERT(ins->ipdn_enabled == 0);
524 			return (rval);
525 		}
526 
527 		return (0);
528 	}
529 
530 	if (olden != 0 && ins->ipdn_enabled == 0) {
531 		ASSERT(olden != 0);
532 
533 		/*
534 		 * We have to drop the lock here, lest we cause a deadlock.
535 		 * Unfortunately, there may be hooks that are running and are
536 		 * actively in flight and we have to call the unregister
537 		 * function. Due to the hooks framework, if there is an inflight
538 		 * hook (most likely right now), and we are holding the
539 		 * netstack's lock, those hooks will never return. This is
540 		 * unfortunate.
541 		 *
542 		 * Because we only come into this path holding the list lock, we
543 		 * know that only way that someone else can come in and get to
544 		 * this structure is via the hook callbacks which are going to
545 		 * only be doing reads. They'll also see that everything has
546 		 * been disabled and return. So while this is unfortunate, it
547 		 * should be relatively safe.
548 		 */
549 		mutex_exit(&ins->ipdn_lock);
550 		ipd_teardown_hooks(ins);
551 		mutex_enter(&ins->ipdn_lock);
552 		return (0);
553 	}
554 
555 	/*
556 	 * Othwerise, nothing should have changed here.
557 	 */
558 	ASSERT((olden == 0) == (ins->ipdn_enabled == 0));
559 	return (0);
560 }
561 
562 static int
563 ipd_toggle_corrupt(ipd_netstack_t *ins, int percent)
564 {
565 	int rval;
566 
567 	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
568 
569 	if (percent < 0 || percent > 100)
570 		return (ERANGE);
571 
572 	/*
573 	 * If we've been asked to set the value to a value that we already have,
574 	 * great, then we're done.
575 	 */
576 	if (percent == ins->ipdn_corrupt)
577 		return (0);
578 
579 	ins->ipdn_corrupt = percent;
580 	rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0);
581 
582 	/*
583 	 * If ipd_check_hooks_failed, that must mean that we failed to set up
584 	 * the hooks, so we are going to effectively zero out and fail the
585 	 * request to enable corruption.
586 	 */
587 	if (rval != 0)
588 		ins->ipdn_corrupt = 0;
589 
590 	return (rval);
591 }
592 
593 static int
594 ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
595 {
596 	int rval;
597 
598 	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
599 
600 	if (delay > ipd_max_delay)
601 		return (ERANGE);
602 
603 	/*
604 	 * If we've been asked to set the value to a value that we already have,
605 	 * great, then we're done.
606 	 */
607 	if (delay == ins->ipdn_delay)
608 		return (0);
609 
610 	ins->ipdn_delay = delay;
611 	rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0);
612 
613 	/*
614 	 * If ipd_check_hooks_failed, that must mean that we failed to set up
615 	 * the hooks, so we are going to effectively zero out and fail the
616 	 * request to enable corruption.
617 	 */
618 	if (rval != 0)
619 		ins->ipdn_delay = 0;
620 
621 	return (rval);
622 }
623 static int
624 ipd_toggle_drop(ipd_netstack_t *ins, int percent)
625 {
626 	int rval;
627 
628 	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
629 
630 	if (percent < 0 || percent > 100)
631 		return (ERANGE);
632 
633 	/*
634 	 * If we've been asked to set the value to a value that we already have,
635 	 * great, then we're done.
636 	 */
637 	if (percent == ins->ipdn_drop)
638 		return (0);
639 
640 	ins->ipdn_drop = percent;
641 	rval = ipd_check_hooks(ins, IPD_DROP, percent != 0);
642 
643 	/*
644 	 * If ipd_check_hooks_failed, that must mean that we failed to set up
645 	 * the hooks, so we are going to effectively zero out and fail the
646 	 * request to enable corruption.
647 	 */
648 	if (rval != 0)
649 		ins->ipdn_drop = 0;
650 
651 	return (rval);
652 }
653 
654 static int
655 ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd)
656 {
657 	zoneid_t zid;
658 	ipd_netstack_t *ins;
659 	int rval = 0;
660 
661 	/*
662 	 * If the zone that we're coming from is not the GZ, then we ignore it
663 	 * completely and then instead just set the zoneid to be that of the
664 	 * caller. If the zoneid is that of the GZ, then we don't touch this
665 	 * value.
666 	 */
667 	zid = crgetzoneid(cr);
668 	if (zid != GLOBAL_ZONEID)
669 		ipi->ipip_zoneid = zid;
670 
671 	if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
672 	    zid != GLOBAL_ZONEID)
673 		return (EPERM);
674 
675 	/*
676 	 * We need to hold the ipd_nsl_lock throughout the entire operation,
677 	 * otherwise someone else could come in and remove us from the list and
678 	 * free us, e.g. the netstack destroy handler. By holding the lock, we
679 	 * stop it from being able to do anything wrong.
680 	 */
681 	mutex_enter(&ipd_nsl_lock);
682 	for (ins = list_head(&ipd_nsl); ins != NULL;
683 	    ins = list_next(&ipd_nsl, ins)) {
684 		if (ins->ipdn_zoneid == ipi->ipip_zoneid)
685 			break;
686 	}
687 
688 	if (ins == NULL) {
689 		mutex_exit(&ipd_nsl_lock);
690 		return (EINVAL);
691 	}
692 
693 	mutex_enter(&ins->ipdn_lock);
694 
695 	if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
696 		rval = ESHUTDOWN;
697 		goto cleanup;
698 	}
699 
700 	switch (cmd) {
701 	case IPDIOC_CORRUPT:
702 		rval = ipd_toggle_corrupt(ins, ipi->ipip_arg);
703 		break;
704 	case IPDIOC_DELAY:
705 		rval = ipd_toggle_delay(ins, ipi->ipip_arg);
706 		break;
707 	case IPDIOC_DROP:
708 		rval = ipd_toggle_drop(ins, ipi->ipip_arg);
709 		break;
710 	}
711 
712 cleanup:
713 	mutex_exit(&ins->ipdn_lock);
714 	mutex_exit(&ipd_nsl_lock);
715 	return (rval);
716 }
717 
718 static int
719 ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr)
720 {
721 	zoneid_t zid;
722 	ipd_netstack_t *ins;
723 	int rval = 0;
724 
725 	/*
726 	 * See ipd_ioctl_perturb for the rational here.
727 	 */
728 	zid = crgetzoneid(cr);
729 	if (zid != GLOBAL_ZONEID)
730 		ipi->ipip_zoneid = zid;
731 
732 	if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
733 	    zid != GLOBAL_ZONEID)
734 		return (EPERM);
735 
736 	mutex_enter(&ipd_nsl_lock);
737 	for (ins = list_head(&ipd_nsl); ins != NULL;
738 	    ins = list_next(&ipd_nsl, ins)) {
739 		if (ins->ipdn_zoneid == ipi->ipip_zoneid)
740 			break;
741 	}
742 
743 	if (ins == NULL) {
744 		mutex_exit(&ipd_nsl_lock);
745 		return (EINVAL);
746 	}
747 
748 	mutex_enter(&ins->ipdn_lock);
749 
750 	/*
751 	 * If this is condemned, that means it's very shortly going to be torn
752 	 * down. In that case, there's no reason to actually do anything here,
753 	 * as it will all be done rather shortly in the destroy function.
754 	 * Furthermore, because condemned corresponds with it having hit
755 	 * shutdown, we know that no more packets can be received by this
756 	 * netstack. All this translates to a no-op.
757 	 */
758 	if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
759 		rval = 0;
760 		goto cleanup;
761 	}
762 
763 	rval = EINVAL;
764 	/*
765 	 * Go through and disable the requested pieces. We can safely ignore the
766 	 * return value of ipd_check_hooks because the removal case should never
767 	 * fail, we verify that in the hook teardown case.
768 	 */
769 	if (ipi->ipip_arg & IPD_CORRUPT) {
770 		ins->ipdn_corrupt = 0;
771 		(void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE);
772 		rval = 0;
773 	}
774 
775 	if (ipi->ipip_arg & IPD_DELAY) {
776 		ins->ipdn_delay = 0;
777 		(void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE);
778 		rval = 0;
779 	}
780 
781 	if (ipi->ipip_arg & IPD_DROP) {
782 		ins->ipdn_drop = 0;
783 		(void) ipd_check_hooks(ins, IPD_DROP, B_FALSE);
784 		rval = 0;
785 	}
786 
787 cleanup:
788 	mutex_exit(&ins->ipdn_lock);
789 	mutex_exit(&ipd_nsl_lock);
790 	return (rval);
791 }
792 
793 /*
794  * When this function is called, the value of the ipil_nzones argument controls
795  * how this function works. When called with a value of zero, then we treat that
796  * as the caller asking us what's a reasonable number of entries for me to
797  * allocate memory for. If the zone is the global zone, then we tell them how
798  * many folks are currently active and add a fudge factor. Otherwise the answer
799  * is always one.
800  *
801  * In the non-zero case, we give them that number of zone ids. While this isn't
802  * quite ideal as it might mean that someone misses something, this generally
803  * won't be an issue, as it involves a rather tight race condition in the
804  * current ipdadm implementation.
805  */
806 static int
807 ipd_ioctl_list(intptr_t arg, cred_t *cr)
808 {
809 	zoneid_t zid;
810 	ipd_ioc_info_t *configs;
811 	ipd_netstack_t *ins;
812 	uint_t azones, rzones, nzones, cur;
813 	int rval = 0;
814 	STRUCT_DECL(ipd_ioc_list, h);
815 
816 	STRUCT_INIT(h, get_udatamodel());
817 	if (ddi_copyin((void *)arg, STRUCT_BUF(h),
818 	    STRUCT_SIZE(h), 0) != 0)
819 		return (EFAULT);
820 
821 	zid = crgetzoneid(cr);
822 
823 	rzones = STRUCT_FGET(h, ipil_nzones);
824 	if (rzones == 0) {
825 		if (zid == GLOBAL_ZONEID) {
826 			mutex_enter(&ipd_nactive_lock);
827 			rzones = ipd_nactive + ipd_nactive_fudge;
828 			mutex_exit(&ipd_nactive_lock);
829 		} else {
830 			rzones = 1;
831 		}
832 		STRUCT_FSET(h, ipil_nzones, rzones);
833 		if (ddi_copyout(STRUCT_BUF(h), (void *)arg,
834 		    STRUCT_SIZE(h), 0) != 0)
835 			return (EFAULT);
836 
837 		return (0);
838 	}
839 
840 	mutex_enter(&ipd_nsl_lock);
841 	if (zid == GLOBAL_ZONEID) {
842 		azones = ipd_nactive;
843 	} else {
844 		azones = 1;
845 	}
846 
847 	configs = kmem_alloc(sizeof (ipd_ioc_info_t) * azones, KM_SLEEP);
848 	cur = 0;
849 	for (ins = list_head(&ipd_nsl); ins != NULL;
850 	    ins = list_next(&ipd_nsl, ins)) {
851 		if (ins->ipdn_enabled == 0)
852 			continue;
853 
854 		ASSERT(cur < azones);
855 
856 		if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) {
857 			configs[cur].ipii_zoneid = ins->ipdn_zoneid;
858 
859 			mutex_enter(&ins->ipdn_lock);
860 			configs[cur].ipii_corrupt = ins->ipdn_corrupt;
861 			configs[cur].ipii_delay = ins->ipdn_delay;
862 			configs[cur].ipii_drop = ins->ipdn_drop;
863 			mutex_exit(&ins->ipdn_lock);
864 
865 			++cur;
866 		}
867 
868 		if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid)
869 			break;
870 	}
871 	mutex_exit(&ipd_nsl_lock);
872 
873 	ASSERT(zid != GLOBAL_ZONEID || cur == azones);
874 
875 	if (cur == 0)
876 		STRUCT_FSET(h, ipil_nzones, 0);
877 	else
878 		STRUCT_FSET(h, ipil_nzones, cur);
879 
880 	nzones = MIN(cur, rzones);
881 	if (nzones > 0) {
882 		if (ddi_copyout(configs, STRUCT_FGETP(h, ipil_info),
883 		    nzones * sizeof (ipd_ioc_info_t), NULL) != 0)
884 			rval = EFAULT;
885 	}
886 
887 	kmem_free(configs, sizeof (ipd_ioc_info_t) * azones);
888 	if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0)
889 		return (EFAULT);
890 
891 	return (rval);
892 }
893 
894 static void *
895 ipd_nin_create(const netid_t id)
896 {
897 	ipd_netstack_t *ins;
898 	ipd_nskstat_t *ink;
899 
900 	ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP);
901 	ins->ipdn_status = IPDN_STATUS_DISABLED;
902 	ins->ipdn_netid = id;
903 	ins->ipdn_zoneid = netstackid_to_zoneid(id);
904 	ins->ipdn_rand = gethrtime();
905 	mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL);
906 
907 	ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid,
908 	    "ipd", "net",  KSTAT_TYPE_NAMED,
909 	    sizeof (ipd_nskstat_t) / sizeof (kstat_named_t),
910 	    KSTAT_FLAG_VIRTUAL);
911 
912 	if (ins->ipdn_kstat != NULL) {
913 		if (ins->ipdn_zoneid != GLOBAL_ZONEID)
914 			kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID);
915 
916 		ink = &ins->ipdn_ksdata;
917 		ins->ipdn_kstat->ks_data = ink;
918 		kstat_named_init(&ink->ink_ncorrupts, "corrupts",
919 		    KSTAT_DATA_UINT64);
920 		kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64);
921 		kstat_named_init(&ink->ink_ndelays, "delays",
922 		    KSTAT_DATA_UINT64);
923 		kstat_install(ins->ipdn_kstat);
924 	}
925 
926 	mutex_enter(&ipd_nsl_lock);
927 	list_insert_tail(&ipd_nsl, ins);
928 	mutex_exit(&ipd_nsl_lock);
929 
930 	return (ins);
931 }
932 
933 static void
934 ipd_nin_shutdown(const netid_t id, void *arg)
935 {
936 	ipd_netstack_t *ins = arg;
937 
938 	VERIFY(id == ins->ipdn_netid);
939 	mutex_enter(&ins->ipdn_lock);
940 	ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED ||
941 	    ins->ipdn_status == IPDN_STATUS_ENABLED);
942 	ins->ipdn_status |= IPDN_STATUS_CONDEMNED;
943 	if (ins->ipdn_kstat != NULL)
944 		net_kstat_delete(id, ins->ipdn_kstat);
945 	mutex_exit(&ins->ipdn_lock);
946 }
947 
948 /*ARGSUSED*/
949 static void
950 ipd_nin_destroy(const netid_t id, void *arg)
951 {
952 	ipd_netstack_t *ins = arg;
953 
954 	/*
955 	 * At this point none of the hooks should be able to fire because the
956 	 * zone has been shutdown and we are in the process of destroying it.
957 	 * Thus it should not be possible for someone else to come in and grab
958 	 * our ipd_netstack_t for this zone. Because of that, we know that we
959 	 * are the only ones who could be running here.
960 	 */
961 	mutex_enter(&ipd_nsl_lock);
962 	list_remove(&ipd_nsl, ins);
963 	mutex_exit(&ipd_nsl_lock);
964 
965 	if (ins->ipdn_hooked)
966 		ipd_teardown_hooks(ins);
967 	mutex_destroy(&ins->ipdn_lock);
968 	kmem_free(ins, sizeof (ipd_netstack_t));
969 }
970 
971 /*ARGSUSED*/
972 static int
973 ipd_open(dev_t *devp, int flag, int otype, cred_t *credp)
974 {
975 	if (flag & FEXCL || flag & FNDELAY)
976 		return (EINVAL);
977 
978 	if (otype != OTYP_CHR)
979 		return (EINVAL);
980 
981 	if (!(flag & FREAD && flag & FWRITE))
982 		return (EINVAL);
983 
984 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
985 		return (EPERM);
986 
987 	return (0);
988 }
989 
990 /*ARGSUSED*/
991 static int
992 ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
993 {
994 	int rval;
995 	ipd_ioc_perturb_t ipip;
996 
997 	switch (cmd) {
998 	case IPDIOC_CORRUPT:
999 	case IPDIOC_DELAY:
1000 	case IPDIOC_DROP:
1001 		if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
1002 		    0) != 0)
1003 			return (EFAULT);
1004 		rval = ipd_ioctl_perturb(&ipip, cr, cmd);
1005 		return (rval);
1006 	case IPDIOC_REMOVE:
1007 		if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
1008 		    0) != 0)
1009 			return (EFAULT);
1010 		rval = ipd_ioctl_remove(&ipip, cr);
1011 		return (rval);
1012 	case IPDIOC_LIST:
1013 		/*
1014 		 * Because the list ioctl doesn't have a fixed-size struct due
1015 		 * to needing to pass around a pointer, we instead delegate the
1016 		 * copyin logic to the list code.
1017 		 */
1018 		return (ipd_ioctl_list(arg, cr));
1019 	default:
1020 		break;
1021 	}
1022 	return (ENOTTY);
1023 }
1024 
/*
 * close(9E) entry point. Nothing is torn down here; the function ignores all
 * of its arguments and unconditionally returns 0.
 */
/*ARGSUSED*/
static int
ipd_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	return (0);
}
1031 
1032 static int
1033 ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1034 {
1035 	minor_t instance;
1036 
1037 	if (cmd != DDI_ATTACH)
1038 		return (DDI_FAILURE);
1039 
1040 	if (ipd_devi != NULL)
1041 		return (DDI_FAILURE);
1042 
1043 	instance = ddi_get_instance(dip);
1044 	if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance,
1045 	    DDI_PSEUDO, 0) == DDI_FAILURE)
1046 		return (DDI_FAILURE);
1047 
1048 	ipd_neti = net_instance_alloc(NETINFO_VERSION);
1049 	if (ipd_neti == NULL) {
1050 		ddi_remove_minor_node(dip, NULL);
1051 		return (DDI_FAILURE);
1052 	}
1053 
1054 	/*
1055 	 * Note that these global structures MUST be initialized before we call
1056 	 * net_instance_register, as that will instantly cause us to drive into
1057 	 * the ipd_nin_create callbacks.
1058 	 */
1059 	list_create(&ipd_nsl, sizeof (ipd_netstack_t),
1060 	    offsetof(ipd_netstack_t, ipdn_link));
1061 	mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL);
1062 	mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL);
1063 
1064 	/* Note, net_instance_alloc sets the version. */
1065 	ipd_neti->nin_name = "ipd";
1066 	ipd_neti->nin_create = ipd_nin_create;
1067 	ipd_neti->nin_destroy = ipd_nin_destroy;
1068 	ipd_neti->nin_shutdown = ipd_nin_shutdown;
1069 	if (net_instance_register(ipd_neti) == DDI_FAILURE) {
1070 		net_instance_free(ipd_neti);
1071 		ddi_remove_minor_node(dip, NULL);
1072 	}
1073 
1074 	ddi_report_dev(dip);
1075 	ipd_devi = dip;
1076 
1077 	return (DDI_SUCCESS);
1078 }
1079 
1080 /*ARGSUSED*/
1081 static int
1082 ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1083 {
1084 	int error;
1085 
1086 	switch (infocmd) {
1087 	case DDI_INFO_DEVT2DEVINFO:
1088 		*result = ipd_devi;
1089 		error = DDI_SUCCESS;
1090 		break;
1091 	case DDI_INFO_DEVT2INSTANCE:
1092 		*result = (void *)(uintptr_t)getminor((dev_t)arg);
1093 		error = DDI_SUCCESS;
1094 	default:
1095 		error = DDI_FAILURE;
1096 		break;
1097 	}
1098 
1099 	return (error);
1100 }
1101 
1102 static int
1103 ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1104 {
1105 	if (cmd != DDI_DETACH)
1106 		return (DDI_FAILURE);
1107 
1108 	mutex_enter(&ipd_nactive_lock);
1109 	if (ipd_nactive > 0) {
1110 		mutex_exit(&ipd_nactive_lock);
1111 		return (EBUSY);
1112 	}
1113 	mutex_exit(&ipd_nactive_lock);
1114 	ASSERT(dip == ipd_devi);
1115 	ddi_remove_minor_node(dip, NULL);
1116 	ipd_devi = NULL;
1117 
1118 	if (ipd_neti != NULL) {
1119 		VERIFY(net_instance_unregister(ipd_neti) == 0);
1120 		net_instance_free(ipd_neti);
1121 	}
1122 
1123 	mutex_destroy(&ipd_nsl_lock);
1124 	mutex_destroy(&ipd_nactive_lock);
1125 	list_destroy(&ipd_nsl);
1126 
1127 	return (DDI_SUCCESS);
1128 }
1129 
/*
 * Character/block entry points for the ipd device. Only open, close and
 * ioctl are implemented by ipd itself; every other data-path slot is filled
 * with nodev (nochpoll for poll), and property operations go to the generic
 * ddi_prop_op. ipd is not a STREAMS driver, so the streamtab slot is NULL.
 */
static struct cb_ops ipd_cb_ops = {
	ipd_open,	/* open */
	ipd_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	ipd_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* cb_prop_op */
	NULL,		/* streamtab */
	D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,		/* rev */
	nodev,		/* aread */
	nodev		/* awrite */
};
1150 
/*
 * Device operations vector for ipd. It wires up ipd_getinfo, ipd_attach and
 * ipd_detach, and points at ipd_cb_ops for the character entry points. There
 * are no bus operations, reset and power are nodev, and quiesce is handled
 * by ddi_quiesce_not_needed.
 */
static struct dev_ops ipd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	ipd_getinfo,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	ipd_attach,		/* attach */
	ipd_detach,		/* detach */
	nodev,			/* reset */
	&ipd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed	/* quiesce */
};
1165 
/*
 * Driver linkage: identifies this module as a device driver (mod_driverops),
 * gives it a human-readable description, and points at its dev_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"Internet packet disturber",
	&ipd_ops
};
1171 
/*
 * Module linkage handed to mod_install/mod_info/mod_remove; its
 * NULL-terminated list contains the single driver linkage above.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	{ (void *)&modldrv, NULL }
};
1176 
1177 int
1178 _init(void)
1179 {
1180 	return (mod_install(&modlinkage));
1181 }
1182 
1183 int
1184 _info(struct modinfo *modinfop)
1185 {
1186 	return (mod_info(&modlinkage, modinfop));
1187 }
1188 
1189 int
1190 _fini(void)
1191 {
1192 	return (mod_remove(&modlinkage));
1193 }
1194