1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /*
12 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
13 */
14
15 /*
16 * ipd: Internet packet disturber
17 *
18 * The purpose of ipd is to simulate congested and lossy networks when they
19 * don't actually exist. The features of these congested and lossy networks are
20 * events that end up leading to retransmits and thus kicking us out of the
21 * TCP/IP fastpath. Since normally this would require us to have an actually
22 * congested network, which can be problematic, we instead simulate this
23 * behavior.
24 *
25 * 1. ipd's operations and restrictions
26 *
27 * ipd currently has facilities to cause IP traffic to be:
28 *
29 * - Corrupted with some probability.
30 * - Delayed for a set number of microseconds.
31 * - Dropped with some probability.
32 *
33 * Each of these features is enabled on a per-zone basis. The current
34 * implementation restricts this specifically to exclusive stack zones.
35 * Enabling ipd on a given zone causes pfhooks to be installed for that zone's
36 * netstack. Because of the nature of ipd, it currently only supports exclusive
37 * stack zones and as a further restriction, it only allows the global zone
38 * administrative access. ipd can be enabled for the global zone, but doing so
39 * will cause all shared-stack zones to also be affected.
40 *
41 * 2. General architecture and Locking
42 *
43 * ipd consists of a few components. There is a per netstack data structure that
44 * is created and destroyed with the creation and destruction of each exclusive
45 * stack zone. Each of these netstacks is stored in a global list which is
46 * accessed for control of ipd via ioctls. The following diagram touches on the
47 * data structures that are used throughout ipd.
48 *
49 * ADMINISTRATIVE DATA PATH
50 *
51 * +--------+ +------+ +------+
52 * | ipdadm | | ip | | nics |
53 * +--------+ +------+ +------+
54 * | ^ | |
55 * | | ioctl(2) | |
56 * V | V V
57 * +----------+ +-------------------------+
58 * | /dev/ipd | | pfhooks packet callback | == ipd_hook()
59 * +----------+ +-------------------------+
60 * | |
61 * | |
62 * V |
63 * +----------------+ |
64 * | list_t ipd_nsl |------+ |
65 * +----------------+ | |
66 * | |
67 * V per netstack V
68 * +----------------------------+
69 * | ipd_netstack_t |
70 * +----------------------------+
71 *
72 * ipd has two different entry points, one is administrative, the other is the
73 * data path. The administrative path is accessed by a userland component called
74 * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd.
75 * If the administrative path enables a specific zone, then the data path will
76 * become active for that zone. Any packet that leaves that zone's IP stack or
77 * is going to enter it, comes through the callback specified in the hook_t(9S)
78 * structure. This will cause each packet to go through ipd_hook().
79 *
80 * While the locking inside of ipd should be straightforward, unfortunately, the
81 * pfhooks subsystem necessarily complicates this a little bit. There are
82 * currently three different sets of locks in ipd.
83 *
84 * - Global lock N on the netstack list.
85 * - Global lock A on the active count.
86 * - Per-netstack data structure lock Z.
87 *
88 * # Locking rules
89 *
90 * L.1a N must always be acquired first and released last
91 *
92 * If you need to acquire the netstack list lock, either for reading or writing,
93 * then N must be acquired first and before any other locks. It may not be
94 * dropped before any other lock.
95 *
96 * L.1b N must only be acquired from the administrative path and zone creation,
97 * shutdown, and destruct callbacks.
98 *
99 * The data path, e.g. receiving the per-packet callbacks, should never be
100 * grabbing the list lock. If it is, then the architecture here needs to be
101 * reconsidered.
102 *
103 * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks
104 * are active.
105 *
106 * The way the pfhooks subsystem is designed is that a reference count is
107 * present on the hook_t while it is active. As long as that reference count is
108 * non-zero, a call to net_hook_unregister will block until it is lowered.
109 * Because the callbacks want the same lock for the netstack that is held by the
110 * administrative path calling into net_hook_unregister, we deadlock.
111 *
112 * ioctl from ipdadm remove hook_t cb (from nic) hook_t cb (from IP)
113 * ----------------------- -------------------- -------------------
114 * | | |
115 * | bump hook_t refcount |
116 * mutex_enter(ipd_nsl_lock); enter ipd_hook() bump hook_t refcount
117 * mutex acquired mutex_enter(ins->ipdn_lock); |
118 * | mutex acquired enter ipd_hook()
119 * mutex_enter(ins->ipdn_lock); | mutex_enter(ins->ipdn_lock);
120 * | | |
121 * | | |
122 * | mutex_exit(ins->ipdn_lock); |
123 * | | |
124 * mutex acquired leave ipd_hook() |
125 * | decrement hook_t refcount |
126 * | | |
127 * ipd_teardown_hooks() | |
128 * net_hook_unregister() | |
129 * cv_wait() if refcount | |
130 * | | |
131 * ---------------------------------------------------------------------------
132 *
133 * At this point, we can see that the second hook callback still doesn't have
134 * the mutex, but it has bumped the hook_t refcount. However, it will never
135 * acquire the mutex that it needs to finish its operation and decrement the
136 * refcount.
137 *
138 * Obviously, deadlocking is not acceptable, thus the following corollary to the
139 * second locking rule:
140 *
141 * L.2 Corollary: If Z is being released across a call to the pfhooks subsystem,
142 * N must be held.
143 *
144 * There is currently only one path where we have to worry about this. That is
145 * when we are removing a hook, but the zone is not being shutdown, then hooks
146 * are currently active. The only place that this currently happens is in
147 * ipd_check_hooks().
148 *
149 */
150
151 #include <sys/types.h>
152 #include <sys/file.h>
153 #include <sys/errno.h>
154 #include <sys/open.h>
155 #include <sys/cred.h>
156 #include <sys/ddi.h>
157 #include <sys/sunddi.h>
158 #include <sys/kmem.h>
159 #include <sys/conf.h>
160 #include <sys/stat.h>
161 #include <sys/cmn_err.h>
162 #include <sys/ddi.h>
163 #include <sys/sunddi.h>
164 #include <sys/modctl.h>
165 #include <sys/kstat.h>
166 #include <sys/neti.h>
167 #include <sys/list.h>
168 #include <sys/ksynch.h>
169 #include <sys/sysmacros.h>
170 #include <sys/policy.h>
171 #include <sys/atomic.h>
172 #include <sys/model.h>
173 #include <sys/strsun.h>
174
175 #include <sys/netstack.h>
176 #include <sys/hook.h>
177 #include <sys/hook_event.h>
178
179 #include <sys/ipd.h>
180
/*
 * Status bits stored in the ipdn_status member of an ipd_netstack_t.
 */
#define	IPDN_STATUS_DISABLED	(1 << 0)
#define	IPDN_STATUS_ENABLED	(1 << 1)
#define	IPDN_STATUS_CONDEMNED	(1 << 2)
184
/*
 * Bits recorded in ipdn_hooked, one per pfhooks registration, that tell us
 * which of the hooks are currently registered.
 */
#define	IPDN_HOOK_NONE		0
#define	IPDN_HOOK_V4IN		(1 << 0)
#define	IPDN_HOOK_V4OUT		(1 << 1)
#define	IPDN_HOOK_V6IN		(1 << 2)
#define	IPDN_HOOK_V6OUT		(1 << 3)
#define	IPDN_HOOK_ALL		(IPDN_HOOK_V4IN | IPDN_HOOK_V4OUT | \
				IPDN_HOOK_V6IN | IPDN_HOOK_V6OUT)
194
195 /*
196 * Per-netstack kstats.
197 */
198 typedef struct ipd_nskstat {
199 kstat_named_t ink_ndrops;
200 kstat_named_t ink_ncorrupts;
201 kstat_named_t ink_ndelays;
202 } ipd_nskstat_t;
203
204 /*
205 * Different parts of this structure have different locking semantics. The list
206 * node is not normally referenced, if it is, one has to hold the ipd_nsl_lock.
207 * The following members are read only: ipdn_netid and ipdn_zoneid. The members
208 * of the kstat structure are always accessible in the data path, but the
209 * counters must be bumped with atomic operations. The ipdn_lock protects every
210 * other aspect of this structure. Please see the big theory statement on the
211 * requirements for lock ordering.
212 */
213 typedef struct ipd_netstack {
214 list_node_t ipdn_link; /* link on ipd_nsl */
215 netid_t ipdn_netid; /* netstack id */
216 zoneid_t ipdn_zoneid; /* zone id */
217 kstat_t *ipdn_kstat; /* kstat_t ptr */
218 ipd_nskstat_t ipdn_ksdata; /* kstat data */
219 kmutex_t ipdn_lock; /* protects following members */
220 int ipdn_status; /* status flags */
221 net_handle_t ipdn_v4hdl; /* IPv4 net handle */
222 net_handle_t ipdn_v6hdl; /* IPv4 net handle */
223 int ipdn_hooked; /* are hooks registered */
224 hook_t *ipdn_v4in; /* IPv4 traffic in hook */
225 hook_t *ipdn_v4out; /* IPv4 traffice out hook */
226 hook_t *ipdn_v6in; /* IPv6 traffic in hook */
227 hook_t *ipdn_v6out; /* IPv6 traffic out hook */
228 int ipdn_enabled; /* which perturbs are on */
229 int ipdn_corrupt; /* corrupt percentage */
230 int ipdn_drop; /* drop percentage */
231 uint_t ipdn_delay; /* delay us */
232 long ipdn_rand; /* random seed */
233 } ipd_netstack_t;
234
235 /*
236 * ipd internal variables
237 */
238 static dev_info_t *ipd_devi; /* device info */
239 static net_instance_t *ipd_neti; /* net_instance for hooks */
240 static unsigned int ipd_max_delay = IPD_MAX_DELAY; /* max delay in us */
241 static kmutex_t ipd_nsl_lock; /* lock for the nestack list */
242 static list_t ipd_nsl; /* list of netstacks */
243 static kmutex_t ipd_nactive_lock; /* lock for nactive */
244 static unsigned int ipd_nactive; /* number of active netstacks */
245 static int ipd_nactive_fudge = 4; /* amount to fudge by in list */
246
247 /*
248 * Note that this random number implementation is based upon the old BSD 4.1
249 * rand. It's good enough for us!
250 */
251 static int
ipd_nextrand(ipd_netstack_t * ins)252 ipd_nextrand(ipd_netstack_t *ins)
253 {
254 ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345;
255 return (ins->ipdn_rand & 0x7fffffff);
256 }
257
258 static void
ipd_ksbump(kstat_named_t * nkp)259 ipd_ksbump(kstat_named_t *nkp)
260 {
261 atomic_inc_64(&nkp->value.ui64);
262 }
263
264 /*
265 * This is where all the magic actually happens. The way that this works is we
266 * grab the ins lock to basically get a copy of all the data that we need to do
267 * our job and then let it go to minimize contention. In terms of actual work on
268 * the packet we do them in the following order:
269 *
270 * - drop
271 * - delay
272 * - corrupt
273 */
274 /*ARGSUSED*/
275 static int
ipd_hook(hook_event_token_t event,hook_data_t data,void * arg)276 ipd_hook(hook_event_token_t event, hook_data_t data, void *arg)
277 {
278 unsigned char *crp;
279 int dwait, corrupt, drop, rand, off, status;
280 mblk_t *mbp;
281 ipd_netstack_t *ins = arg;
282 hook_pkt_event_t *pkt = (hook_pkt_event_t *)data;
283
284 mutex_enter(&ins->ipdn_lock);
285 status = ins->ipdn_status;
286 dwait = ins->ipdn_delay;
287 corrupt = ins->ipdn_corrupt;
288 drop = ins->ipdn_drop;
289 rand = ipd_nextrand(ins);
290 mutex_exit(&ins->ipdn_lock);
291
292 /*
293 * This probably cannot happen, but we'll do an extra guard just in
294 * case.
295 */
296 if (status & IPDN_STATUS_CONDEMNED)
297 return (0);
298
299 if (drop != 0 && rand % 100 < drop) {
300 freemsg(*pkt->hpe_mp);
301 *pkt->hpe_mp = NULL;
302 pkt->hpe_mb = NULL;
303 pkt->hpe_hdr = NULL;
304 ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops);
305
306 return (1);
307 }
308
309 if (dwait != 0) {
310 if (dwait < TICK_TO_USEC(1))
311 drv_usecwait(dwait);
312 else
313 delay(drv_usectohz(dwait));
314 ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays);
315 }
316
317 if (corrupt != 0 && rand % 100 < corrupt) {
318 /*
319 * Since we're corrupting the mblk, just corrupt everything in
320 * the chain. While we could corrupt the entire packet, that's a
321 * little strong. Instead we're going to just change one of the
322 * bytes in each mblock.
323 */
324 mbp = *pkt->hpe_mp;
325 while (mbp != NULL) {
326 if (mbp->b_wptr == mbp->b_rptr)
327 continue;
328
329 /*
330 * While pfhooks probably won't send us anything else,
331 * let's just be extra careful. The stack probably isn't
332 * as resiliant to corruption of control messages.
333 */
334 if (DB_TYPE(mbp) != M_DATA)
335 continue;
336
337 off = rand % ((uintptr_t)mbp->b_wptr -
338 (uintptr_t)mbp->b_rptr);
339 crp = mbp->b_rptr + off;
340 off = rand % 8;
341 *crp = *crp ^ (1 << off);
342
343 mbp = mbp->b_cont;
344 }
345 ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts);
346 }
347
348 return (0);
349 }
350
351 /*
352 * Sets up and registers all the proper hooks needed for the netstack to capture
353 * packets. Callers are assumed to already be holding the ipd_netstack_t's lock.
354 * If there is a failure in setting something up, it is the responsibility of
355 * this function to clean it up. Once this function has been called, it should
356 * not be called until a corresponding call to tear down the hooks has been
357 * done.
358 */
359 static int
ipd_setup_hooks(ipd_netstack_t * ins)360 ipd_setup_hooks(ipd_netstack_t *ins)
361 {
362 ASSERT(MUTEX_HELD(&ins->ipdn_lock));
363 ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET);
364 if (ins->ipdn_v4hdl == NULL)
365 goto cleanup;
366
367 ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6);
368 if (ins->ipdn_v6hdl == NULL)
369 goto cleanup;
370
371 ins->ipdn_v4in = hook_alloc(HOOK_VERSION);
372 if (ins->ipdn_v4in == NULL)
373 goto cleanup;
374
375 ins->ipdn_v4in->h_flags = 0;
376 ins->ipdn_v4in->h_hint = HH_NONE;
377 ins->ipdn_v4in->h_hintvalue = 0;
378 ins->ipdn_v4in->h_func = ipd_hook;
379 ins->ipdn_v4in->h_arg = ins;
380 ins->ipdn_v4in->h_name = "ipd IPv4 in";
381
382 if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
383 ins->ipdn_v4in) != 0)
384 goto cleanup;
385 ins->ipdn_hooked |= IPDN_HOOK_V4IN;
386
387 ins->ipdn_v4out = hook_alloc(HOOK_VERSION);
388 if (ins->ipdn_v4out == NULL)
389 goto cleanup;
390 ins->ipdn_v4out->h_flags = 0;
391 ins->ipdn_v4out->h_hint = HH_NONE;
392 ins->ipdn_v4out->h_hintvalue = 0;
393 ins->ipdn_v4out->h_func = ipd_hook;
394 ins->ipdn_v4out->h_arg = ins;
395 ins->ipdn_v4out->h_name = "ipd IPv4 out";
396
397 if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
398 ins->ipdn_v4out) != 0)
399 goto cleanup;
400 ins->ipdn_hooked |= IPDN_HOOK_V4OUT;
401
402 ins->ipdn_v6in = hook_alloc(HOOK_VERSION);
403 if (ins->ipdn_v6in == NULL)
404 goto cleanup;
405 ins->ipdn_v6in->h_flags = 0;
406 ins->ipdn_v6in->h_hint = HH_NONE;
407 ins->ipdn_v6in->h_hintvalue = 0;
408 ins->ipdn_v6in->h_func = ipd_hook;
409 ins->ipdn_v6in->h_arg = ins;
410 ins->ipdn_v6in->h_name = "ipd IPv6 in";
411
412 if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
413 ins->ipdn_v6in) != 0)
414 goto cleanup;
415 ins->ipdn_hooked |= IPDN_HOOK_V6IN;
416
417 ins->ipdn_v6out = hook_alloc(HOOK_VERSION);
418 if (ins->ipdn_v6out == NULL)
419 goto cleanup;
420 ins->ipdn_v6out->h_flags = 0;
421 ins->ipdn_v6out->h_hint = HH_NONE;
422 ins->ipdn_v6out->h_hintvalue = 0;
423 ins->ipdn_v6out->h_func = ipd_hook;
424 ins->ipdn_v6out->h_arg = ins;
425 ins->ipdn_v6out->h_name = "ipd IPv6 out";
426
427 if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
428 ins->ipdn_v6out) != 0)
429 goto cleanup;
430 ins->ipdn_hooked |= IPDN_HOOK_V6OUT;
431 mutex_enter(&ipd_nactive_lock);
432 ipd_nactive++;
433 mutex_exit(&ipd_nactive_lock);
434
435 return (0);
436
437 cleanup:
438 if (ins->ipdn_hooked & IPDN_HOOK_V6OUT)
439 (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
440 ins->ipdn_v6out);
441
442 if (ins->ipdn_hooked & IPDN_HOOK_V6IN)
443 (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
444 ins->ipdn_v6in);
445
446 if (ins->ipdn_hooked & IPDN_HOOK_V4OUT)
447 (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
448 ins->ipdn_v4out);
449
450 if (ins->ipdn_hooked & IPDN_HOOK_V4IN)
451 (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
452 ins->ipdn_v4in);
453
454 ins->ipdn_hooked = IPDN_HOOK_NONE;
455
456 if (ins->ipdn_v6out != NULL)
457 hook_free(ins->ipdn_v6out);
458
459 if (ins->ipdn_v6in != NULL)
460 hook_free(ins->ipdn_v6in);
461
462 if (ins->ipdn_v4out != NULL)
463 hook_free(ins->ipdn_v4out);
464
465 if (ins->ipdn_v4in != NULL)
466 hook_free(ins->ipdn_v4in);
467
468 if (ins->ipdn_v6hdl != NULL)
469 (void) net_protocol_release(ins->ipdn_v6hdl);
470
471 if (ins->ipdn_v4hdl != NULL)
472 (void) net_protocol_release(ins->ipdn_v4hdl);
473
474 return (1);
475 }
476
477 static void
ipd_teardown_hooks(ipd_netstack_t * ins)478 ipd_teardown_hooks(ipd_netstack_t *ins)
479 {
480 ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL);
481 VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
482 ins->ipdn_v6out) == 0);
483 VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
484 ins->ipdn_v6in) == 0);
485 VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
486 ins->ipdn_v4out) == 0);
487 VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
488 ins->ipdn_v4in) == 0);
489
490 ins->ipdn_hooked = IPDN_HOOK_NONE;
491
492 hook_free(ins->ipdn_v6out);
493 hook_free(ins->ipdn_v6in);
494 hook_free(ins->ipdn_v4out);
495 hook_free(ins->ipdn_v4in);
496
497 VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0);
498 VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0);
499
500 mutex_enter(&ipd_nactive_lock);
501 ipd_nactive--;
502 mutex_exit(&ipd_nactive_lock);
503 }
504
505 static int
ipd_check_hooks(ipd_netstack_t * ins,int type,boolean_t enable)506 ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable)
507 {
508 int olden, rval;
509 olden = ins->ipdn_enabled;
510
511 if (enable)
512 ins->ipdn_enabled |= type;
513 else
514 ins->ipdn_enabled &= ~type;
515
516 /*
517 * If hooks were previously enabled.
518 */
519 if (olden == 0 && ins->ipdn_enabled != 0) {
520 rval = ipd_setup_hooks(ins);
521 if (rval != 0) {
522 ins->ipdn_enabled &= ~type;
523 ASSERT(ins->ipdn_enabled == 0);
524 return (rval);
525 }
526
527 return (0);
528 }
529
530 if (olden != 0 && ins->ipdn_enabled == 0) {
531 ASSERT(olden != 0);
532
533 /*
534 * We have to drop the lock here, lest we cause a deadlock.
535 * Unfortunately, there may be hooks that are running and are
536 * actively in flight and we have to call the unregister
537 * function. Due to the hooks framework, if there is an inflight
538 * hook (most likely right now), and we are holding the
539 * netstack's lock, those hooks will never return. This is
540 * unfortunate.
541 *
542 * Because we only come into this path holding the list lock, we
543 * know that only way that someone else can come in and get to
544 * this structure is via the hook callbacks which are going to
545 * only be doing reads. They'll also see that everything has
546 * been disabled and return. So while this is unfortunate, it
547 * should be relatively safe.
548 */
549 mutex_exit(&ins->ipdn_lock);
550 ipd_teardown_hooks(ins);
551 mutex_enter(&ins->ipdn_lock);
552 return (0);
553 }
554
555 /*
556 * Othwerise, nothing should have changed here.
557 */
558 ASSERT((olden == 0) == (ins->ipdn_enabled == 0));
559 return (0);
560 }
561
562 static int
ipd_toggle_corrupt(ipd_netstack_t * ins,int percent)563 ipd_toggle_corrupt(ipd_netstack_t *ins, int percent)
564 {
565 int rval;
566
567 ASSERT(MUTEX_HELD(&ins->ipdn_lock));
568
569 if (percent < 0 || percent > 100)
570 return (ERANGE);
571
572 /*
573 * If we've been asked to set the value to a value that we already have,
574 * great, then we're done.
575 */
576 if (percent == ins->ipdn_corrupt)
577 return (0);
578
579 ins->ipdn_corrupt = percent;
580 rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0);
581
582 /*
583 * If ipd_check_hooks_failed, that must mean that we failed to set up
584 * the hooks, so we are going to effectively zero out and fail the
585 * request to enable corruption.
586 */
587 if (rval != 0)
588 ins->ipdn_corrupt = 0;
589
590 return (rval);
591 }
592
593 static int
ipd_toggle_delay(ipd_netstack_t * ins,uint32_t delay)594 ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
595 {
596 int rval;
597
598 ASSERT(MUTEX_HELD(&ins->ipdn_lock));
599
600 if (delay > ipd_max_delay)
601 return (ERANGE);
602
603 /*
604 * If we've been asked to set the value to a value that we already have,
605 * great, then we're done.
606 */
607 if (delay == ins->ipdn_delay)
608 return (0);
609
610 ins->ipdn_delay = delay;
611 rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0);
612
613 /*
614 * If ipd_check_hooks_failed, that must mean that we failed to set up
615 * the hooks, so we are going to effectively zero out and fail the
616 * request to enable corruption.
617 */
618 if (rval != 0)
619 ins->ipdn_delay = 0;
620
621 return (rval);
622 }
623 static int
ipd_toggle_drop(ipd_netstack_t * ins,int percent)624 ipd_toggle_drop(ipd_netstack_t *ins, int percent)
625 {
626 int rval;
627
628 ASSERT(MUTEX_HELD(&ins->ipdn_lock));
629
630 if (percent < 0 || percent > 100)
631 return (ERANGE);
632
633 /*
634 * If we've been asked to set the value to a value that we already have,
635 * great, then we're done.
636 */
637 if (percent == ins->ipdn_drop)
638 return (0);
639
640 ins->ipdn_drop = percent;
641 rval = ipd_check_hooks(ins, IPD_DROP, percent != 0);
642
643 /*
644 * If ipd_check_hooks_failed, that must mean that we failed to set up
645 * the hooks, so we are going to effectively zero out and fail the
646 * request to enable corruption.
647 */
648 if (rval != 0)
649 ins->ipdn_drop = 0;
650
651 return (rval);
652 }
653
654 static int
ipd_ioctl_perturb(ipd_ioc_perturb_t * ipi,cred_t * cr,intptr_t cmd)655 ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd)
656 {
657 zoneid_t zid;
658 ipd_netstack_t *ins;
659 int rval = 0;
660
661 /*
662 * If the zone that we're coming from is not the GZ, then we ignore it
663 * completely and then instead just set the zoneid to be that of the
664 * caller. If the zoneid is that of the GZ, then we don't touch this
665 * value.
666 */
667 zid = crgetzoneid(cr);
668 if (zid != GLOBAL_ZONEID)
669 ipi->ipip_zoneid = zid;
670
671 if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
672 zid != GLOBAL_ZONEID)
673 return (EPERM);
674
675 /*
676 * We need to hold the ipd_nsl_lock throughout the entire operation,
677 * otherwise someone else could come in and remove us from the list and
678 * free us, e.g. the netstack destroy handler. By holding the lock, we
679 * stop it from being able to do anything wrong.
680 */
681 mutex_enter(&ipd_nsl_lock);
682 for (ins = list_head(&ipd_nsl); ins != NULL;
683 ins = list_next(&ipd_nsl, ins)) {
684 if (ins->ipdn_zoneid == ipi->ipip_zoneid)
685 break;
686 }
687
688 if (ins == NULL) {
689 mutex_exit(&ipd_nsl_lock);
690 return (EINVAL);
691 }
692
693 mutex_enter(&ins->ipdn_lock);
694
695 if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
696 rval = ESHUTDOWN;
697 goto cleanup;
698 }
699
700 switch (cmd) {
701 case IPDIOC_CORRUPT:
702 rval = ipd_toggle_corrupt(ins, ipi->ipip_arg);
703 break;
704 case IPDIOC_DELAY:
705 rval = ipd_toggle_delay(ins, ipi->ipip_arg);
706 break;
707 case IPDIOC_DROP:
708 rval = ipd_toggle_drop(ins, ipi->ipip_arg);
709 break;
710 }
711
712 cleanup:
713 mutex_exit(&ins->ipdn_lock);
714 mutex_exit(&ipd_nsl_lock);
715 return (rval);
716 }
717
718 static int
ipd_ioctl_remove(ipd_ioc_perturb_t * ipi,cred_t * cr)719 ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr)
720 {
721 zoneid_t zid;
722 ipd_netstack_t *ins;
723 int rval = 0;
724
725 /*
726 * See ipd_ioctl_perturb for the rational here.
727 */
728 zid = crgetzoneid(cr);
729 if (zid != GLOBAL_ZONEID)
730 ipi->ipip_zoneid = zid;
731
732 if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
733 zid != GLOBAL_ZONEID)
734 return (EPERM);
735
736 mutex_enter(&ipd_nsl_lock);
737 for (ins = list_head(&ipd_nsl); ins != NULL;
738 ins = list_next(&ipd_nsl, ins)) {
739 if (ins->ipdn_zoneid == ipi->ipip_zoneid)
740 break;
741 }
742
743 if (ins == NULL) {
744 mutex_exit(&ipd_nsl_lock);
745 return (EINVAL);
746 }
747
748 mutex_enter(&ins->ipdn_lock);
749
750 /*
751 * If this is condemned, that means it's very shortly going to be torn
752 * down. In that case, there's no reason to actually do anything here,
753 * as it will all be done rather shortly in the destroy function.
754 * Furthermore, because condemned corresponds with it having hit
755 * shutdown, we know that no more packets can be received by this
756 * netstack. All this translates to a no-op.
757 */
758 if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
759 rval = 0;
760 goto cleanup;
761 }
762
763 rval = EINVAL;
764 /*
765 * Go through and disable the requested pieces. We can safely ignore the
766 * return value of ipd_check_hooks because the removal case should never
767 * fail, we verify that in the hook teardown case.
768 */
769 if (ipi->ipip_arg & IPD_CORRUPT) {
770 ins->ipdn_corrupt = 0;
771 (void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE);
772 rval = 0;
773 }
774
775 if (ipi->ipip_arg & IPD_DELAY) {
776 ins->ipdn_delay = 0;
777 (void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE);
778 rval = 0;
779 }
780
781 if (ipi->ipip_arg & IPD_DROP) {
782 ins->ipdn_drop = 0;
783 (void) ipd_check_hooks(ins, IPD_DROP, B_FALSE);
784 rval = 0;
785 }
786
787 cleanup:
788 mutex_exit(&ins->ipdn_lock);
789 mutex_exit(&ipd_nsl_lock);
790 return (rval);
791 }
792
793 /*
794 * When this function is called, the value of the ipil_nzones argument controls
795 * how this function works. When called with a value of zero, then we treat that
796 * as the caller asking us what's a reasonable number of entries for me to
797 * allocate memory for. If the zone is the global zone, then we tell them how
798 * many folks are currently active and add a fudge factor. Otherwise the answer
799 * is always one.
800 *
801 * In the non-zero case, we give them that number of zone ids. While this isn't
802 * quite ideal as it might mean that someone misses something, this generally
803 * won't be an issue, as it involves a rather tight race condition in the
804 * current ipdadm implementation.
805 */
806 static int
ipd_ioctl_list(intptr_t arg,cred_t * cr)807 ipd_ioctl_list(intptr_t arg, cred_t *cr)
808 {
809 zoneid_t zid;
810 ipd_ioc_info_t *configs;
811 ipd_netstack_t *ins;
812 uint_t azones, rzones, nzones, cur;
813 int rval = 0;
814 STRUCT_DECL(ipd_ioc_list, h);
815
816 STRUCT_INIT(h, get_udatamodel());
817 if (ddi_copyin((void *)arg, STRUCT_BUF(h),
818 STRUCT_SIZE(h), 0) != 0)
819 return (EFAULT);
820
821 zid = crgetzoneid(cr);
822
823 rzones = STRUCT_FGET(h, ipil_nzones);
824 if (rzones == 0) {
825 if (zid == GLOBAL_ZONEID) {
826 mutex_enter(&ipd_nactive_lock);
827 rzones = ipd_nactive + ipd_nactive_fudge;
828 mutex_exit(&ipd_nactive_lock);
829 } else {
830 rzones = 1;
831 }
832 STRUCT_FSET(h, ipil_nzones, rzones);
833 if (ddi_copyout(STRUCT_BUF(h), (void *)arg,
834 STRUCT_SIZE(h), 0) != 0)
835 return (EFAULT);
836
837 return (0);
838 }
839
840 mutex_enter(&ipd_nsl_lock);
841 if (zid == GLOBAL_ZONEID) {
842 azones = ipd_nactive;
843 } else {
844 azones = 1;
845 }
846
847 configs = kmem_alloc(sizeof (ipd_ioc_info_t) * azones, KM_SLEEP);
848 cur = 0;
849 for (ins = list_head(&ipd_nsl); ins != NULL;
850 ins = list_next(&ipd_nsl, ins)) {
851 if (ins->ipdn_enabled == 0)
852 continue;
853
854 ASSERT(cur < azones);
855
856 if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) {
857 configs[cur].ipii_zoneid = ins->ipdn_zoneid;
858
859 mutex_enter(&ins->ipdn_lock);
860 configs[cur].ipii_corrupt = ins->ipdn_corrupt;
861 configs[cur].ipii_delay = ins->ipdn_delay;
862 configs[cur].ipii_drop = ins->ipdn_drop;
863 mutex_exit(&ins->ipdn_lock);
864
865 ++cur;
866 }
867
868 if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid)
869 break;
870 }
871 mutex_exit(&ipd_nsl_lock);
872
873 ASSERT(zid != GLOBAL_ZONEID || cur == azones);
874
875 if (cur == 0)
876 STRUCT_FSET(h, ipil_nzones, 0);
877 else
878 STRUCT_FSET(h, ipil_nzones, cur);
879
880 nzones = MIN(cur, rzones);
881 if (nzones > 0) {
882 if (ddi_copyout(configs, STRUCT_FGETP(h, ipil_info),
883 nzones * sizeof (ipd_ioc_info_t), NULL) != 0)
884 rval = EFAULT;
885 }
886
887 kmem_free(configs, sizeof (ipd_ioc_info_t) * azones);
888 if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0)
889 return (EFAULT);
890
891 return (rval);
892 }
893
894 static void *
ipd_nin_create(const netid_t id)895 ipd_nin_create(const netid_t id)
896 {
897 ipd_netstack_t *ins;
898 ipd_nskstat_t *ink;
899
900 ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP);
901 ins->ipdn_status = IPDN_STATUS_DISABLED;
902 ins->ipdn_netid = id;
903 ins->ipdn_zoneid = netstackid_to_zoneid(id);
904 ins->ipdn_rand = gethrtime();
905 mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL);
906
907 ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid,
908 "ipd", "net", KSTAT_TYPE_NAMED,
909 sizeof (ipd_nskstat_t) / sizeof (kstat_named_t),
910 KSTAT_FLAG_VIRTUAL);
911
912 if (ins->ipdn_kstat != NULL) {
913 if (ins->ipdn_zoneid != GLOBAL_ZONEID)
914 kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID);
915
916 ink = &ins->ipdn_ksdata;
917 ins->ipdn_kstat->ks_data = ink;
918 kstat_named_init(&ink->ink_ncorrupts, "corrupts",
919 KSTAT_DATA_UINT64);
920 kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64);
921 kstat_named_init(&ink->ink_ndelays, "delays",
922 KSTAT_DATA_UINT64);
923 kstat_install(ins->ipdn_kstat);
924 }
925
926 mutex_enter(&ipd_nsl_lock);
927 list_insert_tail(&ipd_nsl, ins);
928 mutex_exit(&ipd_nsl_lock);
929
930 return (ins);
931 }
932
933 static void
ipd_nin_shutdown(const netid_t id,void * arg)934 ipd_nin_shutdown(const netid_t id, void *arg)
935 {
936 ipd_netstack_t *ins = arg;
937
938 VERIFY(id == ins->ipdn_netid);
939 mutex_enter(&ins->ipdn_lock);
940 ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED ||
941 ins->ipdn_status == IPDN_STATUS_ENABLED);
942 ins->ipdn_status |= IPDN_STATUS_CONDEMNED;
943 if (ins->ipdn_kstat != NULL)
944 net_kstat_delete(id, ins->ipdn_kstat);
945 mutex_exit(&ins->ipdn_lock);
946 }
947
948 /*ARGSUSED*/
949 static void
ipd_nin_destroy(const netid_t id,void * arg)950 ipd_nin_destroy(const netid_t id, void *arg)
951 {
952 ipd_netstack_t *ins = arg;
953
954 /*
955 * At this point none of the hooks should be able to fire because the
956 * zone has been shutdown and we are in the process of destroying it.
957 * Thus it should not be possible for someone else to come in and grab
958 * our ipd_netstack_t for this zone. Because of that, we know that we
959 * are the only ones who could be running here.
960 */
961 mutex_enter(&ipd_nsl_lock);
962 list_remove(&ipd_nsl, ins);
963 mutex_exit(&ipd_nsl_lock);
964
965 if (ins->ipdn_hooked)
966 ipd_teardown_hooks(ins);
967 mutex_destroy(&ins->ipdn_lock);
968 kmem_free(ins, sizeof (ipd_netstack_t));
969 }
970
971 /*ARGSUSED*/
972 static int
ipd_open(dev_t * devp,int flag,int otype,cred_t * credp)973 ipd_open(dev_t *devp, int flag, int otype, cred_t *credp)
974 {
975 if (flag & FEXCL || flag & FNDELAY)
976 return (EINVAL);
977
978 if (otype != OTYP_CHR)
979 return (EINVAL);
980
981 if (!(flag & FREAD && flag & FWRITE))
982 return (EINVAL);
983
984 if (secpolicy_ip_config(credp, B_FALSE) != 0)
985 return (EPERM);
986
987 return (0);
988 }
989
990 /*ARGSUSED*/
991 static int
ipd_ioctl(dev_t dev,int cmd,intptr_t arg,int md,cred_t * cr,int * rv)992 ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
993 {
994 int rval;
995 ipd_ioc_perturb_t ipip;
996
997 switch (cmd) {
998 case IPDIOC_CORRUPT:
999 case IPDIOC_DELAY:
1000 case IPDIOC_DROP:
1001 if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
1002 0) != 0)
1003 return (EFAULT);
1004 rval = ipd_ioctl_perturb(&ipip, cr, cmd);
1005 return (rval);
1006 case IPDIOC_REMOVE:
1007 if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
1008 0) != 0)
1009 return (EFAULT);
1010 rval = ipd_ioctl_remove(&ipip, cr);
1011 return (rval);
1012 case IPDIOC_LIST:
1013 /*
1014 * Because the list ioctl doesn't have a fixed-size struct due
1015 * to needing to pass around a pointer, we instead delegate the
1016 * copyin logic to the list code.
1017 */
1018 return (ipd_ioctl_list(arg, cr));
1019 default:
1020 break;
1021 }
1022 return (ENOTTY);
1023 }
1024
1025 /*ARGSUSED*/
1026 static int
ipd_close(dev_t dev,int flag,int otype,cred_t * credp)1027 ipd_close(dev_t dev, int flag, int otype, cred_t *credp)
1028 {
1029 return (0);
1030 }
1031
1032 static int
ipd_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)1033 ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1034 {
1035 minor_t instance;
1036
1037 if (cmd != DDI_ATTACH)
1038 return (DDI_FAILURE);
1039
1040 if (ipd_devi != NULL)
1041 return (DDI_FAILURE);
1042
1043 instance = ddi_get_instance(dip);
1044 if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance,
1045 DDI_PSEUDO, 0) == DDI_FAILURE)
1046 return (DDI_FAILURE);
1047
1048 ipd_neti = net_instance_alloc(NETINFO_VERSION);
1049 if (ipd_neti == NULL) {
1050 ddi_remove_minor_node(dip, NULL);
1051 return (DDI_FAILURE);
1052 }
1053
1054 /*
1055 * Note that these global structures MUST be initialized before we call
1056 * net_instance_register, as that will instantly cause us to drive into
1057 * the ipd_nin_create callbacks.
1058 */
1059 list_create(&ipd_nsl, sizeof (ipd_netstack_t),
1060 offsetof(ipd_netstack_t, ipdn_link));
1061 mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL);
1062 mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL);
1063
1064 /* Note, net_instance_alloc sets the version. */
1065 ipd_neti->nin_name = "ipd";
1066 ipd_neti->nin_create = ipd_nin_create;
1067 ipd_neti->nin_destroy = ipd_nin_destroy;
1068 ipd_neti->nin_shutdown = ipd_nin_shutdown;
1069 if (net_instance_register(ipd_neti) == DDI_FAILURE) {
1070 net_instance_free(ipd_neti);
1071 ddi_remove_minor_node(dip, NULL);
1072 }
1073
1074 ddi_report_dev(dip);
1075 ipd_devi = dip;
1076
1077 return (DDI_SUCCESS);
1078 }
1079
1080 /*ARGSUSED*/
1081 static int
ipd_getinfo(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)1082 ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1083 {
1084 int error;
1085
1086 switch (infocmd) {
1087 case DDI_INFO_DEVT2DEVINFO:
1088 *result = ipd_devi;
1089 error = DDI_SUCCESS;
1090 break;
1091 case DDI_INFO_DEVT2INSTANCE:
1092 *result = (void *)(uintptr_t)getminor((dev_t)arg);
1093 error = DDI_SUCCESS;
1094 default:
1095 error = DDI_FAILURE;
1096 break;
1097 }
1098
1099 return (error);
1100 }
1101
1102 static int
ipd_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)1103 ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1104 {
1105 if (cmd != DDI_DETACH)
1106 return (DDI_FAILURE);
1107
1108 mutex_enter(&ipd_nactive_lock);
1109 if (ipd_nactive > 0) {
1110 mutex_exit(&ipd_nactive_lock);
1111 return (EBUSY);
1112 }
1113 mutex_exit(&ipd_nactive_lock);
1114 ASSERT(dip == ipd_devi);
1115 ddi_remove_minor_node(dip, NULL);
1116 ipd_devi = NULL;
1117
1118 if (ipd_neti != NULL) {
1119 VERIFY(net_instance_unregister(ipd_neti) == 0);
1120 net_instance_free(ipd_neti);
1121 }
1122
1123 mutex_destroy(&ipd_nsl_lock);
1124 mutex_destroy(&ipd_nactive_lock);
1125 list_destroy(&ipd_nsl);
1126
1127 return (DDI_SUCCESS);
1128 }
1129
1130 static struct cb_ops ipd_cb_ops = {
1131 ipd_open, /* open */
1132 ipd_close, /* close */
1133 nodev, /* strategy */
1134 nodev, /* print */
1135 nodev, /* dump */
1136 nodev, /* read */
1137 nodev, /* write */
1138 ipd_ioctl, /* ioctl */
1139 nodev, /* devmap */
1140 nodev, /* mmap */
1141 nodev, /* segmap */
1142 nochpoll, /* poll */
1143 ddi_prop_op, /* cb_prop_op */
1144 NULL, /* streamtab */
1145 D_NEW | D_MP, /* Driver compatibility flag */
1146 CB_REV, /* rev */
1147 nodev, /* aread */
1148 nodev /* awrite */
1149 };
1150
1151 static struct dev_ops ipd_ops = {
1152 DEVO_REV, /* devo_rev */
1153 0, /* refcnt */
1154 ipd_getinfo, /* get_dev_info */
1155 nulldev, /* identify */
1156 nulldev, /* probe */
1157 ipd_attach, /* attach */
1158 ipd_detach, /* detach */
1159 nodev, /* reset */
1160 &ipd_cb_ops, /* driver operations */
1161 NULL, /* bus operations */
1162 nodev, /* dev power */
1163 ddi_quiesce_not_needed /* quiesce */
1164 };
1165
1166 static struct modldrv modldrv = {
1167 &mod_driverops,
1168 "Internet packet disturber",
1169 &ipd_ops
1170 };
1171
1172 static struct modlinkage modlinkage = {
1173 MODREV_1,
1174 { (void *)&modldrv, NULL }
1175 };
1176
1177 int
_init(void)1178 _init(void)
1179 {
1180 return (mod_install(&modlinkage));
1181 }
1182
1183 int
_info(struct modinfo * modinfop)1184 _info(struct modinfo *modinfop)
1185 {
1186 return (mod_info(&modlinkage, modinfop));
1187 }
1188
1189 int
_fini(void)1190 _fini(void)
1191 {
1192 return (mod_remove(&modlinkage));
1193 }
1194