xref: /freebsd/contrib/libpcap/optimize.c (revision c8e7055577942f62c35b38e995708418197c7497)
1 /*
2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that: (1) source code distributions
7  * retain the above copyright notice and this paragraph in its entirety, (2)
8  * distributions including binary code include the above copyright notice and
9  * this paragraph in its entirety in the documentation or other materials
10  * provided with the distribution, and (3) all advertising materials mentioning
11  * features or use of this software display the following acknowledgement:
12  * ``This product includes software developed by the University of California,
13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
14  * the University nor the names of its contributors may be used to endorse
15  * or promote products derived from this software without specific prior
16  * written permission.
17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20  *
21  *  Optimization module for BPF code intermediate representation.
22  */
23 
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 
28 #include <pcap-types.h>
29 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <memory.h>
33 #include <string.h>
34 
35 #include <errno.h>
36 
37 #include "pcap-int.h"
38 
39 #include "gencode.h"
40 #include "optimize.h"
41 
42 #ifdef HAVE_OS_PROTO_H
43 #include "os-proto.h"
44 #endif
45 
46 #ifdef BDEBUG
47 /*
48  * The internal "debug printout" flag for the filter expression optimizer.
49  * The code to print that stuff is present only if BDEBUG is defined, so
50  * the flag, and the routine to set it, are defined only if BDEBUG is
51  * defined.
52  */
53 static int pcap_optimizer_debug;
54 
/*
 * Routine to set the pcap_optimizer_debug flag.
 *
 * The flag is simply overwritten with 'value'; no validation is done
 * here.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_optimizer_debug(int value);

PCAP_API_DEF void
pcap_set_optimizer_debug(int value)
{
	pcap_optimizer_debug = value;
}
71 
72 /*
73  * The internal "print dot graph" flag for the filter expression optimizer.
74  * The code to print that stuff is present only if BDEBUG is defined, so
75  * the flag, and the routine to set it, are defined only if BDEBUG is
76  * defined.
77  */
78 static int pcap_print_dot_graph;
79 
/*
 * Routine to set the pcap_print_dot_graph flag.
 *
 * The flag is simply overwritten with 'value'; no validation is done
 * here.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_print_dot_graph(int value);

PCAP_API_DEF void
pcap_set_print_dot_graph(int value)
{
	pcap_print_dot_graph = value;
}
96 
97 #endif
98 
99 /*
100  * lowest_set_bit().
101  *
102  * Takes a 32-bit integer as an argument.
103  *
104  * If handed a non-zero value, returns the index of the lowest set bit,
 * counting upwards from zero.
106  *
107  * If handed zero, the results are platform- and compiler-dependent.
108  * Keep it out of the light, don't give it any water, don't feed it
109  * after midnight, and don't pass zero to it.
110  *
111  * This is the same as the count of trailing zeroes in the word.
112  */
113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
114   /*
115    * GCC 3.4 and later; we have __builtin_ctz().
116    */
117   #define lowest_set_bit(mask) __builtin_ctz(mask)
118 #elif defined(_MSC_VER)
119   /*
120    * Visual Studio; we support only 2005 and later, so use
121    * _BitScanForward().
122    */
123 #include <intrin.h>
124 
125 #ifndef __clang__
126 #pragma intrinsic(_BitScanForward)
127 #endif
128 
129 static __forceinline int
130 lowest_set_bit(int mask)
131 {
132 	unsigned long bit;
133 
134 	/*
135 	 * Don't sign-extend mask if long is longer than int.
136 	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
137 	 */
138 	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
139 		return -1;	/* mask is zero */
140 	return (int)bit;
141 }
142 #elif defined(MSDOS) && defined(__DJGPP__)
143   /*
144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
145    * we've already included.
146    */
147   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
149   /*
150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
   * or some other platform (UN*X conforming to a sufficiently recent
   * version of the Single UNIX Specification).
153    */
154   #include <strings.h>
155   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
156 #else
157 /*
158  * None of the above.
159  * Use a perfect-hash-function-based function.
160  */
static int
lowest_set_bit(int mask)
{
	/*
	 * Lookup table mapping the top five bits of
	 * (isolated bit * 0x077CB531) to the index of that bit.
	 */
	static const int debruijn_index[32] = {
		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
	};
	unsigned int word = (unsigned int)mask;
	unsigned int isolated;
	unsigned int slot;

	/*
	 * Strip off all but the lowermost set bit (word & -word); the
	 * negation is done in unsigned arithmetic, so it is well defined.
	 */
	isolated = word & (0U - word);

	/*
	 * Perform a minimal perfect hash on the isolated bit to look up
	 * the number of low-order zero bits in the table.
	 *
	 * See:
	 *
	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
	 *
	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
	 */
	slot = (isolated * 0x077CB531U) >> 27;
	return (debruijn_index[slot]);
}
184 #endif
185 
186 /*
187  * Represents a deleted instruction.
188  */
189 #define NOP -1
190 
191 /*
192  * Register numbers for use-def values.
193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
194  * location.  A_ATOM is the accumulator and X_ATOM is the index
195  * register.
196  */
197 #define A_ATOM BPF_MEMWORDS
198 #define X_ATOM (BPF_MEMWORDS+1)
199 
200 /*
201  * This define is used to represent *both* the accumulator and
202  * x register in use-def computations.
203  * Currently, the use-def code assumes only one definition per instruction.
204  */
205 #define AX_ATOM N_ATOMS
206 
207 /*
 * These data structures are used in a Cocke and Schwartz style
209  * value numbering scheme.  Since the flowgraph is acyclic,
210  * exit values can be propagated from a node's predecessors
211  * provided it is uniquely defined.
212  */
/*
 * One entry in the value-numbering hash table: the (code, v0, v1)
 * triple that was hashed, the value number assigned to it, and the
 * link to the next entry in the same hash bucket.
 */
struct valnode {
	int code;		/* opcode of the expression */
	int v0;			/* first operand */
	int v1;			/* second operand */
	int val;		/* value number assigned to the triple */
	struct valnode *next;	/* next entry in the same bucket */
};
219 
/*
 * Integer constants mapped with the load immediate opcode: K(i) is the
 * value number that F() assigns to "ld #i".
 *
 * The expansion refers to a variable named 'opt_state', which must be
 * in scope wherever K() is used.
 */
#define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)

/*
 * Constant information kept per value number (see the 'vmap' member of
 * opt_state_t, which F() indexes by value number).
 */
struct vmapinfo {
	int is_const;		/* nonzero if the value is a known constant */
	bpf_int32 const_val;	/* the constant; set together with is_const */
};
227 
228 typedef struct {
229 	/*
230 	 * A flag to indicate that further optimization is needed.
231 	 * Iterative passes are continued until a given pass yields no
232 	 * branch movement.
233 	 */
234 	int done;
235 
236 	int n_blocks;
237 	struct block **blocks;
238 	int n_edges;
239 	struct edge **edges;
240 
241 	/*
242 	 * A bit vector set representation of the dominators.
243 	 * We round up the set size to the next power of two.
244 	 */
245 	int nodewords;
246 	int edgewords;
247 	struct block **levels;
248 	bpf_u_int32 *space;
249 
250 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
251 /*
252  * True if a is in uset {p}
253  */
254 #define SET_MEMBER(p, a) \
255 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
256 
257 /*
258  * Add 'a' to uset p.
259  */
260 #define SET_INSERT(p, a) \
261 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
262 
263 /*
264  * Delete 'a' from uset p.
265  */
266 #define SET_DELETE(p, a) \
267 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
268 
269 /*
270  * a := a intersect b
271  */
272 #define SET_INTERSECT(a, b, n)\
273 {\
274 	register bpf_u_int32 *_x = a, *_y = b;\
275 	register int _n = n;\
276 	while (--_n >= 0) *_x++ &= *_y++;\
277 }
278 
279 /*
280  * a := a - b
281  */
282 #define SET_SUBTRACT(a, b, n)\
283 {\
284 	register bpf_u_int32 *_x = a, *_y = b;\
285 	register int _n = n;\
286 	while (--_n >= 0) *_x++ &=~ *_y++;\
287 }
288 
289 /*
290  * a := a union b
291  */
292 #define SET_UNION(a, b, n)\
293 {\
294 	register bpf_u_int32 *_x = a, *_y = b;\
295 	register int _n = n;\
296 	while (--_n >= 0) *_x++ |= *_y++;\
297 }
298 
299 	uset all_dom_sets;
300 	uset all_closure_sets;
301 	uset all_edge_sets;
302 
303 #define MODULUS 213
304 	struct valnode *hashtbl[MODULUS];
305 	int curval;
306 	int maxval;
307 
308 	struct vmapinfo *vmap;
309 	struct valnode *vnode_base;
310 	struct valnode *next_vnode;
311 } opt_state_t;
312 
/*
 * State used while converting the flow graph to a BPF instruction array.
 */
typedef struct {
	/*
	 * Some pointers used to convert the basic block form of the code,
	 * into the array form that BPF requires.  'fstart' will point to
	 * the malloc'd array while 'ftail' is used during the recursive
	 * traversal.
	 */
	struct bpf_insn *fstart;	/* start of the instruction array */
	struct bpf_insn *ftail;		/* cursor used by the traversal */
} conv_state_t;
323 
/*
 * Forward declarations of static routines whose definitions are not
 * part of this section of the file.
 */
static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
static void opt_cleanup(opt_state_t *);

static void intern_blocks(opt_state_t *, struct icode *);

static void find_inedges(opt_state_t *, struct block *);
#ifdef BDEBUG
/* Only declared when the debugging code is compiled in. */
static void opt_dump(compiler_state_t *, struct icode *);
#endif

/*
 * Larger of two values, unless the platform headers already supply MAX.
 * The selected argument is evaluated twice, so don't pass expressions
 * with side effects.
 */
#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif
337 
338 static void
339 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
340 {
341 	int level;
342 
343 	if (isMarked(ic, b))
344 		return;
345 
346 	Mark(ic, b);
347 	b->link = 0;
348 
349 	if (JT(b)) {
350 		find_levels_r(opt_state, ic, JT(b));
351 		find_levels_r(opt_state, ic, JF(b));
352 		level = MAX(JT(b)->level, JF(b)->level) + 1;
353 	} else
354 		level = 0;
355 	b->level = level;
356 	b->link = opt_state->levels[level];
357 	opt_state->levels[level] = b;
358 }
359 
360 /*
361  * Level graph.  The levels go from 0 at the leaves to
362  * N_LEVELS at the root.  The opt_state->levels[] array points to the
363  * first node of the level list, whose elements are linked
364  * with the 'link' field of the struct block.
365  */
366 static void
367 find_levels(opt_state_t *opt_state, struct icode *ic)
368 {
369 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
370 	unMarkAll(ic);
371 	find_levels_r(opt_state, ic, ic->root);
372 }
373 
374 /*
375  * Find dominator relationships.
376  * Assumes graph has been leveled.
377  */
378 static void
379 find_dom(opt_state_t *opt_state, struct block *root)
380 {
381 	int i;
382 	struct block *b;
383 	bpf_u_int32 *x;
384 
385 	/*
386 	 * Initialize sets to contain all nodes.
387 	 */
388 	x = opt_state->all_dom_sets;
389 	i = opt_state->n_blocks * opt_state->nodewords;
390 	while (--i >= 0)
391 		*x++ = 0xFFFFFFFFU;
392 	/* Root starts off empty. */
393 	for (i = opt_state->nodewords; --i >= 0;)
394 		root->dom[i] = 0;
395 
396 	/* root->level is the highest level no found. */
397 	for (i = root->level; i >= 0; --i) {
398 		for (b = opt_state->levels[i]; b; b = b->link) {
399 			SET_INSERT(b->dom, b->id);
400 			if (JT(b) == 0)
401 				continue;
402 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
403 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
404 		}
405 	}
406 }
407 
408 static void
409 propedom(opt_state_t *opt_state, struct edge *ep)
410 {
411 	SET_INSERT(ep->edom, ep->id);
412 	if (ep->succ) {
413 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
414 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
415 	}
416 }
417 
418 /*
419  * Compute edge dominators.
420  * Assumes graph has been leveled and predecessors established.
421  */
422 static void
423 find_edom(opt_state_t *opt_state, struct block *root)
424 {
425 	int i;
426 	uset x;
427 	struct block *b;
428 
429 	x = opt_state->all_edge_sets;
430 	for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
431 		x[i] = 0xFFFFFFFFU;
432 
433 	/* root->level is the highest level no found. */
434 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
435 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
436 	for (i = root->level; i >= 0; --i) {
437 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
438 			propedom(opt_state, &b->et);
439 			propedom(opt_state, &b->ef);
440 		}
441 	}
442 }
443 
444 /*
445  * Find the backwards transitive closure of the flow graph.  These sets
446  * are backwards in the sense that we find the set of nodes that reach
447  * a given node, not the set of nodes that can be reached by a node.
448  *
449  * Assumes graph has been leveled.
450  */
451 static void
452 find_closure(opt_state_t *opt_state, struct block *root)
453 {
454 	int i;
455 	struct block *b;
456 
457 	/*
458 	 * Initialize sets to contain no nodes.
459 	 */
460 	memset((char *)opt_state->all_closure_sets, 0,
461 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
462 
463 	/* root->level is the highest level no found. */
464 	for (i = root->level; i >= 0; --i) {
465 		for (b = opt_state->levels[i]; b; b = b->link) {
466 			SET_INSERT(b->closure, b->id);
467 			if (JT(b) == 0)
468 				continue;
469 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
470 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
471 		}
472 	}
473 }
474 
475 /*
476  * Return the register number that is used by s.  If A and X are both
477  * used, return AX_ATOM.  If no register is used, return -1.
478  *
479  * The implementation should probably change to an array access.
480  */
481 static int
482 atomuse(struct stmt *s)
483 {
484 	register int c = s->code;
485 
486 	if (c == NOP)
487 		return -1;
488 
489 	switch (BPF_CLASS(c)) {
490 
491 	case BPF_RET:
492 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
493 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
494 
495 	case BPF_LD:
496 	case BPF_LDX:
497 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
498 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
499 
500 	case BPF_ST:
501 		return A_ATOM;
502 
503 	case BPF_STX:
504 		return X_ATOM;
505 
506 	case BPF_JMP:
507 	case BPF_ALU:
508 		if (BPF_SRC(c) == BPF_X)
509 			return AX_ATOM;
510 		return A_ATOM;
511 
512 	case BPF_MISC:
513 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
514 	}
515 	abort();
516 	/* NOTREACHED */
517 }
518 
519 /*
520  * Return the register number that is defined by 's'.  We assume that
521  * a single stmt cannot define more than one register.  If no register
522  * is defined, return -1.
523  *
524  * The implementation should probably change to an array access.
525  */
526 static int
527 atomdef(struct stmt *s)
528 {
529 	if (s->code == NOP)
530 		return -1;
531 
532 	switch (BPF_CLASS(s->code)) {
533 
534 	case BPF_LD:
535 	case BPF_ALU:
536 		return A_ATOM;
537 
538 	case BPF_LDX:
539 		return X_ATOM;
540 
541 	case BPF_ST:
542 	case BPF_STX:
543 		return s->k;
544 
545 	case BPF_MISC:
546 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
547 	}
548 	return -1;
549 }
550 
551 /*
552  * Compute the sets of registers used, defined, and killed by 'b'.
553  *
554  * "Used" means that a statement in 'b' uses the register before any
555  * statement in 'b' defines it, i.e. it uses the value left in
556  * that register by a predecessor block of this block.
557  * "Defined" means that a statement in 'b' defines it.
558  * "Killed" means that a statement in 'b' defines it before any
559  * statement in 'b' uses it, i.e. it kills the value left in that
560  * register by a predecessor block of this block.
561  */
562 static void
563 compute_local_ud(struct block *b)
564 {
565 	struct slist *s;
566 	atomset def = 0, use = 0, killed = 0;
567 	int atom;
568 
569 	for (s = b->stmts; s; s = s->next) {
570 		if (s->s.code == NOP)
571 			continue;
572 		atom = atomuse(&s->s);
573 		if (atom >= 0) {
574 			if (atom == AX_ATOM) {
575 				if (!ATOMELEM(def, X_ATOM))
576 					use |= ATOMMASK(X_ATOM);
577 				if (!ATOMELEM(def, A_ATOM))
578 					use |= ATOMMASK(A_ATOM);
579 			}
580 			else if (atom < N_ATOMS) {
581 				if (!ATOMELEM(def, atom))
582 					use |= ATOMMASK(atom);
583 			}
584 			else
585 				abort();
586 		}
587 		atom = atomdef(&s->s);
588 		if (atom >= 0) {
589 			if (!ATOMELEM(use, atom))
590 				killed |= ATOMMASK(atom);
591 			def |= ATOMMASK(atom);
592 		}
593 	}
594 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
595 		/*
596 		 * XXX - what about RET?
597 		 */
598 		atom = atomuse(&b->s);
599 		if (atom >= 0) {
600 			if (atom == AX_ATOM) {
601 				if (!ATOMELEM(def, X_ATOM))
602 					use |= ATOMMASK(X_ATOM);
603 				if (!ATOMELEM(def, A_ATOM))
604 					use |= ATOMMASK(A_ATOM);
605 			}
606 			else if (atom < N_ATOMS) {
607 				if (!ATOMELEM(def, atom))
608 					use |= ATOMMASK(atom);
609 			}
610 			else
611 				abort();
612 		}
613 	}
614 
615 	b->def = def;
616 	b->kill = killed;
617 	b->in_use = use;
618 }
619 
620 /*
621  * Assume graph is already leveled.
622  */
623 static void
624 find_ud(opt_state_t *opt_state, struct block *root)
625 {
626 	int i, maxlevel;
627 	struct block *p;
628 
629 	/*
630 	 * root->level is the highest level no found;
631 	 * count down from there.
632 	 */
633 	maxlevel = root->level;
634 	for (i = maxlevel; i >= 0; --i)
635 		for (p = opt_state->levels[i]; p; p = p->link) {
636 			compute_local_ud(p);
637 			p->out_use = 0;
638 		}
639 
640 	for (i = 1; i <= maxlevel; ++i) {
641 		for (p = opt_state->levels[i]; p; p = p->link) {
642 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
643 			p->in_use |= p->out_use &~ p->kill;
644 		}
645 	}
646 }
647 static void
648 init_val(opt_state_t *opt_state)
649 {
650 	opt_state->curval = 0;
651 	opt_state->next_vnode = opt_state->vnode_base;
652 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
653 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
654 }
655 
656 /* Because we really don't have an IR, this stuff is a little messy. */
657 static int
658 F(opt_state_t *opt_state, int code, int v0, int v1)
659 {
660 	u_int hash;
661 	int val;
662 	struct valnode *p;
663 
664 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
665 	hash %= MODULUS;
666 
667 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
668 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
669 			return p->val;
670 
671 	val = ++opt_state->curval;
672 	if (BPF_MODE(code) == BPF_IMM &&
673 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
674 		opt_state->vmap[val].const_val = v0;
675 		opt_state->vmap[val].is_const = 1;
676 	}
677 	p = opt_state->next_vnode++;
678 	p->val = val;
679 	p->code = code;
680 	p->v0 = v0;
681 	p->v1 = v1;
682 	p->next = opt_state->hashtbl[hash];
683 	opt_state->hashtbl[hash] = p;
684 
685 	return val;
686 }
687 
688 static inline void
689 vstore(struct stmt *s, int *valp, int newval, int alter)
690 {
691 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
692 		s->code = NOP;
693 	else
694 		*valp = newval;
695 }
696 
697 /*
698  * Do constant-folding on binary operators.
699  * (Unary operators are handled elsewhere.)
700  */
701 static void
702 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
703     struct stmt *s, int v0, int v1)
704 {
705 	bpf_u_int32 a, b;
706 
707 	a = opt_state->vmap[v0].const_val;
708 	b = opt_state->vmap[v1].const_val;
709 
710 	switch (BPF_OP(s->code)) {
711 	case BPF_ADD:
712 		a += b;
713 		break;
714 
715 	case BPF_SUB:
716 		a -= b;
717 		break;
718 
719 	case BPF_MUL:
720 		a *= b;
721 		break;
722 
723 	case BPF_DIV:
724 		if (b == 0)
725 			bpf_error(cstate, "division by zero");
726 		a /= b;
727 		break;
728 
729 	case BPF_MOD:
730 		if (b == 0)
731 			bpf_error(cstate, "modulus by zero");
732 		a %= b;
733 		break;
734 
735 	case BPF_AND:
736 		a &= b;
737 		break;
738 
739 	case BPF_OR:
740 		a |= b;
741 		break;
742 
743 	case BPF_XOR:
744 		a ^= b;
745 		break;
746 
747 	case BPF_LSH:
748 		a <<= b;
749 		break;
750 
751 	case BPF_RSH:
752 		a >>= b;
753 		break;
754 
755 	default:
756 		abort();
757 	}
758 	s->k = a;
759 	s->code = BPF_LD|BPF_IMM;
760 	opt_state->done = 0;
761 }
762 
763 static inline struct slist *
764 this_op(struct slist *s)
765 {
766 	while (s != 0 && s->s.code == NOP)
767 		s = s->next;
768 	return s;
769 }
770 
771 static void
772 opt_not(struct block *b)
773 {
774 	struct block *tmp = JT(b);
775 
776 	JT(b) = JF(b);
777 	JF(b) = tmp;
778 }
779 
/*
 * Peephole optimizations on one block.
 *
 * The first part walks the block's statement list, looking at each
 * real (non-NOP) instruction together with the next real instruction
 * and rewriting a few known instruction sequences.  The second part
 * simplifies the block's terminating branch, using the last
 * instruction found by the walk and the value numbers recorded in
 * b->val[].
 *
 * Most rewrites clear opt_state->done to request another optimizer
 * pass.
 *
 * A block with no statements is returned untouched; in that case the
 * branch simplifications are not attempted either.
 */
static void
opt_peep(opt_state_t *opt_state, struct block *b)
{
	struct slist *s;
	struct slist *next, *last;
	int val;

	s = b->stmts;
	if (s == 0)
		return;

	/*
	 * 'last' ends up as the last real instruction that was seen as
	 * a 'next'; if there never is one, it stays at the first
	 * statement of the block.
	 */
	last = s;
	for (/*empty*/; /*empty*/; s = next) {
		/*
		 * Skip over nops.
		 */
		s = this_op(s);
		if (s == 0)
			break;	/* nothing left in the block */

		/*
		 * Find the next real instruction after that one
		 * (skipping nops).
		 */
		next = this_op(s->next);
		if (next == 0)
			break;	/* no next instruction */
		last = next;

		/*
		 * st  M[k]	-->	st  M[k]
		 * ldx M[k]		tax
		 */
		if (s->s.code == BPF_ST &&
		    next->s.code == (BPF_LDX|BPF_MEM) &&
		    s->s.k == next->s.k) {
			opt_state->done = 0;
			next->s.code = BPF_MISC|BPF_TAX;
		}
		/*
		 * ld  #k	-->	ldx  #k
		 * tax			txa
		 */
		if (s->s.code == (BPF_LD|BPF_IMM) &&
		    next->s.code == (BPF_MISC|BPF_TAX)) {
			s->s.code = BPF_LDX|BPF_IMM;
			next->s.code = BPF_MISC|BPF_TXA;
			opt_state->done = 0;
		}
		/*
		 * This is an ugly special case, but it happens
		 * when you say tcp[k] or udp[k] where k is a constant.
		 *
		 * Each "continue" below gives up on this special case
		 * and moves on to the next instruction pair.
		 */
		if (s->s.code == (BPF_LD|BPF_IMM)) {
			struct slist *add, *tax, *ild;

			/*
			 * Check that X isn't used on exit from this
			 * block (which the optimizer might cause).
			 * We know the code generator won't generate
			 * any local dependencies.
			 */
			if (ATOMELEM(b->out_use, X_ATOM))
				continue;

			/*
			 * Check that the instruction following the ldi
			 * is an addx, or it's an ldxms with an addx
			 * following it (with 0 or more nops between the
			 * ldxms and addx).
			 */
			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
				add = next;
			else
				add = this_op(next->next);
			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
				continue;

			/*
			 * Check that a tax follows that (with 0 or more
			 * nops between them).
			 */
			tax = this_op(add->next);
			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
				continue;

			/*
			 * Check that an ild follows that (with 0 or more
			 * nops between them).
			 */
			ild = this_op(tax->next);
			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
			    BPF_MODE(ild->s.code) != BPF_IND)
				continue;
			/*
			 * We want to turn this sequence:
			 *
			 * (004) ldi     #0x2		{s}
			 * (005) ldxms   [14]		{next}  -- optional
			 * (006) addx			{add}
			 * (007) tax			{tax}
			 * (008) ild     [x+0]		{ild}
			 *
			 * into this sequence:
			 *
			 * (004) nop
			 * (005) ldxms   [14]
			 * (006) nop
			 * (007) nop
			 * (008) ild     [x+2]
			 *
			 * XXX We need to check that X is not
			 * subsequently used, because we want to change
			 * what'll be in it after this sequence.
			 *
			 * We know we can eliminate the accumulator
			 * modifications earlier in the sequence since
			 * it is defined by the last stmt of this sequence
			 * (i.e., the last statement of the sequence loads
			 * a value into the accumulator, so we can eliminate
			 * earlier operations on the accumulator).
			 */
			ild->s.k += s->s.k;
			s->s.code = NOP;
			add->s.code = NOP;
			tax->s.code = NOP;
			opt_state->done = 0;
		}
	}
	/*
	 * If the comparison at the end of a block is an equality
	 * comparison against a constant, and nobody uses the value
	 * we leave in the A register at the end of a block, and
	 * the operation preceding the comparison is an arithmetic
	 * operation, we can sometimes optimize it away.
	 */
	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
	    !ATOMELEM(b->out_use, A_ATOM)) {
		/*
		 * We can optimize away certain subtractions of the
		 * X register.
		 */
		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
			val = b->val[X_ATOM];
			if (opt_state->vmap[val].is_const) {
				/*
				 * If we have a subtract to do a comparison,
				 * and the X register is a known constant,
				 * we can merge this value into the
				 * comparison:
				 *
				 * sub x  ->	nop
				 * jeq #y	jeq #(x+y)
				 */
				b->s.k += opt_state->vmap[val].const_val;
				last->s.code = NOP;
				opt_state->done = 0;
			} else if (b->s.k == 0) {
				/*
				 * If the X register isn't a constant,
				 * and the comparison in the test is
				 * against 0, we can compare with the
				 * X register, instead:
				 *
				 * sub x  ->	nop
				 * jeq #0	jeq x
				 */
				last->s.code = NOP;
				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
				opt_state->done = 0;
			}
		}
		/*
		 * Likewise, a constant subtract can be simplified:
		 *
		 * sub #x ->	nop
		 * jeq #y ->	jeq #(x+y)
		 */
		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
			last->s.code = NOP;
			b->s.k += last->s.k;
			opt_state->done = 0;
		}
		/*
		 * And, similarly, a constant AND can be simplified
		 * if we're testing against 0, i.e.:
		 *
		 * and #k	nop
		 * jeq #0  ->	jset #k
		 *
		 * "jeq #0" is true when no bit is set and "jset #k" is
		 * true when some bit is set, so the two branch targets
		 * are swapped as well.
		 */
		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
		    b->s.k == 0) {
			b->s.k = last->s.k;
			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
			last->s.code = NOP;
			opt_state->done = 0;
			opt_not(b);
		}
	}
	/*
	 * jset #0        ->   never
	 * jset #ffffffff ->   always
	 *
	 * NOTE(review): unlike the rewrites above, this one does not
	 * clear opt_state->done.
	 */
	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
		if (b->s.k == 0)
			JT(b) = JF(b);
		if ((u_int)b->s.k == 0xffffffffU)
			JF(b) = JT(b);
	}
	/*
	 * If we're comparing against the index register, and the index
	 * register is a known constant, we can just compare against that
	 * constant.
	 *
	 * NOTE(review): this rewrite does not clear opt_state->done
	 * either.
	 */
	val = b->val[X_ATOM];
	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
		bpf_int32 v = opt_state->vmap[val].const_val;
		b->s.code &= ~BPF_X;
		b->s.k = v;
	}
	/*
	 * If the accumulator is a known constant, we can compute the
	 * comparison result.
	 */
	val = b->val[A_ATOM];
	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
		bpf_int32 v = opt_state->vmap[val].const_val;
		/* After the switch, v is nonzero iff the branch is taken. */
		switch (BPF_OP(b->s.code)) {

		case BPF_JEQ:
			v = v == b->s.k;
			break;

		case BPF_JGT:
			v = (unsigned)v > (unsigned)b->s.k;
			break;

		case BPF_JGE:
			v = (unsigned)v >= (unsigned)b->s.k;
			break;

		case BPF_JSET:
			v &= b->s.k;
			break;

		default:
			abort();
		}
		/* Only ask for another pass if the graph really changes. */
		if (JF(b) != JT(b))
			opt_state->done = 0;
		/* Point both edges at the successor that is always taken. */
		if (v)
			JF(b) = JT(b);
		else
			JT(b) = JF(b);
	}
}
1036 
1037 /*
1038  * Compute the symbolic value of expression of 's', and update
1039  * anything it defines in the value table 'val'.  If 'alter' is true,
1040  * do various optimizations.  This code would be cleaner if symbolic
1041  * evaluation and code transformations weren't folded together.
1042  */
1043 static void
1044 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1045     struct stmt *s, int val[], int alter)
1046 {
1047 	int op;
1048 	int v;
1049 
1050 	switch (s->code) {
1051 
1052 	case BPF_LD|BPF_ABS|BPF_W:
1053 	case BPF_LD|BPF_ABS|BPF_H:
1054 	case BPF_LD|BPF_ABS|BPF_B:
1055 		v = F(opt_state, s->code, s->k, 0L);
1056 		vstore(s, &val[A_ATOM], v, alter);
1057 		break;
1058 
1059 	case BPF_LD|BPF_IND|BPF_W:
1060 	case BPF_LD|BPF_IND|BPF_H:
1061 	case BPF_LD|BPF_IND|BPF_B:
1062 		v = val[X_ATOM];
1063 		if (alter && opt_state->vmap[v].is_const) {
1064 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1065 			s->k += opt_state->vmap[v].const_val;
1066 			v = F(opt_state, s->code, s->k, 0L);
1067 			opt_state->done = 0;
1068 		}
1069 		else
1070 			v = F(opt_state, s->code, s->k, v);
1071 		vstore(s, &val[A_ATOM], v, alter);
1072 		break;
1073 
1074 	case BPF_LD|BPF_LEN:
1075 		v = F(opt_state, s->code, 0L, 0L);
1076 		vstore(s, &val[A_ATOM], v, alter);
1077 		break;
1078 
1079 	case BPF_LD|BPF_IMM:
1080 		v = K(s->k);
1081 		vstore(s, &val[A_ATOM], v, alter);
1082 		break;
1083 
1084 	case BPF_LDX|BPF_IMM:
1085 		v = K(s->k);
1086 		vstore(s, &val[X_ATOM], v, alter);
1087 		break;
1088 
1089 	case BPF_LDX|BPF_MSH|BPF_B:
1090 		v = F(opt_state, s->code, s->k, 0L);
1091 		vstore(s, &val[X_ATOM], v, alter);
1092 		break;
1093 
1094 	case BPF_ALU|BPF_NEG:
1095 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1096 			s->code = BPF_LD|BPF_IMM;
1097 			s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1098 			val[A_ATOM] = K(s->k);
1099 		}
1100 		else
1101 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1102 		break;
1103 
1104 	case BPF_ALU|BPF_ADD|BPF_K:
1105 	case BPF_ALU|BPF_SUB|BPF_K:
1106 	case BPF_ALU|BPF_MUL|BPF_K:
1107 	case BPF_ALU|BPF_DIV|BPF_K:
1108 	case BPF_ALU|BPF_MOD|BPF_K:
1109 	case BPF_ALU|BPF_AND|BPF_K:
1110 	case BPF_ALU|BPF_OR|BPF_K:
1111 	case BPF_ALU|BPF_XOR|BPF_K:
1112 	case BPF_ALU|BPF_LSH|BPF_K:
1113 	case BPF_ALU|BPF_RSH|BPF_K:
1114 		op = BPF_OP(s->code);
1115 		if (alter) {
1116 			if (s->k == 0) {
1117 				/* don't optimize away "sub #0"
1118 				 * as it may be needed later to
1119 				 * fixup the generated math code */
1120 				if (op == BPF_ADD ||
1121 				    op == BPF_LSH || op == BPF_RSH ||
1122 				    op == BPF_OR || op == BPF_XOR) {
1123 					s->code = NOP;
1124 					break;
1125 				}
1126 				if (op == BPF_MUL || op == BPF_AND) {
1127 					s->code = BPF_LD|BPF_IMM;
1128 					val[A_ATOM] = K(s->k);
1129 					break;
1130 				}
1131 			}
1132 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1133 				fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1134 				val[A_ATOM] = K(s->k);
1135 				break;
1136 			}
1137 		}
1138 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1139 		break;
1140 
1141 	case BPF_ALU|BPF_ADD|BPF_X:
1142 	case BPF_ALU|BPF_SUB|BPF_X:
1143 	case BPF_ALU|BPF_MUL|BPF_X:
1144 	case BPF_ALU|BPF_DIV|BPF_X:
1145 	case BPF_ALU|BPF_MOD|BPF_X:
1146 	case BPF_ALU|BPF_AND|BPF_X:
1147 	case BPF_ALU|BPF_OR|BPF_X:
1148 	case BPF_ALU|BPF_XOR|BPF_X:
1149 	case BPF_ALU|BPF_LSH|BPF_X:
1150 	case BPF_ALU|BPF_RSH|BPF_X:
1151 		op = BPF_OP(s->code);
1152 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1153 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1154 				fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1155 				val[A_ATOM] = K(s->k);
1156 			}
1157 			else {
1158 				s->code = BPF_ALU|BPF_K|op;
1159 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
1160 				opt_state->done = 0;
1161 				val[A_ATOM] =
1162 					F(opt_state, s->code, val[A_ATOM], K(s->k));
1163 			}
1164 			break;
1165 		}
1166 		/*
1167 		 * Check if we're doing something to an accumulator
1168 		 * that is 0, and simplify.  This may not seem like
1169 		 * much of a simplification but it could open up further
1170 		 * optimizations.
1171 		 * XXX We could also check for mul by 1, etc.
1172 		 */
1173 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
1174 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1175 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1176 				s->code = BPF_MISC|BPF_TXA;
1177 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1178 				break;
1179 			}
1180 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1181 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1182 				s->code = BPF_LD|BPF_IMM;
1183 				s->k = 0;
1184 				vstore(s, &val[A_ATOM], K(s->k), alter);
1185 				break;
1186 			}
1187 			else if (op == BPF_NEG) {
1188 				s->code = NOP;
1189 				break;
1190 			}
1191 		}
1192 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1193 		break;
1194 
1195 	case BPF_MISC|BPF_TXA:
1196 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1197 		break;
1198 
1199 	case BPF_LD|BPF_MEM:
1200 		v = val[s->k];
1201 		if (alter && opt_state->vmap[v].is_const) {
1202 			s->code = BPF_LD|BPF_IMM;
1203 			s->k = opt_state->vmap[v].const_val;
1204 			opt_state->done = 0;
1205 		}
1206 		vstore(s, &val[A_ATOM], v, alter);
1207 		break;
1208 
1209 	case BPF_MISC|BPF_TAX:
1210 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1211 		break;
1212 
1213 	case BPF_LDX|BPF_MEM:
1214 		v = val[s->k];
1215 		if (alter && opt_state->vmap[v].is_const) {
1216 			s->code = BPF_LDX|BPF_IMM;
1217 			s->k = opt_state->vmap[v].const_val;
1218 			opt_state->done = 0;
1219 		}
1220 		vstore(s, &val[X_ATOM], v, alter);
1221 		break;
1222 
1223 	case BPF_ST:
1224 		vstore(s, &val[s->k], val[A_ATOM], alter);
1225 		break;
1226 
1227 	case BPF_STX:
1228 		vstore(s, &val[s->k], val[X_ATOM], alter);
1229 		break;
1230 	}
1231 }
1232 
1233 static void
1234 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1235 {
1236 	register int atom;
1237 
1238 	atom = atomuse(s);
1239 	if (atom >= 0) {
1240 		if (atom == AX_ATOM) {
1241 			last[X_ATOM] = 0;
1242 			last[A_ATOM] = 0;
1243 		}
1244 		else
1245 			last[atom] = 0;
1246 	}
1247 	atom = atomdef(s);
1248 	if (atom >= 0) {
1249 		if (last[atom]) {
1250 			opt_state->done = 0;
1251 			last[atom]->code = NOP;
1252 		}
1253 		last[atom] = s;
1254 	}
1255 }
1256 
1257 static void
1258 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1259 {
1260 	register struct slist *s;
1261 	register int atom;
1262 	struct stmt *last[N_ATOMS];
1263 
1264 	memset((char *)last, 0, sizeof last);
1265 
1266 	for (s = b->stmts; s != 0; s = s->next)
1267 		deadstmt(opt_state, &s->s, last);
1268 	deadstmt(opt_state, &b->s, last);
1269 
1270 	for (atom = 0; atom < N_ATOMS; ++atom)
1271 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1272 			last[atom]->code = NOP;
1273 			opt_state->done = 0;
1274 		}
1275 }
1276 
1277 static void
1278 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1279     struct block *b, int do_stmts)
1280 {
1281 	struct slist *s;
1282 	struct edge *p;
1283 	int i;
1284 	bpf_int32 aval, xval;
1285 
1286 #if 0
1287 	for (s = b->stmts; s && s->next; s = s->next)
1288 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
1289 			do_stmts = 0;
1290 			break;
1291 		}
1292 #endif
1293 
1294 	/*
1295 	 * Initialize the atom values.
1296 	 */
1297 	p = b->in_edges;
1298 	if (p == 0) {
1299 		/*
1300 		 * We have no predecessors, so everything is undefined
1301 		 * upon entry to this block.
1302 		 */
1303 		memset((char *)b->val, 0, sizeof(b->val));
1304 	} else {
1305 		/*
1306 		 * Inherit values from our predecessors.
1307 		 *
1308 		 * First, get the values from the predecessor along the
1309 		 * first edge leading to this node.
1310 		 */
1311 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1312 		/*
1313 		 * Now look at all the other nodes leading to this node.
1314 		 * If, for the predecessor along that edge, a register
1315 		 * has a different value from the one we have (i.e.,
1316 		 * control paths are merging, and the merging paths
1317 		 * assign different values to that register), give the
1318 		 * register the undefined value of 0.
1319 		 */
1320 		while ((p = p->next) != NULL) {
1321 			for (i = 0; i < N_ATOMS; ++i)
1322 				if (b->val[i] != p->pred->val[i])
1323 					b->val[i] = 0;
1324 		}
1325 	}
1326 	aval = b->val[A_ATOM];
1327 	xval = b->val[X_ATOM];
1328 	for (s = b->stmts; s; s = s->next)
1329 		opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1330 
1331 	/*
1332 	 * This is a special case: if we don't use anything from this
1333 	 * block, and we load the accumulator or index register with a
1334 	 * value that is already there, or if this block is a return,
1335 	 * eliminate all the statements.
1336 	 *
1337 	 * XXX - what if it does a store?
1338 	 *
1339 	 * XXX - why does it matter whether we use anything from this
1340 	 * block?  If the accumulator or index register doesn't change
1341 	 * its value, isn't that OK even if we use that value?
1342 	 *
1343 	 * XXX - if we load the accumulator with a different value,
1344 	 * and the block ends with a conditional branch, we obviously
1345 	 * can't eliminate it, as the branch depends on that value.
1346 	 * For the index register, the conditional branch only depends
1347 	 * on the index register value if the test is against the index
1348 	 * register value rather than a constant; if nothing uses the
1349 	 * value we put into the index register, and we're not testing
1350 	 * against the index register's value, and there aren't any
1351 	 * other problems that would keep us from eliminating this
1352 	 * block, can we eliminate it?
1353 	 */
1354 	if (do_stmts &&
1355 	    ((b->out_use == 0 &&
1356 	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1357 	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1358 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1359 		if (b->stmts != 0) {
1360 			b->stmts = 0;
1361 			opt_state->done = 0;
1362 		}
1363 	} else {
1364 		opt_peep(opt_state, b);
1365 		opt_deadstores(opt_state, b);
1366 	}
1367 	/*
1368 	 * Set up values for branch optimizer.
1369 	 */
1370 	if (BPF_SRC(b->s.code) == BPF_K)
1371 		b->oval = K(b->s.k);
1372 	else
1373 		b->oval = b->val[X_ATOM];
1374 	b->et.code = b->s.code;
1375 	b->ef.code = -b->s.code;
1376 }
1377 
1378 /*
1379  * Return true if any register that is used on exit from 'succ', has
1380  * an exit value that is different from the corresponding exit value
1381  * from 'b'.
1382  */
1383 static int
1384 use_conflict(struct block *b, struct block *succ)
1385 {
1386 	int atom;
1387 	atomset use = succ->out_use;
1388 
1389 	if (use == 0)
1390 		return 0;
1391 
1392 	for (atom = 0; atom < N_ATOMS; ++atom)
1393 		if (ATOMELEM(use, atom))
1394 			if (b->val[atom] != succ->val[atom])
1395 				return 1;
1396 	return 0;
1397 }
1398 
1399 static struct block *
1400 fold_edge(struct block *child, struct edge *ep)
1401 {
1402 	int sense;
1403 	int aval0, aval1, oval0, oval1;
1404 	int code = ep->code;
1405 
1406 	if (code < 0) {
1407 		code = -code;
1408 		sense = 0;
1409 	} else
1410 		sense = 1;
1411 
1412 	if (child->s.code != code)
1413 		return 0;
1414 
1415 	aval0 = child->val[A_ATOM];
1416 	oval0 = child->oval;
1417 	aval1 = ep->pred->val[A_ATOM];
1418 	oval1 = ep->pred->oval;
1419 
1420 	if (aval0 != aval1)
1421 		return 0;
1422 
1423 	if (oval0 == oval1)
1424 		/*
1425 		 * The operands of the branch instructions are
1426 		 * identical, so the result is true if a true
1427 		 * branch was taken to get here, otherwise false.
1428 		 */
1429 		return sense ? JT(child) : JF(child);
1430 
1431 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1432 		/*
1433 		 * At this point, we only know the comparison if we
1434 		 * came down the true branch, and it was an equality
1435 		 * comparison with a constant.
1436 		 *
1437 		 * I.e., if we came down the true branch, and the branch
1438 		 * was an equality comparison with a constant, we know the
1439 		 * accumulator contains that constant.  If we came down
1440 		 * the false branch, or the comparison wasn't with a
1441 		 * constant, we don't know what was in the accumulator.
1442 		 *
1443 		 * We rely on the fact that distinct constants have distinct
1444 		 * value numbers.
1445 		 */
1446 		return JF(child);
1447 
1448 	return 0;
1449 }
1450 
1451 static void
1452 opt_j(opt_state_t *opt_state, struct edge *ep)
1453 {
1454 	register int i, k;
1455 	register struct block *target;
1456 
1457 	if (JT(ep->succ) == 0)
1458 		return;
1459 
1460 	if (JT(ep->succ) == JF(ep->succ)) {
1461 		/*
1462 		 * Common branch targets can be eliminated, provided
1463 		 * there is no data dependency.
1464 		 */
1465 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1466 			opt_state->done = 0;
1467 			ep->succ = JT(ep->succ);
1468 		}
1469 	}
1470 	/*
1471 	 * For each edge dominator that matches the successor of this
1472 	 * edge, promote the edge successor to the its grandchild.
1473 	 *
1474 	 * XXX We violate the set abstraction here in favor a reasonably
1475 	 * efficient loop.
1476 	 */
1477  top:
1478 	for (i = 0; i < opt_state->edgewords; ++i) {
1479 		register bpf_u_int32 x = ep->edom[i];
1480 
1481 		while (x != 0) {
1482 			k = lowest_set_bit(x);
1483 			x &=~ (1 << k);
1484 			k += i * BITS_PER_WORD;
1485 
1486 			target = fold_edge(ep->succ, opt_state->edges[k]);
1487 			/*
1488 			 * Check that there is no data dependency between
1489 			 * nodes that will be violated if we move the edge.
1490 			 */
1491 			if (target != 0 && !use_conflict(ep->pred, target)) {
1492 				opt_state->done = 0;
1493 				ep->succ = target;
1494 				if (JT(target) != 0)
1495 					/*
1496 					 * Start over unless we hit a leaf.
1497 					 */
1498 					goto top;
1499 				return;
1500 			}
1501 		}
1502 	}
1503 }
1504 
1505 
1506 static void
1507 or_pullup(opt_state_t *opt_state, struct block *b)
1508 {
1509 	int val, at_top;
1510 	struct block *pull;
1511 	struct block **diffp, **samep;
1512 	struct edge *ep;
1513 
1514 	ep = b->in_edges;
1515 	if (ep == 0)
1516 		return;
1517 
1518 	/*
1519 	 * Make sure each predecessor loads the same value.
1520 	 * XXX why?
1521 	 */
1522 	val = ep->pred->val[A_ATOM];
1523 	for (ep = ep->next; ep != 0; ep = ep->next)
1524 		if (val != ep->pred->val[A_ATOM])
1525 			return;
1526 
1527 	if (JT(b->in_edges->pred) == b)
1528 		diffp = &JT(b->in_edges->pred);
1529 	else
1530 		diffp = &JF(b->in_edges->pred);
1531 
1532 	at_top = 1;
1533 	for (;;) {
1534 		if (*diffp == 0)
1535 			return;
1536 
1537 		if (JT(*diffp) != JT(b))
1538 			return;
1539 
1540 		if (!SET_MEMBER((*diffp)->dom, b->id))
1541 			return;
1542 
1543 		if ((*diffp)->val[A_ATOM] != val)
1544 			break;
1545 
1546 		diffp = &JF(*diffp);
1547 		at_top = 0;
1548 	}
1549 	samep = &JF(*diffp);
1550 	for (;;) {
1551 		if (*samep == 0)
1552 			return;
1553 
1554 		if (JT(*samep) != JT(b))
1555 			return;
1556 
1557 		if (!SET_MEMBER((*samep)->dom, b->id))
1558 			return;
1559 
1560 		if ((*samep)->val[A_ATOM] == val)
1561 			break;
1562 
1563 		/* XXX Need to check that there are no data dependencies
1564 		   between dp0 and dp1.  Currently, the code generator
1565 		   will not produce such dependencies. */
1566 		samep = &JF(*samep);
1567 	}
1568 #ifdef notdef
1569 	/* XXX This doesn't cover everything. */
1570 	for (i = 0; i < N_ATOMS; ++i)
1571 		if ((*samep)->val[i] != pred->val[i])
1572 			return;
1573 #endif
1574 	/* Pull up the node. */
1575 	pull = *samep;
1576 	*samep = JF(pull);
1577 	JF(pull) = *diffp;
1578 
1579 	/*
1580 	 * At the top of the chain, each predecessor needs to point at the
1581 	 * pulled up node.  Inside the chain, there is only one predecessor
1582 	 * to worry about.
1583 	 */
1584 	if (at_top) {
1585 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1586 			if (JT(ep->pred) == b)
1587 				JT(ep->pred) = pull;
1588 			else
1589 				JF(ep->pred) = pull;
1590 		}
1591 	}
1592 	else
1593 		*diffp = pull;
1594 
1595 	opt_state->done = 0;
1596 }
1597 
1598 static void
1599 and_pullup(opt_state_t *opt_state, struct block *b)
1600 {
1601 	int val, at_top;
1602 	struct block *pull;
1603 	struct block **diffp, **samep;
1604 	struct edge *ep;
1605 
1606 	ep = b->in_edges;
1607 	if (ep == 0)
1608 		return;
1609 
1610 	/*
1611 	 * Make sure each predecessor loads the same value.
1612 	 */
1613 	val = ep->pred->val[A_ATOM];
1614 	for (ep = ep->next; ep != 0; ep = ep->next)
1615 		if (val != ep->pred->val[A_ATOM])
1616 			return;
1617 
1618 	if (JT(b->in_edges->pred) == b)
1619 		diffp = &JT(b->in_edges->pred);
1620 	else
1621 		diffp = &JF(b->in_edges->pred);
1622 
1623 	at_top = 1;
1624 	for (;;) {
1625 		if (*diffp == 0)
1626 			return;
1627 
1628 		if (JF(*diffp) != JF(b))
1629 			return;
1630 
1631 		if (!SET_MEMBER((*diffp)->dom, b->id))
1632 			return;
1633 
1634 		if ((*diffp)->val[A_ATOM] != val)
1635 			break;
1636 
1637 		diffp = &JT(*diffp);
1638 		at_top = 0;
1639 	}
1640 	samep = &JT(*diffp);
1641 	for (;;) {
1642 		if (*samep == 0)
1643 			return;
1644 
1645 		if (JF(*samep) != JF(b))
1646 			return;
1647 
1648 		if (!SET_MEMBER((*samep)->dom, b->id))
1649 			return;
1650 
1651 		if ((*samep)->val[A_ATOM] == val)
1652 			break;
1653 
1654 		/* XXX Need to check that there are no data dependencies
1655 		   between diffp and samep.  Currently, the code generator
1656 		   will not produce such dependencies. */
1657 		samep = &JT(*samep);
1658 	}
1659 #ifdef notdef
1660 	/* XXX This doesn't cover everything. */
1661 	for (i = 0; i < N_ATOMS; ++i)
1662 		if ((*samep)->val[i] != pred->val[i])
1663 			return;
1664 #endif
1665 	/* Pull up the node. */
1666 	pull = *samep;
1667 	*samep = JT(pull);
1668 	JT(pull) = *diffp;
1669 
1670 	/*
1671 	 * At the top of the chain, each predecessor needs to point at the
1672 	 * pulled up node.  Inside the chain, there is only one predecessor
1673 	 * to worry about.
1674 	 */
1675 	if (at_top) {
1676 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1677 			if (JT(ep->pred) == b)
1678 				JT(ep->pred) = pull;
1679 			else
1680 				JF(ep->pred) = pull;
1681 		}
1682 	}
1683 	else
1684 		*diffp = pull;
1685 
1686 	opt_state->done = 0;
1687 }
1688 
1689 static void
1690 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1691     int do_stmts)
1692 {
1693 	int i, maxlevel;
1694 	struct block *p;
1695 
1696 	init_val(opt_state);
1697 	maxlevel = ic->root->level;
1698 
1699 	find_inedges(opt_state, ic->root);
1700 	for (i = maxlevel; i >= 0; --i)
1701 		for (p = opt_state->levels[i]; p; p = p->link)
1702 			opt_blk(cstate, opt_state, p, do_stmts);
1703 
1704 	if (do_stmts)
1705 		/*
1706 		 * No point trying to move branches; it can't possibly
1707 		 * make a difference at this point.
1708 		 */
1709 		return;
1710 
1711 	for (i = 1; i <= maxlevel; ++i) {
1712 		for (p = opt_state->levels[i]; p; p = p->link) {
1713 			opt_j(opt_state, &p->et);
1714 			opt_j(opt_state, &p->ef);
1715 		}
1716 	}
1717 
1718 	find_inedges(opt_state, ic->root);
1719 	for (i = 1; i <= maxlevel; ++i) {
1720 		for (p = opt_state->levels[i]; p; p = p->link) {
1721 			or_pullup(opt_state, p);
1722 			and_pullup(opt_state, p);
1723 		}
1724 	}
1725 }
1726 
1727 static inline void
1728 link_inedge(struct edge *parent, struct block *child)
1729 {
1730 	parent->next = child->in_edges;
1731 	child->in_edges = parent;
1732 }
1733 
1734 static void
1735 find_inedges(opt_state_t *opt_state, struct block *root)
1736 {
1737 	int i;
1738 	struct block *b;
1739 
1740 	for (i = 0; i < opt_state->n_blocks; ++i)
1741 		opt_state->blocks[i]->in_edges = 0;
1742 
1743 	/*
1744 	 * Traverse the graph, adding each edge to the predecessor
1745 	 * list of its successors.  Skip the leaves (i.e. level 0).
1746 	 */
1747 	for (i = root->level; i > 0; --i) {
1748 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
1749 			link_inedge(&b->et, JT(b));
1750 			link_inedge(&b->ef, JF(b));
1751 		}
1752 	}
1753 }
1754 
1755 static void
1756 opt_root(struct block **b)
1757 {
1758 	struct slist *tmp, *s;
1759 
1760 	s = (*b)->stmts;
1761 	(*b)->stmts = 0;
1762 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1763 		*b = JT(*b);
1764 
1765 	tmp = (*b)->stmts;
1766 	if (tmp != 0)
1767 		sappend(s, tmp);
1768 	(*b)->stmts = s;
1769 
1770 	/*
1771 	 * If the root node is a return, then there is no
1772 	 * point executing any statements (since the bpf machine
1773 	 * has no side effects).
1774 	 */
1775 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1776 		(*b)->stmts = 0;
1777 }
1778 
1779 static void
1780 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1781     int do_stmts)
1782 {
1783 
1784 #ifdef BDEBUG
1785 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1786 		printf("opt_loop(root, %d) begin\n", do_stmts);
1787 		opt_dump(cstate, ic);
1788 	}
1789 #endif
1790 	do {
1791 		opt_state->done = 1;
1792 		find_levels(opt_state, ic);
1793 		find_dom(opt_state, ic->root);
1794 		find_closure(opt_state, ic->root);
1795 		find_ud(opt_state, ic->root);
1796 		find_edom(opt_state, ic->root);
1797 		opt_blks(cstate, opt_state, ic, do_stmts);
1798 #ifdef BDEBUG
1799 		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1800 			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1801 			opt_dump(cstate, ic);
1802 		}
1803 #endif
1804 	} while (!opt_state->done);
1805 }
1806 
1807 /*
1808  * Optimize the filter code in its dag representation.
1809  */
1810 void
1811 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1812 {
1813 	opt_state_t opt_state;
1814 
1815 	opt_init(cstate, &opt_state, ic);
1816 	opt_loop(cstate, &opt_state, ic, 0);
1817 	opt_loop(cstate, &opt_state, ic, 1);
1818 	intern_blocks(&opt_state, ic);
1819 #ifdef BDEBUG
1820 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1821 		printf("after intern_blocks()\n");
1822 		opt_dump(cstate, ic);
1823 	}
1824 #endif
1825 	opt_root(&ic->root);
1826 #ifdef BDEBUG
1827 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1828 		printf("after opt_root()\n");
1829 		opt_dump(cstate, ic);
1830 	}
1831 #endif
1832 	opt_cleanup(&opt_state);
1833 }
1834 
1835 static void
1836 make_marks(struct icode *ic, struct block *p)
1837 {
1838 	if (!isMarked(ic, p)) {
1839 		Mark(ic, p);
1840 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1841 			make_marks(ic, JT(p));
1842 			make_marks(ic, JF(p));
1843 		}
1844 	}
1845 }
1846 
1847 /*
1848  * Mark code array such that isMarked(ic->cur_mark, i) is true
1849  * only for nodes that are alive.
1850  */
1851 static void
1852 mark_code(struct icode *ic)
1853 {
1854 	ic->cur_mark += 1;
1855 	make_marks(ic, ic->root);
1856 }
1857 
1858 /*
1859  * True iff the two stmt lists load the same value from the packet into
1860  * the accumulator.
1861  */
1862 static int
1863 eq_slist(struct slist *x, struct slist *y)
1864 {
1865 	for (;;) {
1866 		while (x && x->s.code == NOP)
1867 			x = x->next;
1868 		while (y && y->s.code == NOP)
1869 			y = y->next;
1870 		if (x == 0)
1871 			return y == 0;
1872 		if (y == 0)
1873 			return x == 0;
1874 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1875 			return 0;
1876 		x = x->next;
1877 		y = y->next;
1878 	}
1879 }
1880 
1881 static inline int
1882 eq_blk(struct block *b0, struct block *b1)
1883 {
1884 	if (b0->s.code == b1->s.code &&
1885 	    b0->s.k == b1->s.k &&
1886 	    b0->et.succ == b1->et.succ &&
1887 	    b0->ef.succ == b1->ef.succ)
1888 		return eq_slist(b0->stmts, b1->stmts);
1889 	return 0;
1890 }
1891 
1892 static void
1893 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1894 {
1895 	struct block *p;
1896 	int i, j;
1897 	int done1; /* don't shadow global */
1898  top:
1899 	done1 = 1;
1900 	for (i = 0; i < opt_state->n_blocks; ++i)
1901 		opt_state->blocks[i]->link = 0;
1902 
1903 	mark_code(ic);
1904 
1905 	for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1906 		if (!isMarked(ic, opt_state->blocks[i]))
1907 			continue;
1908 		for (j = i + 1; j < opt_state->n_blocks; ++j) {
1909 			if (!isMarked(ic, opt_state->blocks[j]))
1910 				continue;
1911 			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1912 				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1913 					opt_state->blocks[j]->link : opt_state->blocks[j];
1914 				break;
1915 			}
1916 		}
1917 	}
1918 	for (i = 0; i < opt_state->n_blocks; ++i) {
1919 		p = opt_state->blocks[i];
1920 		if (JT(p) == 0)
1921 			continue;
1922 		if (JT(p)->link) {
1923 			done1 = 0;
1924 			JT(p) = JT(p)->link;
1925 		}
1926 		if (JF(p)->link) {
1927 			done1 = 0;
1928 			JF(p) = JF(p)->link;
1929 		}
1930 	}
1931 	if (!done1)
1932 		goto top;
1933 }
1934 
1935 static void
1936 opt_cleanup(opt_state_t *opt_state)
1937 {
1938 	free((void *)opt_state->vnode_base);
1939 	free((void *)opt_state->vmap);
1940 	free((void *)opt_state->edges);
1941 	free((void *)opt_state->space);
1942 	free((void *)opt_state->levels);
1943 	free((void *)opt_state->blocks);
1944 }
1945 
1946 /*
1947  * Return the number of stmts in 's'.
1948  */
1949 static u_int
1950 slength(struct slist *s)
1951 {
1952 	u_int n = 0;
1953 
1954 	for (; s; s = s->next)
1955 		if (s->s.code != NOP)
1956 			++n;
1957 	return n;
1958 }
1959 
/*
 * Return the number of nodes reachable by 'p'.
 * All nodes should be initially unmarked; each node counted is marked
 * so that it is counted only once.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int total;

	if (p == 0)
		return 0;
	if (isMarked(ic, p))
		return 0;

	Mark(ic, p);
	total = 1;
	total += count_blocks(ic, JT(p));
	total += count_blocks(ic, JF(p));
	return total;
}
1972 
1973 /*
1974  * Do a depth first search on the flow graph, numbering the
1975  * the basic blocks, and entering them into the 'blocks' array.`
1976  */
1977 static void
1978 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
1979 {
1980 	int n;
1981 
1982 	if (p == 0 || isMarked(ic, p))
1983 		return;
1984 
1985 	Mark(ic, p);
1986 	n = opt_state->n_blocks++;
1987 	p->id = n;
1988 	opt_state->blocks[n] = p;
1989 
1990 	number_blks_r(opt_state, ic, JT(p));
1991 	number_blks_r(opt_state, ic, JF(p));
1992 }
1993 
1994 /*
1995  * Return the number of stmts in the flowgraph reachable by 'p'.
1996  * The nodes should be unmarked before calling.
1997  *
1998  * Note that "stmts" means "instructions", and that this includes
1999  *
2000  *	side-effect statements in 'p' (slength(p->stmts));
2001  *
2002  *	statements in the true branch from 'p' (count_stmts(JT(p)));
2003  *
2004  *	statements in the false branch from 'p' (count_stmts(JF(p)));
2005  *
2006  *	the conditional jump itself (1);
2007  *
2008  *	an extra long jump if the true branch requires it (p->longjt);
2009  *
2010  *	an extra long jump if the false branch requires it (p->longjf).
2011  */
2012 static u_int
2013 count_stmts(struct icode *ic, struct block *p)
2014 {
2015 	u_int n;
2016 
2017 	if (p == 0 || isMarked(ic, p))
2018 		return 0;
2019 	Mark(ic, p);
2020 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2021 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2022 }
2023 
2024 /*
2025  * Allocate memory.  All allocation is done before optimization
2026  * is begun.  A linear bound on the size of all data structures is computed
2027  * from the total number of blocks and/or statements.
2028  */
2029 static void
2030 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2031 {
2032 	bpf_u_int32 *p;
2033 	int i, n, max_stmts;
2034 
2035 	/*
2036 	 * First, count the blocks, so we can malloc an array to map
2037 	 * block number to block.  Then, put the blocks into the array.
2038 	 */
2039 	unMarkAll(ic);
2040 	n = count_blocks(ic, ic->root);
2041 	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2042 	if (opt_state->blocks == NULL)
2043 		bpf_error(cstate, "malloc");
2044 	unMarkAll(ic);
2045 	opt_state->n_blocks = 0;
2046 	number_blks_r(opt_state, ic, ic->root);
2047 
2048 	opt_state->n_edges = 2 * opt_state->n_blocks;
2049 	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2050 	if (opt_state->edges == NULL)
2051 		bpf_error(cstate, "malloc");
2052 
2053 	/*
2054 	 * The number of levels is bounded by the number of nodes.
2055 	 */
2056 	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2057 	if (opt_state->levels == NULL)
2058 		bpf_error(cstate, "malloc");
2059 
2060 	opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2061 	opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2062 
2063 	/* XXX */
2064 	opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2065 				 + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2066 	if (opt_state->space == NULL)
2067 		bpf_error(cstate, "malloc");
2068 	p = opt_state->space;
2069 	opt_state->all_dom_sets = p;
2070 	for (i = 0; i < n; ++i) {
2071 		opt_state->blocks[i]->dom = p;
2072 		p += opt_state->nodewords;
2073 	}
2074 	opt_state->all_closure_sets = p;
2075 	for (i = 0; i < n; ++i) {
2076 		opt_state->blocks[i]->closure = p;
2077 		p += opt_state->nodewords;
2078 	}
2079 	opt_state->all_edge_sets = p;
2080 	for (i = 0; i < n; ++i) {
2081 		register struct block *b = opt_state->blocks[i];
2082 
2083 		b->et.edom = p;
2084 		p += opt_state->edgewords;
2085 		b->ef.edom = p;
2086 		p += opt_state->edgewords;
2087 		b->et.id = i;
2088 		opt_state->edges[i] = &b->et;
2089 		b->ef.id = opt_state->n_blocks + i;
2090 		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2091 		b->et.pred = b;
2092 		b->ef.pred = b;
2093 	}
2094 	max_stmts = 0;
2095 	for (i = 0; i < n; ++i)
2096 		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2097 	/*
2098 	 * We allocate at most 3 value numbers per statement,
2099 	 * so this is an upper bound on the number of valnodes
2100 	 * we'll need.
2101 	 */
2102 	opt_state->maxval = 3 * max_stmts;
2103 	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2104 	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2105 	if (opt_state->vmap == NULL || opt_state->vnode_base == NULL)
2106 		bpf_error(cstate, "malloc");
2107 }
2108 
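/*
 * This is only used when supporting optimizer debugging.  When BDEBUG
 * is defined, convert_code_r() stores in bids[] the ID (plus one) of
 * the block whose final instruction ends up at each index of the
 * generated program.  It is global state, so do *not* do more than one
 * compile in parallel and expect it to provide meaningful information.
 */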
2114 #ifdef BDEBUG
2115 int bids[NBIDS];
2116 #endif
2117 
2118 /*
2119  * Returns true if successful.  Returns false if a branch has
2120  * an offset that is too large.  If so, we have marked that
2121  * branch so that on a subsequent iteration, it will be treated
2122  * properly.
2123  */
2124 static int
2125 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2126     struct icode *ic, struct block *p)
2127 {
2128 	struct bpf_insn *dst;
2129 	struct slist *src;
2130 	u_int slen;
2131 	u_int off;
2132 	u_int extrajmps;	/* number of extra jumps inserted */
2133 	struct slist **offset = NULL;
2134 
2135 	if (p == 0 || isMarked(ic, p))
2136 		return (1);
2137 	Mark(ic, p);
2138 
2139 	if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2140 		return (0);
2141 	if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2142 		return (0);
2143 
2144 	slen = slength(p->stmts);
2145 	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2146 		/* inflate length by any extra jumps */
2147 
2148 	p->offset = (int)(dst - conv_state->fstart);
2149 
2150 	/* generate offset[] for convenience  */
2151 	if (slen) {
2152 		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2153 		if (!offset) {
2154 			bpf_error(cstate, "not enough core");
2155 			/*NOTREACHED*/
2156 		}
2157 	}
2158 	src = p->stmts;
2159 	for (off = 0; off < slen && src; off++) {
2160 #if 0
2161 		printf("off=%d src=%x\n", off, src);
2162 #endif
2163 		offset[off] = src;
2164 		src = src->next;
2165 	}
2166 
2167 	off = 0;
2168 	for (src = p->stmts; src; src = src->next) {
2169 		if (src->s.code == NOP)
2170 			continue;
2171 		dst->code = (u_short)src->s.code;
2172 		dst->k = src->s.k;
2173 
2174 		/* fill block-local relative jump */
2175 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2176 #if 0
2177 			if (src->s.jt || src->s.jf) {
2178 				bpf_error(cstate, "illegal jmp destination");
2179 				/*NOTREACHED*/
2180 			}
2181 #endif
2182 			goto filled;
2183 		}
2184 		if (off == slen - 2)	/*???*/
2185 			goto filled;
2186 
2187 	    {
2188 		u_int i;
2189 		int jt, jf;
2190 		const char ljerr[] = "%s for block-local relative jump: off=%d";
2191 
2192 #if 0
2193 		printf("code=%x off=%d %x %x\n", src->s.code,
2194 			off, src->s.jt, src->s.jf);
2195 #endif
2196 
2197 		if (!src->s.jt || !src->s.jf) {
2198 			bpf_error(cstate, ljerr, "no jmp destination", off);
2199 			/*NOTREACHED*/
2200 		}
2201 
2202 		jt = jf = 0;
2203 		for (i = 0; i < slen; i++) {
2204 			if (offset[i] == src->s.jt) {
2205 				if (jt) {
2206 					bpf_error(cstate, ljerr, "multiple matches", off);
2207 					/*NOTREACHED*/
2208 				}
2209 
2210 				if (i - off - 1 >= 256) {
2211 					bpf_error(cstate, ljerr, "out-of-range jump", off);
2212 					/*NOTREACHED*/
2213 				}
2214 				dst->jt = (u_char)(i - off - 1);
2215 				jt++;
2216 			}
2217 			if (offset[i] == src->s.jf) {
2218 				if (jf) {
2219 					bpf_error(cstate, ljerr, "multiple matches", off);
2220 					/*NOTREACHED*/
2221 				}
2222 				if (i - off - 1 >= 256) {
2223 					bpf_error(cstate, ljerr, "out-of-range jump", off);
2224 					/*NOTREACHED*/
2225 				}
2226 				dst->jf = (u_char)(i - off - 1);
2227 				jf++;
2228 			}
2229 		}
2230 		if (!jt || !jf) {
2231 			bpf_error(cstate, ljerr, "no destination found", off);
2232 			/*NOTREACHED*/
2233 		}
2234 	    }
2235 filled:
2236 		++dst;
2237 		++off;
2238 	}
2239 	if (offset)
2240 		free(offset);
2241 
2242 #ifdef BDEBUG
2243 	if (dst - conv_state->fstart < NBIDS)
2244 		bids[dst - conv_state->fstart] = p->id + 1;
2245 #endif
2246 	dst->code = (u_short)p->s.code;
2247 	dst->k = p->s.k;
2248 	if (JT(p)) {
2249 		extrajmps = 0;
2250 		off = JT(p)->offset - (p->offset + slen) - 1;
2251 		if (off >= 256) {
2252 		    /* offset too large for branch, must add a jump */
2253 		    if (p->longjt == 0) {
2254 		    	/* mark this instruction and retry */
2255 			p->longjt++;
2256 			return(0);
2257 		    }
2258 		    /* branch if T to following jump */
2259 		    if (extrajmps >= 256) {
2260 			bpf_error(cstate, "too many extra jumps");
2261 			/*NOTREACHED*/
2262 		    }
2263 		    dst->jt = (u_char)extrajmps;
2264 		    extrajmps++;
2265 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2266 		    dst[extrajmps].k = off - extrajmps;
2267 		}
2268 		else
2269 		    dst->jt = (u_char)off;
2270 		off = JF(p)->offset - (p->offset + slen) - 1;
2271 		if (off >= 256) {
2272 		    /* offset too large for branch, must add a jump */
2273 		    if (p->longjf == 0) {
2274 		    	/* mark this instruction and retry */
2275 			p->longjf++;
2276 			return(0);
2277 		    }
2278 		    /* branch if F to following jump */
2279 		    /* if two jumps are inserted, F goes to second one */
2280 		    if (extrajmps >= 256) {
2281 			bpf_error(cstate, "too many extra jumps");
2282 			/*NOTREACHED*/
2283 		    }
2284 		    dst->jf = (u_char)extrajmps;
2285 		    extrajmps++;
2286 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2287 		    dst[extrajmps].k = off - extrajmps;
2288 		}
2289 		else
2290 		    dst->jf = (u_char)off;
2291 	}
2292 	return (1);
2293 }
2294 
2295 
2296 /*
2297  * Convert flowgraph intermediate representation to the
2298  * BPF array representation.  Set *lenp to the number of instructions.
2299  *
2300  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2301  * not* do free(fp) before returning fp; doing so would make no sense,
2302  * as the BPF array pointed to by the return value of icode_to_fcode()
2303  * must be valid - it's being returned for use in a bpf_program structure.
2304  *
2305  * If it appears that icode_to_fcode() is leaking, the problem is that
2306  * the program using pcap_compile() is failing to free the memory in
2307  * the BPF program when it's done - the leak is in the program, not in
2308  * the routine that happens to be allocating the memory.  (By analogy, if
2309  * a program calls fopen() without ever calling fclose() on the FILE *,
2310  * it will leak the FILE structure; the leak is not in fopen(), it's in
2311  * the program.)  Change the program to use pcap_freecode() when it's
2312  * done with the filter program.  See the pcap man page.
2313  */
2314 struct bpf_insn *
2315 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2316     struct block *root, u_int *lenp)
2317 {
2318 	u_int n;
2319 	struct bpf_insn *fp;
2320 	conv_state_t conv_state;
2321 
2322 	/*
2323 	 * Loop doing convert_code_r() until no branches remain
2324 	 * with too-large offsets.
2325 	 */
2326 	for (;;) {
2327 	    unMarkAll(ic);
2328 	    n = *lenp = count_stmts(ic, root);
2329 
2330 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2331 	    if (fp == NULL)
2332 		    bpf_error(cstate, "malloc");
2333 	    memset((char *)fp, 0, sizeof(*fp) * n);
2334 	    conv_state.fstart = fp;
2335 	    conv_state.ftail = fp + n;
2336 
2337 	    unMarkAll(ic);
2338 	    if (convert_code_r(cstate, &conv_state, ic, root))
2339 		break;
2340 	    free(fp);
2341 	}
2342 
2343 	return fp;
2344 }
2345 
2346 /*
2347  * Make a copy of a BPF program and put it in the "fcode" member of
2348  * a "pcap_t".
2349  *
2350  * If we fail to allocate memory for the copy, fill in the "errbuf"
2351  * member of the "pcap_t" with an error message, and return -1;
2352  * otherwise, return 0.
2353  */
2354 int
2355 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2356 {
2357 	size_t prog_size;
2358 
2359 	/*
2360 	 * Validate the program.
2361 	 */
2362 	if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
2363 		pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2364 			"BPF program is not valid");
2365 		return (-1);
2366 	}
2367 
2368 	/*
2369 	 * Free up any already installed program.
2370 	 */
2371 	pcap_freecode(&p->fcode);
2372 
2373 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2374 	p->fcode.bf_len = fp->bf_len;
2375 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2376 	if (p->fcode.bf_insns == NULL) {
2377 		pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2378 		    errno, "malloc");
2379 		return (-1);
2380 	}
2381 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2382 	return (0);
2383 }
2384 
2385 #ifdef BDEBUG
2386 static void
2387 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2388     FILE *out)
2389 {
2390 	int icount, noffset;
2391 	int i;
2392 
2393 	if (block == NULL || isMarked(ic, block))
2394 		return;
2395 	Mark(ic, block);
2396 
2397 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2398 	noffset = min(block->offset + icount, (int)prog->bf_len);
2399 
2400 	fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2401 	for (i = block->offset; i < noffset; i++) {
2402 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2403 	}
2404 	fprintf(out, "\" tooltip=\"");
2405 	for (i = 0; i < BPF_MEMWORDS; i++)
2406 		if (block->val[i] != VAL_UNKNOWN)
2407 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
2408 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2409 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2410 	fprintf(out, "\"");
2411 	if (JT(block) == NULL)
2412 		fprintf(out, ", peripheries=2");
2413 	fprintf(out, "];\n");
2414 
2415 	dot_dump_node(ic, JT(block), prog, out);
2416 	dot_dump_node(ic, JF(block), prog, out);
2417 }
2418 
2419 static void
2420 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2421 {
2422 	if (block == NULL || isMarked(ic, block))
2423 		return;
2424 	Mark(ic, block);
2425 
2426 	if (JT(block)) {
2427 		fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2428 				block->id, JT(block)->id);
2429 		fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2430 			   block->id, JF(block)->id);
2431 	}
2432 	dot_dump_edge(ic, JT(block), out);
2433 	dot_dump_edge(ic, JF(block), out);
2434 }
2435 
2436 /* Output the block CFG using graphviz/DOT language
2437  * In the CFG, block's code, value index for each registers at EXIT,
2438  * and the jump relationship is show.
2439  *
2440  * example DOT for BPF `ip src host 1.1.1.1' is:
2441     digraph BPF {
2442     	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
2443     	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
2444     	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2445     	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2446     	"block0":se -> "block1":n [label="T"];
2447     	"block0":sw -> "block3":n [label="F"];
2448     	"block1":se -> "block2":n [label="T"];
2449     	"block1":sw -> "block3":n [label="F"];
2450     }
2451  *
2452  *  After install graphviz on http://www.graphviz.org/, save it as bpf.dot
2453  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2454  */
2455 static void
2456 dot_dump(compiler_state_t *cstate, struct icode *ic)
2457 {
2458 	struct bpf_program f;
2459 	FILE *out = stdout;
2460 
2461 	memset(bids, 0, sizeof bids);
2462 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2463 
2464 	fprintf(out, "digraph BPF {\n");
2465 	unMarkAll(ic);
2466 	dot_dump_node(ic, ic->root, &f, out);
2467 	unMarkAll(ic);
2468 	dot_dump_edge(ic, ic->root, out);
2469 	fprintf(out, "}\n");
2470 
2471 	free((char *)f.bf_insns);
2472 }
2473 
2474 static void
2475 plain_dump(compiler_state_t *cstate, struct icode *ic)
2476 {
2477 	struct bpf_program f;
2478 
2479 	memset(bids, 0, sizeof bids);
2480 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2481 	bpf_dump(&f, 1);
2482 	putchar('\n');
2483 	free((char *)f.bf_insns);
2484 }
2485 
2486 static void
2487 opt_dump(compiler_state_t *cstate, struct icode *ic)
2488 {
2489 	/*
2490 	 * If the CFG, in DOT format, is requested, output it rather than
2491 	 * the code that would be generated from that graph.
2492 	 */
2493 	if (pcap_print_dot_graph)
2494 		dot_dump(cstate, ic);
2495 	else
2496 		plain_dump(cstate, ic);
2497 }
2498 #endif
2499