1 /*
2 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that: (1) source code distributions
7 * retain the above copyright notice and this paragraph in its entirety, (2)
8 * distributions including binary code include the above copyright notice and
9 * this paragraph in its entirety in the documentation or other materials
10 * provided with the distribution, and (3) all advertising materials mentioning
11 * features or use of this software display the following acknowledgement:
12 * ``This product includes software developed by the University of California,
13 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
14 * the University nor the names of its contributors may be used to endorse
15 * or promote products derived from this software without specific prior
16 * written permission.
17 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20 *
21 * Optimization module for BPF code intermediate representation.
22 */
23
24 #include <config.h>
25
26 #include <pcap-types.h>
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <memory.h>
31 #include <setjmp.h>
32 #include <string.h>
33 #include <limits.h> /* for SIZE_MAX */
34 #include <errno.h>
35
36 #include "pcap-int.h"
37
38 #include "gencode.h"
39 #include "optimize.h"
40 #include "diag-control.h"
41
42 #ifdef HAVE_OS_PROTO_H
43 #include "os-proto.h"
44 #endif
45
46 #ifdef BDEBUG
47 /*
48 * The internal "debug printout" flag for the filter expression optimizer.
49 * The code to print that stuff is present only if BDEBUG is defined, so
50 * the flag, and the routine to set it, are defined only if BDEBUG is
51 * defined.
52 */
53 static int pcap_optimizer_debug;
54
55 /*
56 * Routine to set that flag.
57 *
58 * This is intended for libpcap developers, not for general use.
59 * If you want to set these in a program, you'll have to declare this
60 * routine yourself, with the appropriate DLL import attribute on Windows;
61 * it's not declared in any header file, and won't be declared in any
62 * header file provided by libpcap.
63 */
64 PCAP_API void pcap_set_optimizer_debug(int value);
65
66 PCAP_API_DEF void
67 pcap_set_optimizer_debug(int value)
68 {
69 pcap_optimizer_debug = value;
70 }
71
72 /*
73 * The internal "print dot graph" flag for the filter expression optimizer.
74 * The code to print that stuff is present only if BDEBUG is defined, so
75 * the flag, and the routine to set it, are defined only if BDEBUG is
76 * defined.
77 */
78 static int pcap_print_dot_graph;
79
80 /*
81 * Routine to set that flag.
82 *
83 * This is intended for libpcap developers, not for general use.
84 * If you want to set these in a program, you'll have to declare this
85 * routine yourself, with the appropriate DLL import attribute on Windows;
86 * it's not declared in any header file, and won't be declared in any
87 * header file provided by libpcap.
88 */
89 PCAP_API void pcap_set_print_dot_graph(int value);
90
91 PCAP_API_DEF void
92 pcap_set_print_dot_graph(int value)
93 {
94 pcap_print_dot_graph = value;
95 }
96
97 #endif
98
99 /*
100 * lowest_set_bit().
101 *
102 * Takes a 32-bit integer as an argument.
103 *
104 * If handed a non-zero value, returns the index of the lowest set bit,
105 * counting upwards from zero.
106 *
107 * If handed zero, the results are platform- and compiler-dependent.
108 * Keep it out of the light, don't give it any water, don't feed it
109 * after midnight, and don't pass zero to it.
110 *
111 * This is the same as the count of trailing zeroes in the word.
112 */
113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
114 /*
115 * GCC 3.4 and later; we have __builtin_ctz().
116 */
117 #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
118 #elif defined(_MSC_VER)
119 /*
120 * Visual Studio; we support only 2005 and later, so use
121 * _BitScanForward().
122 */
123 #include <intrin.h>
124
125 #ifndef __clang__
126 #pragma intrinsic(_BitScanForward)
127 #endif
128
129 static __forceinline u_int
130 lowest_set_bit(int mask)
131 {
132 unsigned long bit;
133
134 /*
135 * Don't sign-extend mask if long is longer than int.
136 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
137 */
138 if (_BitScanForward(&bit, (unsigned int)mask) == 0)
139 abort(); /* mask is zero */
140 return (u_int)bit;
141 }
142 #elif defined(MSDOS) && defined(__DJGPP__)
143 /*
144 * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
145 * we've already included.
146 */
147 #define lowest_set_bit(mask) ((u_int)(ffs((mask)) - 1))
148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
149 /*
150 * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
151 * or some other platform (UN*X conforming to a sufficiently recent version
152 * of the Single UNIX Specification).
153 */
154 #include <strings.h>
155 #define lowest_set_bit(mask) (u_int)((ffs((mask)) - 1))
156 #else
157 /*
158 * None of the above.
159 * Use a perfect-hash-function-based function.
160 */
161 static u_int
162 lowest_set_bit(int mask)
163 {
164 unsigned int v = (unsigned int)mask;
165
166 static const u_int MultiplyDeBruijnBitPosition[32] = {
167 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
168 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
169 };
170
171 /*
172 * We strip off all but the lowermost set bit (v & -v),
173 * and perform a minimal perfect hash on it to look up the
174 * number of low-order zero bits in a table.
175 *
176 * See:
177 *
178 * http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
179 *
180 * http://supertech.csail.mit.edu/papers/debruijn.pdf
181 */
182 return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
183 }
184 #endif
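/*
 * For example, lowest_set_bit(0x00000001) is 0 and lowest_set_bit(0x00000018)
 * is 3, whichever of the implementations above is selected.
 */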
185
186 /*
187 * Represents a deleted instruction.
188 */
189 #define NOP -1
190
191 /*
192 * Register numbers for use-def values.
193 * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
194 * location. A_ATOM is the accumulator and X_ATOM is the index
195 * register.
196 */
197 #define A_ATOM BPF_MEMWORDS
198 #define X_ATOM (BPF_MEMWORDS+1)
199
200 /*
201 * This define is used to represent *both* the accumulator and
202 * x register in use-def computations.
203 * Currently, the use-def code assumes only one definition per instruction.
204 */
205 #define AX_ATOM N_ATOMS
206
207 /*
208 * These data structures are used in a Cocke and Schwartz style
209 * value numbering scheme. Since the flowgraph is acyclic,
210 * exit values can be propagated from a node's predecessors
211 * provided it is uniquely defined.
212 */
213 struct valnode {
214 int code;
215 bpf_u_int32 v0, v1;
216 int val; /* the value number */
217 struct valnode *next;
218 };
219
220 /* Integer constants mapped with the load immediate opcode. */
221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
222
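/*
 * Per-value-number bookkeeping: vmap[] is indexed by value number and
 * records whether that value is a known constant and, if so, what the
 * constant is.
 */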
223 struct vmapinfo {
224 int is_const;
225 bpf_u_int32 const_val;
226 };
227
228 typedef struct {
229 /*
230 * Place to longjmp to on an error.
231 */
232 jmp_buf top_ctx;
233
234 /*
235 * The buffer into which to put the error message.
236 */
237 char *errbuf;
238
239 /*
240 * A flag to indicate that further optimization is needed.
241 * Iterative passes are continued until a given pass yields no
242 * code simplification or branch movement.
243 */
244 int done;
245
246 /*
247 * XXX - detect loops that do nothing but repeated AND/OR pullups
248 * and edge moves.
249 * If 100 passes in a row do nothing but that, treat that as a
250 * sign that we're stuck in a cycle in which each pass just
251 * shuffles the code around and we eventually get back to the
252 * original configuration.
253 *
254 * XXX - we need a non-heuristic way of detecting, or preventing,
255 * such a cycle.
256 */
257 int non_branch_movement_performed;
258
259 u_int n_blocks; /* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
260 struct block **blocks;
261 u_int n_edges; /* twice n_blocks, so guaranteed to be > 0 */
262 struct edge **edges;
263
264 /*
265 * A bit vector set representation of the dominators.
266 * We round up the set size to the next power of two.
267 */
268 u_int nodewords; /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
269 u_int edgewords; /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
270 struct block **levels;
271 bpf_u_int32 *space;
272
273 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
274 /*
275 * True if a is in uset {p}
276 */
277 #define SET_MEMBER(p, a) \
278 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
279
280 /*
281 * Add 'a' to uset p.
282 */
283 #define SET_INSERT(p, a) \
284 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
285
286 /*
287 * Delete 'a' from uset p.
288 */
289 #define SET_DELETE(p, a) \
290 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
291
292 /*
293 * a := a intersect b
294 * n must be guaranteed to be > 0
295 */
296 #define SET_INTERSECT(a, b, n)\
297 {\
298 register bpf_u_int32 *_x = a, *_y = b;\
299 register u_int _n = n;\
300 do *_x++ &= *_y++; while (--_n != 0);\
301 }
302
303 /*
304 * a := a - b
305 * n must be guaranteed to be > 0
306 */
307 #define SET_SUBTRACT(a, b, n)\
308 {\
309 register bpf_u_int32 *_x = a, *_y = b;\
310 register u_int _n = n;\
311 do *_x++ &=~ *_y++; while (--_n != 0);\
312 }
313
314 /*
315 * a := a union b
316 * n must be guaranteed to be > 0
317 */
318 #define SET_UNION(a, b, n)\
319 {\
320 register bpf_u_int32 *_x = a, *_y = b;\
321 register u_int _n = n;\
322 do *_x++ |= *_y++; while (--_n != 0);\
323 }
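/*
 * For example, find_dom() below uses
 * SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords) to narrow a
 * successor's dominator set down to the blocks that also dominate 'b'.
 */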
324
325 uset all_dom_sets;
326 uset all_closure_sets;
327 uset all_edge_sets;
328
329 #define MODULUS 213
330 struct valnode *hashtbl[MODULUS];
331 bpf_u_int32 curval;
332 bpf_u_int32 maxval;
333
334 struct vmapinfo *vmap;
335 struct valnode *vnode_base;
336 struct valnode *next_vnode;
337 } opt_state_t;
338
339 typedef struct {
340 /*
341 * Place to longjmp to on an error.
342 */
343 jmp_buf top_ctx;
344
345 /*
346 * The buffer into which to put the error message.
347 */
348 char *errbuf;
349
350 /*
351 * Some pointers used to convert the basic block form of the code,
352 * into the array form that BPF requires. 'fstart' will point to
353 * the malloc'd array while 'ftail' is used during the recursive
354 * traversal.
355 */
356 struct bpf_insn *fstart;
357 struct bpf_insn *ftail;
358 } conv_state_t;
359
360 static void opt_init(opt_state_t *, struct icode *);
361 static void opt_cleanup(opt_state_t *);
362 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
363 PCAP_PRINTFLIKE(2, 3);
364
365 static void intern_blocks(opt_state_t *, struct icode *);
366
367 static void find_inedges(opt_state_t *, struct block *);
368 #ifdef BDEBUG
369 static void opt_dump(opt_state_t *, struct icode *);
370 #endif
371
372 #ifndef MAX
373 #define MAX(a,b) ((a)>(b)?(a):(b))
374 #endif
375
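/*
 * Recursive helper for find_levels(): assign each block a level, with
 * leaves (return blocks) at level 0 and each branch block one more than
 * the deeper of its two successors, and chain blocks of equal level
 * through their 'link' fields.
 */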
376 static void
377 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
378 {
379 int level;
380
381 if (isMarked(ic, b))
382 return;
383
384 Mark(ic, b);
385 b->link = 0;
386
387 if (JT(b)) {
388 find_levels_r(opt_state, ic, JT(b));
389 find_levels_r(opt_state, ic, JF(b));
390 level = MAX(JT(b)->level, JF(b)->level) + 1;
391 } else
392 level = 0;
393 b->level = level;
394 b->link = opt_state->levels[level];
395 opt_state->levels[level] = b;
396 }
397
398 /*
399 * Level graph. The levels go from 0 at the leaves to
400 * N_LEVELS at the root. The opt_state->levels[] array points to the
401 * first node of the level list, whose elements are linked
402 * with the 'link' field of the struct block.
403 */
404 static void
405 find_levels(opt_state_t *opt_state, struct icode *ic)
406 {
407 memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
408 unMarkAll(ic);
409 find_levels_r(opt_state, ic, ic->root);
410 }
411
412 /*
413 * Find dominator relationships.
414 * Assumes graph has been leveled.
415 */
416 static void
417 find_dom(opt_state_t *opt_state, struct block *root)
418 {
419 u_int i;
420 int level;
421 struct block *b;
422 bpf_u_int32 *x;
423
424 /*
425 * Initialize sets to contain all nodes.
426 */
427 x = opt_state->all_dom_sets;
428 /*
429 * In opt_init(), we've made sure the product doesn't overflow.
430 */
431 i = opt_state->n_blocks * opt_state->nodewords;
432 while (i != 0) {
433 --i;
434 *x++ = 0xFFFFFFFFU;
435 }
436 /* Root starts off empty. */
437 for (i = opt_state->nodewords; i != 0;) {
438 --i;
439 root->dom[i] = 0;
440 }
441
442 /* root->level is the highest level number found. */
443 for (level = root->level; level >= 0; --level) {
444 for (b = opt_state->levels[level]; b; b = b->link) {
445 SET_INSERT(b->dom, b->id);
446 if (JT(b) == 0)
447 continue;
448 SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
449 SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
450 }
451 }
452 }
453
454 static void
455 propedom(opt_state_t *opt_state, struct edge *ep)
456 {
457 SET_INSERT(ep->edom, ep->id);
458 if (ep->succ) {
459 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
460 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
461 }
462 }
463
464 /*
465 * Compute edge dominators.
466 * Assumes graph has been leveled and predecessors established.
467 */
468 static void
469 find_edom(opt_state_t *opt_state, struct block *root)
470 {
471 u_int i;
472 uset x;
473 int level;
474 struct block *b;
475
476 x = opt_state->all_edge_sets;
477 /*
478 * In opt_init(), we've made sure the product doesn't overflow.
479 */
480 for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
481 --i;
482 x[i] = 0xFFFFFFFFU;
483 }
484
485 /* root->level is the highest level number found. */
486 memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
487 memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
488 for (level = root->level; level >= 0; --level) {
489 for (b = opt_state->levels[level]; b != 0; b = b->link) {
490 propedom(opt_state, &b->et);
491 propedom(opt_state, &b->ef);
492 }
493 }
494 }
495
496 /*
497 * Find the backwards transitive closure of the flow graph. These sets
498 * are backwards in the sense that we find the set of nodes that reach
499 * a given node, not the set of nodes that can be reached by a node.
500 *
501 * Assumes graph has been leveled.
502 */
503 static void
504 find_closure(opt_state_t *opt_state, struct block *root)
505 {
506 int level;
507 struct block *b;
508
509 /*
510 * Initialize sets to contain no nodes.
511 */
512 memset((char *)opt_state->all_closure_sets, 0,
513 opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
514
515 /* root->level is the highest level number found. */
516 for (level = root->level; level >= 0; --level) {
517 for (b = opt_state->levels[level]; b; b = b->link) {
518 SET_INSERT(b->closure, b->id);
519 if (JT(b) == 0)
520 continue;
521 SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
522 SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
523 }
524 }
525 }
526
527 /*
528 * Return the register number that is used by s.
529 *
530 * Returns A_ATOM if A is used, X_ATOM if X is used, AX_ATOM if both A and X
531 * are used, the scratch memory location's number if a scratch memory
532 * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
533 *
534 * The implementation should probably change to an array access.
535 */
536 static int
537 atomuse(struct stmt *s)
538 {
539 register int c = s->code;
540
541 if (c == NOP)
542 return -1;
543
544 switch (BPF_CLASS(c)) {
545
546 case BPF_RET:
547 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
548 (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
549
550 case BPF_LD:
551 case BPF_LDX:
552 /*
553 * As there are fewer than 2^31 memory locations,
554 * s->k should be convertible to int without problems.
555 */
556 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
557 (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
558
559 case BPF_ST:
560 return A_ATOM;
561
562 case BPF_STX:
563 return X_ATOM;
564
565 case BPF_JMP:
566 case BPF_ALU:
567 if (BPF_SRC(c) == BPF_X)
568 return AX_ATOM;
569 return A_ATOM;
570
571 case BPF_MISC:
572 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
573 }
574 abort();
575 /* NOTREACHED */
576 }
577
578 /*
579 * Return the register number that is defined by 's'. We assume that
580 * a single stmt cannot define more than one register. If no register
581 * is defined, return -1.
582 *
583 * The implementation should probably change to an array access.
584 */
585 static int
586 atomdef(struct stmt *s)
587 {
588 if (s->code == NOP)
589 return -1;
590
591 switch (BPF_CLASS(s->code)) {
592
593 case BPF_LD:
594 case BPF_ALU:
595 return A_ATOM;
596
597 case BPF_LDX:
598 return X_ATOM;
599
600 case BPF_ST:
601 case BPF_STX:
602 return s->k;
603
604 case BPF_MISC:
605 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
606 }
607 return -1;
608 }
609
610 /*
611 * Compute the sets of registers used, defined, and killed by 'b'.
612 *
613 * "Used" means that a statement in 'b' uses the register before any
614 * statement in 'b' defines it, i.e. it uses the value left in
615 * that register by a predecessor block of this block.
616 * "Defined" means that a statement in 'b' defines it.
617 * "Killed" means that a statement in 'b' defines it before any
618 * statement in 'b' uses it, i.e. it kills the value left in that
619 * register by a predecessor block of this block.
620 */
621 static void
622 compute_local_ud(struct block *b)
623 {
624 struct slist *s;
625 atomset def = 0, use = 0, killed = 0;
626 int atom;
627
628 for (s = b->stmts; s; s = s->next) {
629 if (s->s.code == NOP)
630 continue;
631 atom = atomuse(&s->s);
632 if (atom >= 0) {
633 if (atom == AX_ATOM) {
634 if (!ATOMELEM(def, X_ATOM))
635 use |= ATOMMASK(X_ATOM);
636 if (!ATOMELEM(def, A_ATOM))
637 use |= ATOMMASK(A_ATOM);
638 }
639 else if (atom < N_ATOMS) {
640 if (!ATOMELEM(def, atom))
641 use |= ATOMMASK(atom);
642 }
643 else
644 abort();
645 }
646 atom = atomdef(&s->s);
647 if (atom >= 0) {
648 if (!ATOMELEM(use, atom))
649 killed |= ATOMMASK(atom);
650 def |= ATOMMASK(atom);
651 }
652 }
653 if (BPF_CLASS(b->s.code) == BPF_JMP) {
654 /*
655 * XXX - what about RET?
656 */
657 atom = atomuse(&b->s);
658 if (atom >= 0) {
659 if (atom == AX_ATOM) {
660 if (!ATOMELEM(def, X_ATOM))
661 use |= ATOMMASK(X_ATOM);
662 if (!ATOMELEM(def, A_ATOM))
663 use |= ATOMMASK(A_ATOM);
664 }
665 else if (atom < N_ATOMS) {
666 if (!ATOMELEM(def, atom))
667 use |= ATOMMASK(atom);
668 }
669 else
670 abort();
671 }
672 }
673
674 b->def = def;
675 b->kill = killed;
676 b->in_use = use;
677 }
678
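/*
 * Compute use/def information for the whole graph, bottom-up: a block's
 * out_use is the union of its successors' in_use sets, and anything live
 * on exit that the block doesn't kill is also live on entry.
 */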
679 /*
680 * Assume graph is already leveled.
681 */
682 static void
683 find_ud(opt_state_t *opt_state, struct block *root)
684 {
685 int i, maxlevel;
686 struct block *p;
687
688 /*
689 * root->level is the highest level number found;
690 * count down from there.
691 */
692 maxlevel = root->level;
693 for (i = maxlevel; i >= 0; --i)
694 for (p = opt_state->levels[i]; p; p = p->link) {
695 compute_local_ud(p);
696 p->out_use = 0;
697 }
698
699 for (i = 1; i <= maxlevel; ++i) {
700 for (p = opt_state->levels[i]; p; p = p->link) {
701 p->out_use |= JT(p)->in_use | JF(p)->in_use;
702 p->in_use |= p->out_use &~ p->kill;
703 }
704 }
705 }
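/*
 * Reset the value-numbering state for a new pass: value numbers restart
 * from 0 (VAL_UNKNOWN) and the hash table of known (code, v0, v1)
 * triples is emptied.
 */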
706 static void
707 init_val(opt_state_t *opt_state)
708 {
709 opt_state->curval = 0;
710 opt_state->next_vnode = opt_state->vnode_base;
711 memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
712 memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
713 }
714
715 /*
716 * Because we really don't have an IR, this stuff is a little messy.
717 *
718 * This routine looks in the table of existing value numbers for a value
719 * generated from an operation with the specified opcode and
720 * the specified values. If it finds it, it returns its value number,
721 * otherwise it makes a new entry in the table and returns the
722 * value number of that entry.
723 */
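/*
 * For example, every "ld #20" in the program maps to the same
 * (BPF_LD|BPF_IMM|BPF_W, 20, 0) triple and thus shares a single value
 * number, which is what lets later passes recognize those loads as
 * computing the same value.
 */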
724 static bpf_u_int32
725 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
726 {
727 u_int hash;
728 bpf_u_int32 val;
729 struct valnode *p;
730
731 hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
732 hash %= MODULUS;
733
734 for (p = opt_state->hashtbl[hash]; p; p = p->next)
735 if (p->code == code && p->v0 == v0 && p->v1 == v1)
736 return p->val;
737
738 /*
739 * Not found. Allocate a new value, and assign it a new
740 * value number.
741 *
742 * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
743 * increment it before using it as the new value number, which
744 * means we never assign VAL_UNKNOWN.
745 *
746 * XXX - unless we overflow, but we probably won't have 2^32-1
747 * values; we treat 32 bits as effectively infinite.
748 */
749 val = ++opt_state->curval;
750 if (BPF_MODE(code) == BPF_IMM &&
751 (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
752 opt_state->vmap[val].const_val = v0;
753 opt_state->vmap[val].is_const = 1;
754 }
755 p = opt_state->next_vnode++;
756 p->val = val;
757 p->code = code;
758 p->v0 = v0;
759 p->v1 = v1;
760 p->next = opt_state->hashtbl[hash];
761 opt_state->hashtbl[hash] = p;
762
763 return val;
764 }
765
766 static inline void
767 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
768 {
769 if (alter && newval != VAL_UNKNOWN && *valp == newval)
770 s->code = NOP;
771 else
772 *valp = newval;
773 }
774
775 /*
776 * Do constant-folding on binary operators.
777 * (Unary operators are handled elsewhere.)
778 */
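/*
 * For example, if the accumulator is known to hold 6 and the statement
 * is "mul #4", the statement is rewritten as "ld #24".
 */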
779 static void
780 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
781 {
782 bpf_u_int32 a, b;
783
784 a = opt_state->vmap[v0].const_val;
785 b = opt_state->vmap[v1].const_val;
786
787 switch (BPF_OP(s->code)) {
788 case BPF_ADD:
789 a += b;
790 break;
791
792 case BPF_SUB:
793 a -= b;
794 break;
795
796 case BPF_MUL:
797 a *= b;
798 break;
799
800 case BPF_DIV:
801 if (b == 0)
802 opt_error(opt_state, "division by zero");
803 a /= b;
804 break;
805
806 case BPF_MOD:
807 if (b == 0)
808 opt_error(opt_state, "modulus by zero");
809 a %= b;
810 break;
811
812 case BPF_AND:
813 a &= b;
814 break;
815
816 case BPF_OR:
817 a |= b;
818 break;
819
820 case BPF_XOR:
821 a ^= b;
822 break;
823
824 case BPF_LSH:
825 /*
826 * A left shift of more than the width of the type
827 * is undefined in C; we'll just treat it as shifting
828 * all the bits out.
829 *
830 * XXX - the BPF interpreter doesn't check for this,
831 * so its behavior is dependent on the behavior of
832 * the processor on which it's running. There are
833 * processors on which it shifts all the bits out
834 * and processors on which it does no shift.
835 */
836 if (b < 32)
837 a <<= b;
838 else
839 a = 0;
840 break;
841
842 case BPF_RSH:
843 /*
844 * A right shift of more than the width of the type
845 * is undefined in C; we'll just treat it as shifting
846 * all the bits out.
847 *
848 * XXX - the BPF interpreter doesn't check for this,
849 * so its behavior is dependent on the behavior of
850 * the processor on which it's running. There are
851 * processors on which it shifts all the bits out
852 * and processors on which it does no shift.
853 */
854 if (b < 32)
855 a >>= b;
856 else
857 a = 0;
858 break;
859
860 default:
861 abort();
862 }
863 s->k = a;
864 s->code = BPF_LD|BPF_IMM;
865 /*
866 * XXX - optimizer loop detection.
867 */
868 opt_state->non_branch_movement_performed = 1;
869 opt_state->done = 0;
870 }
871
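/*
 * Skip over NOPs starting at 's' and return the next real instruction,
 * or NULL if there are none left.
 */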
872 static inline struct slist *
873 this_op(struct slist *s)
874 {
875 while (s != 0 && s->s.code == NOP)
876 s = s->next;
877 return s;
878 }
879
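/*
 * Invert the sense of a block's branch by swapping its true and false
 * successors.
 */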
880 static void
881 opt_not(struct block *b)
882 {
883 struct block *tmp = JT(b);
884
885 JT(b) = JF(b);
886 JF(b) = tmp;
887 }
888
889 static void
890 opt_peep(opt_state_t *opt_state, struct block *b)
891 {
892 struct slist *s;
893 struct slist *next, *last;
894 bpf_u_int32 val;
895
896 s = b->stmts;
897 if (s == 0)
898 return;
899
900 last = s;
901 for (/*empty*/; /*empty*/; s = next) {
902 /*
903 * Skip over nops.
904 */
905 s = this_op(s);
906 if (s == 0)
907 break; /* nothing left in the block */
908
909 /*
910 * Find the next real instruction after that one
911 * (skipping nops).
912 */
913 next = this_op(s->next);
914 if (next == 0)
915 break; /* no next instruction */
916 last = next;
917
918 /*
919 * st M[k] --> st M[k]
920 * ldx M[k] tax
921 */
922 if (s->s.code == BPF_ST &&
923 next->s.code == (BPF_LDX|BPF_MEM) &&
924 s->s.k == next->s.k) {
925 /*
926 * XXX - optimizer loop detection.
927 */
928 opt_state->non_branch_movement_performed = 1;
929 opt_state->done = 0;
930 next->s.code = BPF_MISC|BPF_TAX;
931 }
932 /*
933 * ld #k --> ldx #k
934 * tax txa
935 */
936 if (s->s.code == (BPF_LD|BPF_IMM) &&
937 next->s.code == (BPF_MISC|BPF_TAX)) {
938 s->s.code = BPF_LDX|BPF_IMM;
939 next->s.code = BPF_MISC|BPF_TXA;
940 /*
941 * XXX - optimizer loop detection.
942 */
943 opt_state->non_branch_movement_performed = 1;
944 opt_state->done = 0;
945 }
946 /*
947 * This is an ugly special case, but it happens
948 * when you say tcp[k] or udp[k] where k is a constant.
949 */
950 if (s->s.code == (BPF_LD|BPF_IMM)) {
951 struct slist *add, *tax, *ild;
952
953 /*
954 * Check that X isn't used on exit from this
955 * block (which the optimizer might cause).
956 * We know the code generator won't generate
957 * any local dependencies.
958 */
959 if (ATOMELEM(b->out_use, X_ATOM))
960 continue;
961
962 /*
963 * Check that the instruction following the ldi
964 * is an addx, or it's an ldxms with an addx
965 * following it (with 0 or more nops between the
966 * ldxms and addx).
967 */
968 if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
969 add = next;
970 else
971 add = this_op(next->next);
972 if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
973 continue;
974
975 /*
976 * Check that a tax follows that (with 0 or more
977 * nops between them).
978 */
979 tax = this_op(add->next);
980 if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
981 continue;
982
983 /*
984 * Check that an ild follows that (with 0 or more
985 * nops between them).
986 */
987 ild = this_op(tax->next);
988 if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
989 BPF_MODE(ild->s.code) != BPF_IND)
990 continue;
991 /*
992 * We want to turn this sequence:
993 *
994 * (004) ldi #0x2 {s}
995 * (005) ldxms [14] {next} -- optional
996 * (006) addx {add}
997 * (007) tax {tax}
998 * (008) ild [x+0] {ild}
999 *
1000 * into this sequence:
1001 *
1002 * (004) nop
1003 * (005) ldxms [14]
1004 * (006) nop
1005 * (007) nop
1006 * (008) ild [x+2]
1007 *
1008 * XXX We need to check that X is not
1009 * subsequently used, because we want to change
1010 * what'll be in it after this sequence.
1011 *
1012 * We know we can eliminate the accumulator
1013 * modifications earlier in the sequence since
1014 * it is defined by the last stmt of this sequence
1015 * (i.e., the last statement of the sequence loads
1016 * a value into the accumulator, so we can eliminate
1017 * earlier operations on the accumulator).
1018 */
1019 ild->s.k += s->s.k;
1020 s->s.code = NOP;
1021 add->s.code = NOP;
1022 tax->s.code = NOP;
1023 /*
1024 * XXX - optimizer loop detection.
1025 */
1026 opt_state->non_branch_movement_performed = 1;
1027 opt_state->done = 0;
1028 }
1029 }
1030 /*
1031 * If the comparison at the end of a block is an equality
1032 * comparison against a constant, and nobody uses the value
1033 * we leave in the A register at the end of a block, and
1034 * the operation preceding the comparison is an arithmetic
1035 * operation, we can sometimes optimize it away.
1036 */
1037 if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1038 !ATOMELEM(b->out_use, A_ATOM)) {
1039 /*
1040 * We can optimize away certain subtractions of the
1041 * X register.
1042 */
1043 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1044 val = b->val[X_ATOM];
1045 if (opt_state->vmap[val].is_const) {
1046 /*
1047 * If we have a subtract to do a comparison,
1048 * and the X register is a known constant,
1049 * we can merge this value into the
1050 * comparison:
1051 *
1052 * sub x -> nop
1053 * jeq #y jeq #(x+y)
1054 */
1055 b->s.k += opt_state->vmap[val].const_val;
1056 last->s.code = NOP;
1057 /*
1058 * XXX - optimizer loop detection.
1059 */
1060 opt_state->non_branch_movement_performed = 1;
1061 opt_state->done = 0;
1062 } else if (b->s.k == 0) {
1063 /*
1064 * If the X register isn't a constant,
1065 * and the comparison in the test is
1066 * against 0, we can compare with the
1067 * X register, instead:
1068 *
1069 * sub x -> nop
1070 * jeq #0 jeq x
1071 */
1072 last->s.code = NOP;
1073 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1074 /*
1075 * XXX - optimizer loop detection.
1076 */
1077 opt_state->non_branch_movement_performed = 1;
1078 opt_state->done = 0;
1079 }
1080 }
1081 /*
1082 * Likewise, a constant subtract can be simplified:
1083 *
1084 * sub #x -> nop
1085 * jeq #y -> jeq #(x+y)
1086 */
1087 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1088 last->s.code = NOP;
1089 b->s.k += last->s.k;
1090 /*
1091 * XXX - optimizer loop detection.
1092 */
1093 opt_state->non_branch_movement_performed = 1;
1094 opt_state->done = 0;
1095 }
1096 /*
1097 * And, similarly, a constant AND can be simplified
1098 * if we're testing against 0, i.e.:
1099 *
1100 * and #k nop
1101 * jeq #0 -> jset #k
1102 */
1103 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1104 b->s.k == 0) {
1105 b->s.k = last->s.k;
1106 b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1107 last->s.code = NOP;
1108 /*
1109 * XXX - optimizer loop detection.
1110 */
1111 opt_state->non_branch_movement_performed = 1;
1112 opt_state->done = 0;
1113 opt_not(b);
1114 }
1115 }
1116 /*
1117 * jset #0 -> never
1118 * jset #ffffffff -> always
1119 */
1120 if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1121 if (b->s.k == 0)
1122 JT(b) = JF(b);
1123 if (b->s.k == 0xffffffffU)
1124 JF(b) = JT(b);
1125 }
1126 /*
1127 * If we're comparing against the index register, and the index
1128 * register is a known constant, we can just compare against that
1129 * constant.
1130 */
1131 val = b->val[X_ATOM];
1132 if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1133 bpf_u_int32 v = opt_state->vmap[val].const_val;
1134 b->s.code &= ~BPF_X;
1135 b->s.k = v;
1136 }
1137 /*
1138 * If the accumulator is a known constant, we can compute the
1139 * comparison result.
1140 */
1141 val = b->val[A_ATOM];
1142 if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1143 bpf_u_int32 v = opt_state->vmap[val].const_val;
1144 switch (BPF_OP(b->s.code)) {
1145
1146 case BPF_JEQ:
1147 v = v == b->s.k;
1148 break;
1149
1150 case BPF_JGT:
1151 v = v > b->s.k;
1152 break;
1153
1154 case BPF_JGE:
1155 v = v >= b->s.k;
1156 break;
1157
1158 case BPF_JSET:
1159 v &= b->s.k;
1160 break;
1161
1162 default:
1163 abort();
1164 }
1165 if (JF(b) != JT(b)) {
1166 /*
1167 * XXX - optimizer loop detection.
1168 */
1169 opt_state->non_branch_movement_performed = 1;
1170 opt_state->done = 0;
1171 }
1172 if (v)
1173 JF(b) = JT(b);
1174 else
1175 JT(b) = JF(b);
1176 }
1177 }
1178
1179 /*
1180 * Compute the symbolic value of expression of 's', and update
1181 * anything it defines in the value table 'val'. If 'alter' is true,
1182 * do various optimizations. This code would be cleaner if symbolic
1183 * evaluation and code transformations weren't folded together.
1184 */
1185 static void
1186 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1187 {
1188 int op;
1189 bpf_u_int32 v;
1190
1191 switch (s->code) {
1192
1193 case BPF_LD|BPF_ABS|BPF_W:
1194 case BPF_LD|BPF_ABS|BPF_H:
1195 case BPF_LD|BPF_ABS|BPF_B:
1196 v = F(opt_state, s->code, s->k, 0L);
1197 vstore(s, &val[A_ATOM], v, alter);
1198 break;
1199
1200 case BPF_LD|BPF_IND|BPF_W:
1201 case BPF_LD|BPF_IND|BPF_H:
1202 case BPF_LD|BPF_IND|BPF_B:
1203 v = val[X_ATOM];
1204 if (alter && opt_state->vmap[v].is_const) {
1205 s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1206 s->k += opt_state->vmap[v].const_val;
1207 v = F(opt_state, s->code, s->k, 0L);
1208 /*
1209 * XXX - optimizer loop detection.
1210 */
1211 opt_state->non_branch_movement_performed = 1;
1212 opt_state->done = 0;
1213 }
1214 else
1215 v = F(opt_state, s->code, s->k, v);
1216 vstore(s, &val[A_ATOM], v, alter);
1217 break;
1218
1219 case BPF_LD|BPF_LEN:
1220 v = F(opt_state, s->code, 0L, 0L);
1221 vstore(s, &val[A_ATOM], v, alter);
1222 break;
1223
1224 case BPF_LD|BPF_IMM:
1225 v = K(s->k);
1226 vstore(s, &val[A_ATOM], v, alter);
1227 break;
1228
1229 case BPF_LDX|BPF_IMM:
1230 v = K(s->k);
1231 vstore(s, &val[X_ATOM], v, alter);
1232 break;
1233
1234 case BPF_LDX|BPF_MSH|BPF_B:
1235 v = F(opt_state, s->code, s->k, 0L);
1236 vstore(s, &val[X_ATOM], v, alter);
1237 break;
1238
1239 case BPF_ALU|BPF_NEG:
1240 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1241 s->code = BPF_LD|BPF_IMM;
1242 /*
1243 * Do this negation as unsigned arithmetic; that's
1244 * what modern BPF engines do, and it guarantees
1245 * that all possible values can be negated. (Yeah,
1246 * negating 0x80000000, the minimum signed 32-bit
1247 * two's-complement value, results in 0x80000000,
1248 * so it's still negative, but we *should* be doing
1249 * all unsigned arithmetic here, to match what
1250 * modern BPF engines do.)
1251 *
1252 * Express it as 0U - (unsigned value) so that we
1253 * don't get compiler warnings about negating an
1254 * unsigned value and don't get UBSan warnings
1255 * about the result of negating 0x80000000 being
1256 * undefined.
1257 */
1258 s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1259 val[A_ATOM] = K(s->k);
1260 }
1261 else
1262 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1263 break;
1264
1265 case BPF_ALU|BPF_ADD|BPF_K:
1266 case BPF_ALU|BPF_SUB|BPF_K:
1267 case BPF_ALU|BPF_MUL|BPF_K:
1268 case BPF_ALU|BPF_DIV|BPF_K:
1269 case BPF_ALU|BPF_MOD|BPF_K:
1270 case BPF_ALU|BPF_AND|BPF_K:
1271 case BPF_ALU|BPF_OR|BPF_K:
1272 case BPF_ALU|BPF_XOR|BPF_K:
1273 case BPF_ALU|BPF_LSH|BPF_K:
1274 case BPF_ALU|BPF_RSH|BPF_K:
1275 op = BPF_OP(s->code);
1276 if (alter) {
1277 if (s->k == 0) {
1278 /*
1279 * Optimize operations where the constant
1280 * is zero.
1281 *
1282 * Don't optimize away "sub #0"
1283 * as it may be needed later to
1284 * fixup the generated math code.
1285 *
1286 * Fail if we're dividing by zero or taking
1287 * a modulus by zero.
1288 */
1289 if (op == BPF_ADD ||
1290 op == BPF_LSH || op == BPF_RSH ||
1291 op == BPF_OR || op == BPF_XOR) {
1292 s->code = NOP;
1293 break;
1294 }
1295 if (op == BPF_MUL || op == BPF_AND) {
1296 s->code = BPF_LD|BPF_IMM;
1297 val[A_ATOM] = K(s->k);
1298 break;
1299 }
1300 if (op == BPF_DIV)
1301 opt_error(opt_state,
1302 "division by zero");
1303 if (op == BPF_MOD)
1304 opt_error(opt_state,
1305 "modulus by zero");
1306 }
1307 if (opt_state->vmap[val[A_ATOM]].is_const) {
1308 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1309 val[A_ATOM] = K(s->k);
1310 break;
1311 }
1312 }
1313 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1314 break;
1315
1316 case BPF_ALU|BPF_ADD|BPF_X:
1317 case BPF_ALU|BPF_SUB|BPF_X:
1318 case BPF_ALU|BPF_MUL|BPF_X:
1319 case BPF_ALU|BPF_DIV|BPF_X:
1320 case BPF_ALU|BPF_MOD|BPF_X:
1321 case BPF_ALU|BPF_AND|BPF_X:
1322 case BPF_ALU|BPF_OR|BPF_X:
1323 case BPF_ALU|BPF_XOR|BPF_X:
1324 case BPF_ALU|BPF_LSH|BPF_X:
1325 case BPF_ALU|BPF_RSH|BPF_X:
1326 op = BPF_OP(s->code);
1327 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1328 if (opt_state->vmap[val[A_ATOM]].is_const) {
1329 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1330 val[A_ATOM] = K(s->k);
1331 }
1332 else {
1333 s->code = BPF_ALU|BPF_K|op;
1334 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1335 if ((op == BPF_LSH || op == BPF_RSH) &&
1336 s->k > 31)
1337 opt_error(opt_state,
1338 "shift by more than 31 bits");
1339 /*
1340 * XXX - optimizer loop detection.
1341 */
1342 opt_state->non_branch_movement_performed = 1;
1343 opt_state->done = 0;
1344 val[A_ATOM] =
1345 F(opt_state, s->code, val[A_ATOM], K(s->k));
1346 }
1347 break;
1348 }
1349 /*
1350 * Check if we're doing something to an accumulator
1351 * that is 0, and simplify. This may not seem like
1352 * much of a simplification but it could open up further
1353 * optimizations.
1354 * XXX We could also check for mul by 1, etc.
1355 */
1356 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1357 && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1358 if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1359 s->code = BPF_MISC|BPF_TXA;
1360 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1361 break;
1362 }
1363 else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1364 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1365 s->code = BPF_LD|BPF_IMM;
1366 s->k = 0;
1367 vstore(s, &val[A_ATOM], K(s->k), alter);
1368 break;
1369 }
1370 else if (op == BPF_NEG) {
1371 s->code = NOP;
1372 break;
1373 }
1374 }
1375 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1376 break;
1377
1378 case BPF_MISC|BPF_TXA:
1379 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1380 break;
1381
1382 case BPF_LD|BPF_MEM:
1383 v = val[s->k];
1384 if (alter && opt_state->vmap[v].is_const) {
1385 s->code = BPF_LD|BPF_IMM;
1386 s->k = opt_state->vmap[v].const_val;
1387 /*
1388 * XXX - optimizer loop detection.
1389 */
1390 opt_state->non_branch_movement_performed = 1;
1391 opt_state->done = 0;
1392 }
1393 vstore(s, &val[A_ATOM], v, alter);
1394 break;
1395
1396 case BPF_MISC|BPF_TAX:
1397 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1398 break;
1399
1400 case BPF_LDX|BPF_MEM:
1401 v = val[s->k];
1402 if (alter && opt_state->vmap[v].is_const) {
1403 s->code = BPF_LDX|BPF_IMM;
1404 s->k = opt_state->vmap[v].const_val;
1405 /*
1406 * XXX - optimizer loop detection.
1407 */
1408 opt_state->non_branch_movement_performed = 1;
1409 opt_state->done = 0;
1410 }
1411 vstore(s, &val[X_ATOM], v, alter);
1412 break;
1413
1414 case BPF_ST:
1415 vstore(s, &val[s->k], val[A_ATOM], alter);
1416 break;
1417
1418 case BPF_STX:
1419 vstore(s, &val[s->k], val[X_ATOM], alter);
1420 break;
1421 }
1422 }
1423
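/*
 * Track the most recent definition of each atom while scanning a block;
 * if 's' redefines an atom whose previous definition in last[] was never
 * used, that previous definition is dead and is turned into a NOP.
 */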
1424 static void
1425 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1426 {
1427 register int atom;
1428
1429 atom = atomuse(s);
1430 if (atom >= 0) {
1431 if (atom == AX_ATOM) {
1432 last[X_ATOM] = 0;
1433 last[A_ATOM] = 0;
1434 }
1435 else
1436 last[atom] = 0;
1437 }
1438 atom = atomdef(s);
1439 if (atom >= 0) {
1440 if (last[atom]) {
1441 /*
1442 * XXX - optimizer loop detection.
1443 */
1444 opt_state->non_branch_movement_performed = 1;
1445 opt_state->done = 0;
1446 last[atom]->code = NOP;
1447 }
1448 last[atom] = s;
1449 }
1450 }
1451
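/*
 * Eliminate dead stores in a block: any definition that is neither used
 * later in the block nor live on exit (b->out_use) is turned into a NOP.
 */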
1452 static void
1453 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1454 {
1455 register struct slist *s;
1456 register int atom;
1457 struct stmt *last[N_ATOMS];
1458
1459 memset((char *)last, 0, sizeof last);
1460
1461 for (s = b->stmts; s != 0; s = s->next)
1462 deadstmt(opt_state, &s->s, last);
1463 deadstmt(opt_state, &b->s, last);
1464
1465 for (atom = 0; atom < N_ATOMS; ++atom)
1466 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1467 last[atom]->code = NOP;
1468 /*
1469 * The store was removed as it's dead,
1470 * so the value stored into now has
1471 * an unknown value.
1472 */
1473 vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
1474 /*
1475 * XXX - optimizer loop detection.
1476 */
1477 opt_state->non_branch_movement_performed = 1;
1478 opt_state->done = 0;
1479 }
1480 }
1481
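/*
 * Optimize a single block: inherit value numbers from its predecessors,
 * run opt_stmt() over its statement list, and either discard the whole
 * list (when nothing it computes is used) or apply the peephole and
 * dead-store optimizations.
 */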
1482 static void
1483 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1484 {
1485 struct slist *s;
1486 struct edge *p;
1487 int i;
1488 bpf_u_int32 aval, xval;
1489
1490 #if 0
1491 for (s = b->stmts; s && s->next; s = s->next)
1492 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1493 do_stmts = 0;
1494 break;
1495 }
1496 #endif
1497
1498 /*
1499 * Initialize the atom values.
1500 */
1501 p = b->in_edges;
1502 if (p == 0) {
1503 /*
1504 * We have no predecessors, so everything is undefined
1505 * upon entry to this block.
1506 */
1507 memset((char *)b->val, 0, sizeof(b->val));
1508 } else {
1509 /*
1510 * Inherit values from our predecessors.
1511 *
1512 * First, get the values from the predecessor along the
1513 * first edge leading to this node.
1514 */
1515 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1516 /*
1517 * Now look at all the other nodes leading to this node.
1518 * If, for the predecessor along that edge, a register
1519 * has a different value from the one we have (i.e.,
1520 * control paths are merging, and the merging paths
1521 * assign different values to that register), give the
1522 * register the undefined value of 0.
1523 */
1524 while ((p = p->next) != NULL) {
1525 for (i = 0; i < N_ATOMS; ++i)
1526 if (b->val[i] != p->pred->val[i])
1527 b->val[i] = 0;
1528 }
1529 }
1530 aval = b->val[A_ATOM];
1531 xval = b->val[X_ATOM];
1532 for (s = b->stmts; s; s = s->next)
1533 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1534
1535 /*
1536 * This is a special case: if we don't use anything from this
1537 * block, and we load the accumulator or index register with a
1538 * value that is already there, or if this block is a return,
1539 * eliminate all the statements.
1540 *
1541 * XXX - what if it does a store? Presumably that falls under
1542 * the heading of "if we don't use anything from this block",
1543 * i.e., if we use any memory location set to a different
1544 * value by this block, then we use something from this block.
1545 *
1546 * XXX - why does it matter whether we use anything from this
1547 * block? If the accumulator or index register doesn't change
1548 * its value, isn't that OK even if we use that value?
1549 *
1550 * XXX - if we load the accumulator with a different value,
1551 * and the block ends with a conditional branch, we obviously
1552 * can't eliminate it, as the branch depends on that value.
1553 * For the index register, the conditional branch only depends
1554 * on the index register value if the test is against the index
1555 * register value rather than a constant; if nothing uses the
1556 * value we put into the index register, and we're not testing
1557 * against the index register's value, and there aren't any
1558 * other problems that would keep us from eliminating this
1559 * block, can we eliminate it?
1560 */
1561 if (do_stmts &&
1562 ((b->out_use == 0 &&
1563 aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1564 xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1565 BPF_CLASS(b->s.code) == BPF_RET)) {
1566 if (b->stmts != 0) {
1567 b->stmts = 0;
1568 /*
1569 * XXX - optimizer loop detection.
1570 */
1571 opt_state->non_branch_movement_performed = 1;
1572 opt_state->done = 0;
1573 }
1574 } else {
1575 opt_peep(opt_state, b);
1576 opt_deadstores(opt_state, b);
1577 }
1578 /*
1579 * Set up values for branch optimizer.
1580 */
1581 if (BPF_SRC(b->s.code) == BPF_K)
1582 b->oval = K(b->s.k);
1583 else
1584 b->oval = b->val[X_ATOM];
1585 b->et.code = b->s.code;
1586 b->ef.code = -b->s.code;
1587 }
1588
1589 /*
1590 * Return true if any register that is used on exit from 'succ', has
1591 * an exit value that is different from the corresponding exit value
1592 * from 'b'.
1593 */
1594 static int
1595 use_conflict(struct block *b, struct block *succ)
1596 {
1597 int atom;
1598 atomset use = succ->out_use;
1599
1600 if (use == 0)
1601 return 0;
1602
1603 for (atom = 0; atom < N_ATOMS; ++atom)
1604 if (ATOMELEM(use, atom))
1605 if (b->val[atom] != succ->val[atom])
1606 return 1;
1607 return 0;
1608 }
1609
1610 /*
1611 * Given a block that is the successor of an edge, and an edge that
1612 * dominates that edge, return either a pointer to a child of that
1613 * block (a block to which that block jumps) if that block is a
1614 * candidate to replace the successor of the latter edge or NULL
1615 * if neither of the children of the first block are candidates.
1616 */
1617 static struct block *
1618 fold_edge(struct block *child, struct edge *ep)
1619 {
1620 int sense;
1621 bpf_u_int32 aval0, aval1, oval0, oval1;
1622 int code = ep->code;
1623
1624 if (code < 0) {
1625 /*
1626 * This edge is a "branch if false" edge.
1627 */
1628 code = -code;
1629 sense = 0;
1630 } else {
1631 /*
1632 * This edge is a "branch if true" edge.
1633 */
1634 sense = 1;
1635 }
1636
1637 /*
1638 * If the opcode for the branch at the end of the block we
1639 * were handed isn't the same as the opcode for the branch
1640 * to which the edge we were handed corresponds, the tests
1641 * for those branches aren't testing the same conditions,
1642 * so the blocks to which the first block branches aren't
1643 * candidates to replace the successor of the edge.
1644 */
1645 if (child->s.code != code)
1646 return 0;
1647
1648 aval0 = child->val[A_ATOM];
1649 oval0 = child->oval;
1650 aval1 = ep->pred->val[A_ATOM];
1651 oval1 = ep->pred->oval;
1652
1653 /*
1654 * If the A register value on exit from the successor block
1655 * isn't the same as the A register value on exit from the
1656 * predecessor of the edge, the blocks to which the first
1657 * block branches aren't candidates to replace the successor
1658 * of the edge.
1659 */
1660 if (aval0 != aval1)
1661 return 0;
1662
1663 if (oval0 == oval1)
1664 /*
1665 * The operands of the branch instructions are
1666 * identical, so the branches are testing the
1667 * same condition, and the result is true if a true
1668 * branch was taken to get here, otherwise false.
1669 */
1670 return sense ? JT(child) : JF(child);
1671
1672 if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1673 /*
1674 * At this point, we only know the comparison if we
1675 * came down the true branch, and it was an equality
1676 * comparison with a constant.
1677 *
1678 * I.e., if we came down the true branch, and the branch
1679 * was an equality comparison with a constant, we know the
1680 * accumulator contains that constant. If we came down
1681 * the false branch, or the comparison wasn't with a
1682 * constant, we don't know what was in the accumulator.
1683 *
1684 * We rely on the fact that distinct constants have distinct
1685 * value numbers.
1686 */
1687 return JF(child);
1688
1689 return 0;
1690 }
1691
1692 /*
1693 * If we can make this edge go directly to a child of the edge's current
1694 * successor, do so.
1695 */
1696 static void
1697 opt_j(opt_state_t *opt_state, struct edge *ep)
1698 {
1699 register u_int i, k;
1700 register struct block *target;
1701
1702 /*
1703 * Does this edge go to a block where, if the test
1704 * at the end of it succeeds, it goes to a block
1705 * that's a leaf node of the DAG, i.e. a return
1706 * statement?
1707 * If so, there's nothing to optimize.
1708 */
1709 if (JT(ep->succ) == 0)
1710 return;
1711
1712 /*
1713 * Does this edge go to a block that goes, in turn, to
1714 * the same block regardless of whether the test at the
1715 * end succeeds or fails?
1716 */
1717 if (JT(ep->succ) == JF(ep->succ)) {
1718 /*
1719 * Common branch targets can be eliminated, provided
1720 * there is no data dependency.
1721 *
1722 * Check whether any register used on exit from the
1723 * block to which the successor of this edge goes
1724 * has a value at that point that's different from
1725 * the value it has on exit from the predecessor of
1726 * this edge. If not, the predecessor of this edge
1727 * can just go to the block to which the successor
1728 * of this edge goes, bypassing the successor of this
1729 * edge, as the successor of this edge isn't doing
1730 * any calculations whose results are different
1731 * from what the blocks before it did and isn't
1732 * doing any tests the results of which matter.
1733 */
1734 if (!use_conflict(ep->pred, JT(ep->succ))) {
1735 /*
1736 * No, there isn't.
1737 * Make this edge go to the block to
1738 * which the successor of that edge
1739 * goes.
1740 *
1741 * XXX - optimizer loop detection.
1742 */
1743 opt_state->non_branch_movement_performed = 1;
1744 opt_state->done = 0;
1745 ep->succ = JT(ep->succ);
1746 }
1747 }
1748 /*
1749 * For each edge dominator that matches the successor of this
1750 * edge, promote the edge successor to its grandchild.
1751 *
1752 * XXX We violate the set abstraction here in favor of a reasonably
1753 * efficient loop.
1754 */
1755 top:
1756 for (i = 0; i < opt_state->edgewords; ++i) {
1757 /* i'th word in the bitset of dominators */
1758 register bpf_u_int32 x = ep->edom[i];
1759
1760 while (x != 0) {
1761 /* Find the next dominator in that word and mark it as found */
1762 k = lowest_set_bit(x);
1763 x &=~ ((bpf_u_int32)1 << k);
1764 k += i * BITS_PER_WORD;
1765
1766 target = fold_edge(ep->succ, opt_state->edges[k]);
1767 /*
1768 * We have a candidate to replace the successor
1769 * of ep.
1770 *
1771 * Check that there is no data dependency between
1772 * nodes that will be violated if we move the edge;
1773 * i.e., if any register used on exit from the
1774 * candidate has a value at that point different
1775 * from the value it has when we exit the
1776 * predecessor of that edge, there's a data
1777 * dependency that will be violated.
1778 */
1779 if (target != 0 && !use_conflict(ep->pred, target)) {
1780 /*
1781 * It's safe to replace the successor of
1782 * ep; do so, and note that we've made
1783 * at least one change.
1784 *
1785 * XXX - this is one of the operations that
1786 * happens when the optimizer gets into
1787 * one of those infinite loops.
1788 */
1789 opt_state->done = 0;
1790 ep->succ = target;
1791 if (JT(target) != 0)
1792 /*
1793 * Start over unless we hit a leaf.
1794 */
1795 goto top;
1796 return;
1797 }
1798 }
1799 }
1800 }
1801
1802 /*
1803 * XXX - is this, and and_pullup(), what's described in section 6.1.2
1804 * "Predicate Assertion Propagation" in the BPF+ paper?
1805 *
1806 * Note that this looks at block dominators, not edge dominators.
1807 * Don't think so.
1808 *
1809 * "A or B" compiles into
1810 *
1811 * A
1812 * t / \ f
1813 * / B
1814 * / t / \ f
1815 * \ /
1816 * \ /
1817 * X
1818 *
1819 *
1820 */
1821 static void
1822 or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1823 {
1824 bpf_u_int32 val;
1825 int at_top;
1826 struct block *pull;
1827 struct block **diffp, **samep;
1828 struct edge *ep;
1829
1830 ep = b->in_edges;
1831 if (ep == 0)
1832 return;
1833
1834 /*
1835 * Make sure each predecessor loads the same value.
1836 * XXX why?
1837 */
1838 val = ep->pred->val[A_ATOM];
1839 for (ep = ep->next; ep != 0; ep = ep->next)
1840 if (val != ep->pred->val[A_ATOM])
1841 return;
1842
1843 /*
1844 * For the first edge in the list of edges coming into this block,
1845 * see whether the predecessor of that edge comes here via a true
1846 * branch or a false branch.
1847 */
1848 if (JT(b->in_edges->pred) == b)
1849 diffp = &JT(b->in_edges->pred); /* jt */
1850 else
1851 diffp = &JF(b->in_edges->pred); /* jf */
1852
1853 /*
1854 * diffp is a pointer to a pointer to the block.
1855 *
1856 * Go down the false chain looking as far as you can,
1857 * making sure that each jump-compare is doing the
1858 * same as the original block.
1859 *
1860 * If you reach the bottom before you reach a
1861 * different jump-compare, just exit. There's nothing
1862 * to do here. XXX - no, this version is checking for
1863 * the value leaving the block; that's from the BPF+
1864 * pullup routine.
1865 */
1866 at_top = 1;
1867 for (;;) {
1868 /*
1869 * Done if that's not going anywhere XXX
1870 */
1871 if (*diffp == 0)
1872 return;
1873
1874 /*
1875 * Done if that predecessor blah blah blah isn't
1876 * going the same place we're going XXX
1877 *
1878 * Does the true edge of this block point to the same
1879 * location as the true edge of b?
1880 */
1881 if (JT(*diffp) != JT(b))
1882 return;
1883
1884 /*
1885 * Done if this node isn't a dominator of that
1886 * node blah blah blah XXX
1887 *
1888 * Does b dominate diffp?
1889 */
1890 if (!SET_MEMBER((*diffp)->dom, b->id))
1891 return;
1892
1893 /*
1894 * Break out of the loop if that node's value of A
1895 * isn't the value of A above XXX
1896 */
1897 if ((*diffp)->val[A_ATOM] != val)
1898 break;
1899
1900 /*
1901 * Get the JF for that node XXX
1902 * Go down the false path.
1903 */
1904 diffp = &JF(*diffp);
1905 at_top = 0;
1906 }
1907
1908 /*
1909 * Now that we've found a different jump-compare in a chain
1910 * below b, search further down until we find another
1911 * jump-compare that looks at the original value. This
1912 * jump-compare should get pulled up. XXX again we're
1913 * comparing values not jump-compares.
1914 */
1915 samep = &JF(*diffp);
1916 for (;;) {
1917 /*
1918 * Done if that's not going anywhere XXX
1919 */
1920 if (*samep == 0)
1921 return;
1922
1923 /*
1924 * Done if that predecessor blah blah blah isn't
1925 * going the same place we're going XXX
1926 */
1927 if (JT(*samep) != JT(b))
1928 return;
1929
1930 /*
1931 * Done if this node isn't a dominator of that
1932 * node blah blah blah XXX
1933 *
1934 * Does b dominate samep?
1935 */
1936 if (!SET_MEMBER((*samep)->dom, b->id))
1937 return;
1938
1939 /*
1940 * Break out of the loop if that node's value of A
1941 * is the value of A above XXX
1942 */
1943 if ((*samep)->val[A_ATOM] == val)
1944 break;
1945
1946 /* XXX Need to check that there are no data dependencies
1947 between dp0 and dp1. Currently, the code generator
1948 will not produce such dependencies. */
1949 samep = &JF(*samep);
1950 }
1951 #ifdef notdef
1952 /* XXX This doesn't cover everything. */
1953 for (i = 0; i < N_ATOMS; ++i)
1954 if ((*samep)->val[i] != pred->val[i])
1955 return;
1956 #endif
1957 /* Pull up the node. */
1958 pull = *samep;
1959 *samep = JF(pull);
1960 JF(pull) = *diffp;
1961
1962 /*
1963 * At the top of the chain, each predecessor needs to point at the
1964 * pulled up node. Inside the chain, there is only one predecessor
1965 * to worry about.
1966 */
1967 if (at_top) {
1968 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1969 if (JT(ep->pred) == b)
1970 JT(ep->pred) = pull;
1971 else
1972 JF(ep->pred) = pull;
1973 }
1974 }
1975 else
1976 *diffp = pull;
1977
1978 /*
1979 * XXX - this is one of the operations that happens when the
1980 * optimizer gets into one of those infinite loops.
1981 */
1982 opt_state->done = 0;
1983
1984 /*
1985 * Recompute dominator sets as control flow graph has changed.
1986 */
1987 find_dom(opt_state, root);
1988 }
1989
1990 static void
1991 and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1992 {
1993 bpf_u_int32 val;
1994 int at_top;
1995 struct block *pull;
1996 struct block **diffp, **samep;
1997 struct edge *ep;
1998
1999 ep = b->in_edges;
2000 if (ep == 0)
2001 return;
2002
2003 /*
2004 * Make sure each predecessor loads the same value.
2005 */
2006 val = ep->pred->val[A_ATOM];
2007 for (ep = ep->next; ep != 0; ep = ep->next)
2008 if (val != ep->pred->val[A_ATOM])
2009 return;
2010
2011 if (JT(b->in_edges->pred) == b)
2012 diffp = &JT(b->in_edges->pred);
2013 else
2014 diffp = &JF(b->in_edges->pred);
2015
2016 at_top = 1;
2017 for (;;) {
2018 if (*diffp == 0)
2019 return;
2020
2021 if (JF(*diffp) != JF(b))
2022 return;
2023
2024 if (!SET_MEMBER((*diffp)->dom, b->id))
2025 return;
2026
2027 if ((*diffp)->val[A_ATOM] != val)
2028 break;
2029
2030 diffp = &JT(*diffp);
2031 at_top = 0;
2032 }
2033 samep = &JT(*diffp);
2034 for (;;) {
2035 if (*samep == 0)
2036 return;
2037
2038 if (JF(*samep) != JF(b))
2039 return;
2040
2041 if (!SET_MEMBER((*samep)->dom, b->id))
2042 return;
2043
2044 if ((*samep)->val[A_ATOM] == val)
2045 break;
2046
2047 /* XXX Need to check that there are no data dependencies
2048 between diffp and samep. Currently, the code generator
2049 will not produce such dependencies. */
2050 samep = &JT(*samep);
2051 }
2052 #ifdef notdef
2053 /* XXX This doesn't cover everything. */
2054 for (i = 0; i < N_ATOMS; ++i)
2055 if ((*samep)->val[i] != pred->val[i])
2056 return;
2057 #endif
2058 /* Pull up the node. */
2059 pull = *samep;
2060 *samep = JT(pull);
2061 JT(pull) = *diffp;
2062
2063 /*
2064 * At the top of the chain, each predecessor needs to point at the
2065 * pulled up node. Inside the chain, there is only one predecessor
2066 * to worry about.
2067 */
2068 if (at_top) {
2069 for (ep = b->in_edges; ep != 0; ep = ep->next) {
2070 if (JT(ep->pred) == b)
2071 JT(ep->pred) = pull;
2072 else
2073 JF(ep->pred) = pull;
2074 }
2075 }
2076 else
2077 *diffp = pull;
2078
2079 /*
2080 * XXX - this is one of the operations that can send the optimizer
2081 * into an endless cycle of branch movement (see opt_loop()).
2082 */
2083 opt_state->done = 0;
2084
2085 /*
2086 * Recompute dominator sets as control flow graph has changed.
2087 */
2088 find_dom(opt_state, root);
2089 }
2090
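/*
 * Apply opt_blk() to every block, then (unless do_stmts is set) try to
 * move branches with opt_j() and to pull up equivalent jump-compares
 * with or_pullup() and and_pullup().
 */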
2091 static void
2092 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2093 {
2094 int i, maxlevel;
2095 struct block *p;
2096
2097 init_val(opt_state);
2098 maxlevel = ic->root->level;
2099
2100 find_inedges(opt_state, ic->root);
2101 for (i = maxlevel; i >= 0; --i)
2102 for (p = opt_state->levels[i]; p; p = p->link)
2103 opt_blk(opt_state, p, do_stmts);
2104
2105 if (do_stmts)
2106 /*
2107 * No point trying to move branches; it can't possibly
2108 * make a difference at this point.
2109 *
2110 * XXX - this might be after we detect a loop where
2111 * we were just looping infinitely moving branches
2112 * in such a fashion that we went through two or more
2113 * versions of the machine code, eventually returning
2114 * to the first version. (We're really not doing
2115 * full loop detection; we just count consecutive
2116 * passes in which nothing but branch movement is
2117 * done, and give up after 100 of them.)
2118 */
2119 return;
2120
2121 /*
2122 * Is this what the BPF+ paper describes in sections 6.1.1,
2123 * 6.1.2, and 6.1.3?
2124 */
2125 for (i = 1; i <= maxlevel; ++i) {
2126 for (p = opt_state->levels[i]; p; p = p->link) {
2127 opt_j(opt_state, &p->et);
2128 opt_j(opt_state, &p->ef);
2129 }
2130 }
2131
2132 find_inedges(opt_state, ic->root);
2133 for (i = 1; i <= maxlevel; ++i) {
2134 for (p = opt_state->levels[i]; p; p = p->link) {
2135 or_pullup(opt_state, p, ic->root);
2136 and_pullup(opt_state, p, ic->root);
2137 }
2138 }
2139 }
2140
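/*
 * Add the edge 'parent' to the front of the predecessor (in-edge) list
 * of 'child'.
 */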
2141 static inline void
2142 link_inedge(struct edge *parent, struct block *child)
2143 {
2144 parent->next = child->in_edges;
2145 child->in_edges = parent;
2146 }
2147
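/*
 * Rebuild every block's predecessor (in-edge) list from the current
 * JT/JF successor pointers.
 */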
2148 static void
2149 find_inedges(opt_state_t *opt_state, struct block *root)
2150 {
2151 u_int i;
2152 int level;
2153 struct block *b;
2154
2155 for (i = 0; i < opt_state->n_blocks; ++i)
2156 opt_state->blocks[i]->in_edges = 0;
2157
2158 /*
2159 * Traverse the graph, adding each edge to the predecessor
2160 * list of its successors. Skip the leaves (i.e. level 0).
2161 */
2162 for (level = root->level; level > 0; --level) {
2163 for (b = opt_state->levels[level]; b != 0; b = b->link) {
2164 link_inedge(&b->et, JT(b));
2165 link_inedge(&b->ef, JF(b));
2166 }
2167 }
2168 }
2169
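/*
 * While the root of the DAG is a jump whose true and false edges lead
 * to the same block, make that block the new root, carrying the old
 * root's statements along; if the final root is a return, discard the
 * statements, as they can have no effect.
 */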
2170 static void
2171 opt_root(struct block **b)
2172 {
2173 struct slist *tmp, *s;
2174
2175 s = (*b)->stmts;
2176 (*b)->stmts = 0;
2177 while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2178 *b = JT(*b);
2179
2180 tmp = (*b)->stmts;
2181 if (tmp != 0)
2182 sappend(s, tmp);
2183 (*b)->stmts = s;
2184
2185 /*
2186 * If the root node is a return, then there is no
2187 * point executing any statements (since the bpf machine
2188 * has no side effects).
2189 */
2190 if (BPF_CLASS((*b)->s.code) == BPF_RET)
2191 (*b)->stmts = 0;
2192 }
2193
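/*
 * Repeatedly recompute the analyses (levels, dominators, closures,
 * use-def, edge dominators) and run opt_blks() until a pass makes no
 * changes, or until the loop-detection heuristic concludes that only
 * branch movement is happening and gives up.
 */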
2194 static void
2195 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2196 {
2197
2198 #ifdef BDEBUG
2199 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2200 printf("opt_loop(root, %d) begin\n", do_stmts);
2201 opt_dump(opt_state, ic);
2202 }
2203 #endif
2204
2205 /*
2206 * XXX - optimizer loop detection.
2207 */
2208 int loop_count = 0;
2209 for (;;) {
2210 opt_state->done = 1;
2211 /*
2212 * XXX - optimizer loop detection.
2213 */
2214 opt_state->non_branch_movement_performed = 0;
2215 find_levels(opt_state, ic);
2216 find_dom(opt_state, ic->root);
2217 find_closure(opt_state, ic->root);
2218 find_ud(opt_state, ic->root);
2219 find_edom(opt_state, ic->root);
2220 opt_blks(opt_state, ic, do_stmts);
2221 #ifdef BDEBUG
2222 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2223 printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
2224 opt_dump(opt_state, ic);
2225 }
2226 #endif
2227
2228 /*
2229 * Was anything done in this optimizer pass?
2230 */
2231 if (opt_state->done) {
2232 /*
2233 * No, so we've reached a fixed point.
2234 * We're done.
2235 */
2236 break;
2237 }
2238
2239 /*
2240 * XXX - was anything done other than branch movement
2241 * in this pass?
2242 */
2243 if (opt_state->non_branch_movement_performed) {
2244 /*
2245 * Yes. Clear any loop-detection counter;
2246 * we're making some form of progress (assuming
2247 * we can't get into a cycle doing *other*
2248 * optimizations...).
2249 */
2250 loop_count = 0;
2251 } else {
2252 /*
2253 * No - increment the counter, and quit if
2254 * it reaches 100.
2255 */
2256 loop_count++;
2257 if (loop_count >= 100) {
2258 /*
2259 * We've done nothing but branch movement
2260 * for 100 passes; we're probably
2261 * in a cycle and will never reach a
2262 * fixed point.
2263 *
2264 * XXX - yes, we really need a non-
2265 * heuristic way of detecting a cycle.
2266 */
2267 opt_state->done = 1;
2268 break;
2269 }
2270 }
2271 }
2272 }
2273
2274 /*
2275 * Optimize the filter code in its dag representation.
2276 * Return 0 on success, -1 on error.
2277 */
2278 int
2279 bpf_optimize(struct icode *ic, char *errbuf)
2280 {
2281 opt_state_t opt_state;
2282
2283 memset(&opt_state, 0, sizeof(opt_state));
2284 opt_state.errbuf = errbuf;
2285 opt_state.non_branch_movement_performed = 0;
2286 if (setjmp(opt_state.top_ctx)) {
2287 opt_cleanup(&opt_state);
2288 return -1;
2289 }
2290 opt_init(&opt_state, ic);
2291 opt_loop(&opt_state, ic, 0);
2292 opt_loop(&opt_state, ic, 1);
2293 intern_blocks(&opt_state, ic);
2294 #ifdef BDEBUG
2295 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2296 printf("after intern_blocks()\n");
2297 opt_dump(&opt_state, ic);
2298 }
2299 #endif
2300 opt_root(&ic->root);
2301 #ifdef BDEBUG
2302 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2303 printf("after opt_root()\n");
2304 opt_dump(&opt_state, ic);
2305 }
2306 #endif
2307 opt_cleanup(&opt_state);
2308 return 0;
2309 }
2310
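/*
 * Recursively mark 'p' and every node reachable from it; don't descend
 * past return blocks, as they have no successors.
 */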
2311 static void
2312 make_marks(struct icode *ic, struct block *p)
2313 {
2314 if (!isMarked(ic, p)) {
2315 Mark(ic, p);
2316 if (BPF_CLASS(p->s.code) != BPF_RET) {
2317 make_marks(ic, JT(p));
2318 make_marks(ic, JF(p));
2319 }
2320 }
2321 }
2322
2323 /*
2324 * Mark code array such that isMarked(ic->cur_mark, i) is true
2325 * only for nodes that are alive.
2326 */
2327 static void
2328 mark_code(struct icode *ic)
2329 {
2330 ic->cur_mark += 1;
2331 make_marks(ic, ic->root);
2332 }
2333
2334 /*
2335 * True iff the two stmt lists load the same value from the packet into
2336 * the accumulator.
2337 */
2338 static int
2339 eq_slist(struct slist *x, struct slist *y)
2340 {
2341 for (;;) {
2342 while (x && x->s.code == NOP)
2343 x = x->next;
2344 while (y && y->s.code == NOP)
2345 y = y->next;
2346 if (x == 0)
2347 return y == 0;
2348 if (y == 0)
2349 return x == 0;
2350 if (x->s.code != y->s.code || x->s.k != y->s.k)
2351 return 0;
2352 x = x->next;
2353 y = y->next;
2354 }
2355 }
2356
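/*
 * True iff the two blocks have the same opcode, operand and successors,
 * and (ignoring NOPs) the same statement lists.
 */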
2357 static inline int
2358 eq_blk(struct block *b0, struct block *b1)
2359 {
2360 if (b0->s.code == b1->s.code &&
2361 b0->s.k == b1->s.k &&
2362 b0->et.succ == b1->et.succ &&
2363 b0->ef.succ == b1->ef.succ)
2364 return eq_slist(b0->stmts, b1->stmts);
2365 return 0;
2366 }
2367
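/*
 * Merge equivalent blocks: whenever two live blocks are equal, redirect
 * jumps to one of them so that they go to the other, repeating until no
 * more redirections are made.
 */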
2368 static void
2369 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2370 {
2371 struct block *p;
2372 u_int i, j;
2373 int done1; /* don't shadow global */
2374 top:
2375 done1 = 1;
2376 for (i = 0; i < opt_state->n_blocks; ++i)
2377 opt_state->blocks[i]->link = 0;
2378
2379 mark_code(ic);
2380
2381 for (i = opt_state->n_blocks - 1; i != 0; ) {
2382 --i;
2383 if (!isMarked(ic, opt_state->blocks[i]))
2384 continue;
2385 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2386 if (!isMarked(ic, opt_state->blocks[j]))
2387 continue;
2388 if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2389 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2390 opt_state->blocks[j]->link : opt_state->blocks[j];
2391 break;
2392 }
2393 }
2394 }
2395 for (i = 0; i < opt_state->n_blocks; ++i) {
2396 p = opt_state->blocks[i];
2397 if (JT(p) == 0)
2398 continue;
2399 if (JT(p)->link) {
2400 done1 = 0;
2401 JT(p) = JT(p)->link;
2402 }
2403 if (JF(p)->link) {
2404 done1 = 0;
2405 JF(p) = JF(p)->link;
2406 }
2407 }
2408 if (!done1)
2409 goto top;
2410 }
2411
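/*
 * Free the memory allocated by opt_init().
 */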
2412 static void
2413 opt_cleanup(opt_state_t *opt_state)
2414 {
2415 free((void *)opt_state->vnode_base);
2416 free((void *)opt_state->vmap);
2417 free((void *)opt_state->edges);
2418 free((void *)opt_state->space);
2419 free((void *)opt_state->levels);
2420 free((void *)opt_state->blocks);
2421 }
2422
2423 /*
2424 * For optimizer errors.
2425 */
2426 static void PCAP_NORETURN
2427 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2428 {
2429 va_list ap;
2430
2431 if (opt_state->errbuf != NULL) {
2432 va_start(ap, fmt);
2433 (void)vsnprintf(opt_state->errbuf,
2434 PCAP_ERRBUF_SIZE, fmt, ap);
2435 va_end(ap);
2436 }
2437 longjmp(opt_state->top_ctx, 1);
2438 /* NOTREACHED */
2439 #ifdef _AIX
2440 PCAP_UNREACHABLE
2441 #endif /* _AIX */
2442 }
2443
2444 /*
2445 * Return the number of stmts in 's'.
2446 */
2447 static u_int
2448 slength(struct slist *s)
2449 {
2450 u_int n = 0;
2451
2452 for (; s; s = s->next)
2453 if (s->s.code != NOP)
2454 ++n;
2455 return n;
2456 }
2457
2458 /*
2459 * Return the number of nodes reachable by 'p'.
2460 * All nodes should be initially unmarked.
2461 */
2462 static int
2463 count_blocks(struct icode *ic, struct block *p)
2464 {
2465 if (p == 0 || isMarked(ic, p))
2466 return 0;
2467 Mark(ic, p);
2468 return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2469 }
2470
2471 /*
2472 * Do a depth first search on the flow graph, numbering the
2473 * basic blocks, and entering them into the 'blocks' array.
2474 */
2475 static void
2476 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2477 {
2478 u_int n;
2479
2480 if (p == 0 || isMarked(ic, p))
2481 return;
2482
2483 Mark(ic, p);
2484 n = opt_state->n_blocks++;
2485 if (opt_state->n_blocks == 0) {
2486 /*
2487 * Overflow.
2488 */
2489 opt_error(opt_state, "filter is too complex to optimize");
2490 }
2491 p->id = n;
2492 opt_state->blocks[n] = p;
2493
2494 number_blks_r(opt_state, ic, JT(p));
2495 number_blks_r(opt_state, ic, JF(p));
2496 }
2497
2498 /*
2499 * Return the number of stmts in the flowgraph reachable by 'p'.
2500 * The nodes should be unmarked before calling.
2501 *
2502 * Note that "stmts" means "instructions", and that this includes
2503 *
2504 * side-effect statements in 'p' (slength(p->stmts));
2505 *
2506 * statements in the true branch from 'p' (count_stmts(JT(p)));
2507 *
2508 * statements in the false branch from 'p' (count_stmts(JF(p)));
2509 *
2510 * the conditional jump itself (1);
2511 *
2512 * an extra long jump if the true branch requires it (p->longjt);
2513 *
2514 * an extra long jump if the false branch requires it (p->longjf).
2515 */
2516 static u_int
2517 count_stmts(struct icode *ic, struct block *p)
2518 {
2519 u_int n;
2520
2521 if (p == 0 || isMarked(ic, p))
2522 return 0;
2523 Mark(ic, p);
2524 n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2525 return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2526 }
2527
2528 /*
2529 * Allocate memory. All allocation is done before optimization
2530 * is begun. A linear bound on the size of all data structures is computed
2531 * from the total number of blocks and/or statements.
2532 */
2533 static void
2534 opt_init(opt_state_t *opt_state, struct icode *ic)
2535 {
2536 bpf_u_int32 *p;
2537 int i, n, max_stmts;
2538 u_int product;
2539 size_t block_memsize, edge_memsize;
2540
2541 /*
2542 * First, count the blocks, so we can malloc an array to map
2543 * block number to block. Then, put the blocks into the array.
2544 */
2545 unMarkAll(ic);
2546 n = count_blocks(ic, ic->root);
2547 opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2548 if (opt_state->blocks == NULL)
2549 opt_error(opt_state, "malloc");
2550 unMarkAll(ic);
2551 opt_state->n_blocks = 0;
2552 number_blks_r(opt_state, ic, ic->root);
2553
2554 /*
2555 * This "should not happen".
2556 */
2557 if (opt_state->n_blocks == 0)
2558 opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
2559
2560 opt_state->n_edges = 2 * opt_state->n_blocks;
2561 if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2562 /*
2563 * Overflow.
2564 */
2565 opt_error(opt_state, "filter is too complex to optimize");
2566 }
2567 opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2568 if (opt_state->edges == NULL) {
2569 opt_error(opt_state, "malloc");
2570 }
2571
2572 /*
2573 * The number of levels is bounded by the number of nodes.
2574 */
2575 opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2576 if (opt_state->levels == NULL) {
2577 opt_error(opt_state, "malloc");
2578 }
2579
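/*
 * Number of set words needed for one bit per edge and one bit per
 * block; the "+ 1" rounds up so there are always enough words.
 */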
2580 opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2581 opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2582
2583 /*
2584 * Make sure opt_state->n_blocks * opt_state->nodewords fits
2585 * in a u_int; we use it as a u_int number-of-iterations
2586 * value.
2587 */
2588 product = opt_state->n_blocks * opt_state->nodewords;
2589 if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2590 /*
2591 * XXX - just punt and don't try to optimize?
2592 * In practice, this is unlikely to happen with
2593 * a normal filter.
2594 */
2595 opt_error(opt_state, "filter is too complex to optimize");
2596 }
2597
2598 /*
2599 * Make sure the total memory required for that doesn't
2600 * overflow.
2601 */
2602 block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2603 if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2604 opt_error(opt_state, "filter is too complex to optimize");
2605 }
2606
2607 /*
2608 * Make sure opt_state->n_edges * opt_state->edgewords fits
2609 * in a u_int; we use it as a u_int number-of-iterations
2610 * value.
2611 */
2612 product = opt_state->n_edges * opt_state->edgewords;
2613 if ((product / opt_state->n_edges) != opt_state->edgewords) {
2614 opt_error(opt_state, "filter is too complex to optimize");
2615 }
2616
2617 /*
2618 * Make sure the total memory required for that doesn't
2619 * overflow.
2620 */
2621 edge_memsize = (size_t)product * sizeof(*opt_state->space);
2622 if (edge_memsize / product != sizeof(*opt_state->space)) {
2623 opt_error(opt_state, "filter is too complex to optimize");
2624 }
2625
2626 /*
2627 * Make sure the total memory required for both of them doesn't
2628 * overflow.
2629 */
2630 if (block_memsize > SIZE_MAX - edge_memsize) {
2631 opt_error(opt_state, "filter is too complex to optimize");
2632 }
2633
2634 /* XXX */
2635 opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2636 if (opt_state->space == NULL) {
2637 opt_error(opt_state, "malloc");
2638 }
2639 p = opt_state->space;
2640 opt_state->all_dom_sets = p;
2641 for (i = 0; i < n; ++i) {
2642 opt_state->blocks[i]->dom = p;
2643 p += opt_state->nodewords;
2644 }
2645 opt_state->all_closure_sets = p;
2646 for (i = 0; i < n; ++i) {
2647 opt_state->blocks[i]->closure = p;
2648 p += opt_state->nodewords;
2649 }
2650 opt_state->all_edge_sets = p;
2651 for (i = 0; i < n; ++i) {
2652 register struct block *b = opt_state->blocks[i];
2653
2654 b->et.edom = p;
2655 p += opt_state->edgewords;
2656 b->ef.edom = p;
2657 p += opt_state->edgewords;
2658 b->et.id = i;
2659 opt_state->edges[i] = &b->et;
2660 b->ef.id = opt_state->n_blocks + i;
2661 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2662 b->et.pred = b;
2663 b->ef.pred = b;
2664 }
2665 max_stmts = 0;
2666 for (i = 0; i < n; ++i)
2667 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2668 /*
2669 * We allocate at most 3 value numbers per statement,
2670 * so this is an upper bound on the number of valnodes
2671 * we'll need.
2672 */
2673 opt_state->maxval = 3 * max_stmts;
2674 opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2675 if (opt_state->vmap == NULL) {
2676 opt_error(opt_state, "malloc");
2677 }
2678 opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2679 if (opt_state->vnode_base == NULL) {
2680 opt_error(opt_state, "malloc");
2681 }
2682 }
2683
2684 /*
2685 * This is only used when supporting optimizer debugging. It is
2686 * global state, so do *not* do more than one compile in parallel
2687 * and expect it to provide meaningful information.
2688 */
2689 #ifdef BDEBUG
2690 int bids[NBIDS];
2691 #endif
2692
2693 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2694 PCAP_PRINTFLIKE(2, 3);
2695
2696 /*
2697 * Returns true if successful. Returns false if a branch has
2698 * an offset that is too large. If so, we have marked that
2699 * branch so that on a subsequent iteration, it will be treated
2700 * properly.
2701 */
2702 static int
2703 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2704 {
2705 struct bpf_insn *dst;
2706 struct slist *src;
2707 u_int slen;
2708 u_int off;
2709 struct slist **offset = NULL;
2710
2711 if (p == 0 || isMarked(ic, p))
2712 return (1);
2713 Mark(ic, p);
2714
2715 if (convert_code_r(conv_state, ic, JF(p)) == 0)
2716 return (0);
2717 if (convert_code_r(conv_state, ic, JT(p)) == 0)
2718 return (0);
2719
2720 slen = slength(p->stmts);
2721 dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2722 /* inflate length by any extra jumps */
2723
2724 p->offset = (int)(dst - conv_state->fstart);
2725
2726 /* generate offset[] for convenience */
2727 if (slen) {
2728 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2729 if (!offset) {
2730 conv_error(conv_state, "not enough core");
2731 /*NOTREACHED*/
2732 }
2733 }
2734 src = p->stmts;
2735 for (off = 0; off < slen && src; off++) {
2736 #if 0
2737 printf("off=%d src=%x\n", off, src);
2738 #endif
2739 offset[off] = src;
2740 src = src->next;
2741 }
2742
2743 off = 0;
2744 for (src = p->stmts; src; src = src->next) {
2745 if (src->s.code == NOP)
2746 continue;
2747 dst->code = (u_short)src->s.code;
2748 dst->k = src->s.k;
2749
2750 /* fill block-local relative jump */
2751 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2752 #if 0
2753 if (src->s.jt || src->s.jf) {
2754 free(offset);
2755 conv_error(conv_state, "illegal jmp destination");
2756 /*NOTREACHED*/
2757 }
2758 #endif
2759 goto filled;
2760 }
2761 if (off == slen - 2) /*???*/
2762 goto filled;
2763
2764 {
2765 u_int i;
2766 int jt, jf;
2767 const char ljerr[] = "%s for block-local relative jump: off=%d";
2768
2769 #if 0
2770 printf("code=%x off=%d %x %x\n", src->s.code,
2771 off, src->s.jt, src->s.jf);
2772 #endif
2773
2774 if (!src->s.jt || !src->s.jf) {
2775 free(offset);
2776 conv_error(conv_state, ljerr, "no jmp destination", off);
2777 /*NOTREACHED*/
2778 }
2779
2780 jt = jf = 0;
2781 for (i = 0; i < slen; i++) {
2782 if (offset[i] == src->s.jt) {
2783 if (jt) {
2784 free(offset);
2785 conv_error(conv_state, ljerr, "multiple matches", off);
2786 /*NOTREACHED*/
2787 }
2788
2789 if (i - off - 1 >= 256) {
2790 free(offset);
2791 conv_error(conv_state, ljerr, "out-of-range jump", off);
2792 /*NOTREACHED*/
2793 }
2794 dst->jt = (u_char)(i - off - 1);
2795 jt++;
2796 }
2797 if (offset[i] == src->s.jf) {
2798 if (jf) {
2799 free(offset);
2800 conv_error(conv_state, ljerr, "multiple matches", off);
2801 /*NOTREACHED*/
2802 }
2803 if (i - off - 1 >= 256) {
2804 free(offset);
2805 conv_error(conv_state, ljerr, "out-of-range jump", off);
2806 /*NOTREACHED*/
2807 }
2808 dst->jf = (u_char)(i - off - 1);
2809 jf++;
2810 }
2811 }
2812 if (!jt || !jf) {
2813 free(offset);
2814 conv_error(conv_state, ljerr, "no destination found", off);
2815 /*NOTREACHED*/
2816 }
2817 }
2818 filled:
2819 ++dst;
2820 ++off;
2821 }
2822 if (offset)
2823 free(offset);
2824
2825 #ifdef BDEBUG
2826 if (dst - conv_state->fstart < NBIDS)
2827 bids[dst - conv_state->fstart] = p->id + 1;
2828 #endif
2829 dst->code = (u_short)p->s.code;
2830 dst->k = p->s.k;
2831 if (JT(p)) {
2832 /* number of extra jumps inserted */
2833 u_char extrajmps = 0;
2834 off = JT(p)->offset - (p->offset + slen) - 1;
2835 if (off >= 256) {
2836 /* offset too large for branch, must add a jump */
2837 if (p->longjt == 0) {
2838 /* mark this instruction and retry */
2839 p->longjt++;
2840 return(0);
2841 }
2842 dst->jt = extrajmps;
2843 extrajmps++;
2844 dst[extrajmps].code = BPF_JMP|BPF_JA;
2845 dst[extrajmps].k = off - extrajmps;
2846 }
2847 else
2848 dst->jt = (u_char)off;
2849 off = JF(p)->offset - (p->offset + slen) - 1;
2850 if (off >= 256) {
2851 /* offset too large for branch, must add a jump */
2852 if (p->longjf == 0) {
2853 /* mark this instruction and retry */
2854 p->longjf++;
2855 return(0);
2856 }
2857 /* branch if F to following jump */
2858 /* if two jumps are inserted, F goes to second one */
2859 dst->jf = extrajmps;
2860 extrajmps++;
2861 dst[extrajmps].code = BPF_JMP|BPF_JA;
2862 dst[extrajmps].k = off - extrajmps;
2863 }
2864 else
2865 dst->jf = (u_char)off;
2866 }
2867 return (1);
2868 }
2869
2870
2871 /*
2872 * Convert flowgraph intermediate representation to the
2873 * BPF array representation. Set *lenp to the number of instructions.
2874 *
2875 * This routine does *NOT* leak the memory pointed to by fp. It *must
2876 * not* do free(fp) before returning fp; doing so would make no sense,
2877 * as the BPF array pointed to by the return value of icode_to_fcode()
2878 * must be valid - it's being returned for use in a bpf_program structure.
2879 *
2880 * If it appears that icode_to_fcode() is leaking, the problem is that
2881 * the program using pcap_compile() is failing to free the memory in
2882 * the BPF program when it's done - the leak is in the program, not in
2883 * the routine that happens to be allocating the memory. (By analogy, if
2884 * a program calls fopen() without ever calling fclose() on the FILE *,
2885 * it will leak the FILE structure; the leak is not in fopen(), it's in
2886 * the program.) Change the program to use pcap_freecode() when it's
2887 * done with the filter program. See the pcap man page.
2888 */
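/*
 * A minimal usage sketch, not taken from the libpcap sources ("ic" and
 * "errbuf" are assumed to be the caller's intermediate code and error
 * buffer):
 *
 *	struct bpf_program prog;
 *
 *	prog.bf_insns = icode_to_fcode(ic, ic->root, &prog.bf_len, errbuf);
 *	if (prog.bf_insns == NULL)
 *		return (-1);		// errbuf holds the error message
 *	...				// use the program, e.g. in pcap_setfilter()
 *	pcap_freecode(&prog);		// the caller owns and frees the array
 */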
2889 struct bpf_insn *
2890 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2891 char *errbuf)
2892 {
2893 u_int n;
2894 struct bpf_insn *fp;
2895 conv_state_t conv_state;
2896
2897 conv_state.fstart = NULL;
2898 conv_state.errbuf = errbuf;
2899 if (setjmp(conv_state.top_ctx) != 0) {
2900 free(conv_state.fstart);
2901 return NULL;
2902 }
2903
2904 /*
2905 * Loop doing convert_code_r() until no branches remain
2906 * with too-large offsets.
2907 */
2908 for (;;) {
2909 unMarkAll(ic);
2910 n = *lenp = count_stmts(ic, root);
2911
2912 fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2913 if (fp == NULL) {
2914 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2915 "malloc");
2916 return NULL;
2917 }
2918 memset((char *)fp, 0, sizeof(*fp) * n);
2919 conv_state.fstart = fp;
2920 conv_state.ftail = fp + n;
2921
2922 unMarkAll(ic);
2923 if (convert_code_r(&conv_state, ic, root))
2924 break;
2925 free(fp);
2926 }
2927
2928 return fp;
2929 }
2930
2931 /*
2932 * For icode_to_fcode() errors.
2933 */
2934 static void PCAP_NORETURN
2935 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2936 {
2937 va_list ap;
2938
2939 va_start(ap, fmt);
2940 (void)vsnprintf(conv_state->errbuf,
2941 PCAP_ERRBUF_SIZE, fmt, ap);
2942 va_end(ap);
2943 longjmp(conv_state->top_ctx, 1);
2944 /* NOTREACHED */
2945 #ifdef _AIX
2946 PCAP_UNREACHABLE
2947 #endif /* _AIX */
2948 }
2949
2950 /*
2951 * Make a copy of a BPF program and put it in the "fcode" member of
2952 * a "pcap_t".
2953 *
2954 * If we fail to allocate memory for the copy, fill in the "errbuf"
2955 * member of the "pcap_t" with an error message, and return -1;
2956 * otherwise, return 0.
2957 */
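/*
 * A sketch of how a capture module's setfilter routine might use this
 * when filtering is done in userland ("mydev_setfilter" is a
 * hypothetical name; "p" and "fp" come from the pcap_setfilter() call):
 *
 *	static int
 *	mydev_setfilter(pcap_t *p, struct bpf_program *fp)
 *	{
 *		// validate the program and keep a private copy of it
 *		return (pcapint_install_bpf_program(p, fp));
 *	}
 */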
2958 int
2959 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp)
2960 {
2961 size_t prog_size;
2962
2963 /*
2964 * Validate the program.
2965 */
2966 if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
2967 snprintf(p->errbuf, sizeof(p->errbuf),
2968 "BPF program is not valid");
2969 return (-1);
2970 }
2971
2972 /*
2973 * Free up any already installed program.
2974 */
2975 pcap_freecode(&p->fcode);
2976
2977 prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2978 p->fcode.bf_len = fp->bf_len;
2979 p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2980 if (p->fcode.bf_insns == NULL) {
2981 pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2982 errno, "malloc");
2983 return (-1);
2984 }
2985 memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2986 return (0);
2987 }
2988
2989 #ifdef BDEBUG
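/*
 * Emit one DOT node for every block reachable from 'block', labelled
 * with the block's BPF instructions and the value numbers of the memory
 * words and registers at exit; return blocks get a double border.
 */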
2990 static void
2991 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2992 FILE *out)
2993 {
2994 int icount, noffset;
2995 int i;
2996
2997 if (block == NULL || isMarked(ic, block))
2998 return;
2999 Mark(ic, block);
3000
3001 icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
3002 noffset = min(block->offset + icount, (int)prog->bf_len);
3003
3004 fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
3005 for (i = block->offset; i < noffset; i++) {
3006 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
3007 }
3008 fprintf(out, "\" tooltip=\"");
3009 for (i = 0; i < BPF_MEMWORDS; i++)
3010 if (block->val[i] != VAL_UNKNOWN)
3011 fprintf(out, "val[%d]=%d ", i, block->val[i]);
3012 fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
3013 fprintf(out, "val[X]=%d", block->val[X_ATOM]);
3014 fprintf(out, "\"");
3015 if (JT(block) == NULL)
3016 fprintf(out, ", peripheries=2");
3017 fprintf(out, "];\n");
3018
3019 dot_dump_node(ic, JT(block), prog, out);
3020 dot_dump_node(ic, JF(block), prog, out);
3021 }
3022
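/*
 * Emit the true ("T") and false ("F") edges of every block reachable
 * from 'block' in DOT format.
 */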
3023 static void
3024 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
3025 {
3026 if (block == NULL || isMarked(ic, block))
3027 return;
3028 Mark(ic, block);
3029
3030 if (JT(block)) {
3031 fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
3032 block->id, JT(block)->id);
3033 fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3034 block->id, JF(block)->id);
3035 }
3036 dot_dump_edge(ic, JT(block), out);
3037 dot_dump_edge(ic, JF(block), out);
3038 }
3039
3040 /* Output the block CFG in the Graphviz DOT language.
3041 * For each block, the code, the value numbers of the registers at exit,
3042 * and the jump relationships are shown.
3043 *
3044 * example DOT for BPF `ip src host 1.1.1.1' is:
3045 digraph BPF {
3046 block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh [12]\n(001) jeq #0x800 jt 2 jf 5" tooltip="val[A]=0 val[X]=0"];
3047 block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld [26]\n(003) jeq #0x1010101 jt 4 jf 5" tooltip="val[A]=0 val[X]=0"];
3048 block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3049 block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3050 "block0":se -> "block1":n [label="T"];
3051 "block0":sw -> "block3":n [label="F"];
3052 "block1":se -> "block2":n [label="T"];
3053 "block1":sw -> "block3":n [label="F"];
3054 }
3055 *
3056 * After installing Graphviz (https://www.graphviz.org/), save the output as
3057 * bpf.dot and run `dot -Tpng -O bpf.dot' to draw the graph.
3058 */
3059 static int
3060 dot_dump(struct icode *ic, char *errbuf)
3061 {
3062 struct bpf_program f;
3063 FILE *out = stdout;
3064
3065 memset(bids, 0, sizeof bids);
3066 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3067 if (f.bf_insns == NULL)
3068 return -1;
3069
3070 fprintf(out, "digraph BPF {\n");
3071 unMarkAll(ic);
3072 dot_dump_node(ic, ic->root, &f, out);
3073 unMarkAll(ic);
3074 dot_dump_edge(ic, ic->root, out);
3075 fprintf(out, "}\n");
3076
3077 free((char *)f.bf_insns);
3078 return 0;
3079 }
3080
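/*
 * Convert the icode to a BPF program and dump it as plain text with
 * bpf_dump().
 */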
3081 static int
3082 plain_dump(struct icode *ic, char *errbuf)
3083 {
3084 struct bpf_program f;
3085
3086 memset(bids, 0, sizeof bids);
3087 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3088 if (f.bf_insns == NULL)
3089 return -1;
3090 bpf_dump(&f, 1);
3091 putchar('\n');
3092 free((char *)f.bf_insns);
3093 return 0;
3094 }
3095
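/*
 * Dump the current state of the icode, either as a DOT graph or as BPF
 * assembler, depending on the pcap_print_dot_graph flag.
 */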
3096 static void
3097 opt_dump(opt_state_t *opt_state, struct icode *ic)
3098 {
3099 int status;
3100 char errbuf[PCAP_ERRBUF_SIZE];
3101
3102 /*
3103 * If the CFG, in DOT format, is requested, output it rather than
3104 * the code that would be generated from that graph.
3105 */
3106 if (pcap_print_dot_graph)
3107 status = dot_dump(ic, errbuf);
3108 else
3109 status = plain_dump(ic, errbuf);
3110 if (status == -1)
3111 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
3112 }
3113 #endif
3114