xref: /titanic_50/usr/src/uts/common/ipp/flowacct/flowacct.c (revision 60946fe0e0cab23f683e9de92451aa45b7913135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/kmem.h>
31 #include <sys/conf.h>
32 #include <sys/atomic.h>
33 #include <netinet/in.h>
34 #include <netinet/in_systm.h>
35 #include <netinet/ip6.h>
36 #include <sys/socket.h>
37 #include <sys/acct.h>
38 #include <sys/exacct.h>
39 #include <inet/common.h>
40 #include <inet/ip.h>
41 #include <inet/ip6.h>
42 #include <sys/ddi.h>
43 #include <sys/strsun.h>
44 #include <ipp/flowacct/flowacct_impl.h>
45 
/*
 * flowacct - IPQoS accounting module. The module maintains an array
 * of 256 hash buckets. When the action routine is invoked for a flow,
 * if the flow (identified by the 5-tuple: saddr, daddr, sport, dport, proto)
 * is already present in the flow table (indexed by the hash function
 * FLOWACCT_FLOW_HASH) then a check is made to see if an item for this flow
 * with the same dsfield, projid & user id is present. If it is, then the
 * number of packets and the bytes are incremented for that item. If the
 * item does not exist, a new item is added for the flow. If the flow is not
 * present, an entry is made for the flow.
 *
 * A timer runs through the table and writes all the flow items that have
 * timed out to the accounting file (via exacct PSARC/1999/119), if present.
 * Configuration commands to change the timing interval are provided. The
 * flow timeout value can also be configured. While the timeout is in nsec,
 * the flow timer interval is in usec.
 * Information for an active flow can be obtained by using kstats.
 */
64 
/*
 * Used in computing the hash index.
 *
 * FLOWACCT_ADDR_HASH(addr) XORs six bytes of an address (byte offsets 8, 9,
 * 10, 13, 14 and 15 of its s6_addr8 array) into a single value. `addr' must
 * be an lvalue with an s6_addr8 member, not a pointer.
 *
 * FLOWACCT_FLOW_HASH(f) sums the two address hashes with the protocol and
 * both ports of the flow description pointed to by `f' and reduces the sum
 * modulo FLOW_TBL_COUNT to obtain the bucket index. `f' is a pointer
 * expression; it is fully parenthesized so that arguments such as `&hdr'
 * expand correctly. Note that `f' is evaluated several times, so it must not
 * have side effects.
 */
#define	FLOWACCT_ADDR_HASH(addr)			\
	((addr).s6_addr8[8] ^ (addr).s6_addr8[9] ^	\
	(addr).s6_addr8[10] ^ (addr).s6_addr8[13] ^	\
	(addr).s6_addr8[14] ^ (addr).s6_addr8[15])

#define	FLOWACCT_FLOW_HASH(f)				\
	(((FLOWACCT_ADDR_HASH((f)->saddr)) +		\
	(FLOWACCT_ADDR_HASH((f)->daddr)) +		\
	((f)->proto) + ((f)->sport) + ((f)->dport))	\
	% FLOW_TBL_COUNT)
76 
/*
 * Compute difference between a and b in nsec and store in delta.
 * delta should be a hrtime_t; a and b are timespec-style values exposing
 * tv_sec and tv_nsec. Originally taken from ip_mroute.c.
 *
 * The seconds difference is widened to hrtime_t before it is scaled so that
 * large intervals are not truncated, and the body is wrapped in
 * do { } while (0) so the macro behaves as a single statement (e.g. inside
 * an unbraced if/else). Each argument is evaluated more than once.
 */
#define	FLOWACCT_DELTA(a, b, delta) do { \
	(delta) = ((hrtime_t)((a).tv_sec - (b).tv_sec) * NANOSEC) + \
	    ((a).tv_nsec - (b).tv_nsec); \
} while (0)
98 
/*
 * Debug level for this module; starts at 0.
 * NOTE(review): presumably consulted by the flowacct0dbg()/flowacct1dbg()
 * macros, which are defined outside this file - confirm.
 */
int flowacct_debug = 0;
101 
102 /* Collect timed out flows to be written to the accounting file */
103 typedef struct flow_records_s {
104 	flow_usage_t *fl_use;
105 	struct flow_records_s *next;
106 }flow_records_t;
107 
108 /* Get port information from the packet. Ignore fragments. */
109 static void
110 flowacct_port_info(header_t *header, void *iph, int af, mblk_t *mp)
111 {
112 	uint16_t *up;
113 
114 	if (af == AF_INET) {
115 		ipha_t *ipha = (ipha_t *)iph;
116 		uint32_t u2, u1;
117 		uint_t iplen;
118 
119 		u2 = ntohs(ipha->ipha_fragment_offset_and_flags);
120 		u1 = u2 & (IPH_MF | IPH_OFFSET);
121 		if (u1 != 0) {
122 			return;
123 		}
124 		iplen = (ipha->ipha_version_and_hdr_length & 0xF) << 2;
125 		up = (uint16_t *)(mp->b_rptr + iplen);
126 		header->sport = (uint16_t)*up++;
127 		header->dport = (uint16_t)*up;
128 	} else {
129 		ip6_t *ip6h = (ip6_t *)iph;
130 		uint_t  length = IPV6_HDR_LEN;
131 		uint_t  ehdrlen;
132 		uint8_t *nexthdrp, *whereptr, *endptr;
133 		ip6_dest_t *desthdr;
134 		ip6_rthdr_t *rthdr;
135 		ip6_hbh_t *hbhhdr;
136 
137 		whereptr = ((uint8_t *)&ip6h[1]);
138 		endptr = mp->b_wptr;
139 		nexthdrp = &ip6h->ip6_nxt;
140 		while (whereptr < endptr) {
141 			switch (*nexthdrp) {
142 			case IPPROTO_HOPOPTS:
143 				hbhhdr = (ip6_hbh_t *)whereptr;
144 				ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
145 				if ((uchar_t *)hbhhdr +  ehdrlen > endptr)
146 					return;
147 				nexthdrp = &hbhhdr->ip6h_nxt;
148 				break;
149 			case IPPROTO_DSTOPTS:
150 				desthdr = (ip6_dest_t *)whereptr;
151 				ehdrlen = 8 * (desthdr->ip6d_len + 1);
152 				if ((uchar_t *)desthdr +  ehdrlen > endptr)
153 					return;
154 				nexthdrp = &desthdr->ip6d_nxt;
155 				break;
156 			case IPPROTO_ROUTING:
157 				rthdr = (ip6_rthdr_t *)whereptr;
158 				ehdrlen =  8 * (rthdr->ip6r_len + 1);
159 				if ((uchar_t *)rthdr +  ehdrlen > endptr)
160 					return;
161 				nexthdrp = &rthdr->ip6r_nxt;
162 				break;
163 			case IPPROTO_FRAGMENT:
164 				return;
165 			case IPPROTO_TCP:
166 			case IPPROTO_UDP:
167 			case IPPROTO_SCTP:
168 				/*
169 				 * Verify we have at least ICMP_MIN_TP_HDR_LEN
170 				 * bytes of the ULP's header to get the port
171 				 * info.
172 				 */
173 				if (((uchar_t *)ip6h + length +
174 				    ICMP_MIN_TP_HDR_LEN)  > endptr) {
175 					return;
176 				}
177 				/* Get the protocol & ports */
178 				header->proto = *nexthdrp;
179 				up = (uint16_t *)((uchar_t *)ip6h + length);
180 				header->sport = (uint16_t)*up++;
181 				header->dport = (uint16_t)*up;
182 				return;
183 			case IPPROTO_ICMPV6:
184 			case IPPROTO_ENCAP:
185 			case IPPROTO_IPV6:
186 			case IPPROTO_ESP:
187 			case IPPROTO_AH:
188 				header->proto = *nexthdrp;
189 				return;
190 			case IPPROTO_NONE:
191 			default:
192 				return;
193 			}
194 			length += ehdrlen;
195 			whereptr += ehdrlen;
196 		}
197 	}
198 }
199 
200 /*
201  * flowacct_find_ids(mp, header)
202  *
203  * attempt to discern the uid and projid of the originator of a packet by
204  * looking at the dblks making up the packet - yeuch!
205  *
206  * We do it by skipping any fragments with a credp of NULL (originated in
207  * kernel), taking the first value that isn't NULL to be the cred_t for the
208  * whole packet.
209  */
210 static void
211 flowacct_find_ids(mblk_t *mp, header_t *header)
212 {
213 	cred_t *cr;
214 
215 	while (DB_CRED(mp) == NULL && mp->b_cont != NULL)
216 		mp = mp->b_cont;
217 
218 	if ((cr = DB_CRED(mp)) != NULL) {
219 		header->uid = crgetuid(cr);
220 		header->projid = crgetprojid(cr);
221 	} else {
222 		header->uid = (uid_t)-1;
223 		header->projid = -1;
224 	}
225 }
226 
227 /*
228  * Extract header information in a header_t structure so that we don't have
229  * have to parse the packet everytime.
230  */
231 static int
232 flowacct_extract_header(mblk_t *mp, header_t *header)
233 {
234 	ipha_t *ipha;
235 	ip6_t *ip6h;
236 #define	rptr	((uchar_t *)ipha)
237 
238 	/* 0 means no port extracted. */
239 	header->sport = 0;
240 	header->dport = 0;
241 	flowacct_find_ids(mp, header);
242 
243 	V6_SET_ZERO(header->saddr);
244 	V6_SET_ZERO(header->daddr);
245 
246 	ipha = (ipha_t *)mp->b_rptr;
247 	header->isv4 = IPH_HDR_VERSION(ipha) == IPV4_VERSION;
248 	if (header->isv4) {
249 		ipha = (ipha_t *)mp->b_rptr;
250 		V4_PART_OF_V6(header->saddr) = (int32_t)ipha->ipha_src;
251 		V4_PART_OF_V6(header->daddr) = (int32_t)ipha->ipha_dst;
252 		header->dsfield = ipha->ipha_type_of_service;
253 		header->proto = ipha->ipha_protocol;
254 		header->pktlen = ntohs(ipha->ipha_length);
255 		if ((header->proto == IPPROTO_TCP) ||
256 		    (header->proto == IPPROTO_UDP) ||
257 		    (header->proto == IPPROTO_SCTP)) {
258 			flowacct_port_info(header, ipha, AF_INET, mp);
259 		}
260 	} else {
261 		/*
262 		 * Need to pullup everything.
263 		 */
264 		if (mp->b_cont != NULL) {
265 			if (!pullupmsg(mp, -1)) {
266 				flowacct0dbg(("flowacct_extract_header: "\
267 				    "pullup error"));
268 				return (-1);
269 			}
270 		}
271 		ip6h = (ip6_t *)mp->b_rptr;
272 		bcopy(ip6h->ip6_src.s6_addr32, header->saddr.s6_addr32,
273 		    sizeof (ip6h->ip6_src.s6_addr32));
274 		bcopy(ip6h->ip6_dst.s6_addr32, header->daddr.s6_addr32,
275 		    sizeof (ip6h->ip6_dst.s6_addr32));
276 		header->dsfield = __IPV6_TCLASS_FROM_FLOW(ip6h->ip6_vcf);
277 		header->proto = ip6h->ip6_nxt;
278 		header->pktlen = ntohs(ip6h->ip6_plen) +
279 		    ip_hdr_length_v6(mp, ip6h);
280 		flowacct_port_info(header, ip6h, AF_INET6, mp);
281 
282 	}
283 #undef	rptr
284 	return (0);
285 }
286 
287 /* Check if the flow (identified by the 5-tuple) exists in the hash table */
288 static flow_t *
289 flowacct_flow_present(header_t *header, int index,
290     flowacct_data_t *flowacct_data)
291 {
292 	list_hdr_t *hdr = flowacct_data->flows_tbl[index].head;
293 	flow_t *flow;
294 
295 	while (hdr != NULL) {
296 		flow = (flow_t *)hdr->objp;
297 		if ((flow != NULL) &&
298 		    (IN6_ARE_ADDR_EQUAL(&flow->saddr, &header->saddr)) &&
299 		    (IN6_ARE_ADDR_EQUAL(&flow->daddr, &header->daddr)) &&
300 		    (flow->proto == header->proto) &&
301 		    (flow->sport == header->sport) &&
302 		    (flow->dport == header->dport)) {
303 			return (flow);
304 		}
305 		hdr = hdr->next;
306 	}
307 	return ((flow_t *)NULL);
308 }
309 
310 /*
311  * Add an object to the list at insert_point. This could be a flow item or
312  * a flow itself.
313  */
314 static list_hdr_t *
315 flowacct_add_obj(list_head_t *tophdr, list_hdr_t *insert_point, void *obj)
316 {
317 	list_hdr_t *new_hdr;
318 
319 	if (tophdr == NULL) {
320 		return ((list_hdr_t *)NULL);
321 	}
322 
323 	new_hdr = (list_hdr_t *)kmem_zalloc(FLOWACCT_HDR_SZ, KM_NOSLEEP);
324 	if (new_hdr == NULL) {
325 		flowacct0dbg(("flowacct_add_obj: error allocating mem"));
326 		return ((list_hdr_t *)NULL);
327 	}
328 	gethrestime(&new_hdr->last_seen);
329 	new_hdr->objp = obj;
330 	tophdr->nbr_items++;
331 
332 	if (insert_point == NULL) {
333 		if (tophdr->head == NULL) {
334 			tophdr->head = new_hdr;
335 			tophdr->tail = new_hdr;
336 			return (new_hdr);
337 		}
338 
339 		new_hdr->next = tophdr->head;
340 		tophdr->head->prev = new_hdr;
341 		tophdr->head = new_hdr;
342 		return (new_hdr);
343 	}
344 
345 	if (insert_point == tophdr->tail) {
346 		tophdr->tail->next = new_hdr;
347 		new_hdr->prev = tophdr->tail;
348 		tophdr->tail = new_hdr;
349 		return (new_hdr);
350 	}
351 
352 	new_hdr->next = insert_point->next;
353 	new_hdr->prev = insert_point;
354 	insert_point->next->prev = new_hdr;
355 	insert_point->next = new_hdr;
356 	return (new_hdr);
357 }
358 
359 /* Delete an obj from the list. This could be a flow item or the flow itself */
360 static void
361 flowacct_del_obj(list_head_t *tophdr, list_hdr_t *hdr, uint_t mode)
362 {
363 	size_t	length;
364 	uint_t	type;
365 
366 	if ((tophdr == NULL) || (hdr == NULL)) {
367 		return;
368 	}
369 
370 	type = ((flow_t *)hdr->objp)->type;
371 
372 	tophdr->nbr_items--;
373 
374 	if (hdr->next != NULL) {
375 		hdr->next->prev = hdr->prev;
376 	}
377 	if (hdr->prev != NULL) {
378 		hdr->prev->next = hdr->next;
379 	}
380 	if (tophdr->head == hdr) {
381 		tophdr->head = hdr->next;
382 	}
383 	if (tophdr->tail == hdr) {
384 		tophdr->tail = hdr->prev;
385 	}
386 
387 	if (mode == FLOWACCT_DEL_OBJ) {
388 		switch (type) {
389 		case FLOWACCT_FLOW:
390 			length = FLOWACCT_FLOW_SZ;
391 			break;
392 		case FLOWACCT_ITEM:
393 			length = FLOWACCT_ITEM_SZ;
394 			break;
395 		}
396 		kmem_free(hdr->objp, length);
397 		hdr->objp = NULL;
398 	}
399 
400 	kmem_free((void *)hdr, FLOWACCT_HDR_SZ);
401 }
402 
403 /*
404  * Checks if the given item (identified by dsfield, project id and uid)
405  * is already present for the flow.
406  */
407 static flow_item_t *
408 flowacct_item_present(flow_t *flow, uint8_t dsfield, pid_t proj_id, uint_t uid)
409 {
410 	list_hdr_t	*itemhdr;
411 	flow_item_t	*item;
412 
413 	itemhdr = flow->items.head;
414 
415 	while (itemhdr != NULL) {
416 		item = (flow_item_t *)itemhdr->objp;
417 
418 		if ((item->dsfield != dsfield) || (item->projid != proj_id) ||
419 		    (item->uid != uid)) {
420 			itemhdr = itemhdr->next;
421 			continue;
422 		}
423 		return (item);
424 	}
425 
426 	return ((flow_item_t *)NULL);
427 }
428 
429 /*
430  * Add the flow to the table, if not already present. If the flow is
431  * present in the table, add the item. Also, update the flow stats.
432  * Additionally, re-adjust the timout list as well.
433  */
434 static int
435 flowacct_update_flows_tbl(header_t *header, flowacct_data_t *flowacct_data)
436 {
437 	int index;
438 	list_head_t *fhead;
439 	list_head_t *thead;
440 	list_head_t *ihead;
441 	boolean_t added_flow = B_FALSE;
442 	timespec_t  now;
443 	flow_item_t *item;
444 	flow_t *flow;
445 
446 	index = FLOWACCT_FLOW_HASH(header);
447 	fhead = &flowacct_data->flows_tbl[index];
448 
449 	/* The timeout list */
450 	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
451 
452 	mutex_enter(&fhead->lock);
453 	flow = flowacct_flow_present(header, index, flowacct_data);
454 	if (flow == NULL) {
455 		flow = (flow_t *)kmem_zalloc(FLOWACCT_FLOW_SZ, KM_NOSLEEP);
456 		if (flow == NULL) {
457 			mutex_exit(&fhead->lock);
458 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
459 			    "error"));
460 			return (-1);
461 		}
462 		flow->hdr = flowacct_add_obj(fhead, fhead->tail, (void *)flow);
463 		if (flow->hdr == NULL) {
464 			mutex_exit(&fhead->lock);
465 			kmem_free(flow, FLOWACCT_FLOW_SZ);
466 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
467 			    "error"));
468 			return (-1);
469 		}
470 
471 		flow->type = FLOWACCT_FLOW;
472 		flow->isv4 = header->isv4;
473 		bcopy(header->saddr.s6_addr32, flow->saddr.s6_addr32,
474 		    sizeof (header->saddr.s6_addr32));
475 		bcopy(header->daddr.s6_addr32, flow->daddr.s6_addr32,
476 		    sizeof (header->daddr.s6_addr32));
477 		flow->proto = header->proto;
478 		flow->sport = header->sport;
479 		flow->dport = header->dport;
480 		flow->back_ptr = fhead;
481 		added_flow = B_TRUE;
482 	} else {
483 		/*
484 		 * We need to make sure that this 'flow' is not deleted
485 		 * either by a scheduled timeout or an explict call
486 		 * to flowacct_timer() below.
487 		 */
488 		flow->inuse = B_TRUE;
489 	}
490 
491 	ihead = &flow->items;
492 	item = flowacct_item_present(flow, header->dsfield, header->projid,
493 	    header->uid);
494 	if (item == NULL) {
495 		boolean_t just_once = B_TRUE;
496 		/*
497 		 * For all practical purposes, we limit the no. of entries in
498 		 * the flow table - i.e. the max_limt that a user specifies is
499 		 * the maximum no. of flow items in the table.
500 		 */
501 	try_again:
502 		atomic_add_32(&flowacct_data->nflows, 1);
503 		if (flowacct_data->nflows > flowacct_data->max_limit) {
504 			atomic_add_32(&flowacct_data->nflows, -1);
505 
506 			/* Try timing out once */
507 			if (just_once) {
508 				/*
509 				 * Need to release the lock, as this entry
510 				 * could contain a flow that can be timed
511 				 * out.
512 				 */
513 				mutex_exit(&fhead->lock);
514 				flowacct_timer(FLOWACCT_JUST_ONE,
515 				    flowacct_data);
516 				mutex_enter(&fhead->lock);
517 				/* Lets check again */
518 				just_once = B_FALSE;
519 				goto try_again;
520 			} else {
521 				flow->inuse = B_FALSE;
522 				/* Need to remove the flow, if one was added */
523 				if (added_flow) {
524 					flowacct_del_obj(fhead, flow->hdr,
525 					    FLOWACCT_DEL_OBJ);
526 				}
527 				mutex_exit(&fhead->lock);
528 				flowacct1dbg(("flowacct_update_flows_tbl: "\
529 				    "maximum active flows exceeded\n"));
530 				return (-1);
531 			}
532 		}
533 		item = (flow_item_t *)kmem_zalloc(FLOWACCT_ITEM_SZ, KM_NOSLEEP);
534 		if (item == NULL) {
535 			flow->inuse = B_FALSE;
536 			/* Need to remove the flow, if one was added */
537 			if (added_flow) {
538 				flowacct_del_obj(fhead, flow->hdr,
539 				    FLOWACCT_DEL_OBJ);
540 			}
541 			mutex_exit(&fhead->lock);
542 			atomic_add_32(&flowacct_data->nflows, -1);
543 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
544 			    "error"));
545 			return (-1);
546 		}
547 		item->hdr = flowacct_add_obj(ihead, ihead->tail, (void *)item);
548 		if (item->hdr == NULL) {
549 			flow->inuse = B_FALSE;
550 			/* Need to remove the flow, if one was added */
551 			if (added_flow) {
552 				flowacct_del_obj(fhead, flow->hdr,
553 				    FLOWACCT_DEL_OBJ);
554 			}
555 			mutex_exit(&fhead->lock);
556 			atomic_add_32(&flowacct_data->nflows, -1);
557 			kmem_free(item, FLOWACCT_ITEM_SZ);
558 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
559 			    "error\n"));
560 			return (-1);
561 		}
562 		/* If a flow was added, add it too */
563 		if (added_flow) {
564 			atomic_add_64(&flowacct_data->usedmem,
565 			    FLOWACCT_FLOW_RECORD_SZ);
566 		}
567 		atomic_add_64(&flowacct_data->usedmem, FLOWACCT_ITEM_RECORD_SZ);
568 
569 		item->type = FLOWACCT_ITEM;
570 		item->dsfield = header->dsfield;
571 		item->projid = header->projid;
572 		item->uid = header->uid;
573 		item->npackets = 1;
574 		item->nbytes = header->pktlen;
575 		item->creation_time = item->hdr->last_seen;
576 	} else {
577 		item->npackets++;
578 		item->nbytes += header->pktlen;
579 	}
580 	gethrestime(&now);
581 	flow->hdr->last_seen = item->hdr->last_seen = now;
582 	mutex_exit(&fhead->lock);
583 
584 	/*
585 	 * Re-adjust the timeout list. The timer takes the thead lock
586 	 * follwed by fhead lock(s), so we release fhead, take thead
587 	 * and re-take fhead.
588 	 */
589 	mutex_enter(&thead->lock);
590 	mutex_enter(&fhead->lock);
591 	/* If the flow was added, append it to the tail of the timeout list */
592 	if (added_flow) {
593 		if (thead->head == NULL) {
594 			thead->head = flow->hdr;
595 			thead->tail = flow->hdr;
596 		} else {
597 			thead->tail->timeout_next = flow->hdr;
598 			flow->hdr->timeout_prev = thead->tail;
599 			thead->tail = flow->hdr;
600 		}
601 	/*
602 	 * Else, move this flow to the tail of the timeout list, if it is not
603 	 * already.
604 	 * flow->hdr in the timeout list :-
605 	 * timeout_next = NULL, timeout_prev != NULL, at the tail end.
606 	 * timeout_next != NULL, timeout_prev = NULL, at the head.
607 	 * timeout_next != NULL, timeout_prev != NULL, in the middle.
608 	 * timeout_next = NULL, timeout_prev = NULL, not in the timeout list,
609 	 * ignore such flow.
610 	 */
611 	} else if ((flow->hdr->timeout_next != NULL) ||
612 	    (flow->hdr->timeout_prev != NULL)) {
613 		if (flow->hdr != thead->tail) {
614 			if (flow->hdr == thead->head) {
615 				thead->head->timeout_next->timeout_prev = NULL;
616 				thead->head = thead->head->timeout_next;
617 				flow->hdr->timeout_next = NULL;
618 				thead->tail->timeout_next = flow->hdr;
619 				flow->hdr->timeout_prev = thead->tail;
620 				thead->tail = flow->hdr;
621 			} else {
622 				flow->hdr->timeout_prev->timeout_next =
623 				    flow->hdr->timeout_next;
624 				flow->hdr->timeout_next->timeout_prev =
625 				    flow->hdr->timeout_prev;
626 				flow->hdr->timeout_next = NULL;
627 				thead->tail->timeout_next = flow->hdr;
628 				flow->hdr->timeout_prev = thead->tail;
629 				thead->tail = flow->hdr;
630 			}
631 		}
632 	}
633 	/*
634 	 * Unset this variable, now it is fine even if this
635 	 * flow gets deleted (i.e. after timing out its
636 	 * flow items) since we are done using it.
637 	 */
638 	flow->inuse = B_FALSE;
639 	mutex_exit(&fhead->lock);
640 	mutex_exit(&thead->lock);
641 	atomic_add_64(&flowacct_data->tbytes, header->pktlen);
642 	return (0);
643 }
644 
645 /* Timer for timing out flows/items from the flow table */
646 void
647 flowacct_timeout_flows(void *args)
648 {
649 	flowacct_data_t *flowacct_data = (flowacct_data_t *)args;
650 	flowacct_timer(FLOWACCT_FLOW_TIMER, flowacct_data);
651 	flowacct_data->flow_tid = timeout(flowacct_timeout_flows, flowacct_data,
652 	    drv_usectohz(flowacct_data->timer));
653 }
654 
655 
656 /* Delete the item from the flow in the flow table */
657 static void
658 flowacct_timeout_item(flow_t **flow, list_hdr_t **item_hdr)
659 {
660 	list_hdr_t *next_it_hdr;
661 
662 	next_it_hdr = (*item_hdr)->next;
663 	flowacct_del_obj(&(*flow)->items, *item_hdr, FLOWACCT_DEL_OBJ);
664 	*item_hdr = next_it_hdr;
665 }
666 
667 /* Create a flow record for this timed out item */
668 static flow_records_t *
669 flowacct_create_record(flow_t *flow, list_hdr_t *ithdr)
670 {
671 	int count;
672 	flow_item_t *item = (flow_item_t *)ithdr->objp;
673 	flow_records_t *tmp_frec = NULL;
674 
675 	/* Record to be written into the accounting file */
676 	tmp_frec = kmem_zalloc(sizeof (flow_records_t), KM_NOSLEEP);
677 	if (tmp_frec == NULL) {
678 		flowacct0dbg(("flowacct_create_record: mem alloc error.\n"));
679 		return (NULL);
680 	}
681 	tmp_frec->fl_use = kmem_zalloc(sizeof (flow_usage_t), KM_NOSLEEP);
682 	if (tmp_frec->fl_use == NULL) {
683 		flowacct0dbg(("flowacct_create_record: mem alloc error\n"));
684 		kmem_free(tmp_frec, sizeof (flow_records_t));
685 		return (NULL);
686 	}
687 
688 	/* Copy the IP address */
689 	for (count = 0; count < 4; count++) {
690 		tmp_frec->fl_use->fu_saddr[count] =
691 		    htonl(flow->saddr.s6_addr32[count]);
692 		tmp_frec->fl_use->fu_daddr[count] =
693 		    htonl(flow->daddr.s6_addr32[count]);
694 	}
695 
696 	/*
697 	 * Ports, protocol, version, dsfield, project id, uid, nbytes, npackets
698 	 * creation time and last seen.
699 	 */
700 	tmp_frec->fl_use->fu_sport = htons(flow->sport);
701 	tmp_frec->fl_use->fu_dport = htons(flow->dport);
702 	tmp_frec->fl_use->fu_protocol = flow->proto;
703 	tmp_frec->fl_use->fu_isv4 = flow->isv4;
704 	tmp_frec->fl_use->fu_dsfield = item->dsfield;
705 	tmp_frec->fl_use->fu_projid = item->projid;
706 	tmp_frec->fl_use->fu_userid = item->uid;
707 	tmp_frec->fl_use->fu_nbytes = item->nbytes;
708 	tmp_frec->fl_use->fu_npackets = item->npackets;
709 	tmp_frec->fl_use->fu_lseen =
710 	    (uint64_t)(ulong_t)ithdr->last_seen.tv_sec;
711 	tmp_frec->fl_use->fu_ctime =
712 	    (uint64_t)(ulong_t)item->creation_time.tv_sec;
713 
714 	return (tmp_frec);
715 }
716 
717 /*
718  * Scan thru the timeout list and write the records to the accounting file, if
719  * possible. Basically step thru the timeout list maintained in the last
720  * hash bucket, FLOW_COUNT_TBL + 1, and timeout flows. This could be called
721  * from the timer, FLOWACCT_TIMER - delete only timed out flows or when this
722  * instance is deleted, FLOWACCT_PURGE_FLOW - delete all the flows from the
723  * table or as FLOWACCT_JUST_ONE - delete the first timed out flow. Since the
724  * flows are cronologically arranged in the timeout list,  when called as
725  * FLOWACCT_TIMER and FLOWACCT_JUST_ONE, we can stop when we come across
726  * the first flow that has not timed out (which means none of the following
727  * flows would have timed out).
728  */
729 void
730 flowacct_timer(int type, flowacct_data_t *flowacct_data)
731 {
732 	hrtime_t diff;
733 	timespec_t now;
734 	list_head_t *head, *thead;
735 	flow_t *flow;
736 	flow_item_t *item;
737 	list_hdr_t *fl_hdr, *next_fl_hdr;
738 	list_hdr_t *ithdr = (list_hdr_t *)NULL;
739 	flow_records_t *frec = NULL, *tmp_frec, *tail;
740 	uint64_t flow_size;
741 	uint64_t item_size;
742 
743 	ASSERT(flowacct_data != NULL);
744 
745 	/* 2s-complement for subtraction */
746 	flow_size = ~FLOWACCT_FLOW_RECORD_SZ + 1;
747 	item_size = ~FLOWACCT_ITEM_RECORD_SZ + 1;
748 
749 	/* Get the current time */
750 	gethrestime(&now);
751 
752 	/*
753 	 * For each flow in the table, scan thru all the items and delete
754 	 * those that have exceeded the timeout. If all the items in a
755 	 * flow have timed out, delete the flow entry as well. Finally,
756 	 * write all the delted items to the accounting file.
757 	 */
758 	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
759 
760 	mutex_enter(&thead->lock);
761 	fl_hdr = thead->head;
762 	while (fl_hdr != NULL) {
763 		uint32_t	items_deleted = 0;
764 
765 		next_fl_hdr = fl_hdr->timeout_next;
766 		flow = (flow_t *)fl_hdr->objp;
767 		head = flow->back_ptr;
768 		mutex_enter(&head->lock);
769 
770 		/*LINTED*/
771 		FLOWACCT_DELTA(now, fl_hdr->last_seen, diff);
772 
773 		/*
774 		 * If type is FLOW_TIMER, then check if the item has timed out.
775 		 * If type is FLOW_PURGE delete the entry anyways.
776 		 */
777 		if ((type != FLOWACCT_PURGE_FLOW) &&
778 		    (diff < flowacct_data->timeout)) {
779 			mutex_exit(&head->lock);
780 			mutex_exit(&thead->lock);
781 			goto write_records;
782 		}
783 
784 		ithdr = flow->items.head;
785 		while (ithdr != NULL) {
786 			item = (flow_item_t *)ithdr->objp;
787 			/*
788 			 * Fill in the flow record to be
789 			 * written to the accounting file.
790 			 */
791 			tmp_frec = flowacct_create_record(flow, ithdr);
792 			/*
793 			 * If we don't have memory for records,
794 			 * we will come back in case this is
795 			 * called as FLOW_TIMER, else we will
796 			 * go ahead and delete the item from
797 			 * the table (when asked to PURGE the
798 			 * table), so there could be some
799 			 * entries not written to the file
800 			 * when this action instance is
801 			 * deleted.
802 			 */
803 			if (tmp_frec != NULL) {
804 				tmp_frec->fl_use->fu_aname =
805 				    flowacct_data->act_name;
806 				if (frec == NULL) {
807 					frec = tmp_frec;
808 					tail = frec;
809 				} else {
810 					tail->next = tmp_frec;
811 					tail = tmp_frec;
812 				}
813 			} else if (type != FLOWACCT_PURGE_FLOW) {
814 				mutex_exit(&head->lock);
815 				mutex_exit(&thead->lock);
816 				atomic_add_32(&flowacct_data->nflows,
817 				    (~items_deleted + 1));
818 				goto write_records;
819 			}
820 
821 			/* Update stats */
822 			atomic_add_64(&flowacct_data->tbytes, (~item->nbytes +
823 			    1));
824 
825 			/* Delete the item */
826 			flowacct_timeout_item(&flow, &ithdr);
827 			items_deleted++;
828 			atomic_add_64(&flowacct_data->usedmem, item_size);
829 		}
830 		ASSERT(flow->items.nbr_items == 0);
831 		atomic_add_32(&flowacct_data->nflows, (~items_deleted + 1));
832 
833 		/*
834 		 * Don't delete this flow if we are making place for
835 		 * a new item for this flow.
836 		 */
837 		if (!flow->inuse) {
838 			if (fl_hdr->timeout_prev != NULL) {
839 				fl_hdr->timeout_prev->timeout_next =
840 				    fl_hdr->timeout_next;
841 			} else {
842 				thead->head = fl_hdr->timeout_next;
843 			}
844 			if (fl_hdr->timeout_next != NULL) {
845 				fl_hdr->timeout_next->timeout_prev =
846 				    fl_hdr->timeout_prev;
847 			} else {
848 				thead->tail = fl_hdr->timeout_prev;
849 			}
850 			fl_hdr->timeout_prev = NULL;
851 			fl_hdr->timeout_next = NULL;
852 			flowacct_del_obj(head, fl_hdr, FLOWACCT_DEL_OBJ);
853 			atomic_add_64(&flowacct_data->usedmem, flow_size);
854 		}
855 		mutex_exit(&head->lock);
856 		if (type == FLOWACCT_JUST_ONE) {
857 			mutex_exit(&thead->lock);
858 			goto write_records;
859 		}
860 		fl_hdr = next_fl_hdr;
861 	}
862 	mutex_exit(&thead->lock);
863 write_records:
864 	/* Write all the timed out flows to the accounting file */
865 	while (frec != NULL) {
866 		tmp_frec = frec->next;
867 		exacct_commit_flow(frec->fl_use);
868 		kmem_free(frec->fl_use, sizeof (flow_usage_t));
869 		kmem_free(frec, sizeof (flow_records_t));
870 		frec = tmp_frec;
871 	}
872 }
873 
874 /*
875  * Get the IP header contents from the packet, update the flow table with
876  * this item and return.
877  */
878 int
879 flowacct_process(mblk_t **mpp, flowacct_data_t *flowacct_data)
880 {
881 	header_t *header;
882 	mblk_t *mp = *mpp;
883 
884 	ASSERT(mp != NULL);
885 
886 	/* If we don't find an M_DATA, return error */
887 	if (mp->b_datap->db_type != M_DATA) {
888 		if ((mp->b_cont != NULL) &&
889 		    (mp->b_cont->b_datap->db_type == M_DATA)) {
890 			mp = mp->b_cont;
891 		} else {
892 			flowacct0dbg(("flowacct_process: no data\n"));
893 			atomic_add_64(&flowacct_data->epackets, 1);
894 			return (EINVAL);
895 		}
896 	}
897 
898 	header = kmem_zalloc(FLOWACCT_HEADER_SZ, KM_NOSLEEP);
899 	if (header == NULL) {
900 		flowacct0dbg(("flowacct_process: error allocing mem"));
901 		atomic_add_64(&flowacct_data->epackets, 1);
902 		return (ENOMEM);
903 	}
904 
905 	/* Get all the required information into header. */
906 	if (flowacct_extract_header(mp, header) != 0) {
907 		kmem_free(header, FLOWACCT_HEADER_SZ);
908 		atomic_add_64(&flowacct_data->epackets, 1);
909 		return (EINVAL);
910 	}
911 
912 	/* Updated the flow table with this entry */
913 	if (flowacct_update_flows_tbl(header, flowacct_data) != 0) {
914 		kmem_free(header, FLOWACCT_HEADER_SZ);
915 		atomic_add_64(&flowacct_data->epackets, 1);
916 		return (ENOMEM);
917 	}
918 
919 	/* Update global stats */
920 	atomic_add_64(&flowacct_data->npackets, 1);
921 	atomic_add_64(&flowacct_data->nbytes, header->pktlen);
922 
923 	kmem_free(header, FLOWACCT_HEADER_SZ);
924 	if (flowacct_data->flow_tid == 0) {
925 		flowacct_data->flow_tid = timeout(flowacct_timeout_flows,
926 		    flowacct_data, drv_usectohz(flowacct_data->timer));
927 	}
928 	return (0);
929 }
930