xref: /titanic_44/usr/src/uts/common/ipp/flowacct/flowacct.c (revision 999620f696af5092beda95a29692ad3ff16e9434)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/kmem.h>
29 #include <sys/conf.h>
30 #include <sys/atomic.h>
31 #include <netinet/in.h>
32 #include <netinet/in_systm.h>
33 #include <netinet/ip6.h>
34 #include <sys/socket.h>
35 #include <sys/acct.h>
36 #include <sys/exacct.h>
37 #include <inet/common.h>
38 #include <inet/ip.h>
39 #include <inet/ip6.h>
40 #include <sys/ddi.h>
41 #include <sys/strsun.h>
42 #include <sys/strsubr.h>
43 #include <ipp/flowacct/flowacct_impl.h>
44 
/*
 * flowacct - IPQoS accounting module. The module maintains an array
 * of 256 hash buckets. When the action routine is invoked for a flow,
 * if the flow (identified by the 5-tuple: saddr, daddr, sport, dport, proto)
 * is already present in the flow table (indexed by the hash function
 * FLOWACCT_FLOW_HASH) then a check is made to see if an item for this flow
 * with the same dsfield, projid & user id is present. If it is, then the
 * number of packets and the bytes are incremented for that item. If the
 * item does not exist, a new item is added to the flow. If the flow itself
 * is not present, an entry is made for the flow first.
 *
 * A timer runs through the table and writes all the flow items that have
 * timed out to the accounting file (via exacct, PSARC/1999/119), if present.
 * Configuration commands are provided to change the timer interval. The
 * flow timeout value can also be configured. Note that while the timeout
 * is in nsec, the flow timer interval is in usec.
 * Information for an active flow can be obtained by using kstats.
 */
63 
/*
 * Hash helpers used to pick a bucket in the flow table.
 *
 * FLOWACCT_ADDR_HASH XORs together bytes 8, 9, 10, 13, 14 and 15 of an
 * address that exposes an s6_addr8[] byte array.
 *
 * FLOWACCT_FLOW_HASH takes a pointer to an object with saddr, daddr, proto,
 * sport and dport members, adds the two address hashes to the protocol and
 * the ports and reduces the sum modulo FLOW_TBL_COUNT.  The argument is
 * parenthesized so that any pointer expression (e.g. &hdr) can be passed.
 */
#define	FLOWACCT_ADDR_HASH(addr)			\
	((addr).s6_addr8[8] ^ (addr).s6_addr8[9] ^	\
	(addr).s6_addr8[10] ^ (addr).s6_addr8[13] ^	\
	(addr).s6_addr8[14] ^ (addr).s6_addr8[15])

#define	FLOWACCT_FLOW_HASH(f)				\
	(((FLOWACCT_ADDR_HASH((f)->saddr)) +		\
	(FLOWACCT_ADDR_HASH((f)->daddr)) +		\
	((f)->proto) + ((f)->sport) + ((f)->dport))	\
	% FLOW_TBL_COUNT)
75 
/*
 * Compute the difference (a - b) between two timespec-style values, in
 * nanoseconds, and store it in delta.  delta should be an hrtime_t lvalue.
 * Derived from ip_mroute.c.
 *
 * The expansion is a single do/while (0) statement so that the macro can be
 * followed by a semicolon anywhere a statement is allowed (including an
 * unbraced if/else).  The seconds difference is held in an hrtime_t so it is
 * not truncated before being scaled to nanoseconds.  Note that a and b are
 * each evaluated twice.
 */
#define	FLOWACCT_DELTA(a, b, delta) do { \
	hrtime_t xxs = (hrtime_t)(a).tv_sec - (hrtime_t)(b).tv_sec; \
 \
	(delta) = (hrtime_t)(a).tv_nsec - (hrtime_t)(b).tv_nsec; \
	(delta) += (hrtime_t)NANOSEC * xxs; \
	/*CONSTCOND*/ \
} while (0)
97 
/* Debug level; zero by default */
int flowacct_debug = 0;

/*
 * Node of the singly linked list used to collect the usage records of
 * timed out flow items until they are written to the accounting file.
 */
typedef struct flow_records_s {
	flow_usage_t *fl_use;		/* usage record for one flow item */
	struct flow_records_s *next;	/* next collected record, or NULL */
}flow_records_t;
106 
107 /* Get port information from the packet. Ignore fragments. */
108 static void
109 flowacct_port_info(header_t *header, void *iph, int af, mblk_t *mp)
110 {
111 	uint16_t *up;
112 
113 	if (af == AF_INET) {
114 		ipha_t *ipha = (ipha_t *)iph;
115 		uint32_t u2, u1;
116 		uint_t iplen;
117 
118 		u2 = ntohs(ipha->ipha_fragment_offset_and_flags);
119 		u1 = u2 & (IPH_MF | IPH_OFFSET);
120 		if (u1 != 0) {
121 			return;
122 		}
123 		iplen = (ipha->ipha_version_and_hdr_length & 0xF) << 2;
124 		up = (uint16_t *)(mp->b_rptr + iplen);
125 		header->sport = (uint16_t)*up++;
126 		header->dport = (uint16_t)*up;
127 	} else {
128 		ip6_t *ip6h = (ip6_t *)iph;
129 		uint_t  length = IPV6_HDR_LEN;
130 		uint_t  ehdrlen;
131 		uint8_t *nexthdrp, *whereptr, *endptr;
132 		ip6_dest_t *desthdr;
133 		ip6_rthdr_t *rthdr;
134 		ip6_hbh_t *hbhhdr;
135 
136 		whereptr = ((uint8_t *)&ip6h[1]);
137 		endptr = mp->b_wptr;
138 		nexthdrp = &ip6h->ip6_nxt;
139 		while (whereptr < endptr) {
140 			switch (*nexthdrp) {
141 			case IPPROTO_HOPOPTS:
142 				hbhhdr = (ip6_hbh_t *)whereptr;
143 				ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
144 				if ((uchar_t *)hbhhdr +  ehdrlen > endptr)
145 					return;
146 				nexthdrp = &hbhhdr->ip6h_nxt;
147 				break;
148 			case IPPROTO_DSTOPTS:
149 				desthdr = (ip6_dest_t *)whereptr;
150 				ehdrlen = 8 * (desthdr->ip6d_len + 1);
151 				if ((uchar_t *)desthdr +  ehdrlen > endptr)
152 					return;
153 				nexthdrp = &desthdr->ip6d_nxt;
154 				break;
155 			case IPPROTO_ROUTING:
156 				rthdr = (ip6_rthdr_t *)whereptr;
157 				ehdrlen =  8 * (rthdr->ip6r_len + 1);
158 				if ((uchar_t *)rthdr +  ehdrlen > endptr)
159 					return;
160 				nexthdrp = &rthdr->ip6r_nxt;
161 				break;
162 			case IPPROTO_FRAGMENT:
163 				return;
164 			case IPPROTO_TCP:
165 			case IPPROTO_UDP:
166 			case IPPROTO_SCTP:
167 				/*
168 				 * Verify we have at least ICMP_MIN_TP_HDR_LEN
169 				 * bytes of the ULP's header to get the port
170 				 * info.
171 				 */
172 				if (((uchar_t *)ip6h + length +
173 				    ICMP_MIN_TP_HDR_LEN)  > endptr) {
174 					return;
175 				}
176 				/* Get the protocol & ports */
177 				header->proto = *nexthdrp;
178 				up = (uint16_t *)((uchar_t *)ip6h + length);
179 				header->sport = (uint16_t)*up++;
180 				header->dport = (uint16_t)*up;
181 				return;
182 			case IPPROTO_ICMPV6:
183 			case IPPROTO_ENCAP:
184 			case IPPROTO_IPV6:
185 			case IPPROTO_ESP:
186 			case IPPROTO_AH:
187 				header->proto = *nexthdrp;
188 				return;
189 			case IPPROTO_NONE:
190 			default:
191 				return;
192 			}
193 			length += ehdrlen;
194 			whereptr += ehdrlen;
195 		}
196 	}
197 }
198 
199 /*
200  * flowacct_find_ids(mp, header)
201  *
202  * attempt to discern the uid and projid of the originator of a packet by
203  * looking at the dblks making up the packet - yeuch!
204  *
205  * We do it by skipping any fragments with a credp of NULL (originated in
206  * kernel), taking the first value that isn't NULL to be the cred_t for the
207  * whole packet.
208  */
209 static void
210 flowacct_find_ids(mblk_t *mp, header_t *header)
211 {
212 	cred_t *cr;
213 
214 	cr = msg_getcred(mp, NULL);
215 	if (cr != NULL) {
216 		header->uid = crgetuid(cr);
217 		header->projid = crgetprojid(cr);
218 	} else {
219 		header->uid = (uid_t)-1;
220 		header->projid = -1;
221 	}
222 }
223 
224 /*
225  * Extract header information in a header_t structure so that we don't have
226  * have to parse the packet everytime.
227  */
228 static int
229 flowacct_extract_header(mblk_t *mp, header_t *header)
230 {
231 	ipha_t *ipha;
232 	ip6_t *ip6h;
233 #define	rptr	((uchar_t *)ipha)
234 
235 	/* 0 means no port extracted. */
236 	header->sport = 0;
237 	header->dport = 0;
238 	flowacct_find_ids(mp, header);
239 
240 	V6_SET_ZERO(header->saddr);
241 	V6_SET_ZERO(header->daddr);
242 
243 	ipha = (ipha_t *)mp->b_rptr;
244 	header->isv4 = IPH_HDR_VERSION(ipha) == IPV4_VERSION;
245 	if (header->isv4) {
246 		ipha = (ipha_t *)mp->b_rptr;
247 		V4_PART_OF_V6(header->saddr) = (int32_t)ipha->ipha_src;
248 		V4_PART_OF_V6(header->daddr) = (int32_t)ipha->ipha_dst;
249 		header->dsfield = ipha->ipha_type_of_service;
250 		header->proto = ipha->ipha_protocol;
251 		header->pktlen = ntohs(ipha->ipha_length);
252 		if ((header->proto == IPPROTO_TCP) ||
253 		    (header->proto == IPPROTO_UDP) ||
254 		    (header->proto == IPPROTO_SCTP)) {
255 			flowacct_port_info(header, ipha, AF_INET, mp);
256 		}
257 	} else {
258 		/*
259 		 * Need to pullup everything.
260 		 */
261 		if (mp->b_cont != NULL) {
262 			if (!pullupmsg(mp, -1)) {
263 				flowacct0dbg(("flowacct_extract_header: "\
264 				    "pullup error"));
265 				return (-1);
266 			}
267 		}
268 		ip6h = (ip6_t *)mp->b_rptr;
269 		bcopy(ip6h->ip6_src.s6_addr32, header->saddr.s6_addr32,
270 		    sizeof (ip6h->ip6_src.s6_addr32));
271 		bcopy(ip6h->ip6_dst.s6_addr32, header->daddr.s6_addr32,
272 		    sizeof (ip6h->ip6_dst.s6_addr32));
273 		header->dsfield = __IPV6_TCLASS_FROM_FLOW(ip6h->ip6_vcf);
274 		header->proto = ip6h->ip6_nxt;
275 		header->pktlen = ntohs(ip6h->ip6_plen) +
276 		    ip_hdr_length_v6(mp, ip6h);
277 		flowacct_port_info(header, ip6h, AF_INET6, mp);
278 
279 	}
280 #undef	rptr
281 	return (0);
282 }
283 
284 /* Check if the flow (identified by the 5-tuple) exists in the hash table */
285 static flow_t *
286 flowacct_flow_present(header_t *header, int index,
287     flowacct_data_t *flowacct_data)
288 {
289 	list_hdr_t *hdr = flowacct_data->flows_tbl[index].head;
290 	flow_t *flow;
291 
292 	while (hdr != NULL) {
293 		flow = (flow_t *)hdr->objp;
294 		if ((flow != NULL) &&
295 		    (IN6_ARE_ADDR_EQUAL(&flow->saddr, &header->saddr)) &&
296 		    (IN6_ARE_ADDR_EQUAL(&flow->daddr, &header->daddr)) &&
297 		    (flow->proto == header->proto) &&
298 		    (flow->sport == header->sport) &&
299 		    (flow->dport == header->dport)) {
300 			return (flow);
301 		}
302 		hdr = hdr->next;
303 	}
304 	return ((flow_t *)NULL);
305 }
306 
307 /*
308  * Add an object to the list at insert_point. This could be a flow item or
309  * a flow itself.
310  */
311 static list_hdr_t *
312 flowacct_add_obj(list_head_t *tophdr, list_hdr_t *insert_point, void *obj)
313 {
314 	list_hdr_t *new_hdr;
315 
316 	if (tophdr == NULL) {
317 		return ((list_hdr_t *)NULL);
318 	}
319 
320 	new_hdr = (list_hdr_t *)kmem_zalloc(FLOWACCT_HDR_SZ, KM_NOSLEEP);
321 	if (new_hdr == NULL) {
322 		flowacct0dbg(("flowacct_add_obj: error allocating mem"));
323 		return ((list_hdr_t *)NULL);
324 	}
325 	gethrestime(&new_hdr->last_seen);
326 	new_hdr->objp = obj;
327 	tophdr->nbr_items++;
328 
329 	if (insert_point == NULL) {
330 		if (tophdr->head == NULL) {
331 			tophdr->head = new_hdr;
332 			tophdr->tail = new_hdr;
333 			return (new_hdr);
334 		}
335 
336 		new_hdr->next = tophdr->head;
337 		tophdr->head->prev = new_hdr;
338 		tophdr->head = new_hdr;
339 		return (new_hdr);
340 	}
341 
342 	if (insert_point == tophdr->tail) {
343 		tophdr->tail->next = new_hdr;
344 		new_hdr->prev = tophdr->tail;
345 		tophdr->tail = new_hdr;
346 		return (new_hdr);
347 	}
348 
349 	new_hdr->next = insert_point->next;
350 	new_hdr->prev = insert_point;
351 	insert_point->next->prev = new_hdr;
352 	insert_point->next = new_hdr;
353 	return (new_hdr);
354 }
355 
356 /* Delete an obj from the list. This could be a flow item or the flow itself */
357 static void
358 flowacct_del_obj(list_head_t *tophdr, list_hdr_t *hdr, uint_t mode)
359 {
360 	size_t	length;
361 	uint_t	type;
362 
363 	if ((tophdr == NULL) || (hdr == NULL)) {
364 		return;
365 	}
366 
367 	type = ((flow_t *)hdr->objp)->type;
368 
369 	tophdr->nbr_items--;
370 
371 	if (hdr->next != NULL) {
372 		hdr->next->prev = hdr->prev;
373 	}
374 	if (hdr->prev != NULL) {
375 		hdr->prev->next = hdr->next;
376 	}
377 	if (tophdr->head == hdr) {
378 		tophdr->head = hdr->next;
379 	}
380 	if (tophdr->tail == hdr) {
381 		tophdr->tail = hdr->prev;
382 	}
383 
384 	if (mode == FLOWACCT_DEL_OBJ) {
385 		switch (type) {
386 		case FLOWACCT_FLOW:
387 			length = FLOWACCT_FLOW_SZ;
388 			break;
389 		case FLOWACCT_ITEM:
390 			length = FLOWACCT_ITEM_SZ;
391 			break;
392 		}
393 		kmem_free(hdr->objp, length);
394 		hdr->objp = NULL;
395 	}
396 
397 	kmem_free((void *)hdr, FLOWACCT_HDR_SZ);
398 }
399 
400 /*
401  * Checks if the given item (identified by dsfield, project id and uid)
402  * is already present for the flow.
403  */
404 static flow_item_t *
405 flowacct_item_present(flow_t *flow, uint8_t dsfield, pid_t proj_id, uint_t uid)
406 {
407 	list_hdr_t	*itemhdr;
408 	flow_item_t	*item;
409 
410 	itemhdr = flow->items.head;
411 
412 	while (itemhdr != NULL) {
413 		item = (flow_item_t *)itemhdr->objp;
414 
415 		if ((item->dsfield != dsfield) || (item->projid != proj_id) ||
416 		    (item->uid != uid)) {
417 			itemhdr = itemhdr->next;
418 			continue;
419 		}
420 		return (item);
421 	}
422 
423 	return ((flow_item_t *)NULL);
424 }
425 
426 /*
427  * Add the flow to the table, if not already present. If the flow is
428  * present in the table, add the item. Also, update the flow stats.
429  * Additionally, re-adjust the timout list as well.
430  */
431 static int
432 flowacct_update_flows_tbl(header_t *header, flowacct_data_t *flowacct_data)
433 {
434 	int index;
435 	list_head_t *fhead;
436 	list_head_t *thead;
437 	list_head_t *ihead;
438 	boolean_t added_flow = B_FALSE;
439 	timespec_t  now;
440 	flow_item_t *item;
441 	flow_t *flow;
442 
443 	index = FLOWACCT_FLOW_HASH(header);
444 	fhead = &flowacct_data->flows_tbl[index];
445 
446 	/* The timeout list */
447 	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
448 
449 	mutex_enter(&fhead->lock);
450 	flow = flowacct_flow_present(header, index, flowacct_data);
451 	if (flow == NULL) {
452 		flow = (flow_t *)kmem_zalloc(FLOWACCT_FLOW_SZ, KM_NOSLEEP);
453 		if (flow == NULL) {
454 			mutex_exit(&fhead->lock);
455 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
456 			    "error"));
457 			return (-1);
458 		}
459 		flow->hdr = flowacct_add_obj(fhead, fhead->tail, (void *)flow);
460 		if (flow->hdr == NULL) {
461 			mutex_exit(&fhead->lock);
462 			kmem_free(flow, FLOWACCT_FLOW_SZ);
463 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
464 			    "error"));
465 			return (-1);
466 		}
467 
468 		flow->type = FLOWACCT_FLOW;
469 		flow->isv4 = header->isv4;
470 		bcopy(header->saddr.s6_addr32, flow->saddr.s6_addr32,
471 		    sizeof (header->saddr.s6_addr32));
472 		bcopy(header->daddr.s6_addr32, flow->daddr.s6_addr32,
473 		    sizeof (header->daddr.s6_addr32));
474 		flow->proto = header->proto;
475 		flow->sport = header->sport;
476 		flow->dport = header->dport;
477 		flow->back_ptr = fhead;
478 		added_flow = B_TRUE;
479 	} else {
480 		/*
481 		 * We need to make sure that this 'flow' is not deleted
482 		 * either by a scheduled timeout or an explict call
483 		 * to flowacct_timer() below.
484 		 */
485 		flow->inuse = B_TRUE;
486 	}
487 
488 	ihead = &flow->items;
489 	item = flowacct_item_present(flow, header->dsfield, header->projid,
490 	    header->uid);
491 	if (item == NULL) {
492 		boolean_t just_once = B_TRUE;
493 		/*
494 		 * For all practical purposes, we limit the no. of entries in
495 		 * the flow table - i.e. the max_limt that a user specifies is
496 		 * the maximum no. of flow items in the table.
497 		 */
498 	try_again:
499 		atomic_add_32(&flowacct_data->nflows, 1);
500 		if (flowacct_data->nflows > flowacct_data->max_limit) {
501 			atomic_add_32(&flowacct_data->nflows, -1);
502 
503 			/* Try timing out once */
504 			if (just_once) {
505 				/*
506 				 * Need to release the lock, as this entry
507 				 * could contain a flow that can be timed
508 				 * out.
509 				 */
510 				mutex_exit(&fhead->lock);
511 				flowacct_timer(FLOWACCT_JUST_ONE,
512 				    flowacct_data);
513 				mutex_enter(&fhead->lock);
514 				/* Lets check again */
515 				just_once = B_FALSE;
516 				goto try_again;
517 			} else {
518 				flow->inuse = B_FALSE;
519 				/* Need to remove the flow, if one was added */
520 				if (added_flow) {
521 					flowacct_del_obj(fhead, flow->hdr,
522 					    FLOWACCT_DEL_OBJ);
523 				}
524 				mutex_exit(&fhead->lock);
525 				flowacct1dbg(("flowacct_update_flows_tbl: "\
526 				    "maximum active flows exceeded\n"));
527 				return (-1);
528 			}
529 		}
530 		item = (flow_item_t *)kmem_zalloc(FLOWACCT_ITEM_SZ, KM_NOSLEEP);
531 		if (item == NULL) {
532 			flow->inuse = B_FALSE;
533 			/* Need to remove the flow, if one was added */
534 			if (added_flow) {
535 				flowacct_del_obj(fhead, flow->hdr,
536 				    FLOWACCT_DEL_OBJ);
537 			}
538 			mutex_exit(&fhead->lock);
539 			atomic_add_32(&flowacct_data->nflows, -1);
540 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
541 			    "error"));
542 			return (-1);
543 		}
544 		item->hdr = flowacct_add_obj(ihead, ihead->tail, (void *)item);
545 		if (item->hdr == NULL) {
546 			flow->inuse = B_FALSE;
547 			/* Need to remove the flow, if one was added */
548 			if (added_flow) {
549 				flowacct_del_obj(fhead, flow->hdr,
550 				    FLOWACCT_DEL_OBJ);
551 			}
552 			mutex_exit(&fhead->lock);
553 			atomic_add_32(&flowacct_data->nflows, -1);
554 			kmem_free(item, FLOWACCT_ITEM_SZ);
555 			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
556 			    "error\n"));
557 			return (-1);
558 		}
559 		/* If a flow was added, add it too */
560 		if (added_flow) {
561 			atomic_add_64(&flowacct_data->usedmem,
562 			    FLOWACCT_FLOW_RECORD_SZ);
563 		}
564 		atomic_add_64(&flowacct_data->usedmem, FLOWACCT_ITEM_RECORD_SZ);
565 
566 		item->type = FLOWACCT_ITEM;
567 		item->dsfield = header->dsfield;
568 		item->projid = header->projid;
569 		item->uid = header->uid;
570 		item->npackets = 1;
571 		item->nbytes = header->pktlen;
572 		item->creation_time = item->hdr->last_seen;
573 	} else {
574 		item->npackets++;
575 		item->nbytes += header->pktlen;
576 	}
577 	gethrestime(&now);
578 	flow->hdr->last_seen = item->hdr->last_seen = now;
579 	mutex_exit(&fhead->lock);
580 
581 	/*
582 	 * Re-adjust the timeout list. The timer takes the thead lock
583 	 * follwed by fhead lock(s), so we release fhead, take thead
584 	 * and re-take fhead.
585 	 */
586 	mutex_enter(&thead->lock);
587 	mutex_enter(&fhead->lock);
588 	/* If the flow was added, append it to the tail of the timeout list */
589 	if (added_flow) {
590 		if (thead->head == NULL) {
591 			thead->head = flow->hdr;
592 			thead->tail = flow->hdr;
593 		} else {
594 			thead->tail->timeout_next = flow->hdr;
595 			flow->hdr->timeout_prev = thead->tail;
596 			thead->tail = flow->hdr;
597 		}
598 	/*
599 	 * Else, move this flow to the tail of the timeout list, if it is not
600 	 * already.
601 	 * flow->hdr in the timeout list :-
602 	 * timeout_next = NULL, timeout_prev != NULL, at the tail end.
603 	 * timeout_next != NULL, timeout_prev = NULL, at the head.
604 	 * timeout_next != NULL, timeout_prev != NULL, in the middle.
605 	 * timeout_next = NULL, timeout_prev = NULL, not in the timeout list,
606 	 * ignore such flow.
607 	 */
608 	} else if ((flow->hdr->timeout_next != NULL) ||
609 	    (flow->hdr->timeout_prev != NULL)) {
610 		if (flow->hdr != thead->tail) {
611 			if (flow->hdr == thead->head) {
612 				thead->head->timeout_next->timeout_prev = NULL;
613 				thead->head = thead->head->timeout_next;
614 				flow->hdr->timeout_next = NULL;
615 				thead->tail->timeout_next = flow->hdr;
616 				flow->hdr->timeout_prev = thead->tail;
617 				thead->tail = flow->hdr;
618 			} else {
619 				flow->hdr->timeout_prev->timeout_next =
620 				    flow->hdr->timeout_next;
621 				flow->hdr->timeout_next->timeout_prev =
622 				    flow->hdr->timeout_prev;
623 				flow->hdr->timeout_next = NULL;
624 				thead->tail->timeout_next = flow->hdr;
625 				flow->hdr->timeout_prev = thead->tail;
626 				thead->tail = flow->hdr;
627 			}
628 		}
629 	}
630 	/*
631 	 * Unset this variable, now it is fine even if this
632 	 * flow gets deleted (i.e. after timing out its
633 	 * flow items) since we are done using it.
634 	 */
635 	flow->inuse = B_FALSE;
636 	mutex_exit(&fhead->lock);
637 	mutex_exit(&thead->lock);
638 	atomic_add_64(&flowacct_data->tbytes, header->pktlen);
639 	return (0);
640 }
641 
642 /* Timer for timing out flows/items from the flow table */
643 void
644 flowacct_timeout_flows(void *args)
645 {
646 	flowacct_data_t *flowacct_data = (flowacct_data_t *)args;
647 	flowacct_timer(FLOWACCT_FLOW_TIMER, flowacct_data);
648 	flowacct_data->flow_tid = timeout(flowacct_timeout_flows, flowacct_data,
649 	    drv_usectohz(flowacct_data->timer));
650 }
651 
652 
653 /* Delete the item from the flow in the flow table */
654 static void
655 flowacct_timeout_item(flow_t **flow, list_hdr_t **item_hdr)
656 {
657 	list_hdr_t *next_it_hdr;
658 
659 	next_it_hdr = (*item_hdr)->next;
660 	flowacct_del_obj(&(*flow)->items, *item_hdr, FLOWACCT_DEL_OBJ);
661 	*item_hdr = next_it_hdr;
662 }
663 
664 /* Create a flow record for this timed out item */
665 static flow_records_t *
666 flowacct_create_record(flow_t *flow, list_hdr_t *ithdr)
667 {
668 	int count;
669 	flow_item_t *item = (flow_item_t *)ithdr->objp;
670 	flow_records_t *tmp_frec = NULL;
671 
672 	/* Record to be written into the accounting file */
673 	tmp_frec = kmem_zalloc(sizeof (flow_records_t), KM_NOSLEEP);
674 	if (tmp_frec == NULL) {
675 		flowacct0dbg(("flowacct_create_record: mem alloc error.\n"));
676 		return (NULL);
677 	}
678 	tmp_frec->fl_use = kmem_zalloc(sizeof (flow_usage_t), KM_NOSLEEP);
679 	if (tmp_frec->fl_use == NULL) {
680 		flowacct0dbg(("flowacct_create_record: mem alloc error\n"));
681 		kmem_free(tmp_frec, sizeof (flow_records_t));
682 		return (NULL);
683 	}
684 
685 	/* Copy the IP address */
686 	for (count = 0; count < 4; count++) {
687 		tmp_frec->fl_use->fu_saddr[count] =
688 		    htonl(flow->saddr.s6_addr32[count]);
689 		tmp_frec->fl_use->fu_daddr[count] =
690 		    htonl(flow->daddr.s6_addr32[count]);
691 	}
692 
693 	/*
694 	 * Ports, protocol, version, dsfield, project id, uid, nbytes, npackets
695 	 * creation time and last seen.
696 	 */
697 	tmp_frec->fl_use->fu_sport = htons(flow->sport);
698 	tmp_frec->fl_use->fu_dport = htons(flow->dport);
699 	tmp_frec->fl_use->fu_protocol = flow->proto;
700 	tmp_frec->fl_use->fu_isv4 = flow->isv4;
701 	tmp_frec->fl_use->fu_dsfield = item->dsfield;
702 	tmp_frec->fl_use->fu_projid = item->projid;
703 	tmp_frec->fl_use->fu_userid = item->uid;
704 	tmp_frec->fl_use->fu_nbytes = item->nbytes;
705 	tmp_frec->fl_use->fu_npackets = item->npackets;
706 	tmp_frec->fl_use->fu_lseen =
707 	    (uint64_t)(ulong_t)ithdr->last_seen.tv_sec;
708 	tmp_frec->fl_use->fu_ctime =
709 	    (uint64_t)(ulong_t)item->creation_time.tv_sec;
710 
711 	return (tmp_frec);
712 }
713 
714 /*
715  * Scan thru the timeout list and write the records to the accounting file, if
716  * possible. Basically step thru the timeout list maintained in the last
717  * hash bucket, FLOW_COUNT_TBL + 1, and timeout flows. This could be called
718  * from the timer, FLOWACCT_TIMER - delete only timed out flows or when this
719  * instance is deleted, FLOWACCT_PURGE_FLOW - delete all the flows from the
720  * table or as FLOWACCT_JUST_ONE - delete the first timed out flow. Since the
721  * flows are cronologically arranged in the timeout list,  when called as
722  * FLOWACCT_TIMER and FLOWACCT_JUST_ONE, we can stop when we come across
723  * the first flow that has not timed out (which means none of the following
724  * flows would have timed out).
725  */
726 void
727 flowacct_timer(int type, flowacct_data_t *flowacct_data)
728 {
729 	hrtime_t diff;
730 	timespec_t now;
731 	list_head_t *head, *thead;
732 	flow_t *flow;
733 	flow_item_t *item;
734 	list_hdr_t *fl_hdr, *next_fl_hdr;
735 	list_hdr_t *ithdr = (list_hdr_t *)NULL;
736 	flow_records_t *frec = NULL, *tmp_frec, *tail;
737 	uint64_t flow_size;
738 	uint64_t item_size;
739 
740 	ASSERT(flowacct_data != NULL);
741 
742 	/* 2s-complement for subtraction */
743 	flow_size = ~FLOWACCT_FLOW_RECORD_SZ + 1;
744 	item_size = ~FLOWACCT_ITEM_RECORD_SZ + 1;
745 
746 	/* Get the current time */
747 	gethrestime(&now);
748 
749 	/*
750 	 * For each flow in the table, scan thru all the items and delete
751 	 * those that have exceeded the timeout. If all the items in a
752 	 * flow have timed out, delete the flow entry as well. Finally,
753 	 * write all the delted items to the accounting file.
754 	 */
755 	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
756 
757 	mutex_enter(&thead->lock);
758 	fl_hdr = thead->head;
759 	while (fl_hdr != NULL) {
760 		uint32_t	items_deleted = 0;
761 
762 		next_fl_hdr = fl_hdr->timeout_next;
763 		flow = (flow_t *)fl_hdr->objp;
764 		head = flow->back_ptr;
765 		mutex_enter(&head->lock);
766 
767 		/*LINTED*/
768 		FLOWACCT_DELTA(now, fl_hdr->last_seen, diff);
769 
770 		/*
771 		 * If type is FLOW_TIMER, then check if the item has timed out.
772 		 * If type is FLOW_PURGE delete the entry anyways.
773 		 */
774 		if ((type != FLOWACCT_PURGE_FLOW) &&
775 		    (diff < flowacct_data->timeout)) {
776 			mutex_exit(&head->lock);
777 			mutex_exit(&thead->lock);
778 			goto write_records;
779 		}
780 
781 		ithdr = flow->items.head;
782 		while (ithdr != NULL) {
783 			item = (flow_item_t *)ithdr->objp;
784 			/*
785 			 * Fill in the flow record to be
786 			 * written to the accounting file.
787 			 */
788 			tmp_frec = flowacct_create_record(flow, ithdr);
789 			/*
790 			 * If we don't have memory for records,
791 			 * we will come back in case this is
792 			 * called as FLOW_TIMER, else we will
793 			 * go ahead and delete the item from
794 			 * the table (when asked to PURGE the
795 			 * table), so there could be some
796 			 * entries not written to the file
797 			 * when this action instance is
798 			 * deleted.
799 			 */
800 			if (tmp_frec != NULL) {
801 				tmp_frec->fl_use->fu_aname =
802 				    flowacct_data->act_name;
803 				if (frec == NULL) {
804 					frec = tmp_frec;
805 					tail = frec;
806 				} else {
807 					tail->next = tmp_frec;
808 					tail = tmp_frec;
809 				}
810 			} else if (type != FLOWACCT_PURGE_FLOW) {
811 				mutex_exit(&head->lock);
812 				mutex_exit(&thead->lock);
813 				atomic_add_32(&flowacct_data->nflows,
814 				    (~items_deleted + 1));
815 				goto write_records;
816 			}
817 
818 			/* Update stats */
819 			atomic_add_64(&flowacct_data->tbytes, (~item->nbytes +
820 			    1));
821 
822 			/* Delete the item */
823 			flowacct_timeout_item(&flow, &ithdr);
824 			items_deleted++;
825 			atomic_add_64(&flowacct_data->usedmem, item_size);
826 		}
827 		ASSERT(flow->items.nbr_items == 0);
828 		atomic_add_32(&flowacct_data->nflows, (~items_deleted + 1));
829 
830 		/*
831 		 * Don't delete this flow if we are making place for
832 		 * a new item for this flow.
833 		 */
834 		if (!flow->inuse) {
835 			if (fl_hdr->timeout_prev != NULL) {
836 				fl_hdr->timeout_prev->timeout_next =
837 				    fl_hdr->timeout_next;
838 			} else {
839 				thead->head = fl_hdr->timeout_next;
840 			}
841 			if (fl_hdr->timeout_next != NULL) {
842 				fl_hdr->timeout_next->timeout_prev =
843 				    fl_hdr->timeout_prev;
844 			} else {
845 				thead->tail = fl_hdr->timeout_prev;
846 			}
847 			fl_hdr->timeout_prev = NULL;
848 			fl_hdr->timeout_next = NULL;
849 			flowacct_del_obj(head, fl_hdr, FLOWACCT_DEL_OBJ);
850 			atomic_add_64(&flowacct_data->usedmem, flow_size);
851 		}
852 		mutex_exit(&head->lock);
853 		if (type == FLOWACCT_JUST_ONE) {
854 			mutex_exit(&thead->lock);
855 			goto write_records;
856 		}
857 		fl_hdr = next_fl_hdr;
858 	}
859 	mutex_exit(&thead->lock);
860 write_records:
861 	/* Write all the timed out flows to the accounting file */
862 	while (frec != NULL) {
863 		tmp_frec = frec->next;
864 		exacct_commit_flow(frec->fl_use);
865 		kmem_free(frec->fl_use, sizeof (flow_usage_t));
866 		kmem_free(frec, sizeof (flow_records_t));
867 		frec = tmp_frec;
868 	}
869 }
870 
871 /*
872  * Get the IP header contents from the packet, update the flow table with
873  * this item and return.
874  */
875 int
876 flowacct_process(mblk_t **mpp, flowacct_data_t *flowacct_data)
877 {
878 	header_t *header;
879 	mblk_t *mp = *mpp;
880 
881 	ASSERT(mp != NULL);
882 
883 	/* If we don't find an M_DATA, return error */
884 	if (mp->b_datap->db_type != M_DATA) {
885 		if ((mp->b_cont != NULL) &&
886 		    (mp->b_cont->b_datap->db_type == M_DATA)) {
887 			mp = mp->b_cont;
888 		} else {
889 			flowacct0dbg(("flowacct_process: no data\n"));
890 			atomic_add_64(&flowacct_data->epackets, 1);
891 			return (EINVAL);
892 		}
893 	}
894 
895 	header = kmem_zalloc(FLOWACCT_HEADER_SZ, KM_NOSLEEP);
896 	if (header == NULL) {
897 		flowacct0dbg(("flowacct_process: error allocing mem"));
898 		atomic_add_64(&flowacct_data->epackets, 1);
899 		return (ENOMEM);
900 	}
901 
902 	/* Get all the required information into header. */
903 	if (flowacct_extract_header(mp, header) != 0) {
904 		kmem_free(header, FLOWACCT_HEADER_SZ);
905 		atomic_add_64(&flowacct_data->epackets, 1);
906 		return (EINVAL);
907 	}
908 
909 	/* Updated the flow table with this entry */
910 	if (flowacct_update_flows_tbl(header, flowacct_data) != 0) {
911 		kmem_free(header, FLOWACCT_HEADER_SZ);
912 		atomic_add_64(&flowacct_data->epackets, 1);
913 		return (ENOMEM);
914 	}
915 
916 	/* Update global stats */
917 	atomic_add_64(&flowacct_data->npackets, 1);
918 	atomic_add_64(&flowacct_data->nbytes, header->pktlen);
919 
920 	kmem_free(header, FLOWACCT_HEADER_SZ);
921 	if (flowacct_data->flow_tid == 0) {
922 		flowacct_data->flow_tid = timeout(flowacct_timeout_flows,
923 		    flowacct_data, drv_usectohz(flowacct_data->timer));
924 	}
925 	return (0);
926 }
927