/*
 * Copyright 2002 Damien Miller <djm@mindrot.org> All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is a software implementation of Cisco's NetFlow(tm) traffic
 * reporting system. It operates by listening (via libpcap) on a
 * promiscuous interface and tracking traffic flows.
 *
 * Traffic flows are recorded by source/destination/protocol
 * IP address or, in the case of TCP and UDP, by
 * src_addr:src_port/dest_addr:dest_port/protocol
 *
 * Flows expire automatically after a period of inactivity (default: 1
 * hour). They may also be evicted (in order of age) in situations where
 * there are more flows than slots available.
 *
 * Netflow compatible packets are sent to a specified target host upon
 * flow expiry.
 *
 * As this implementation watches traffic promiscuously, it is likely to
 * place significant load on hosts or gateways on which it is installed.
 */

#include "common.h"
#include "sys-tree.h"
#include "convtime.h"
#include "softflowd.h"
#include "treetype.h"
#include "freelist.h"
#include "log.h"
#include <pcap.h>

/* Global variables */
/* When non-zero, the functions below emit additional LOG_DEBUG/LOG_INFO output */
static int verbose_flag = 0;		/* Debugging flag */
/* Passed unchanged to the NetFlow send function by check_expired() */
static u_int16_t if_index = 0;		/* "manual" interface index */

/* Signal handler flags */
/*
 * Set to the signal number by sighand_graceful_shutdown(), and to 1 by the
 * "shutdown" control-socket command.
 */
static volatile sig_atomic_t graceful_shutdown_request = 0;	

/*
 * Context for libpcap callback functions.
 * A pointer to this structure arrives in flow_cb() as the opaque
 * user_data argument.
 */
struct CB_CTXT {
	struct FLOWTRACK *ft;	/* Flow tracker that receives the packets */
	int linktype;		/* Datalink type handed to datalink_check() */
	int fatal;		/* Set to 1 by flow_cb() on PP_MALLOC_FAIL */
	int want_v6;		/* If zero, IPv6 frames are counted as non-IP */
};

/*
 * Describes a datalink header and how to extract v4/v6 frames from it.
 * Entries of this type are initialised positionally in lt[], so the field
 * order here must not change. datalink_check() assembles ft_len bytes
 * starting at ft_off into an integer, applies ft_mask and compares the
 * result against ft_v4 / ft_v6.
 */
struct DATALINK {
	int dlt;		/* BPF datalink type */
	int skiplen;		/* Number of bytes to skip datalink header */
	int ft_off;		/* Datalink frametype offset */
	int ft_len;		/* Datalink frametype length */
	int ft_is_be;		/* Set if frametype is big-endian */
	u_int32_t ft_mask;	/* Mask applied to frametype */
	u_int32_t ft_v4;	/* IPv4 frametype */
	u_int32_t ft_v6;	/* IPv6 frametype */
};

/*
 * Datalink types that we know about.
 * Columns: dlt, skiplen, ft_off, ft_len, ft_is_be, ft_mask, ft_v4, ft_v6.
 * The final entry (dlt == -1) is the sentinel that terminates the search
 * in datalink_check().
 */
static const struct DATALINK lt[] = {
	{ DLT_EN10MB,	14, 12,  2,  1, 0xffffffff,  0x0800,   0x86dd },
	{ DLT_PPP,	 5,  3,  2,  1, 0xffffffff,  0x0021,   0x0057 },
#ifdef DLT_LINUX_SLL
	{ DLT_LINUX_SLL,16, 14,  2,  1, 0xffffffff,  0x0800,   0x86dd },
#endif
	/* No datalink header: the masked high nibble of byte 0 selects v4/v6 */
	{ DLT_RAW,	 0,  0,  1,  1, 0x000000f0,  0x0040,   0x0060 },
	{ DLT_NULL,	 4,  0,  4,  0, 0xffffffff, AF_INET, AF_INET6 },
#ifdef DLT_LOOP
	{ DLT_LOOP,	 4,  0,  4,  1, 0xffffffff, AF_INET, AF_INET6 },
#endif
	{ -1,		-1, -1, -1, -1, 0x00000000,  0xffff,   0xffff },
};

/*
 * Netflow send functions.
 * check_expired() calls a sender with, in order: the array of expired
 * flows, the number of flows, the target socket descriptor, if_index,
 * a pointer to ft->flows_exported, a pointer to ft->system_boot_time,
 * verbose_flag and a pointer to ft->option. A positive return value is
 * added to ft->packets_sent; -1 is propagated by check_expired() as an
 * error.
 */
typedef int (netflow_send_func_t)(struct FLOW **, int, int, u_int16_t,
	u_int64_t *, struct timeval *, int, struct OPTION *);
struct NETFLOW_SENDER {
	int version;			/* NetFlow version number (see nf[]) */
	netflow_send_func_t *func;	/* Export function for this version */
	int v6_capable;			/* Presumably: non-zero if IPv6 flows can be exported - confirm at use site */
};

/*
 * Array of NetFlow export functions that we know of. NB. nf[0] is default.
 * Columns: version, send function, v6_capable. The list ends with the
 * { -1, NULL, 0 } entry.
 */
static const struct NETFLOW_SENDER nf[] = {
	{ 5, send_netflow_v5, 0 },
	{ 1, send_netflow_v1, 0 },
	{ 9, send_netflow_v9, 1 },
	{ -1, NULL, 0 },
};

/* Describes a location where we send NetFlow packets to */
struct NETFLOW_TARGET {
	int fd;		/* Export socket; check_expired() skips export when -1 */
	const struct NETFLOW_SENDER *dialect;	/* Sender used for this target */
};

/* Signal handlers */
/*
 * Signal handler: remember which signal asked for a graceful shutdown.
 * The only action taken is a store to a volatile sig_atomic_t flag.
 */
static void
sighand_graceful_shutdown(int signum)
{
	graceful_shutdown_request = signum;
}

/*
 * Signal handler for signals we did not expect: log a warning and
 * terminate the process immediately with exit status 0.
 */
static void
sighand_other(int signum)
{
	/* XXX: calling logit() from a signal handler may not be completely safe */
	logit(LOG_WARNING,
	    "Exiting immediately on unexpected signal %d", signum);

	_exit(0);
}

/*
 * Ordering function for the flow tree.
 *
 * Flows are ordered by address family, then by both address slots, then
 * protocol, then both port slots (ports compared in host byte order).
 * Returns -1, 0 or 1.
 */
static int
flow_compare(struct FLOW *a, struct FLOW *b)
{
	int diff, slot;

	/* Explicit ?: results avoid signed vs unsigned subtraction issues */
	if (a->af != b->af)
		return (a->af > b->af ? 1 : -1);

	for (slot = 0; slot < 2; slot++) {
		diff = memcmp(&a->addr[slot], &b->addr[slot],
		    sizeof(a->addr[slot]));
		if (diff != 0)
			return (diff > 0 ? 1 : -1);
	}

#ifdef notyet
	for (slot = 0; slot < 2; slot++) {
		if (a->ip6_flowlabel[slot] != 0 &&
		    b->ip6_flowlabel[slot] != 0 &&
		    a->ip6_flowlabel[slot] != b->ip6_flowlabel[slot])
			return (a->ip6_flowlabel[slot] >
			    b->ip6_flowlabel[slot] ? 1 : -1);
	}
#endif

	if (a->protocol != b->protocol)
		return (a->protocol > b->protocol ? 1 : -1);

	for (slot = 0; slot < 2; slot++) {
		if (a->port[slot] != b->port[slot])
			return (ntohs(a->port[slot]) >
			    ntohs(b->port[slot]) ? 1 : -1);
	}

	return (0);
}

/*
 * Generate functions for flow tree.
 * This instantiates the FLOWS tree of struct FLOW ordered by
 * flow_compare(); the FLOW_FIND/FLOW_INSERT/FLOW_REMOVE/FLOW_MIN/FLOW_NEXT
 * operations used in this file act on it. "trp" is presumably the name of
 * the tree linkage member inside struct FLOW - confirm against treetype.h.
 */
FLOW_PROTOTYPE(FLOWS, FLOW, trp, flow_compare);
FLOW_GENERATE(FLOWS, FLOW, trp, flow_compare);

/*
 * Ordering function for the expiry tree.
 *
 * Events sort by expiry time first; ties are broken by the sequence
 * number of the owning flow so that every event has a unique key.
 * Returns -1, 0 or 1.
 */
static int
expiry_compare(struct EXPIRY *a, struct EXPIRY *b)
{
	if (a->expires_at > b->expires_at)
		return (1);
	if (a->expires_at < b->expires_at)
		return (-1);

	/* Same expiry time: fall back to the flow sequence number */
	if (a->flow->flow_seq > b->flow->flow_seq)
		return (1);
	if (a->flow->flow_seq < b->flow->flow_seq)
		return (-1);

	return (0);
}

/*
 * Generate functions for expiry tree.
 * This instantiates the EXPIRIES tree of struct EXPIRY ordered by
 * expiry_compare(); the EXPIRY_INSERT/EXPIRY_REMOVE/EXPIRY_MIN/EXPIRY_NEXT/
 * EXPIRY_FOREACH operations used in this file act on it. "trp" is
 * presumably the name of the tree linkage member inside struct EXPIRY -
 * confirm against treetype.h.
 */
EXPIRY_PROTOTYPE(EXPIRIES, EXPIRY, trp, expiry_compare);
EXPIRY_GENERATE(EXPIRIES, EXPIRY, trp, expiry_compare);

/*
 * Obtain a flow record from the tracker's flow freelist.
 * Callers treat a NULL result as an allocation failure.
 */
static struct FLOW *
flow_get(struct FLOWTRACK *ft)
{
	struct FLOW *fresh;

	fresh = freelist_get(&ft->flow_freelist);
	return (fresh);
}

/*
 * Hand a flow record back to the tracker's flow freelist.
 * The caller must have removed the flow from the flow tree first.
 */
static void
flow_put(struct FLOWTRACK *ft, struct FLOW *flow)
{
	/* No "return expr;" here: that is not permitted in a void function */
	freelist_put(&ft->flow_freelist, flow);
}

/*
 * Obtain an expiry event from the tracker's expiry freelist.
 * Callers treat a NULL result as an allocation failure.
 */
static struct EXPIRY *
expiry_get(struct FLOWTRACK *ft)
{
	struct EXPIRY *fresh;

	fresh = freelist_get(&ft->expiry_freelist);
	return (fresh);
}

/*
 * Hand an expiry event back to the tracker's expiry freelist.
 * The caller must have removed the event from the expiry tree first.
 */
static void
expiry_put(struct FLOWTRACK *ft, struct EXPIRY *expiry)
{
	/* No "return expr;" here: that is not permitted in a void function */
	freelist_put(&ft->expiry_freelist, expiry);
}

#if 0
/*
 * Dump a packet as hex to the log (debugging aid, compiled out).
 * Bytes are printed in pairs separated by a space; output that does not
 * fit in the local buffer is cut short and marked with "...".
 */
static void
dump_packet(const u_int8_t *p, int len)
{
	/* tmp must hold two hex digits, an optional space and the NUL */
	char buf[1024], tmp[4];
	int i;

	for (*buf = '\0', i = 0; i < len; i++) {
		snprintf(tmp, sizeof(tmp), "%02x%s", p[i], i % 2 ? " " : "");
		/* Keep 4 bytes in reserve so the "..." marker always fits */
		if (strlcat(buf, tmp, sizeof(buf) - 4) >= sizeof(buf) - 4) {
			strlcat(buf, "...", sizeof(buf));
			break;
		}
	}
	logit(LOG_INFO, "packet len %d: %s", len, buf);
}
#endif

/*
 * Format a time in an ISOish format (UTC, "YYYY-MM-DDTHH:MM:SS").
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call, so the result must be copied if it is needed for longer. If the
 * time cannot be broken down or formatted, the raw seconds value is
 * returned as a decimal string instead.
 */
static const char *
format_time(time_t t)
{
	static char buf[32];
	const struct tm *tm;

	tm = gmtime(&t);
	/* gmtime() may return NULL; strftime() returns 0 if nothing fits */
	if (tm == NULL ||
	    strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", tm) == 0)
		snprintf(buf, sizeof(buf), "%lld", (long long)t);

	return (buf);
}

/*
 * Format a flow in a verbose and ugly way.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call. Times are printed in UTC with millisecond resolution (the
 * microsecond part is truncated, so the fraction never reaches 1000).
 */
static const char *
format_flow(struct FLOW *flow)
{
	char addr1[64], addr2[64], stime[32], ftime[32];
	static char buf[1024];

	/* inet_ntop() leaves the buffer untouched on failure */
	if (inet_ntop(flow->af, &flow->addr[0], addr1, sizeof(addr1)) == NULL)
		snprintf(addr1, sizeof(addr1), "?");
	if (inet_ntop(flow->af, &flow->addr[1], addr2, sizeof(addr2)) == NULL)
		snprintf(addr2, sizeof(addr2), "?");

	/* format_time() reuses one static buffer, so copy each result */
	snprintf(stime, sizeof(stime), "%s", 
	    format_time(flow->flow_start.tv_sec));
	snprintf(ftime, sizeof(ftime), "%s", 
	    format_time(flow->flow_last.tv_sec));

	snprintf(buf, sizeof(buf),  "seq:%"PRIu64" [%s]:%hu <> [%s]:%hu proto:%u "
	    "octets>:%u packets>:%u octets<:%u packets<:%u "
	    "start:%s.%03ld finish:%s.%03ld tcp>:%02x tcp<:%02x "
	    "flowlabel>:%08x flowlabel<:%08x ",
	    flow->flow_seq,
	    addr1, ntohs(flow->port[0]), addr2, ntohs(flow->port[1]),
	    (unsigned int)flow->protocol, 
	    flow->octets[0], flow->packets[0], 
	    flow->octets[1], flow->packets[1], 
	    stime, (long)(flow->flow_start.tv_usec / 1000), 
	    ftime, (long)(flow->flow_last.tv_usec / 1000),
	    flow->tcp_flags[0], flow->tcp_flags[1],
	    flow->ip6_flowlabel[0], flow->ip6_flowlabel[1]);

	return (buf);
}

/*
 * Format a flow in a brief way: sequence number, both endpoints and the
 * protocol number.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.
 */
static const char *
format_flow_brief(struct FLOW *flow)
{
	char addr1[64], addr2[64];
	static char buf[1024];

	/* inet_ntop() leaves the buffer untouched on failure */
	if (inet_ntop(flow->af, &flow->addr[0], addr1, sizeof(addr1)) == NULL)
		snprintf(addr1, sizeof(addr1), "?");
	if (inet_ntop(flow->af, &flow->addr[1], addr2, sizeof(addr2)) == NULL)
		snprintf(addr2, sizeof(addr2), "?");

	snprintf(buf, sizeof(buf), 
	    "seq:%"PRIu64" [%s]:%hu <> [%s]:%hu proto:%u",
	    flow->flow_seq,
	    addr1, ntohs(flow->port[0]), addr2, ntohs(flow->port[1]),
	    (unsigned int)flow->protocol);

	return (buf);
}

/*
 * Fill in transport-layer (tcp/udp/icmp) portions of flow record.
 *
 * pkt/caplen describe the captured bytes starting at the transport
 * header. ndx is the array slot of the packet's source; the destination
 * goes into slot ndx ^ 1.
 *
 * Returns 0 on success (including protocols that carry no port
 * information) and 1 if the header is truncated and the packet is not a
 * fragment. Truncated fragments return 0 with the ports left untouched.
 */
static int
transport_to_flowrec(struct FLOW *flow, const u_int8_t *pkt, 
    const size_t caplen, int isfrag, int protocol, int ndx)
{
	const struct tcphdr *tcp = (const struct tcphdr *)pkt;
	const struct udphdr *udp = (const struct udphdr *)pkt;

	/*
	 * XXX to keep flow in proper canonical format, it may be necessary to
	 * swap the array slots based on the order of the port numbers does
	 * this matter in practice??? I don't think so - return flows will
	 * always match, because of their symmetrical addr/ports
	 */

	switch (protocol) {
	case IPPROTO_TCP:
		/* Check for runt packet, but don't error out on short frags */
		if (caplen < sizeof(*tcp))
			return (isfrag ? 0 : 1);
		flow->port[ndx] = tcp->th_sport;
		flow->port[ndx ^ 1] = tcp->th_dport;
		flow->tcp_flags[ndx] |= tcp->th_flags;
		break;
	case IPPROTO_UDP:
		/* Check for runt packet, but don't error out on short frags */
		if (caplen < sizeof(*udp))
			return (isfrag ? 0 : 1);
		flow->port[ndx] = udp->uh_sport;
		flow->port[ndx ^ 1] = udp->uh_dport;
		break;
	case IPPROTO_ICMP:
		/*
		 * The ICMP type and code are the first two bytes of the
		 * header; make sure both were captured before reading them.
		 */
		if (caplen < 2)
			return (isfrag ? 0 : 1);
		/*
		 * Encode ICMP type * 256 + code into dest port like
		 * Cisco routers
		 */
		flow->port[ndx] = 0;
		flow->port[ndx ^ 1] = htons(pkt[0] * 256 + pkt[1]);
		break;
	}
	return (0);
}

/*
 * Convert an IPv4 packet to a partial flow record (used for comparison).
 *
 * pkt/caplen describe the captured bytes starting at the IP header; len
 * is the on-the-wire length that is accounted to the flow. *isfrag is set
 * to 1 if the packet is any part of a fragmented datagram.
 *
 * Returns -1 for packets that are too short, carry an invalid header
 * length or are not IP version 4; otherwise the result of
 * transport_to_flowrec() (0 for non-first fragments).
 */
static int
ipv4_to_flowrec(struct FLOW *flow, const u_int8_t *pkt, size_t caplen, 
    size_t len, int *isfrag, int af)
{
	const struct ip *ip = (const struct ip *)pkt;
	size_t hlen;
	int ndx;

	if (caplen < 20)
		return (-1);	/* Runt packet */
	hlen = ip->ip_hl * 4;
	/* A header length below 20 bytes (IHL < 5) is never valid */
	if (hlen < 20 || caplen < hlen)
		return (-1);	/* Bad header length or runt packet */
	if (ip->ip_v != 4)
		return (-1);	/* Unsupported IP version */
	
	/* Prepare to store flow in canonical format */
	ndx = memcmp(&ip->ip_src, &ip->ip_dst, sizeof(ip->ip_src)) > 0 ? 1 : 0;
	
	flow->af = af;
	flow->addr[ndx].v4 = ip->ip_src;
	flow->addr[ndx ^ 1].v4 = ip->ip_dst;
	flow->protocol = ip->ip_p;
	flow->octets[ndx] = len;
	flow->packets[ndx] = 1;

	*isfrag = (ntohs(ip->ip_off) & (IP_OFFMASK|IP_MF)) ? 1 : 0;

	/* Don't try to examine higher level headers if not first fragment */
	if (*isfrag && (ntohs(ip->ip_off) & IP_OFFMASK) != 0)
		return (0);

	return (transport_to_flowrec(flow, pkt + hlen, caplen - hlen,
	    *isfrag, ip->ip_p, ndx));
}

/*
 * Convert an IPv6 packet to a partial flow record (used for comparison).
 *
 * pkt/caplen describe the captured bytes starting at the IPv6 header; len
 * is the length accounted to the flow. Hop-by-hop, routing, destination
 * option and fragment headers are skipped to reach the transport header.
 * *isfrag is set to 1 if a fragment header was seen.
 *
 * Returns -1 for a truncated or non-IPv6 fixed header, 1 if an extension
 * header is truncated, 0 for non-first fragments, otherwise the result of
 * transport_to_flowrec().
 */
static int
ipv6_to_flowrec(struct FLOW *flow, const u_int8_t *pkt, size_t caplen, 
    size_t len, int *isfrag, int af)
{
	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)pkt;
	const struct ip6_ext *ext;
	const struct ip6_frag *fraghdr;
	size_t ext_len;
	int ndx, nxt, at_transport;

	if (caplen < sizeof(*ip6))
		return (-1);	/* Runt packet */

	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
		return (-1);	/* Unsupported IPv6 version */

	/* Canonical format: the numerically smaller address goes in slot 0 */
	ndx = 0;
	if (memcmp(&ip6->ip6_src, &ip6->ip6_dst, sizeof(ip6->ip6_src)) > 0)
		ndx = 1;

	flow->af = af;
	flow->ip6_flowlabel[ndx] = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
	flow->addr[ndx].v6 = ip6->ip6_src;
	flow->addr[ndx ^ 1].v6 = ip6->ip6_dst;
	flow->octets[ndx] = len;
	flow->packets[ndx] = 1;

	*isfrag = 0;
	nxt = ip6->ip6_nxt;
	pkt += sizeof(*ip6);
	caplen -= sizeof(*ip6);

	/* Walk the extension header chain until a transport header is met */
	for (at_transport = 0; !at_transport; ) {
		switch (nxt) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			ext = (const struct ip6_ext *)pkt;
			if (caplen < sizeof(*ext))
				return (1); /* Runt */
			/* Length field counts 8-octet units beyond the first */
			ext_len = (ext->ip6e_len + 1) << 3;
			if (caplen < ext_len)
				return (1); /* Runt */
			nxt = ext->ip6e_nxt;
			pkt += ext_len;
			caplen -= ext_len;
			break;
		case IPPROTO_FRAGMENT:
			*isfrag = 1;
			fraghdr = (const struct ip6_frag *)pkt;
			if (caplen < sizeof(*fraghdr))
				return (1); /* Runt */
			/*
			 * Only the first fragment carries the higher
			 * level headers
			 */
			if ((fraghdr->ip6f_offlg & IP6F_OFF_MASK) != 0)
				return (0);
			nxt = fraghdr->ip6f_nxt;
			pkt += sizeof(*fraghdr);
			caplen -= sizeof(*fraghdr);
			break;
		default:
			at_transport = 1;
			break;
		}
	}
	flow->protocol = nxt;

	return (transport_to_flowrec(flow, pkt, caplen, *isfrag, nxt, ndx));
}

static void
flow_update_expiry(struct FLOWTRACK *ft, struct FLOW *flow)
{
	EXPIRY_REMOVE(EXPIRIES, &ft->expiries, flow->expiry);

	/* Flows over 2 GiB traffic */
	if (flow->octets[0] > (1U << 31) || flow->octets[1] > (1U << 31)) {
		flow->expiry->expires_at = 0;
		flow->expiry->reason = R_OVERBYTES;
		goto out;
	}
	
	/* Flows over maximum life seconds */
	if (ft->maximum_lifetime != 0 && 
	    flow->flow_last.tv_sec - flow->flow_start.tv_sec > 
	    ft->maximum_lifetime) {
		flow->expiry->expires_at = 0;
		flow->expiry->reason = R_MAXLIFE;
		goto out;
	}
	
	if (flow->protocol == IPPROTO_TCP) {
		/* Reset TCP flows */
		if (ft->tcp_rst_timeout != 0 &&
		    ((flow->tcp_flags[0] & TH_RST) ||
		    (flow->tcp_flags[1] & TH_RST))) {
			flow->expiry->expires_at = flow->flow_last.tv_sec + 
			    ft->tcp_rst_timeout;
			flow->expiry->reason = R_TCP_RST;
			goto out;
		}
		/* Finished TCP flows */
		if (ft->tcp_fin_timeout != 0 &&
		    ((flow->tcp_flags[0] & TH_FIN) &&
		    (flow->tcp_flags[1] & TH_FIN))) {
			flow->expiry->expires_at = flow->flow_last.tv_sec + 
			    ft->tcp_fin_timeout;
			flow->expiry->reason = R_TCP_FIN;
			goto out;
		}

		/* TCP flows */
		if (ft->tcp_timeout != 0) {
			flow->expiry->expires_at = flow->flow_last.tv_sec + 
			    ft->tcp_timeout;
			flow->expiry->reason = R_TCP;
			goto out;
		}
	}

	if (ft->udp_timeout != 0 && flow->protocol == IPPROTO_UDP) {
		/* UDP flows */
		flow->expiry->expires_at = flow->flow_last.tv_sec + 
		    ft->udp_timeout;
		flow->expiry->reason = R_UDP;
		goto out;
	}

	if (ft->icmp_timeout != 0 &&
	    ((flow->af == AF_INET && flow->protocol == IPPROTO_ICMP) || 
	    ((flow->af == AF_INET6 && flow->protocol == IPPROTO_ICMPV6)))) {
		/* ICMP flows */
		flow->expiry->expires_at = flow->flow_last.tv_sec + 
		    ft->icmp_timeout;
		flow->expiry->reason = R_ICMP;
		goto out;
	}

	/* Everything else */
	flow->expiry->expires_at = flow->flow_last.tv_sec + 
	    ft->general_timeout;
	flow->expiry->reason = R_GENERAL;

 out:
	if (ft->maximum_lifetime != 0 && flow->expiry->expires_at != 0) {
		flow->expiry->expires_at = MIN(flow->expiry->expires_at,
		    flow->flow_start.tv_sec + ft->maximum_lifetime);
	}

	EXPIRY_INSERT(EXPIRIES, &ft->expiries, flow->expiry);
}


/*
 * Return values from process_packet:
 *   PP_OK          - packet was accounted to a new or existing flow
 *   PP_BAD_PACKET  - unsupported address family or unparseable IP header
 *   PP_MALLOC_FAIL - no flow or expiry record could be obtained; flow_cb()
 *                    marks the capture context as fatal on this value
 */
#define PP_OK		0
#define PP_BAD_PACKET	-2
#define PP_MALLOC_FAIL	-3

/*
 * Main per-packet processing function. Take a packet (provided by 
 * libpcap) and attempt to find a matching flow. If no such flow exists, 
 * then create one. 
 *
 * Also marks flows for fast expiry, based on flow or packet attributes
 * (the actual expiry is performed elsewhere)
 */
static int
process_packet(struct FLOWTRACK *ft, const u_int8_t *pkt, int af,
    const u_int32_t caplen, const u_int32_t len, 
    const struct timeval *received_time)
{
	struct FLOW tmp, *flow;
	int frag;

	ft->total_packets++;

	/* Convert the IP packet to a flow identity */
	memset(&tmp, 0, sizeof(tmp));
	switch (af) {
	case AF_INET:
		if (ipv4_to_flowrec(&tmp, pkt, caplen, len, &frag, af) == -1)
			goto bad;
		break;
	case AF_INET6:
		if (ipv6_to_flowrec(&tmp, pkt, caplen, len, &frag, af) == -1)
			goto bad;
		break;
	default:
 bad: 
		ft->bad_packets++;
		return (PP_BAD_PACKET);
	}

	if (frag)
		ft->frag_packets++;

	/* Zero out bits of the flow that aren't relevant to tracking level */
	switch (ft->track_level) {
	case TRACK_IP_ONLY:
		tmp.protocol = 0;
		/* FALLTHROUGH */
	case TRACK_IP_PROTO:
		tmp.port[0] = tmp.port[1] = 0;
		tmp.tcp_flags[0] = tmp.tcp_flags[1] = 0;
		/* FALLTHROUGH */
	case TRACK_FULL:
		break;
	}

	/* If a matching flow does not exist, create and insert one */
	if ((flow = FLOW_FIND(FLOWS, &ft->flows, &tmp)) == NULL) {
		/* Allocate and fill in the flow */
		if ((flow = flow_get(ft)) == NULL) {
			logit(LOG_ERR, "process_packet: flow_get failed",
			    sizeof(*flow));
			return (PP_MALLOC_FAIL);
		}
		memcpy(flow, &tmp, sizeof(*flow));
		memcpy(&flow->flow_start, received_time,
		    sizeof(flow->flow_start));
		flow->flow_seq = ft->next_flow_seq++;
		FLOW_INSERT(FLOWS, &ft->flows, flow);

		/* Allocate and fill in the associated expiry event */
		if ((flow->expiry = expiry_get(ft)) == NULL) {
			logit(LOG_ERR, "process_packet: expiry_get failed",
			    sizeof(*flow->expiry));
			return (PP_MALLOC_FAIL);
		}
		flow->expiry->flow = flow;
		/* Must be non-zero (0 means expire immediately) */
		flow->expiry->expires_at = 1;
		flow->expiry->reason = R_GENERAL;
		EXPIRY_INSERT(EXPIRIES, &ft->expiries, flow->expiry);

		ft->num_flows++;
		if (verbose_flag)
			logit(LOG_DEBUG, "ADD FLOW %s",
			    format_flow_brief(flow));
	} else {
		/* Update flow statistics */
		flow->packets[0] += tmp.packets[0];
		flow->octets[0] += tmp.octets[0];
		flow->tcp_flags[0] |= tmp.tcp_flags[0];
		flow->packets[1] += tmp.packets[1];
		flow->octets[1] += tmp.octets[1];
		flow->tcp_flags[1] |= tmp.tcp_flags[1];
	}
	
	memcpy(&flow->flow_last, received_time, sizeof(flow->flow_last));

	if (flow->expiry->expires_at != 0)
		flow_update_expiry(ft, flow);

	return (PP_OK);
}

/*
 * Subtract two timevals. Returns (t1 - t2) in milliseconds.
 *
 * A negative microsecond difference borrows one second. The result is
 * truncated (not rounded) to whole milliseconds and reduced to 32 bits.
 */
u_int32_t
timeval_sub_ms(const struct timeval *t1, const struct timeval *t2)
{
	time_t secs = t1->tv_sec - t2->tv_sec;
	long usecs = t1->tv_usec - t2->tv_usec;

	if (usecs < 0) {
		usecs += 1000000L;
		secs--;
	}

	return ((u_int32_t)secs * 1000 + (u_int32_t)usecs / 1000);
}

/*
 * Fold one new sample into a running min/mean/max statistic.
 * n is the number of samples seen so far including this one; the first
 * sample (n == 1.0) initialises all three values.
 */
static void
update_statistic(struct STATISTIC *s, double new, double n)
{
	if (n != 1.0) {
		s->min = MIN(s->min, new);
		s->max = MAX(s->max, new);
		/* Incremental mean: move towards the sample by 1/n */
		s->mean = s->mean + ((new - s->mean) / n);
	} else {
		s->min = s->mean = s->max = new;
	}
}

/*
 * Update global statistics with one expired flow: overall and
 * per-protocol duration, octet and packet figures.
 *
 * The overall sample count is kept in a function-local static, so it is
 * shared by every FLOWTRACK passed to this function.
 */
static void
update_statistics(struct FLOWTRACK *ft, struct FLOW *flow)
{
	static double n = 1.0;
	/* One index expression for every per-protocol table */
	const int proto = flow->protocol % 256;
	double tmp;

	ft->flows_expired++;
	ft->flows_pp[proto]++;

	/* Flow duration in seconds, never negative */
	tmp = (double)flow->flow_last.tv_sec +
	    ((double)flow->flow_last.tv_usec / 1000000.0);
	tmp -= (double)flow->flow_start.tv_sec +
	    ((double)flow->flow_start.tv_usec / 1000000.0);
	if (tmp < 0.0)
		tmp = 0.0;

	update_statistic(&ft->duration, tmp, n);
	update_statistic(&ft->duration_pp[proto], tmp, 
	    (double)ft->flows_pp[proto]);

	/* Convert before adding so the sum cannot wrap in the counter type */
	tmp = (double)flow->octets[0] + (double)flow->octets[1];
	update_statistic(&ft->octets, tmp, n);
	ft->octets_pp[proto] += tmp;

	tmp = (double)flow->packets[0] + (double)flow->packets[1];
	update_statistic(&ft->packets, tmp, n);
	ft->packets_pp[proto] += tmp;

	n++;
}

/*
 * Increment the per-reason expiry counter that matches e->reason.
 * Reasons that are not listed here leave all counters unchanged.
 */
static void 
update_expiry_stats(struct FLOWTRACK *ft, struct EXPIRY *e)
{
	if (e->reason == R_GENERAL)
		ft->expired_general++;
	else if (e->reason == R_TCP)
		ft->expired_tcp++;
	else if (e->reason == R_TCP_RST)
		ft->expired_tcp_rst++;
	else if (e->reason == R_TCP_FIN)
		ft->expired_tcp_fin++;
	else if (e->reason == R_UDP)
		ft->expired_udp++;
	else if (e->reason == R_ICMP)
		ft->expired_icmp++;
	else if (e->reason == R_MAXLIFE)
		ft->expired_maxlife++;
	else if (e->reason == R_OVERBYTES)
		ft->expired_overbytes++;
	else if (e->reason == R_OVERFLOWS)
		ft->expired_maxflows++;
	else if (e->reason == R_FLUSH)
		ft->expired_flush++;
}

/*
 * How long before the next expiry event, in milliseconds.
 *
 * Returns -1 if there are no expiry events (wait indefinitely), 0 if the
 * earliest event is already due, otherwise the delay until that event
 * rounded up to the next expiry_interval boundary. The result is capped
 * at 0x7fffffff so that it always fits the int return type.
 */
static int
next_expire(struct FLOWTRACK *ft)
{
	struct EXPIRY *expiry;
	struct timeval now;
	u_int32_t expires_at, fudge;
	u_int64_t ms;

	if ((expiry = EXPIRY_MIN(EXPIRIES, &ft->expiries)) == NULL)
		return (-1); /* indefinite */

	expires_at = expiry->expires_at;

	/* Don't cluster urgent expiries */
	if (expires_at == 0 && (expiry->reason == R_OVERBYTES || 
	    expiry->reason == R_OVERFLOWS || expiry->reason == R_FLUSH))
		return (0); /* Now */

	/* Cluster expiries by expiry_interval */
	if (ft->expiry_interval > 1) {
		if ((fudge = expires_at % ft->expiry_interval) > 0)
			expires_at += ft->expiry_interval - fudge;
	}

	gettimeofday(&now, NULL);
	if (expires_at < now.tv_sec)
		return (0); /* Now */

	/*
	 * Compute in 64 bits: a 32-bit product wraps for events more than
	 * a few weeks away and could even come out negative as an int.
	 */
	ms = 999 + (u_int64_t)(expires_at - now.tv_sec) * 1000;
	if (ms > 0x7fffffff)
		ms = 0x7fffffff;

	return ((int)ms);
}

/*
 * Scan the tree of expiry events and process expired flows. If zap_all
 * is set, then forcibly expire all flows.
 */
#define CE_EXPIRE_NORMAL	0  /* Normal expiry processing */
#define CE_EXPIRE_ALL		-1 /* Expire all flows immediately */
#define CE_EXPIRE_FORCED	1  /* Only expire force-expired flows */
static int
check_expired(struct FLOWTRACK *ft, struct NETFLOW_TARGET *target, int ex)
{
	struct FLOW **expired_flows, **oldexp;
	int num_expired, i, r;
	struct timeval now;

	struct EXPIRY *expiry, *nexpiry;

	gettimeofday(&now, NULL);
	r = 0;
	num_expired = 0;
	expired_flows = NULL;

	if (verbose_flag)
		logit(LOG_DEBUG, "Starting expiry scan: mode %d", ex);

	for(expiry = EXPIRY_MIN(EXPIRIES, &ft->expiries);
	    expiry != NULL;
	    expiry = nexpiry) {
		nexpiry = EXPIRY_NEXT(EXPIRIES, &ft->expiries, expiry);
		if ((expiry->expires_at == 0) || (ex == CE_EXPIRE_ALL) || 
		    (ex != CE_EXPIRE_FORCED &&
		    (expiry->expires_at < now.tv_sec))) {
			/* Flow has expired */

			if (ft->maximum_lifetime != 0 && 
	    		    expiry->flow->flow_last.tv_sec - 
			    expiry->flow->flow_start.tv_sec >= 
	    		    ft->maximum_lifetime)
					expiry->reason = R_MAXLIFE;

			if (verbose_flag)
				logit(LOG_DEBUG,
				    "Queuing flow seq:%"PRIu64" (%p) for expiry "
				    "reason %d", expiry->flow->flow_seq,
				    expiry->flow, expiry->reason);

			/* Add to array of expired flows */
			oldexp = expired_flows;
			expired_flows = realloc(expired_flows,
			    sizeof(*expired_flows) * (num_expired + 1));
			/* Don't fatal on realloc failures */
			if (expired_flows == NULL)
				expired_flows = oldexp;
			else {
				expired_flows[num_expired] = expiry->flow;
				num_expired++;
			}

			if (ex == CE_EXPIRE_ALL)
				expiry->reason = R_FLUSH;

			update_expiry_stats(ft, expiry);

			/* Remove from flow tree, destroy expiry event */
			FLOW_REMOVE(FLOWS, &ft->flows, expiry->flow);
			EXPIRY_REMOVE(EXPIRIES, &ft->expiries, expiry);
			expiry->flow->expiry = NULL;
			expiry_put(ft, expiry);

			ft->num_flows--;
		}
	}

	if (verbose_flag)
		logit(LOG_DEBUG, "Finished scan %d flow(s) to be evicted",
		    num_expired);
	
	/* Processing for expired flows */
	if (num_expired > 0) {
		if (target != NULL && target->fd != -1) {
			r = target->dialect->func(expired_flows, num_expired, 
			    target->fd, if_index, &ft->flows_exported, 
			    &ft->system_boot_time,  verbose_flag, &ft->option);
			if (verbose_flag)
				logit(LOG_DEBUG, "sent %d netflow packets", r);
			if (r > 0) {
				ft->packets_sent += r;
				/* XXX what if r < num_expired * 2 ? */
			} else {
				ft->flows_dropped += num_expired * 2;
			}
		}
		for (i = 0; i < num_expired; i++) {
			if (verbose_flag) {
				logit(LOG_DEBUG, "EXPIRED: %s (%p)", 
				    format_flow(expired_flows[i]),
				    expired_flows[i]);
			}
			update_statistics(ft, expired_flows[i]);
			flow_put(ft, expired_flows[i]);
		}
	
		free(expired_flows);
	}

	return (r == -1 ? -1 : num_expired);
}

/*
 * Force expiry of num_to_expire flows (e.g. when flow table overfull).
 *
 * The first num_to_expire events in expiry-tree order are marked for
 * immediate expiry (expires_at = 0, reason R_OVERFLOWS); the flows
 * themselves are processed by a later check_expired() call. A request
 * for zero flows changes nothing.
 */
static void
force_expire(struct FLOWTRACK *ft, u_int32_t num_to_expire)
{
	struct EXPIRY *expiry, **expiryv;
	u_int32_t i;

	/* XXX move all overflow processing here (maybe) */
	if (verbose_flag)
		logit(LOG_INFO, "Forcing expiry of %u flows",
		    num_to_expire);

	/*
	 * calloc(0, ...) is allowed to return NULL, which must not be
	 * mistaken for an out-of-memory condition below.
	 */
	if (num_to_expire == 0)
		return;

	/*
	 * Do this in two steps, as it is dangerous to change a key on 
	 * a tree entry without first removing it and then re-adding it.
	 * It is even worse when this has to be done during a FOREACH :)
	 * To get around this, we make a list of expired flows and _then_ 
	 * alter them 
	 */
	 
	if ((expiryv = calloc(num_to_expire, sizeof(*expiryv))) == NULL) {
		/*
		 * On malloc failure, expire ALL flows. I assume that 
		 * setting all the keys in a tree to the same value is 
		 * safe.
		 */
		logit(LOG_ERR, "Out of memory while expiring flows - "
		    "all flows expired");
		EXPIRY_FOREACH(expiry, EXPIRIES, &ft->expiries) {
			expiry->expires_at = 0;
			expiry->reason = R_OVERFLOWS;
			ft->flows_force_expired++;
		}
		return;
	}
	
	/* Make the list of flows to expire */
	i = 0;
	EXPIRY_FOREACH(expiry, EXPIRIES, &ft->expiries) {
		if (i >= num_to_expire)
			break;
		expiryv[i++] = expiry;
	}
	if (i < num_to_expire) {
		logit(LOG_ERR, "Needed to expire %u flows, "
		    "but only %u active", num_to_expire, i);
		num_to_expire = i;
	}

	/* Re-key each selected event outside of the tree iteration */
	for(i = 0; i < num_to_expire; i++) {
		EXPIRY_REMOVE(EXPIRIES, &ft->expiries, expiryv[i]);
		expiryv[i]->expires_at = 0;
		expiryv[i]->reason = R_OVERFLOWS;
		EXPIRY_INSERT(EXPIRIES, &ft->expiries, expiryv[i]);
	}
	ft->flows_force_expired += num_to_expire;
	free(expiryv);
	/* XXX - this is overcomplicated, perhaps use a separate queue */
}

/*
 * Delete all flows that we know about without processing: every flow and
 * its expiry event is unlinked from its tree and returned to its
 * freelist. Nothing is exported and no statistics are updated.
 *
 * Returns the number of flows deleted.
 */
static int
delete_all_flows(struct FLOWTRACK *ft)
{
	struct FLOW *cur, *following;
	int deleted = 0;

	cur = FLOW_MIN(FLOWS, &ft->flows);
	while (cur != NULL) {
		/* Look up the successor before the current node is unlinked */
		following = FLOW_NEXT(FLOWS, &ft->flows, cur);
		FLOW_REMOVE(FLOWS, &ft->flows, cur);

		EXPIRY_REMOVE(EXPIRIES, &ft->expiries, cur->expiry);
		expiry_put(ft, cur->expiry);

		ft->num_flows--;
		flow_put(ft, cur);
		deleted++;

		cur = following;
	}

	return (deleted);
}

/*
 * Write our current status to "out".
 *
 * Prints the summary counters and the libpcap capture statistics (when
 * pcap_stats() succeeds). If at least one flow has expired, this is
 * followed by min/mean/max figures for expired flows, the per-reason
 * expiry counters and a per-protocol table. Always returns 0.
 */
static int
statistics(struct FLOWTRACK *ft, FILE *out, pcap_t *pcap)
{
	struct pcap_stat capstat;
	struct protoent *ent;
	char label[32];
	int p;

	fprintf(out, "Number of active flows: %d\n", ft->num_flows);
	fprintf(out, "Packets processed: %"PRIu64"\n", ft->total_packets);
	if (ft->non_sampled_packets) 
		fprintf(out, "Packets non-sampled: %"PRIu64"\n",
			ft->non_sampled_packets);
	fprintf(out, "Fragments: %"PRIu64"\n", ft->frag_packets);
	fprintf(out, "Ignored packets: %"PRIu64" (%"PRIu64" non-IP, %"PRIu64" too short)\n",
	    ft->non_ip_packets + ft->bad_packets, ft->non_ip_packets, ft->bad_packets);
	fprintf(out, "Flows expired: %"PRIu64" (%"PRIu64" forced)\n", 
	    ft->flows_expired, ft->flows_force_expired);
	fprintf(out, "Flows exported: %"PRIu64" in %"PRIu64" packets (%"PRIu64" failures)\n",
	    ft->flows_exported, ft->packets_sent, ft->flows_dropped);

	if (pcap_stats(pcap, &capstat) == 0) {
		fprintf(out, "Packets received by libpcap: %lu\n",
		    (unsigned long)capstat.ps_recv);
		fprintf(out, "Packets dropped by libpcap: %lu\n",
		    (unsigned long)capstat.ps_drop);
		fprintf(out, "Packets dropped by interface: %lu\n",
		    (unsigned long)capstat.ps_ifdrop);
	}

	fprintf(out, "\n");

	/* Everything below describes expired flows only */
	if (ft->flows_expired == 0)
		return (0);

	fprintf(out, "Expired flow statistics:  minimum       average       maximum\n");
	fprintf(out, "  Flow bytes:        %12.0f  %12.0f  %12.0f\n", 
	    ft->octets.min, ft->octets.mean, ft->octets.max);
	fprintf(out, "  Flow packets:      %12.0f  %12.0f  %12.0f\n", 
	    ft->packets.min, ft->packets.mean, ft->packets.max);
	fprintf(out, "  Duration:          %12.2fs %12.2fs %12.2fs\n", 
	    ft->duration.min, ft->duration.mean, ft->duration.max);

	fprintf(out, "\n");
	fprintf(out, "Expired flow reasons:\n");
	fprintf(out, "       tcp = %9"PRIu64"   tcp.rst = %9"PRIu64"   "
	    "tcp.fin = %9"PRIu64"\n", ft->expired_tcp, ft->expired_tcp_rst,
	    ft->expired_tcp_fin);
	fprintf(out, "       udp = %9"PRIu64"      icmp = %9"PRIu64"   "
	    "general = %9"PRIu64"\n", ft->expired_udp, ft->expired_icmp,
	    ft->expired_general);
	fprintf(out, "   maxlife = %9"PRIu64"\n", ft->expired_maxlife);
	fprintf(out, "over 2 GiB = %9"PRIu64"\n", ft->expired_overbytes);
	fprintf(out, "  maxflows = %9"PRIu64"\n", ft->expired_maxflows);
	fprintf(out, "   flushed = %9"PRIu64"\n", ft->expired_flush);

	fprintf(out, "\n");

	fprintf(out, "Per-protocol statistics:     Octets      "
	    "Packets   Avg Life    Max Life\n");
	for (p = 0; p < 256; p++) {
		/* Only list protocols that carried at least one packet */
		if (!ft->packets_pp[p])
			continue;

		ent = getprotobynumber(p);
		snprintf(label, sizeof(label), "%s (%d)", 
		    ent != NULL ? ent->p_name : "Unknown", p);
		fprintf(out, "  %17s: %14"PRIu64" %12"PRIu64"   %8.2fs "
		    "%10.2fs\n", label,
		    ft->octets_pp[p], 
		    ft->packets_pp[p],
		    ft->duration_pp[p].mean,
		    ft->duration_pp[p].max);
	}

	return (0);
}

/*
 * Write every active flow, in expiry-tree order, to "out" together with
 * the time remaining until its expiry event. Events whose time has
 * already passed are reported as due "now"; those with expires_at == 0
 * are additionally tagged "(FORCED)".
 */
static void
dump_flows(struct FLOWTRACK *ft, FILE *out)
{
	struct EXPIRY *ev;
	time_t now = time(NULL);
	long int remaining;

	EXPIRY_FOREACH(ev, EXPIRIES, &ft->expiries) {
		fprintf(out, "ACTIVE %s\n", format_flow(ev->flow));

		remaining = (long int) ev->expires_at - now;
		if (remaining >= 0) {
			fprintf(out, 
			    "EXPIRY EVENT for flow %"PRIu64" in %ld seconds\n",
			    ev->flow->flow_seq, remaining);
		} else {
			fprintf(out, 
			    "EXPIRY EVENT for flow %"PRIu64" now%s\n",
			    ev->flow->flow_seq, 
			    ev->expires_at == 0 ? " (FORCED)": "");
		}
		fprintf(out, "\n");
	}
}

/*
 * Figure out how many bytes to skip from the front of a packet to get
 * past the datalink header. If pkt is specified, also determine whether
 * the frame is one that we are interested in (IPv4 or IPv6 for now) and
 * store its address family in *af.
 *
 * Returns the number of bytes to skip, or -1 to indicate that the entire
 * packet should be skipped (unknown datalink type, packet too short or
 * frame type that is neither IPv4 nor IPv6). If pkt is NULL, the matched
 * datalink type is returned instead (-1 if the type is unknown) and *af
 * is left untouched.
 */
static int 
datalink_check(int linktype, const u_int8_t *pkt, u_int32_t caplen, int *af)
{
	/* Entry matched by the previous call, reused while linktype repeats */
	static const struct DATALINK *dl = NULL;
	u_int32_t frametype = 0;
	int slot, k, pos;

	if (dl == NULL || dl->dlt != linktype) {
		slot = 0;
		/* Stops on a match or on the dlt == -1 terminator */
		while (lt[slot].dlt != linktype && lt[slot].dlt != -1)
			slot++;
		dl = &lt[slot];
	}

	if (dl->dlt == -1 || pkt == NULL)
		return (dl->dlt);
	if (caplen <= dl->skiplen)
		return (-1);

	/*
	 * Assemble the frametype one byte at a time, most significant
	 * byte first: big-endian fields are read front to back, the others
	 * back to front.
	 */
	for (k = 0; k < dl->ft_len; k++) {
		pos = dl->ft_is_be ? k : dl->ft_len - 1 - k;
		frametype <<= 8;
		frametype |= pkt[pos + dl->ft_off];
	}
	frametype &= dl->ft_mask;

	if (frametype == dl->ft_v4) {
		*af = AF_INET;
		return (dl->skiplen);
	}
	if (frametype == dl->ft_v6) {
		*af = AF_INET6;
		return (dl->skiplen);
	}

	return (-1);
}

/*
 * Per-packet callback function from libpcap. Pass the packet (if it is IP)
 * sans datalink headers to process_packet.
 *
 * user_data points at the struct CB_CTXT for this capture. When sampling
 * is enabled (option.sample non-zero), only packets for which the running
 * count of seen packets is a multiple of the sample value are processed;
 * the rest are counted as non-sampled. A PP_MALLOC_FAIL result from
 * process_packet() sets the context's fatal flag.
 */
static void
flow_cb(u_char *user_data, const struct pcap_pkthdr* phdr, 
    const u_char *pkt)
{
	struct CB_CTXT *ctx = (struct CB_CTXT *)user_data;
	struct timeval stamp;
	int skip, af;

	if (ctx->ft->option.sample &&
	    (ctx->ft->total_packets + ctx->ft->non_sampled_packets) %
	    ctx->ft->option.sample > 0) {
		ctx->ft->non_sampled_packets++;
		return;
	}

	skip = datalink_check(ctx->linktype, pkt, phdr->caplen, &af);
	/* af is only meaningful when datalink_check() succeeded */
	if (skip < 0 || (!ctx->want_v6 && af == AF_INET6)) {
		ctx->ft->non_ip_packets++;
		return;
	}

	stamp.tv_sec = phdr->ts.tv_sec;
	stamp.tv_usec = phdr->ts.tv_usec;
	if (process_packet(ctx->ft, pkt + skip, af, phdr->caplen - skip,
	    phdr->len - skip, &stamp) == PP_MALLOC_FAIL)
		ctx->fatal = 1;
}

/*
 * Write the currently configured flow timeouts of ft, one per line and in
 * seconds, to the stream out.
 */
static void
print_timeouts(struct FLOWTRACK *ft, FILE *out)
{
/* Emit one labelled timeout line for the named member of ft */
#define SHOW_TIMEOUT(fmt, member) fprintf(out, fmt, ft->member)
	SHOW_TIMEOUT("           TCP timeout: %ds\n", tcp_timeout);
	SHOW_TIMEOUT("  TCP post-RST timeout: %ds\n", tcp_rst_timeout);
	SHOW_TIMEOUT("  TCP post-FIN timeout: %ds\n", tcp_fin_timeout);
	SHOW_TIMEOUT("           UDP timeout: %ds\n", udp_timeout);
	SHOW_TIMEOUT("          ICMP timeout: %ds\n", icmp_timeout);
	SHOW_TIMEOUT("       General timeout: %ds\n", general_timeout);
	SHOW_TIMEOUT("      Maximum lifetime: %ds\n", maximum_lifetime);
	SHOW_TIMEOUT("       Expiry interval: %ds\n", expiry_interval);
#undef SHOW_TIMEOUT
}

/*
 * Accept one connection on the listening control socket lsock, read a
 * single command line from it, act on the command and write the reply
 * back on the same connection.
 *
 * "exit" sets *exit_request, "stop-gather"/"start-gather" set and clear
 * *stop_collection_flag, and "shutdown" sets the global
 * graceful_shutdown_request.
 *
 * Returns -1 if the connection could not be accepted or wrapped in a
 * stream, 1 for the commands that ask the caller to terminate
 * ("shutdown" and "exit"), and 0 in every other case (including an
 * unknown command or a connection that supplied no data).
 */
static int
accept_control(int lsock, struct NETFLOW_TARGET *target, struct FLOWTRACK *ft,
    pcap_t *pcap, int *exit_request, int *stop_collection_flag)
{
	char buf[64], *p;
	FILE *ctlf;
	int fd, ret;
	unsigned int pid = (unsigned int)getpid();

	if ((fd = accept(lsock, NULL, NULL)) == -1) {
		logit(LOG_ERR, "ctl accept: %s - exiting",
		    strerror(errno));
		return(-1);
	}
	if ((ctlf = fdopen(fd, "r+")) == NULL) {
		logit(LOG_ERR, "fdopen: %s - exiting\n",
		    strerror(errno));
		close(fd);
		return (-1);
	}
	setlinebuf(ctlf);

	if (fgets(buf, sizeof(buf), ctlf) == NULL) {
		logit(LOG_ERR, "Control socket yielded no data");
		/* Release the stream (and with it fd) before bailing out */
		fclose(ctlf);
		return (0);
	}
	/* Strip the line terminator so the command can be compared as-is */
	if ((p = strchr(buf, '\n')) != NULL)
		*p = '\0';
	
	if (verbose_flag)
		logit(LOG_DEBUG, "Control socket \"%s\"", buf);

	/* XXX - use dispatch table */
	ret = -1;
	if (strcmp(buf, "help") == 0) {
		fprintf(ctlf, "Valid control words are:\n");
		fprintf(ctlf, "\tdebug+ debug- delete-all dump-flows exit "
		    "expire-all\n");
		fprintf(ctlf, "\tshutdown start-gather statistics stop-gather "
		    "timeouts\n");
		fprintf(ctlf, "\tsend-template\n");
		ret = 0;
	} else if (strcmp(buf, "shutdown") == 0) {
		fprintf(ctlf, "softflowd[%u]: Shutting down gracefully...\n", 
		    pid);
		graceful_shutdown_request = 1;
		ret = 1;
	} else if (strcmp(buf, "exit") == 0) {
		fprintf(ctlf, "softflowd[%u]: Exiting now...\n", pid);
		*exit_request = 1;
		ret = 1;
	} else if (strcmp(buf, "expire-all") == 0) {
		netflow9_resend_template();
		fprintf(ctlf, "softflowd[%u]: Expired %d flows.\n", pid, 
		    check_expired(ft, target, CE_EXPIRE_ALL));
		ret = 0;
	} else if (strcmp(buf, "send-template") == 0) {
		netflow9_resend_template();
		fprintf(ctlf, "softflowd[%u]: Template will be sent at "
		    "next flow export\n", pid);
		ret = 0;
	} else if (strcmp(buf, "delete-all") == 0) {
		fprintf(ctlf, "softflowd[%u]: Deleted %d flows.\n", pid, 
		    delete_all_flows(ft));
		ret = 0;
	} else if (strcmp(buf, "statistics") == 0) {
		fprintf(ctlf, "softflowd[%u]: Accumulated statistics "
		    "since %s UTC:\n", pid,
		    format_time(ft->system_boot_time.tv_sec));
		statistics(ft, ctlf, pcap);
		ret = 0;
	} else if (strcmp(buf, "debug+") == 0) {
		fprintf(ctlf, "softflowd[%u]: Debug level increased.\n",
		    pid);
		verbose_flag = 1;
		ret = 0;
	} else if (strcmp(buf, "debug-") == 0) {
		fprintf(ctlf, "softflowd[%u]: Debug level decreased.\n",
		    pid);
		verbose_flag = 0;
		ret = 0;
	} else if (strcmp(buf, "stop-gather") == 0) {
		fprintf(ctlf, "softflowd[%u]: Data collection stopped.\n",
		    pid);
		*stop_collection_flag = 1;
		ret = 0;
	} else if (strcmp(buf, "start-gather") == 0) {
		fprintf(ctlf, "softflowd[%u]: Data collection resumed.\n",
		    pid);
		*stop_collection_flag = 0;
		ret = 0;
	} else if (strcmp(buf, "dump-flows") == 0) {
		fprintf(ctlf, "softflowd[%u]: Dumping flow data:\n",
		    pid);
		dump_flows(ft, ctlf);
		ret = 0;
	} else if (strcmp(buf, "timeouts") == 0) {
		fprintf(ctlf, "softflowd[%u]: Printing timeouts:\n",
		    pid);
		print_timeouts(ft, ctlf);
		ret = 0;
	} else {
		fprintf(ctlf, "Unknown control commmand \"%s\"\n", buf);
		ret = 0;
	}

	/*
	 * fclose() flushes the reply and closes the descriptor that was
	 * handed to fdopen(), so fd must not be closed a second time.
	 */
	fclose(ctlf);
	
	return (ret);
}

/*
 * Create a datagram socket of addr's address family and connect it to
 * addr (of length len).
 *
 * hoplimit is the multicast TTL (IPv4) or multicast hop limit (IPv6) to
 * set on the socket. A value of -1 means "not specified": the option is
 * then left alone, except that a multicast destination gets the
 * link-local value 1.
 *
 * Returns the connected socket. Any failure is reported on stderr and
 * terminates the process.
 */
static int
connsock(struct sockaddr_storage *addr, socklen_t len, int hoplimit)
{
	int s;
	unsigned int h6;
	unsigned char h4;
	struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;

	if ((s = socket(addr->ss_family, SOCK_DGRAM, 0)) == -1) {
		fprintf(stderr, "socket() error: %s\n", 
		    strerror(errno));
		exit(1);
	}
	if (connect(s, (struct sockaddr*)addr, len) == -1) {
		fprintf(stderr, "connect() error: %s\n",
		    strerror(errno));
		exit(1);
	}

	switch (addr->ss_family) {
	case AF_INET:
		/*
		 * Default to link-local TTL for multicast addresses.
		 * IN_MULTICAST() tests a host byte order value, while
		 * sin_addr.s_addr is kept in network byte order, so the
		 * address has to be converted before the test.
		 */
		if (hoplimit == -1 &&
		    IN_MULTICAST(ntohl(in4->sin_addr.s_addr)))
			hoplimit = 1;
		if (hoplimit == -1)
			break;
		h4 = hoplimit;
		if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL,
		    &h4, sizeof(h4)) == -1) {
			fprintf(stderr, "setsockopt(IP_MULTICAST_TTL, "
			    "%u): %s\n", h4, strerror(errno));
			exit(1);
		}
		break;
	case AF_INET6:
		/* Default to link-local hoplimit for multicast addresses */
		if (hoplimit == -1 && IN6_IS_ADDR_MULTICAST(&in6->sin6_addr))
			hoplimit = 1;
		if (hoplimit == -1)
			break;
		h6 = hoplimit;
		if (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
		    &h6, sizeof(h6)) == -1) {
			fprintf(stderr, "setsockopt(IPV6_MULTICAST_HOPS, %u): "
			"%s\n", h6, strerror(errno));
			exit(1);
		}
		break;
	default:
		/* No hop limit handling for other address families */
		break;
	}

	return(s);
}

/*
 * Create a listening unix domain stream socket bound to path, removing
 * whatever was previously at that path first.
 *
 * Returns the listening socket. Any failure (including a path that does
 * not fit into sun_path) is reported on stderr and terminates the process.
 */
static int 
unix_listener(const char *path)
{
	struct sockaddr_un un;
	const size_t pathmax = sizeof(un.sun_path);
	socklen_t unlen;
	int lsock;

	memset(&un, '\0', sizeof(un));
	un.sun_family = AF_UNIX;

	/* strlcpy reports the length it wanted to copy; >= size is truncation */
	if (strlcpy(un.sun_path, path, pathmax) >= pathmax) {
		fprintf(stderr, "control socket path too long\n");
		exit(1);
	}
	un.sun_path[pathmax - 1] = '\0';

	/* Address length covers the header, the path and its terminator */
	unlen = offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
#ifdef SOCK_HAS_LEN 
	un.sun_len = unlen;
#endif

	lsock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (lsock < 0) {
		fprintf(stderr, "unix domain socket() error: %s\n", 
		    strerror(errno));
		exit(1);
	}

	/* Clear out a stale socket file so that bind() can succeed */
	unlink(path);

	if (bind(lsock, (struct sockaddr*)&un, unlen) == -1) {
		fprintf(stderr, "unix domain bind(\"%s\") error: %s\n",
		    un.sun_path, strerror(errno));
		exit(1);
	}
	if (listen(lsock, 64) == -1) {
		fprintf(stderr, "unix domain listen() error: %s\n",
		    strerror(errno));
		exit(1);
	}

	return (lsock);
}

/*
 * Open the packet source and prepare it for capture.
 *
 * If dev is not NULL the named interface is opened live in promiscuous
 * mode, with a snap length chosen by need_v6; otherwise the capture file
 * capfile is opened. The resulting handle is stored in *pcap and its
 * datalink type in *linktype. If bpf_prog is not NULL it is compiled and
 * installed as the capture filter.
 *
 * Any failure, including an unsupported datalink type, is reported on
 * stderr and terminates the process.
 */
static void
setup_packet_capture(struct pcap **pcap, int *linktype, 
    char *dev, char *capfile, char *bpf_prog, int need_v6)
{
	char ebuf[PCAP_ERRBUF_SIZE];
	struct bpf_program prog_c;
	u_int32_t bpf_mask, bpf_net;

	/* Open pcap */
	if (dev != NULL) {
		if ((*pcap = pcap_open_live(dev, 
		    need_v6 ? LIBPCAP_SNAPLEN_V6 : LIBPCAP_SNAPLEN_V4, 
		    1, 0, ebuf)) == NULL) {
			fprintf(stderr, "pcap_open_live: %s\n", ebuf);
			exit(1);
		}
		/* A failed network lookup is not fatal: use a zero netmask */
		if (pcap_lookupnet(dev, &bpf_net, &bpf_mask, ebuf) == -1)
			bpf_net = bpf_mask = 0;
	} else {
		if ((*pcap = pcap_open_offline(capfile, ebuf)) == NULL) {
			fprintf(stderr, "pcap_open_offline(%s): %s\n", 
			    capfile, ebuf);
			exit(1);
		}
		bpf_net = bpf_mask = 0;
	}
	*linktype = pcap_datalink(*pcap);
	/* With a NULL packet datalink_check only reports linktype support */
	if (datalink_check(*linktype, NULL, 0, NULL) == -1) {
		fprintf(stderr, "Unsupported datalink type %d\n", *linktype);
		exit(1);
	}
	/* Attach BPF filter, if specified */
	if (bpf_prog != NULL) {
		if (pcap_compile(*pcap, &prog_c, bpf_prog, 1, bpf_mask) == -1) {
			fprintf(stderr, "pcap_compile(\"%s\"): %s\n", 
			    bpf_prog, pcap_geterr(*pcap));
			exit(1);
		}
		if (pcap_setfilter(*pcap, &prog_c) == -1) {
			fprintf(stderr, "pcap_setfilter: %s\n", 
			    pcap_geterr(*pcap));
			exit(1);
		}
		/* The compiled program is not needed once it is installed */
		pcap_freecode(&prog_c);
	}

#ifdef BIOCLOCK
	/*
	 * If we are reading from a device (not a file), then 
	 * lock the underlying BPF device to prevent changes in the 
	 * unprivileged child
	 */
	if (dev != NULL && ioctl(pcap_fileno(*pcap), BIOCLOCK) < 0) {
		fprintf(stderr, "ioctl(BIOCLOCK) failed: %s\n",
		    strerror(errno));
		exit(1);
	}
#endif
}

/*
 * Reset *ft to an empty flow table carrying the compiled-in defaults for
 * the flow limit, tracking level and all timeouts.
 */
static void
init_flowtrack(struct FLOWTRACK *ft)
{
	/* Start from an all-zero structure */
	memset(ft, '\0', sizeof *ft);

	/* Scalar defaults */
	ft->next_flow_seq = 1;
	ft->max_flows = DEFAULT_MAX_FLOWS;
	ft->track_level = TRACK_FULL;

	/* Default timeouts */
	ft->tcp_timeout = DEFAULT_TCP_TIMEOUT;
	ft->tcp_rst_timeout = DEFAULT_TCP_RST_TIMEOUT;
	ft->tcp_fin_timeout = DEFAULT_TCP_FIN_TIMEOUT;
	ft->udp_timeout = DEFAULT_UDP_TIMEOUT;
	ft->icmp_timeout = DEFAULT_ICMP_TIMEOUT;
	ft->general_timeout = DEFAULT_GENERAL_TIMEOUT;
	ft->maximum_lifetime = DEFAULT_MAXIMUM_LIFETIME;
	ft->expiry_interval = DEFAULT_EXPIRY_INTERVAL;

	/* Flow and expiry containers plus their backing free lists */
	FLOW_INIT(&ft->flows);
	EXPIRY_INIT(&ft->expiries);
	freelist_init(&ft->flow_freelist, sizeof(struct FLOW));
	freelist_init(&ft->expiry_freelist, sizeof(struct EXPIRY));
}

/*
 * Join the argc strings in argv into one newly allocated string, with a
 * single space between neighbouring arguments.
 *
 * Returns NULL when there are no arguments; otherwise the caller owns the
 * returned string and releases it with free(). An allocation failure is
 * reported on stderr and terminates the process.
 */
static char *
argv_join(int argc, char **argv)
{
	size_t total, len, off;
	char *ret;
	int i;

	if (argc <= 0)
		return (NULL);

	/* First pass: all argument bytes plus one separator per gap */
	total = (size_t)(argc - 1);
	for (i = 0; i < argc; i++)
		total += strlen(argv[i]);

	if ((ret = malloc(total + 1)) == NULL) {
		fprintf(stderr, "Memory allocation failed.\n");
		exit(1);
	}

	/* Second pass: append each argument at the running write offset */
	off = 0;
	for (i = 0; i < argc; i++) {
		if (i > 0)
			ret[off++] = ' ';
		len = strlen(argv[i]);
		memcpy(ret + off, argv[i], len);
		off += len;
	}
	ret[off] = '\0';

	return (ret);
}

/*
 * Display commandline usage information on stderr: the option summary
 * followed by the names and compiled-in defaults of the -t timeouts.
 */
static void
usage(void)
{
	fprintf(stderr, 
"Usage: %s [options] [bpf_program]\n"
"This is %s version %s. Valid commandline options:\n"
"  -i [idx:]interface Specify interface to listen on\n"
"  -r pcap_file       Specify packet capture file to read\n"
"  -t timeout=time    Specify named timeout\n"
"  -m max_flows       Specify maximum number of flows to track (default %d)\n"
"  -n host:port       Send Cisco NetFlow(tm)-compatible packets to host:port\n"
"  -p pidfile         Record pid in specified file\n"
"                     (default: %s)\n"
"  -c ctlsock         Location of control socket\n"
"                     (default: %s)\n"
"  -v 1|5|9           NetFlow export packet version\n"
"  -L hoplimit        Set TTL/hoplimit for export datagrams\n"
"  -T full|proto|ip   Set flow tracking level (default: full)\n"
"  -6                 Track IPv6 flows, regardless of whether selected \n"
"                     NetFlow export protocol supports it\n"
"  -d                 Don't daemonise (run in foreground)\n"
"  -D                 Debug mode: foreground + verbosity + track v6 flows\n"
"  -s sampling_rate   Specify periodical sampling rate (denominator)\n"
"  -h                 Display this help\n"
"\n"
"Valid timeout names and default values:\n"
"  tcp     (default %6d)"
"  tcp.rst (default %6d)"
"  tcp.fin (default %6d)\n"
"  udp     (default %6d)"
"  icmp    (default %6d)"
"  general (default %6d)\n"
"  maxlife (default %6d)"
"  expint  (default %6d)\n"
"\n" ,
	    PROGNAME, PROGNAME, PROGVER, DEFAULT_MAX_FLOWS, DEFAULT_PIDFILE,
	    DEFAULT_CTLSOCK, DEFAULT_TCP_TIMEOUT, DEFAULT_TCP_RST_TIMEOUT,
	    DEFAULT_TCP_FIN_TIMEOUT, DEFAULT_UDP_TIMEOUT, DEFAULT_ICMP_TIMEOUT,
	    DEFAULT_GENERAL_TIMEOUT, DEFAULT_MAXIMUM_LIFETIME,
	    DEFAULT_EXPIRY_INTERVAL);
}

/*
 * Apply one "-t name=time" specification to ft. The time part is parsed
 * by convtime(); name selects which timeout is replaced. A malformed
 * specification, a negative time, an unknown name or a resulting
 * "general" timeout of zero is reported on stderr and terminates the
 * process.
 */
static void
set_timeout(struct FLOWTRACK *ft, const char *to_spec)
{
	char *spec, *eq;
	int secs;

	/* Work on a private copy so it can be split at the '=' */
	spec = strdup(to_spec);
	if (spec == NULL) {
		fprintf(stderr, "Out of memory\n");
		exit(1);
	}

	/* Require "name=value" with a non-empty value */
	eq = strchr(spec, '=');
	if (eq == NULL || eq[1] == '\0') {
		fprintf(stderr, "Invalid -t option \"%s\".\n", spec);
		usage();
		exit(1);
	}
	*eq = '\0';

	secs = convtime(eq + 1);
	if (secs < 0) {
		fprintf(stderr, "Invalid -t timeout.\n");
		usage();
		exit(1);
	}

	/* spec now holds only the timeout name */
	if (!strcmp(spec, "tcp")) {
		ft->tcp_timeout = secs;
	} else if (!strcmp(spec, "tcp.rst")) {
		ft->tcp_rst_timeout = secs;
	} else if (!strcmp(spec, "tcp.fin")) {
		ft->tcp_fin_timeout = secs;
	} else if (!strcmp(spec, "udp")) {
		ft->udp_timeout = secs;
	} else if (!strcmp(spec, "icmp")) {
		ft->icmp_timeout = secs;
	} else if (!strcmp(spec, "general")) {
		ft->general_timeout = secs;
	} else if (!strcmp(spec, "maxlife")) {
		ft->maximum_lifetime = secs;
	} else if (!strcmp(spec, "expint")) {
		ft->expiry_interval = secs;
	} else {
		fprintf(stderr, "Invalid -t name.\n");
		usage();
		exit(1);
	}

	if (ft->general_timeout == 0) {
		fprintf(stderr, "\"general\" flow timeout must be "
		    "greater than zero\n");
		exit(1);
	}

	free(spec);
}

/*
 * Resolve a "host:port" (or "[host]:port" for numeric IPv6) string s into
 * a socket address.
 *
 * On entry *len is the space available at addr; on return it is the
 * length of the address that was stored there. Only the first address
 * returned by the resolver is used.
 *
 * A malformed argument, a failed lookup or an address that does not fit
 * is reported on stderr and terminates the process.
 */
static void
parse_hostport(const char *s, struct sockaddr *addr, socklen_t *len)
{
	char *orig, *host, *port;
	struct addrinfo hints, *res = NULL;
	int herr;

	if ((host = orig = strdup(s)) == NULL) {
		fprintf(stderr, "Out of memory\n");
		exit(1);
	}
	/* Split at the last ':' so that IPv6 literals keep their colons */
	if ((port = strrchr(host, ':')) == NULL ||
	    *(++port) == '\0' || *host == '\0') {
		fprintf(stderr, "Invalid -n argument.\n");
		usage();
		exit(1);
	}
	*(port - 1) = '\0';
	
	/* Accept [host]:port for numeric IPv6 addresses */
	if (*host == '[' && *(port - 2) == ']') {
		host++;
		*(port - 2) = '\0';
	}

	memset(&hints, '\0', sizeof(hints));
	hints.ai_socktype = SOCK_DGRAM;
	/* getaddrinfo returns 0 on success and a non-zero EAI_* code on error */
	if ((herr = getaddrinfo(host, port, &hints, &res)) != 0) {
		fprintf(stderr, "Address lookup failed: %s\n",
		    gai_strerror(herr));
		exit(1);
	}
	if (res == NULL || res->ai_addr == NULL) {
		fprintf(stderr, "No addresses found for [%s]:%s\n", host, port);
		exit(1);
	}
	if (res->ai_addrlen > *len) {
		fprintf(stderr, "Address too long\n");
		exit(1);
	}
	memcpy(addr, res->ai_addr, res->ai_addrlen);
	*len = res->ai_addrlen;

	/* The address has been copied out; release the resolver's list */
	freeaddrinfo(res);
	free(orig);
}

/*
 * Give up privileges: look up PRIVDROP_USER, confine the process to
 * PRIVDROP_CHROOT_DIR, then switch to that user's group and user ids.
 * Every failure is logged and terminates the process.
 */
static void 
drop_privs(void)
{
	struct passwd *pw;
	int rc;

	pw = getpwnam(PRIVDROP_USER);
	if (pw == NULL) {
		logit(LOG_ERR, "Unable to find unprivileged user \"%s\"", 
		    PRIVDROP_USER);
		exit(1);
	}

	/* Enter the chroot directory, chroot into it, then move to its root */
	if (chdir(PRIVDROP_CHROOT_DIR) != 0) {
		logit(LOG_ERR, "Unable to chdir to chroot directory \"%s\": %s",
		    PRIVDROP_CHROOT_DIR, strerror(errno));
		exit(1);
	}
	if (chroot(PRIVDROP_CHROOT_DIR) != 0) {
		logit(LOG_ERR, "Unable to chroot to directory \"%s\": %s",
		    PRIVDROP_CHROOT_DIR, strerror(errno));
		exit(1);
	}
	if (chdir("/") != 0) {
		logit(LOG_ERR, "Unable to chdir to chroot root: %s",
		    strerror(errno));
		exit(1);
	}

	/* Reduce the supplementary group list to the target group only */
	if (setgroups(1, &pw->pw_gid) != 0) {
		logit(LOG_ERR, "Couldn't setgroups (%u): %s",
		    (unsigned int)pw->pw_gid, strerror(errno));
		exit(1);
	}

	/* Group ids go first, using whichever call the platform offers */
#if defined(HAVE_SETRESGID)
	rc = setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid);
#elif defined(HAVE_SETREGID)
	rc = setregid(pw->pw_gid, pw->pw_gid);
#else
	rc = (setegid(pw->pw_gid) == -1 || setgid(pw->pw_gid) == -1) ? -1 : 0;
#endif
	if (rc == -1) {
		logit(LOG_ERR, "Couldn't set gid (%u): %s",
		    (unsigned int)pw->pw_gid, strerror(errno));
		exit(1);
	}

	/* User ids last */
#if defined(HAVE_SETRESUID)
	rc = setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid);
#elif defined(HAVE_SETREUID)
	rc = setreuid(pw->pw_uid, pw->pw_uid);
#else
	rc = (seteuid(pw->pw_uid) == -1 || setuid(pw->pw_uid) == -1) ? -1 : 0;
#endif
	if (rc == -1) {
		logit(LOG_ERR, "Couldn't set uid (%u): %s",
		    (unsigned int)pw->pw_uid, strerror(errno));
		exit(1);
	}
}

/*
 * Program entry point.
 *
 * Parses the command line, opens the packet source, the export socket
 * and the control socket, optionally daemonises (writing a pidfile,
 * installing signal handlers and dropping privileges), and then runs the
 * capture / expiry loop until a shutdown or exit is requested, the
 * capture file is exhausted, or an error occurs.
 *
 * The exit status is 0 when the loop status r is 0 on the way out and 1
 * otherwise.
 */
int
main(int argc, char **argv)
{
	char *dev, *capfile, *bpf_prog, dest_addr[256], dest_serv[256];
	const char *pidfile_path, *ctlsock_path;
	extern char *optarg;
	extern int optind;
	int ch, dontfork_flag, linktype, ctlsock, i, err, always_v6, r;
	int stop_collection_flag, exit_request, hoplimit;
	pcap_t *pcap = NULL;
	struct sockaddr_storage dest;
	struct FLOWTRACK flowtrack;
	socklen_t dest_len;
	struct NETFLOW_TARGET target;
	struct CB_CTXT cb_ctxt;
	struct pollfd pl[2];

	closefrom(STDERR_FILENO + 1);

	init_flowtrack(&flowtrack);

	memset(&dest, '\0', sizeof(dest));
	dest_len = 0;
	memset(&target, '\0', sizeof(target));
	target.fd = -1;
	target.dialect = &nf[0];
	hoplimit = -1;
	bpf_prog = NULL;
	ctlsock = -1;
	dev = capfile = NULL;
	pidfile_path = DEFAULT_PIDFILE;
	ctlsock_path = DEFAULT_CTLSOCK;
	dontfork_flag = 0;
	always_v6 = 0;
	/* Only the control socket sets this; it is tested at exit regardless */
	exit_request = 0;

	while ((ch = getopt(argc, argv, "6hdDL:T:i:r:f:t:n:m:p:c:v:s:")) != -1) {
		switch (ch) {
		case '6':
			always_v6 = 1;
			break;
		case 'h':
			usage();
			return (0);
		case 'D':
			verbose_flag = 1;
			always_v6 = 1;
			/* FALLTHROUGH */
		case 'd':
			dontfork_flag = 1;
			break;
		case 'i':
			if (capfile != NULL || dev != NULL) {
				fprintf(stderr, "Packet source already "
				    "specified.\n\n");
				usage();
				exit(1);
			}
			/* "idx:interface": text before the ':' is the index */
			dev = strsep(&optarg, ":");
			if (optarg != NULL) {
				if_index = (u_int16_t) atoi(dev);
				dev = optarg;
			}
			if (verbose_flag)
				fprintf(stderr, "Using %s (idx: %d)\n", dev, if_index);
			break;
		case 'r':
			if (capfile != NULL || dev != NULL) {
				fprintf(stderr, "Packet source already "
				    "specified.\n\n");
				usage();
				exit(1);
			}
			/* Reading a file implies foreground and no control socket */
			capfile = optarg;
			dontfork_flag = 1;
			ctlsock_path = NULL;
			break;
		case 't':
			/* Will exit on failure */
			set_timeout(&flowtrack, optarg); 
			break;
		case 'T':
			if (strcasecmp(optarg, "full") == 0)
				flowtrack.track_level = TRACK_FULL;
			else if (strcasecmp(optarg, "proto") == 0)
				flowtrack.track_level = TRACK_IP_PROTO;
			else if (strcasecmp(optarg, "ip") == 0)
				flowtrack.track_level = TRACK_IP_ONLY;
			else {
				fprintf(stderr, "Unknown flow tracking "
				    "level\n");
				usage();
				exit(1);
			}
			break;
		case 'L':
			hoplimit = atoi(optarg);
			if (hoplimit < 0 || hoplimit > 255) {
				fprintf(stderr, "Invalid hop limit\n\n");
				usage();
				exit(1);
			}
			break;
		case 'm':
			if ((flowtrack.max_flows = atoi(optarg)) < 0) {
				fprintf(stderr, "Invalid maximum flows\n\n");
				usage();
				exit(1);
			}
			break;
		case 'n':
			/* Will exit on failure */
			dest_len = sizeof(dest);
			parse_hostport(optarg, (struct sockaddr *)&dest,
			    &dest_len);
			break;
		case 'p':
			pidfile_path = optarg;
			break;
		case 'c':
			if (strcmp(optarg, "none") == 0)
				ctlsock_path = NULL;
			else
				ctlsock_path = optarg;
			break;
		case 'v':
			/* Look the requested version up in the dialect table */
			for(i = 0, r = atoi(optarg); nf[i].version != -1; i++) {
				if (nf[i].version == r)
					break;
			}
			if (nf[i].version == -1) {
				fprintf(stderr, "Invalid NetFlow version\n");
				exit(1);
			}
			target.dialect = &nf[i];
			break;
		case 's':
			/* A rate below 2 disables sampling */
			flowtrack.option.sample = atoi(optarg);
			if (flowtrack.option.sample < 2) {
				flowtrack.option.sample = 0;
			}
			break;
		default:
			fprintf(stderr, "Invalid commandline option.\n");
			usage();
			exit(1);
		}
	}

	if (capfile == NULL && dev == NULL) {
		fprintf(stderr, "-i or -r option not specified.\n");
		usage();
		exit(1);
	}
	
	/* join remaining arguments (if any) into bpf program */
	bpf_prog = argv_join(argc - optind, argv + optind);

	/* Will exit on failure */
	setup_packet_capture(&pcap, &linktype, dev, capfile, bpf_prog,
	    target.dialect->v6_capable || always_v6);
	
	/* Netflow send socket */
	if (dest.ss_family != 0) {
		/* getnameinfo returns 0 on success, a non-zero EAI_* code else */
		if ((err = getnameinfo((struct sockaddr *)&dest,
		    dest_len, dest_addr, sizeof(dest_addr), 
		    dest_serv, sizeof(dest_serv), NI_NUMERICHOST)) != 0) {
			fprintf(stderr, "getnameinfo: %s\n",
			    gai_strerror(err));
			exit(1);
		}
		target.fd = connsock(&dest, dest_len, hoplimit);
	}
	
	/* Control socket */
	if (ctlsock_path != NULL)
		ctlsock = unix_listener(ctlsock_path); /* Will exit on fail */
	
	if (dontfork_flag) {
		loginit(PROGNAME, 1);
	} else {	
		FILE *pidfile;

		if (daemon(0, 0) == -1) {
			fprintf(stderr, "daemon: %s\n", strerror(errno));
			exit(1);
		}
		loginit(PROGNAME, 0);

		if ((pidfile = fopen(pidfile_path, "w")) == NULL) {
			fprintf(stderr, "Couldn't open pidfile %s: %s\n",
			    pidfile_path, strerror(errno));
			exit(1);
		}
		fprintf(pidfile, "%u\n", getpid());
		fclose(pidfile);

		signal(SIGINT, sighand_graceful_shutdown);
		signal(SIGTERM, sighand_graceful_shutdown);
		signal(SIGSEGV, sighand_other);

		setprotoent(1);
		drop_privs();
	}

	logit(LOG_NOTICE, "%s v%s starting data collection", 
	    PROGNAME, PROGVER);
	if (dest.ss_family != 0) {
		logit(LOG_NOTICE, "Exporting flows to [%s]:%s",
		    dest_addr, dest_serv);
	}

	/* Main processing loop */
	gettimeofday(&flowtrack.system_boot_time, NULL);
	stop_collection_flag = 0;
	memset(&cb_ctxt, '\0', sizeof(cb_ctxt));
	cb_ctxt.ft = &flowtrack;
	cb_ctxt.linktype = linktype;
	cb_ctxt.want_v6 = target.dialect->v6_capable || always_v6;

	/*
	 * poll() is skipped when reading a capture file, so make sure the
	 * revents fields tested below are defined in that case too.
	 */
	memset(pl, '\0', sizeof(pl));

	for (r = 0; graceful_shutdown_request == 0; r = 0) {
		/*
		 * Silly libpcap's timeout function doesn't work, so we
		 * do it here (only if we are reading live)
		 */
		if (capfile == NULL) {
			memset(pl, '\0', sizeof(pl));

			/* This can only be set via the control socket */
			if (!stop_collection_flag) {
				pl[0].events = POLLIN|POLLERR|POLLHUP;
				pl[0].fd = pcap_fileno(pcap);
			} else {
				/* A negative fd makes poll() ignore the entry */
				pl[0].fd = -1;
			}
			if (ctlsock != -1) {
				pl[1].fd = ctlsock;
				pl[1].events = POLLIN|POLLERR|POLLHUP;
			}

			r = poll(pl, (ctlsock == -1) ? 1 : 2, 
			    next_expire(&flowtrack));
			if (r == -1 && errno != EINTR) {
				logit(LOG_ERR, "Exiting on poll: %s", 
				    strerror(errno));
				break;
			}
		}

		/* Accept connection on control socket if present */
		if (ctlsock != -1 && pl[1].revents != 0) {
			if (accept_control(ctlsock, &target, &flowtrack, pcap,
			    &exit_request, &stop_collection_flag) != 0)
				break;
		}

		/* If we have data, run it through libpcap */
		if (!stop_collection_flag && 
		    (capfile != NULL || pl[0].revents != 0)) {
			r = pcap_dispatch(pcap, flowtrack.max_flows, flow_cb,
			    (void*)&cb_ctxt);
			if (r == -1) {
				logit(LOG_ERR, "Exiting on pcap_dispatch: %s", 
				    pcap_geterr(pcap));
				break;
			} else if (r == 0 && capfile != NULL) {
				logit(LOG_NOTICE, "Shutting down after "
				    "pcap EOF");
				graceful_shutdown_request = 1;
				break;
			}
		}
		r = 0;

		/* Fatal error from per-packet functions */
		if (cb_ctxt.fatal) {
			logit(LOG_WARNING, "Fatal error - exiting immediately");
			break;
		}

		/*
		 * Expiry processing happens every recheck_rate seconds
		 * or whenever we have exceeded the maximum number of active 
		 * flows
		 */
		if (flowtrack.num_flows > flowtrack.max_flows || 
		    next_expire(&flowtrack) == 0) {
expiry_check:
			/*
			 * If we are reading from a capture file, we never
			 * expire flows based on time - instead we only 
			 * expire flows when the flow table is full. 
			 */
			if (check_expired(&flowtrack, &target, 
			    capfile == NULL ? CE_EXPIRE_NORMAL :
			    CE_EXPIRE_FORCED) < 0)
				logit(LOG_WARNING, "Unable to export flows");
	
			/*
			 * If we are over max_flows, force-expire the oldest 
			 * out first and immediately reprocess to evict them
			 */
			if (flowtrack.num_flows > flowtrack.max_flows) {
				force_expire(&flowtrack,
				    flowtrack.num_flows - flowtrack.max_flows);
				goto expiry_check;
			}
		}
	}

	/* Flags set by signal handlers or control socket */
	if (graceful_shutdown_request) {
		logit(LOG_WARNING, "Shutting down on user request");
		check_expired(&flowtrack, &target, CE_EXPIRE_ALL);
	} else if (exit_request)
		logit(LOG_WARNING, "Exiting immediately on user request");
	else
		logit(LOG_ERR, "Exiting immediately on internal error");
		
	if (capfile != NULL && dontfork_flag)
		statistics(&flowtrack, stdout, pcap);

	pcap_close(pcap);
	
	if (target.fd != -1)
		close(target.fd);

	unlink(pidfile_path);
	if (ctlsock_path != NULL)
		unlink(ctlsock_path);
	
	return(r == 0 ? 0 : 1);
}